summaryrefslogtreecommitdiff
path: root/extools/coarsen_grammar.pl
blob: f2dd668982f88f00d7bb5a7cbca4bce58499e6bc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/perl

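# Dumb grammar coarsener: rewrites every nonterminal whose label starts with X
# (e.g. [X12] or [X12,1]) to plain X ([X] / [X,1]); other labels such as [S]
# are left unchanged.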

use strict;

# Command line: the first argument is the weight file, every remaining
# argument is a grammar file, so at least two arguments are required.
unless (@ARGV > 1){
  die "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
}
my $weight_file = shift @ARGV;

# Child processes (such as the external sort run later) inherit the C locale.
$ENV{"LC_ALL"} = "C";
local(*GRAMMAR, *OUT_GRAMMAR, *WEIGHTS);

# Load the feature weights: each line of the form "<name> <value>" becomes
# one entry of %weights; lines without a space-separated pair are skipped.
my %weights;
# Three-argument open: the file name is taken literally, so whitespace or
# mode characters ('>', '|', ...) in the path cannot change what is opened.
unless (open(WEIGHTS, '<', $weight_file)) {die "Could not open weight file $weight_file\n" }
while (<WEIGHTS>){
  # Both groups are greedy, so the split happens at the LAST space on the line.
  if (/(.+) (.+)$/){
    $weights{$1} = $2;
  }
}
close(WEIGHTS);
# A file that yielded no weights at all is almost certainly the wrong file.
unless (keys(%weights)){
  die "Could not find any PhraseModel features in weight file (perhaps you specified the wrong file?)\n\n".
    "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
}

# Forward declaration; the handler itself is defined further down in the file.
sub cleanup_and_die;
# Route interrupt, terminate and hang-up signals to the same named handler.
@SIG{qw(INT TERM HUP)} = ("cleanup_and_die") x 3;

# Pass 1: read every grammar file named on the command line and write one
# line per rule to grammar.tmp in the form
#   <coarse rule> <original rule> <rest of line>
# where a "rule" is the text up to and including the third "|||" separator.
# Failing to create the intermediate file is fatal; otherwise every rule
# would be silently dropped by prints to an unopened handle.
open(OUT_GRAMMAR, '>', "grammar.tmp")
  or die "Could not open intermediate file grammar.tmp for writing: $!\n";
# defined() keeps the loop going for a file that happens to be named "0".
while (defined(my $grammar_file = shift @ARGV)){
  # Three-argument open: the file name is taken literally.
  unless (open(GRAMMAR, '<', $grammar_file)) {die "Could not open grammar file $grammar_file\n"}
  while (<GRAMMAR>){
    if (/^((.*\|{3}){3})(.*)$/){
      my $rule = $1;
      my $rest = $3;
      my $coarse_rule = $rule;
      # Collapse "[X<anything up to ']' or ','>" to "[X", e.g. [X12,1] -> [X,1].
      $coarse_rule =~ s/\[X[^\],]*/[X/g;
      print OUT_GRAMMAR "$coarse_rule $rule $rest\n";
    } else {
      die "Unrecognized rule format: $_\n";
    }
  }
  close(GRAMMAR);
}
# Buffered write errors (e.g. disk full) only surface at close time.
close(OUT_GRAMMAR)
  or die "Could not finish writing intermediate file grammar.tmp: $!\n";

# Pass 2: sort the projected rules so that all fine rules sharing a coarse
# rule are adjacent, then print each coarse rule once (with the features of
# its best-scoring fine rule) followed by its fine rules.
# system() instead of backticks in void context: nothing is captured, and the
# exit status is checked so a failed sort is not mistaken for an empty grammar.
if (system("sort grammar.tmp > grammar.tmp.sorted") != 0) {
  die "Something went wrong; could not sort intermediate file grammar.tmp\n";
}
sub dump_rules;
sub compute_score;
unless (open(GRAMMAR, '<', "grammar.tmp.sorted")){ die "Something went wrong; could not open intermediate file grammar.tmp.sorted\n"};
my $prev_coarse_rule = "";   # coarse rule of the group currently being collected
my $best_features = "";      # features of the best-scoring fine rule in the group
my $best_score = 0;
my @rules = ();              # fine rules collected for the current group
while (<GRAMMAR>){
  # Each line holds three "|||"-terminated fields of the coarse rule, three
  # of the fine rule, and then the feature string.
  if (/^\s*((\S.*\|{3}\s*){3})((\S.*\|{3}\s*){3})(.*)$/){
    my $coarse_rule = $1;
    my $fine_rule = $3;
    my $features = $5;  # This code does not correctly handle rules with other info (e.g. alignments)
    if ($coarse_rule eq $prev_coarse_rule){
      # Same group: keep the features of the highest-scoring fine rule.
      # The comparison is strict, so the earliest rule wins a tie.
      my $score = compute_score($features, %weights);
      if ($score > $best_score){
        $best_score = $score;
        $best_features = $features;
      }
    } else {
      # New group: flush the previous one (a no-op for the initial empty
      # group) and start over with this rule as the current best.
      dump_rules($prev_coarse_rule, $best_features, @rules);
      $prev_coarse_rule = $coarse_rule;
      $best_features = $features;
      $best_score = compute_score($features, %weights);
      @rules = ();
    }
    push(@rules, "$fine_rule$features\n");
  } else {
    die "Something went wrong during grammar projection: $_\n";
  }
}
# Flush the final group, then remove the intermediate files.
dump_rules($prev_coarse_rule, $best_features, @rules);
close(GRAMMAR);
cleanup();

# Compute the weighted score of a whitespace-separated feature string.
#
# Arguments:
#   $features - feature string; each token is either "name=value" or a bare
#               value.  A bare value is named "PhraseModel_<i>", where <i> is
#               the zero-based position of the token in the string.
#   %weights  - feature name => weight (passed as a flattened hash).
#
# Returns the sum of weight * value over all tokens whose name has a true
# (present and non-zero) weight; other tokens contribute nothing.
# Dies if $features is empty or contains only whitespace.
sub compute_score {
  my($features, %weights) = @_;
  my $score = 0;
  # Trim surrounding whitespace.  The inner group is optional so that a
  # single-character feature string such as "1" is accepted as well.
  if ($features =~ s/^\s*(\S(?:.*\S)?)\s*$/$1/) {
    my @features = split(/\s+/, $features);
    my $pm=0;
    for my $feature (@features) {
      my $feature_name;
      my $feature_val;
      if ($feature =~ /(.*)=(.*)/){
        $feature_name = $1;
        $feature_val= $2;
      } else {
        $feature_name = "PhraseModel_" . $pm;
        $feature_val= $feature;
      }
      # The position counter advances for every token, named or not.
      $pm++;
      if ($weights{$feature_name}){
        $score += $weights{$feature_name} * $feature_val;
      }
    }
  } else {
    die "Unexpected feature value format: $features\n";
  }
  return $score;
}

# Print one coarse rule with its scores, followed by its fine rules, each
# prefixed with a tab.  Prints nothing when the coarse rule is empty/false.
# Fine rules are printed as given, so they must carry their own newline.
sub dump_rules {
  my ($head, $head_scores, @members) = @_;
  return unless $head;
  print "$head $head_scores\n";
  print "\t$_" for @members;
}

# Handler registered by name in %SIG for INT, TERM and HUP: runs cleanup()
# and then terminates the script.
sub cleanup_and_die {
  cleanup();
  # The message ends in a newline, so perl appends no "at FILE line N" suffix.
  die "\n";
}

# Remove the two intermediate files this script creates in the current
# directory.  Uses the builtin unlink instead of shelling out to rm, so no
# shell or subprocess is needed (this also runs from a signal handler).
# Files that do not exist are ignored.  Returns nothing useful.
sub cleanup {
  unlink("grammar.tmp", "grammar.tmp.sorted");
  return;
}