cdec now supports coarse-to-fine decoding (for SCFG only).

CTF has several options: -coarse_to_fine_beam_prune=<double> (required to activate CTF) assign an alpha parameter for pruning the coarse foreast -ctf_beam_widen=<double> (optional, defaults to 2.0): ratio to widen coarse pruning beam if fine parse fails. -ctf_num_widenings=<int> (optional, defaults to 2): number of times to widen coarse beam before defaulting to exhaustive source parsing -ctf_no_exhaustive (optional) do not attempt exhaustive parse if CTF fails to find a parse. Additionally, script extools/coarsen_grammar.pl will create a coarse-to-fine grammar (for X?? categories *only*). cdec will read CTF grammars in a format identical to the original, in which refinements of a rule immediately follow the coarse projection, preceded by an additional whitespace character. Not fully tested, but should be backwards compatible. Also not yet integrated into pipelines, but should work on the command line. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@231 ec762483-ff6d-05da-a07a-a48fb63a330f
author: adam.d.lopez <adam.d.lopez@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 03:27:59 +0000
committer: adam.d.lopez <adam.d.lopez@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 03:27:59 +0000
commit: 47c75319638866609f669346e15663c5ba43af7f (patch)
tree: 50bc1867bdb1ce27e679fb8981b1b805c4897cd6 /extools
parent: a2360c873ba8b72744e16752a067276a46d63645 (diff)
1 files changed, 133 insertions, 0 deletions
diff --git a/extools/coarsen_grammar.pl b/extools/coarsen_grammar.pl
new file mode 100755
index 00000000..f2dd6689
--- /dev/null
+++ b/extools/coarsen_grammar.pl
@@ -0,0 +1,133 @@
+#!/usr/bin/perl
+
+# dumb grammar coarsener that maps every nonterminal to X (except S).
+
+use strict;
+
+unless (@ARGV > 1){ 
+  die "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
+}
+my $weight_file = shift @ARGV;
+
+$ENV{"LC_ALL"} = "C";
+local(*GRAMMAR, *OUT_GRAMMAR, *WEIGHTS);
+
+my %weights;
+unless (open(WEIGHTS, $weight_file)) {die "Could not open weight file $weight_file\n" }
+while (<WEIGHTS>){
+  if (/(.+) (.+)$/){
+    $weights{$1} = $2;
+  } 
+}
+close(WEIGHTS);
+unless (keys(%weights)){
+  die "Could not find any PhraseModel features in weight file (perhaps you specified the wrong file?)\n\n".
+    "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
+}
+
+sub cleanup_and_die;
+$SIG{INT} = "cleanup_and_die";
+$SIG{TERM} = "cleanup_and_die"; 
+$SIG{HUP} = "cleanup_and_die";
+
+open(OUT_GRAMMAR, ">grammar.tmp");
+while (my $grammar_file = shift @ARGV){
+  unless (open(GRAMMAR, $grammar_file)) {die "Could not open grammar file $grammar_file\n"}
+  while (<GRAMMAR>){
+    if (/^((.*\|{3}){3})(.*)$/){
+      my $rule = $1;
+      my $rest = $3;
+      my $coarse_rule = $rule;
+      $coarse_rule =~ s/\[X[^\],]*/[X/g;
+      print OUT_GRAMMAR "$coarse_rule $rule $rest\n";
+    } else {
+      die "Unrecognized rule format: $_\n";
+    }
+  }
+  close(GRAMMAR);
+}
+close(OUT_GRAMMAR);
+
+`sort grammar.tmp > grammar.tmp.sorted`;
+sub dump_rules;
+sub compute_score;
+unless (open(GRAMMAR, "grammar.tmp.sorted")){ die "Something went wrong; could not open intermediate file grammar.tmp.sorted\n"};
+my $prev_coarse_rule = "";
+my $best_features = "";
+my $best_score = 0;
+my @rules = ();
+while (<GRAMMAR>){
+  if (/^\s*((\S.*\|{3}\s*){3})((\S.*\|{3}\s*){3})(.*)$/){
+    my $coarse_rule = $1;
+    my $fine_rule = $3;
+    my $features = $5;  # This code does not correctly handle rules with other info (e.g. alignments)
+    if ($coarse_rule eq $prev_coarse_rule){
+      my $score = compute_score($features, %weights);
+      if ($score > $best_score){
+        $best_score = $score;
+        $best_features = $features;
+      }
+    } else {
+      dump_rules($prev_coarse_rule, $best_features, @rules);
+      $prev_coarse_rule = $coarse_rule;
+      $best_features = $features;
+      $best_score = compute_score($features, %weights);
+      @rules = ();
+    }
+    push(@rules, "$fine_rule$features\n");
+  } else {
+    die "Something went wrong during grammar projection: $_\n";
+  }
+}
+dump_rules($prev_coarse_rule, $best_features, @rules);
+close(GRAMMAR);
+cleanup();
+
+sub compute_score {
+  my($features, %weights) = @_;
+  my $score = 0;
+  if ($features =~ s/^\s*(\S.*\S)\s*$/$1/) { 
+    my @features = split(/\s+/, $features);
+    my $pm=0;
+    for my $feature (@features) {
+      my $feature_name; 
+      my $feature_val;
+      if ($feature =~ /(.*)=(.*)/){
+        $feature_name = $1;
+        $feature_val= $2;
+      } else {
+        $feature_name = "PhraseModel_" . $pm;
+        $feature_val= $feature;
+      }
+      $pm++;
+      if ($weights{$feature_name}){
+        $score += $weights{$feature_name} * $feature_val;
+      } 
+    }  
+  } else {
+    die "Unexpected feature value format: $features\n";
+  }
+  return $score;
+}
+
+sub dump_rules {
+  my($coarse_rule, $coarse_rule_scores, @fine_rules) = @_;
+  unless($coarse_rule){ return; }
+  print "$coarse_rule $coarse_rule_scores\n";
+  for my $rule (@fine_rules){
+    print "\t$rule";
+  }
+}
+
+sub cleanup_and_die {
+  cleanup();
+  die "\n";
+}
+
+sub cleanup {
+ `rm -rf grammar.tmp grammar.tmp.sorted`;
+}
+
+
+
+
author	adam.d.lopez <adam.d.lopez@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-13 03:27:59 +0000
committer	adam.d.lopez <adam.d.lopez@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-13 03:27:59 +0000
commit	47c75319638866609f669346e15663c5ba43af7f (patch)
tree	50bc1867bdb1ce27e679fb8981b1b805c4897cd6 /extools
parent	a2360c873ba8b72744e16752a067276a46d63645 (diff)