summaryrefslogtreecommitdiff
path: root/extools
diff options
context:
space:
mode:
authoradam.d.lopez <adam.d.lopez@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-13 03:27:59 +0000
committeradam.d.lopez <adam.d.lopez@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-13 03:27:59 +0000
commitbded9a46cb3a27b8049f74e9948be783ae6ec42a (patch)
tree6d161d306ae43db7226dbefc84d5cf58dbe50c43 /extools
parent2530464e1c2cc7eec9445b2f9b0ae90c59265f57 (diff)
cdec now supports coarse-to-fine decoding (for SCFG only).
CTF has several options: -coarse_to_fine_beam_prune=<double> (required to activate CTF): assigns an alpha parameter for pruning the coarse forest; -ctf_beam_widen=<double> (optional, defaults to 2.0): ratio by which to widen the coarse pruning beam if the fine parse fails; -ctf_num_widenings=<int> (optional, defaults to 2): number of times to widen the coarse beam before defaulting to exhaustive source parsing; -ctf_no_exhaustive (optional): do not attempt an exhaustive parse if CTF fails to find a parse. Additionally, the script extools/coarsen_grammar.pl will create a coarse-to-fine grammar (for X?? categories *only*). cdec will read CTF grammars in a format identical to the original, in which refinements of a rule immediately follow the coarse projection, preceded by an additional whitespace character. Not fully tested, but should be backwards compatible. Also not yet integrated into pipelines, but should work on the command line. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@231 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools')
-rwxr-xr-xextools/coarsen_grammar.pl133
1 files changed, 133 insertions, 0 deletions
diff --git a/extools/coarsen_grammar.pl b/extools/coarsen_grammar.pl
new file mode 100755
index 00000000..f2dd6689
--- /dev/null
+++ b/extools/coarsen_grammar.pl
@@ -0,0 +1,133 @@
+#!/usr/bin/perl
+
+# dumb grammar coarsener that maps every nonterminal whose name starts with X (e.g. [X12]) to plain [X]; other nonterminals (such as S) are left unchanged.
+
+use strict;
+
+# Require a weight file plus at least one grammar file on the command line.
+unless (@ARGV > 1){
+    die "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
+}
+my $weight_file = shift @ARGV;
+
+# Child processes (the external sort run later) inherit this setting.
+$ENV{"LC_ALL"} = "C";
+local(*GRAMMAR, *OUT_GRAMMAR, *WEIGHTS);
+
+# Load the feature weights. Every line containing a space is split at its
+# last space into "<feature name> <weight>"; other lines are ignored.
+my %weights;
+# Three-argument open with a lexical handle: the file name is never
+# interpreted as an open mode or a pipe, and $! reports why the open failed.
+open(my $weights_fh, '<', $weight_file)
+    or die "Could not open weight file $weight_file: $!\n";
+while (my $line = <$weights_fh>){
+    if ($line =~ /(.+) (.+)$/){
+        $weights{$1} = $2;
+    }
+}
+close($weights_fh);
+unless (keys(%weights)){
+    die "Could not find any PhraseModel features in weight file (perhaps you specified the wrong file?)\n\n".
+        "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
+}
+
+sub cleanup_and_die;
+# Route interrupt, termination and hang-up signals to cleanup_and_die.
+for my $signal (qw(INT TERM HUP)) {
+    $SIG{$signal} = "cleanup_and_die";
+}
+
+# First pass: for every rule in every grammar file, write
+# "<coarse rule> <original rule> <rest>" to grammar.tmp, where the coarse rule
+# is the original rule with each "[X..." nonterminal name shortened to "[X".
+open(my $out_fh, '>', "grammar.tmp")
+    or die "Could not open intermediate file grammar.tmp for writing: $!\n";
+# defined() keeps the loop going even for a file whose name is false ("0").
+while (defined(my $grammar_file = shift @ARGV)){
+    # Three-argument open: the file name is never treated as a mode or pipe.
+    open(my $grammar_fh, '<', $grammar_file)
+        or die "Could not open grammar file $grammar_file: $!\n";
+    while (my $line = <$grammar_fh>){
+        # $1 is the text up to and including the third "|||" that the greedy
+        # match settles on; $3 is everything after it on the line.
+        if ($line =~ /^((.*\|{3}){3})(.*)$/){
+            my $rule = $1;
+            my $rest = $3;
+            my $coarse_rule = $rule;
+            $coarse_rule =~ s/\[X[^\],]*/[X/g;
+            print {$out_fh} "$coarse_rule $rule $rest\n";
+        } else {
+            die "Unrecognized rule format: $line\n";
+        }
+    }
+    close($grammar_fh);
+}
+# Buffered write errors (e.g. a full disk) only surface at close time.
+close($out_fh)
+    or die "Could not finish writing intermediate file grammar.tmp: $!\n";
+
+# Second pass: sort the projected grammar so that all fine rules sharing a
+# coarse projection become adjacent, then print each group as the coarse rule
+# (carrying the features of its best-scoring fine rule) followed by its fine
+# rules.
+# List-form system() runs sort without a shell, and the exit status is
+# checked so a failed sort cannot silently produce a truncated grammar.
+system("sort", "-o", "grammar.tmp.sorted", "grammar.tmp") == 0
+    or die "Something went wrong; could not sort intermediate file grammar.tmp\n";
+sub dump_rules;
+sub compute_score;
+open(my $sorted_fh, '<', "grammar.tmp.sorted")
+    or die "Something went wrong; could not open intermediate file grammar.tmp.sorted\n";
+my $prev_coarse_rule = "";
+my $best_features = "";
+my $best_score = 0;
+my @rules = ();
+while (my $line = <$sorted_fh>){
+    if ($line =~ /^\s*((\S.*\|{3}\s*){3})((\S.*\|{3}\s*){3})(.*)$/){
+        my $coarse_rule = $1;
+        my $fine_rule = $3;
+        my $features = $5; # This code does not correctly handle rules with other info (e.g. alignments)
+        if ($coarse_rule eq $prev_coarse_rule){
+            # Same group: remember the features of the highest-scoring rule.
+            my $score = compute_score($features, %weights);
+            if ($score > $best_score){
+                $best_score = $score;
+                $best_features = $features;
+            }
+        } else {
+            # New group: flush the previous one (dump_rules ignores the
+            # initial empty group) and start over with this rule as the best.
+            dump_rules($prev_coarse_rule, $best_features, @rules);
+            $prev_coarse_rule = $coarse_rule;
+            $best_features = $features;
+            $best_score = compute_score($features, %weights);
+            @rules = ();
+        }
+        push(@rules, "$fine_rule$features\n");
+    } else {
+        die "Something went wrong during grammar projection: $line\n";
+    }
+}
+# Flush the final group.
+dump_rules($prev_coarse_rule, $best_features, @rules);
+close($sorted_fh);
+cleanup();
+
+# Compute the weighted score of one rule's feature string.
+#
+# Arguments:
+#   $features - whitespace-separated feature tokens. A token containing "="
+#               is split at its last "=" into name and value; any other token
+#               is a bare value named "PhraseModel_<i>", where <i> is the
+#               token's zero-based position in the string.
+#   %weights  - feature name => weight.
+#
+# Returns the sum of weight * value over all tokens whose feature has a true
+# (non-zero) weight. Dies if $features is empty or whitespace-only (or
+# otherwise fails the trim pattern).
+sub compute_score {
+    my($features, %weights) = @_;
+    # Trim surrounding whitespace. The "(?:.*\S)?" part is optional so that a
+    # feature string of a single character (e.g. "1") is accepted.
+    unless ($features =~ s/^\s*(\S(?:.*\S)?)\s*$/$1/) {
+        die "Unexpected feature value format: $features\n";
+    }
+    my $score = 0;
+    my $pm = 0;
+    for my $feature (split(/\s+/, $features)) {
+        my ($feature_name, $feature_val);
+        if ($feature =~ /(.*)=(.*)/){
+            ($feature_name, $feature_val) = ($1, $2);
+        } else {
+            $feature_name = "PhraseModel_" . $pm;
+            $feature_val = $feature;
+        }
+        # The position counter advances for named and unnamed tokens alike.
+        $pm++;
+        if ($weights{$feature_name}){
+            $score += $weights{$feature_name} * $feature_val;
+        }
+    }
+    return $score;
+}
+
+# Print one coarse rule and its refinements to the currently selected handle.
+#
+# Arguments:
+#   $coarse_rule        - the coarse rule text; nothing is printed if it is false.
+#   $coarse_rule_scores - feature string printed after the coarse rule.
+#   @fine_rules         - fine rules, each printed verbatim after a tab.
+sub dump_rules {
+    my ($coarse_rule, $coarse_rule_scores, @fine_rules) = @_;
+    return unless $coarse_rule;
+    print $coarse_rule . " " . $coarse_rule_scores . "\n";
+    print "\t" . $_ for @fine_rules;
+}
+
+# Signal handler: run cleanup(), then terminate the script.
+sub cleanup_and_die {
+    cleanup();
+    # The message ends in a newline, so die appends no file/line information.
+    die("\n");
+}
+
+# Delete the two intermediate files from the current directory.
+# Uses the built-in unlink instead of spawning "rm -rf": no subprocess is
+# needed, and a directory that happens to share one of these names can never
+# be removed recursively. Missing files are ignored, as before.
+sub cleanup {
+    unlink("grammar.tmp", "grammar.tmp.sorted");
+}
+
+
+
+