#!/usr/bin/perl
#
# coarsen_grammar.pl -- dumb grammar coarsener that maps every nonterminal to X
# (except S).
#
# Usage:
#     coarsen_grammar.pl <weight_file> <grammar_file> [<grammar_file> ...]
#
# What it does:
#   1. Reads feature weights ("name value" per line) from <weight_file>.
#   2. For every rule in every grammar file, builds a coarse projection by
#      truncating each nonterminal that starts with "[X" down to "[X" (any
#      index after a comma, e.g. "[X12,1]" -> "[X,1]", is kept).
#   3. Sorts "coarse-rule fine-rule features" lines with sort(1) under
#      LC_ALL=C so that all refinements of one coarse rule are adjacent.
#   4. Prints each coarse rule once, carrying the features of its
#      best-scoring refinement (score = sum of weight * feature value),
#      immediately followed by its fine rules, each preceded by a tab.
#
# Limitations:
#   * Rules carrying extra trailing fields (e.g. alignments) are not handled
#     correctly.
#   * Intermediate files "grammar.tmp" and "grammar.tmp.sorted" are created in
#     the current directory, so two concurrent runs in one directory collide.
#   * Input paths are opened literally (three-argument open); shell-style
#     arguments such as "-" or "cmd |" are not interpreted.
#
# Provenance: recovered from a cgit v1.2.3 patch view of
# extools/coarsen_grammar.pl (new file, mode 100755, index 00000000..f2dd6689,
# 1 file changed, 133 insertions). Original commit header and message:
#
#   From 47c75319638866609f669346e15663c5ba43af7f Mon Sep 17 00:00:00 2001
#   From: "adam.d.lopez"
#   Date: Tue, 13 Jul 2010 03:27:59 +0000
#   Subject: cdec now supports coarse-to-fine decoding (for SCFG only).
#
#   CTF has several options:
#     -coarse_to_fine_beam_prune= (required to activate CTF) assign an alpha
#         parameter for pruning the coarse forest
#     -ctf_beam_widen= (optional, defaults to 2.0): ratio to widen coarse
#         pruning beam if fine parse fails.
#     -ctf_num_widenings= (optional, defaults to 2): number of times to widen
#         coarse beam before defaulting to exhaustive source parsing
#     -ctf_no_exhaustive (optional) do not attempt exhaustive parse if CTF
#         fails to find a parse.
#   Additionally, script extools/coarsen_grammar.pl will create a
#   coarse-to-fine grammar (for X?? categories *only*). cdec will read CTF
#   grammars in a format identical to the original, in which refinements of a
#   rule immediately follow the coarse projection, preceded by an additional
#   whitespace character. Not fully tested, but should be backwards
#   compatible. Also not yet integrated into pipelines, but should work on the
#   command line.
#
#   git-svn-id: https://ws10smt.googlecode.com/svn/trunk@231 ec762483-ff6d-05da-a07a-a48fb63a330f

use strict;
use warnings;

my $USAGE = "Usage: $0 <weight_file> <grammar_file> [<grammar_file> ...]\n";

# Intermediate files (names unchanged from the original script).
my $TMP_GRAMMAR        = 'grammar.tmp';
my $TMP_GRAMMAR_SORTED = 'grammar.tmp.sorted';

# Intermediate files this run has actually created; cleanup() removes only
# these, so an early usage error never deletes a pre-existing file.
my @created_tmp_files;

die $USAGE unless @ARGV > 1;
my $weight_file   = shift @ARGV;
my @grammar_files = @ARGV;

# Byte-wise collation: the grouping pass below relies on sort(1) placing
# identical coarse-rule prefixes next to each other.
$ENV{LC_ALL} = 'C';

my %weights = read_weights($weight_file);
unless (keys %weights) {
    die "Could not find any PhraseModel features in weight file "
      . "(perhaps you specified the wrong file?)\n\n"
      . $USAGE;
}

$SIG{INT}  = \&cleanup_and_die;
$SIG{TERM} = \&cleanup_and_die;
$SIG{HUP}  = \&cleanup_and_die;

write_projected_grammar($TMP_GRAMMAR, @grammar_files);
sort_projected_grammar($TMP_GRAMMAR, $TMP_GRAMMAR_SORTED);
print_coarse_to_fine_grammar($TMP_GRAMMAR_SORTED, %weights);
cleanup();

# Also remove intermediate files when the script dies part-way through.
# cleanup() uses unlink only, so the exit status in $? is left untouched.
END { cleanup() }

# Read "name value" pairs from the weight file at $path.
# Returns the weights as a flat (name => value) list; dies if the file cannot
# be opened. Lines that do not contain "something<space>something" are ignored.
sub read_weights {
    my ($path) = @_;
    open(my $fh, '<', $path)
        or die "Could not open weight file $path\n";
    my %weight_of;
    while (my $line = <$fh>) {
        if ($line =~ /(.+) (.+)$/) {
            $weight_of{$1} = $2;
        }
    }
    close($fh);
    return %weight_of;
}

# Write one "coarse_rule fine_rule features" line to $out_path for every rule
# found in @grammar_paths. Dies on an unreadable file, on a line that does not
# contain three "|||"-terminated fields, or on a write failure.
sub write_projected_grammar {
    my ($out_path, @grammar_paths) = @_;
    open(my $out_fh, '>', $out_path)
        or die "Could not open intermediate file $out_path for writing: $!\n";
    push @created_tmp_files, $out_path;

    for my $grammar_path (@grammar_paths) {
        open(my $in_fh, '<', $grammar_path)
            or die "Could not open grammar file $grammar_path\n";
        while (my $line = <$in_fh>) {
            if ($line =~ /^((?:.*\|{3}){3})(.*)$/) {
                my ($rule, $rest) = ($1, $2);
                # Drop everything after "[X" up to the closing "]" or the
                # "," that introduces an index.
                (my $coarse_rule = $rule) =~ s/\[X[^\],]*/[X/g;
                print {$out_fh} "$coarse_rule $rule $rest\n"
                    or die "Could not write to intermediate file $out_path: $!\n";
            }
            else {
                die "Unrecognized rule format: $line\n";
            }
        }
        close($in_fh);
    }

    # Buffered write errors only surface at close.
    close($out_fh)
        or die "Could not finish writing intermediate file $out_path: $!\n";
    return;
}

# Sort $in_path into $out_path with sort(1). The list form of system() avoids
# the shell; LC_ALL is inherited from %ENV. Dies if sort does not exit cleanly.
sub sort_projected_grammar {
    my ($in_path, $out_path) = @_;
    push @created_tmp_files, $out_path;
    system('sort', '-o', $out_path, $in_path) == 0
        or die "Something went wrong; could not sort intermediate file $in_path\n";
    return;
}

# Read the sorted projection from $sorted_path and print the coarse-to-fine
# grammar to STDOUT: for each run of lines sharing a coarse rule, the coarse
# rule with the features of its highest-scoring refinement, then every
# refinement.
sub print_coarse_to_fine_grammar {
    my ($sorted_path, %weights) = @_;
    open(my $fh, '<', $sorted_path)
        or die "Something went wrong; could not open intermediate file $sorted_path\n";

    my $prev_coarse_rule = "";
    my $best_features    = "";
    my $best_score       = 0;
    my @rules            = ();

    while (my $line = <$fh>) {
        if ($line =~ /^\s*((?:\S.*\|{3}\s*){3})((?:\S.*\|{3}\s*){3})(.*)$/) {
            # This code does not correctly handle rules with other info
            # (e.g. alignments).
            my ($coarse_rule, $fine_rule, $features) = ($1, $2, $3);
            if ($coarse_rule eq $prev_coarse_rule) {
                my $score = compute_score($features, %weights);
                if ($score > $best_score) {
                    $best_score    = $score;
                    $best_features = $features;
                }
            }
            else {
                # A new coarse rule starts: flush the previous group.
                dump_rules($prev_coarse_rule, $best_features, @rules);
                $prev_coarse_rule = $coarse_rule;
                $best_features    = $features;
                $best_score       = compute_score($features, %weights);
                @rules            = ();
            }
            push(@rules, "$fine_rule$features\n");
        }
        else {
            die "Something went wrong during grammar projection: $line\n";
        }
    }
    dump_rules($prev_coarse_rule, $best_features, @rules);
    close($fh);
    return;
}

# Score a whitespace-separated feature string against %weights.
#   $features - features written either as "name=value" or as a bare value;
#               a bare value is named "PhraseModel_<i>", where <i> is the
#               zero-based position of that feature in the string (named
#               features advance the position as well).
#   %weights  - feature name => weight.
# Returns the sum of weight * value over features that have a true weight.
# Dies if $features is empty or whitespace only.
sub compute_score {
    my ($features, %weights) = @_;

    # Trim surrounding whitespace. A single non-space character is a valid
    # feature string (e.g. "1").
    unless ($features =~ /^\s*(\S(?:.*\S)?)\s*$/) {
        die "Unexpected feature value format: $features\n";
    }
    my $trimmed = $1;

    my $score    = 0;
    my $position = 0;
    for my $feature (split(/\s+/, $trimmed)) {
        my ($feature_name, $feature_val);
        if ($feature =~ /(.*)=(.*)/) {
            ($feature_name, $feature_val) = ($1, $2);
        }
        else {
            $feature_name = "PhraseModel_" . $position;
            $feature_val  = $feature;
        }
        $position++;
        if ($weights{$feature_name}) {
            $score += $weights{$feature_name} * $feature_val;
        }
    }
    return $score;
}

# Print one coarse rule with its chosen features, followed by its fine rules,
# each preceded by a tab. Does nothing when $coarse_rule is empty, which is
# the state before the first group has been read.
sub dump_rules {
    my ($coarse_rule, $coarse_rule_scores, @fine_rules) = @_;
    return unless $coarse_rule;
    print "$coarse_rule $coarse_rule_scores\n";
    for my $rule (@fine_rules) {
        print "\t$rule";
    }
    return;
}

# Signal handler: remove intermediate files, then exit via die.
sub cleanup_and_die {
    cleanup();
    die "\n";
}

# Remove the intermediate files created so far. Safe to call more than once.
sub cleanup {
    unlink(@created_tmp_files) if @created_tmp_files;
    @created_tmp_files = ();
    return;
}