diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-11-15 19:33:06 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-11-15 19:33:06 +0000 |
commit | cbf66b16a05627aaad61ac54090452e1cd0bc16f (patch) | |
tree | 61fc46b81c60ab44de55be3cc93f6a8319c6d043 | |
parent | 24d2bfb3ace1ae15ffee661602a7d03ef2599e48 (diff) |
reverse model 1 rescorer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@724 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | rescore/rescore_inv_model1.pl | 122 |
1 files changed, 122 insertions, 0 deletions
#!/usr/bin/perl -w
#
# rescore/rescore_inv_model1.pl
#
# Appends a Model 1 feature to every entry of a k-best list.
#
# Inputs (all read as UTF-8):
#   -m model_file       one entry per line: "e f logprob" (whitespace separated);
#                       logprob is exponentiated with exp(), i.e. a natural log
#   -s source_file      one source sentence per line; line N (0-based) belongs
#                       to sentence id N of the k-best list
#   -h hypothesis_file  k-best list, "id ||| hypothesis ||| features" per line;
#                       entries with the same id must be contiguous
#   -f feature_name     name of the emitted feature (default: M1SrcGivenTrg)
#
# Output (STDOUT): each k-best entry as
#   "id ||| hypothesis ||| features feature_name=score"
# Progress messages go to STDERR.

use strict;
use utf8;
use Getopt::Long;

my $model_file;
my $src_file;
my $hyp_file;
my $help;
my $reverse_model;    # no command-line option sets this; the reverse direction is not implemented
my $feature_name = 'M1SrcGivenTrg';

Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
        "model_file|m=s"      => \$model_file,
        "source_file|s=s"     => \$src_file,
        "feature_name|f=s"    => \$feature_name,
        "hypothesis_file|h=s" => \$hyp_file,
        "help"                => \$help,
    ) == 0 || @ARGV != 0 || $help || !$model_file || !$src_file || !$hyp_file) {
    usage();
    exit;
}

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Load the lexical table as $m1{f}{e} = log p(f | e).
print STDERR "Reading Model 1 probabilities from $model_file...\n";
open my $model_fh, '<', $model_file or die "Couldn't read $model_file: $!";
binmode $model_fh, ":utf8";
my %m1;
while (my $line = <$model_fh>) {
    chomp $line;
    # split ' ' ignores leading whitespace, so an indented line cannot
    # produce an empty first column.
    my ($e, $f, $lp) = split ' ', $line;
    die "Malformed entry at $model_file line $.: expected 'e f logprob'"
        unless defined $e && defined $f && defined $lp;
    $m1{$f}->{$e} = $lp;
}
close $model_fh;

# Open both inputs before reading either, so a bad path fails early.
open my $src_fh, '<', $src_file or die "Can't read $src_file: $!";
open my $hyp_fh, '<', $hyp_file or die "Can't read $hyp_file: $!";
binmode $src_fh, ":utf8";
binmode $hyp_fh, ":utf8";

my @source;
while (my $line = <$src_fh>) {
    chomp $line;
    push @source, $line;
}
close $src_fh;
my $src_len = scalar @source;
print STDERR "Read $src_len sentences...\n";
print STDERR "Rescoring...\n";

# Collect consecutive k-best entries that share a sentence id and rescore
# each group as soon as the id changes.
my $cur;
my @hyps;
my @feats;
while (my $line = <$hyp_fh>) {
    chomp $line;
    my ($id, $hyp, $feats) = split / \|\|\| /, $line;
    die "Malformed k-best entry at $hyp_file line $.: expected 'id ||| hypothesis ||| features'"
        unless defined $id && defined $hyp;
    $cur = $id unless defined $cur;
    # Ids index @source, which is 0-based: the last valid id is $src_len - 1.
    die "sentence id $id in k-best list file is out of range: ids must be between 0 and "
        . ($src_len - 1) . " ($src_file has $src_len sentences)"
        if $id < 0 || $id >= $src_len;
    if ($cur ne $id) {
        rescore($cur, $source[$cur], \@hyps, \@feats);
        $cur   = $id;
        @hyps  = ();
        @feats = ();
    }
    push @hyps,  $hyp;
    push @feats, $feats;
}
close $hyp_fh;
rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;

# rescore($id, $src, \@hyps, \@feats)
#
# Scores every hypothesis of sentence $id against its source sentence $src
# and prints one line per hypothesis to STDOUT, with "$feature_name=score"
# appended to that hypothesis' feature string.
#   $rh - array ref of hypothesis strings
#   $rf - array ref of feature strings, parallel to $rh
# Dies with "not implemented" if $reverse_model is set.
sub rescore {
    my ($id, $src, $rh, $rf) = @_;
    my $nhyps = scalar @$rh;
    print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
    for my $i (0 .. $nhyps - 1) {
        my $score = 0;
        if ($reverse_model) {
            die "not implemented";
        }
        else {
            $score = m1_prob($src, $rh->[$i]);
        }
        print "$id ||| $rh->[$i] ||| $rf->[$i] $feature_name=$score\n";
    }
}

# m1_prob($fsent, $esent)
#
# Returns the Model 1 log-probability of the source sentence $fsent given the
# hypothesis $esent:
#
#   sum over source words f of  log( sum over e of p(f|e) ) - log(|e|)
#
# where e ranges over the hypothesis words plus the token "<eps>", and |e|
# counts that extra token. Pairs missing from %m1 get a log-probability of
# -100. Dies if either argument is undefined.
sub m1_prob {
    my ($fsent, $esent) = @_;
    die "m1_prob: source sentence is undefined"     unless defined $fsent;
    die "m1_prob: hypothesis sentence is undefined" unless defined $esent;

    # split ' ' drops leading whitespace, so no empty pseudo-word is scored.
    my @fwords = split ' ', $fsent;
    my @ewords = split ' ', $esent;
    push @ewords, "<eps>";

    # Uniform probability of each generating word; the same for every f.
    my $log_num_ewords = log(scalar @ewords);

    my $tp = 0;
    for my $f (@fwords) {
        # Read-only lookup: an unseen source word must not be added to %m1.
        my $m1f = defined $m1{$f} ? $m1{$f} : {};
        my $tfp = 0;
        for my $e (@ewords) {
            my $lp = $m1f->{$e};
            $lp = -100 unless defined $lp;
            # Sum over all generating words (a max here would give the
            # Viterbi alignment score instead).
            $tfp += exp($lp);
        }
        $tp += log($tfp);
        $tp -= $log_num_ewords;
    }
    return $tp;
}

# usage()
#
# Prints a short usage message to STDERR.
sub usage {
    print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n Adds the back-translation probability under Model 1\n";
}