reverse model 1 rescorer

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@724 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-11-15 19:33:06 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-11-15 19:33:06 +0000
commit: cbf66b16a05627aaad61ac54090452e1cd0bc16f (patch)
tree: 61fc46b81c60ab44de55be3cc93f6a8319c6d043 /rescore
parent: 24d2bfb3ace1ae15ffee661602a7d03ef2599e48 (diff)
1 files changed, 122 insertions, 0 deletions
diff --git a/rescore/rescore_inv_model1.pl b/rescore/rescore_inv_model1.pl
new file mode 100755
index 00000000..3ac86c46
--- /dev/null
+++ b/rescore/rescore_inv_model1.pl
@@ -0,0 +1,122 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+use Getopt::Long;
+
+my $model_file;
+my $src_file;
+my $hyp_file;
+my $help;
+my $reverse_model;
+my $feature_name='M1SrcGivenTrg';
+
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+    "model_file|m=s" => \$model_file,
+    "source_file|s=s" => \$src_file,
+    "feature_name|f=s" => \$feature_name,
+    "hypothesis_file|h=s" => \$hyp_file,
+    "help" => \$help,
+) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) {
+  usage();
+  exit;
+}
+
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+print STDERR "Reading Model 1 probabilities from $model_file...\n";
+open M, "<$model_file" or die "Couldn't read $model_file: $!";
+binmode M, ":utf8";
+my %m1;
+while(<M>){
+  chomp;
+  my ($e,$f,$lp) = split /\s+/;
+  die unless defined $e;
+  die unless defined $f;
+  die unless defined $lp;
+  $m1{$f}->{$e} = $lp;
+}
+close M;
+
+open SRC, "<$src_file" or die "Can't read $src_file: $!";
+open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
+binmode(SRC,":utf8");
+binmode(HYP,":utf8");
+binmode(STDOUT,":utf8");
+my @source; while(<SRC>){chomp; push @source, $_; }
+close SRC;
+my $src_len = scalar @source;
+print STDERR "Read $src_len sentences...\n";
+print STDERR "Rescoring...\n";
+
+my $cur = undef;
+my @hyps = ();
+my @feats = ();
+while(<HYP>) {
+  chomp;
+  my ($id, $hyp, $feats) = split / \|\|\| /;
+  unless (defined $cur) { $cur = $id; }
+  die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len;
+  if ($cur ne $id) {
+    rescore($cur, $source[$cur], \@hyps, \@feats);
+    $cur = $id;
+    @hyps = ();
+    @feats = ();
+  }
+  push @hyps, $hyp;
+  push @feats, $feats;
+}
+rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
+
+sub rescore {
+  my ($id, $src, $rh, $rf) = @_;
+  my @hyps = @$rh;
+  my @feats = @$rf;
+  my $nhyps = scalar @hyps;
+  print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
+  for (my $i=0; $i < $nhyps; $i++) {
+    my $score = 0;
+    if ($reverse_model) {
+      die "not implemented";
+    } else {
+      $score = m1_prob($src, $hyps[$i]);
+    }
+    print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n";
+  }
+
+}
+
+sub m1_prob {
+  my ($fsent, $esent) = @_;
+  die unless defined $fsent;
+  die unless defined $esent;
+  my @fwords = split /\s+/, $fsent;
+  my @ewords = split /\s+/, $esent;
+  push @ewords, "<eps>";
+  my $tp = 0;
+  for my $f (@fwords) {
+    my $m1f = $m1{$f};
+    if (!defined $m1f) { $m1f = {}; }
+    my $tfp = 0;
+    for my $e (@ewords) {
+      my $lp = $m1f->{$e};
+      if (!defined $lp) { $lp = -100; }
+      #print "P($f|$e) = $lp\n";
+      my $prob = exp($lp);
+      #if ($prob > $tfp) { $tfp = $prob; }
+      $tfp += $prob;
+    }
+    $tp += log($tfp);
+    $tp -= log(scalar @ewords);  # uniform probability of each generating word
+  }
+  return $tp;
+}
+
+sub usage {
+  print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n  Adds the back-translation probability under Model 1\n";
+}
+
+
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-11-15 19:33:06 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-11-15 19:33:06 +0000
commit	cbf66b16a05627aaad61ac54090452e1cd0bc16f (patch)
tree	61fc46b81c60ab44de55be3cc93f6a8319c6d043 /rescore
parent	24d2bfb3ace1ae15ffee661602a7d03ef2599e48 (diff)