summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-11-14 21:52:48 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-11-14 21:52:48 +0000
commit c92d1b9f426b6eea6ed8c2cd59808aef60e21ed5 (patch)
tree f0df84128cb4774030c01de001c06d22383dc963
parent bc93e6d9e869b9769a5d0273b4292eedf921780e (diff)
basic rescoring framework
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@718 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- rescore/example/README 4
-rw-r--r-- rescore/example/cdec.ini 2
-rw-r--r-- rescore/example/hyp.txt 5
-rw-r--r-- rescore/example/small.scfg 9
-rw-r--r-- rescore/example/source.txt 2
-rw-r--r-- rescore/example/weights 1
-rwxr-xr-x rescore/rescore_with_cdec_model.pl 120
7 files changed, 143 insertions, 0 deletions
diff --git a/rescore/example/README b/rescore/example/README
new file mode 100644
index 00000000..92b657ca
--- /dev/null
+++ b/rescore/example/README
@@ -0,0 +1,4 @@
+Rescoring example:
+
+ ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt -h hyp.txt -w weights -f RescoringModel
+
diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini
new file mode 100644
index 00000000..29a1ece3
--- /dev/null
+++ b/rescore/example/cdec.ini
@@ -0,0 +1,2 @@
+formalism=scfg
+grammar=small.scfg
diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt
new file mode 100644
index 00000000..c4757f6c
--- /dev/null
+++ b/rescore/example/hyp.txt
@@ -0,0 +1,5 @@
+0 ||| A B C ||| F1=1 F2=1
+0 ||| A b c ||| F1=1 F3=1
+0 ||| A C ||| F4=1
+1 ||| X Y ||| F5=1
+1 ||| XY ||| F6=1
diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg
new file mode 100644
index 00000000..402a585a
--- /dev/null
+++ b/rescore/example/small.scfg
@@ -0,0 +1,9 @@
+[X] ||| a b c ||| A B C ||| fe=0.2
+[X] ||| a b ||| A B ||| fe=0.8
+[X] ||| c ||| C ||| fe=0.3
+[X] ||| c ||| c ||| fe=1.3
+[X] ||| a b c ||| A B c ||| fe=0.8
+[X] ||| a b c ||| A C ||| fe=2
+[X] ||| x ||| X ||| fe=0.2
+[X] ||| y ||| Y ||| fe=0.5
+[X] ||| x y ||| XY ||| fe=0.8
diff --git a/rescore/example/source.txt b/rescore/example/source.txt
new file mode 100644
index 00000000..e8d4eda2
--- /dev/null
+++ b/rescore/example/source.txt
@@ -0,0 +1,2 @@
+a b c
+x y
diff --git a/rescore/example/weights b/rescore/example/weights
new file mode 100644
index 00000000..a22d36f1
--- /dev/null
+++ b/rescore/example/weights
@@ -0,0 +1 @@
+fe -0.8
diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl
new file mode 100755
index 00000000..ea4252cb
--- /dev/null
+++ b/rescore/rescore_with_cdec_model.pl
@@ -0,0 +1,120 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+
+# Command-line settings, filled in by GetOptions; only the decoder path and the
+# feature name have defaults, everything else starts out undefined.
+my $decoder = "$SCRIPT_DIR/../decoder/cdec";   # --decoder
+my $feature_name = 'NewModel';                 # --feature_name / -f
+my ($cdec_ini, $weights_file);                 # --config / -c, --weights / -w
+my ($src_file, $hyp_file);                     # --source_file / -s, --hypothesis_file / -h
+my $reverse_model;                             # --reverse
+my $help;                                      # --help
+
+# SIGPIPE handler (installed via $SIG{PIPE}): abort with a hint about the decoder.
+sub catch_pipe {
+    die "$0 received SIGPIPE: did the decoder die?\n";
+}
+$SIG{PIPE} = \&catch_pipe;
+Getopt::Long::Configure("no_auto_abbrev");   # long options must be spelled out in full
+my $options_ok = GetOptions(
+    "config|c=s" => \$cdec_ini,
+    "weights|w=s" => \$weights_file,
+    "source_file|s=s" => \$src_file,
+    "feature_name|f=s" => \$feature_name,
+    "hypothesis_file|h=s" => \$hyp_file,
+    "reverse" => \$reverse_model, # if true translate hyp -> src
+    "decoder=s" => \$decoder,
+    "help" => \$help,
+);
+# Show the synopsis on a parse error, stray arguments, --help, or a missing required option.
+unless ($options_ok && @ARGV == 0 && !$help && $cdec_ini && $src_file && $hyp_file) {
+    usage(); exit;
+}
+# Validate every input before spawning the decoder, so a bad path fails fast
+# instead of leaving a freshly started decoder process behind.
+die "Can't find $decoder" unless -f $decoder;
+die "Can't run $decoder" unless -x $decoder;
+die "Can't find $cdec_ini" unless -f $cdec_ini;
+die "Can't read $weights_file" if defined $weights_file && ! -f $weights_file;
+# Three-argument open: file names are data and never parsed as part of the mode.
+open SRC, '<:utf8', $src_file or die "Can't read $src_file: $!";
+open HYP, '<:utf8', $hyp_file or die "Can't read $hyp_file: $!";
+binmode(STDOUT,":utf8");
+my @source; while(<SRC>){chomp; push @source, $_; }
+close SRC;
+my $src_len = scalar @source;
+print STDERR "Read $src_len sentences...\n";
+
+# List-form command: arguments reach the decoder verbatim, with no shell in between.
+my @decoder_command = ($decoder, '-c', $cdec_ini, '--quiet');
+push @decoder_command, '-w', $weights_file if defined $weights_file;
+push @decoder_command, '--show_partition_as_translation';
+print STDERR "DECODER COMMAND: @decoder_command\n";
+my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, @decoder_command)
+    or die "Couldn't run $decoder: $!";
+sleep 1;   # NOTE(review): presumably gives the decoder time to start; confirm it is needed
+binmode(CDEC_IN, ":utf8");
+binmode(CDEC_OUT, ":utf8");
+
+# Group consecutive k-best lines by sentence id and rescore each group when the
+# id changes. Ids index @source, so the valid range is 0 .. $src_len - 1.
+my $cur = undef;
+my (@hyps, @feats);
+while(<HYP>) {
+    chomp;
+    my ($id, $hyp, $feats) = split / \|\|\| /;
+    $cur = $id unless defined $cur;
+    die "sentence ids in k-best list file must be at least 0 and less than $src_len" if $id < 0 || $id >= $src_len;
+    if ($cur ne $id) {
+        rescore($cur, $source[$cur], \@hyps, \@feats);
+        $cur = $id;
+        @hyps = (); @feats = ();
+    }
+    push @hyps, $hyp;
+    push @feats, $feats;
+}
+rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
+
+# Close the decoder's input first so it sees EOF and can shut down cleanly.
+close CDEC_OUT;
+close CDEC_IN;
+close HYP;
+waitpid($cdec_pid, 0);
+# $? holds the exit code in its high byte and the terminating signal in the low
+# bits; test the whole value so a decoder killed by a signal is reported too.
+print STDERR "Decoder returned bad status!\n" if $? != 0;
+
+# Queries the decoder once per hypothesis of sentence $id and prints each as
+# "id ||| hyp ||| feats FeatureName=score"; $rh/$rf are refs to parallel arrays.
+sub rescore {
+    my ($id, $src, $rh, $rf) = @_;
+    my $nhyps = scalar @$rh;
+    print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
+    for my $i (0 .. $nhyps - 1) {
+        my $hyp = $rh->[$i];
+        # With --reverse the hypothesis goes first (translate hyp -> src).
+        my $query = $reverse_model ? "$hyp ||| $src" : "$src ||| $hyp";
+        print CDEC_OUT "$query\n";
+        my $score = <CDEC_IN>;
+        die "Decoder produced no output for sentence id=$id; did it die?\n" unless defined $score;
+        chomp $score;
+        print "$id ||| $hyp ||| $rf->[$i] $feature_name=$score\n";
+    }
+}
+
+# Prints the command-line synopsis to STDOUT and exits with status 0.
+sub usage {
+    print "Usage: $0 -c cdec.ini [-w cdec_weights.txt] -s source.txt -h hypothesis.nbest.txt",
+          " [-f FeatureName] [--reverse] [--decoder /path/to/cdec]\n";
+    exit 0;
+}
+