From 4583e37dd9b5528996332d8aa2b3b72cbf07dd04 Mon Sep 17 00:00:00 2001 From: redpony Date: Sun, 14 Nov 2010 21:52:48 +0000 Subject: basic rescoring framework git-svn-id: https://ws10smt.googlecode.com/svn/trunk@718 ec762483-ff6d-05da-a07a-a48fb63a330f --- rescore/example/README | 4 ++ rescore/example/cdec.ini | 2 + rescore/example/hyp.txt | 5 ++ rescore/example/small.scfg | 9 +++ rescore/example/source.txt | 2 + rescore/example/weights | 1 + rescore/rescore_with_cdec_model.pl | 120 +++++++++++++++++++++++++++++++++++++ 7 files changed, 143 insertions(+) create mode 100644 rescore/example/README create mode 100644 rescore/example/cdec.ini create mode 100644 rescore/example/hyp.txt create mode 100644 rescore/example/small.scfg create mode 100644 rescore/example/source.txt create mode 100644 rescore/example/weights create mode 100755 rescore/rescore_with_cdec_model.pl diff --git a/rescore/example/README b/rescore/example/README new file mode 100644 index 00000000..92b657ca --- /dev/null +++ b/rescore/example/README @@ -0,0 +1,4 @@ +Rescoring example: + + ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt -h hyp.txt -w weights -f RescoringModel + diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini new file mode 100644 index 00000000..29a1ece3 --- /dev/null +++ b/rescore/example/cdec.ini @@ -0,0 +1,2 @@ +formalism=scfg +grammar=small.scfg diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt new file mode 100644 index 00000000..c4757f6c --- /dev/null +++ b/rescore/example/hyp.txt @@ -0,0 +1,5 @@ +0 ||| A B C ||| F1=1 F2=1 +0 ||| A b c ||| F1=1 F3=1 +0 ||| A C ||| F4=1 +1 ||| X Y ||| F5=1 +1 ||| XY ||| F6=1 diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg new file mode 100644 index 00000000..402a585a --- /dev/null +++ b/rescore/example/small.scfg @@ -0,0 +1,9 @@ +[X] ||| a b c ||| A B C ||| fe=0.2 +[X] ||| a b ||| A B ||| fe=0.8 +[X] ||| c ||| C ||| fe=0.3 +[X] ||| c ||| c ||| fe=1.3 +[X] ||| a b c ||| A B c ||| fe=0.8 +[X] ||| a b c ||| A C ||| fe=2 +[X] ||| x ||| X ||| fe=0.2 +[X] ||| y ||| Y ||| fe=0.5 +[X] ||| x y ||| XY ||| fe=0.8 diff --git a/rescore/example/source.txt b/rescore/example/source.txt new file mode 100644 index 00000000..e8d4eda2 --- /dev/null +++ b/rescore/example/source.txt @@ -0,0 +1,2 @@ +a b c +x y diff --git a/rescore/example/weights b/rescore/example/weights new file mode 100644 index 00000000..a22d36f1 --- /dev/null +++ b/rescore/example/weights @@ -0,0 +1 @@ +fe -0.8 diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl new file mode 100755 index 00000000..ea4252cb --- /dev/null +++ b/rescore/rescore_with_cdec_model.pl @@ -0,0 +1,120 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; + +my $decoder = "$SCRIPT_DIR/../decoder/cdec"; +my $help; +my $cdec_ini; +my $src_file; +my $hyp_file; +my $reverse_model; +my $weights_file; +my $feature_name='NewModel'; + +sub catch_pipe { + my $signame = shift; + die "$0 received SIGPIPE: did the decoder die?\n"; +} +$SIG{PIPE} = \&catch_pipe; + +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "config|c=s" => \$cdec_ini, + "weights|w=s" => \$weights_file, + "source_file|s=s" => \$src_file, + "feature_name|f=s" => \$feature_name, + "hypothesis_file|h=s" => \$hyp_file, + "reverse" => \$reverse_model, # if true translate hyp -> src + "decoder=s" => \$decoder, + "help" => \$help, +) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) { + usage(); + exit; +} +die "Can't find $decoder" unless -f $decoder; +die "Can't run $decoder" unless -x $decoder; +my $weights = ''; +if (defined $weights_file) { + die "Can't read $weights_file" unless -f $weights_file; + $weights = "-w $weights_file"; +} +my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_partition_as_translation"; +print STDERR "DECODER COMMAND: $decoder_command\n"; +my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command) + or die "Couldn't run $decoder: $!"; +sleep 1; + +die "Can't find $cdec_ini" unless -f $cdec_ini; +open SRC, "<$src_file" or die "Can't read $src_file: $!"; +open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; +binmode(SRC,":utf8"); +binmode(HYP,":utf8"); +binmode(STDOUT,":utf8"); +my @source; while(){chomp; push @source, $_; } +close SRC; +my $src_len = scalar @source; +print STDERR "Read $src_len sentences...\n"; +binmode(CDEC_IN, ":utf8"); +binmode(CDEC_OUT, ":utf8"); + +my $cur = undef; +my @hyps = (); +my @feats = (); +while() { + chomp; + my ($id, $hyp, $feats) = split / \|\|\| /; + unless (defined $cur) { $cur = $id; } + die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; + if ($cur ne $id) { + rescore($cur, $source[$cur], \@hyps, \@feats); + $cur = $id; + @hyps = (); + @feats = (); + } + push @hyps, $hyp; + push @feats, $feats; +} +rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; + +close CDEC_IN; +close CDEC_OUT; +close HYP; +waitpid($cdec_pid, 0); +my $status = $? >> 8; +if ($status != 0) { + print STDERR "Decoder returned bad status!\n"; +} + +sub rescore { + my ($id, $src, $rh, $rf) = @_; + my @hyps = @$rh; + my @feats = @$rf; + my $nhyps = scalar @hyps; + print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; + for (my $i=0; $i < $nhyps; $i++) { + if ($reverse_model) { + print CDEC_OUT "$hyps[$i] ||| $src\n"; + } else { + print CDEC_OUT "$src ||| $hyps[$i]\n"; + } + my $score = ; + chomp $score; + print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; + } +} + +sub usage { + print <