diff options
Diffstat (limited to 'rescore')
-rwxr-xr-x | rescore/cdec_kbest_to_zmert.pl | 64 | ||||
-rw-r--r-- | rescore/example/README | 4 | ||||
-rw-r--r-- | rescore/example/cdec.ini | 2 | ||||
-rw-r--r-- | rescore/example/hyp.txt | 5 | ||||
-rw-r--r-- | rescore/example/small.scfg | 9 | ||||
-rw-r--r-- | rescore/example/source.txt | 2 | ||||
-rw-r--r-- | rescore/example/weights | 1 | ||||
-rwxr-xr-x | rescore/generate_zmert_params_from_weights.pl | 26 | ||||
-rwxr-xr-x | rescore/rerank.pl | 86 | ||||
-rwxr-xr-x | rescore/rescore_inv_model1.pl | 126 | ||||
-rwxr-xr-x | rescore/rescore_with_cdec_model.pl | 121 |
11 files changed, 0 insertions, 446 deletions
diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl deleted file mode 100755 index 88bc9682..00000000 --- a/rescore/cdec_kbest_to_zmert.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $feature_file; -my $hyp_file; -my $help; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "feature_file|f=s" => \$feature_file, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$feature_file" or die "Can't read $feature_file: $!"; -my %weights; -my @all_feats; -while(<W>) { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - push @all_feats, $fname; - $weights{$fname} = 1; -} -close W; - -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - my @afeats = split /\s+/, $feats; - my $tot = 0; - my %fvaldict; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - $fvaldict{$fname} = $fval; - my $weight = $weights{$fname}; - warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight; - $weights{$fname} = 1; - } - my @trans; - for my $feat (@all_feats) { - my $v = $fvaldict{$feat}; - if (!defined $v) { $v = '0.0'; } - push @trans, $v; - } - print "$id ||| $hyp ||| @trans\n"; -} -close HYP; - -sub usage { - print <<EOT; -Usage: $0 -f feature-file.txt/weights.txt -h hyp.nbest.txt - Puts a cdec k-best list into Joshua/ZMERT format -EOT -} - diff --git a/rescore/example/README b/rescore/example/README deleted file mode 100644 index 92b657ca..00000000 --- a/rescore/example/README +++ /dev/null @@ -1,4 +0,0 @@ -Rescoring example: - - ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt -h hyp.txt -w weights -f RescoringModel - diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini deleted file mode 100644 index 29a1ece3..00000000 --- a/rescore/example/cdec.ini +++ /dev/null @@ -1,2 +0,0 @@ -formalism=scfg -grammar=small.scfg diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt deleted file mode 100644 index c4757f6c..00000000 --- a/rescore/example/hyp.txt +++ /dev/null @@ -1,5 +0,0 @@ -0 ||| A B C ||| F1=1 F2=1 -0 ||| A b c ||| F1=1 F3=1 -0 ||| A C ||| F4=1 -1 ||| X Y ||| F5=1 -1 ||| XY ||| F6=1 diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg deleted file mode 100644 index 402a585a..00000000 --- a/rescore/example/small.scfg +++ /dev/null @@ -1,9 +0,0 @@ -[X] ||| a b c ||| A B C ||| fe=0.2 -[X] ||| a b ||| A B ||| fe=0.8 -[X] ||| c ||| C ||| fe=0.3 -[X] ||| c ||| c ||| fe=1.3 -[X] ||| a b c ||| A B c ||| fe=0.8 -[X] ||| a b c ||| A C ||| fe=2 -[X] ||| x ||| X ||| fe=0.2 -[X] ||| y ||| Y ||| fe=0.5 -[X] ||| x y ||| XY ||| fe=0.8 diff --git a/rescore/example/source.txt b/rescore/example/source.txt deleted file mode 100644 index e8d4eda2..00000000 --- a/rescore/example/source.txt +++ /dev/null @@ -1,2 +0,0 @@ -a b c -x y diff --git a/rescore/example/weights b/rescore/example/weights deleted file mode 100644 index a22d36f1..00000000 --- a/rescore/example/weights +++ /dev/null @@ -1 +0,0 @@ -fe -0.8 diff --git a/rescore/generate_zmert_params_from_weights.pl b/rescore/generate_zmert_params_from_weights.pl deleted file mode 100755 index a9287896..00000000 --- a/rescore/generate_zmert_params_from_weights.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl -w - -my %defaults; -$defaults{'LanguageModel'} = "Opt\t0\t10\t0\t2.5"; -$defaults{'EgivenF'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'LexEGivenF'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'LexFGivenE'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'PassThrough'} = "Opt\t-Inf\t+Inf\t-10\t0"; -$defaults{'WordPenalty'} = "Opt\t-Inf\t2\t-5\t0"; -my $DEFAULT = "Opt\t-Inf\t+Inf\t-1\t+1"; - -while(<>) { - next if /^#/; - chomp; - next if /^\s*$/; - s/^\s+//; - s/\s+$//; - my ($a,$b) = split /\s+/; - next unless ($a && $b); - my $line = $DEFAULT; - if ($defaults{$a}) { $line = $defaults{$a}; } - print "$a\t|||\t$b\t$line\n"; -} - -print "normalization = none\n"; - diff --git a/rescore/rerank.pl b/rescore/rerank.pl deleted file mode 100755 index 4a0c5750..00000000 --- a/rescore/rerank.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $weights_file; -my $hyp_file; -my $help; -my $kbest; # flag to extract reranked list - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "weights_file|w=s" => \$weights_file, - "hypothesis_file|h=s" => \$hyp_file, - "kbest" => \$kbest, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$weights_file" or die "Can't read $weights_file: $!"; -my %weights; -while(<W>) { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - $weights{$fname} = $w; -} -close W; - -my $cur = undef; -my %hyps = (); -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - if ($cur ne $id) { - extract_1best($cur, \%hyps); - $cur = $id; - %hyps = (); - } - my @afeats = split /\s+/, $feats; - my $tot = 0; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - my $weight = $weights{$fname}; - die "Unweighted feature '$fname'" unless defined $weight; - $tot += ($weight * $fval); - } - $hyps{"$hyp ||| $feats"} = $tot; -} -extract_1best($cur, \%hyps) if defined $cur; -close HYP; - -sub extract_1best { - my ($id, $rh) = @_; - my %hyps = %$rh; - if ($kbest) { - for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) { - print "$id ||| $hyp\n"; - } - } else { - my $best_score = undef; - my $best_hyp = undef; - for my $hyp (keys %hyps) { - if (!defined $best_score || $hyps{$hyp} > $best_score) { - $best_score = $hyps{$hyp}; - $best_hyp = $hyp; - } - } - $best_hyp =~ s/ \|\|\|.*$//; - print "$best_hyp\n"; - } -} - -sub usage { - print <<EOT; -Usage: $0 -w weights.txt -h hyp.nbest.txt [--kbest] - Reranks n-best lists with new weights, extracting the new 1/k-best entries. -EOT -} - diff --git a/rescore/rescore_inv_model1.pl b/rescore/rescore_inv_model1.pl deleted file mode 100755 index 780452f5..00000000 --- a/rescore/rescore_inv_model1.pl +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $model_file; -my $src_file; -my $hyp_file; -my $help; -my $reverse_model; -my $feature_name='M1SrcGivenTrg'; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "model_file|m=s" => \$model_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) { - usage(); - exit; -} - -binmode STDIN, ":utf8"; -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - -print STDERR "Reading Model 1 probabilities from $model_file...\n"; -open M, "<$model_file" or die "Couldn't read $model_file: $!"; -binmode M, ":utf8"; -my %m1; -while(<M>){ - chomp; - my ($e,$f,$lp) = split /\s+/; - die unless defined $e; - die unless defined $f; - die unless defined $lp; - $m1{$f}->{$e} = $lp; -} -close M; - -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(<SRC>){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -print STDERR "Rescoring...\n"; - -my $cur = undef; -my @hyps = (); -my @feats = (); -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = (); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - my %cache = (); - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - my $score = $cache{$hyps[$i]}; - if (!defined $score) { - if ($reverse_model) { - die "not implemented"; - } else { - $score = m1_prob($src, $hyps[$i]); - } - $cache{$hyps[$i]} = $score; - } - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } - -} - -sub m1_prob { - my ($fsent, $esent) = @_; - die unless defined $fsent; - die unless defined $esent; - my @fwords = split /\s+/, $fsent; - my @ewords = split /\s+/, $esent; - push @ewords, "<eps>"; - my $tp = 0; - for my $f (@fwords) { - my $m1f = $m1{$f}; - if (!defined $m1f) { $m1f = {}; } - my $tfp = 0; - for my $e (@ewords) { - my $lp = $m1f->{$e}; - if (!defined $lp) { $lp = -100; } - #print "P($f|$e) = $lp\n"; - my $prob = exp($lp); - #if ($prob > $tfp) { $tfp = $prob; } - $tfp += $prob; - } - $tp += log($tfp); - $tp -= log(scalar @ewords); # uniform probability of each generating word - } - return $tp; -} - -sub usage { - print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n Adds the back-translation probability under Model 1\n Use training/model1 to generate the required parameter file\n"; -} - - diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl deleted file mode 100755 index cdd8c217..00000000 --- a/rescore/rescore_with_cdec_model.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; - -my $decoder = "$SCRIPT_DIR/../decoder/cdec"; -my $help; -my $cdec_ini; -my $src_file; -my $hyp_file; -my $reverse_model; -my $weights_file; -my $feature_name='NewModel'; - -sub catch_pipe { - my $signame = shift; - die "$0 received SIGPIPE: did the decoder die?\n"; -} -$SIG{PIPE} = \&catch_pipe; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "config|c=s" => \$cdec_ini, - "weights|w=s" => \$weights_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "reverse" => \$reverse_model, # if true translate hyp -> src - "decoder=s" => \$decoder, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) { - usage(); - exit; -} -die "Can't find $decoder" unless -f $decoder; -die "Can't run $decoder" unless -x $decoder; -my $weights = ''; -if (defined $weights_file) { - die "Can't read $weights_file" unless -f $weights_file; - $weights = "-w $weights_file"; -} -my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob"; -print STDERR "DECODER COMMAND: $decoder_command\n"; -my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command) - or die "Couldn't run $decoder: $!"; -sleep 1; - -die "Can't find $cdec_ini" unless -f $cdec_ini; -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(<SRC>){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -binmode(CDEC_IN, ":utf8"); -binmode(CDEC_OUT, ":utf8"); - -my $cur = undef; -my @hyps = (); -my @feats = (); -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = (); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -close CDEC_IN; -close CDEC_OUT; -close HYP; -waitpid($cdec_pid, 0); -my $status = $? >> 8; -if ($status != 0) { - print STDERR "Decoder returned bad status!\n"; -} - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - if ($reverse_model) { - print CDEC_OUT "<seg id=\"$id\">$hyps[$i] ||| $src</seg>\n"; - } else { - print CDEC_OUT "<seg id=\"$id\">$src ||| $hyps[$i]</seg>\n"; - } - my $score = <CDEC_IN>; - chomp $score; - my @words = split /\s+/, $hyps[$i]; - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } -} - -sub usage { - print <<EOT; -Usage: $0 -c cdec.ini [-w cdec_weights.txt] -s source.txt -h hypothesis.nbest.txt [-f FeatureName] -EOT - exit 0 -} - |