summaryrefslogtreecommitdiff
path: root/rescore
diff options
context:
space:
mode:
Diffstat (limited to 'rescore')
-rwxr-xr-xrescore/cdec_kbest_to_zmert.pl64
-rw-r--r--rescore/example/README4
-rw-r--r--rescore/example/cdec.ini2
-rw-r--r--rescore/example/hyp.txt5
-rw-r--r--rescore/example/small.scfg9
-rw-r--r--rescore/example/source.txt2
-rw-r--r--rescore/example/weights1
-rwxr-xr-xrescore/generate_zmert_params_from_weights.pl26
-rwxr-xr-xrescore/rerank.pl86
-rwxr-xr-xrescore/rescore_inv_model1.pl126
-rwxr-xr-xrescore/rescore_with_cdec_model.pl121
11 files changed, 0 insertions, 446 deletions
diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl
deleted file mode 100755
index 88bc9682..00000000
--- a/rescore/cdec_kbest_to_zmert.pl
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-use Getopt::Long;
-
-my $feature_file;
-my $hyp_file;
-my $help;
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
- "feature_file|f=s" => \$feature_file,
- "hypothesis_file|h=s" => \$hyp_file,
- "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) {
- usage();
- exit(1);
-}
-
-open W, "<$feature_file" or die "Can't read $feature_file: $!";
-my %weights;
-my @all_feats;
-while(<W>) {
- chomp;
- next if /^#/;
- next if /^\s*$/;
- my ($fname, $w) = split /\s+/;
- push @all_feats, $fname;
- $weights{$fname} = 1;
-}
-close W;
-
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-while(<HYP>) {
- chomp;
- my ($id, $hyp, $feats) = split / \|\|\| /;
- my @afeats = split /\s+/, $feats;
- my $tot = 0;
- my %fvaldict;
- for my $featpair (@afeats) {
- my ($fname,$fval) = split /=/, $featpair;
- $fvaldict{$fname} = $fval;
- my $weight = $weights{$fname};
- warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight;
- $weights{$fname} = 1;
- }
- my @trans;
- for my $feat (@all_feats) {
- my $v = $fvaldict{$feat};
- if (!defined $v) { $v = '0.0'; }
- push @trans, $v;
- }
- print "$id ||| $hyp ||| @trans\n";
-}
-close HYP;
-
-sub usage {
- print <<EOT;
-Usage: $0 -f feature-file.txt/weights.txt -h hyp.nbest.txt
- Puts a cdec k-best list into Joshua/ZMERT format
-EOT
-}
-
diff --git a/rescore/example/README b/rescore/example/README
deleted file mode 100644
index 92b657ca..00000000
--- a/rescore/example/README
+++ /dev/null
@@ -1,4 +0,0 @@
-Rescoring example:
-
- ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt -h hyp.txt -w weights -f RescoringModel
-
diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini
deleted file mode 100644
index 29a1ece3..00000000
--- a/rescore/example/cdec.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-formalism=scfg
-grammar=small.scfg
diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt
deleted file mode 100644
index c4757f6c..00000000
--- a/rescore/example/hyp.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-0 ||| A B C ||| F1=1 F2=1
-0 ||| A b c ||| F1=1 F3=1
-0 ||| A C ||| F4=1
-1 ||| X Y ||| F5=1
-1 ||| XY ||| F6=1
diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg
deleted file mode 100644
index 402a585a..00000000
--- a/rescore/example/small.scfg
+++ /dev/null
@@ -1,9 +0,0 @@
-[X] ||| a b c ||| A B C ||| fe=0.2
-[X] ||| a b ||| A B ||| fe=0.8
-[X] ||| c ||| C ||| fe=0.3
-[X] ||| c ||| c ||| fe=1.3
-[X] ||| a b c ||| A B c ||| fe=0.8
-[X] ||| a b c ||| A C ||| fe=2
-[X] ||| x ||| X ||| fe=0.2
-[X] ||| y ||| Y ||| fe=0.5
-[X] ||| x y ||| XY ||| fe=0.8
diff --git a/rescore/example/source.txt b/rescore/example/source.txt
deleted file mode 100644
index e8d4eda2..00000000
--- a/rescore/example/source.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-a b c
-x y
diff --git a/rescore/example/weights b/rescore/example/weights
deleted file mode 100644
index a22d36f1..00000000
--- a/rescore/example/weights
+++ /dev/null
@@ -1 +0,0 @@
-fe -0.8
diff --git a/rescore/generate_zmert_params_from_weights.pl b/rescore/generate_zmert_params_from_weights.pl
deleted file mode 100755
index a9287896..00000000
--- a/rescore/generate_zmert_params_from_weights.pl
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/perl -w
-
-my %defaults;
-$defaults{'LanguageModel'} = "Opt\t0\t10\t0\t2.5";
-$defaults{'EgivenF'} = "Opt\t-5\t0.5\t-3\t0.5";
-$defaults{'LexEGivenF'} = "Opt\t-5\t0.5\t-3\t0.5";
-$defaults{'LexFGivenE'} = "Opt\t-5\t0.5\t-3\t0.5";
-$defaults{'PassThrough'} = "Opt\t-Inf\t+Inf\t-10\t0";
-$defaults{'WordPenalty'} = "Opt\t-Inf\t2\t-5\t0";
-my $DEFAULT = "Opt\t-Inf\t+Inf\t-1\t+1";
-
-while(<>) {
- next if /^#/;
- chomp;
- next if /^\s*$/;
- s/^\s+//;
- s/\s+$//;
- my ($a,$b) = split /\s+/;
- next unless ($a && $b);
- my $line = $DEFAULT;
- if ($defaults{$a}) { $line = $defaults{$a}; }
- print "$a\t|||\t$b\t$line\n";
-}
-
-print "normalization = none\n";
-
diff --git a/rescore/rerank.pl b/rescore/rerank.pl
deleted file mode 100755
index 4a0c5750..00000000
--- a/rescore/rerank.pl
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-use Getopt::Long;
-
-my $weights_file;
-my $hyp_file;
-my $help;
-my $kbest; # flag to extract reranked list
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
- "weights_file|w=s" => \$weights_file,
- "hypothesis_file|h=s" => \$hyp_file,
- "kbest" => \$kbest,
- "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) {
- usage();
- exit(1);
-}
-
-open W, "<$weights_file" or die "Can't read $weights_file: $!";
-my %weights;
-while(<W>) {
- chomp;
- next if /^#/;
- next if /^\s*$/;
- my ($fname, $w) = split /\s+/;
- $weights{$fname} = $w;
-}
-close W;
-
-my $cur = undef;
-my %hyps = ();
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-while(<HYP>) {
- chomp;
- my ($id, $hyp, $feats) = split / \|\|\| /;
- unless (defined $cur) { $cur = $id; }
- if ($cur ne $id) {
- extract_1best($cur, \%hyps);
- $cur = $id;
- %hyps = ();
- }
- my @afeats = split /\s+/, $feats;
- my $tot = 0;
- for my $featpair (@afeats) {
- my ($fname,$fval) = split /=/, $featpair;
- my $weight = $weights{$fname};
- die "Unweighted feature '$fname'" unless defined $weight;
- $tot += ($weight * $fval);
- }
- $hyps{"$hyp ||| $feats"} = $tot;
-}
-extract_1best($cur, \%hyps) if defined $cur;
-close HYP;
-
-sub extract_1best {
- my ($id, $rh) = @_;
- my %hyps = %$rh;
- if ($kbest) {
- for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) {
- print "$id ||| $hyp\n";
- }
- } else {
- my $best_score = undef;
- my $best_hyp = undef;
- for my $hyp (keys %hyps) {
- if (!defined $best_score || $hyps{$hyp} > $best_score) {
- $best_score = $hyps{$hyp};
- $best_hyp = $hyp;
- }
- }
- $best_hyp =~ s/ \|\|\|.*$//;
- print "$best_hyp\n";
- }
-}
-
-sub usage {
- print <<EOT;
-Usage: $0 -w weights.txt -h hyp.nbest.txt [--kbest]
- Reranks n-best lists with new weights, extracting the new 1/k-best entries.
-EOT
-}
-
diff --git a/rescore/rescore_inv_model1.pl b/rescore/rescore_inv_model1.pl
deleted file mode 100755
index 780452f5..00000000
--- a/rescore/rescore_inv_model1.pl
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-use Getopt::Long;
-
-my $model_file;
-my $src_file;
-my $hyp_file;
-my $help;
-my $reverse_model;
-my $feature_name='M1SrcGivenTrg';
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
- "model_file|m=s" => \$model_file,
- "source_file|s=s" => \$src_file,
- "feature_name|f=s" => \$feature_name,
- "hypothesis_file|h=s" => \$hyp_file,
- "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) {
- usage();
- exit;
-}
-
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-binmode STDERR, ":utf8";
-
-print STDERR "Reading Model 1 probabilities from $model_file...\n";
-open M, "<$model_file" or die "Couldn't read $model_file: $!";
-binmode M, ":utf8";
-my %m1;
-while(<M>){
- chomp;
- my ($e,$f,$lp) = split /\s+/;
- die unless defined $e;
- die unless defined $f;
- die unless defined $lp;
- $m1{$f}->{$e} = $lp;
-}
-close M;
-
-open SRC, "<$src_file" or die "Can't read $src_file: $!";
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-binmode(SRC,":utf8");
-binmode(HYP,":utf8");
-binmode(STDOUT,":utf8");
-my @source; while(<SRC>){chomp; push @source, $_; }
-close SRC;
-my $src_len = scalar @source;
-print STDERR "Read $src_len sentences...\n";
-print STDERR "Rescoring...\n";
-
-my $cur = undef;
-my @hyps = ();
-my @feats = ();
-while(<HYP>) {
- chomp;
- my ($id, $hyp, $feats) = split / \|\|\| /;
- unless (defined $cur) { $cur = $id; }
- die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len;
- if ($cur ne $id) {
- rescore($cur, $source[$cur], \@hyps, \@feats);
- $cur = $id;
- @hyps = ();
- @feats = ();
- }
- push @hyps, $hyp;
- push @feats, $feats;
-}
-rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
-
-sub rescore {
- my ($id, $src, $rh, $rf) = @_;
- my @hyps = @$rh;
- my @feats = @$rf;
- my $nhyps = scalar @hyps;
- my %cache = ();
- print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
- for (my $i=0; $i < $nhyps; $i++) {
- my $score = $cache{$hyps[$i]};
- if (!defined $score) {
- if ($reverse_model) {
- die "not implemented";
- } else {
- $score = m1_prob($src, $hyps[$i]);
- }
- $cache{$hyps[$i]} = $score;
- }
- print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n";
- }
-
-}
-
-sub m1_prob {
- my ($fsent, $esent) = @_;
- die unless defined $fsent;
- die unless defined $esent;
- my @fwords = split /\s+/, $fsent;
- my @ewords = split /\s+/, $esent;
- push @ewords, "<eps>";
- my $tp = 0;
- for my $f (@fwords) {
- my $m1f = $m1{$f};
- if (!defined $m1f) { $m1f = {}; }
- my $tfp = 0;
- for my $e (@ewords) {
- my $lp = $m1f->{$e};
- if (!defined $lp) { $lp = -100; }
- #print "P($f|$e) = $lp\n";
- my $prob = exp($lp);
- #if ($prob > $tfp) { $tfp = $prob; }
- $tfp += $prob;
- }
- $tp += log($tfp);
- $tp -= log(scalar @ewords); # uniform probability of each generating word
- }
- return $tp;
-}
-
-sub usage {
- print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n Adds the back-translation probability under Model 1\n Use training/model1 to generate the required parameter file\n";
-}
-
-
diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl
deleted file mode 100755
index cdd8c217..00000000
--- a/rescore/rescore_with_cdec_model.pl
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-my @ORIG_ARGV=@ARGV;
-use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
-use LocalConfig;
-use Getopt::Long;
-use IPC::Open2;
-use POSIX ":sys_wait_h";
-
-my $decoder = "$SCRIPT_DIR/../decoder/cdec";
-my $help;
-my $cdec_ini;
-my $src_file;
-my $hyp_file;
-my $reverse_model;
-my $weights_file;
-my $feature_name='NewModel';
-
-sub catch_pipe {
- my $signame = shift;
- die "$0 received SIGPIPE: did the decoder die?\n";
-}
-$SIG{PIPE} = \&catch_pipe;
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
- "config|c=s" => \$cdec_ini,
- "weights|w=s" => \$weights_file,
- "source_file|s=s" => \$src_file,
- "feature_name|f=s" => \$feature_name,
- "hypothesis_file|h=s" => \$hyp_file,
- "reverse" => \$reverse_model, # if true translate hyp -> src
- "decoder=s" => \$decoder,
- "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) {
- usage();
- exit;
-}
-die "Can't find $decoder" unless -f $decoder;
-die "Can't run $decoder" unless -x $decoder;
-my $weights = '';
-if (defined $weights_file) {
- die "Can't read $weights_file" unless -f $weights_file;
- $weights = "-w $weights_file";
-}
-my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob";
-print STDERR "DECODER COMMAND: $decoder_command\n";
-my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command)
- or die "Couldn't run $decoder: $!";
-sleep 1;
-
-die "Can't find $cdec_ini" unless -f $cdec_ini;
-open SRC, "<$src_file" or die "Can't read $src_file: $!";
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-binmode(SRC,":utf8");
-binmode(HYP,":utf8");
-binmode(STDOUT,":utf8");
-my @source; while(<SRC>){chomp; push @source, $_; }
-close SRC;
-my $src_len = scalar @source;
-print STDERR "Read $src_len sentences...\n";
-binmode(CDEC_IN, ":utf8");
-binmode(CDEC_OUT, ":utf8");
-
-my $cur = undef;
-my @hyps = ();
-my @feats = ();
-while(<HYP>) {
- chomp;
- my ($id, $hyp, $feats) = split / \|\|\| /;
- unless (defined $cur) { $cur = $id; }
- die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len;
- if ($cur ne $id) {
- rescore($cur, $source[$cur], \@hyps, \@feats);
- $cur = $id;
- @hyps = ();
- @feats = ();
- }
- push @hyps, $hyp;
- push @feats, $feats;
-}
-rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
-
-close CDEC_IN;
-close CDEC_OUT;
-close HYP;
-waitpid($cdec_pid, 0);
-my $status = $? >> 8;
-if ($status != 0) {
- print STDERR "Decoder returned bad status!\n";
-}
-
-sub rescore {
- my ($id, $src, $rh, $rf) = @_;
- my @hyps = @$rh;
- my @feats = @$rf;
- my $nhyps = scalar @hyps;
- print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
- for (my $i=0; $i < $nhyps; $i++) {
- if ($reverse_model) {
- print CDEC_OUT "<seg id=\"$id\">$hyps[$i] ||| $src</seg>\n";
- } else {
- print CDEC_OUT "<seg id=\"$id\">$src ||| $hyps[$i]</seg>\n";
- }
- my $score = <CDEC_IN>;
- chomp $score;
- my @words = split /\s+/, $hyps[$i];
- print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n";
- }
-}
-
-sub usage {
- print <<EOT;
-Usage: $0 -c cdec.ini [-w cdec_weights.txt] -s source.txt -h hypothesis.nbest.txt [-f FeatureName]
-EOT
- exit 0
-}
-