merge with upstream

author: Patrick Simianer <p@simianer.de> 2012-03-13 09:24:47 +0100
committer: Patrick Simianer <p@simianer.de> 2012-03-13 09:24:47 +0100
commit: ef6085e558e26c8819f1735425761103021b6470 (patch)
tree: 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /rescore
parent: 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
parent: dfbc278c1057555fda9312291c8024049e00b7d8 (diff)
11 files changed, 0 insertions, 446 deletions
diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl
deleted file mode 100755
index 88bc9682..00000000
--- a/rescore/cdec_kbest_to_zmert.pl
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-use Getopt::Long;
-
-my $feature_file;
-my $hyp_file;
-my $help;
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
-    "feature_file|f=s" => \$feature_file,
-    "hypothesis_file|h=s" => \$hyp_file,
-    "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) {
-  usage();
-  exit(1);
-}
-
-open W, "<$feature_file" or die "Can't read $feature_file: $!";
-my %weights;
-my @all_feats;
-while(<W>) {
-  chomp;
-  next if /^#/;
-  next if /^\s*$/;
-  my ($fname, $w) = split /\s+/;
-  push @all_feats, $fname;
-  $weights{$fname} = 1;
-}
-close W;
-
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-while(<HYP>) {
-  chomp;
-  my ($id, $hyp, $feats) = split / \|\|\| /;
-  my @afeats = split /\s+/, $feats;
-  my $tot = 0;
-  my %fvaldict;
-  for my $featpair (@afeats) {
-    my ($fname,$fval) = split /=/, $featpair;
-    $fvaldict{$fname} = $fval;
-    my $weight = $weights{$fname};
-    warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight;
-    $weights{$fname} = 1;
-  }
-  my @trans;
-  for my $feat (@all_feats) {
-    my $v = $fvaldict{$feat};
-    if (!defined $v) { $v = '0.0'; }
-    push @trans, $v;
-  }
-  print "$id ||| $hyp ||| @trans\n";
-}
-close HYP;
-
-sub usage {
-  print <<EOT;
-Usage: $0 -f feature-file.txt/weights.txt -h hyp.nbest.txt
-  Puts a cdec k-best list into Joshua/ZMERT format
-EOT
-}
-
diff --git a/rescore/example/README b/rescore/example/README
deleted file mode 100644
index 92b657ca..00000000
--- a/rescore/example/README
+++ /dev/null
@@ -1,4 +0,0 @@
-Rescoring example:
-
-  ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt  -h hyp.txt  -w weights -f RescoringModel
-
diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini
deleted file mode 100644
index 29a1ece3..00000000
--- a/rescore/example/cdec.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-formalism=scfg
-grammar=small.scfg
diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt
deleted file mode 100644
index c4757f6c..00000000
--- a/rescore/example/hyp.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-0 ||| A B C ||| F1=1 F2=1
-0 ||| A b c ||| F1=1 F3=1
-0 ||| A C ||| F4=1
-1 ||| X Y ||| F5=1
-1 ||| XY ||| F6=1
diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg
deleted file mode 100644
index 402a585a..00000000
--- a/rescore/example/small.scfg
+++ /dev/null
@@ -1,9 +0,0 @@
-[X] ||| a b c ||| A B C ||| fe=0.2
-[X] ||| a b ||| A B ||| fe=0.8
-[X] ||| c ||| C ||| fe=0.3
-[X] ||| c ||| c ||| fe=1.3
-[X] ||| a b c ||| A B c ||| fe=0.8
-[X] ||| a b c ||| A C ||| fe=2
-[X] ||| x ||| X ||| fe=0.2
-[X] ||| y ||| Y ||| fe=0.5
-[X] ||| x y ||| XY ||| fe=0.8
diff --git a/rescore/example/source.txt b/rescore/example/source.txt
deleted file mode 100644
index e8d4eda2..00000000
--- a/rescore/example/source.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-a b c
-x y
diff --git a/rescore/example/weights b/rescore/example/weights
deleted file mode 100644
index a22d36f1..00000000
--- a/rescore/example/weights
+++ /dev/null
@@ -1 +0,0 @@
-fe -0.8
diff --git a/rescore/generate_zmert_params_from_weights.pl b/rescore/generate_zmert_params_from_weights.pl
deleted file mode 100755
index a9287896..00000000
--- a/rescore/generate_zmert_params_from_weights.pl
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/perl -w
-
-my %defaults;
-$defaults{'LanguageModel'} = "Opt\t0\t10\t0\t2.5";
-$defaults{'EgivenF'} = "Opt\t-5\t0.5\t-3\t0.5";
-$defaults{'LexEGivenF'} = "Opt\t-5\t0.5\t-3\t0.5";
-$defaults{'LexFGivenE'} = "Opt\t-5\t0.5\t-3\t0.5";
-$defaults{'PassThrough'} = "Opt\t-Inf\t+Inf\t-10\t0";
-$defaults{'WordPenalty'} = "Opt\t-Inf\t2\t-5\t0";
-my $DEFAULT = "Opt\t-Inf\t+Inf\t-1\t+1";
-
-while(<>) {
-  next if /^#/;
-  chomp;
-  next if /^\s*$/;
-  s/^\s+//;
-  s/\s+$//;
-  my ($a,$b) = split /\s+/;
-  next unless ($a && $b);
-  my $line = $DEFAULT;
-  if ($defaults{$a}) { $line = $defaults{$a}; }
-  print "$a\t|||\t$b\t$line\n";
-}
-
-print "normalization = none\n";
-
diff --git a/rescore/rerank.pl b/rescore/rerank.pl
deleted file mode 100755
index 4a0c5750..00000000
--- a/rescore/rerank.pl
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-use Getopt::Long;
-
-my $weights_file;
-my $hyp_file;
-my $help;
-my $kbest; # flag to extract reranked list
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
-    "weights_file|w=s" => \$weights_file,
-    "hypothesis_file|h=s" => \$hyp_file,
-    "kbest" => \$kbest,
-    "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) {
-  usage();
-  exit(1);
-}
-
-open W, "<$weights_file" or die "Can't read $weights_file: $!";
-my %weights;
-while(<W>) {
-  chomp;
-  next if /^#/;
-  next if /^\s*$/;
-  my ($fname, $w) = split /\s+/;
-  $weights{$fname} = $w;
-}
-close W;
-
-my $cur = undef;
-my %hyps = ();
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-while(<HYP>) {
-  chomp;
-  my ($id, $hyp, $feats) = split / \|\|\| /;
-  unless (defined $cur) { $cur = $id; }
-  if ($cur ne $id) {
-    extract_1best($cur, \%hyps);
-    $cur = $id;
-    %hyps = ();
-  }
-  my @afeats = split /\s+/, $feats;
-  my $tot = 0;
-  for my $featpair (@afeats) {
-    my ($fname,$fval) = split /=/, $featpair;
-    my $weight = $weights{$fname};
-    die "Unweighted feature '$fname'" unless defined $weight;
-    $tot += ($weight * $fval);
-  }
-  $hyps{"$hyp ||| $feats"} = $tot;
-}
-extract_1best($cur, \%hyps) if defined $cur;
-close HYP;
-
-sub extract_1best {
-  my ($id, $rh) = @_;
-  my %hyps = %$rh;
-  if ($kbest) {
-    for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) {
-      print "$id ||| $hyp\n";
-    }
-  } else {
-    my $best_score = undef;
-    my $best_hyp = undef;
-    for my $hyp (keys %hyps) {
-      if (!defined $best_score || $hyps{$hyp} > $best_score) {
-        $best_score = $hyps{$hyp};
-        $best_hyp = $hyp;
-      }
-    }
-    $best_hyp =~ s/ \|\|\|.*$//;
-    print "$best_hyp\n";
-  }
-}
-
-sub usage {
-  print <<EOT;
-Usage: $0 -w weights.txt -h hyp.nbest.txt [--kbest]
-  Reranks n-best lists with new weights, extracting the new 1/k-best entries.
-EOT
-}
-
diff --git a/rescore/rescore_inv_model1.pl b/rescore/rescore_inv_model1.pl
deleted file mode 100755
index 780452f5..00000000
--- a/rescore/rescore_inv_model1.pl
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-use Getopt::Long;
-
-my $model_file;
-my $src_file;
-my $hyp_file;
-my $help;
-my $reverse_model;
-my $feature_name='M1SrcGivenTrg';
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
-    "model_file|m=s" => \$model_file,
-    "source_file|s=s" => \$src_file,
-    "feature_name|f=s" => \$feature_name,
-    "hypothesis_file|h=s" => \$hyp_file,
-    "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) {
-  usage();
-  exit;
-}
-
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-binmode STDERR, ":utf8";
-
-print STDERR "Reading Model 1 probabilities from $model_file...\n";
-open M, "<$model_file" or die "Couldn't read $model_file: $!";
-binmode M, ":utf8";
-my %m1;
-while(<M>){
-  chomp;
-  my ($e,$f,$lp) = split /\s+/;
-  die unless defined $e;
-  die unless defined $f;
-  die unless defined $lp;
-  $m1{$f}->{$e} = $lp;
-}
-close M;
-
-open SRC, "<$src_file" or die "Can't read $src_file: $!";
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-binmode(SRC,":utf8");
-binmode(HYP,":utf8");
-binmode(STDOUT,":utf8");
-my @source; while(<SRC>){chomp; push @source, $_; }
-close SRC;
-my $src_len = scalar @source;
-print STDERR "Read $src_len sentences...\n";
-print STDERR "Rescoring...\n";
-
-my $cur = undef;
-my @hyps = ();
-my @feats = ();
-while(<HYP>) {
-  chomp;
-  my ($id, $hyp, $feats) = split / \|\|\| /;
-  unless (defined $cur) { $cur = $id; }
-  die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len;
-  if ($cur ne $id) {
-    rescore($cur, $source[$cur], \@hyps, \@feats);
-    $cur = $id;
-    @hyps = ();
-    @feats = ();
-  }
-  push @hyps, $hyp;
-  push @feats, $feats;
-}
-rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
-
-sub rescore {
-  my ($id, $src, $rh, $rf) = @_;
-  my @hyps = @$rh;
-  my @feats = @$rf;
-  my $nhyps = scalar @hyps;
-  my %cache = ();
-  print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
-  for (my $i=0; $i < $nhyps; $i++) {
-    my $score = $cache{$hyps[$i]};
-    if (!defined $score) {
-      if ($reverse_model) {
-        die "not implemented";
-      } else {
-        $score = m1_prob($src, $hyps[$i]);
-      }
-      $cache{$hyps[$i]} = $score;
-    }
-    print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n";
-  }
-
-}
-
-sub m1_prob {
-  my ($fsent, $esent) = @_;
-  die unless defined $fsent;
-  die unless defined $esent;
-  my @fwords = split /\s+/, $fsent;
-  my @ewords = split /\s+/, $esent;
-  push @ewords, "<eps>";
-  my $tp = 0;
-  for my $f (@fwords) {
-    my $m1f = $m1{$f};
-    if (!defined $m1f) { $m1f = {}; }
-    my $tfp = 0;
-    for my $e (@ewords) {
-      my $lp = $m1f->{$e};
-      if (!defined $lp) { $lp = -100; }
-      #print "P($f|$e) = $lp\n";
-      my $prob = exp($lp);
-      #if ($prob > $tfp) { $tfp = $prob; }
-      $tfp += $prob;
-    }
-    $tp += log($tfp);
-    $tp -= log(scalar @ewords);  # uniform probability of each generating word
-  }
-  return $tp;
-}
-
-sub usage {
-  print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n  Adds the back-translation probability under Model 1\n  Use training/model1 to generate the required parameter file\n";
-}
-
-
diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl
deleted file mode 100755
index cdd8c217..00000000
--- a/rescore/rescore_with_cdec_model.pl
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use utf8;
-my @ORIG_ARGV=@ARGV;
-use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
-use LocalConfig;
-use Getopt::Long;
-use IPC::Open2;
-use POSIX ":sys_wait_h";
-
-my $decoder = "$SCRIPT_DIR/../decoder/cdec";
-my $help;
-my $cdec_ini;
-my $src_file;
-my $hyp_file;
-my $reverse_model;
-my $weights_file;
-my $feature_name='NewModel';
-
-sub catch_pipe {
-  my $signame = shift;
-  die "$0 received SIGPIPE: did the decoder die?\n";
-}
-$SIG{PIPE} = \&catch_pipe;
-
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
-    "config|c=s" => \$cdec_ini,
-    "weights|w=s" => \$weights_file,
-    "source_file|s=s" => \$src_file,
-    "feature_name|f=s" => \$feature_name,
-    "hypothesis_file|h=s" => \$hyp_file,
-    "reverse" => \$reverse_model,  # if true translate hyp -> src
-    "decoder=s" => \$decoder,
-    "help" => \$help,
-) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) {
-  usage();
-  exit;
-}
-die "Can't find $decoder" unless -f $decoder;
-die "Can't run $decoder" unless -x $decoder;
-my $weights = '';
-if (defined $weights_file) {
-  die "Can't read $weights_file" unless -f $weights_file;
-  $weights = "-w $weights_file";
-}
-my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob";
-print STDERR "DECODER COMMAND: $decoder_command\n";
-my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command)
-  or die "Couldn't run $decoder: $!";
-sleep 1;
-
-die "Can't find $cdec_ini" unless -f $cdec_ini;
-open SRC, "<$src_file" or die "Can't read $src_file: $!";
-open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!";
-binmode(SRC,":utf8");
-binmode(HYP,":utf8");
-binmode(STDOUT,":utf8");
-my @source; while(<SRC>){chomp; push @source, $_; }
-close SRC;
-my $src_len = scalar @source;
-print STDERR "Read $src_len sentences...\n";
-binmode(CDEC_IN, ":utf8");
-binmode(CDEC_OUT, ":utf8");
-
-my $cur = undef;
-my @hyps = ();
-my @feats = ();
-while(<HYP>) {
-  chomp;
-  my ($id, $hyp, $feats) = split / \|\|\| /;
-  unless (defined $cur) { $cur = $id; }
-  die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len;
-  if ($cur ne $id) {
-    rescore($cur, $source[$cur], \@hyps, \@feats);
-    $cur = $id;
-    @hyps = ();
-    @feats = ();
-  }
-  push @hyps, $hyp;
-  push @feats, $feats;
-}
-rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
-
-close CDEC_IN;
-close CDEC_OUT;
-close HYP;
-waitpid($cdec_pid, 0);
-my $status = $? >> 8;
-if ($status != 0) {
-  print STDERR "Decoder returned bad status!\n";
-}
-
-sub rescore {
-  my ($id, $src, $rh, $rf) = @_;
-  my @hyps = @$rh;
-  my @feats = @$rf;
-  my $nhyps = scalar @hyps;
-  print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
-  for (my $i=0; $i < $nhyps; $i++) {
-    if ($reverse_model) {
-      print CDEC_OUT "<seg id=\"$id\">$hyps[$i] ||| $src</seg>\n";
-    } else {
-      print CDEC_OUT "<seg id=\"$id\">$src ||| $hyps[$i]</seg>\n";
-    }
-    my $score = <CDEC_IN>;
-    chomp $score;
-    my @words = split /\s+/, $hyps[$i];
-    print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n";
-  }
-}
-
-sub usage {
-  print <<EOT;
-Usage: $0 -c cdec.ini [-w cdec_weights.txt] -s source.txt -h hypothesis.nbest.txt [-f FeatureName]
-EOT
-  exit 0
-}
-
author	Patrick Simianer <p@simianer.de>	2012-03-13 09:24:47 +0100
committer	Patrick Simianer <p@simianer.de>	2012-03-13 09:24:47 +0100
commit	ef6085e558e26c8819f1735425761103021b6470 (patch)
tree	5cf70e4c48c64d838e1326b5a505c8c4061bff4a /rescore
parent	10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
parent	dfbc278c1057555fda9312291c8024049e00b7d8 (diff)