diff options
| author | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 | 
| commit | ef6085e558e26c8819f1735425761103021b6470 (patch) | |
| tree | 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /rescore | |
| parent | 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff) | |
| parent | dfbc278c1057555fda9312291c8024049e00b7d8 (diff) | |
merge with upstream
Diffstat (limited to 'rescore')
| -rwxr-xr-x | rescore/cdec_kbest_to_zmert.pl | 64 | ||||
| -rw-r--r-- | rescore/example/README | 4 | ||||
| -rw-r--r-- | rescore/example/cdec.ini | 2 | ||||
| -rw-r--r-- | rescore/example/hyp.txt | 5 | ||||
| -rw-r--r-- | rescore/example/small.scfg | 9 | ||||
| -rw-r--r-- | rescore/example/source.txt | 2 | ||||
| -rw-r--r-- | rescore/example/weights | 1 | ||||
| -rwxr-xr-x | rescore/generate_zmert_params_from_weights.pl | 26 | ||||
| -rwxr-xr-x | rescore/rerank.pl | 86 | ||||
| -rwxr-xr-x | rescore/rescore_inv_model1.pl | 126 | ||||
| -rwxr-xr-x | rescore/rescore_with_cdec_model.pl | 121 | 
11 files changed, 0 insertions, 446 deletions
| diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl deleted file mode 100755 index 88bc9682..00000000 --- a/rescore/cdec_kbest_to_zmert.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $feature_file; -my $hyp_file; -my $help; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( -    "feature_file|f=s" => \$feature_file, -    "hypothesis_file|h=s" => \$hyp_file, -    "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) { -  usage(); -  exit(1); -} - -open W, "<$feature_file" or die "Can't read $feature_file: $!"; -my %weights; -my @all_feats; -while(<W>) { -  chomp; -  next if /^#/; -  next if /^\s*$/; -  my ($fname, $w) = split /\s+/; -  push @all_feats, $fname; -  $weights{$fname} = 1; -} -close W; - -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while(<HYP>) { -  chomp; -  my ($id, $hyp, $feats) = split / \|\|\| /; -  my @afeats = split /\s+/, $feats; -  my $tot = 0; -  my %fvaldict; -  for my $featpair (@afeats) { -    my ($fname,$fval) = split /=/, $featpair; -    $fvaldict{$fname} = $fval; -    my $weight = $weights{$fname}; -    warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight; -    $weights{$fname} = 1; -  } -  my @trans; -  for my $feat (@all_feats) { -    my $v = $fvaldict{$feat}; -    if (!defined $v) { $v = '0.0'; } -    push @trans, $v; -  } -  print "$id ||| $hyp ||| @trans\n"; -} -close HYP; - -sub usage { -  print <<EOT; -Usage: $0 -f feature-file.txt/weights.txt -h hyp.nbest.txt -  Puts a cdec k-best list into Joshua/ZMERT format -EOT -} - diff --git a/rescore/example/README b/rescore/example/README deleted file mode 100644 index 92b657ca..00000000 --- a/rescore/example/README +++ /dev/null @@ -1,4 +0,0 @@ -Rescoring example: - -  ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt  -h hyp.txt  -w weights -f RescoringModel - diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini deleted file mode 100644 index 29a1ece3..00000000 --- a/rescore/example/cdec.ini +++ /dev/null @@ -1,2 +0,0 @@ -formalism=scfg -grammar=small.scfg diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt deleted file mode 100644 index c4757f6c..00000000 --- a/rescore/example/hyp.txt +++ /dev/null @@ -1,5 +0,0 @@ -0 ||| A B C ||| F1=1 F2=1 -0 ||| A b c ||| F1=1 F3=1 -0 ||| A C ||| F4=1 -1 ||| X Y ||| F5=1 -1 ||| XY ||| F6=1 diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg deleted file mode 100644 index 402a585a..00000000 --- a/rescore/example/small.scfg +++ /dev/null @@ -1,9 +0,0 @@ -[X] ||| a b c ||| A B C ||| fe=0.2 -[X] ||| a b ||| A B ||| fe=0.8 -[X] ||| c ||| C ||| fe=0.3 -[X] ||| c ||| c ||| fe=1.3 -[X] ||| a b c ||| A B c ||| fe=0.8 -[X] ||| a b c ||| A C ||| fe=2 -[X] ||| x ||| X ||| fe=0.2 -[X] ||| y ||| Y ||| fe=0.5 -[X] ||| x y ||| XY ||| fe=0.8 diff --git a/rescore/example/source.txt b/rescore/example/source.txt deleted file mode 100644 index e8d4eda2..00000000 --- a/rescore/example/source.txt +++ /dev/null @@ -1,2 +0,0 @@ -a b c -x y diff --git a/rescore/example/weights b/rescore/example/weights deleted file mode 100644 index a22d36f1..00000000 --- a/rescore/example/weights +++ /dev/null @@ -1 +0,0 @@ -fe -0.8 diff --git a/rescore/generate_zmert_params_from_weights.pl b/rescore/generate_zmert_params_from_weights.pl deleted file mode 100755 index a9287896..00000000 --- a/rescore/generate_zmert_params_from_weights.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl -w - -my %defaults; -$defaults{'LanguageModel'} = "Opt\t0\t10\t0\t2.5"; -$defaults{'EgivenF'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'LexEGivenF'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'LexFGivenE'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'PassThrough'} = "Opt\t-Inf\t+Inf\t-10\t0"; -$defaults{'WordPenalty'} = "Opt\t-Inf\t2\t-5\t0"; -my $DEFAULT = "Opt\t-Inf\t+Inf\t-1\t+1"; - -while(<>) { -  next if /^#/; -  chomp; -  next if /^\s*$/; -  s/^\s+//; -  s/\s+$//; -  my ($a,$b) = split /\s+/; -  next unless ($a && $b); -  my $line = $DEFAULT; -  if ($defaults{$a}) { $line = $defaults{$a}; } -  print "$a\t|||\t$b\t$line\n"; -} - -print "normalization = none\n"; - diff --git a/rescore/rerank.pl b/rescore/rerank.pl deleted file mode 100755 index 4a0c5750..00000000 --- a/rescore/rerank.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $weights_file; -my $hyp_file; -my $help; -my $kbest; # flag to extract reranked list - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( -    "weights_file|w=s" => \$weights_file, -    "hypothesis_file|h=s" => \$hyp_file, -    "kbest" => \$kbest, -    "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) { -  usage(); -  exit(1); -} - -open W, "<$weights_file" or die "Can't read $weights_file: $!"; -my %weights; -while(<W>) { -  chomp; -  next if /^#/; -  next if /^\s*$/; -  my ($fname, $w) = split /\s+/; -  $weights{$fname} = $w; -} -close W; - -my $cur = undef; -my %hyps = (); -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while(<HYP>) { -  chomp; -  my ($id, $hyp, $feats) = split / \|\|\| /; -  unless (defined $cur) { $cur = $id; } -  if ($cur ne $id) { -    extract_1best($cur, \%hyps); -    $cur = $id; -    %hyps = (); -  } -  my @afeats = split /\s+/, $feats; -  my $tot = 0; -  for my $featpair (@afeats) { -    my ($fname,$fval) = split /=/, $featpair; -    my $weight = $weights{$fname}; -    die "Unweighted feature '$fname'" unless defined $weight; -    $tot += ($weight * $fval); -  } -  $hyps{"$hyp ||| $feats"} = $tot; -} -extract_1best($cur, \%hyps) if defined $cur; -close HYP; - -sub extract_1best { -  my ($id, $rh) = @_; -  my %hyps = %$rh; -  if ($kbest) { -    for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) { -      print "$id ||| $hyp\n"; -    } -  } else { -    my $best_score = undef; -    my $best_hyp = undef; -    for my $hyp (keys %hyps) { -      if (!defined $best_score || $hyps{$hyp} > $best_score) { -        $best_score = $hyps{$hyp}; -        $best_hyp = $hyp; -      } -    } -    $best_hyp =~ s/ \|\|\|.*$//; -    print "$best_hyp\n"; -  } -} - -sub usage { -  print <<EOT; -Usage: $0 -w weights.txt -h hyp.nbest.txt [--kbest] -  Reranks n-best lists with new weights, extracting the new 1/k-best entries. -EOT -} - diff --git a/rescore/rescore_inv_model1.pl b/rescore/rescore_inv_model1.pl deleted file mode 100755 index 780452f5..00000000 --- a/rescore/rescore_inv_model1.pl +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $model_file; -my $src_file; -my $hyp_file; -my $help; -my $reverse_model; -my $feature_name='M1SrcGivenTrg'; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( -    "model_file|m=s" => \$model_file, -    "source_file|s=s" => \$src_file, -    "feature_name|f=s" => \$feature_name, -    "hypothesis_file|h=s" => \$hyp_file, -    "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) { -  usage(); -  exit; -} - -binmode STDIN, ":utf8"; -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - -print STDERR "Reading Model 1 probabilities from $model_file...\n"; -open M, "<$model_file" or die "Couldn't read $model_file: $!"; -binmode M, ":utf8"; -my %m1; -while(<M>){ -  chomp; -  my ($e,$f,$lp) = split /\s+/; -  die unless defined $e; -  die unless defined $f; -  die unless defined $lp; -  $m1{$f}->{$e} = $lp; -} -close M; - -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(<SRC>){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -print STDERR "Rescoring...\n"; - -my $cur = undef; -my @hyps = (); -my @feats = (); -while(<HYP>) { -  chomp; -  my ($id, $hyp, $feats) = split / \|\|\| /; -  unless (defined $cur) { $cur = $id; } -  die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; -  if ($cur ne $id) { -    rescore($cur, $source[$cur], \@hyps, \@feats); -    $cur = $id; -    @hyps = (); -    @feats = (); -  } -  push @hyps, $hyp; -  push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -sub rescore { -  my ($id, $src, $rh, $rf) = @_; -  my @hyps = @$rh; -  my @feats = @$rf; -  my $nhyps = scalar @hyps; -  my %cache = (); -  print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; -  for (my $i=0; $i < $nhyps; $i++) { -    my $score = $cache{$hyps[$i]}; -    if (!defined $score) { -      if ($reverse_model) { -        die "not implemented"; -      } else { -        $score = m1_prob($src, $hyps[$i]); -      } -      $cache{$hyps[$i]} = $score; -    } -    print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; -  } - -} - -sub m1_prob { -  my ($fsent, $esent) = @_; -  die unless defined $fsent; -  die unless defined $esent; -  my @fwords = split /\s+/, $fsent; -  my @ewords = split /\s+/, $esent; -  push @ewords, "<eps>"; -  my $tp = 0; -  for my $f (@fwords) { -    my $m1f = $m1{$f}; -    if (!defined $m1f) { $m1f = {}; } -    my $tfp = 0; -    for my $e (@ewords) { -      my $lp = $m1f->{$e}; -      if (!defined $lp) { $lp = -100; } -      #print "P($f|$e) = $lp\n"; -      my $prob = exp($lp); -      #if ($prob > $tfp) { $tfp = $prob; } -      $tfp += $prob; -    } -    $tp += log($tfp); -    $tp -= log(scalar @ewords);  # uniform probability of each generating word -  } -  return $tp; -} - -sub usage { -  print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n  Adds the back-translation probability under Model 1\n  Use training/model1 to generate the required parameter file\n"; -} - - diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl deleted file mode 100755 index cdd8c217..00000000 --- a/rescore/rescore_with_cdec_model.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; - -my $decoder = "$SCRIPT_DIR/../decoder/cdec"; -my $help; -my $cdec_ini; -my $src_file; -my $hyp_file; -my $reverse_model; -my $weights_file; -my $feature_name='NewModel'; - -sub catch_pipe { -  my $signame = shift; -  die "$0 received SIGPIPE: did the decoder die?\n"; -} -$SIG{PIPE} = \&catch_pipe; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( -    "config|c=s" => \$cdec_ini, -    "weights|w=s" => \$weights_file, -    "source_file|s=s" => \$src_file, -    "feature_name|f=s" => \$feature_name, -    "hypothesis_file|h=s" => \$hyp_file, -    "reverse" => \$reverse_model,  # if true translate hyp -> src -    "decoder=s" => \$decoder, -    "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) { -  usage(); -  exit; -} -die "Can't find $decoder" unless -f $decoder; -die "Can't run $decoder" unless -x $decoder; -my $weights = ''; -if (defined $weights_file) { -  die "Can't read $weights_file" unless -f $weights_file; -  $weights = "-w $weights_file"; -} -my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob"; -print STDERR "DECODER COMMAND: $decoder_command\n"; -my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command) -  or die "Couldn't run $decoder: $!"; -sleep 1; - -die "Can't find $cdec_ini" unless -f $cdec_ini; -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(<SRC>){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -binmode(CDEC_IN, ":utf8"); -binmode(CDEC_OUT, ":utf8"); - -my $cur = undef; -my @hyps = (); -my @feats = (); -while(<HYP>) { -  chomp; -  my ($id, $hyp, $feats) = split / \|\|\| /; -  unless (defined $cur) { $cur = $id; } -  die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; -  if ($cur ne $id) { -    rescore($cur, $source[$cur], \@hyps, \@feats); -    $cur = $id; -    @hyps = (); -    @feats = (); -  } -  push @hyps, $hyp; -  push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -close CDEC_IN; -close CDEC_OUT; -close HYP; -waitpid($cdec_pid, 0); -my $status = $? >> 8; -if ($status != 0) { -  print STDERR "Decoder returned bad status!\n"; -} - -sub rescore { -  my ($id, $src, $rh, $rf) = @_; -  my @hyps = @$rh; -  my @feats = @$rf; -  my $nhyps = scalar @hyps; -  print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; -  for (my $i=0; $i < $nhyps; $i++) { -    if ($reverse_model) { -      print CDEC_OUT "<seg id=\"$id\">$hyps[$i] ||| $src</seg>\n"; -    } else { -      print CDEC_OUT "<seg id=\"$id\">$src ||| $hyps[$i]</seg>\n"; -    } -    my $score = <CDEC_IN>; -    chomp $score; -    my @words = split /\s+/, $hyps[$i]; -    print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; -  } -} - -sub usage { -  print <<EOT; -Usage: $0 -c cdec.ini [-w cdec_weights.txt] -s source.txt -h hypothesis.nbest.txt [-f FeatureName] -EOT -  exit 0 -} - | 
