summaryrefslogtreecommitdiff
path: root/vest
diff options
context:
space:
mode:
Diffstat (limited to 'vest')
-rw-r--r--vest/Makefile.am32
-rw-r--r--vest/comb_scorer.cc81
-rwxr-xr-xvest/dist-vest.pl642
-rw-r--r--vest/error_surface.cc46
-rw-r--r--vest/fast_score.cc74
-rw-r--r--vest/line_optimizer.cc101
-rw-r--r--vest/lo_test.cc201
-rw-r--r--vest/mr_vest_generate_mapper_input.cc72
-rw-r--r--vest/mr_vest_map.cc98
-rw-r--r--vest/mr_vest_reduce.cc80
-rw-r--r--vest/scorer.cc485
-rw-r--r--vest/scorer_test.cc178
-rw-r--r--vest/ter.cc518
-rw-r--r--vest/test_data/0.json.gzbin0 -> 13709 bytes
-rw-r--r--vest/test_data/1.json.gzbin0 -> 204803 bytes
-rw-r--r--vest/test_data/c2e.txt.02
-rw-r--r--vest/test_data/c2e.txt.12
-rw-r--r--vest/test_data/c2e.txt.22
-rw-r--r--vest/test_data/c2e.txt.32
-rw-r--r--vest/test_data/re.txt.05
-rw-r--r--vest/test_data/re.txt.15
-rw-r--r--vest/test_data/re.txt.25
-rw-r--r--vest/test_data/re.txt.35
-rw-r--r--vest/union_forests.cc73
-rw-r--r--vest/viterbi_envelope.cc167
25 files changed, 2876 insertions, 0 deletions
diff --git a/vest/Makefile.am b/vest/Makefile.am
new file mode 100644
index 00000000..87c2383a
--- /dev/null
+++ b/vest/Makefile.am
@@ -0,0 +1,32 @@
+bin_PROGRAMS = \
+ mr_vest_map \
+ mr_vest_reduce \
+ scorer_test \
+ lo_test \
+ mr_vest_generate_mapper_input \
+ fast_score \
+ union_forests
+
+union_forests_SOURCES = union_forests.cc
+union_forests_LDADD = $(top_srcdir)/src/libhg.a -lz
+
+fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc scorer.cc viterbi_envelope.cc
+fast_score_LDADD = $(top_srcdir)/src/libhg.a -lz
+
+mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc
+mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/src/libhg.a -lz
+
+mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc
+mr_vest_map_LDADD = $(top_srcdir)/src/libhg.a -lz
+
+mr_vest_reduce_SOURCES = error_surface.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc
+mr_vest_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz
+
+scorer_test_SOURCES = scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc
+scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz
+
+lo_test_SOURCES = lo_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
+lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src
+AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB)
diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc
new file mode 100644
index 00000000..7b2187f4
--- /dev/null
+++ b/vest/comb_scorer.cc
@@ -0,0 +1,81 @@
+#include "comb_scorer.h"
+
+#include <cstdio>
+
+using namespace std;
+
+class BLEUTERCombinationScore : public Score {
+ friend class BLEUTERCombinationScorer;
+ public:
+ ~BLEUTERCombinationScore();
+ float ComputeScore() const {
+ return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f;
+ }
+ void ScoreDetails(string* details) const {
+ char buf[160];
+ sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f",
+ ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f);
+ *details = buf;
+ }
+ void PlusEquals(const Score& delta) {
+ bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu);
+ ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter);
+ }
+ Score* GetZero() const {
+ BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
+ res->bleu = bleu->GetZero();
+ res->ter = ter->GetZero();
+ return res;
+ }
+ void Subtract(const Score& rhs, Score* res) const {
+ bleu->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).bleu,
+ static_cast<BLEUTERCombinationScore*>(res)->bleu);
+ ter->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).ter,
+ static_cast<BLEUTERCombinationScore*>(res)->ter);
+ }
+ void Encode(std::string* out) const {
+ string bs, ts;
+ bleu->Encode(&bs);
+ ter->Encode(&ts);
+ out->clear();
+ (*out) += static_cast<char>(bs.size());
+ (*out) += bs;
+ (*out) += ts;
+ }
+ bool IsAdditiveIdentity() const {
+ return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity();
+ }
+ private:
+ Score* bleu;
+ Score* ter;
+};
+
+BLEUTERCombinationScore::~BLEUTERCombinationScore() {
+ delete bleu;
+ delete ter;
+}
+
+BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector<vector<WordID> >& refs) {
+ bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs);
+ ter_ = SentenceScorer::CreateSentenceScorer(TER, refs);
+}
+
+BLEUTERCombinationScorer::~BLEUTERCombinationScorer() {
+ delete bleu_;
+ delete ter_;
+}
+
+Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
+ BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
+ res->bleu = bleu_->ScoreCandidate(hyp);
+ res->ter = ter_->ScoreCandidate(hyp);
+ return res;
+}
+
+Score* BLEUTERCombinationScorer::ScoreFromString(const std::string& in) {
+ int bss = in[0];
+ BLEUTERCombinationScore* r = new BLEUTERCombinationScore;
+ r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss));
+ r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss));
+ return r;
+}
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
new file mode 100755
index 00000000..5528838c
--- /dev/null
+++ b/vest/dist-vest.pl
@@ -0,0 +1,642 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use IPC::Open2;
+use strict;
+use POSIX ":sys_wait_h";
+
+my $mydir = `dirname $0`;
+chomp $mydir;
+# Default settings
+my $srcFile = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt";
+my $refFiles = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*";
+my $bin_dir = "/fs/clip-software/cdec/bin";
+$bin_dir = "/Users/redpony/cdyer-svn-root/cdec/vest/bin_dir";
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/fast_score";
+die "Can't find $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input";
+my $MAPPER = "$bin_dir/mr_vest_map";
+my $REDUCER = "$bin_dir/mr_vest_reduce";
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $forestUnion = "$bin_dir/union_forests";
+die "Can't find $forestUnion" unless -x $forestUnion;
+my $cdec = "$bin_dir/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+my $decoder = $cdec;
+my $lines_per_mapper = 440;
+my $rand_directions = 10;
+my $iteration = 1;
+my $run_local = 0;
+my $best_weights;
+my $max_iterations = 40;
+my $optimization_iters = 6;
+my $num_rand_points = 20;
+my $mert_nodes = join(" ", grep(/^c\d\d$/, split(/\n/, `pbsnodes -a`))); # "1 1 1 1 1" fails due to file staging conflicts
+my $decode_nodes = "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"; # start 15 jobs
+my $pmem = "3g";
+my $disable_clean = 0;
+my %seen_weights;
+my $normalize;
+my $help = 0;
+my $epsilon = 0.0001;
+my $interval = 5;
+my $dryrun = 0;
+my $ranges;
+my $restart = 0;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $decoderOpt;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "decoder=s" => \$decoderOpt,
+ "decode-nodes=s" => \$decode_nodes,
+ "dont-clean" => \$disable_clean,
+ "dry-run" => \$dryrun,
+ "epsilon" => \$epsilon,
+ "help" => \$help,
+ "interval" => \$interval,
+ "iteration=i" => \$iteration,
+ "local" => \$run_local,
+ "max-iterations=i" => \$max_iterations,
+ "mert-nodes=s" => \$mert_nodes,
+ "normalize=s" => \$normalize,
+ "pmem=s" => \$pmem,
+ "ranges=s" => \$ranges,
+ "rand-directions=i" => \$rand_directions,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "restart" => \$restart,
+ "source-file=s" => \$srcFile,
+ "weights=s" => \$initialWeights,
+ "workdir=s" => \$dir
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 40;
+}
+
+($iniFile) = @ARGV;
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =`hostname`; chomp $host;
+my $bleu;
+my $best_bleu = 0.0;
+my $interval_count = 0;
+my $epsilon_bleu = $best_bleu + $epsilon;
+my $logfile;
+my $projected_score;
+
+my $refs_comma_sep = get_comma_sep_refs($refFiles);
+
+unless ($dir){
+ $dir = "vest";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = `pwd`;
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+if ($restart){
+ $iniFile = `ls $dir/*.ini`; chomp $iniFile;
+ unless (-e $iniFile){
+ die "ERROR: Could not find ini file in $dir to restart\n";
+ }
+ $logfile = "$dir/mert.log";
+ open(LOGFILE, ">>$logfile");
+ print LOGFILE "RESTARTING STOPPED OPTIMIZATION\n\n";
+
+ # figure out best weights so far and iteration number
+ open(LOG, "$dir/mert.log");
+ my $wi = 0;
+ while (my $line = <LOG>){
+ chomp $line;
+ if ($line =~ /ITERATION (\d+)/) {
+ $iteration = $1;
+ }
+ }
+
+ $iteration = $wi + 1;
+}
+
+if ($decoderOpt){ $decoder = $decoderOpt; }
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ `kill $pid`; }
+ for my $cmd (@cleanupcmds){`$cmd`; }
+ exit 1;
+};
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = `basename $decoder`; chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $parallelize = "$mydir/parallelize.pl";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir){
+ unless($restart){
+ die "ERROR: working dir $dir already exists\n\n";
+ }
+ } else {
+ mkdir $dir;
+ mkdir "$dir/hgs";
+ mkdir "$dir/hgs-current";
+ unless (-e $initialWeights) {
+ print STDERR "Please specify an initial weights file with --initial-weights\n";
+ print_help();
+ exit;
+ }
+ `cp $initialWeights $dir/weights.0`;
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ unless($restart){
+ $logfile = "$dir/mert.log";
+ open(LOGFILE, ">$logfile");
+ }
+ write_config(*LOGFILE);
+}
+
+
+# Generate initial files and values
+unless ($restart){ `cp $iniFile $newIniFile`; }
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+unless($restart){ enseg($srcFile, $newsrc); }
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print LOGFILE "\n\nITERATION $iteration\n==========\n";
+
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ `mkdir -p $logdir`;
+
+ #decode
+ print LOGFILE "DECODE\n";
+ print LOGFILE `date`;
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile -O $dir/hgs-current";
+ my $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -n \"$decode_nodes\" -- ";
+ if ($run_local) { $pcmd = "cat $srcFile |"; }
+ my $cmd = $pcmd . "$decoder_cmd 2> $decoderLog 1> $runFile";
+ print LOGFILE "COMMAND:\n$cmd\n";
+ my $result = 0;
+ $result = system($cmd);
+ unless ($result == 0){
+ cleanup();
+ print LOGFILE "ERROR: Parallel decoder returned non-zero exit code $result\n";
+ die;
+ }
+ my $dec_score = `cat $runFile | $SCORER $refs_comma_sep -l $metric`;
+ chomp $dec_score;
+ print LOGFILE "DECODER SCORE: $dec_score\n";
+
+ # save space
+ `gzip $runFile`;
+ `gzip $decoderLog`;
+
+ if ($iteration > $max_iterations){
+ print LOGFILE "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+
+ # run optimizer
+ print LOGFILE "\nUNION FORESTS\n";
+ print LOGFILE `date`;
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+ $cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize";
+ print LOGFILE "COMMAND:\n$cmd\n";
+ $result = system($cmd);
+ unless ($result == 0){
+ cleanup();
+ print LOGFILE "ERROR: merge command returned non-zero exit code $result\n";
+ die;
+ }
+ `rm -f $dir/hgs-current/*.json.gz`; # clean up old HGs, they've been moved to the repository
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
+ print LOGFILE "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
+ print LOGFILE `date`;
+ $icc++;
+ $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
+ print LOGFILE "COMMAND:\n$cmd\n";
+ $result = system($cmd);
+ unless ($result == 0){
+ cleanup();
+ print LOGFILE "ERROR: mapinput command returned non-zero exit code $result\n";
+ die;
+ }
+
+ `mkdir $dir/splag.$im1`;
+ $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
+ print LOGFILE "COMMAND:\n$cmd\n";
+ $result = system($cmd);
+ unless ($result == 0){
+ cleanup();
+ print LOGFILE "ERROR: split command returned non-zero exit code $result\n";
+ die;
+ }
+ opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+ my @shards = grep { /^mapinput\./ } readdir(DIR);
+ closedir DIR;
+ die "No shards!" unless scalar @shards > 0;
+ my $joblist = "";
+ my $nmappers = 0;
+ my @mapoutputs = ();
+ @cleanupcmds = ();
+ my %o2i = ();
+ my $first_shard = 1;
+ for my $shard (@shards) {
+ my $mapoutput = $shard;
+ my $client_name = $shard;
+ $client_name =~ s/mapinput.//;
+ $client_name = "fmert.$client_name";
+ $mapoutput =~ s/mapinput/mapoutput/;
+ push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+ $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+ my $script = "$MAPPER -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -k1 > $dir/splag.$im1/$mapoutput";
+ if ($run_local) {
+ print LOGFILE "COMMAND:\n$script\n";
+ $result = system($script);
+ unless ($result == 0){
+ cleanup();
+ print LOGFILE "ERROR: mapper returned non-zero exit code $result\n";
+ die;
+ }
+ } else {
+ my $todo = "qsub -q batch -l pmem=3000mb,walltime=5:00:00 -N $client_name -o /dev/null -e $logdir/$client_name.ER";
+ local(*QOUT, *QIN);
+ open2(\*QOUT, \*QIN, $todo);
+ print QIN $script;
+ if ($first_shard) { print LOGFILE "$script\n"; $first_shard=0; }
+ close QIN;
+ $nmappers++;
+ while (my $jobid=<QOUT>){
+ chomp $jobid;
+ push(@cleanupcmds, "`qdel $jobid 2> /dev/null`");
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ print STDERR "short job id $jobid\n";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ }
+ close QOUT;
+ }
+ }
+ if ($run_local) {
+ } else {
+ print LOGFILE "Launched $nmappers mappers.\n";
+ print LOGFILE "Waiting for mappers to complete...\n";
+ while ($nmappers > 0) {
+ sleep 5;
+ my @livejobs = grep(/$joblist/, split(/\n/, `qstat`));
+ $nmappers = scalar @livejobs;
+ }
+ print LOGFILE "All mappers complete.\n";
+ }
+ my $tol = 0;
+ my $til = 0;
+ for my $mo (@mapoutputs) {
+ my $olines = get_lines($mo);
+ my $ilines = get_lines($o2i{$mo});
+ $tol += $olines;
+ $til += $ilines;
+ die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
+ }
+ print LOGFILE "Results for $tol/$til lines\n";
+ print LOGFILE "\nSORTING AND RUNNING FMERT REDUCER\n";
+ print LOGFILE `date`;
+ $cmd="sort -k1 @mapoutputs | $REDUCER > $dir/redoutput.$im1";
+ print LOGFILE "COMMAND:\n$cmd\n";
+ $result = system($cmd);
+ unless ($result == 0){
+ cleanup();
+ print LOGFILE "ERROR: reducer command returned non-zero exit code $result\n";
+ die;
+ }
+ $cmd="sort -rnk3 '-t|' $dir/redoutput.$im1 | head -1";
+ my $best=`$cmd`; chomp $best;
+ print LOGFILE "$best\n";
+ my ($oa, $x, $xscore) = split /\|/, $best;
+ $score = $xscore;
+ print LOGFILE "PROJECTED SCORE: $score\n";
+ if (abs($x) < $epsilon) {
+ print LOGFILE "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
+ last;
+ }
+ my ($origin, $axis) = split /\s+/, $oa;
+
+ my %ori = convert($origin);
+ my %axi = convert($axis);
+
+ my $finalFile="$dir/weights.$im1-$opt_iter";
+ open W, ">$finalFile" or die "Can't write: $finalFile: $!";
+ for my $k (sort keys %ori) {
+ my $v = $ori{$k} + $axi{$k} * $x;
+ print W "$k $v\n";
+ }
+
+ `rm -rf $dir/splag.$im1`;
+ $inweights = $finalFile;
+ }
+ $lastWeightsFile = "$dir/weights.$iteration";
+ `cp $inweights $lastWeightsFile`;
+ if ($icc < 2) {
+ print LOGFILE "\nREACHED STOPPING CRITERION: score change too little\n";
+ last;
+ }
+ $lastPScore = $score;
+ $iteration++;
+ print LOGFILE "\n==========\n";
+}
+
+print LOGFILE "\nFINAL WEIGHTS: $dir/$lastWeightsFile\n(Use -w <this file> with hiero)\n\n";
+
+sub normalize_weights {
+ my ($rfn, $rpts, $feat) = @_;
+ my @feat_names = @$rfn;
+ my @pts = @$rpts;
+ my $z = 1.0;
+ for (my $i=0; $i < scalar @feat_names; $i++) {
+ if ($feat_names[$i] eq $feat) {
+ $z = $pts[$i];
+ last;
+ }
+ }
+ for (my $i=0; $i < scalar @feat_names; $i++) {
+ $pts[$i] /= $z;
+ }
+ print LOGFILE " NORM WEIGHTS: @pts\n";
+ return @pts;
+}
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+ return $lc;
+}
+
+sub get_comma_sep_refs {
+ my ($p) = @_;
+ my $o = `echo $p`;
+ chomp $o;
+ my @files = split /\s+/, $o;
+ return "-r " . join(' -r ', @files);
+}
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+# subs
+sub write_config {
+ my $fh = shift;
+ my $cleanup = "yes";
+ if ($disable_clean) {$cleanup = "no";}
+
+ print $fh "\n";
+ print $fh "DECODER: $decoder\n";
+ print $fh "INI FILE: $iniFile\n";
+ print $fh "WORKING DIR: $dir\n";
+ print $fh "SOURCE (DEV): $srcFile\n";
+ print $fh "REFS (DEV): $refFiles\n";
+ print $fh "EVAL METRIC: $metric\n";
+ print $fh "START ITERATION: $iteration\n";
+ print $fh "MAX ITERATIONS: $max_iterations\n";
+ print $fh "MERT NODES: $mert_nodes\n";
+ print $fh "DECODE NODES: $decode_nodes\n";
+ print $fh "HEAD NODE: $host\n";
+ print $fh "PMEM (DECODING): $pmem\n";
+ print $fh "CLEANUP: $cleanup\n";
+ print $fh "INITIAL WEIGHTS: $initialWeights\n";
+
+ if ($restart){
+ print $fh "PROJECTED BLEU: $projected_score\n";
+ print $fh "BEST WEIGHTS: $best_weights\n";
+ print $fh "BEST BLEU: $best_bleu\n";
+ }
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+
+ my $executable = `basename $0`; chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+ $executable --restart <work dir>
+
+ $executable [options] <ini file>
+ Runs a complete MERT optimization and test set decoding, using
+ the decoder configuration in ini file. Note that many of the
+ options have default values that are inferred automatically
+ based on certain conventions. For details, refer to descriptions
+ of the options --decoder, --weights, and --workdir.
+
+ $executable --restart <work dir>
+ Continues an optimization run that was stopped for some reason,
+ using configuration information found in the working directory
+ left behind by the stopped run.
+
+Options:
+
+ --local
+ Run the decoder and optimizer locally.
+
+ --decoder <decoder path>
+ Decoder binary to use.
+
+ --decode-nodes <nodelist>
+ A list of nodes used for parallel decoding. If specific nodes
+ are not desired, use "1" for each node requested. Defaults to
+ "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", which indicates a request for
+ 15 nodes.
+
+ --dont-clean
+ If present, this flag prevents intermediate files, including
+ run files and cumulative files, from being automatically removed
+ after a successful optimization run (these files are left if the
+ run fails for any reason). If used, a makefile containing
+ cleanup commands is written to the directory. To clean up
+ the intermediate files, invoke make without any arguments.
+
+ --dry-run
+ Prints out the settings and exits without doing anything.
+
+ --epsilon <epsilon>
+ Require that the dev set BLEU score improve by at least <epsilon>
+ within <interval> iterations (controlled by parameter --interval).
+ If not specified, defaults to .002.
+
+ --help
+ Print this message and exit.
+
+ --interval <i>
+ Require that the dev set BLEU score improve by at least <epsilon>
+ (controlled by parameter --epsilon) within <interval> iterations.
+ If not specified, defaults to 5.
+
+ --iteration <I>
+ Starting iteration number. If not specified, defaults to 1.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to 10.
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs,
+ in the format expected by qsub. If not specified, defaults to
+ 2g.
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted. If not specified, defaults to
+ /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*
+
+ --metric <method>
+ Metric to optimize. See fmert's --metric option for values.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --normalize <feature-name>
+ After each iteration, rescale all feature weights such that feature-
+ name has a weight of 1.0.
+
+ --rand-directions <num>
+ MERT will attempt to optimize along all of the principle directions,
+ set this parameter to explore other directions. Defaults to 5.
+
+ --source-file <file>
+ Dev set source file. If not specified, defaults to
+ /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt
+
+ --weights <file>
+ A file specifying initial feature weights. The format is
+ FeatureName_1 value1
+ FeatureName_2 value2
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, the
+ name is derived from the ini filename. Assuming that the ini
+ filename begins with the decoder name and ends with ini, the default
+ name of the working directory is inferred from the middle part of
+ the filename. E.g. an ini file named decoder.foo.ini would have
+ a default working directory name foo.
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
diff --git a/vest/error_surface.cc b/vest/error_surface.cc
new file mode 100644
index 00000000..4e0af35c
--- /dev/null
+++ b/vest/error_surface.cc
@@ -0,0 +1,46 @@
+#include "error_surface.h"
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+ErrorSurface::~ErrorSurface() {
+ for (ErrorSurface::iterator i = begin(); i != end(); ++i)
+ //delete i->delta;
+ ;
+}
+
+void ErrorSurface::Serialize(std::string* out) const {
+ const int segments = this->size();
+ ostringstream os(ios::binary);
+ os.write((const char*)&segments,sizeof(segments));
+ for (int i = 0; i < segments; ++i) {
+ const ErrorSegment& cur = (*this)[i];
+ string senc;
+ cur.delta->Encode(&senc);
+ assert(senc.size() < 256);
+ unsigned char len = senc.size();
+ os.write((const char*)&cur.x, sizeof(cur.x));
+ os.write((const char*)&len, sizeof(len));
+ os.write((const char*)&senc[0], len);
+ }
+ *out = os.str();
+}
+
+void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
+ istringstream is(in, ios::binary);
+ int segments;
+ is.read((char*)&segments, sizeof(segments));
+ this->resize(segments);
+ for (int i = 0; i < segments; ++i) {
+ ErrorSegment& cur = (*this)[i];
+ unsigned char len;
+ is.read((char*)&cur.x, sizeof(cur.x));
+ is.read((char*)&len, sizeof(len));
+ string senc(len, '\0'); assert(senc.size() == len);
+ is.read((char*)&senc[0], len);
+ cur.delta = SentenceScorer::CreateScoreFromString(type, senc);
+ }
+}
+
diff --git a/vest/fast_score.cc b/vest/fast_score.cc
new file mode 100644
index 00000000..45b60d78
--- /dev/null
+++ b/vest/fast_score.cc
@@ -0,0 +1,74 @@
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "tdict.h"
+#include "scorer.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
+ ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
+ ("in_file,i", po::value<string>()->default_value("-"), "Input file")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF1.TXT> -r <REF2.TXT> ...\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string loss_function = conf["loss_function"].as<string>();
+ ScoreType type = ScoreTypeFromString(loss_function);
+ DocScorer ds(type, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+
+ ReadFile rf(conf["in_file"].as<string>());
+ Score* acc = NULL;
+ istream& in = *rf.stream();
+ int lc = 0;
+ while(in) {
+ string line;
+ getline(in, line);
+ if (line.empty()) continue;
+ vector<WordID> sent;
+ TD::ConvertSentence(line, &sent);
+ Score* sentscore = ds[lc]->ScoreCandidate(sent);
+ if (!acc) { acc = sentscore->GetZero(); }
+ acc->PlusEquals(*sentscore);
+ delete sentscore;
+ ++lc;
+ }
+ assert(lc > 0);
+ if (lc > ds.size()) {
+ cerr << "Too many (" << lc << ") translations in input, expected " << ds.size() << endl;
+ return 1;
+ }
+ if (lc != ds.size())
+ cerr << "Fewer sentences in hyp (" << lc << ") than refs ("
+ << ds.size() << "): scoring partial set!\n";
+ float score = acc->ComputeScore();
+ string details;
+ acc->ScoreDetails(&details);
+ delete acc;
+ cerr << details << endl;
+ cout << score << endl;
+ return 0;
+}
diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc
new file mode 100644
index 00000000..98dcec34
--- /dev/null
+++ b/vest/line_optimizer.cc
@@ -0,0 +1,101 @@
+#include "line_optimizer.h"
+
+#include <limits>
+#include <algorithm>
+
+#include "sparse_vector.h"
+#include "scorer.h"
+
+using namespace std;
+
+typedef ErrorSurface::const_iterator ErrorIter;
+
+// sort by increasing x-ints
+struct IntervalComp {
+ bool operator() (const ErrorIter& a, const ErrorIter& b) const {
+ return a->x < b->x;
+ }
+};
+
+double LineOptimizer::LineOptimize(
+ const vector<ErrorSurface>& surfaces,
+ const LineOptimizer::ScoreType type,
+ float* best_score,
+ const double epsilon) {
+ vector<ErrorIter> all_ints;
+ for (vector<ErrorSurface>::const_iterator i = surfaces.begin();
+ i != surfaces.end(); ++i) {
+ const ErrorSurface& surface = *i;
+ for (ErrorIter j = surface.begin(); j != surface.end(); ++j)
+ all_ints.push_back(j);
+ }
+ sort(all_ints.begin(), all_ints.end(), IntervalComp());
+ double last_boundary = all_ints.front()->x;
+ Score* acc = all_ints.front()->delta->GetZero();
+ float& cur_best_score = *best_score;
+ cur_best_score = (type == MAXIMIZE_SCORE ?
+ -numeric_limits<float>::max() : numeric_limits<float>::max());
+ bool left_edge = true;
+ double pos = numeric_limits<double>::quiet_NaN();
+ for (vector<ErrorIter>::iterator i = all_ints.begin();
+ i != all_ints.end(); ++i) {
+ const ErrorSegment& seg = **i;
+ assert(seg.delta);
+ if (seg.x - last_boundary > epsilon) {
+ float sco = acc->ComputeScore();
+ if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+ (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+ cur_best_score = sco;
+ if (left_edge) {
+ pos = seg.x - 0.1;
+ left_edge = false;
+ } else {
+ pos = last_boundary + (seg.x - last_boundary) / 2;
+ }
+ // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n";
+ }
+ // cerr << "---- s=" << sco << "\n";
+ last_boundary = seg.x;
+ }
+ // cerr << "x-boundary=" << seg.x << "\n";
+ acc->PlusEquals(*seg.delta);
+ }
+ float sco = acc->ComputeScore();
+ if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+ (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+ cur_best_score = sco;
+ if (left_edge) {
+ pos = 0;
+ } else {
+ pos = last_boundary + 1000.0;
+ }
+ }
+ delete acc;
+ return pos;
+}
+
+void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize,
+ SparseVector<double>* axis,
+ RandomNumberGenerator<boost::mt19937>* rng) {
+ axis->clear();
+ for (int i = 0; i < features_to_optimize.size(); ++i)
+ axis->set_value(features_to_optimize[i], rng->next() - 0.5);
+ (*axis) /= axis->l2norm();
+}
+
+void LineOptimizer::CreateOptimizationDirections(
+ const vector<int>& features_to_optimize,
+ int additional_random_directions,
+ RandomNumberGenerator<boost::mt19937>* rng,
+ vector<SparseVector<double> >* dirs) {
+ const int num_directions = features_to_optimize.size() + additional_random_directions;
+ dirs->resize(num_directions);
+ for (int i = 0; i < num_directions; ++i) {
+ SparseVector<double>& axis = (*dirs)[i];
+ if (i < features_to_optimize.size())
+ axis.set_value(features_to_optimize[i], 1.0);
+ else
+ RandomUnitVector(features_to_optimize, &axis, rng);
+ }
+ cerr << "Generated " << num_directions << " total axes to optimize along.\n";
+}
diff --git a/vest/lo_test.cc b/vest/lo_test.cc
new file mode 100644
index 00000000..0acae5e0
--- /dev/null
+++ b/vest/lo_test.cc
@@ -0,0 +1,201 @@
+#include <cmath>
+#include <iostream>
+#include <fstream>
+
+#include <boost/shared_ptr.hpp>
+#include <gtest/gtest.h>
+
+#include "fdict.h"
+#include "hg.h"
+#include "kbest.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "inside_outside.h"
+#include "viterbi.h"
+#include "viterbi_envelope.h"
+#include "line_optimizer.h"
+#include "scorer.h"
+
+using namespace std;
+using boost::shared_ptr;
+
+class OptTest : public testing::Test {
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+};
+
+const char* ref11 = "australia reopens embassy in manila";
+const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .";
+const char* ref21 = "australia reopened manila embassy";
+const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .";
+const char* ref31 = "australia to reopen embassy in manila";
+const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats .";
+const char* ref41 = "australia to re - open its embassy to manila";
+const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago .";
+
+TEST_F(OptTest, TestCheckNaN) {
+ double x = 0;
+ double y = 0;
+ double z = x / y;
+ EXPECT_EQ(true, isnan(z));
+}
+
+TEST_F(OptTest,TestViterbiEnvelope) {
+ shared_ptr<Segment> a1(new Segment(-1, 0));
+ shared_ptr<Segment> b1(new Segment(1, 0));
+ shared_ptr<Segment> a2(new Segment(-1, 1));
+ shared_ptr<Segment> b2(new Segment(1, -1));
+ vector<shared_ptr<Segment> > sa; sa.push_back(a1); sa.push_back(b1);
+ vector<shared_ptr<Segment> > sb; sb.push_back(a2); sb.push_back(b2);
+ ViterbiEnvelope a(sa);
+ cerr << a << endl;
+ ViterbiEnvelope b(sb);
+ ViterbiEnvelope c = a;
+ c *= b;
+ cerr << a << " (*) " << b << " = " << c << endl;
+ EXPECT_EQ(3, c.size());
+}
+
+TEST_F(OptTest,TestViterbiEnvelopeInside) {
+ const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}";
+ Hypergraph hg;
+ istringstream instr(json);
+ HypergraphIO::ReadFromJSON(&instr, &hg);
+ SparseVector<double> wts;
+ wts.set_value(FD::Convert("f1"), 0.4);
+ wts.set_value(FD::Convert("f2"), 1.0);
+ hg.Reweight(wts);
+ vector<pair<vector<WordID>, prob_t> > list;
+ std::vector<SparseVector<double> > features;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+ SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0);
+ ViterbiEnvelopeWeightFunction wf(wts, dir);
+ ViterbiEnvelope env = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
+ cerr << env << endl;
+ const vector<boost::shared_ptr<Segment> >& segs = env.GetSortedSegs();
+ dir *= segs[1]->x;
+ wts += dir;
+ hg.Reweight(wts);
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest2.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+ for (int i = 0; i < segs.size(); ++i) {
+ cerr << "seg=" << i << endl;
+ vector<WordID> trans;
+ segs[i]->ConstructTranslation(&trans);
+ cerr << TD::GetString(trans) << endl;
+ }
+}
+
+TEST_F(OptTest, TestS1) {
+ int fPhraseModel_0 = FD::Convert("PhraseModel_0");
+ int fPhraseModel_1 = FD::Convert("PhraseModel_1");
+ int fPhraseModel_2 = FD::Convert("PhraseModel_2");
+ int fLanguageModel = FD::Convert("LanguageModel");
+ int fWordPenalty = FD::Convert("WordPenalty");
+ int fPassThrough = FD::Convert("PassThrough");
+ SparseVector<double> wts;
+ wts.set_value(fWordPenalty, 4.25);
+ wts.set_value(fLanguageModel, -1.1165);
+ wts.set_value(fPhraseModel_0, -0.96);
+ wts.set_value(fPhraseModel_1, -0.65);
+ wts.set_value(fPhraseModel_2, -0.77);
+ wts.set_value(fPassThrough, -10.0);
+
+ vector<int> to_optimize;
+ to_optimize.push_back(fWordPenalty);
+ to_optimize.push_back(fLanguageModel);
+ to_optimize.push_back(fPhraseModel_0);
+ to_optimize.push_back(fPhraseModel_1);
+ to_optimize.push_back(fPhraseModel_2);
+
+ Hypergraph hg;
+ ReadFile rf("./test_data/0.json.gz");
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(wts);
+
+ Hypergraph hg2;
+ ReadFile rf2("./test_data/1.json.gz");
+ HypergraphIO::ReadFromJSON(rf2.stream(), &hg2);
+ hg2.Reweight(wts);
+
+ vector<vector<WordID> > refs1(4);
+ TD::ConvertSentence(ref11, &refs1[0]);
+ TD::ConvertSentence(ref21, &refs1[1]);
+ TD::ConvertSentence(ref31, &refs1[2]);
+ TD::ConvertSentence(ref41, &refs1[3]);
+ vector<vector<WordID> > refs2(4);
+ TD::ConvertSentence(ref12, &refs2[0]);
+ TD::ConvertSentence(ref22, &refs2[1]);
+ TD::ConvertSentence(ref32, &refs2[2]);
+ TD::ConvertSentence(ref42, &refs2[3]);
+ ScoreType type = ScoreTypeFromString("ibm_bleu");
+ SentenceScorer* scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1);
+ SentenceScorer* scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2);
+ vector<ViterbiEnvelope> envs(2);
+
+ RandomNumberGenerator<boost::mt19937> rng;
+
+ vector<SparseVector<double> > axes;
+ LineOptimizer::CreateOptimizationDirections(
+ to_optimize,
+ 10,
+ &rng,
+ &axes);
+ assert(axes.size() == 10 + to_optimize.size());
+ for (int i = 0; i < axes.size(); ++i)
+ cerr << axes[i] << endl;
+ const SparseVector<double>& axis = axes[0];
+
+ cerr << "Computing Viterbi envelope using inside algorithm...\n";
+ cerr << "axis: " << axis << endl;
+ clock_t t_start=clock();
+ ViterbiEnvelopeWeightFunction wf(wts, axis);
+ envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
+ envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);
+
+ vector<ErrorSurface> es(2);
+ scorer1->ComputeErrorSurface(envs[0], &es[0]);
+ scorer2->ComputeErrorSurface(envs[1], &es[1]);
+ cerr << envs[0].size() << " " << envs[1].size() << endl;
+ cerr << es[0].size() << " " << es[1].size() << endl;
+ envs.clear();
+ clock_t t_env=clock();
+ float score;
+ double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score);
+ clock_t t_opt=clock();
+ cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
+ EXPECT_FLOAT_EQ(0.48719698, score);
+ SparseVector<double> res = axis;
+ res *= m;
+ res += wts;
+ cerr << "res: " << res << endl;
+ cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl;
+ cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl;
+ hg.Reweight(res);
+ hg2.Reweight(res);
+ vector<WordID> t1,t2;
+ ViterbiESentence(hg, &t1);
+ ViterbiESentence(hg2, &t2);
+ cerr << TD::GetString(t1) << endl;
+ cerr << TD::GetString(t2) << endl;
+ delete scorer1;
+ delete scorer2;
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
new file mode 100644
index 00000000..c96a61e4
--- /dev/null
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -0,0 +1,72 @@
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "line_optimizer.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+ ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+ ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
+ }
+ if (conf->count("weights") == 0) {
+ cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+ flag = true;
+ }
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r <DIR>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ RandomNumberGenerator<boost::mt19937> rng;
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ Weights weights;
+ vector<string> features;
+ weights.InitFromFile(conf["weights"].as<string>(), &features);
+ const string forest_repository = conf["forest_repository"].as<string>();
+ assert(DirectoryExists(forest_repository));
+ SparseVector<double> origin;
+ weights.InitSparseVector(&origin);
+ if (conf.count("optimize_feature") > 0)
+ features=conf["optimize_feature"].as<vector<string> >();
+ vector<SparseVector<double> > axes;
+ vector<int> fids(features.size());
+ for (int i = 0; i < features.size(); ++i)
+ fids[i] = FD::Convert(features[i]);
+ LineOptimizer::CreateOptimizationDirections(
+ fids,
+ conf["random_directions"].as<unsigned int>(),
+ &rng,
+ &axes);
+ int dev_set_size = conf["dev_set_size"].as<unsigned int>();
+ for (int i = 0; i < dev_set_size; ++i)
+ for (int j = 0; j < axes.size(); ++j)
+ cout << forest_repository << '/' << i << ".json.gz " << i << ' ' << origin << ' ' << axes[j] << endl;
+ return 0;
+}
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
new file mode 100644
index 00000000..80e84218
--- /dev/null
+++ b/vest/mr_vest_map.cc
@@ -0,0 +1,98 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "sparse_vector.h"
+#include "scorer.h"
+#include "viterbi_envelope.h"
+#include "inside_outside.h"
+#include "error_surface.h"
+#include "hg.h"
+#include "hg_io.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
+ vector<string> fields;
+ Tokenize(s, ';', &fields);
+ if (fields.empty()) return false;
+ for (int i = 0; i < fields.size(); ++i) {
+ vector<string> pair(2);
+ Tokenize(fields[i], '=', &pair);
+ if (pair.size() != 2) {
+ cerr << "Error parsing vector string: " << fields[i] << endl;
+ return false;
+ }
+ v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str()));
+ }
+ return true;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string loss_function = conf["loss_function"].as<string>();
+ ScoreType type = ScoreTypeFromString(loss_function);
+ DocScorer ds(type, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+ Hypergraph hg;
+ string last_file;
+ while(cin) {
+ string line;
+ getline(cin, line);
+ if (line.empty()) continue;
+ istringstream is(line);
+ int sent_id;
+ string file, s_origin, s_axis;
+ is >> file >> sent_id >> s_origin >> s_axis;
+ SparseVector<double> origin;
+ assert(ReadSparseVectorString(s_origin, &origin));
+ SparseVector<double> axis;
+ assert(ReadSparseVectorString(s_axis, &axis));
+ // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl;
+ if (last_file != file) {
+ last_file = file;
+ ReadFile rf(file);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ }
+ ViterbiEnvelopeWeightFunction wf(origin, axis);
+ ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
+ ErrorSurface es;
+ ds[sent_id]->ComputeErrorSurface(ve, &es);
+ //cerr << "Viterbi envelope has " << ve.size() << " segments\n";
+ cerr << "Error surface has " << es.size() << " segments\n";
+ string val;
+ es.Serialize(&val);
+ cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t';
+ B64::b64encode(val.c_str(), val.size(), &cout);
+ cout << endl;
+ }
+ return 0;
+}
diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc
new file mode 100644
index 00000000..c1347065
--- /dev/null
+++ b/vest/mr_vest_reduce.cc
@@ -0,0 +1,80 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "hg_io.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string loss_function = conf["loss_function"].as<string>();
+ ScoreType type = ScoreTypeFromString(loss_function);
+ LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
+ if (type == TER)
+ opt_type = LineOptimizer::MINIMIZE_SCORE;
+ string last_key;
+ vector<ErrorSurface> esv;
+ while(cin) {
+ string line;
+ getline(cin, line);
+ if (line.empty()) continue;
+ size_t ks = line.find("\t");
+ assert(string::npos != ks);
+ assert(ks > 2);
+ string key = line.substr(2, ks - 2);
+ string val = line.substr(ks + 1);
+ if (key != last_key) {
+ if (!last_key.empty()) {
+ float score;
+ double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ last_key = key;
+ esv.clear();
+ }
+ if (val.size() % 4 != 0) {
+ cerr << "B64 encoding error 1! Skipping.\n";
+ continue;
+ }
+ string encoded(val.size() / 4 * 3, '\0');
+ if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+ cerr << "B64 encoding error 2! Skipping.\n";
+ continue;
+ }
+ esv.push_back(ErrorSurface());
+ esv.back().Deserialize(type, encoded);
+ }
+ if (!esv.empty()) {
+ cerr << "ESV=" << esv.size() << endl;
+ for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
+ float score;
+ double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ return 0;
+}
diff --git a/vest/scorer.cc b/vest/scorer.cc
new file mode 100644
index 00000000..e242bb46
--- /dev/null
+++ b/vest/scorer.cc
@@ -0,0 +1,485 @@
+#include "scorer.h"
+
+#include <map>
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <cstdio>
+#include <valarray>
+
+#include <boost/shared_ptr.hpp>
+
+#include "viterbi_envelope.h"
+#include "error_surface.h"
+#include "ter.h"
+#include "comb_scorer.h"
+#include "tdict.h"
+#include "stringlib.h"
+
+using boost::shared_ptr;
+using namespace std;
+
+const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
+
+ScoreType ScoreTypeFromString(const std::string& st) {
+ const string sl = LowercaseString(st);
+ if (sl == "ser")
+ return SER;
+ if (sl == "ter")
+ return TER;
+ if (sl == "bleu" || sl == "ibm_bleu")
+ return IBM_BLEU;
+ if (sl == "nist_bleu")
+ return NIST_BLEU;
+ if (sl == "koehn_bleu")
+ return Koehn_BLEU;
+ if (sl == "combi")
+ return BLEU_minus_TER_over_2;
+ cerr << "Don't understand score type '" << sl << "', defaulting to ibm_bleu.\n";
+ return IBM_BLEU;
+}
+
+Score::~Score() {}
+SentenceScorer::~SentenceScorer() {}
+
+class SERScore : public Score {
+ friend class SERScorer;
+ public:
+ SERScore() : correct(0), total(0) {}
+ float ComputeScore() const {
+ return static_cast<float>(correct) / static_cast<float>(total);
+ }
+ void ScoreDetails(string* details) const {
+ ostringstream os;
+ os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';
+ *details = os.str();
+ }
+ void PlusEquals(const Score& delta) {
+ correct += static_cast<const SERScore&>(delta).correct;
+ total += static_cast<const SERScore&>(delta).total;
+ }
+ Score* GetZero() const { return new SERScore; }
+ void Subtract(const Score& rhs, Score* res) const {
+ SERScore* r = static_cast<SERScore*>(res);
+ r->correct = correct - static_cast<const SERScore&>(rhs).correct;
+ r->total = total - static_cast<const SERScore&>(rhs).total;
+ }
+ void Encode(std::string* out) const {
+ assert(!"not implemented");
+ }
+ bool IsAdditiveIdentity() const {
+ return (total == 0 && correct == 0); // correct is always 0 <= n <= total
+ }
+ private:
+ int correct, total;
+};
+
+class SERScorer : public SentenceScorer {
+ public:
+ SERScorer(const vector<vector<WordID> >& references) : refs_(references) {}
+ Score* ScoreCandidate(const std::vector<WordID>& hyp) const {
+ SERScore* res = new SERScore;
+ res->total = 1;
+ for (int i = 0; i < refs_.size(); ++i)
+ if (refs_[i] == hyp) res->correct = 1;
+ return res;
+ }
+ static Score* ScoreFromString(const std::string& data) {
+ assert(!"Not implemented");
+ }
+ private:
+ vector<vector<WordID> > refs_;
+};
+
+class BLEUScore : public Score {
+ friend class BLEUScorerBase;
+ public:
+ BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) {
+ ref_len = 0;
+ hyp_len = 0; }
+ float ComputeScore() const;
+ void ScoreDetails(string* details) const;
+ void PlusEquals(const Score& delta);
+ Score* GetZero() const;
+ void Subtract(const Score& rhs, Score* res) const;
+ void Encode(std::string* out) const;
+ bool IsAdditiveIdentity() const {
+ if (fabs(ref_len) > 0.1f || hyp_len != 0) return false;
+ for (int i = 0; i < correct_ngram_hit_counts.size(); ++i)
+ if (hyp_ngram_counts[i] != 0 ||
+ correct_ngram_hit_counts[i] != 0) return false;
+ return true;
+ }
+ private:
+ float ComputeScore(vector<float>* precs, float* bp) const;
+ valarray<int> correct_ngram_hit_counts;
+ valarray<int> hyp_ngram_counts;
+ float ref_len;
+ int hyp_len;
+};
+
+class BLEUScorerBase : public SentenceScorer {
+ public:
+ BLEUScorerBase(const std::vector<std::vector<WordID> >& references,
+ int n
+ );
+ Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+ static Score* ScoreFromString(const std::string& in);
+
+ protected:
+ virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0;
+ private:
+ struct NGramCompare {
+ int operator() (const vector<WordID>& a, const vector<WordID>& b) {
+ size_t as = a.size();
+ size_t bs = b.size();
+ const size_t s = (as < bs ? as : bs);
+ for (size_t i = 0; i < s; ++i) {
+ int d = a[i] - b[i];
+ if (d < 0) return true;
+ if (d > 0) return false;
+ }
+ return as < bs;
+ }
+ };
+ typedef map<vector<WordID>, pair<int,int>, NGramCompare> NGramCountMap;
+ void CountRef(const vector<WordID>& ref) {
+ NGramCountMap tc;
+ vector<WordID> ngram(n_);
+ int s = ref.size();
+ for (int j=0; j<s; ++j) {
+ int remaining = s-j;
+ int k = (n_ < remaining ? n_ : remaining);
+ ngram.clear();
+ for (int i=1; i<=k; ++i) {
+ ngram.push_back(ref[j + i - 1]);
+ tc[ngram].first++;
+ }
+ }
+ for (NGramCountMap::iterator i = tc.begin(); i != tc.end(); ++i) {
+ pair<int,int>& p = ngrams_[i->first];
+ if (p.first < i->second.first)
+ p = i->second;
+ }
+ }
+
+ void ComputeNgramStats(const vector<WordID>& sent,
+ valarray<int>* correct,
+ valarray<int>* hyp) const {
+ assert(correct->size() == n_);
+ assert(hyp->size() == n_);
+ vector<WordID> ngram(n_);
+ (*correct) *= 0;
+ (*hyp) *= 0;
+ int s = sent.size();
+ for (int j=0; j<s; ++j) {
+ int remaining = s-j;
+ int k = (n_ < remaining ? n_ : remaining);
+ ngram.clear();
+ for (int i=1; i<=k; ++i) {
+ ngram.push_back(sent[j + i - 1]);
+ pair<int,int>& p = ngrams_[ngram];
+ if (p.second < p.first) {
+ ++p.second;
+ (*correct)[i-1]++;
+ }
+ // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams:
+ if (!p.first) {
+ for (; i<=k; ++i)
+ (*hyp)[i-1]++;
+ } else {
+ (*hyp)[i-1]++;
+ }
+ }
+ }
+ }
+
+ mutable NGramCountMap ngrams_;
+ int n_;
+ vector<int> lengths_;
+};
+
+Score* BLEUScorerBase::ScoreFromString(const std::string& in) {
+ istringstream is(in);
+ int n;
+ is >> n;
+ BLEUScore* r = new BLEUScore(n);
+ is >> r->ref_len >> r->hyp_len;
+
+ for (int i = 0; i < n; ++i) {
+ is >> r->correct_ngram_hit_counts[i];
+ is >> r->hyp_ngram_counts[i];
+ }
+ return r;
+}
+
+class IBM_BLEUScorer : public BLEUScorerBase {
+ public:
+ IBM_BLEUScorer(const std::vector<std::vector<WordID> >& references,
+ int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) {
+ for (int i=0; i < references.size(); ++i)
+ lengths_[i] = references[i].size();
+ }
+ protected:
+ float ComputeRefLength(const vector<WordID>& hyp) const {
+ if (lengths_.size() == 1) return lengths_[0];
+ int bestd = 2000000;
+ int hl = hyp.size();
+ int bl = -1;
+ for (vector<int>::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) {
+ int cl = *ci;
+ if (abs(cl - hl) < bestd) {
+ bestd = abs(cl - hl);
+ bl = cl;
+ }
+ }
+ return bl;
+ }
+ private:
+ vector<int> lengths_;
+};
+
+class NIST_BLEUScorer : public BLEUScorerBase {
+ public:
+ NIST_BLEUScorer(const std::vector<std::vector<WordID> >& references,
+ int n=4) : BLEUScorerBase(references, n),
+ shortest_(references[0].size()) {
+ for (int i=1; i < references.size(); ++i)
+ if (references[i].size() < shortest_)
+ shortest_ = references[i].size();
+ }
+ protected:
+ float ComputeRefLength(const vector<WordID>& hyp) const {
+ return shortest_;
+ }
+ private:
+ float shortest_;
+};
+
+class Koehn_BLEUScorer : public BLEUScorerBase {
+ public:
+ Koehn_BLEUScorer(const std::vector<std::vector<WordID> >& references,
+ int n=4) : BLEUScorerBase(references, n),
+ avg_(0) {
+ for (int i=0; i < references.size(); ++i)
+ avg_ += references[i].size();
+ avg_ /= references.size();
+ }
+ protected:
+ float ComputeRefLength(const vector<WordID>& hyp) const {
+ return avg_;
+ }
+ private:
+ float avg_;
+};
+
+SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,
+ const std::vector<std::vector<WordID> >& refs) {
+ switch (type) {
+ case IBM_BLEU: return new IBM_BLEUScorer(refs, 4);
+ case NIST_BLEU: return new NIST_BLEUScorer(refs, 4);
+ case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4);
+ case TER: return new TERScorer(refs);
+ case SER: return new SERScorer(refs);
+ case BLEU_minus_TER_over_2: return new BLEUTERCombinationScorer(refs);
+ default:
+ assert(!"Not implemented!");
+ }
+}
+
+Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const std::string& in) {
+ switch (type) {
+ case IBM_BLEU:
+ case NIST_BLEU:
+ case Koehn_BLEU:
+ return BLEUScorerBase::ScoreFromString(in);
+ case TER:
+ return TERScorer::ScoreFromString(in);
+ case SER:
+ return SERScorer::ScoreFromString(in);
+ case BLEU_minus_TER_over_2:
+ return BLEUTERCombinationScorer::ScoreFromString(in);
+ default:
+ assert(!"Not implemented!");
+ }
+}
+
+void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env) const {
+ vector<WordID> prev_trans;
+ const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
+ env->resize(ienv.size());
+ Score* prev_score = NULL;
+ int j = 0;
+ for (int i = 0; i < ienv.size(); ++i) {
+ const Segment& seg = *ienv[i];
+ vector<WordID> trans;
+ seg.ConstructTranslation(&trans);
+ // cerr << "Scoring: " << TD::GetString(trans) << endl;
+ if (trans == prev_trans) {
+ if (!minimize_segments) {
+ assert(prev_score); // if this fails, it means
+ // the decoder can generate null translations
+ ErrorSegment& out = (*env)[j];
+ out.delta = prev_score->GetZero();
+ out.x = seg.x;
+ ++j;
+ }
+ // cerr << "Identical translation, skipping scoring\n";
+ } else {
+ Score* score = ScoreCandidate(trans);
+ // cerr << "score= " << score->ComputeScore() << "\n";
+ Score* cur_delta = score->GetZero();
+ // just record the score diffs
+ if (!prev_score)
+ prev_score = score->GetZero();
+
+ score->Subtract(*prev_score, cur_delta);
+ delete prev_score;
+ prev_trans.swap(trans);
+ prev_score = score;
+ if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
+ ErrorSegment& out = (*env)[j];
+ out.delta = cur_delta;
+ out.x = seg.x;
+ ++j;
+ }
+ }
+ }
+ delete prev_score;
+ // cerr << " In segments: " << ienv.size() << endl;
+ // cerr << "Out segments: " << j << endl;
+ assert(j > 0);
+ env->resize(j);
+}
+
+void BLEUScore::ScoreDetails(std::string* details) const {
+ char buf[2000];
+ vector<float> precs(4);
+ float bp;
+ float bleu = ComputeScore(&precs, &bp);
+ sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)",
+ bleu*100.0,
+ precs[0]*100.0,
+ precs[1]*100.0,
+ precs[2]*100.0,
+ precs[3]*100.0,
+ bp);
+ *details = buf;
+}
+
+float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
+ float log_bleu = 0;
+ if (precs) precs->clear();
+ int count = 0;
+ for (int i = 0; i < hyp_ngram_counts.size(); ++i) {
+ if (hyp_ngram_counts[i] > 0) {
+ float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]);
+ if (precs) precs->push_back(exp(lprec));
+ log_bleu += lprec;
+ ++count;
+ }
+ }
+ log_bleu /= static_cast<float>(count);
+ float lbp = 0.0;
+ if (hyp_len < ref_len)
+ lbp = (hyp_len - ref_len) / hyp_len;
+ log_bleu += lbp;
+ if (bp) *bp = exp(lbp);
+ return exp(log_bleu);
+}
+
+float BLEUScore::ComputeScore() const {
+ return ComputeScore(NULL, NULL);
+}
+
+void BLEUScore::Subtract(const Score& rhs, Score* res) const {
+ const BLEUScore& d = static_cast<const BLEUScore&>(rhs);
+ BLEUScore* o = static_cast<BLEUScore*>(res);
+ o->ref_len = ref_len - d.ref_len;
+ o->hyp_len = hyp_len - d.hyp_len;
+ o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts;
+ o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts;
+}
+
+void BLEUScore::PlusEquals(const Score& delta) {
+ const BLEUScore& d = static_cast<const BLEUScore&>(delta);
+ correct_ngram_hit_counts += d.correct_ngram_hit_counts;
+ hyp_ngram_counts += d.hyp_ngram_counts;
+ ref_len += d.ref_len;
+ hyp_len += d.hyp_len;
+}
+
+Score* BLEUScore::GetZero() const {
+ return new BLEUScore(hyp_ngram_counts.size());
+}
+
+void BLEUScore::Encode(std::string* out) const {
+ ostringstream os;
+ const int n = correct_ngram_hit_counts.size();
+ os << n << ' ' << ref_len << ' ' << hyp_len;
+ for (int i = 0; i < n; ++i)
+ os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i];
+ *out = os.str();
+}
+
+BLEUScorerBase::BLEUScorerBase(const std::vector<std::vector<WordID> >& references,
+ int n) : n_(n) {
+ for (vector<vector<WordID> >::const_iterator ci = references.begin();
+ ci != references.end(); ++ci) {
+ lengths_.push_back(ci->size());
+ CountRef(*ci);
+ }
+}
+
+Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const {
+ BLEUScore* bs = new BLEUScore(n_);
+ for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)
+ i->second.second = 0;
+ ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts);
+ bs->ref_len = ComputeRefLength(hyp);
+ bs->hyp_len = hyp.size();
+ return bs;
+}
+
+DocScorer::~DocScorer() {
+ for (int i=0; i < scorers_.size(); ++i)
+ delete scorers_[i];
+}
+
+DocScorer::DocScorer(
+ const ScoreType type,
+ const std::vector<std::string>& ref_files) {
+ // TODO stop using valarray, start using ReadFile
+ cerr << "Loading references (" << ref_files.size() << " files)\n";
+ valarray<ifstream> ifs(ref_files.size());
+ for (int i=0; i < ref_files.size(); ++i) {
+ ifs[i].open(ref_files[i].c_str());
+ assert(ifs[i].good());
+ }
+ char buf[64000];
+ bool expect_eof = false;
+ while (!ifs[0].eof()) {
+ vector<vector<WordID> > refs(ref_files.size());
+ for (int i=0; i < ref_files.size(); ++i) {
+ if (ifs[i].eof()) break;
+ ifs[i].getline(buf, 64000);
+ refs[i].clear();
+ if (strlen(buf) == 0) {
+ if (ifs[i].eof()) {
+ if (!expect_eof) {
+ assert(i == 0);
+ expect_eof = true;
+ }
+ break;
+ }
+ } else {
+ TD::ConvertSentence(buf, &refs[i]);
+ assert(!refs[i].empty());
+ }
+ assert(!expect_eof);
+ }
+ if (!expect_eof) scorers_.push_back(SentenceScorer::CreateSentenceScorer(type, refs));
+ }
+ cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n";
+}
+
diff --git a/vest/scorer_test.cc b/vest/scorer_test.cc
new file mode 100644
index 00000000..fca219cc
--- /dev/null
+++ b/vest/scorer_test.cc
@@ -0,0 +1,178 @@
+#include <iostream>
+#include <fstream>
+#include <valarray>
+#include <gtest/gtest.h>
+
+#include "tdict.h"
+#include "scorer.h"
+
+using namespace std;
+
+class ScorerTest : public testing::Test {
+ protected:
+ virtual void SetUp() {
+ refs0.resize(4);
+ refs1.resize(4);
+ TD::ConvertSentence("export of high-tech products in guangdong in first two months this year reached 3.76 billion us dollars", &refs0[0]);
+ TD::ConvertSentence("guangdong's export of new high technology products amounts to us $ 3.76 billion in first two months of this year", &refs0[1]);
+ TD::ConvertSentence("guangdong exports us $ 3.76 billion worth of high technology products in the first two months of this year", &refs0[2]);
+ TD::ConvertSentence("in the first 2 months this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars .", &refs0[3]);
+ TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter chen ji ) the latest statistics show that from january through february this year , the export of high-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% over the same period last year and accounted for 25.5 \% of the total export in the province .", &refs1[0]);
+ TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter : chen ji ) -- latest statistic indicates that guangdong's export of new high technology products amounts to us $ 3.76 billion , up 34.8 \% over corresponding period and accounts for 25.5 \% of the total exports of the province .", &refs1[1]);
+ TD::ConvertSentence("xinhua news agency report of march 16 from guangzhou ( by staff reporter chen ji ) - latest statistics indicate guangdong province exported us $ 3.76 billion worth of high technology products , up 34.8 percent from the same period last year , which account for 25.5 percent of the total exports of the province .", &refs1[2]);
+ TD::ConvertSentence("guangdong , march 16 , ( xinhua ) -- ( chen ji reports ) as the newest statistics shows , in january and feberuary this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% than last year , making up 25.5 \% of the province's total .", &refs1[3]);
+ TD::ConvertSentence("one guangdong province will next export us $ 3.76 high-tech product two months first this year 3.76 billion us dollars", &hyp1);
+ TD::ConvertSentence("xinhua news agency , guangzhou , 16th of march ( reporter chen ) -- latest statistics suggest that guangdong exports new advanced technology product totals $ 3.76 million , 34.8 percent last corresponding period and accounts for 25.5 percent of the total export province .", &hyp2);
+ }
+
+ virtual void TearDown() { }
+
+ vector<vector<WordID> > refs0;
+ vector<vector<WordID> > refs1;
+ vector<WordID> hyp1;
+ vector<WordID> hyp2;
+};
+
+TEST_F(ScorerTest, TestCreateFromFiles) {
+ vector<string> files;
+ files.push_back("test_data/re.txt.0");
+ files.push_back("test_data/re.txt.1");
+ files.push_back("test_data/re.txt.2");
+ files.push_back("test_data/re.txt.3");
+ DocScorer ds(IBM_BLEU, files);
+}
+
+TEST_F(ScorerTest, TestBLEUScorer) {
+ SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs0);
+ SentenceScorer* s2 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs1);
+ Score* b1 = s1->ScoreCandidate(hyp1);
+ EXPECT_FLOAT_EQ(0.23185077, b1->ComputeScore());
+ Score* b2 = s2->ScoreCandidate(hyp2);
+ EXPECT_FLOAT_EQ(0.38101241, b2->ComputeScore());
+ b1->PlusEquals(*b2);
+ EXPECT_FLOAT_EQ(0.348854, b1->ComputeScore());
+ EXPECT_FALSE(b1->IsAdditiveIdentity());
+ string details;
+ b1->ScoreDetails(&details);
+ EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details);
+ cerr << details << endl;
+ string enc;
+ b1->Encode(&enc);
+ Score* b3 = SentenceScorer::CreateScoreFromString(IBM_BLEU, enc);
+ details.clear();
+ cerr << "Encoded BLEU score size: " << enc.size() << endl;
+ b3->ScoreDetails(&details);
+ cerr << details << endl;
+ EXPECT_FALSE(b3->IsAdditiveIdentity());
+ EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details);
+ Score* bz = b3->GetZero();
+ EXPECT_TRUE(bz->IsAdditiveIdentity());
+ delete bz;
+ delete b1;
+ delete s1;
+ delete b2;
+ delete s2;
+}
+
+TEST_F(ScorerTest, TestTERScorer) {
+ SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(TER, refs0);
+ SentenceScorer* s2 = SentenceScorer::CreateSentenceScorer(TER, refs1);
+ string details;
+ Score* t1 = s1->ScoreCandidate(hyp1);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ cerr << t1->ComputeScore() << endl;
+ Score* t2 = s2->ScoreCandidate(hyp2);
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ cerr << t2->ComputeScore() << endl;
+ t1->PlusEquals(*t2);
+ cerr << t1->ComputeScore() << endl;
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details);
+ string enc;
+ t1->Encode(&enc);
+ Score* t3 = SentenceScorer::CreateScoreFromString(TER, enc);
+ details.clear();
+ t3->ScoreDetails(&details);
+ EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details);
+ EXPECT_FALSE(t3->IsAdditiveIdentity());
+ Score* tz = t3->GetZero();
+ EXPECT_TRUE(tz->IsAdditiveIdentity());
+ delete tz;
+ delete t3;
+ delete t1;
+ delete s1;
+ delete t2;
+ delete s2;
+}
+
+TEST_F(ScorerTest, TestTERScorerSimple) {
+ vector<vector<WordID> > ref(1);
+ TD::ConvertSentence("1 2 3 A B", &ref[0]);
+ vector<WordID> hyp;
+ TD::ConvertSentence("A B 1 2 3", &hyp);
+ SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(TER, ref);
+ string details;
+ Score* t1 = s1->ScoreCandidate(hyp);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ delete t1;
+ delete s1;
+}
+
+TEST_F(ScorerTest, TestSERScorerSimple) {
+ vector<vector<WordID> > ref(1);
+ TD::ConvertSentence("A B C D", &ref[0]);
+ vector<WordID> hyp1;
+ TD::ConvertSentence("A B C", &hyp1);
+ vector<WordID> hyp2;
+ TD::ConvertSentence("A B C D", &hyp2);
+ SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(SER, ref);
+ string details;
+ Score* t1 = s1->ScoreCandidate(hyp1);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ Score* t2 = s1->ScoreCandidate(hyp2);
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ t2->PlusEquals(*t1);
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ delete t1;
+ delete t2;
+ delete s1;
+}
+
+TEST_F(ScorerTest, TestCombiScorer) {
+ SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(BLEU_minus_TER_over_2, refs0);
+ string details;
+ Score* t1 = s1->ScoreCandidate(hyp1);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ cerr << t1->ComputeScore() << endl;
+ string enc;
+ t1->Encode(&enc);
+ Score* t2 = SentenceScorer::CreateScoreFromString(BLEU_minus_TER_over_2, enc);
+ details.clear();
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ Score* cz = t2->GetZero();
+ EXPECT_FALSE(t2->IsAdditiveIdentity());
+ EXPECT_TRUE(cz->IsAdditiveIdentity());
+ cz->PlusEquals(*t2);
+ EXPECT_FALSE(cz->IsAdditiveIdentity());
+ string d2;
+ cz->ScoreDetails(&d2);
+ EXPECT_EQ(d2, details);
+ delete cz;
+ delete t2;
+ delete t1;
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
diff --git a/vest/ter.cc b/vest/ter.cc
new file mode 100644
index 00000000..ef66f3b7
--- /dev/null
+++ b/vest/ter.cc
@@ -0,0 +1,518 @@
+#include "ter.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <tr1/unordered_map>
+#include <set>
+#include <valarray>
+#include <boost/functional/hash.hpp>
+
+#include "tdict.h"
+
+const bool ter_use_average_ref_len = true;
+const int ter_short_circuit_long_sentences = -1;
+
+using namespace std;
+using namespace std::tr1;
+
+struct COSTS {
+ static const float substitution;
+ static const float deletion;
+ static const float insertion;
+ static const float shift;
+};
+const float COSTS::substitution = 1.0f;
+const float COSTS::deletion = 1.0f;
+const float COSTS::insertion = 1.0f;
+const float COSTS::shift = 1.0f;
+
+static const int MAX_SHIFT_SIZE = 10;
+static const int MAX_SHIFT_DIST = 50;
+
+struct Shift {
+ unsigned int d_;
+ Shift() : d_() {}
+ Shift(int b, int e, int m) : d_() {
+ begin(b);
+ end(e);
+ moveto(m);
+ }
+ inline int begin() const {
+ return d_ & 0x3ff;
+ }
+ inline int end() const {
+ return (d_ >> 10) & 0x3ff;
+ }
+ inline int moveto() const {
+ int m = (d_ >> 20) & 0x7ff;
+ if (m > 1024) { m -= 1024; m *= -1; }
+ return m;
+ }
+ inline void begin(int b) {
+ d_ &= 0xfffffc00u;
+ d_ |= (b & 0x3ff);
+ }
+ inline void end(int e) {
+ d_ &= 0xfff003ffu;
+ d_ |= (e & 0x3ff) << 10;
+ }
+ inline void moveto(int m) {
+ bool neg = (m < 0);
+ if (neg) { m *= -1; m += 1024; }
+ d_ &= 0xfffff;
+ d_ |= (m & 0x7ff) << 20;
+ }
+};
+
+class TERScorerImpl {
+
+ public:
+ enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION };
+
+ explicit TERScorerImpl(const vector<WordID>& ref) : ref_(ref) {
+ for (int i = 0; i < ref.size(); ++i)
+ rwexists_.insert(ref[i]);
+ }
+
+ float Calculate(const vector<WordID>& hyp, int* subs, int* ins, int* dels, int* shifts) const {
+ return CalculateAllShifts(hyp, subs, ins, dels, shifts);
+ }
+
+ inline int GetRefLength() const {
+ return ref_.size();
+ }
+
+ private:
+ vector<WordID> ref_;
+ set<WordID> rwexists_;
+
+ typedef unordered_map<vector<WordID>, set<int>, boost::hash<vector<WordID> > > NgramToIntsMap;
+ mutable NgramToIntsMap nmap_;
+
+ static float MinimumEditDistance(
+ const vector<WordID>& hyp,
+ const vector<WordID>& ref,
+ vector<TransType>* path) {
+ vector<vector<TransType> > bmat(hyp.size() + 1, vector<TransType>(ref.size() + 1, MATCH));
+ vector<vector<float> > cmat(hyp.size() + 1, vector<float>(ref.size() + 1, 0));
+ for (int i = 0; i <= hyp.size(); ++i)
+ cmat[i][0] = i;
+ for (int j = 0; j <= ref.size(); ++j)
+ cmat[0][j] = j;
+ for (int i = 1; i <= hyp.size(); ++i) {
+ const WordID& hw = hyp[i-1];
+ for (int j = 1; j <= ref.size(); ++j) {
+ const WordID& rw = ref[j-1];
+ float& cur_c = cmat[i][j];
+ TransType& cur_b = bmat[i][j];
+
+ if (rw == hw) {
+ cur_c = cmat[i-1][j-1];
+ cur_b = MATCH;
+ } else {
+ cur_c = cmat[i-1][j-1] + COSTS::substitution;
+ cur_b = SUBSTITUTION;
+ }
+ float cwoi = cmat[i-1][j];
+ if (cur_c > cwoi + COSTS::insertion) {
+ cur_c = cwoi + COSTS::insertion;
+ cur_b = INSERTION;
+ }
+ float cwod = cmat[i][j-1];
+ if (cur_c > cwod + COSTS::deletion) {
+ cur_c = cwod + COSTS::deletion;
+ cur_b = DELETION;
+ }
+ }
+ }
+
+ // trace back along the best path and record the transition types
+ path->clear();
+ int i = hyp.size();
+ int j = ref.size();
+ while (i > 0 || j > 0) {
+ if (j == 0) {
+ --i;
+ path->push_back(INSERTION);
+ } else if (i == 0) {
+ --j;
+ path->push_back(DELETION);
+ } else {
+ TransType t = bmat[i][j];
+ path->push_back(t);
+ switch (t) {
+ case SUBSTITUTION:
+ case MATCH:
+ --i; --j; break;
+ case INSERTION:
+ --i; break;
+ case DELETION:
+ --j; break;
+ }
+ }
+ }
+ reverse(path->begin(), path->end());
+ return cmat[hyp.size()][ref.size()];
+ }
+
+ void BuildWordMatches(const vector<WordID>& hyp, NgramToIntsMap* nmap) const {
+ nmap->clear();
+ set<WordID> exists_both;
+ for (int i = 0; i < hyp.size(); ++i)
+ if (rwexists_.find(hyp[i]) != rwexists_.end())
+ exists_both.insert(hyp[i]);
+ for (int start=0; start<ref_.size(); ++start) {
+ if (exists_both.find(ref_[start]) == exists_both.end()) continue;
+ vector<WordID> cp;
+ int mlen = min(MAX_SHIFT_SIZE, static_cast<int>(ref_.size() - start));
+ for (int len=0; len<mlen; ++len) {
+ if (len && exists_both.find(ref_[start + len]) == exists_both.end()) break;
+ cp.push_back(ref_[start + len]);
+ (*nmap)[cp].insert(start);
+ }
+ }
+ }
+
+ static void PerformShift(const vector<WordID>& in,
+ int start, int end, int moveto, vector<WordID>* out) {
+ // cerr << "ps: " << start << " " << end << " " << moveto << endl;
+ out->clear();
+ if (moveto == -1) {
+ for (int i = start; i <= end; ++i)
+ out->push_back(in[i]);
+ for (int i = 0; i < start; ++i)
+ out->push_back(in[i]);
+ for (int i = end+1; i < in.size(); ++i)
+ out->push_back(in[i]);
+ } else if (moveto < start) {
+ for (int i = 0; i <= moveto; ++i)
+ out->push_back(in[i]);
+ for (int i = start; i <= end; ++i)
+ out->push_back(in[i]);
+ for (int i = moveto+1; i < start; ++i)
+ out->push_back(in[i]);
+ for (int i = end+1; i < in.size(); ++i)
+ out->push_back(in[i]);
+ } else if (moveto > end) {
+ for (int i = 0; i < start; ++i)
+ out->push_back(in[i]);
+ for (int i = end+1; i <= moveto; ++i)
+ out->push_back(in[i]);
+ for (int i = start; i <= end; ++i)
+ out->push_back(in[i]);
+ for (int i = moveto+1; i < in.size(); ++i)
+ out->push_back(in[i]);
+ } else {
+ for (int i = 0; i < start; ++i)
+ out->push_back(in[i]);
+ for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i)
+ out->push_back(in[i]);
+ for (int i = start; i <= end; ++i)
+ out->push_back(in[i]);
+ for (int i = (end + (moveto - start))+1; i < in.size(); ++i)
+ out->push_back(in[i]);
+ }
+ if (out->size() != in.size()) {
+ cerr << "ps: " << start << " " << end << " " << moveto << endl;
+ cerr << "in=" << TD::GetString(in) << endl;
+ cerr << "out=" << TD::GetString(*out) << endl;
+ }
+ assert(out->size() == in.size());
+ // cerr << "ps: " << TD::GetString(*out) << endl;
+ }
+
+ void GetAllPossibleShifts(const vector<WordID>& hyp,
+ const vector<int>& ralign,
+ const vector<bool>& herr,
+ const vector<bool>& rerr,
+ const int min_size,
+ vector<vector<Shift> >* shifts) const {
+ for (int start = 0; start < hyp.size(); ++start) {
+ vector<WordID> cp(1, hyp[start]);
+ NgramToIntsMap::iterator niter = nmap_.find(cp);
+ if (niter == nmap_.end()) continue;
+ bool ok = false;
+ int moveto;
+ for (set<int>::iterator i = niter->second.begin(); i != niter->second.end(); ++i) {
+ moveto = *i;
+ int rm = ralign[moveto];
+ ok = (start != rm &&
+ (rm - start) < MAX_SHIFT_DIST &&
+ (start - rm - 1) < MAX_SHIFT_DIST);
+ if (ok) break;
+ }
+ if (!ok) continue;
+ cp.clear();
+ for (int end = start + min_size - 1;
+ ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) {
+ cp.push_back(hyp[end]);
+ vector<Shift>& sshifts = (*shifts)[end - start];
+ ok = false;
+ NgramToIntsMap::iterator niter = nmap_.find(cp);
+ if (niter == nmap_.end()) break;
+ bool any_herr = false;
+ for (int i = start; i <= end && !any_herr; ++i)
+ any_herr = herr[i];
+ if (!any_herr) {
+ ok = true;
+ continue;
+ }
+ for (set<int>::iterator mi = niter->second.begin();
+ mi != niter->second.end(); ++mi) {
+ int moveto = *mi;
+ int rm = ralign[moveto];
+ if (! ((rm != start) &&
+ ((rm < start) || (rm > end)) &&
+ (rm - start <= MAX_SHIFT_DIST) &&
+ ((start - rm - 1) <= MAX_SHIFT_DIST))) continue;
+ ok = true;
+ bool any_rerr = false;
+ for (int i = 0; (i <= end - start) && (!any_rerr); ++i)
+ any_rerr = rerr[moveto+i];
+ if (!any_rerr) continue;
+ for (int roff = 0; roff <= (end - start); ++roff) {
+ int rmr = ralign[moveto+roff];
+ if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto])))
+ sshifts.push_back(Shift(start, end, moveto + roff));
+ }
+ }
+ }
+ }
+ }
+
+ bool CalculateBestShift(const vector<WordID>& cur,
+ const vector<WordID>& hyp,
+ float curerr,
+ const vector<TransType>& path,
+ vector<WordID>* new_hyp,
+ float* newerr,
+ vector<TransType>* new_path) const {
+ vector<bool> herr, rerr;
+ vector<int> ralign;
+ int hpos = -1;
+ for (int i = 0; i < path.size(); ++i) {
+ switch (path[i]) {
+ case MATCH:
+ ++hpos;
+ herr.push_back(false);
+ rerr.push_back(false);
+ ralign.push_back(hpos);
+ break;
+ case SUBSTITUTION:
+ ++hpos;
+ herr.push_back(true);
+ rerr.push_back(true);
+ ralign.push_back(hpos);
+ break;
+ case INSERTION:
+ ++hpos;
+ herr.push_back(true);
+ break;
+ case DELETION:
+ rerr.push_back(true);
+ ralign.push_back(hpos);
+ break;
+ }
+ }
+#if 0
+ cerr << "RALIGN: ";
+ for (int i = 0; i < rerr.size(); ++i)
+ cerr << ralign[i] << " ";
+ cerr << endl;
+ cerr << "RERR: ";
+ for (int i = 0; i < rerr.size(); ++i)
+ cerr << (bool)rerr[i] << " ";
+ cerr << endl;
+ cerr << "HERR: ";
+ for (int i = 0; i < herr.size(); ++i)
+ cerr << (bool)herr[i] << " ";
+ cerr << endl;
+#endif
+
+ vector<vector<Shift> > shifts(MAX_SHIFT_SIZE + 1);
+ GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts);
+ float cur_best_shift_cost = 0;
+ *newerr = curerr;
+ vector<TransType> cur_best_path;
+ vector<WordID> cur_best_hyp;
+
+ bool res = false;
+ for (int i = shifts.size() - 1; i >=0; --i) {
+ float curfix = curerr - (cur_best_shift_cost + *newerr);
+ float maxfix = 2.0f * (1 + i) - COSTS::shift;
+ if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break;
+ for (int j = 0; j < shifts[i].size(); ++j) {
+ const Shift& s = shifts[i][j];
+ curfix = curerr - (cur_best_shift_cost + *newerr);
+ maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove?
+ if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue;
+ vector<WordID> shifted(cur.size());
+ PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted);
+ vector<TransType> try_path;
+ float try_cost = MinimumEditDistance(shifted, ref_, &try_path);
+ float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift);
+ if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) {
+ *newerr = try_cost;
+ cur_best_shift_cost = COSTS::shift;
+ new_path->swap(try_path);
+ new_hyp->swap(shifted);
+ res = true;
+ // cerr << "Found better shift " << s.begin() << "..." << s.end() << " moveto " << s.moveto() << endl;
+ }
+ }
+ }
+
+ return res;
+ }
+
+ static void GetPathStats(const vector<TransType>& path, int* subs, int* ins, int* dels) {
+ *subs = *ins = *dels = 0;
+ for (int i = 0; i < path.size(); ++i) {
+ switch (path[i]) {
+ case SUBSTITUTION:
+ ++(*subs);
+ case MATCH:
+ break;
+ case INSERTION:
+ ++(*ins); break;
+ case DELETION:
+ ++(*dels); break;
+ }
+ }
+ }
+
+ float CalculateAllShifts(const vector<WordID>& hyp,
+ int* subs, int* ins, int* dels, int* shifts) const {
+ BuildWordMatches(hyp, &nmap_);
+ vector<TransType> path;
+ float med_cost = MinimumEditDistance(hyp, ref_, &path);
+ float edits = 0;
+ vector<WordID> cur = hyp;
+ *shifts = 0;
+ if (ter_short_circuit_long_sentences < 0 ||
+ ref_.size() < ter_short_circuit_long_sentences) {
+ while (true) {
+ vector<WordID> new_hyp;
+ vector<TransType> new_path;
+ float new_med_cost;
+ if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path))
+ break;
+ edits += COSTS::shift;
+ ++(*shifts);
+ med_cost = new_med_cost;
+ path.swap(new_path);
+ cur.swap(new_hyp);
+ }
+ }
+ GetPathStats(path, subs, ins, dels);
+ return med_cost + edits;
+ }
+};
+
+class TERScore : public Score {
+ friend class TERScorer;
+
+ public:
+ static const unsigned kINSERTIONS = 0;
+ static const unsigned kDELETIONS = 1;
+ static const unsigned kSUBSTITUTIONS = 2;
+ static const unsigned kSHIFTS = 3;
+ static const unsigned kREF_WORDCOUNT = 4;
+ static const unsigned kDUMMY_LAST_ENTRY = 5;
+
+ TERScore() : stats(0,kDUMMY_LAST_ENTRY) {}
+ float ComputeScore() const {
+ float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]);
+ return edits / static_cast<float>(stats[kREF_WORDCOUNT]);
+ }
+ void ScoreDetails(string* details) const;
+ void PlusEquals(const Score& delta) {
+ stats += static_cast<const TERScore&>(delta).stats;
+ }
+ Score* GetZero() const {
+ return new TERScore;
+ }
+ void Subtract(const Score& rhs, Score* res) const {
+ static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats;
+ }
+ void Encode(std::string* out) const {
+ ostringstream os;
+ os << stats[kINSERTIONS] << ' '
+ << stats[kDELETIONS] << ' '
+ << stats[kSUBSTITUTIONS] << ' '
+ << stats[kSHIFTS] << ' '
+ << stats[kREF_WORDCOUNT];
+ *out = os.str();
+ }
+ bool IsAdditiveIdentity() const {
+ for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i)
+ if (stats[i] != 0) return false;
+ return true;
+ }
+ private:
+ valarray<int> stats;
+};
+
+Score* TERScorer::ScoreFromString(const std::string& data) {
+ istringstream is(data);
+ TERScore* r = new TERScore;
+ is >> r->stats[TERScore::kINSERTIONS]
+ >> r->stats[TERScore::kDELETIONS]
+ >> r->stats[TERScore::kSUBSTITUTIONS]
+ >> r->stats[TERScore::kSHIFTS]
+ >> r->stats[TERScore::kREF_WORDCOUNT];
+ return r;
+}
+
+void TERScore::ScoreDetails(std::string* details) const {
+ char buf[200];
+ sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)",
+ ComputeScore() * 100.0f,
+ stats[kINSERTIONS],
+ stats[kDELETIONS],
+ stats[kSUBSTITUTIONS],
+ stats[kSHIFTS],
+ stats[kREF_WORDCOUNT]);
+ *details = buf;
+}
+
+TERScorer::~TERScorer() {
+ for (vector<TERScorerImpl*>::iterator i = impl_.begin(); i != impl_.end(); ++i)
+ delete *i;
+}
+
+TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) {
+ for (int i = 0; i < refs.size(); ++i)
+ impl_[i] = new TERScorerImpl(refs[i]);
+}
+
+Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
+ float best_score = numeric_limits<float>::max();
+ TERScore* res = new TERScore;
+ int avg_len = 0;
+ for (int i = 0; i < impl_.size(); ++i)
+ avg_len += impl_[i]->GetRefLength();
+ avg_len /= impl_.size();
+ for (int i = 0; i < impl_.size(); ++i) {
+ int subs, ins, dels, shifts;
+ float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts);
+ // cerr << "Component TER cost: " << score << endl;
+ if (score < best_score) {
+ res->stats[TERScore::kINSERTIONS] = ins;
+ res->stats[TERScore::kDELETIONS] = dels;
+ res->stats[TERScore::kSUBSTITUTIONS] = subs;
+ res->stats[TERScore::kSHIFTS] = shifts;
+ if (ter_use_average_ref_len) {
+ res->stats[TERScore::kREF_WORDCOUNT] = avg_len;
+ } else {
+ res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength();
+ }
+
+ best_score = score;
+ }
+ }
+ return res;
+}
diff --git a/vest/test_data/0.json.gz b/vest/test_data/0.json.gz
new file mode 100644
index 00000000..30f8dd77
--- /dev/null
+++ b/vest/test_data/0.json.gz
Binary files differ
diff --git a/vest/test_data/1.json.gz b/vest/test_data/1.json.gz
new file mode 100644
index 00000000..c82cc179
--- /dev/null
+++ b/vest/test_data/1.json.gz
Binary files differ
diff --git a/vest/test_data/c2e.txt.0 b/vest/test_data/c2e.txt.0
new file mode 100644
index 00000000..12c4abe9
--- /dev/null
+++ b/vest/test_data/c2e.txt.0
@@ -0,0 +1,2 @@
+australia reopens embassy in manila
+( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .
diff --git a/vest/test_data/c2e.txt.1 b/vest/test_data/c2e.txt.1
new file mode 100644
index 00000000..4ac12df1
--- /dev/null
+++ b/vest/test_data/c2e.txt.1
@@ -0,0 +1,2 @@
+australia reopened manila embassy
+( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .
diff --git a/vest/test_data/c2e.txt.2 b/vest/test_data/c2e.txt.2
new file mode 100644
index 00000000..2f67b72f
--- /dev/null
+++ b/vest/test_data/c2e.txt.2
@@ -0,0 +1,2 @@
+australia to reopen embassy in manila
+( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats .
diff --git a/vest/test_data/c2e.txt.3 b/vest/test_data/c2e.txt.3
new file mode 100644
index 00000000..5483cef6
--- /dev/null
+++ b/vest/test_data/c2e.txt.3
@@ -0,0 +1,2 @@
+australia to re - open its embassy to manila
+( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago .
diff --git a/vest/test_data/re.txt.0 b/vest/test_data/re.txt.0
new file mode 100644
index 00000000..86eff087
--- /dev/null
+++ b/vest/test_data/re.txt.0
@@ -0,0 +1,5 @@
+erdogan states turkey to reject any pressures to urge it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened .
+erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus .
+we will discuss this dossier in the course of membership negotiations . "
+he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . "
diff --git a/vest/test_data/re.txt.1 b/vest/test_data/re.txt.1
new file mode 100644
index 00000000..2140f198
--- /dev/null
+++ b/vest/test_data/re.txt.1
@@ -0,0 +1,5 @@
+erdogan confirms turkey will resist any pressure to recognize cyprus
+ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara .
+erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus .
+we shall discuss this issue in the course of the membership negotiations . "
+he added : " let me be clear - i cannot confine turkey . this is something we do not accept . "
diff --git a/vest/test_data/re.txt.2 b/vest/test_data/re.txt.2
new file mode 100644
index 00000000..94e46286
--- /dev/null
+++ b/vest/test_data/re.txt.2
@@ -0,0 +1,5 @@
+erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus
+ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara .
+erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus .
+we shall discuss this dossier during the negotiations on joining . "
+and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . "
diff --git a/vest/test_data/re.txt.3 b/vest/test_data/re.txt.3
new file mode 100644
index 00000000..f87c3308
--- /dev/null
+++ b/vest/test_data/re.txt.3
@@ -0,0 +1,5 @@
+erdogan stresses that turkey will reject all pressures to force it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not .
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus .
+we will discuss this file during the negotiations on joining . "
+he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . "
diff --git a/vest/union_forests.cc b/vest/union_forests.cc
new file mode 100644
index 00000000..207ecb5c
--- /dev/null
+++ b/vest/union_forests.cc
@@ -0,0 +1,73 @@
+#include <iostream>
+#include <string>
+#include <sstream>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "hg.h"
+#include "hg_io.h"
+#include "filelib.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("new_forest_repository,n",po::value<string>(),"[REQD] Path to new forest repository")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
+ }
+ if (conf->count("new_forest_repository") == 0) {
+ cerr << "Please specify the starting-point weights using -n PATH\n";
+ flag = true;
+ }
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r PATH\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const int size = conf["dev_set_size"].as<unsigned int>();
+ const string repo = conf["forest_repository"].as<string>();
+ const string new_repo = conf["new_forest_repository"].as<string>();
+ for (int i = 0; i < size; ++i) {
+ ostringstream sfin, sfout;
+ sfin << new_repo << '/' << i << ".json.gz";
+ sfout << repo << '/' << i << ".json.gz";
+ const string fin = sfin.str();
+ const string fout = sfout.str();
+ Hypergraph existing_hg;
+ cerr << "Processing " << fin << endl;
+ assert(FileExists(fin));
+ if (FileExists(fout)) {
+ ReadFile rf(fout);
+ assert(HypergraphIO::ReadFromJSON(rf.stream(), &existing_hg));
+ }
+ Hypergraph new_hg;
+ if (true) {
+ ReadFile rf(fin);
+ assert(HypergraphIO::ReadFromJSON(rf.stream(), &new_hg));
+ }
+ existing_hg.Union(new_hg);
+ WriteFile wf(fout);
+ assert(HypergraphIO::WriteToJSON(existing_hg, false, wf.stream()));
+ }
+ return 0;
+}
diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc
new file mode 100644
index 00000000..1122030a
--- /dev/null
+++ b/vest/viterbi_envelope.cc
@@ -0,0 +1,167 @@
+#include "viterbi_envelope.h"
+
+#include <cassert>
+#include <limits>
+
+using namespace std;
+using boost::shared_ptr;
+
+ostream& operator<<(ostream& os, const ViterbiEnvelope& env) {
+ os << '<';
+ const vector<shared_ptr<Segment> >& segs = env.GetSortedSegs();
+ for (int i = 0; i < segs.size(); ++i)
+ os << (i==0 ? "" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2;
+ return os << '>';
+}
+
+ViterbiEnvelope::ViterbiEnvelope(int i) {
+ if (i == 0) {
+ // do nothing - <>
+ } else if (i == 1) {
+ segs.push_back(shared_ptr<Segment>(new Segment(0, 0, 0, shared_ptr<Segment>(), shared_ptr<Segment>())));
+ assert(this->IsMultiplicativeIdentity());
+ } else {
+ cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n";
+ abort();
+ }
+}
+
+struct SlopeCompare {
+ bool operator() (const shared_ptr<Segment>& a, const shared_ptr<Segment>& b) const {
+ return a->m < b->m;
+ }
+};
+
+const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) {
+ if (!other.is_sorted) other.Sort();
+ if (segs.empty()) {
+ segs = other.segs;
+ return *this;
+ }
+ is_sorted = false;
+ int j = segs.size();
+ segs.resize(segs.size() + other.segs.size());
+ for (int i = 0; i < other.segs.size(); ++i)
+ segs[j++] = other.segs[i];
+ assert(j == segs.size());
+ return *this;
+}
+
+void ViterbiEnvelope::Sort() const {
+ sort(segs.begin(), segs.end(), SlopeCompare());
+ const int k = segs.size();
+ int j = 0;
+ for (int i = 0; i < k; ++i) {
+ Segment l = *segs[i];
+ l.x = kMinusInfinity;
+ // cerr << "m=" << l.m << endl;
+ if (0 < j) {
+ if (segs[j-1]->m == l.m) { // lines are parallel
+ if (l.b <= segs[j-1]->b) continue;
+ --j;
+ }
+ while(0 < j) {
+ l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m);
+ if (segs[j-1]->x < l.x) break;
+ --j;
+ }
+ if (0 == j) l.x = kMinusInfinity;
+ }
+ *segs[j++] = l;
+ }
+ segs.resize(j);
+ is_sorted = true;
+}
+
+const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) {
+ if (other.IsMultiplicativeIdentity()) { return *this; }
+ if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; }
+
+ if (!is_sorted) Sort();
+ if (!other.is_sorted) other.Sort();
+
+ if (this->IsEdgeEnvelope()) {
+// if (other.size() > 1)
+// cerr << *this << " (TIMES) " << other << endl;
+ shared_ptr<Segment> edge_parent = segs[0];
+ const double& edge_b = edge_parent->b;
+ const double& edge_m = edge_parent->m;
+ segs.clear();
+ for (int i = 0; i < other.segs.size(); ++i) {
+ const Segment& seg = *other.segs[i];
+ const double m = seg.m + edge_m;
+ const double b = seg.b + edge_b;
+ const double& x = seg.x; // x's don't change with *
+ segs.push_back(shared_ptr<Segment>(new Segment(x, m, b, edge_parent, other.segs[i])));
+ assert(segs.back()->p1->rule);
+ }
+// if (other.size() > 1)
+// cerr << " = " << *this << endl;
+ } else {
+ vector<shared_ptr<Segment> > new_segs;
+ int this_i = 0;
+ int other_i = 0;
+ const int this_size = segs.size();
+ const int other_size = other.segs.size();
+ double cur_x = kMinusInfinity; // moves from left to right across the
+ // real numbers, stopping for all inter-
+ // sections
+ double this_next_val = (1 < this_size ? segs[1]->x : kPlusInfinity);
+ double other_next_val = (1 < other_size ? other.segs[1]->x : kPlusInfinity);
+ while (this_i < this_size && other_i < other_size) {
+ const Segment& this_seg = *segs[this_i];
+ const Segment& other_seg= *other.segs[other_i];
+ const double m = this_seg.m + other_seg.m;
+ const double b = this_seg.b + other_seg.b;
+
+ new_segs.push_back(shared_ptr<Segment>(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i])));
+ int comp = 0;
+ if (this_next_val < other_next_val) comp = -1; else
+ if (this_next_val > other_next_val) comp = 1;
+ if (0 == comp) { // the next values are equal, advance both indices
+ ++this_i;
+ ++other_i;
+ cur_x = this_next_val; // could be other_next_val (they're equal!)
+ this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity);
+ other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity);
+ } else { // advance the i with the lower x, update cur_x
+ if (-1 == comp) {
+ ++this_i;
+ cur_x = this_next_val;
+ this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity);
+ } else {
+ ++other_i;
+ cur_x = other_next_val;
+ other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity);
+ }
+ }
+ }
+ segs.swap(new_segs);
+ }
+ //cerr << "Multiply: result=" << (*this) << endl;
+ return *this;
+}
+
+// recursively construct translation
+void Segment::ConstructTranslation(vector<WordID>* trans) const {
+ const Segment* cur = this;
+ vector<vector<WordID> > ant_trans;
+ while(!cur->rule) {
+ ant_trans.resize(ant_trans.size() + 1);
+ cur->p2->ConstructTranslation(&ant_trans.back());
+ cur = cur->p1.get();
+ }
+ size_t ant_size = ant_trans.size();
+ vector<const vector<WordID>*> pants(ant_size);
+ --ant_size;
+ for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i];
+ cur->rule->ESubstitute(pants, trans);
+}
+
+ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const {
+ const double m = direction.dot(e.feature_values_);
+ const double b = origin.dot(e.feature_values_);
+ Segment* seg = new Segment(m, b, e.rule_);
+ return ViterbiEnvelope(1, seg);
+}
+