diff options
Diffstat (limited to 'vest')
-rw-r--r-- | vest/Makefile.am | 32 | ||||
-rw-r--r-- | vest/comb_scorer.cc | 81 | ||||
-rwxr-xr-x | vest/dist-vest.pl | 642 | ||||
-rw-r--r-- | vest/error_surface.cc | 46 | ||||
-rw-r--r-- | vest/fast_score.cc | 74 | ||||
-rw-r--r-- | vest/line_optimizer.cc | 101 | ||||
-rw-r--r-- | vest/lo_test.cc | 201 | ||||
-rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 72 | ||||
-rw-r--r-- | vest/mr_vest_map.cc | 98 | ||||
-rw-r--r-- | vest/mr_vest_reduce.cc | 80 | ||||
-rw-r--r-- | vest/scorer.cc | 485 | ||||
-rw-r--r-- | vest/scorer_test.cc | 178 | ||||
-rw-r--r-- | vest/ter.cc | 518 | ||||
-rw-r--r-- | vest/test_data/0.json.gz | bin | 0 -> 13709 bytes | |||
-rw-r--r-- | vest/test_data/1.json.gz | bin | 0 -> 204803 bytes | |||
-rw-r--r-- | vest/test_data/c2e.txt.0 | 2 | ||||
-rw-r--r-- | vest/test_data/c2e.txt.1 | 2 | ||||
-rw-r--r-- | vest/test_data/c2e.txt.2 | 2 | ||||
-rw-r--r-- | vest/test_data/c2e.txt.3 | 2 | ||||
-rw-r--r-- | vest/test_data/re.txt.0 | 5 | ||||
-rw-r--r-- | vest/test_data/re.txt.1 | 5 | ||||
-rw-r--r-- | vest/test_data/re.txt.2 | 5 | ||||
-rw-r--r-- | vest/test_data/re.txt.3 | 5 | ||||
-rw-r--r-- | vest/union_forests.cc | 73 | ||||
-rw-r--r-- | vest/viterbi_envelope.cc | 167 |
25 files changed, 2876 insertions, 0 deletions
diff --git a/vest/Makefile.am b/vest/Makefile.am new file mode 100644 index 00000000..87c2383a --- /dev/null +++ b/vest/Makefile.am @@ -0,0 +1,32 @@ +bin_PROGRAMS = \ + mr_vest_map \ + mr_vest_reduce \ + scorer_test \ + lo_test \ + mr_vest_generate_mapper_input \ + fast_score \ + union_forests + +union_forests_SOURCES = union_forests.cc +union_forests_LDADD = $(top_srcdir)/src/libhg.a -lz + +fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc scorer.cc viterbi_envelope.cc +fast_score_LDADD = $(top_srcdir)/src/libhg.a -lz + +mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/src/libhg.a -lz + +mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc +mr_vest_map_LDADD = $(top_srcdir)/src/libhg.a -lz + +mr_vest_reduce_SOURCES = error_surface.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc +mr_vest_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz + +scorer_test_SOURCES = scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc +scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz + +lo_test_SOURCES = lo_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src +AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc new file mode 100644 index 00000000..7b2187f4 --- /dev/null +++ b/vest/comb_scorer.cc @@ -0,0 +1,81 @@ +#include "comb_scorer.h" + +#include <cstdio> + +using namespace std; + +class BLEUTERCombinationScore : public Score { + friend class BLEUTERCombinationScorer; + public: + ~BLEUTERCombinationScore(); + float ComputeScore() const { + return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f; + } + void ScoreDetails(string* details) const { + char buf[160]; + sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f", + ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f); + *details = buf; + } + void PlusEquals(const Score& delta) { + bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu); + ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter); + } + Score* GetZero() const { + BLEUTERCombinationScore* res = new BLEUTERCombinationScore; + res->bleu = bleu->GetZero(); + res->ter = ter->GetZero(); + return res; + } + void Subtract(const Score& rhs, Score* res) const { + bleu->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).bleu, + static_cast<BLEUTERCombinationScore*>(res)->bleu); + ter->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).ter, + static_cast<BLEUTERCombinationScore*>(res)->ter); + } + void Encode(std::string* out) const { + string bs, ts; + bleu->Encode(&bs); + ter->Encode(&ts); + out->clear(); + (*out) += static_cast<char>(bs.size()); + (*out) += bs; + (*out) += ts; + } + bool IsAdditiveIdentity() const { + return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity(); + } + private: + Score* bleu; + Score* ter; +}; + +BLEUTERCombinationScore::~BLEUTERCombinationScore() { + delete bleu; + delete ter; +} + +BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector<vector<WordID> >& refs) { + bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs); + ter_ = SentenceScorer::CreateSentenceScorer(TER, refs); +} + +BLEUTERCombinationScorer::~BLEUTERCombinationScorer() { + delete bleu_; + delete ter_; +} + +Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { + BLEUTERCombinationScore* res = new BLEUTERCombinationScore; + res->bleu = bleu_->ScoreCandidate(hyp); + res->ter = ter_->ScoreCandidate(hyp); + return res; +} + +Score* BLEUTERCombinationScorer::ScoreFromString(const std::string& in) { + int bss = in[0]; + BLEUTERCombinationScore* r = new BLEUTERCombinationScore; + r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss)); + r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss)); + return r; +} diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl new file mode 100755 index 00000000..5528838c --- /dev/null +++ b/vest/dist-vest.pl @@ -0,0 +1,642 @@ +#!/usr/bin/env perl + +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +my $mydir = `dirname $0`; +chomp $mydir; +# Default settings +my $srcFile = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt"; +my $refFiles = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*"; +my $bin_dir = "/fs/clip-software/cdec/bin"; +$bin_dir = "/Users/redpony/cdyer-svn-root/cdec/vest/bin_dir"; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/fast_score"; +die "Can't find $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_vest_map"; +my $REDUCER = "$bin_dir/mr_vest_reduce"; +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $forestUnion = "$bin_dir/union_forests"; +die "Can't find $forestUnion" unless -x $forestUnion; +my $cdec = "$bin_dir/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +my $decoder = $cdec; +my $lines_per_mapper = 440; +my $rand_directions = 10; +my $iteration = 1; +my $run_local = 0; +my $best_weights; +my $max_iterations = 40; +my $optimization_iters = 6; +my $num_rand_points = 20; +my $mert_nodes = join(" ", grep(/^c\d\d$/, split(/\n/, `pbsnodes -a`))); # "1 1 1 1 1" fails due to file staging conflicts +my $decode_nodes = "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"; # start 15 jobs +my $pmem = "3g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $ranges; +my $restart = 0; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "decode-nodes=s" => \$decode_nodes, + "dont-clean" => \$disable_clean, + "dry-run" => \$dryrun, + "epsilon" => \$epsilon, + "help" => \$help, + "interval" => \$interval, + "iteration=i" => \$iteration, + "local" => \$run_local, + "max-iterations=i" => \$max_iterations, + "mert-nodes=s" => \$mert_nodes, + "normalize=s" => \$normalize, + "pmem=s" => \$pmem, + "ranges=s" => \$ranges, + "rand-directions=i" => \$rand_directions, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "restart" => \$restart, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "workdir=s" => \$dir +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} + +($iniFile) = @ARGV; + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =`hostname`; chomp $host; +my $bleu; +my $best_bleu = 0.0; +my $interval_count = 0; +my $epsilon_bleu = $best_bleu + $epsilon; +my $logfile; +my $projected_score; + +my $refs_comma_sep = get_comma_sep_refs($refFiles); + +unless ($dir){ + $dir = "vest"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = `pwd`; + chomp $basedir; + $dir = "$basedir/$dir"; +} +if ($restart){ + $iniFile = `ls $dir/*.ini`; chomp $iniFile; + unless (-e $iniFile){ + die "ERROR: Could not find ini file in $dir to restart\n"; + } + $logfile = "$dir/mert.log"; + open(LOGFILE, ">>$logfile"); + print LOGFILE "RESTARTING STOPPED OPTIMIZATION\n\n"; + + # figure out best weights so far and iteration number + open(LOG, "$dir/mert.log"); + my $wi = 0; + while (my $line = <LOG>){ + chomp $line; + if ($line =~ /ITERATION (\d+)/) { + $iteration = $1; + } + } + + $iteration = $wi + 1; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ `kill $pid`; } + for my $cmd (@cleanupcmds){`$cmd`; } + exit 1; +}; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = `basename $decoder`; chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $parallelize = "$mydir/parallelize.pl"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir){ + unless($restart){ + die "ERROR: working dir $dir already exists\n\n"; + } + } else { + mkdir $dir; + mkdir "$dir/hgs"; + mkdir "$dir/hgs-current"; + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + `cp $initialWeights $dir/weights.0`; + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + unless($restart){ + $logfile = "$dir/mert.log"; + open(LOGFILE, ">$logfile"); + } + write_config(*LOGFILE); +} + + +# Generate initial files and values +unless ($restart){ `cp $iniFile $newIniFile`; } +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +unless($restart){ enseg($srcFile, $newsrc); } +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print LOGFILE "\n\nITERATION $iteration\n==========\n"; + + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + `mkdir -p $logdir`; + + #decode + print LOGFILE "DECODE\n"; + print LOGFILE `date`; + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile -O $dir/hgs-current"; + my $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -n \"$decode_nodes\" -- "; + if ($run_local) { $pcmd = "cat $srcFile |"; } + my $cmd = $pcmd . "$decoder_cmd 2> $decoderLog 1> $runFile"; + print LOGFILE "COMMAND:\n$cmd\n"; + my $result = 0; + $result = system($cmd); + unless ($result == 0){ + cleanup(); + print LOGFILE "ERROR: Parallel decoder returned non-zero exit code $result\n"; + die; + } + my $dec_score = `cat $runFile | $SCORER $refs_comma_sep -l $metric`; + chomp $dec_score; + print LOGFILE "DECODER SCORE: $dec_score\n"; + + # save space + `gzip $runFile`; + `gzip $decoderLog`; + + if ($iteration > $max_iterations){ + print LOGFILE "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + + # run optimizer + print LOGFILE "\nUNION FORESTS\n"; + print LOGFILE `date`; + my $mergeLog="$logdir/prune-merge.log.$iteration"; + $cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize"; + print LOGFILE "COMMAND:\n$cmd\n"; + $result = system($cmd); + unless ($result == 0){ + cleanup(); + print LOGFILE "ERROR: merge command returned non-zero exit code $result\n"; + die; + } + `rm -f $dir/hgs-current/*.json.gz`; # clean up old HGs, they've been moved to the repository + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print LOGFILE "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print LOGFILE `date`; + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print LOGFILE "COMMAND:\n$cmd\n"; + $result = system($cmd); + unless ($result == 0){ + cleanup(); + print LOGFILE "ERROR: mapinput command returned non-zero exit code $result\n"; + die; + } + + `mkdir $dir/splag.$im1`; + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print LOGFILE "COMMAND:\n$cmd\n"; + $result = system($cmd); + unless ($result == 0){ + cleanup(); + print LOGFILE "ERROR: split command returned non-zero exit code $result\n"; + die; + } + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "fmert.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -k1 > $dir/splag.$im1/$mapoutput"; + if ($run_local) { + print LOGFILE "COMMAND:\n$script\n"; + $result = system($script); + unless ($result == 0){ + cleanup(); + print LOGFILE "ERROR: mapper returned non-zero exit code $result\n"; + die; + } + } else { + my $todo = "qsub -q batch -l pmem=3000mb,walltime=5:00:00 -N $client_name -o /dev/null -e $logdir/$client_name.ER"; + local(*QOUT, *QIN); + open2(\*QOUT, \*QIN, $todo); + print QIN $script; + if ($first_shard) { print LOGFILE "$script\n"; $first_shard=0; } + close QIN; + $nmappers++; + while (my $jobid=<QOUT>){ + chomp $jobid; + push(@cleanupcmds, "`qdel $jobid 2> /dev/null`"); + $jobid =~ s/^(\d+)(.*?)$/\1/g; + print STDERR "short job id $jobid\n"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } + } + close QOUT; + } + } + if ($run_local) { + } else { + print LOGFILE "Launched $nmappers mappers.\n"; + print LOGFILE "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, `qstat`)); + $nmappers = scalar @livejobs; + } + print LOGFILE "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print LOGFILE "Results for $tol/$til lines\n"; + print LOGFILE "\nSORTING AND RUNNING FMERT REDUCER\n"; + print LOGFILE `date`; + $cmd="sort -k1 @mapoutputs | $REDUCER > $dir/redoutput.$im1"; + print LOGFILE "COMMAND:\n$cmd\n"; + $result = system($cmd); + unless ($result == 0){ + cleanup(); + print LOGFILE "ERROR: reducer command returned non-zero exit code $result\n"; + die; + } + $cmd="sort -rnk3 '-t|' $dir/redoutput.$im1 | head -1"; + my $best=`$cmd`; chomp $best; + print LOGFILE "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print LOGFILE "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print LOGFILE "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + for my $k (sort keys %ori) { + my $v = $ori{$k} + $axi{$k} * $x; + print W "$k $v\n"; + } + + `rm -rf $dir/splag.$im1`; + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + `cp $inweights $lastWeightsFile`; + if ($icc < 2) { + print LOGFILE "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print LOGFILE "\n==========\n"; +} + +print LOGFILE "\nFINAL WEIGHTS: $dir/$lastWeightsFile\n(Use -w <this file> with hiero)\n\n"; + +sub normalize_weights { + my ($rfn, $rpts, $feat) = @_; + my @feat_names = @$rfn; + my @pts = @$rpts; + my $z = 1.0; + for (my $i=0; $i < scalar @feat_names; $i++) { + if ($feat_names[$i] eq $feat) { + $z = $pts[$i]; + last; + } + } + for (my $i=0; $i < scalar @feat_names; $i++) { + $pts[$i] /= $z; + } + print LOGFILE " NORM WEIGHTS: @pts\n"; + return @pts; +} + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($p) = @_; + my $o = `echo $p`; + chomp $o; + my @files = split /\s+/, $o; + return "-r " . join(' -r ', @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "MERT NODES: $mert_nodes\n"; + print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; + + if ($restart){ + print $fh "PROJECTED BLEU: $projected_score\n"; + print $fh "BEST WEIGHTS: $best_weights\n"; + print $fh "BEST BLEU: $best_bleu\n"; + } +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = `basename $0`; chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + $executable --restart <work dir> + + $executable [options] <ini file> + Runs a complete MERT optimization and test set decoding, using + the decoder configuration in ini file. Note that many of the + options have default values that are inferred automatically + based on certain conventions. For details, refer to descriptions + of the options --decoder, --weights, and --workdir. + + $executable --restart <work dir> + Continues an optimization run that was stopped for some reason, + using configuration information found in the working directory + left behind by the stopped run. + +Options: + + --local + Run the decoder and optimizer locally. + + --decoder <decoder path> + Decoder binary to use. + + --decode-nodes <nodelist> + A list of nodes used for parallel decoding. If specific nodes + are not desired, use "1" for each node requested. Defaults to + "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", which indicates a request for + 15 nodes. + + --dont-clean + If present, this flag prevents intermediate files, including + run files and cumulative files, from being automatically removed + after a successful optimization run (these files are left if the + run fails for any reason). If used, a makefile containing + cleanup commands is written to the directory. To clean up + the intermediate files, invoke make without any arguments. + + --dry-run + Prints out the settings and exits without doing anything. + + --epsilon <epsilon> + Require that the dev set BLEU score improve by at least <epsilon> + within <interval> iterations (controlled by parameter --interval). + If not specified, defaults to .002. + + --help + Print this message and exit. + + --interval <i> + Require that the dev set BLEU score improve by at least <epsilon> + (controlled by parameter --epsilon) within <interval> iterations. + If not specified, defaults to 5. + + --iteration <I> + Starting iteration number. If not specified, defaults to 1. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs, + in the format expected by qsub. If not specified, defaults to + 2g. + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. If not specified, defaults to + /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.* + + --metric <method> + Metric to optimize. See fmert's --metric option for values. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --normalize <feature-name> + After each iteration, rescale all feature weights such that feature- + name has a weight of 1.0. + + --rand-directions <num> + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --source-file <file> + Dev set source file. If not specified, defaults to + /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt + + --weights <file> + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + diff --git a/vest/error_surface.cc b/vest/error_surface.cc new file mode 100644 index 00000000..4e0af35c --- /dev/null +++ b/vest/error_surface.cc @@ -0,0 +1,46 @@ +#include "error_surface.h" + +#include <cassert> +#include <sstream> + +using namespace std; + +ErrorSurface::~ErrorSurface() { + for (ErrorSurface::iterator i = begin(); i != end(); ++i) + //delete i->delta; + ; +} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta->Encode(&senc); + assert(senc.size() < 256); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SentenceScorer::CreateScoreFromString(type, senc); + } +} + diff --git a/vest/fast_score.cc b/vest/fast_score.cc new file mode 100644 index 00000000..45b60d78 --- /dev/null +++ b/vest/fast_score.cc @@ -0,0 +1,74 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "tdict.h" +#include "scorer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") + ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("in_file,i", po::value<string>()->default_value("-"), "Input file") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF1.TXT> -r <REF2.TXT> ...\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string loss_function = conf["loss_function"].as<string>(); + ScoreType type = ScoreTypeFromString(loss_function); + DocScorer ds(type, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + + ReadFile rf(conf["in_file"].as<string>()); + Score* acc = NULL; + istream& in = *rf.stream(); + int lc = 0; + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + vector<WordID> sent; + TD::ConvertSentence(line, &sent); + Score* sentscore = ds[lc]->ScoreCandidate(sent); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + delete sentscore; + ++lc; + } + assert(lc > 0); + if (lc > ds.size()) { + cerr << "Too many (" << lc << ") translations in input, expected " << ds.size() << endl; + return 1; + } + if (lc != ds.size()) + cerr << "Fewer sentences in hyp (" << lc << ") than refs (" + << ds.size() << "): scoring partial set!\n"; + float score = acc->ComputeScore(); + string details; + acc->ScoreDetails(&details); + delete acc; + cerr << details << endl; + cout << score << endl; + return 0; +} diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc new file mode 100644 index 00000000..98dcec34 --- /dev/null +++ b/vest/line_optimizer.cc @@ -0,0 +1,101 @@ +#include "line_optimizer.h" + +#include <limits> +#include <algorithm> + +#include "sparse_vector.h" +#include "scorer.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { + bool operator() (const ErrorIter& a, const ErrorIter& b) const { + return a->x < b->x; + } +}; + +double LineOptimizer::LineOptimize( + const vector<ErrorSurface>& surfaces, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon) { + vector<ErrorIter> all_ints; + for (vector<ErrorSurface>::const_iterator i = surfaces.begin(); + i != surfaces.end(); ++i) { + const ErrorSurface& surface = *i; + for (ErrorIter j = surface.begin(); j != surface.end(); ++j) + all_ints.push_back(j); + } + sort(all_ints.begin(), all_ints.end(), IntervalComp()); + double last_boundary = all_ints.front()->x; + Score* acc = all_ints.front()->delta->GetZero(); + float& cur_best_score = *best_score; + cur_best_score = (type == MAXIMIZE_SCORE ? + -numeric_limits<float>::max() : numeric_limits<float>::max()); + bool left_edge = true; + double pos = numeric_limits<double>::quiet_NaN(); + for (vector<ErrorIter>::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + assert(seg.delta); + if (seg.x - last_boundary > epsilon) { + float sco = acc->ComputeScore(); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // cerr << "---- s=" << sco << "\n"; + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + acc->PlusEquals(*seg.delta); + } + float sco = acc->ComputeScore(); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + delete acc; + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->next() - 0.5); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector<int>& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + vector<SparseVector<double> >* dirs) { + const int num_directions = features_to_optimize.size() + additional_random_directions; + dirs->resize(num_directions); + for (int i = 0; i < num_directions; ++i) { + SparseVector<double>& axis = (*dirs)[i]; + if (i < features_to_optimize.size()) + axis.set_value(features_to_optimize[i], 1.0); + else + RandomUnitVector(features_to_optimize, &axis, rng); + } + cerr << "Generated " << num_directions << " total axes to optimize along.\n"; +} diff --git a/vest/lo_test.cc b/vest/lo_test.cc new file mode 100644 index 00000000..0acae5e0 --- /dev/null +++ b/vest/lo_test.cc @@ -0,0 +1,201 @@ +#include <cmath> +#include <iostream> +#include <fstream> + +#include <boost/shared_ptr.hpp> +#include <gtest/gtest.h> + +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "viterbi_envelope.h" +#include "line_optimizer.h" +#include "scorer.h" + +using namespace std; +using boost::shared_ptr; + +class OptTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +TEST_F(OptTest, TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + EXPECT_EQ(true, isnan(z)); +} + +TEST_F(OptTest,TestViterbiEnvelope) { + shared_ptr<Segment> a1(new Segment(-1, 0)); + shared_ptr<Segment> b1(new Segment(1, 0)); + shared_ptr<Segment> a2(new Segment(-1, 1)); + shared_ptr<Segment> b2(new Segment(1, -1)); + vector<shared_ptr<Segment> > sa; sa.push_back(a1); sa.push_back(b1); + vector<shared_ptr<Segment> > sb; sb.push_back(a2); sb.push_back(b2); + ViterbiEnvelope a(sa); + cerr << a << endl; + ViterbiEnvelope b(sb); + ViterbiEnvelope c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + EXPECT_EQ(3, c.size()); +} + +TEST_F(OptTest,TestViterbiEnvelopeInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); + ViterbiEnvelopeWeightFunction wf(wts, dir); + ViterbiEnvelope env = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); + cerr << env << endl; + const vector<boost::shared_ptr<Segment> >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (int i = 0; i < segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector<WordID> trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +TEST_F(OptTest, TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector<double> wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector<int> to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + Hypergraph hg; + ReadFile rf("./test_data/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2("./test_data/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector<vector<WordID> > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector<vector<WordID> > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + ScoreType type = ScoreTypeFromString("ibm_bleu"); + SentenceScorer* scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); + SentenceScorer* scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2); + vector<ViterbiEnvelope> envs(2); + + RandomNumberGenerator<boost::mt19937> rng; + + vector<SparseVector<double> > axes; + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (int i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector<double>& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ViterbiEnvelopeWeightFunction wf(wts, axis); + envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); + envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf); + + vector<ErrorSurface> es(2); + scorer1->ComputeErrorSurface(envs[0], &es[0]); + scorer2->ComputeErrorSurface(envs[1], &es[1]); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + EXPECT_FLOAT_EQ(0.48719698, score); + SparseVector<double> res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector<WordID> t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; + delete scorer1; + delete scorer2; +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc new file mode 100644 index 00000000..c96a61e4 --- /dev/null +++ b/vest/mr_vest_generate_mapper_input.cc @@ -0,0 +1,72 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator<boost::mt19937> rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + Weights weights; + vector<string> features; + weights.InitFromFile(conf["weights"].as<string>(), &features); + const string forest_repository = conf["forest_repository"].as<string>(); + assert(DirectoryExists(forest_repository)); + SparseVector<double> origin; + weights.InitSparseVector(&origin); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as<vector<string> >(); + vector<SparseVector<double> > axes; + vector<int> fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as<unsigned int>(), + &rng, + &axes); + int dev_set_size = conf["dev_set_size"].as<unsigned int>(); + for (int i = 0; i < dev_set_size; ++i) + for (int j = 0; j < axes.size(); ++j) + cout << forest_repository << '/' << i << ".json.gz " << i << ' ' << origin << ' ' << axes[j] << endl; + return 0; +} diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc new file mode 100644 index 00000000..80e84218 --- /dev/null +++ b/vest/mr_vest_map.cc @@ -0,0 +1,98 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "scorer.h" +#include "viterbi_envelope.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "hg.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { + vector<string> fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (int i = 0; i < fields.size(); ++i) { + vector<string> pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string loss_function = conf["loss_function"].as<string>(); + ScoreType type = ScoreTypeFromString(loss_function); + DocScorer ds(type, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + Hypergraph hg; + string last_file; + while(cin) { + string line; + getline(cin, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_axis; + is >> file >> sent_id >> s_origin >> s_axis; + SparseVector<double> origin; + assert(ReadSparseVectorString(s_origin, &origin)); + SparseVector<double> axis; + assert(ReadSparseVectorString(s_axis, &axis)); + // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + ViterbiEnvelopeWeightFunction wf(origin, axis); + ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); + ErrorSurface es; + ds[sent_id]->ComputeErrorSurface(ve, &es); + //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; + cerr << "Error surface has " << es.size() << " segments\n"; + string val; + es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl; + } + return 0; +} diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc new file mode 100644 index 00000000..c1347065 --- /dev/null +++ b/vest/mr_vest_reduce.cc @@ -0,0 +1,80 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string loss_function = conf["loss_function"].as<string>(); + ScoreType type = ScoreTypeFromString(loss_function); + LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (type == TER) + opt_type = LineOptimizer::MINIMIZE_SCORE; + string last_key; + vector<ErrorSurface> esv; + while(cin) { + string line; + getline(cin, line); + if (line.empty()) continue; + size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + string key = line.substr(2, ks - 2); + string val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key = key; + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(type, encoded); + } + if (!esv.empty()) { + cerr << "ESV=" << esv.size() << endl; + for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } + float score; + double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/vest/scorer.cc b/vest/scorer.cc new file mode 100644 index 00000000..e242bb46 --- /dev/null +++ b/vest/scorer.cc @@ -0,0 +1,485 @@ +#include "scorer.h" + +#include <map> +#include <sstream> +#include <iostream> +#include <fstream> +#include <cstdio> +#include <valarray> + +#include <boost/shared_ptr.hpp> + +#include "viterbi_envelope.h" +#include "error_surface.h" +#include "ter.h" +#include "comb_scorer.h" +#include "tdict.h" +#include "stringlib.h" + +using boost::shared_ptr; +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +ScoreType ScoreTypeFromString(const std::string& st) { + const string sl = LowercaseString(st); + if (sl == "ser") + return SER; + if (sl == "ter") + return TER; + if (sl == "bleu" || sl == "ibm_bleu") + return IBM_BLEU; + if (sl == "nist_bleu") + return NIST_BLEU; + if (sl == "koehn_bleu") + return Koehn_BLEU; + if (sl == "combi") + return BLEU_minus_TER_over_2; + cerr << "Don't understand score type '" << sl << "', defaulting to ibm_bleu.\n"; + return IBM_BLEU; +} + +Score::~Score() {} +SentenceScorer::~SentenceScorer() {} + +class SERScore : public Score { + friend class SERScorer; + public: + SERScore() : correct(0), total(0) {} + float ComputeScore() const { + return static_cast<float>(correct) / static_cast<float>(total); + } + void ScoreDetails(string* details) const { + ostringstream os; + os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; + *details = os.str(); + } + void PlusEquals(const Score& delta) { + correct += static_cast<const SERScore&>(delta).correct; + total += static_cast<const SERScore&>(delta).total; + } + Score* GetZero() const { return new SERScore; } + void Subtract(const Score& rhs, Score* res) const { + SERScore* r = static_cast<SERScore*>(res); + r->correct = correct - static_cast<const SERScore&>(rhs).correct; + r->total = total - static_cast<const SERScore&>(rhs).total; + } + void Encode(std::string* out) const { + assert(!"not implemented"); + } + bool IsAdditiveIdentity() const { + return (total == 0 && correct == 0); // correct is always 0 <= n <= total + } + private: + int correct, total; +}; + +class SERScorer : public SentenceScorer { + public: + SERScorer(const vector<vector<WordID> >& references) : refs_(references) {} + Score* ScoreCandidate(const std::vector<WordID>& hyp) const { + SERScore* res = new SERScore; + res->total = 1; + for (int i = 0; i < refs_.size(); ++i) + if (refs_[i] == hyp) res->correct = 1; + return res; + } + static Score* ScoreFromString(const std::string& data) { + assert(!"Not implemented"); + } + private: + vector<vector<WordID> > refs_; +}; + +class BLEUScore : public Score { + friend class BLEUScorerBase; + public: + BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) { + ref_len = 0; + hyp_len = 0; } + float ComputeScore() const; + void ScoreDetails(string* details) const; + void PlusEquals(const Score& delta); + Score* GetZero() const; + void Subtract(const Score& rhs, Score* res) const; + void Encode(std::string* out) const; + bool IsAdditiveIdentity() const { + if (fabs(ref_len) > 0.1f || hyp_len != 0) return false; + for (int i = 0; i < correct_ngram_hit_counts.size(); ++i) + if (hyp_ngram_counts[i] != 0 || + correct_ngram_hit_counts[i] != 0) return false; + return true; + } + private: + float ComputeScore(vector<float>* precs, float* bp) const; + valarray<int> correct_ngram_hit_counts; + valarray<int> hyp_ngram_counts; + float ref_len; + int hyp_len; +}; + +class BLEUScorerBase : public SentenceScorer { + public: + BLEUScorerBase(const std::vector<std::vector<WordID> >& references, + int n + ); + Score* ScoreCandidate(const std::vector<WordID>& hyp) const; + static Score* ScoreFromString(const std::string& in); + + protected: + virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0; + private: + struct NGramCompare { + int operator() (const vector<WordID>& a, const vector<WordID>& b) { + size_t as = a.size(); + size_t bs = b.size(); + const size_t s = (as < bs ? as : bs); + for (size_t i = 0; i < s; ++i) { + int d = a[i] - b[i]; + if (d < 0) return true; + if (d > 0) return false; + } + return as < bs; + } + }; + typedef map<vector<WordID>, pair<int,int>, NGramCompare> NGramCountMap; + void CountRef(const vector<WordID>& ref) { + NGramCountMap tc; + vector<WordID> ngram(n_); + int s = ref.size(); + for (int j=0; j<s; ++j) { + int remaining = s-j; + int k = (n_ < remaining ? n_ : remaining); + ngram.clear(); + for (int i=1; i<=k; ++i) { + ngram.push_back(ref[j + i - 1]); + tc[ngram].first++; + } + } + for (NGramCountMap::iterator i = tc.begin(); i != tc.end(); ++i) { + pair<int,int>& p = ngrams_[i->first]; + if (p.first < i->second.first) + p = i->second; + } + } + + void ComputeNgramStats(const vector<WordID>& sent, + valarray<int>* correct, + valarray<int>* hyp) const { + assert(correct->size() == n_); + assert(hyp->size() == n_); + vector<WordID> ngram(n_); + (*correct) *= 0; + (*hyp) *= 0; + int s = sent.size(); + for (int j=0; j<s; ++j) { + int remaining = s-j; + int k = (n_ < remaining ? n_ : remaining); + ngram.clear(); + for (int i=1; i<=k; ++i) { + ngram.push_back(sent[j + i - 1]); + pair<int,int>& p = ngrams_[ngram]; + if (p.second < p.first) { + ++p.second; + (*correct)[i-1]++; + } + // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams: + if (!p.first) { + for (; i<=k; ++i) + (*hyp)[i-1]++; + } else { + (*hyp)[i-1]++; + } + } + } + } + + mutable NGramCountMap ngrams_; + int n_; + vector<int> lengths_; +}; + +Score* BLEUScorerBase::ScoreFromString(const std::string& in) { + istringstream is(in); + int n; + is >> n; + BLEUScore* r = new BLEUScore(n); + is >> r->ref_len >> r->hyp_len; + + for (int i = 0; i < n; ++i) { + is >> r->correct_ngram_hit_counts[i]; + is >> r->hyp_ngram_counts[i]; + } + return r; +} + +class IBM_BLEUScorer : public BLEUScorerBase { + public: + IBM_BLEUScorer(const std::vector<std::vector<WordID> >& references, + int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) { + for (int i=0; i < references.size(); ++i) + lengths_[i] = references[i].size(); + } + protected: + float ComputeRefLength(const vector<WordID>& hyp) const { + if (lengths_.size() == 1) return lengths_[0]; + int bestd = 2000000; + int hl = hyp.size(); + int bl = -1; + for (vector<int>::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { + int cl = *ci; + if (abs(cl - hl) < bestd) { + bestd = abs(cl - hl); + bl = cl; + } + } + return bl; + } + private: + vector<int> lengths_; +}; + +class NIST_BLEUScorer : public BLEUScorerBase { + public: + NIST_BLEUScorer(const std::vector<std::vector<WordID> >& references, + int n=4) : BLEUScorerBase(references, n), + shortest_(references[0].size()) { + for (int i=1; i < references.size(); ++i) + if (references[i].size() < shortest_) + shortest_ = references[i].size(); + } + protected: + float ComputeRefLength(const vector<WordID>& hyp) const { + return shortest_; + } + private: + float shortest_; +}; + +class Koehn_BLEUScorer : public BLEUScorerBase { + public: + Koehn_BLEUScorer(const std::vector<std::vector<WordID> >& references, + int n=4) : BLEUScorerBase(references, n), + avg_(0) { + for (int i=0; i < references.size(); ++i) + avg_ += references[i].size(); + avg_ /= references.size(); + } + protected: + float ComputeRefLength(const vector<WordID>& hyp) const { + return avg_; + } + private: + float avg_; +}; + +SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type, + const std::vector<std::vector<WordID> >& refs) { + switch (type) { + case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); + case NIST_BLEU: return new NIST_BLEUScorer(refs, 4); + case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4); + case TER: return new TERScorer(refs); + case SER: return new SERScorer(refs); + case BLEU_minus_TER_over_2: return new BLEUTERCombinationScorer(refs); + default: + assert(!"Not implemented!"); + } +} + +Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const std::string& in) { + switch (type) { + case IBM_BLEU: + case NIST_BLEU: + case Koehn_BLEU: + return BLEUScorerBase::ScoreFromString(in); + case TER: + return TERScorer::ScoreFromString(in); + case SER: + return SERScorer::ScoreFromString(in); + case BLEU_minus_TER_over_2: + return BLEUTERCombinationScorer::ScoreFromString(in); + default: + assert(!"Not implemented!"); + } +} + +void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env) const { + vector<WordID> prev_trans; + const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + Score* prev_score = NULL; + int j = 0; + for (int i = 0; i < ienv.size(); ++i) { + const Segment& seg = *ienv[i]; + vector<WordID> trans; + seg.ConstructTranslation(&trans); + // cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + assert(prev_score); // if this fails, it means + // the decoder can generate null translations + ErrorSegment& out = (*env)[j]; + out.delta = prev_score->GetZero(); + out.x = seg.x; + ++j; + } + // cerr << "Identical translation, skipping scoring\n"; + } else { + Score* score = ScoreCandidate(trans); + // cerr << "score= " << score->ComputeScore() << "\n"; + Score* cur_delta = score->GetZero(); + // just record the score diffs + if (!prev_score) + prev_score = score->GetZero(); + + score->Subtract(*prev_score, cur_delta); + delete prev_score; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = cur_delta; + out.x = seg.x; + ++j; + } + } + } + delete prev_score; + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + +void BLEUScore::ScoreDetails(std::string* details) const { + char buf[2000]; + vector<float> precs(4); + float bp; + float bleu = ComputeScore(&precs, &bp); + sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", + bleu*100.0, + precs[0]*100.0, + precs[1]*100.0, + precs[2]*100.0, + precs[3]*100.0, + bp); + *details = buf; +} + +float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const { + float log_bleu = 0; + if (precs) precs->clear(); + int count = 0; + for (int i = 0; i < hyp_ngram_counts.size(); ++i) { + if (hyp_ngram_counts[i] > 0) { + float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); + if (precs) precs->push_back(exp(lprec)); + log_bleu += lprec; + ++count; + } + } + log_bleu /= static_cast<float>(count); + float lbp = 0.0; + if (hyp_len < ref_len) + lbp = (hyp_len - ref_len) / hyp_len; + log_bleu += lbp; + if (bp) *bp = exp(lbp); + return exp(log_bleu); +} + +float BLEUScore::ComputeScore() const { + return ComputeScore(NULL, NULL); +} + +void BLEUScore::Subtract(const Score& rhs, Score* res) const { + const BLEUScore& d = static_cast<const BLEUScore&>(rhs); + BLEUScore* o = static_cast<BLEUScore*>(res); + o->ref_len = ref_len - d.ref_len; + o->hyp_len = hyp_len - d.hyp_len; + o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts; + o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts; +} + +void BLEUScore::PlusEquals(const Score& delta) { + const BLEUScore& d = static_cast<const BLEUScore&>(delta); + correct_ngram_hit_counts += d.correct_ngram_hit_counts; + hyp_ngram_counts += d.hyp_ngram_counts; + ref_len += d.ref_len; + hyp_len += d.hyp_len; +} + +Score* BLEUScore::GetZero() const { + return new BLEUScore(hyp_ngram_counts.size()); +} + +void BLEUScore::Encode(std::string* out) const { + ostringstream os; + const int n = correct_ngram_hit_counts.size(); + os << n << ' ' << ref_len << ' ' << hyp_len; + for (int i = 0; i < n; ++i) + os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i]; + *out = os.str(); +} + +BLEUScorerBase::BLEUScorerBase(const std::vector<std::vector<WordID> >& references, + int n) : n_(n) { + for (vector<vector<WordID> >::const_iterator ci = references.begin(); + ci != references.end(); ++ci) { + lengths_.push_back(ci->size()); + CountRef(*ci); + } +} + +Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const { + BLEUScore* bs = new BLEUScore(n_); + for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) + i->second.second = 0; + ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts); + bs->ref_len = ComputeRefLength(hyp); + bs->hyp_len = hyp.size(); + return bs; +} + +DocScorer::~DocScorer() { + for (int i=0; i < scorers_.size(); ++i) + delete scorers_[i]; +} + +DocScorer::DocScorer( + const ScoreType type, + const std::vector<std::string>& ref_files) { + // TODO stop using valarray, start using ReadFile + cerr << "Loading references (" << ref_files.size() << " files)\n"; + valarray<ifstream> ifs(ref_files.size()); + for (int i=0; i < ref_files.size(); ++i) { + ifs[i].open(ref_files[i].c_str()); + assert(ifs[i].good()); + } + char buf[64000]; + bool expect_eof = false; + while (!ifs[0].eof()) { + vector<vector<WordID> > refs(ref_files.size()); + for (int i=0; i < ref_files.size(); ++i) { + if (ifs[i].eof()) break; + ifs[i].getline(buf, 64000); + refs[i].clear(); + if (strlen(buf) == 0) { + if (ifs[i].eof()) { + if (!expect_eof) { + assert(i == 0); + expect_eof = true; + } + break; + } + } else { + TD::ConvertSentence(buf, &refs[i]); + assert(!refs[i].empty()); + } + assert(!expect_eof); + } + if (!expect_eof) scorers_.push_back(SentenceScorer::CreateSentenceScorer(type, refs)); + } + cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n"; +} + diff --git a/vest/scorer_test.cc b/vest/scorer_test.cc new file mode 100644 index 00000000..fca219cc --- /dev/null +++ b/vest/scorer_test.cc @@ -0,0 +1,178 @@ +#include <iostream> +#include <fstream> +#include <valarray> +#include <gtest/gtest.h> + +#include "tdict.h" +#include "scorer.h" + +using namespace std; + +class ScorerTest : public testing::Test { + protected: + virtual void SetUp() { + refs0.resize(4); + refs1.resize(4); + TD::ConvertSentence("export of high-tech products in guangdong in first two months this year reached 3.76 billion us dollars", &refs0[0]); + TD::ConvertSentence("guangdong's export of new high technology products amounts to us $ 3.76 billion in first two months of this year", &refs0[1]); + TD::ConvertSentence("guangdong exports us $ 3.76 billion worth of high technology products in the first two months of this year", &refs0[2]); + TD::ConvertSentence("in the first 2 months this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars .", &refs0[3]); + TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter chen ji ) the latest statistics show that from january through february this year , the export of high-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% over the same period last year and accounted for 25.5 \% of the total export in the province .", &refs1[0]); + TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter : chen ji ) -- latest statistic indicates that guangdong's export of new high technology products amounts to us $ 3.76 billion , up 34.8 \% over corresponding period and accounts for 25.5 \% of the total exports of the province .", &refs1[1]); + TD::ConvertSentence("xinhua news agency report of march 16 from guangzhou ( by staff reporter chen ji ) - latest statistics indicate guangdong province exported us $ 3.76 billion worth of high technology products , up 34.8 percent from the same period last year , which account for 25.5 percent of the total exports of the province .", &refs1[2]); + TD::ConvertSentence("guangdong , march 16 , ( xinhua ) -- ( chen ji reports ) as the newest statistics shows , in january and feberuary this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% than last year , making up 25.5 \% of the province's total .", &refs1[3]); + TD::ConvertSentence("one guangdong province will next export us $ 3.76 high-tech product two months first this year 3.76 billion us dollars", &hyp1); + TD::ConvertSentence("xinhua news agency , guangzhou , 16th of march ( reporter chen ) -- latest statistics suggest that guangdong exports new advanced technology product totals $ 3.76 million , 34.8 percent last corresponding period and accounts for 25.5 percent of the total export province .", &hyp2); + } + + virtual void TearDown() { } + + vector<vector<WordID> > refs0; + vector<vector<WordID> > refs1; + vector<WordID> hyp1; + vector<WordID> hyp2; +}; + +TEST_F(ScorerTest, TestCreateFromFiles) { + vector<string> files; + files.push_back("test_data/re.txt.0"); + files.push_back("test_data/re.txt.1"); + files.push_back("test_data/re.txt.2"); + files.push_back("test_data/re.txt.3"); + DocScorer ds(IBM_BLEU, files); +} + +TEST_F(ScorerTest, TestBLEUScorer) { + SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs0); + SentenceScorer* s2 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs1); + Score* b1 = s1->ScoreCandidate(hyp1); + EXPECT_FLOAT_EQ(0.23185077, b1->ComputeScore()); + Score* b2 = s2->ScoreCandidate(hyp2); + EXPECT_FLOAT_EQ(0.38101241, b2->ComputeScore()); + b1->PlusEquals(*b2); + EXPECT_FLOAT_EQ(0.348854, b1->ComputeScore()); + EXPECT_FALSE(b1->IsAdditiveIdentity()); + string details; + b1->ScoreDetails(&details); + EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); + cerr << details << endl; + string enc; + b1->Encode(&enc); + Score* b3 = SentenceScorer::CreateScoreFromString(IBM_BLEU, enc); + details.clear(); + cerr << "Encoded BLEU score size: " << enc.size() << endl; + b3->ScoreDetails(&details); + cerr << details << endl; + EXPECT_FALSE(b3->IsAdditiveIdentity()); + EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); + Score* bz = b3->GetZero(); + EXPECT_TRUE(bz->IsAdditiveIdentity()); + delete bz; + delete b1; + delete s1; + delete b2; + delete s2; +} + +TEST_F(ScorerTest, TestTERScorer) { + SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(TER, refs0); + SentenceScorer* s2 = SentenceScorer::CreateSentenceScorer(TER, refs1); + string details; + Score* t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t1->ComputeScore() << endl; + Score* t2 = s2->ScoreCandidate(hyp2); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t2->ComputeScore() << endl; + t1->PlusEquals(*t2); + cerr << t1->ComputeScore() << endl; + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details); + string enc; + t1->Encode(&enc); + Score* t3 = SentenceScorer::CreateScoreFromString(TER, enc); + details.clear(); + t3->ScoreDetails(&details); + EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details); + EXPECT_FALSE(t3->IsAdditiveIdentity()); + Score* tz = t3->GetZero(); + EXPECT_TRUE(tz->IsAdditiveIdentity()); + delete tz; + delete t3; + delete t1; + delete s1; + delete t2; + delete s2; +} + +TEST_F(ScorerTest, TestTERScorerSimple) { + vector<vector<WordID> > ref(1); + TD::ConvertSentence("1 2 3 A B", &ref[0]); + vector<WordID> hyp; + TD::ConvertSentence("A B 1 2 3", &hyp); + SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(TER, ref); + string details; + Score* t1 = s1->ScoreCandidate(hyp); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + delete t1; + delete s1; +} + +TEST_F(ScorerTest, TestSERScorerSimple) { + vector<vector<WordID> > ref(1); + TD::ConvertSentence("A B C D", &ref[0]); + vector<WordID> hyp1; + TD::ConvertSentence("A B C", &hyp1); + vector<WordID> hyp2; + TD::ConvertSentence("A B C D", &hyp2); + SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(SER, ref); + string details; + Score* t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + Score* t2 = s1->ScoreCandidate(hyp2); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + t2->PlusEquals(*t1); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + delete t1; + delete t2; + delete s1; +} + +TEST_F(ScorerTest, TestCombiScorer) { + SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(BLEU_minus_TER_over_2, refs0); + string details; + Score* t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t1->ComputeScore() << endl; + string enc; + t1->Encode(&enc); + Score* t2 = SentenceScorer::CreateScoreFromString(BLEU_minus_TER_over_2, enc); + details.clear(); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + Score* cz = t2->GetZero(); + EXPECT_FALSE(t2->IsAdditiveIdentity()); + EXPECT_TRUE(cz->IsAdditiveIdentity()); + cz->PlusEquals(*t2); + EXPECT_FALSE(cz->IsAdditiveIdentity()); + string d2; + cz->ScoreDetails(&d2); + EXPECT_EQ(d2, details); + delete cz; + delete t2; + delete t1; +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/vest/ter.cc b/vest/ter.cc new file mode 100644 index 00000000..ef66f3b7 --- /dev/null +++ b/vest/ter.cc @@ -0,0 +1,518 @@ +#include "ter.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <limits> +#include <sstream> +#include <tr1/unordered_map> +#include <set> +#include <valarray> +#include <boost/functional/hash.hpp> + +#include "tdict.h" + +const bool ter_use_average_ref_len = true; +const int ter_short_circuit_long_sentences = -1; + +using namespace std; +using namespace std::tr1; + +struct COSTS { + static const float substitution; + static const float deletion; + static const float insertion; + static const float shift; +}; +const float COSTS::substitution = 1.0f; +const float COSTS::deletion = 1.0f; +const float COSTS::insertion = 1.0f; +const float COSTS::shift = 1.0f; + +static const int MAX_SHIFT_SIZE = 10; +static const int MAX_SHIFT_DIST = 50; + +struct Shift { + unsigned int d_; + Shift() : d_() {} + Shift(int b, int e, int m) : d_() { + begin(b); + end(e); + moveto(m); + } + inline int begin() const { + return d_ & 0x3ff; + } + inline int end() const { + return (d_ >> 10) & 0x3ff; + } + inline int moveto() const { + int m = (d_ >> 20) & 0x7ff; + if (m > 1024) { m -= 1024; m *= -1; } + return m; + } + inline void begin(int b) { + d_ &= 0xfffffc00u; + d_ |= (b & 0x3ff); + } + inline void end(int e) { + d_ &= 0xfff003ffu; + d_ |= (e & 0x3ff) << 10; + } + inline void moveto(int m) { + bool neg = (m < 0); + if (neg) { m *= -1; m += 1024; } + d_ &= 0xfffff; + d_ |= (m & 0x7ff) << 20; + } +}; + +class TERScorerImpl { + + public: + enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; + + explicit TERScorerImpl(const vector<WordID>& ref) : ref_(ref) { + for (int i = 0; i < ref.size(); ++i) + rwexists_.insert(ref[i]); + } + + float Calculate(const vector<WordID>& hyp, int* subs, int* ins, int* dels, int* shifts) const { + return CalculateAllShifts(hyp, subs, ins, dels, shifts); + } + + inline int GetRefLength() const { + return ref_.size(); + } + + private: + vector<WordID> ref_; + set<WordID> rwexists_; + + typedef unordered_map<vector<WordID>, set<int>, boost::hash<vector<WordID> > > NgramToIntsMap; + mutable NgramToIntsMap nmap_; + + static float MinimumEditDistance( + const vector<WordID>& hyp, + const vector<WordID>& ref, + vector<TransType>* path) { + vector<vector<TransType> > bmat(hyp.size() + 1, vector<TransType>(ref.size() + 1, MATCH)); + vector<vector<float> > cmat(hyp.size() + 1, vector<float>(ref.size() + 1, 0)); + for (int i = 0; i <= hyp.size(); ++i) + cmat[i][0] = i; + for (int j = 0; j <= ref.size(); ++j) + cmat[0][j] = j; + for (int i = 1; i <= hyp.size(); ++i) { + const WordID& hw = hyp[i-1]; + for (int j = 1; j <= ref.size(); ++j) { + const WordID& rw = ref[j-1]; + float& cur_c = cmat[i][j]; + TransType& cur_b = bmat[i][j]; + + if (rw == hw) { + cur_c = cmat[i-1][j-1]; + cur_b = MATCH; + } else { + cur_c = cmat[i-1][j-1] + COSTS::substitution; + cur_b = SUBSTITUTION; + } + float cwoi = cmat[i-1][j]; + if (cur_c > cwoi + COSTS::insertion) { + cur_c = cwoi + COSTS::insertion; + cur_b = INSERTION; + } + float cwod = cmat[i][j-1]; + if (cur_c > cwod + COSTS::deletion) { + cur_c = cwod + COSTS::deletion; + cur_b = DELETION; + } + } + } + + // trace back along the best path and record the transition types + path->clear(); + int i = hyp.size(); + int j = ref.size(); + while (i > 0 || j > 0) { + if (j == 0) { + --i; + path->push_back(INSERTION); + } else if (i == 0) { + --j; + path->push_back(DELETION); + } else { + TransType t = bmat[i][j]; + path->push_back(t); + switch (t) { + case SUBSTITUTION: + case MATCH: + --i; --j; break; + case INSERTION: + --i; break; + case DELETION: + --j; break; + } + } + } + reverse(path->begin(), path->end()); + return cmat[hyp.size()][ref.size()]; + } + + void BuildWordMatches(const vector<WordID>& hyp, NgramToIntsMap* nmap) const { + nmap->clear(); + set<WordID> exists_both; + for (int i = 0; i < hyp.size(); ++i) + if (rwexists_.find(hyp[i]) != rwexists_.end()) + exists_both.insert(hyp[i]); + for (int start=0; start<ref_.size(); ++start) { + if (exists_both.find(ref_[start]) == exists_both.end()) continue; + vector<WordID> cp; + int mlen = min(MAX_SHIFT_SIZE, static_cast<int>(ref_.size() - start)); + for (int len=0; len<mlen; ++len) { + if (len && exists_both.find(ref_[start + len]) == exists_both.end()) break; + cp.push_back(ref_[start + len]); + (*nmap)[cp].insert(start); + } + } + } + + static void PerformShift(const vector<WordID>& in, + int start, int end, int moveto, vector<WordID>* out) { + // cerr << "ps: " << start << " " << end << " " << moveto << endl; + out->clear(); + if (moveto == -1) { + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto < start) { + for (int i = 0; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto > end) { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < in.size(); ++i) + out->push_back(in[i]); + } else { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = (end + (moveto - start))+1; i < in.size(); ++i) + out->push_back(in[i]); + } + if (out->size() != in.size()) { + cerr << "ps: " << start << " " << end << " " << moveto << endl; + cerr << "in=" << TD::GetString(in) << endl; + cerr << "out=" << TD::GetString(*out) << endl; + } + assert(out->size() == in.size()); + // cerr << "ps: " << TD::GetString(*out) << endl; + } + + void GetAllPossibleShifts(const vector<WordID>& hyp, + const vector<int>& ralign, + const vector<bool>& herr, + const vector<bool>& rerr, + const int min_size, + vector<vector<Shift> >* shifts) const { + for (int start = 0; start < hyp.size(); ++start) { + vector<WordID> cp(1, hyp[start]); + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) continue; + bool ok = false; + int moveto; + for (set<int>::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { + moveto = *i; + int rm = ralign[moveto]; + ok = (start != rm && + (rm - start) < MAX_SHIFT_DIST && + (start - rm - 1) < MAX_SHIFT_DIST); + if (ok) break; + } + if (!ok) continue; + cp.clear(); + for (int end = start + min_size - 1; + ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) { + cp.push_back(hyp[end]); + vector<Shift>& sshifts = (*shifts)[end - start]; + ok = false; + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) break; + bool any_herr = false; + for (int i = start; i <= end && !any_herr; ++i) + any_herr = herr[i]; + if (!any_herr) { + ok = true; + continue; + } + for (set<int>::iterator mi = niter->second.begin(); + mi != niter->second.end(); ++mi) { + int moveto = *mi; + int rm = ralign[moveto]; + if (! ((rm != start) && + ((rm < start) || (rm > end)) && + (rm - start <= MAX_SHIFT_DIST) && + ((start - rm - 1) <= MAX_SHIFT_DIST))) continue; + ok = true; + bool any_rerr = false; + for (int i = 0; (i <= end - start) && (!any_rerr); ++i) + any_rerr = rerr[moveto+i]; + if (!any_rerr) continue; + for (int roff = 0; roff <= (end - start); ++roff) { + int rmr = ralign[moveto+roff]; + if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) + sshifts.push_back(Shift(start, end, moveto + roff)); + } + } + } + } + } + + bool CalculateBestShift(const vector<WordID>& cur, + const vector<WordID>& hyp, + float curerr, + const vector<TransType>& path, + vector<WordID>* new_hyp, + float* newerr, + vector<TransType>* new_path) const { + vector<bool> herr, rerr; + vector<int> ralign; + int hpos = -1; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case MATCH: + ++hpos; + herr.push_back(false); + rerr.push_back(false); + ralign.push_back(hpos); + break; + case SUBSTITUTION: + ++hpos; + herr.push_back(true); + rerr.push_back(true); + ralign.push_back(hpos); + break; + case INSERTION: + ++hpos; + herr.push_back(true); + break; + case DELETION: + rerr.push_back(true); + ralign.push_back(hpos); + break; + } + } +#if 0 + cerr << "RALIGN: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << ralign[i] << " "; + cerr << endl; + cerr << "RERR: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << (bool)rerr[i] << " "; + cerr << endl; + cerr << "HERR: "; + for (int i = 0; i < herr.size(); ++i) + cerr << (bool)herr[i] << " "; + cerr << endl; +#endif + + vector<vector<Shift> > shifts(MAX_SHIFT_SIZE + 1); + GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); + float cur_best_shift_cost = 0; + *newerr = curerr; + vector<TransType> cur_best_path; + vector<WordID> cur_best_hyp; + + bool res = false; + for (int i = shifts.size() - 1; i >=0; --i) { + float curfix = curerr - (cur_best_shift_cost + *newerr); + float maxfix = 2.0f * (1 + i) - COSTS::shift; + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; + for (int j = 0; j < shifts[i].size(); ++j) { + const Shift& s = shifts[i][j]; + curfix = curerr - (cur_best_shift_cost + *newerr); + maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove? + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; + vector<WordID> shifted(cur.size()); + PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); + vector<TransType> try_path; + float try_cost = MinimumEditDistance(shifted, ref_, &try_path); + float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); + if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { + *newerr = try_cost; + cur_best_shift_cost = COSTS::shift; + new_path->swap(try_path); + new_hyp->swap(shifted); + res = true; + // cerr << "Found better shift " << s.begin() << "..." << s.end() << " moveto " << s.moveto() << endl; + } + } + } + + return res; + } + + static void GetPathStats(const vector<TransType>& path, int* subs, int* ins, int* dels) { + *subs = *ins = *dels = 0; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case SUBSTITUTION: + ++(*subs); + case MATCH: + break; + case INSERTION: + ++(*ins); break; + case DELETION: + ++(*dels); break; + } + } + } + + float CalculateAllShifts(const vector<WordID>& hyp, + int* subs, int* ins, int* dels, int* shifts) const { + BuildWordMatches(hyp, &nmap_); + vector<TransType> path; + float med_cost = MinimumEditDistance(hyp, ref_, &path); + float edits = 0; + vector<WordID> cur = hyp; + *shifts = 0; + if (ter_short_circuit_long_sentences < 0 || + ref_.size() < ter_short_circuit_long_sentences) { + while (true) { + vector<WordID> new_hyp; + vector<TransType> new_path; + float new_med_cost; + if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) + break; + edits += COSTS::shift; + ++(*shifts); + med_cost = new_med_cost; + path.swap(new_path); + cur.swap(new_hyp); + } + } + GetPathStats(path, subs, ins, dels); + return med_cost + edits; + } +}; + +class TERScore : public Score { + friend class TERScorer; + + public: + static const unsigned kINSERTIONS = 0; + static const unsigned kDELETIONS = 1; + static const unsigned kSUBSTITUTIONS = 2; + static const unsigned kSHIFTS = 3; + static const unsigned kREF_WORDCOUNT = 4; + static const unsigned kDUMMY_LAST_ENTRY = 5; + + TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} + float ComputeScore() const { + float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); + return edits / static_cast<float>(stats[kREF_WORDCOUNT]); + } + void ScoreDetails(string* details) const; + void PlusEquals(const Score& delta) { + stats += static_cast<const TERScore&>(delta).stats; + } + Score* GetZero() const { + return new TERScore; + } + void Subtract(const Score& rhs, Score* res) const { + static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats; + } + void Encode(std::string* out) const { + ostringstream os; + os << stats[kINSERTIONS] << ' ' + << stats[kDELETIONS] << ' ' + << stats[kSUBSTITUTIONS] << ' ' + << stats[kSHIFTS] << ' ' + << stats[kREF_WORDCOUNT]; + *out = os.str(); + } + bool IsAdditiveIdentity() const { + for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) + if (stats[i] != 0) return false; + return true; + } + private: + valarray<int> stats; +}; + +Score* TERScorer::ScoreFromString(const std::string& data) { + istringstream is(data); + TERScore* r = new TERScore; + is >> r->stats[TERScore::kINSERTIONS] + >> r->stats[TERScore::kDELETIONS] + >> r->stats[TERScore::kSUBSTITUTIONS] + >> r->stats[TERScore::kSHIFTS] + >> r->stats[TERScore::kREF_WORDCOUNT]; + return r; +} + +void TERScore::ScoreDetails(std::string* details) const { + char buf[200]; + sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", + ComputeScore() * 100.0f, + stats[kINSERTIONS], + stats[kDELETIONS], + stats[kSUBSTITUTIONS], + stats[kSHIFTS], + stats[kREF_WORDCOUNT]); + *details = buf; +} + +TERScorer::~TERScorer() { + for (vector<TERScorerImpl*>::iterator i = impl_.begin(); i != impl_.end(); ++i) + delete *i; +} + +TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) { + for (int i = 0; i < refs.size(); ++i) + impl_[i] = new TERScorerImpl(refs[i]); +} + +Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { + float best_score = numeric_limits<float>::max(); + TERScore* res = new TERScore; + int avg_len = 0; + for (int i = 0; i < impl_.size(); ++i) + avg_len += impl_[i]->GetRefLength(); + avg_len /= impl_.size(); + for (int i = 0; i < impl_.size(); ++i) { + int subs, ins, dels, shifts; + float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); + // cerr << "Component TER cost: " << score << endl; + if (score < best_score) { + res->stats[TERScore::kINSERTIONS] = ins; + res->stats[TERScore::kDELETIONS] = dels; + res->stats[TERScore::kSUBSTITUTIONS] = subs; + res->stats[TERScore::kSHIFTS] = shifts; + if (ter_use_average_ref_len) { + res->stats[TERScore::kREF_WORDCOUNT] = avg_len; + } else { + res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); + } + + best_score = score; + } + } + return res; +} diff --git a/vest/test_data/0.json.gz b/vest/test_data/0.json.gz Binary files differnew file mode 100644 index 00000000..30f8dd77 --- /dev/null +++ b/vest/test_data/0.json.gz diff --git a/vest/test_data/1.json.gz b/vest/test_data/1.json.gz Binary files differnew file mode 100644 index 00000000..c82cc179 --- /dev/null +++ b/vest/test_data/1.json.gz diff --git a/vest/test_data/c2e.txt.0 b/vest/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/vest/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.1 b/vest/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/vest/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.2 b/vest/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/vest/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/vest/test_data/c2e.txt.3 b/vest/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/vest/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/vest/test_data/re.txt.0 b/vest/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/vest/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/vest/test_data/re.txt.1 b/vest/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/vest/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/vest/test_data/re.txt.2 b/vest/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/vest/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/vest/test_data/re.txt.3 b/vest/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/vest/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/vest/union_forests.cc b/vest/union_forests.cc new file mode 100644 index 00000000..207ecb5c --- /dev/null +++ b/vest/union_forests.cc @@ -0,0 +1,73 @@ +#include <iostream> +#include <string> +#include <sstream> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "hg.h" +#include "hg_io.h" +#include "filelib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("new_forest_repository,n",po::value<string>(),"[REQD] Path to new forest repository") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("new_forest_repository") == 0) { + cerr << "Please specify the starting-point weights using -n PATH\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r PATH\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const int size = conf["dev_set_size"].as<unsigned int>(); + const string repo = conf["forest_repository"].as<string>(); + const string new_repo = conf["new_forest_repository"].as<string>(); + for (int i = 0; i < size; ++i) { + ostringstream sfin, sfout; + sfin << new_repo << '/' << i << ".json.gz"; + sfout << repo << '/' << i << ".json.gz"; + const string fin = sfin.str(); + const string fout = sfout.str(); + Hypergraph existing_hg; + cerr << "Processing " << fin << endl; + assert(FileExists(fin)); + if (FileExists(fout)) { + ReadFile rf(fout); + assert(HypergraphIO::ReadFromJSON(rf.stream(), &existing_hg)); + } + Hypergraph new_hg; + if (true) { + ReadFile rf(fin); + assert(HypergraphIO::ReadFromJSON(rf.stream(), &new_hg)); + } + existing_hg.Union(new_hg); + WriteFile wf(fout); + assert(HypergraphIO::WriteToJSON(existing_hg, false, wf.stream())); + } + return 0; +} diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc new file mode 100644 index 00000000..1122030a --- /dev/null +++ b/vest/viterbi_envelope.cc @@ -0,0 +1,167 @@ +#include "viterbi_envelope.h" + +#include <cassert> +#include <limits> + +using namespace std; +using boost::shared_ptr; + +ostream& operator<<(ostream& os, const ViterbiEnvelope& env) { + os << '<'; + const vector<shared_ptr<Segment> >& segs = env.GetSortedSegs(); + for (int i = 0; i < segs.size(); ++i) + os << (i==0 ? "" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2; + return os << '>'; +} + +ViterbiEnvelope::ViterbiEnvelope(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + segs.push_back(shared_ptr<Segment>(new Segment(0, 0, 0, shared_ptr<Segment>(), shared_ptr<Segment>()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +struct SlopeCompare { + bool operator() (const shared_ptr<Segment>& a, const shared_ptr<Segment>& b) const { + return a->m < b->m; + } +}; + +const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) { + if (!other.is_sorted) other.Sort(); + if (segs.empty()) { + segs = other.segs; + return *this; + } + is_sorted = false; + int j = segs.size(); + segs.resize(segs.size() + other.segs.size()); + for (int i = 0; i < other.segs.size(); ++i) + segs[j++] = other.segs[i]; + assert(j == segs.size()); + return *this; +} + +void ViterbiEnvelope::Sort() const { + sort(segs.begin(), segs.end(), SlopeCompare()); + const int k = segs.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + Segment l = *segs[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (segs[j-1]->m == l.m) { // lines are parallel + if (l.b <= segs[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m); + if (segs[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *segs[j++] = l; + } + segs.resize(j); + is_sorted = true; +} + +const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + shared_ptr<Segment> edge_parent = segs[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + segs.clear(); + for (int i = 0; i < other.segs.size(); ++i) { + const Segment& seg = *other.segs[i]; + const double m = seg.m + edge_m; + const double b = seg.b + edge_b; + const double& x = seg.x; // x's don't change with * + segs.push_back(shared_ptr<Segment>(new Segment(x, m, b, edge_parent, other.segs[i]))); + assert(segs.back()->p1->rule); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector<shared_ptr<Segment> > new_segs; + int this_i = 0; + int other_i = 0; + const int this_size = segs.size(); + const int other_size = other.segs.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? segs[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? other.segs[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const Segment& this_seg = *segs[this_i]; + const Segment& other_seg= *other.segs[other_i]; + const double m = this_seg.m + other_seg.m; + const double b = this_seg.b + other_seg.b; + + new_segs.push_back(shared_ptr<Segment>(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); + } + } + } + segs.swap(new_segs); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void Segment::ConstructTranslation(vector<WordID>* trans) const { + const Segment* cur = this; + vector<vector<WordID> > ant_trans; + while(!cur->rule) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector<const vector<WordID>*> pants(ant_size); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->rule->ESubstitute(pants, trans); +} + +ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + Segment* seg = new Segment(m, b, e.rule_); + return ViterbiEnvelope(1, seg); +} + |