diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
commit | 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch) | |
tree | 33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /training/dpmert | |
parent | 7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff) |
major restructure of the training code
Diffstat (limited to 'training/dpmert')
32 files changed, 1943 insertions, 0 deletions
diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am new file mode 100644 index 00000000..ff318bef --- /dev/null +++ b/training/dpmert/Makefile.am @@ -0,0 +1,25 @@ +bin_PROGRAMS = \ + mr_dpmert_map \ + mr_dpmert_reduce \ + mr_dpmert_generate_mapper_input + +noinst_PROGRAMS = \ + lo_test +TESTS = lo_test + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc +mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc +mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/dpmert/ces.cc b/training/dpmert/ces.cc new file mode 100644 index 00000000..157b2d17 --- /dev/null +++ b/training/dpmert/ces.cc @@ -0,0 +1,90 @@ +#include "ces.h" + +#include <vector> +#include <sstream> +#include <boost/shared_ptr.hpp> + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { + vector<WordID> prev_trans; + const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + SufficientStats prev_score; // defaults to 0 + int j = 0; + for (unsigned i = 0; i < ienv.size(); ++i) { + const MERTPoint& seg = *ienv[i]; + vector<WordID> trans; +#if 0 + if (type == AER) { + vector<bool> edges(hg.edges_.size(), false); + seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi + // alignment + ostringstream os; + const string* psrc = ss.GetSource(); + if (psrc == NULL) { + cerr << "AER scoring in VEST requires source, but it is missing!\n"; + abort(); + } + size_t pos = psrc->rfind(" ||| "); + if (pos == string::npos) { + cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; + abort(); + } + Lattice src; + Lattice ref; + LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); + LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); + AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); + string tstr = os.str(); + TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); + } else { +#endif + seg.ConstructTranslation(&trans); + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + ErrorSegment& out = (*env)[j]; + out.delta.fields.clear(); + out.x = seg.x; + ++j; + } + //cerr << "Identical translation, skipping scoring\n"; + } else { + SufficientStats score; + ss.Evaluate(trans, &score); + // cerr << "score= " << score->ComputeScore() << "\n"; + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = delta; + out.x = seg.x; + ++j; + } + } + } + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + diff --git a/training/dpmert/ces.h b/training/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/training/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& convex_hull, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); + +#endif diff --git a/training/dpmert/divide_refs.py b/training/dpmert/divide_refs.py new file mode 100755 index 00000000..b478f918 --- /dev/null +++ b/training/dpmert/divide_refs.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +import sys + +(numRefs, outPrefix) = sys.argv[1:] +numRefs = int(numRefs) + +outs = [open(outPrefix+str(i), "w") for i in range(numRefs)] + +i = 0 +for line in sys.stdin: + outs[i].write(line) + i = (i + 1) % numRefs + +for out in outs: + out.close() diff --git a/training/dpmert/dpmert.pl b/training/dpmert/dpmert.pl new file mode 100755 index 00000000..559420f5 --- /dev/null +++ b/training/dpmert/dpmert.pl @@ -0,0 +1,618 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +require "libcall.pl"; + +my $QSUB_CMD = qsub_args(mert_memory()); + +# Default settings +my $srcFile; # deprecated +my $refFiles; # deprecated +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +my $util_dir = "$SCRIPT_DIR/../utils"; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$util_dir/parallelize.pl"; +my $libcall = "$util_dir/libcall.pl"; +my $sentserver = "$util_dir/sentserver"; +my $sentclient = "$util_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 200; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $bleu_weight=1; +my $use_make = 1; # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $devset; +# Process command-line options +if (GetOptions( + "config=s" => \$iniFile, + "weights=s" => \$initialWeights, + "devset=s" => \$devset, + "jobs=i" => \$jobs, + "pass-suffix=s" => \$pass_suffix, + "help" => \$help, + "qsub" => \$useqsub, + "iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "random-directions=i" => \$rand_directions, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (defined $srcFile || defined $refFiles) { + die <<EOT; + + The options --ref-files and --source-file are no longer supported. + Please specify the input file and its reference translations with + --devset FILE + +EOT +} + +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high for METEOR +} + + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +unless ($dir){ + $dir = "dpmert"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = basename($decoder); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; + +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if (-e $dir) { + # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; +} else { + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR <<EOT; + DECODER: $decoder + INI FILE: $iniFile + WORKING DIR: $dir + DEVSET: $devset + EVAL METRIC: $metric + MAX ITERATIONS: $max_iterations + PARALLEL JOBS: $jobs + HEAD NODE: $host + PMEM (DECODING): $pmem + INITIAL WEIGHTS: $initialWeights +EOT +} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initialWeights $dir/weights.0"); +$iniFile = $newIniFile; + +split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); +my $refs = "-r $dir/dev.refs"; +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "dpmert.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } + } + } + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; +exit 0; + + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable [options] + Runs a complete MERT optimization. Required options are --weights, + --devset, and --config. + +Options: + + --config <file> [-c <file>] + The decoder configuration file. + + --devset <file> [-d <file>] + The source *and* references for the development set. + + --weights <file> [-w <file>] + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in <file> will be optimized!** + + --metric <name> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --iterations <M> + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --rand-directions <num> + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --output-dir <dir> + Directory for intermediate and output files. + + --help + Print this message and exit. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . "\n"; + } + close R; + close S; + close F; +} + diff --git a/training/dpmert/error_surface.cc b/training/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/training/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include <cassert> +#include <sstream> + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta.Encode(&senc); + assert(senc.size() < 1024); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SufficientStats(senc); + } +} + diff --git a/training/dpmert/error_surface.h b/training/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/training/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include <vector> +#include <string> + +#include "ns.h" + +class Score; + +struct ErrorSegment { + double x; + SufficientStats delta; + ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector<ErrorSegment> { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(const std::string& in); +}; + +#endif diff --git a/training/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/training/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { + local $,=' '; + print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { + do { + push @c1,shift + } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. +CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). +SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. +if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. +if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. +DIRECT=1 (default) will override SERIAL=1. +CROSS=1 (default) will override SNAKE or PIPE. +}; + +info("1 cmd:",@c1,"\n"); +info("2 cmd:",@c2,"\n"); + +sub lineto { + select $_[0]; + $|=1; + shift; + print @_; +} + +if ($ser eq 'SERIAL') { + my ($R1,$W1,$R2,$W2); + my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. + my $c2p=open2($R2,$W2,@c2); + if ($mode eq 'CROSS') { + while(<$R1>) { + info("1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("1|2:",$_); + lineto($W1,$_); + } + } else { + my $snake=$mode eq 'SNAKE'; + while(<STDIN>) { + info("IN:",$_); + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("IN|1|2:",$_); + if ($snake) { + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1|2|1:",$_); + } + lineto(*STDOUT,$_); + } + } +} else { + info("DIRECT mode\n"); + my @rw1=POSIX::pipe(); + my @rw2=POSIX::pipe(); + my $pid=undef; + $SIG{CHLD} = sub { wait }; + while (not defined ($pid=fork())) { + sleep 1; + } + my $pipe = $mode eq 'PIPE'; + unless ($pipe) { + POSIX::close(STDOUT_FILENO); + POSIX::close(STDIN_FILENO); + } + if ($pid) { + POSIX::dup2($rw1[1],STDOUT_FILENO); + POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; + exec @c1; + } else { + POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; + POSIX::dup2($rw1[0],STDIN_FILENO); + exec @c2; + } + while (wait()!=-1) {} +} diff --git a/training/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc new file mode 100644 index 00000000..9cf33502 --- /dev/null +++ b/training/dpmert/line_optimizer.cc @@ -0,0 +1,114 @@ +#include "line_optimizer.h" + +#include <limits> +#include <algorithm> + +#include "sparse_vector.h" +#include "ns.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { + bool operator() (const ErrorIter& a, const ErrorIter& b) const { + return a->x < b->x; + } +}; + +double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, + const vector<ErrorSurface>& surfaces, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon) { + // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; + vector<ErrorIter> all_ints; + for (vector<ErrorSurface>::const_iterator i = surfaces.begin(); + i != surfaces.end(); ++i) { + const ErrorSurface& surface = *i; + for (ErrorIter j = surface.begin(); j != surface.end(); ++j) + all_ints.push_back(j); + } + sort(all_ints.begin(), all_ints.end(), IntervalComp()); + double last_boundary = all_ints.front()->x; + SufficientStats acc; + float& cur_best_score = *best_score; + cur_best_score = (type == MAXIMIZE_SCORE ? + -numeric_limits<float>::max() : numeric_limits<float>::max()); + bool left_edge = true; + double pos = numeric_limits<double>::quiet_NaN(); + for (vector<ErrorIter>::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + if (seg.x - last_boundary > epsilon) { + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; +#undef SHOW_ERROR_SURFACES +#ifdef SHOW_ERROR_SURFACES + cerr << "x=" << seg.x << "\ts=" << sco << "\n"; +#endif + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; + } + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector<int>& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + vector<SparseVector<double> >* dirs + , bool include_orthogonal + ) { + dirs->clear(); + typedef SparseVector<double> Dir; + vector<Dir> &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i<features_to_optimize.size();++i) { + Dir d; + d.set_value(features_to_optimize[i],1.); + out.push_back(d); + } + out.resize(i+additional_random_directions); + for (;i<out.size();++i) + RandomUnitVector(features_to_optimize, &out[i], rng); + cerr << "Generated " << out.size() << " total axes to optimize along.\n"; +} + diff --git a/training/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h new file mode 100644 index 00000000..83819f41 --- /dev/null +++ b/training/dpmert/line_optimizer.h @@ -0,0 +1,48 @@ +#ifndef LINE_OPTIMIZER_H_ +#define LINE_OPTIMIZER_H_ + +#include <vector> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "sampler.h" + +class EvaluationMetric; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error surface and find (the middle of) the best segment + static double LineOptimize( + const EvaluationMetric* metric, + const std::vector<ErrorSurface>& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. + static void RandomUnitVector(const std::vector<int>& dimensions, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. + static void CreateOptimizationDirections( + const std::vector<int>& primary, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + std::vector<SparseVector<double> >* dirs + , bool include_primary=true + ); + +}; + +#endif diff --git a/training/dpmert/lo_test.cc b/training/dpmert/lo_test.cc new file mode 100644 index 00000000..95a08d3d --- /dev/null +++ b/training/dpmert/lo_test.cc @@ -0,0 +1,229 @@ +#define BOOST_TEST_MODULE LineOptimizerTest +#include <boost/test/unit_test.hpp> +#include <boost/test/floating_point_comparison.hpp> + +#include <cmath> +#include <iostream> +#include <fstream> + +#include <boost/shared_ptr.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "mert_geometry.h" +#include "line_optimizer.h" + +using namespace std; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +BOOST_AUTO_TEST_CASE( TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + BOOST_CHECK_EQUAL(true, std::isnan(z)); +} + +BOOST_AUTO_TEST_CASE(TestConvexHull) { + boost::shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0)); + boost::shared_ptr<MERTPoint> b1(new MERTPoint(1, 0)); + boost::shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1)); + boost::shared_ptr<MERTPoint> b2(new MERTPoint(1, -1)); + vector<boost::shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1); + vector<boost::shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); + cerr << a << endl; + ConvexHull b(sb); + ConvexHull c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + BOOST_CHECK_EQUAL(3, c.size()); +} + +BOOST_AUTO_TEST_CASE(TestConvexHullInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + cerr << env << endl; + const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (unsigned i = 0; i < segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector<WordID> trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +BOOST_AUTO_TEST_CASE( TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector<double> wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector<int> to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : "test_data"); + + Hypergraph hg; + ReadFile rf(path + "/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2(path + "/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector<vector<WordID> > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector<vector<WordID> > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + vector<ConvexHull> envs(2); + + RandomNumberGenerator<boost::mt19937> rng; + + vector<SparseVector<double> > axes; // directions to search + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (unsigned i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector<double>& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf); + + vector<ErrorSurface> es(2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + BOOST_CHECK_CLOSE(0.48719698, score, 1e-5); + SparseVector<double> res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector<WordID> t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; +} + +BOOST_AUTO_TEST_CASE(TestZeroOrigin) { + const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); + hg.Reweight(wts); + + vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + + SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector<ConvexHull> envs(1); + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + + vector<vector<WordID> > mr(4); + TD::ConvertSentence("untitled", &mr[0]); + TD::ConvertSentence("with no title", &mr[1]); + TD::ConvertSentence("without a title", &mr[2]); + TD::ConvertSentence("without title", &mr[3]); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr); + vector<ErrorSurface> es(1); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + diff --git a/training/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc new file mode 100644 index 00000000..d6973658 --- /dev/null +++ b/training/dpmert/mert_geometry.cc @@ -0,0 +1,185 @@ +#include "mert_geometry.h" + +#include <cassert> +#include <limits> + +using namespace std; + +ConvexHull::ConvexHull(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector<boost::shared_ptr<MERTPoint> >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const boost::shared_ptr<MERTPoint>& a, const boost::shared_ptr<MERTPoint>& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + boost::shared_ptr<MERTPoint> edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector<boost::shared_ptr<MERTPoint> > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector<WordID>* trans) const { + const MERTPoint* cur = this; + vector<vector<WordID> > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector<const vector<WordID>*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/training/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/training/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include <vector> +#include <iostream> +#include <boost/shared_ptr.hpp> + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); +static const double kPlusInfinity = std::numeric_limits<double>::infinity(); + +struct MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr<MERTPoint> p1; + boost::shared_ptr<MERTPoint> p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector<WordID>* trans) const; + void CollectEdgesUsed(std::vector<bool>* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + // for debugging: + ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } + const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector<boost::shared_ptr<MERTPoint> > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector<double>& ori, + const SparseVector<double>& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector<double> origin; + const SparseVector<double> direction; +}; + +#endif diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..199cd23a --- /dev/null +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,81 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator<boost::mt19937> rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector<string> features; + SparseVector<weight_t> origin; + vector<weight_t> w; + Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as<string>(); + if (!DirectoryExists(forest_repository)) { + cerr << "Forest repository directory " << forest_repository << " not found!\n"; + return 1; + } + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as<vector<string> >(); + vector<SparseVector<weight_t> > directions; + vector<int> fids(features.size()); + for (unsigned i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as<unsigned int>(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; + } + } + return 0; +} diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..d1efcf96 --- /dev/null +++ b/training/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("source,s",po::value<string>(), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { +#if 0 + // this should work, but untested. + std::istringstream i(s); + i>>*v; +#else + vector<string> fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (unsigned i = 0; i < fields.size(); ++i) { + vector<string> pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + istream &in=*in_read.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_direction; + // path-to-file (JSON) sent_ed starting-point search-direction + is >> file >> sent_id >> s_origin >> s_direction; + SparseVector<double> origin; + ReadSparseVectorString(s_origin, &origin); + SparseVector<double> direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + + ErrorSurface es; + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); + //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; + // cerr << "Error surface has " << es.size() << " segments\n"; + string val; + es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl << flush; + } + return 0; +} diff --git a/training/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..31512a03 --- /dev/null +++ b/training/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = conf->count("evaluation_metric") == 0; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (metric->IsErrorMetric()) + opt_type = LineOptimizer::MINIMIZE_SCORE; + + vector<ErrorSurface> esv; + string last_key, line, key, val; + while(getline(cin, line)) { + size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key.swap(key); + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(encoded); + } + if (!esv.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/training/dpmert/test_aer/README b/training/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/training/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. + diff --git a/training/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/training/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/training/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/training/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/training/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/training/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/training/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/training/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/training/dpmert/test_aer/weights b/training/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/training/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz Binary files differnew file mode 100644 index 00000000..30f8dd77 --- /dev/null +++ b/training/dpmert/test_data/0.json.gz diff --git a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz Binary files differnew file mode 100644 index 00000000..c82cc179 --- /dev/null +++ b/training/dpmert/test_data/1.json.gz diff --git a/training/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/training/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/training/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/training/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/training/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/training/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/training/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/training/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " |