summaryrefslogtreecommitdiff
path: root/pro-train
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2011-07-10 23:00:21 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2011-07-10 23:00:21 -0400
commit95deb840699f9b6f8fe499b374bd726bce97365c (patch)
tree64d0299b2da95122d18654f254ae41c14bcb456d /pro-train
parentf68aaa588a2e1eb7cef5a57d57dd7e86fd4b0c9a (diff)
starting implementation of Hopkins&May (2011) optimizer
Diffstat (limited to 'pro-train')
-rw-r--r--pro-train/Makefile.am13
-rw-r--r--pro-train/README.shared-mem9
-rwxr-xr-xpro-train/dist-pro.pl735
-rw-r--r--pro-train/mr_pro_map.cc111
-rw-r--r--pro-train/mr_pro_reduce.cc81
5 files changed, 949 insertions, 0 deletions
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
new file mode 100644
index 00000000..945ed5c3
--- /dev/null
+++ b/pro-train/Makefile.am
@@ -0,0 +1,13 @@
+bin_PROGRAMS = \
+ mr_pro_map \
+ mr_pro_reduce
+
+TESTS = lo_test
+
+mr_pro_map_SOURCES = mr_pro_map.cc
+mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_pro_reduce_SOURCES = mr_pro_reduce.cc
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/pro-train/README.shared-mem b/pro-train/README.shared-mem
new file mode 100644
index 00000000..7728efc0
--- /dev/null
+++ b/pro-train/README.shared-mem
@@ -0,0 +1,9 @@
+If you want to run dist-vest.pl on a very large shared memory machine, do the
+following:
+
+ ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
+
+This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
+decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
+on the system you are running on and the complexity of the models used for decoding.
+
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
new file mode 100755
index 00000000..35bccea4
--- /dev/null
+++ b/pro-train/dist-pro.pl
@@ -0,0 +1,735 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+
+my $VEST_DIR="$SCRIPT_DIR/../vest";
+require "$VEST_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input";
+my $MAPPER = "$bin_dir/mr_pro_map";
+my $REDUCER = "$bin_dir/mr_pro_reduce";
+my $parallelize = "$VEST_DIR/parallelize.pl";
+my $libcall = "$VEST_DIR/libcall.pl";
+my $sentserver = "$VEST_DIR/sentserver";
+my $sentclient = "$VEST_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 400;
+my $rand_directions = 15;
+my $iteration = 1;
+my $run_local = 0;
+my $best_weights;
+my $max_iterations = 15;
+my $optimization_iters = 6;
+my $decode_nodes = 15; # number of decode nodes
+my $pmem = "9g";
+my $disable_clean = 0;
+my %seen_weights;
+my $normalize;
+my $help = 0;
+my $epsilon = 0.0001;
+my $interval = 5;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $decoderOpt;
+my $noprimary;
+my $maxsim=0;
+my $oraclen=0;
+my $oracleb=20;
+my $bleu_weight=1;
+my $use_make; # use make to parallelize line search
+my $dirargs='';
+my $density_prune;
+my $usefork;
+my $pass_suffix = '';
+my $cpbin=1;
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "decoder=s" => \$decoderOpt,
+ "decode-nodes=i" => \$decode_nodes,
+ "density-prune=f" => \$density_prune,
+ "dont-clean" => \$disable_clean,
+ "pass-suffix=s" => \$pass_suffix,
+ "use-fork" => \$usefork,
+ "dry-run" => \$dryrun,
+ "epsilon=s" => \$epsilon,
+ "help" => \$help,
+ "interval" => \$interval,
+ "iteration=i" => \$iteration,
+ "local" => \$run_local,
+ "use-make=i" => \$use_make,
+ "max-iterations=i" => \$max_iterations,
+ "normalize=s" => \$normalize,
+ "pmem=s" => \$pmem,
+ "cpbin!" => \$cpbin,
+ "rand-directions=i" => \$rand_directions,
+ "random_directions=i" => \$rand_directions,
+ "bleu_weight=s" => \$bleu_weight,
+ "no-primary!" => \$noprimary,
+ "max-similarity=s" => \$maxsim,
+ "oracle-directions=i" => \$oraclen,
+ "n-oracle=i" => \$oraclen,
+ "oracle-batch=i" => \$oracleb,
+ "directions-args=s" => \$dirargs,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "weights=s" => \$initialWeights,
+ "workdir=s" => \$dir,
+ "opt-iterations=i" => \$optimization_iters,
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+if (defined $density_prune) {
+ die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
+}
+
+if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 40;
+} elsif ($metric =~ /^meteor$/i) {
+ $lines_per_mapper = 2000; # start up time is really high
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+ $dir = "vest";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+if ($decoderOpt){ $decoder = $decoderOpt; }
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {
+ local $_;
+ my $bindir=shift;
+ check_call("mkdir -p $bindir");
+ -d $bindir || die "couldn't make bindir $bindir";
+ for (@_) {
+ my $src=$$_;
+ $$_="$bindir/".basename($src);
+ check_call("cp -p $src $$_");
+ }
+}
+sub dirsize {
+ opendir ISEMPTY,$_[0];
+ return scalar(readdir(ISEMPTY))-1;
+}
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+ die "ERROR: working dir $dir already exists\n\n";
+ } else {
+ -e $dir || mkdir $dir;
+ mkdir "$dir/hgs";
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+ mkdir "$dir/scripts";
+ my $cmdfile="$dir/rerun-vest.sh";
+ open CMD,'>',$cmdfile;
+ print CMD "cd ",&getcwd,"\n";
+# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+ my $cline=&cmdline."\n";
+ print CMD $cline;
+ close CMD;
+ print STDERR $cline;
+ chmod(0755,$cmdfile);
+ unless (-e $initialWeights) {
+ print STDERR "Please specify an initial weights file with --initial-weights\n";
+ print_help();
+ exit;
+ }
+ check_call("cp $initialWeights $dir/weights.0");
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ if ($density_prune) {
+ $decoder_cmd .= " --density_prune $density_prune";
+ }
+ my $pcmd;
+ if ($run_local) {
+ $pcmd = "cat $srcFile |";
+ } elsif ($use_make) {
+ # TODO: Throw error when decode_nodes is specified along with use_make
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
+ print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
+ print STDERR unchecked_output("date");
+ $icc++;
+ my $nop=$noprimary?"--no_primary":"";
+ my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
+ my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
+ $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ check_call("mkdir -p $dir/splag.$im1");
+ $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+ my @shards = grep { /^mapinput\./ } readdir(DIR);
+ closedir DIR;
+ die "No shards!" unless scalar @shards > 0;
+ my $joblist = "";
+ my $nmappers = 0;
+ my @mapoutputs = ();
+ @cleanupcmds = ();
+ my %o2i = ();
+ my $first_shard = 1;
+ my $mkfile; # only used with makefiles
+ my $mkfilename;
+ if ($use_make) {
+ $mkfilename = "$dir/splag.$im1/domap.mk";
+ open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+ print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+ }
+ my @mkouts = (); # only used with makefiles
+ for my $shard (@shards) {
+ my $mapoutput = $shard;
+ my $client_name = $shard;
+ $client_name =~ s/mapinput.//;
+ $client_name = "vest.$client_name";
+ $mapoutput =~ s/mapinput/mapoutput/;
+ push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+ $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+ my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+ if ($run_local) {
+ print STDERR "COMMAND:\n$script\n";
+ check_bash_call($script);
+ } elsif ($use_make) {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "#!/bin/bash\n";
+ print F "$script\n";
+ close F;
+ my $output = "$dir/splag.$im1/$mapoutput";
+ push @mkouts, $output;
+ chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+ print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+ } else {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+ $nmappers++;
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $jobid = check_output("$qcmd");
+ chomp $jobid;
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+ print STDERR " $jobid";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ }
+ }
+ if ($run_local) {
+ print STDERR "\nProcessing line search complete.\n";
+ } elsif ($use_make) {
+ print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+ close $mkfile;
+ my $mcmd = "make -j $use_make -f $mkfilename";
+ print STDERR "\nExecuting: $mcmd\n";
+ check_call($mcmd);
+ } else {
+ print STDERR "\nLaunched $nmappers mappers.\n";
+ sleep 8;
+ print STDERR "Waiting for mappers to complete...\n";
+ while ($nmappers > 0) {
+ sleep 5;
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+ $nmappers = scalar @livejobs;
+ }
+ print STDERR "All mappers complete.\n";
+ }
+ my $tol = 0;
+ my $til = 0;
+ for my $mo (@mapoutputs) {
+ my $olines = get_lines($mo);
+ my $ilines = get_lines($o2i{$mo});
+ $tol += $olines;
+ $til += $ilines;
+ die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
+ }
+ print STDERR "Results for $tol/$til lines\n";
+ print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
+ print STDERR unchecked_output("date");
+ $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
+ # sort returns failure even when it doesn't fail for some reason
+ my $best=unchecked_output("$cmd"); chomp $best;
+ print STDERR "$best\n";
+ my ($oa, $x, $xscore) = split /\|/, $best;
+ $score = $xscore;
+ print STDERR "PROJECTED SCORE: $score\n";
+ if (abs($x) < $epsilon) {
+ print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
+ last;
+ }
+ my $psd = $score - $last_score;
+ $last_score = $score;
+ if (abs($psd) < $epsilon) {
+ print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
+ last;
+ }
+ my ($origin, $axis) = split /\s+/, $oa;
+
+ my %ori = convert($origin);
+ my %axi = convert($axis);
+
+ my $finalFile="$dir/weights.$im1-$opt_iter";
+ open W, ">$finalFile" or die "Can't write: $finalFile: $!";
+ my $norm = 0;
+ for my $k (sort keys %ori) {
+ my $dd = $ori{$k} + $axi{$k} * $x;
+ $norm += $dd * $dd;
+ }
+ $norm = sqrt($norm);
+ $norm = 1;
+ for my $k (sort keys %ori) {
+ my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
+ print W "$k $v\n";
+ }
+ check_call("rm $dir/splag.$im1/*");
+ $inweights = $finalFile;
+ }
+ $lastWeightsFile = "$dir/weights.$iteration";
+ check_call("cp $inweights $lastWeightsFile");
+ if ($icc < 2) {
+ print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
+ last;
+ }
+ $lastPScore = $score;
+ $iteration++;
+ print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub normalize_weights {
+ my ($rfn, $rpts, $feat) = @_;
+ my @feat_names = @$rfn;
+ my @pts = @$rpts;
+ my $z = 1.0;
+ for (my $i=0; $i < scalar @feat_names; $i++) {
+ if ($feat_names[$i] eq $feat) {
+ $z = $pts[$i];
+ last;
+ }
+ }
+ for (my $i=0; $i < scalar @feat_names; $i++) {
+ $pts[$i] /= $z;
+ }
+ print STDERR " NORM WEIGHTS: @pts\n";
+ return @pts;
+}
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+ return $lc;
+}
+
+sub get_comma_sep_refs {
+ my ($r,$p) = @_;
+ my $o = check_output("echo $p");
+ chomp $o;
+ my @files = split /\s+/, $o;
+ return "-$r " . join(" -$r ", @files);
+}
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+# subs
+sub write_config {
+ my $fh = shift;
+ my $cleanup = "yes";
+ if ($disable_clean) {$cleanup = "no";}
+
+ print $fh "\n";
+ print $fh "DECODER: $decoder\n";
+ print $fh "INI FILE: $iniFile\n";
+ print $fh "WORKING DIR: $dir\n";
+ print $fh "SOURCE (DEV): $srcFile\n";
+ print $fh "REFS (DEV): $refFiles\n";
+ print $fh "EVAL METRIC: $metric\n";
+ print $fh "START ITERATION: $iteration\n";
+ print $fh "MAX ITERATIONS: $max_iterations\n";
+ print $fh "DECODE NODES: $decode_nodes\n";
+ print $fh "HEAD NODE: $host\n";
+ print $fh "PMEM (DECODING): $pmem\n";
+ print $fh "CLEANUP: $cleanup\n";
+ print $fh "INITIAL WEIGHTS: $initialWeights\n";
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+
+ my $executable = check_output("basename $0"); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable [options] <ini file>
+ Runs a complete MERT optimization and test set decoding, using
+ the decoder configuration in ini file. Note that many of the
+ options have default values that are inferred automatically
+ based on certain conventions. For details, refer to descriptions
+ of the options --decoder, --weights, and --workdir.
+
+Options:
+
+ --local
+ Run the decoder and optimizer locally with a single thread.
+
+ --use-make <I>
+ Use make -j <I> to run the optimizer commands (useful on large
+ shared-memory machines where qsub is unavailable).
+
+ --decode-nodes <I>
+ Number of decoder processes to run in parallel. [default=15]
+
+ --decoder <decoder path>
+ Decoder binary to use.
+
+ --density-prune <N>
+ Limit the density of the hypergraph on each iteration to N times
+ the number of edges on the Viterbi path.
+
+ --help
+ Print this message and exit.
+
+ --iteration <I>
+ Starting iteration number. If not specified, defaults to 1.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to 10.
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs.
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --normalize <feature-name>
+ After each iteration, rescale all feature weights such that feature-
+ name has a weight of 1.0.
+
+ --rand-directions <num>
+ MERT will attempt to optimize along all of the principle directions,
+ set this parameter to explore other directions. Defaults to 5.
+
+ --source-file <file>
+ Dev set source file.
+
+ --weights <file>
+ A file specifying initial feature weights. The format is
+ FeatureName_1 value1
+ FeatureName_2 value2
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, the
+ name is derived from the ini filename. Assuming that the ini
+ filename begins with the decoder name and ends with ini, the default
+ name of the working directory is inferred from the middle part of
+ the filename. E.g. an ini file named decoder.foo.ini would have
+ a default working directory name foo.
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
new file mode 100644
index 00000000..b046cdea
--- /dev/null
+++ b/pro-train/mr_pro_map.cc
@@ -0,0 +1,111 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sampler.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "scorer.h"
+#include "inside_outside.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+
+// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
+
+using namespace std;
+namespace po = boost::program_options;
+
+boost::shared_ptr<MT19937> rng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+ ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("weights,w",po::value<string>(), "[REQD] Current weights file")
+ ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
+ ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
+ ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
+ ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+struct HypInfo {
+ HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-1), x(feats) {}
+ double g() {
+ return g_;
+ }
+ private:
+ int sent_id;
+ vector<WordID> hyp;
+ double g_;
+ public:
+ SparseVector<double> x;
+};
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ if (conf.count("random_seed"))
+ rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ rng.reset(new MT19937);
+ const string loss_function = conf["loss_function"].as<string>();
+ ScoreType type = ScoreTypeFromString(loss_function);
+ DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ istream &in=*in_read.stream();
+ const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+ const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
+ const unsigned xi = conf["best_pairs"].as<unsigned>();
+ while(in) {
+ string line;
+ getline(in, line);
+ if (line.empty()) continue;
+ istringstream is(line);
+ int sent_id;
+ string file;
+ // path-to-file (JSON) sent_id
+ is >> file >> sent_id;
+ ReadFile rf(file);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+ vector<HypInfo> J_i;
+ for (int i = 0; i < kbest_size; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
+ // if (invert_score) sentscore *= -1.0;
+ // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
+ d->feature_values;
+ sentscore;
+ }
+ }
+ return 0;
+}
+
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
new file mode 100644
index 00000000..3df52020
--- /dev/null
+++ b/pro-train/mr_pro_reduce.cc
@@ -0,0 +1,81 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "b64tools.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("loss_function,l",po::value<string>(), "Loss function being optimized")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = conf->count("loss_function") == 0;
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string loss_function = conf["loss_function"].as<string>();
+ ScoreType type = ScoreTypeFromString(loss_function);
+ LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
+ if (type == TER || type == AER) {
+ opt_type = LineOptimizer::MINIMIZE_SCORE;
+ }
+ string last_key;
+ vector<ErrorSurface> esv;
+ while(cin) {
+ string line;
+ getline(cin, line);
+ if (line.empty()) continue;
+ size_t ks = line.find("\t");
+ assert(string::npos != ks);
+ assert(ks > 2);
+ string key = line.substr(2, ks - 2);
+ string val = line.substr(ks + 1);
+ if (key != last_key) {
+ if (!last_key.empty()) {
+ float score;
+ double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ last_key = key;
+ esv.clear();
+ }
+ if (val.size() % 4 != 0) {
+ cerr << "B64 encoding error 1! Skipping.\n";
+ continue;
+ }
+ string encoded(val.size() / 4 * 3, '\0');
+ if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+ cerr << "B64 encoding error 2! Skipping.\n";
+ continue;
+ }
+ esv.push_back(ErrorSurface());
+ esv.back().Deserialize(type, encoded);
+ }
+ if (!esv.empty()) {
+ // cerr << "ESV=" << esv.size() << endl;
+ // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
+ float score;
+ double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ return 0;
+}