summaryrefslogtreecommitdiff
path: root/dpmert
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2012-03-13 09:24:47 +0100
committerPatrick Simianer <p@simianer.de>2012-03-13 09:24:47 +0100
commitc3a9ea64251605532c7954959662643a6a927bb7 (patch)
treefed6048a5acdaf3834740107771c2bc48f26fd4d /dpmert
parent867bca3e5fa0cdd63bf032e5859fb5092d9a4ca1 (diff)
parenta45af4a3704531a8382cd231f6445b3a33b598a3 (diff)
merge with upstream
Diffstat (limited to 'dpmert')
-rw-r--r--dpmert/Makefile.am35
-rw-r--r--dpmert/README.shared-mem9
-rw-r--r--dpmert/ces.cc91
-rw-r--r--dpmert/ces.h16
-rwxr-xr-xdpmert/dpmert.pl700
-rw-r--r--dpmert/error_surface.cc42
-rw-r--r--dpmert/error_surface.h24
-rw-r--r--dpmert/libcall.pl71
-rwxr-xr-xdpmert/line_mediator.pl116
-rw-r--r--dpmert/line_optimizer.cc111
-rw-r--r--dpmert/line_optimizer.h48
-rw-r--r--dpmert/lo_test.cc236
-rw-r--r--dpmert/mert_geometry.cc186
-rw-r--r--dpmert/mert_geometry.h81
-rw-r--r--dpmert/mr_dpmert_generate_mapper_input.cc78
-rw-r--r--dpmert/mr_dpmert_map.cc112
-rw-r--r--dpmert/mr_dpmert_reduce.cc77
-rwxr-xr-xdpmert/parallelize.pl423
-rw-r--r--dpmert/sentclient.c76
-rw-r--r--dpmert/sentserver.c515
-rw-r--r--dpmert/sentserver.h6
-rw-r--r--dpmert/test_aer/README8
-rw-r--r--dpmert/test_aer/cdec.ini3
-rw-r--r--dpmert/test_aer/corpus.src3
-rw-r--r--dpmert/test_aer/grammar12
-rw-r--r--dpmert/test_aer/ref.03
-rw-r--r--dpmert/test_aer/weights13
-rw-r--r--dpmert/test_data/0.json.gzbin0 -> 13709 bytes
-rw-r--r--dpmert/test_data/1.json.gzbin0 -> 204803 bytes
-rw-r--r--dpmert/test_data/c2e.txt.02
-rw-r--r--dpmert/test_data/c2e.txt.12
-rw-r--r--dpmert/test_data/c2e.txt.22
-rw-r--r--dpmert/test_data/c2e.txt.32
-rw-r--r--dpmert/test_data/re.txt.05
-rw-r--r--dpmert/test_data/re.txt.15
-rw-r--r--dpmert/test_data/re.txt.25
-rw-r--r--dpmert/test_data/re.txt.35
37 files changed, 3123 insertions, 0 deletions
diff --git a/dpmert/Makefile.am b/dpmert/Makefile.am
new file mode 100644
index 00000000..2676fb50
--- /dev/null
+++ b/dpmert/Makefile.am
@@ -0,0 +1,35 @@
+bin_PROGRAMS = \
+ mr_dpmert_map \
+ mr_dpmert_reduce \
+ mr_dpmert_generate_mapper_input \
+ sentserver \
+ sentclient
+
+if HAVE_GTEST
+noinst_PROGRAMS = \
+ lo_test
+TESTS = lo_test
+endif
+
+sentserver_SOURCES = sentserver.c
+sentserver_LDFLAGS = -all-static -pthread
+
+sentclient_SOURCES = sentclient.c
+sentclient_LDFLAGS = -all-static -pthread
+
+mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc
+mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+# nbest2hg_SOURCES = nbest2hg.cc
+# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz
+
+mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc
+mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc
+mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc
+lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem
new file mode 100644
index 00000000..7728efc0
--- /dev/null
+++ b/dpmert/README.shared-mem
@@ -0,0 +1,9 @@
+If you want to run dist-vest.pl on a very large shared memory machine, do the
+following:
+
+ ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
+
+This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
+decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
+on the system you are running on and the complexity of the models used for decoding.
+
diff --git a/dpmert/ces.cc b/dpmert/ces.cc
new file mode 100644
index 00000000..a85454da
--- /dev/null
+++ b/dpmert/ces.cc
@@ -0,0 +1,91 @@
+#include "ces.h"
+
+#include <vector>
+#include <sstream>
+#include <boost/shared_ptr.hpp>
+
+// TODO, if AER is to be optimized again, we will need this
+// #include "aligner.h"
+#include "lattice.h"
+#include "mert_geometry.h"
+#include "error_surface.h"
+#include "ns.h"
+
+using boost::shared_ptr;
+using namespace std;
+
+const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
+
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+ const ConvexHull& ve,
+ ErrorSurface* env,
+ const EvaluationMetric* metric,
+ const Hypergraph& hg) {
+ vector<WordID> prev_trans;
+ const vector<shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs();
+ env->resize(ienv.size());
+ SufficientStats prev_score; // defaults to 0
+ int j = 0;
+ for (int i = 0; i < ienv.size(); ++i) {
+ const MERTPoint& seg = *ienv[i];
+ vector<WordID> trans;
+#if 0
+ if (type == AER) {
+ vector<bool> edges(hg.edges_.size(), false);
+ seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi
+ // alignment
+ ostringstream os;
+ const string* psrc = ss.GetSource();
+ if (psrc == NULL) {
+ cerr << "AER scoring in VEST requires source, but it is missing!\n";
+ abort();
+ }
+ size_t pos = psrc->rfind(" ||| ");
+ if (pos == string::npos) {
+ cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl;
+ abort();
+ }
+ Lattice src;
+ Lattice ref;
+ LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src);
+ LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref);
+ AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges);
+ string tstr = os.str();
+ TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
+ } else {
+#endif
+ seg.ConstructTranslation(&trans);
+ //}
+ //cerr << "Scoring: " << TD::GetString(trans) << endl;
+ if (trans == prev_trans) {
+ if (!minimize_segments) {
+ ErrorSegment& out = (*env)[j];
+ out.delta.fields.clear();
+ out.x = seg.x;
+ ++j;
+ }
+ //cerr << "Identical translation, skipping scoring\n";
+ } else {
+ SufficientStats score;
+ ss.Evaluate(trans, &score);
+ // cerr << "score= " << score->ComputeScore() << "\n";
+ //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl;
+ const SufficientStats delta = score - prev_score;
+ //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl;
+ //string xx; delta.Encode(&xx); cerr << xx << endl;
+ prev_trans.swap(trans);
+ prev_score = score;
+ if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {
+ ErrorSegment& out = (*env)[j];
+ out.delta = delta;
+ out.x = seg.x;
+ ++j;
+ }
+ }
+ }
+ // cerr << " In segments: " << ienv.size() << endl;
+ // cerr << "Out segments: " << j << endl;
+ assert(j > 0);
+ env->resize(j);
+}
+
diff --git a/dpmert/ces.h b/dpmert/ces.h
new file mode 100644
index 00000000..e4fa2080
--- /dev/null
+++ b/dpmert/ces.h
@@ -0,0 +1,16 @@
+#ifndef _CES_H_
+#define _CES_H_
+
+class ConvexHull;
+class Hypergraph;
+class SegmentEvaluator;
+class ErrorSurface;
+class EvaluationMetric;
+
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+ const ConvexHull& convex_hull,
+ ErrorSurface* es,
+ const EvaluationMetric* metric,
+ const Hypergraph& hg);
+
+#endif
diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl
new file mode 100755
index 00000000..52ce0fc0
--- /dev/null
+++ b/dpmert/dpmert.pl
@@ -0,0 +1,700 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+
+require "libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input";
+my $MAPPER = "$bin_dir/mr_dpmert_map";
+my $REDUCER = "$bin_dir/mr_dpmert_reduce";
+my $parallelize = "$bin_dir/parallelize.pl";
+my $libcall = "$bin_dir/libcall.pl";
+my $sentserver = "$bin_dir/sentserver";
+my $sentclient = "$bin_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 400;
+my $rand_directions = 15;
+my $iteration = 1;
+my $best_weights;
+my $max_iterations = 15;
+my $optimization_iters = 6;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "9g";
+my $disable_clean = 0;
+my %seen_weights;
+my $normalize;
+my $help = 0;
+my $epsilon = 0.0001;
+my $interval = 5;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $decoderOpt;
+my $noprimary;
+my $maxsim=0;
+my $oraclen=0;
+my $oracleb=20;
+my $bleu_weight=1;
+my $use_make = 1; # use make to parallelize line search
+my $useqsub;
+my $pass_suffix = '';
+my $cpbin=1;
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "decoder=s" => \$decoderOpt,
+ "jobs=i" => \$jobs,
+ "dont-clean" => \$disable_clean,
+ "pass-suffix=s" => \$pass_suffix,
+ "dry-run" => \$dryrun,
+ "epsilon=s" => \$epsilon,
+ "help" => \$help,
+ "interval" => \$interval,
+ "qsub" => \$useqsub,
+ "max-iterations=i" => \$max_iterations,
+ "normalize=s" => \$normalize,
+ "pmem=s" => \$pmem,
+ "cpbin!" => \$cpbin,
+ "random-directions=i" => \$rand_directions,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "weights=s" => \$initialWeights,
+ "workdir=s" => \$dir,
+ "opt-iterations=i" => \$optimization_iters,
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initialWeights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 40;
+} elsif ($metric =~ /^meteor$/i) {
+ $lines_per_mapper = 2000; # start up time is really high
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+ $dir = "dpmert";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+if ($decoderOpt){ $decoder = $decoderOpt; }
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {
+ local $_;
+ my $bindir=shift;
+ check_call("mkdir -p $bindir");
+ -d $bindir || die "couldn't make bindir $bindir";
+ for (@_) {
+ my $src=$$_;
+ $$_="$bindir/".basename($src);
+ check_call("cp -p $src $$_");
+ }
+}
+sub dirsize {
+ opendir ISEMPTY,$_[0];
+ return scalar(readdir(ISEMPTY))-1;
+}
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs
+ die "ERROR: working dir $dir already exists\n\n";
+ } else {
+ -e $dir || mkdir $dir;
+ mkdir "$dir/hgs";
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+ mkdir "$dir/scripts";
+ my $cmdfile="$dir/rerun-dpmert.sh";
+ open CMD,'>',$cmdfile;
+ print CMD "cd ",&getcwd,"\n";
+# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+ my $cline=&cmdline."\n";
+ print CMD $cline;
+ close CMD;
+ print STDERR $cline;
+ chmod(0755,$cmdfile);
+ unless (-e $initialWeights) {
+ print STDERR "Please specify an initial weights file with --initial-weights\n";
+ print_help();
+ exit;
+ }
+ check_call("cp $initialWeights $dir/weights.0");
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ my $pcmd;
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
+ print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
+ print STDERR unchecked_output("date");
+ $icc++;
+ $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ check_call("mkdir -p $dir/splag.$im1");
+ $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+ my @shards = grep { /^mapinput\./ } readdir(DIR);
+ closedir DIR;
+ die "No shards!" unless scalar @shards > 0;
+ my $joblist = "";
+ my $nmappers = 0;
+ my @mapoutputs = ();
+ @cleanupcmds = ();
+ my %o2i = ();
+ my $first_shard = 1;
+ my $mkfile; # only used with makefiles
+ my $mkfilename;
+ if ($use_make) {
+ $mkfilename = "$dir/splag.$im1/domap.mk";
+ open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+ print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+ }
+ my @mkouts = (); # only used with makefiles
+ for my $shard (@shards) {
+ my $mapoutput = $shard;
+ my $client_name = $shard;
+ $client_name =~ s/mapinput.//;
+ $client_name = "dpmert.$client_name";
+ $mapoutput =~ s/mapinput/mapoutput/;
+ push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+ $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+ my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+ if ($use_make) {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "#!/bin/bash\n";
+ print F "$script\n";
+ close F;
+ my $output = "$dir/splag.$im1/$mapoutput";
+ push @mkouts, $output;
+ chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+ print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+ } else {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+ $nmappers++;
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $jobid = check_output("$qcmd");
+ chomp $jobid;
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+ print STDERR " $jobid";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ }
+ }
+ if ($use_make) {
+ print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+ close $mkfile;
+ my $mcmd = "make -j $jobs -f $mkfilename";
+ print STDERR "\nExecuting: $mcmd\n";
+ check_call($mcmd);
+ } else {
+ print STDERR "\nLaunched $nmappers mappers.\n";
+ sleep 8;
+ print STDERR "Waiting for mappers to complete...\n";
+ while ($nmappers > 0) {
+ sleep 5;
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+ $nmappers = scalar @livejobs;
+ }
+ print STDERR "All mappers complete.\n";
+ }
+ my $tol = 0;
+ my $til = 0;
+ for my $mo (@mapoutputs) {
+ my $olines = get_lines($mo);
+ my $ilines = get_lines($o2i{$mo});
+ $tol += $olines;
+ $til += $ilines;
+ die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
+ }
+ print STDERR "Results for $tol/$til lines\n";
+ print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
+ print STDERR unchecked_output("date");
+ $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
+ # sort returns failure even when it doesn't fail for some reason
+ my $best=unchecked_output("$cmd"); chomp $best;
+ print STDERR "$best\n";
+ my ($oa, $x, $xscore) = split /\|/, $best;
+ $score = $xscore;
+ print STDERR "PROJECTED SCORE: $score\n";
+ if (abs($x) < $epsilon) {
+ print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
+ last;
+ }
+ my $psd = $score - $last_score;
+ $last_score = $score;
+ if (abs($psd) < $epsilon) {
+ print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
+ last;
+ }
+ my ($origin, $axis) = split /\s+/, $oa;
+
+ my %ori = convert($origin);
+ my %axi = convert($axis);
+
+ my $finalFile="$dir/weights.$im1-$opt_iter";
+ open W, ">$finalFile" or die "Can't write: $finalFile: $!";
+ my $norm = 0;
+ for my $k (sort keys %ori) {
+ my $dd = $ori{$k} + $axi{$k} * $x;
+ $norm += $dd * $dd;
+ }
+ $norm = sqrt($norm);
+ $norm = 1;
+ for my $k (sort keys %ori) {
+ my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
+ print W "$k $v\n";
+ }
+ check_call("rm $dir/splag.$im1/*");
+ $inweights = $finalFile;
+ }
+ $lastWeightsFile = "$dir/weights.$iteration";
+ check_call("cp $inweights $lastWeightsFile");
+ if ($icc < 2) {
+ print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
+ last;
+ }
+ $lastPScore = $score;
+ $iteration++;
+ print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub normalize_weights {
+ my ($rfn, $rpts, $feat) = @_;
+ my @feat_names = @$rfn;
+ my @pts = @$rpts;
+ my $z = 1.0;
+ for (my $i=0; $i < scalar @feat_names; $i++) {
+ if ($feat_names[$i] eq $feat) {
+ $z = $pts[$i];
+ last;
+ }
+ }
+ for (my $i=0; $i < scalar @feat_names; $i++) {
+ $pts[$i] /= $z;
+ }
+ print STDERR " NORM WEIGHTS: @pts\n";
+ return @pts;
+}
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+ return $lc;
+}
+
+sub get_comma_sep_refs {
+ my ($r,$p) = @_;
+ my $o = check_output("echo $p");
+ chomp $o;
+ my @files = split /\s+/, $o;
+ return "-$r " . join(" -$r ", @files);
+}
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+# subs
+sub write_config {
+ my $fh = shift;
+ my $cleanup = "yes";
+ if ($disable_clean) {$cleanup = "no";}
+
+ print $fh "\n";
+ print $fh "DECODER: $decoder\n";
+ print $fh "INI FILE: $iniFile\n";
+ print $fh "WORKING DIR: $dir\n";
+ print $fh "SOURCE (DEV): $srcFile\n";
+ print $fh "REFS (DEV): $refFiles\n";
+ print $fh "EVAL METRIC: $metric\n";
+ print $fh "START ITERATION: $iteration\n";
+ print $fh "MAX ITERATIONS: $max_iterations\n";
+ print $fh "PARALLEL JOBS: $jobs\n";
+ print $fh "HEAD NODE: $host\n";
+ print $fh "PMEM (DECODING): $pmem\n";
+ print $fh "CLEANUP: $cleanup\n";
+ print $fh "INITIAL WEIGHTS: $initialWeights\n";
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+
+ my $executable = check_output("basename $0"); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable [options] <ini file>
+ Runs a complete MERT optimization using the decoder configuration
+ in <ini file>. Required options are --weights, --source-file, and
+ --ref-files.
+
+Options:
+
+ --help
+ Print this message and exit.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to 10.
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --normalize <feature-name>
+ After each iteration, rescale all feature weights such that feature-
+ name has a weight of 1.0.
+
+ --rand-directions <num>
+ MERT will attempt to optimize along all of the principle directions,
+ set this parameter to explore other directions. Defaults to 5.
+
+ --source-file <file>
+ Dev set source file.
+
+ --weights <file>
+ A file specifying initial feature weights. The format is
+ FeatureName_1 value1
+ FeatureName_2 value2
+ **All and only the weights listed in <file> will be optimized!**
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, the
+ name is derived from the ini filename. Assuming that the ini
+ filename begins with the decoder name and ends with ini, the default
+ name of the working directory is inferred from the middle part of
+ the filename. E.g. an ini file named decoder.foo.ini would have
+ a default working directory name foo.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/dpmert/error_surface.cc b/dpmert/error_surface.cc
new file mode 100644
index 00000000..515b67f8
--- /dev/null
+++ b/dpmert/error_surface.cc
@@ -0,0 +1,42 @@
+#include "error_surface.h"
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+ErrorSurface::~ErrorSurface() {}
+
+void ErrorSurface::Serialize(std::string* out) const {
+ const int segments = this->size();
+ ostringstream os(ios::binary);
+ os.write((const char*)&segments,sizeof(segments));
+ for (int i = 0; i < segments; ++i) {
+ const ErrorSegment& cur = (*this)[i];
+ string senc;
+ cur.delta.Encode(&senc);
+ assert(senc.size() < 1024);
+ unsigned char len = senc.size();
+ os.write((const char*)&cur.x, sizeof(cur.x));
+ os.write((const char*)&len, sizeof(len));
+ os.write((const char*)&senc[0], len);
+ }
+ *out = os.str();
+}
+
+void ErrorSurface::Deserialize(const std::string& in) {
+ istringstream is(in, ios::binary);
+ int segments;
+ is.read((char*)&segments, sizeof(segments));
+ this->resize(segments);
+ for (int i = 0; i < segments; ++i) {
+ ErrorSegment& cur = (*this)[i];
+ unsigned char len;
+ is.read((char*)&cur.x, sizeof(cur.x));
+ is.read((char*)&len, sizeof(len));
+ string senc(len, '\0'); assert(senc.size() == len);
+ is.read((char*)&senc[0], len);
+ cur.delta = SufficientStats(senc);
+ }
+}
+
diff --git a/dpmert/error_surface.h b/dpmert/error_surface.h
new file mode 100644
index 00000000..bb65847b
--- /dev/null
+++ b/dpmert/error_surface.h
@@ -0,0 +1,24 @@
+#ifndef _ERROR_SURFACE_H_
+#define _ERROR_SURFACE_H_
+
+#include <vector>
+#include <string>
+
+#include "ns.h"
+
+class Score;
+
+struct ErrorSegment {
+ double x;
+ SufficientStats delta;
+ ErrorSegment() : x(0), delta() {}
+};
+
+class ErrorSurface : public std::vector<ErrorSegment> {
+ public:
+ ~ErrorSurface();
+ void Serialize(std::string* out) const;
+ void Deserialize(const std::string& in);
+};
+
+#endif
diff --git a/dpmert/libcall.pl b/dpmert/libcall.pl
new file mode 100644
index 00000000..c7d0f128
--- /dev/null
+++ b/dpmert/libcall.pl
@@ -0,0 +1,71 @@
+use IPC::Open3;
+use Symbol qw(gensym);
+
+$DUMMY_STDERR = gensym();
+$DUMMY_STDIN = gensym();
+
+# Run the command and ignore failures
+sub unchecked_call {
+ system("@_")
+}
+
+# Run the command and return its output, if any ignoring failures
+sub unchecked_output {
+ return `@_`
+}
+
+# WARNING: Do not use this for commands that will return large amounts
+# of stdout or stderr -- they might block indefinitely
+sub check_output {
+ print STDERR "Executing and gathering output: @_\n";
+
+ my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_);
+ my $proc_output = "";
+ while( <PH> ) {
+ $proc_output .= $_;
+ }
+ waitpid($pid, 0);
+ # TODO: Grab signal that the process died from
+ my $child_exit_status = $? >> 8;
+ if($child_exit_status == 0) {
+ return $proc_output;
+ } else {
+ print STDERR "ERROR: Execution of @_ failed.\n";
+ exit(1);
+ }
+}
+
+# Based on Moses' safesystem sub
+sub check_call {
+ print STDERR "Executing: @_\n";
+ system(@_);
+ my $exitcode = $? >> 8;
+ if($exitcode == 0) {
+ return 0;
+ } elsif ($? == -1) {
+ print STDERR "ERROR: Failed to execute: @_\n $!\n";
+ exit(1);
+
+ } elsif ($? & 127) {
+ printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+ ($? & 127), ($? & 128) ? 'with' : 'without';
+ exit(1);
+
+ } else {
+ print STDERR "Failed with exit code: $exitcode\n" if $exitcode;
+ exit($exitcode);
+ }
+}
+
+sub check_bash_call {
+ my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
+ check_call(@args);
+}
+
+sub check_bash_output {
+ my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
+ return check_output(@args);
+}
+
+# perl module weirdness...
+return 1;
diff --git a/dpmert/line_mediator.pl b/dpmert/line_mediator.pl
new file mode 100755
index 00000000..bc2bb24c
--- /dev/null
+++ b/dpmert/line_mediator.pl
@@ -0,0 +1,116 @@
+#!/usr/bin/perl -w
+#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication
+
+# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver)
+
+#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism.
+
+use strict;
+use IPC::Open2;
+use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO);
+
+my $quiet=!$ENV{DEBUG};
+$quiet=1 if $ENV{QUIET};
+sub info {
+ local $,=' ';
+ print STDERR @_ unless $quiet;
+}
+
+my $mode='CROSS';
+my $ser='DIRECT';
+$mode='PIPE' if $ENV{PIPE};
+$mode='SNAKE' if $ENV{SNAKE};
+$mode='CROSS' if $ENV{CROSS};
+$ser='SERIAL' if $ENV{SERIAL};
+$ser='DIRECT' if $ENV{DIRECT};
+$ser='SERIAL' if $mode eq 'SNAKE';
+info("mode: $mode\n");
+info("connection: $ser\n");
+
+
+my @c1;
+if (scalar @ARGV) {
+ do {
+ push @c1,shift
+ } while scalar @ARGV && $c1[$#c1] ne '--';
+}
+pop @c1;
+my @c2=@ARGV;
+@ARGV=();
+(scalar @c1 && scalar @c2) || die qq{
+usage: $0 cmd1 args -- cmd2 args
+all options are environment variables.
+DEBUG=1 env var enables debugging output.
+CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork).
+SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG.
+if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout.
+if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2.
+DIRECT=1 (default) will override SERIAL=1.
+CROSS=1 (default) will override SNAKE or PIPE.
+};
+
+info("1 cmd:",@c1,"\n");
+info("2 cmd:",@c2,"\n");
+
+sub lineto {
+ select $_[0];
+ $|=1;
+ shift;
+ print @_;
+}
+
+if ($ser eq 'SERIAL') {
+ my ($R1,$W1,$R2,$W2);
+ my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3.
+ my $c2p=open2($R2,$W2,@c2);
+ if ($mode eq 'CROSS') {
+ while(<$R1>) {
+ info("1:",$_);
+ lineto($W2,$_);
+ last unless defined ($_=<$R2>);
+ info("1|2:",$_);
+ lineto($W1,$_);
+ }
+ } else {
+ my $snake=$mode eq 'SNAKE';
+ while(<STDIN>) {
+ info("IN:",$_);
+ lineto($W1,$_);
+ last unless defined ($_=<$R1>);
+ info("IN|1:",$_);
+ lineto($W2,$_);
+ last unless defined ($_=<$R2>);
+ info("IN|1|2:",$_);
+ if ($snake) {
+ lineto($W1,$_);
+ last unless defined ($_=<$R1>);
+ info("IN|1|2|1:",$_);
+ }
+ lineto(*STDOUT,$_);
+ }
+ }
+} else {
+ info("DIRECT mode\n");
+ my @rw1=POSIX::pipe();
+ my @rw2=POSIX::pipe();
+ my $pid=undef;
+ $SIG{CHLD} = sub { wait };
+ while (not defined ($pid=fork())) {
+ sleep 1;
+ }
+ my $pipe = $mode eq 'PIPE';
+ unless ($pipe) {
+ POSIX::close(STDOUT_FILENO);
+ POSIX::close(STDIN_FILENO);
+ }
+ if ($pid) {
+ POSIX::dup2($rw1[1],STDOUT_FILENO);
+ POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe;
+ exec @c1;
+ } else {
+ POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe;
+ POSIX::dup2($rw1[0],STDIN_FILENO);
+ exec @c2;
+ }
+ while (wait()!=-1) {}
+}
diff --git a/dpmert/line_optimizer.cc b/dpmert/line_optimizer.cc
new file mode 100644
index 00000000..49443fbe
--- /dev/null
+++ b/dpmert/line_optimizer.cc
@@ -0,0 +1,111 @@
+#include "line_optimizer.h"
+
+#include <limits>
+#include <algorithm>
+
+#include "sparse_vector.h"
+#include "ns.h"
+
+using namespace std;
+
+typedef ErrorSurface::const_iterator ErrorIter;
+
+// sort by increasing x-ints
+struct IntervalComp {
+ bool operator() (const ErrorIter& a, const ErrorIter& b) const {
+ return a->x < b->x;
+ }
+};
+
+double LineOptimizer::LineOptimize(
+ const EvaluationMetric* metric,
+ const vector<ErrorSurface>& surfaces,
+ const LineOptimizer::ScoreType type,
+ float* best_score,
+ const double epsilon) {
+ // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl;
+ vector<ErrorIter> all_ints;
+ for (vector<ErrorSurface>::const_iterator i = surfaces.begin();
+ i != surfaces.end(); ++i) {
+ const ErrorSurface& surface = *i;
+ for (ErrorIter j = surface.begin(); j != surface.end(); ++j)
+ all_ints.push_back(j);
+ }
+ sort(all_ints.begin(), all_ints.end(), IntervalComp());
+ double last_boundary = all_ints.front()->x;
+ SufficientStats acc;
+ float& cur_best_score = *best_score;
+ cur_best_score = (type == MAXIMIZE_SCORE ?
+ -numeric_limits<float>::max() : numeric_limits<float>::max());
+ bool left_edge = true;
+ double pos = numeric_limits<double>::quiet_NaN();
+ for (vector<ErrorIter>::iterator i = all_ints.begin();
+ i != all_ints.end(); ++i) {
+ const ErrorSegment& seg = **i;
+ if (seg.x - last_boundary > epsilon) {
+ float sco = metric->ComputeScore(acc);
+ if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+ (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+ cur_best_score = sco;
+ if (left_edge) {
+ pos = seg.x - 0.1;
+ left_edge = false;
+ } else {
+ pos = last_boundary + (seg.x - last_boundary) / 2;
+ }
+ //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n";
+ }
+ // string xx = metric->DetailedScore(acc); cerr << "---- " << xx;
+ // cerr << "---- s=" << sco << "\n";
+ last_boundary = seg.x;
+ }
+ // cerr << "x-boundary=" << seg.x << "\n";
+ //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl;
+ //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl;
+ acc += seg.delta;
+ }
+ float sco = metric->ComputeScore(acc);
+ if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+ (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+ cur_best_score = sco;
+ if (left_edge) {
+ pos = 0;
+ } else {
+ pos = last_boundary + 1000.0;
+ }
+ }
+ return pos;
+}
+
+void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize,
+ SparseVector<double>* axis,
+ RandomNumberGenerator<boost::mt19937>* rng) {
+ axis->clear();
+ for (int i = 0; i < features_to_optimize.size(); ++i)
+ axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0));
+ (*axis) /= axis->l2norm();
+}
+
+void LineOptimizer::CreateOptimizationDirections(
+ const vector<int>& features_to_optimize,
+ int additional_random_directions,
+ RandomNumberGenerator<boost::mt19937>* rng,
+ vector<SparseVector<double> >* dirs
+ , bool include_orthogonal
+ ) {
+ dirs->clear();
+ typedef SparseVector<double> Dir;
+ vector<Dir> &out=*dirs;
+ int i=0;
+ if (include_orthogonal)
+ for (;i<features_to_optimize.size();++i) {
+ Dir d;
+ d.set_value(features_to_optimize[i],1.);
+ out.push_back(d);
+ }
+ out.resize(i+additional_random_directions);
+ for (;i<out.size();++i)
+ RandomUnitVector(features_to_optimize, &out[i], rng);
+ cerr << "Generated " << out.size() << " total axes to optimize along.\n";
+}
+
diff --git a/dpmert/line_optimizer.h b/dpmert/line_optimizer.h
new file mode 100644
index 00000000..83819f41
--- /dev/null
+++ b/dpmert/line_optimizer.h
@@ -0,0 +1,48 @@
+#ifndef LINE_OPTIMIZER_H_
+#define LINE_OPTIMIZER_H_
+
+#include <vector>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "sampler.h"
+
+class EvaluationMetric;
+class Weights;
+
+struct LineOptimizer {
+
+ // use MINIMIZE_SCORE for things like TER, WER
+ // MAXIMIZE_SCORE for things like BLEU
+ enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE };
+
+ // merge all the error surfaces together into a global
+ // error surface and find (the middle of) the best segment
+ static double LineOptimize(
+ const EvaluationMetric* metric,
+ const std::vector<ErrorSurface>& envs,
+ const LineOptimizer::ScoreType type,
+ float* best_score,
+ const double epsilon = 1.0/65536.0);
+
+ // return a random vector of length 1 where all dimensions
+ // not listed in dimensions will be 0.
+ static void RandomUnitVector(const std::vector<int>& dimensions,
+ SparseVector<double>* axis,
+ RandomNumberGenerator<boost::mt19937>* rng);
+
+ // generate a list of directions to optimize; the list will
+ // contain the orthogonal vectors corresponding to the dimensions in
+ // primary and then additional_random_directions directions in those
+ // dimensions as well. All vectors will be length 1.
+ static void CreateOptimizationDirections(
+ const std::vector<int>& primary,
+ int additional_random_directions,
+ RandomNumberGenerator<boost::mt19937>* rng,
+ std::vector<SparseVector<double> >* dirs
+ , bool include_primary=true
+ );
+
+};
+
+#endif
diff --git a/dpmert/lo_test.cc b/dpmert/lo_test.cc
new file mode 100644
index 00000000..d9b909b8
--- /dev/null
+++ b/dpmert/lo_test.cc
@@ -0,0 +1,236 @@
+#include <cmath>
+#include <iostream>
+#include <fstream>
+
+#include <boost/shared_ptr.hpp>
+#include <gtest/gtest.h>
+
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "ces.h"
+#include "fdict.h"
+#include "hg.h"
+#include "kbest.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "inside_outside.h"
+#include "viterbi.h"
+#include "mert_geometry.h"
+#include "line_optimizer.h"
+
+using namespace std;
+using boost::shared_ptr;
+
+class OptTest : public testing::Test {
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+};
+
+const char* ref11 = "australia reopens embassy in manila";
+const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .";
+const char* ref21 = "australia reopened manila embassy";
+const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .";
+const char* ref31 = "australia to reopen embassy in manila";
+const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats .";
+const char* ref41 = "australia to re - open its embassy to manila";
+const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago .";
+
+TEST_F(OptTest, TestCheckNaN) {
+ double x = 0;
+ double y = 0;
+ double z = x / y;
+ EXPECT_EQ(true, isnan(z));
+}
+
+TEST_F(OptTest,TestConvexHull) {
+ shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0));
+ shared_ptr<MERTPoint> b1(new MERTPoint(1, 0));
+ shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1));
+ shared_ptr<MERTPoint> b2(new MERTPoint(1, -1));
+ vector<shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1);
+ vector<shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2);
+ ConvexHull a(sa);
+ cerr << a << endl;
+ ConvexHull b(sb);
+ ConvexHull c = a;
+ c *= b;
+ cerr << a << " (*) " << b << " = " << c << endl;
+ EXPECT_EQ(3, c.size());
+}
+
+TEST_F(OptTest,TestConvexHullInside) {
+ const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}";
+ Hypergraph hg;
+ istringstream instr(json);
+ HypergraphIO::ReadFromJSON(&instr, &hg);
+ SparseVector<double> wts;
+ wts.set_value(FD::Convert("f1"), 0.4);
+ wts.set_value(FD::Convert("f2"), 1.0);
+ hg.Reweight(wts);
+ vector<pair<vector<WordID>, prob_t> > list;
+ std::vector<SparseVector<double> > features;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+ SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0);
+ ConvexHullWeightFunction wf(wts, dir);
+ ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+ cerr << env << endl;
+ const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs();
+ dir *= segs[1]->x;
+ wts += dir;
+ hg.Reweight(wts);
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest2.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+ for (int i = 0; i < segs.size(); ++i) {
+ cerr << "seg=" << i << endl;
+ vector<WordID> trans;
+ segs[i]->ConstructTranslation(&trans);
+ cerr << TD::GetString(trans) << endl;
+ }
+}
+
+TEST_F(OptTest, TestS1) {
+ int fPhraseModel_0 = FD::Convert("PhraseModel_0");
+ int fPhraseModel_1 = FD::Convert("PhraseModel_1");
+ int fPhraseModel_2 = FD::Convert("PhraseModel_2");
+ int fLanguageModel = FD::Convert("LanguageModel");
+ int fWordPenalty = FD::Convert("WordPenalty");
+ int fPassThrough = FD::Convert("PassThrough");
+ SparseVector<double> wts;
+ wts.set_value(fWordPenalty, 4.25);
+ wts.set_value(fLanguageModel, -1.1165);
+ wts.set_value(fPhraseModel_0, -0.96);
+ wts.set_value(fPhraseModel_1, -0.65);
+ wts.set_value(fPhraseModel_2, -0.77);
+ wts.set_value(fPassThrough, -10.0);
+
+ vector<int> to_optimize;
+ to_optimize.push_back(fWordPenalty);
+ to_optimize.push_back(fLanguageModel);
+ to_optimize.push_back(fPhraseModel_0);
+ to_optimize.push_back(fPhraseModel_1);
+ to_optimize.push_back(fPhraseModel_2);
+
+ Hypergraph hg;
+ ReadFile rf("./test_data/0.json.gz");
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(wts);
+
+ Hypergraph hg2;
+ ReadFile rf2("./test_data/1.json.gz");
+ HypergraphIO::ReadFromJSON(rf2.stream(), &hg2);
+ hg2.Reweight(wts);
+
+ vector<vector<WordID> > refs1(4);
+ TD::ConvertSentence(ref11, &refs1[0]);
+ TD::ConvertSentence(ref21, &refs1[1]);
+ TD::ConvertSentence(ref31, &refs1[2]);
+ TD::ConvertSentence(ref41, &refs1[3]);
+ vector<vector<WordID> > refs2(4);
+ TD::ConvertSentence(ref12, &refs2[0]);
+ TD::ConvertSentence(ref22, &refs2[1]);
+ TD::ConvertSentence(ref32, &refs2[2]);
+ TD::ConvertSentence(ref42, &refs2[3]);
+ vector<ConvexHull> envs(2);
+
+ RandomNumberGenerator<boost::mt19937> rng;
+
+ vector<SparseVector<double> > axes; // directions to search
+ LineOptimizer::CreateOptimizationDirections(
+ to_optimize,
+ 10,
+ &rng,
+ &axes);
+ assert(axes.size() == 10 + to_optimize.size());
+ for (int i = 0; i < axes.size(); ++i)
+ cerr << axes[i] << endl;
+ const SparseVector<double>& axis = axes[0];
+
+ cerr << "Computing Viterbi envelope using inside algorithm...\n";
+ cerr << "axis: " << axis << endl;
+ clock_t t_start=clock();
+ ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction
+ envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+ envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf);
+
+ vector<ErrorSurface> es(2);
+ EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+ boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1);
+ boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+ ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2);
+ cerr << envs[0].size() << " " << envs[1].size() << endl;
+ cerr << es[0].size() << " " << es[1].size() << endl;
+ envs.clear();
+ clock_t t_env=clock();
+ float score;
+ double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score);
+ clock_t t_opt=clock();
+ cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
+ EXPECT_FLOAT_EQ(0.48719698, score);
+ SparseVector<double> res = axis;
+ res *= m;
+ res += wts;
+ cerr << "res: " << res << endl;
+ cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl;
+ cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl;
+ hg.Reweight(res);
+ hg2.Reweight(res);
+ vector<WordID> t1,t2;
+ ViterbiESentence(hg, &t1);
+ ViterbiESentence(hg2, &t2);
+ cerr << TD::GetString(t1) << endl;
+ cerr << TD::GetString(t2) << endl;
+}
+
+TEST_F(OptTest,TestZeroOrigin) {
+ const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}";
+ Hypergraph hg;
+ istringstream instr(json);
+ HypergraphIO::ReadFromJSON(&instr, &hg);
+ SparseVector<double> wts;
+ wts.set_value(FD::Convert("PassThrough"), -0.929201533002898);
+ hg.Reweight(wts);
+
+ vector<pair<vector<WordID>, prob_t> > list;
+ std::vector<SparseVector<double> > features;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+
+ SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0);
+ ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction
+ vector<ConvexHull> envs(1);
+ envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+
+ vector<vector<WordID> > mr(4);
+ TD::ConvertSentence("untitled", &mr[0]);
+ TD::ConvertSentence("with no title", &mr[1]);
+ TD::ConvertSentence("without a title", &mr[2]);
+ TD::ConvertSentence("without title", &mr[3]);
+ EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+ boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr);
+ vector<ErrorSurface> es(1);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
diff --git a/dpmert/mert_geometry.cc b/dpmert/mert_geometry.cc
new file mode 100644
index 00000000..81b25af9
--- /dev/null
+++ b/dpmert/mert_geometry.cc
@@ -0,0 +1,186 @@
+#include "mert_geometry.h"
+
+#include <cassert>
+#include <limits>
+
+using namespace std;
+using boost::shared_ptr;
+
+ConvexHull::ConvexHull(int i) {
+ if (i == 0) {
+ // do nothing - <>
+ } else if (i == 1) {
+ points.push_back(shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, shared_ptr<MERTPoint>(), shared_ptr<MERTPoint>())));
+ assert(this->IsMultiplicativeIdentity());
+ } else {
+ cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n";
+ abort();
+ }
+}
+
+const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const {
+ const double m = direction.dot(e.feature_values_);
+ const double b = origin.dot(e.feature_values_);
+ MERTPoint* point = new MERTPoint(m, b, e);
+ return ConvexHull(1, point);
+}
+
+ostream& operator<<(ostream& os, const ConvexHull& env) {
+ os << '<';
+ const vector<shared_ptr<MERTPoint> >& points = env.GetSortedSegs();
+ for (int i = 0; i < points.size(); ++i)
+ os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2;
+ return os << '>';
+}
+
+#define ORIGINAL_MERT_IMPLEMENTATION 1
+#ifdef ORIGINAL_MERT_IMPLEMENTATION
+
+struct SlopeCompare {
+ bool operator() (const shared_ptr<MERTPoint>& a, const shared_ptr<MERTPoint>& b) const {
+ return a->m < b->m;
+ }
+};
+
+const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) {
+ if (!other.is_sorted) other.Sort();
+ if (points.empty()) {
+ points = other.points;
+ return *this;
+ }
+ is_sorted = false;
+ int j = points.size();
+ points.resize(points.size() + other.points.size());
+ for (int i = 0; i < other.points.size(); ++i)
+ points[j++] = other.points[i];
+ assert(j == points.size());
+ return *this;
+}
+
+void ConvexHull::Sort() const {
+ sort(points.begin(), points.end(), SlopeCompare());
+ const int k = points.size();
+ int j = 0;
+ for (int i = 0; i < k; ++i) {
+ MERTPoint l = *points[i];
+ l.x = kMinusInfinity;
+ // cerr << "m=" << l.m << endl;
+ if (0 < j) {
+ if (points[j-1]->m == l.m) { // lines are parallel
+ if (l.b <= points[j-1]->b) continue;
+ --j;
+ }
+ while(0 < j) {
+ l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m);
+ if (points[j-1]->x < l.x) break;
+ --j;
+ }
+ if (0 == j) l.x = kMinusInfinity;
+ }
+ *points[j++] = l;
+ }
+ points.resize(j);
+ is_sorted = true;
+}
+
+const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) {
+ if (other.IsMultiplicativeIdentity()) { return *this; }
+ if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; }
+
+ if (!is_sorted) Sort();
+ if (!other.is_sorted) other.Sort();
+
+ if (this->IsEdgeEnvelope()) {
+// if (other.size() > 1)
+// cerr << *this << " (TIMES) " << other << endl;
+ shared_ptr<MERTPoint> edge_parent = points[0];
+ const double& edge_b = edge_parent->b;
+ const double& edge_m = edge_parent->m;
+ points.clear();
+ for (int i = 0; i < other.points.size(); ++i) {
+ const MERTPoint& p = *other.points[i];
+ const double m = p.m + edge_m;
+ const double b = p.b + edge_b;
+ const double& x = p.x; // x's don't change with *
+ points.push_back(shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i])));
+ assert(points.back()->p1->edge);
+ }
+// if (other.size() > 1)
+// cerr << " = " << *this << endl;
+ } else {
+ vector<shared_ptr<MERTPoint> > new_points;
+ int this_i = 0;
+ int other_i = 0;
+ const int this_size = points.size();
+ const int other_size = other.points.size();
+ double cur_x = kMinusInfinity; // moves from left to right across the
+ // real numbers, stopping for all inter-
+ // sections
+ double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity);
+ double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity);
+ while (this_i < this_size && other_i < other_size) {
+ const MERTPoint& this_point = *points[this_i];
+ const MERTPoint& other_point= *other.points[other_i];
+ const double m = this_point.m + other_point.m;
+ const double b = this_point.b + other_point.b;
+
+ new_points.push_back(shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i])));
+ int comp = 0;
+ if (this_next_val < other_next_val) comp = -1; else
+ if (this_next_val > other_next_val) comp = 1;
+ if (0 == comp) { // the next values are equal, advance both indices
+ ++this_i;
+ ++other_i;
+ cur_x = this_next_val; // could be other_next_val (they're equal!)
+ this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity);
+ other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
+ } else { // advance the i with the lower x, update cur_x
+ if (-1 == comp) {
+ ++this_i;
+ cur_x = this_next_val;
+ this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity);
+ } else {
+ ++other_i;
+ cur_x = other_next_val;
+ other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
+ }
+ }
+ }
+ points.swap(new_points);
+ }
+ //cerr << "Multiply: result=" << (*this) << endl;
+ return *this;
+}
+
+// recursively construct translation
+void MERTPoint::ConstructTranslation(vector<WordID>* trans) const {
+ const MERTPoint* cur = this;
+ vector<vector<WordID> > ant_trans;
+ while(!cur->edge) {
+ ant_trans.resize(ant_trans.size() + 1);
+ cur->p2->ConstructTranslation(&ant_trans.back());
+ cur = cur->p1.get();
+ }
+ size_t ant_size = ant_trans.size();
+ vector<const vector<WordID>*> pants(ant_size);
+ assert(ant_size == cur->edge->tail_nodes_.size());
+ --ant_size;
+ for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i];
+ cur->edge->rule_->ESubstitute(pants, trans);
+}
+
+void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const {
+ if (edge) {
+ assert(edge->id_ < edges_used->size());
+ (*edges_used)[edge->id_] = true;
+ }
+ if (p1) p1->CollectEdgesUsed(edges_used);
+ if (p2) p2->CollectEdgesUsed(edges_used);
+}
+
+#else
+
+// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS
+
+#endif
+
diff --git a/dpmert/mert_geometry.h b/dpmert/mert_geometry.h
new file mode 100644
index 00000000..a8b6959e
--- /dev/null
+++ b/dpmert/mert_geometry.h
@@ -0,0 +1,81 @@
+#ifndef _MERT_GEOMETRY_H_
+#define _MERT_GEOMETRY_H_
+
+#include <vector>
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+
+#include "hg.h"
+#include "sparse_vector.h"
+
+static const double kMinusInfinity = -std::numeric_limits<double>::infinity();
+static const double kPlusInfinity = std::numeric_limits<double>::infinity();
+
+struct MERTPoint {
+ MERTPoint() : x(), m(), b(), edge() {}
+ MERTPoint(double _m, double _b) :
+ x(kMinusInfinity), m(_m), b(_b), edge() {}
+ MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) :
+ x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {}
+ MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) :
+ x(kMinusInfinity), m(_m), b(_b), edge(&edge) {}
+
+ double x; // x intersection with previous segment in env, or -inf if none
+ double m; // this line's slope
+ double b; // intercept with y-axis
+
+ // we keep a pointer to the "parents" of this segment so we can reconstruct
+ // the Viterbi translation corresponding to this segment
+ boost::shared_ptr<MERTPoint> p1;
+ boost::shared_ptr<MERTPoint> p2;
+
+ // only MERTPoints created from an edge using the ConvexHullWeightFunction
+ // have rules
+ // TRulePtr rule;
+ const Hypergraph::Edge* edge;
+
+ // recursively recover the Viterbi translation that will result from setting
+ // the weights to origin + axis * x, where x is any value from this->x up
+ // until the next largest x in the containing ConvexHull
+ void ConstructTranslation(std::vector<WordID>* trans) const;
+ void CollectEdgesUsed(std::vector<bool>* edges_used) const;
+};
+
+// this is the semiring value type,
+// it defines constructors for 0, 1, and the operations + and *
+struct ConvexHull {
+ // create semiring zero
+ ConvexHull() : is_sorted(true) {} // zero
+ // for debugging:
+ ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); }
+ // create semiring 1 or 0
+ explicit ConvexHull(int i);
+ ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {}
+ const ConvexHull& operator+=(const ConvexHull& other);
+ const ConvexHull& operator*=(const ConvexHull& other);
+ bool IsMultiplicativeIdentity() const {
+ return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); }
+ const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const {
+ if (!is_sorted) Sort();
+ return points;
+ }
+ size_t size() const { return points.size(); }
+
+ private:
+ bool IsEdgeEnvelope() const {
+ return points.size() == 1 && points[0]->edge; }
+ void Sort() const;
+ mutable bool is_sorted;
+ mutable std::vector<boost::shared_ptr<MERTPoint> > points;
+};
+std::ostream& operator<<(std::ostream& os, const ConvexHull& env);
+
+struct ConvexHullWeightFunction {
+ ConvexHullWeightFunction(const SparseVector<double>& ori,
+ const SparseVector<double>& dir) : origin(ori), direction(dir) {}
+ const ConvexHull operator()(const Hypergraph::Edge& e) const;
+ const SparseVector<double> origin;
+ const SparseVector<double> direction;
+};
+
+#endif
diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/dpmert/mr_dpmert_generate_mapper_input.cc
new file mode 100644
index 00000000..59d4f24f
--- /dev/null
+++ b/dpmert/mr_dpmert_generate_mapper_input.cc
@@ -0,0 +1,78 @@
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "line_optimizer.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+ ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+ ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
+ }
+ if (conf->count("weights") == 0) {
+ cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+ flag = true;
+ }
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r <DIR>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ RandomNumberGenerator<boost::mt19937> rng;
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ vector<string> features;
+ SparseVector<weight_t> origin;
+ vector<weight_t> w;
+ Weights::InitFromFile(conf["weights"].as<string>(), &w, &features);
+ Weights::InitSparseVector(w, &origin);
+ const string forest_repository = conf["forest_repository"].as<string>();
+ assert(DirectoryExists(forest_repository));
+ if (conf.count("optimize_feature") > 0)
+ features=conf["optimize_feature"].as<vector<string> >();
+ vector<SparseVector<weight_t> > directions;
+ vector<int> fids(features.size());
+ for (int i = 0; i < features.size(); ++i)
+ fids[i] = FD::Convert(features[i]);
+ LineOptimizer::CreateOptimizationDirections(
+ fids,
+ conf["random_directions"].as<unsigned int>(),
+ &rng,
+ &directions);
+ unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
+ for (unsigned i = 0; i < dev_set_size; ++i) {
+ for (unsigned j = 0; j < directions.size(); ++j) {
+ cout << forest_repository << '/' << i << ".json.gz " << i << ' ';
+ print(cout, origin, "=", ";");
+ cout << ' ';
+ print(cout, directions[j], "=", ";");
+ cout << endl;
+ }
+ }
+ return 0;
+}
diff --git a/dpmert/mr_dpmert_map.cc b/dpmert/mr_dpmert_map.cc
new file mode 100644
index 00000000..f3304f0f
--- /dev/null
+++ b/dpmert/mr_dpmert_map.cc
@@ -0,0 +1,112 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "ces.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "sparse_vector.h"
+#include "mert_geometry.h"
+#include "inside_outside.h"
+#include "error_surface.h"
+#include "b64tools.h"
+#include "hg_io.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+ ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
+#if 0
+ // this should work, but untested.
+ std::istringstream i(s);
+ i>>*v;
+#else
+ vector<string> fields;
+ Tokenize(s, ';', &fields);
+ if (fields.empty()) return false;
+ for (int i = 0; i < fields.size(); ++i) {
+ vector<string> pair(2);
+ Tokenize(fields[i], '=', &pair);
+ if (pair.size() != 2) {
+ cerr << "Error parsing vector string: " << fields[i] << endl;
+ return false;
+ }
+ v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str()));
+ }
+ return true;
+#endif
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ istream &in=*in_read.stream();
+ while(in) {
+ string line;
+ getline(in, line);
+ if (line.empty()) continue;
+ istringstream is(line);
+ int sent_id;
+ string file, s_origin, s_direction;
+ // path-to-file (JSON) sent_ed starting-point search-direction
+ is >> file >> sent_id >> s_origin >> s_direction;
+ SparseVector<double> origin;
+ ReadSparseVectorString(s_origin, &origin);
+ SparseVector<double> direction;
+ ReadSparseVectorString(s_direction, &direction);
+ // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl;
+ if (last_file != file) {
+ last_file = file;
+ ReadFile rf(file);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ }
+ const ConvexHullWeightFunction wf(origin, direction);
+ const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+
+ ErrorSurface es;
+ ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg);
+ //cerr << "Viterbi envelope has " << ve.size() << " segments\n";
+ // cerr << "Error surface has " << es.size() << " segments\n";
+ string val;
+ es.Serialize(&val);
+ cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t';
+ B64::b64encode(val.c_str(), val.size(), &cout);
+ cout << endl << flush;
+ }
+ return 0;
+}
diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc
new file mode 100644
index 00000000..31512a03
--- /dev/null
+++ b/dpmert/mr_dpmert_reduce.cc
@@ -0,0 +1,77 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "b64tools.h"
+#include "stringlib.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = conf->count("evaluation_metric") == 0;
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
+ if (metric->IsErrorMetric())
+ opt_type = LineOptimizer::MINIMIZE_SCORE;
+
+ vector<ErrorSurface> esv;
+ string last_key, line, key, val;
+ while(getline(cin, line)) {
+ size_t ks = line.find("\t");
+ assert(string::npos != ks);
+ assert(ks > 2);
+ key = line.substr(2, ks - 2);
+ val = line.substr(ks + 1);
+ if (key != last_key) {
+ if (!last_key.empty()) {
+ float score;
+ double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ last_key.swap(key);
+ esv.clear();
+ }
+ if (val.size() % 4 != 0) {
+ cerr << "B64 encoding error 1! Skipping.\n";
+ continue;
+ }
+ string encoded(val.size() / 4 * 3, '\0');
+ if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+ cerr << "B64 encoding error 2! Skipping.\n";
+ continue;
+ }
+ esv.push_back(ErrorSurface());
+ esv.back().Deserialize(encoded);
+ }
+ if (!esv.empty()) {
+ float score;
+ double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ return 0;
+}
diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl
new file mode 100755
index 00000000..7d0365cc
--- /dev/null
+++ b/dpmert/parallelize.pl
@@ -0,0 +1,423 @@
+#!/usr/bin/env perl
+
+# Author: Adam Lopez
+#
+# This script takes a command that processes input
+# from stdin one-line-at-time, and parallelizes it
+# on the cluster using David Chiang's sentserver/
+# sentclient architecture.
+#
+# Prerequisites: the command *must* read each line
+# without waiting for subsequent lines of input
+# (for instance, a command which must read all lines
+# of input before processing will not work) and
+# return it to the output *without* buffering
+# multiple lines.
+
+#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder
+
+#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s
+
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+use LocalConfig;
+
+use Cwd qw/ abs_path cwd getcwd /;
+use File::Temp qw/ tempfile /;
+use Getopt::Long;
+use IPC::Open2;
+use strict;
+use POSIX ":sys_wait_h";
+
+use File::Basename;
+my $myDir = dirname(__FILE__);
+print STDERR __FILE__." -> $myDir\n";
+push(@INC, $myDir);
+require "libcall.pl";
+
+my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines
+my $recycle_clients; # spawn new clients when previous ones terminate
+my $stay_alive; # dont let server die when having zero clients
+my $joblist = "";
+my $errordir="";
+my $multiline;
+my @files_to_stage;
+my $numnodes = 8;
+my $user = $ENV{"USER"};
+my $pmem = "9g";
+my $basep=50300;
+my $randp=300;
+my $tryp=50;
+my $no_which;
+my $no_cd;
+
+my $DEBUG=$ENV{DEBUG};
+print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG;
+my $verbose = 1;
+sub verbose {
+ if ($verbose) {
+ print STDERR @_,"\n";
+ }
+}
+sub debug {
+ if ($DEBUG) {
+ my ($package, $filename, $line) = caller;
+ print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n";
+ }
+}
+my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].;
+my $shell_escape_in_quote=qr.[\\"\$`!].;
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ return '""' unless $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+sub preview_files {
+ my ($l,$skipempty,$footer,$n)=@_;
+ $n=$tailn unless defined $n;
+ my @f=grep { ! ($skipempty && -z $_) } @$l;
+ my $fn=join(' ',map {escape_shell($_)} @f);
+ my $cmd="tail -n $n $fn";
+ unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
+}
+sub prefix_dirname($) {
+ #like `dirname but if ends in / then return the whole thing
+ local ($_)=@_;
+ if (/\/$/) {
+ $_;
+ } else {
+ s#/[^/]$##;
+ $_ ? $_ : '';
+ }
+}
+sub ensure_final_slash($) {
+ local ($_)=@_;
+ m#/$# ? $_ : ($_."/");
+}
+sub extend_path($$;$$) {
+ my ($base,$ext,$mkdir,$baseisdir)=@_;
+ if (-d $base) {
+ $base.="/";
+ } else {
+ my $dir;
+ if ($baseisdir) {
+ $dir=$base;
+ $base.='/' unless $base =~ /\/$/;
+ } else {
+ $dir=prefix_dirname($base);
+ }
+ my @cmd=("/bin/mkdir","-p",$dir);
+ check_call(@cmd) if $mkdir;
+ }
+ return $base.$ext;
+}
+
+my $abscwd=abs_path(&getcwd);
+sub print_help;
+
+my $use_fork;
+my @pids;
+
+# Process command-line options
+unless (GetOptions(
+ "stay-alive" => \$stay_alive,
+ "recycle-clients" => \$recycle_clients,
+ "error-dir=s" => \$errordir,
+ "multi-line" => \$multiline,
+ "file=s" => \@files_to_stage,
+ "use-fork" => \$use_fork,
+ "verbose" => \$verbose,
+ "jobs=i" => \$numnodes,
+ "pmem=s" => \$pmem,
+ "baseport=i" => \$basep,
+# "iport=i" => \$randp, #for short name -i
+ "no-which!" => \$no_which,
+ "no-cd!" => \$no_cd,
+ "tailn=s" => \$tailn,
+) && scalar @ARGV){
+ print_help();
+ die "bad options.";
+}
+
+my $cmd = "";
+my $prog=shift;
+if ($no_which) {
+ $cmd=$prog;
+} else {
+ $cmd=check_output("which $prog");
+ chomp $cmd;
+ die "$prog not found - $cmd" unless $cmd;
+}
+#$cmd=abs_path($cmd);
+for my $arg (@ARGV) {
+ $cmd .= " ".escape_shell($arg);
+}
+die "Please specify a command to parallelize\n" if $cmd eq '';
+
+my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n");
+
+my $executable = $cmd;
+$executable =~ s/^\s*(\S+)($|\s.*)/$1/;
+$executable=check_output("basename $executable");
+chomp $executable;
+
+
+print STDERR "Parallelizing ($numnodes ways): $cmd\n\n";
+
+# create -e dir and save .sh
+use File::Temp qw/tempdir/;
+unless ($errordir) {
+ $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1);
+}
+if ($errordir) {
+ my $scriptfile=extend_path("$errordir/","$executable.sh",1,1);
+ -d $errordir || die "should have created -e dir $errordir";
+ open SF,">",$scriptfile || die;
+ print SF "$cdcmd$cmd\n";
+ close SF;
+ chmod 0755,$scriptfile;
+ $errordir=abs_path($errordir);
+ &verbose("-e dir: $errordir");
+}
+
+# set cleanup handler
+my @cleanup_cmds;
+sub cleanup;
+sub cleanup_and_die;
+$SIG{INT} = "cleanup_and_die";
+$SIG{TERM} = "cleanup_and_die";
+$SIG{HUP} = "cleanup_and_die";
+
+# other subs:
+sub numof_live_jobs;
+sub launch_job_on_node;
+
+
+# vars
+my $mydir = check_output("dirname $0"); chomp $mydir;
+my $sentserver = "$mydir/sentserver";
+my $sentclient = "$mydir/sentclient";
+my $host = check_output("hostname");
+chomp $host;
+
+
+# find open port
+srand;
+my $port = 50300+int(rand($randp));
+my $endp=$port+$tryp;
+sub listening_port_lines {
+ my $quiet=$verbose?'':'2>/dev/null';
+ return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp");
+}
+my $netstat=&listening_port_lines;
+
+if ($verbose){ print STDERR "Testing port $port...";}
+
+while ($netstat=~/$port/ || &listening_port_lines=~/$port/){
+ if ($verbose){ print STDERR "port is busy\n";}
+ $port++;
+ if ($port > $endp){
+ die "Unable to find open port\n";
+ }
+ if ($verbose){ print STDERR "Testing port $port... "; }
+}
+if ($verbose){
+ print STDERR "port $port is available\n";
+}
+
+my $key = int(rand()*1000000);
+
+my $multiflag = "";
+if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; }
+my $stay_alive_flag = "";
+if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; }
+
+my $node_count = 0;
+my $script = "";
+# fork == one thread runs the sentserver, while the
+# other spawns the sentclient commands.
+my $pid = fork;
+if ($pid == 0) { # child
+ sleep 8; # give other thread time to start sentserver
+ $script = "$cdcmd$sentclient $host:$port:$key $cmd";
+
+ if ($verbose){
+ print STDERR "Client script:\n====\n";
+ print STDERR $script;
+ print STDERR "====\n";
+ }
+ for (my $jobn=0; $jobn<$numnodes; $jobn++){
+ launch_job();
+ }
+ if ($recycle_clients) {
+ my $ret;
+ my $livejobs;
+ while (1) {
+ $ret = waitpid($pid, WNOHANG);
+ #print STDERR "waitpid $pid ret = $ret \n";
+ last if ($ret != 0);
+ $livejobs = numof_live_jobs();
+ if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j
+ print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n";
+ launch_job();
+ } else {
+ sleep 15;
+ }
+ }
+ }
+ print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n";
+ for my $p (@pids) {
+ waitpid($p, 0);
+ }
+} else {
+# my $todo = "$sentserver -k $key $multiflag $port ";
+ my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag ";
+ if ($verbose){ print STDERR "Running: $todo\n"; }
+ check_call($todo);
+ print STDERR "Call to $sentserver returned.\n";
+ cleanup();
+ exit(0);
+}
+
+sub numof_live_jobs {
+ if ($use_fork) {
+ die "not implemented";
+ } else {
+ # We can probably continue decoding if the qstat error is only temporary
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
+ return ($#livejobs + 1);
+ }
+}
+my (@errors,@outs,@cmds);
+
+sub launch_job {
+ if ($use_fork) { return launch_job_fork(); }
+ my $errorfile = "/dev/null";
+ my $outfile = "/dev/null";
+ $node_count++;
+ my $clientname = $executable;
+ $clientname =~ s/^(.{4}).*$/$1/;
+ $clientname = "$clientname.$node_count";
+ if ($errordir){
+ $errorfile = "$errordir/$clientname.ER";
+ $outfile = "$errordir/$clientname.OU";
+ push @errors,$errorfile;
+ push @outs,$outfile;
+ }
+ my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile";
+ push @cmds,$todo;
+
+ print STDERR "Running: $todo\n";
+ local(*QOUT, *QIN);
+ open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!";
+ print QIN $script;
+ close QIN;
+ while (my $jobid=<QOUT>){
+ chomp $jobid;
+ if ($verbose){ print STDERR "Launched client job: $jobid"; }
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ print STDERR " short job id $jobid\n";
+ if ($verbose){
+ print STDERR "cd: $abscwd\n";
+ print STDERR "cmd: $cmd\n";
+ }
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ my $cleanfn="qdel $jobid 2> /dev/null";
+ push(@cleanup_cmds, $cleanfn);
+ }
+ close QOUT;
+}
+
+sub launch_job_fork {
+ my $errorfile = "/dev/null";
+ my $outfile = "/dev/null";
+ $node_count++;
+ my $clientname = $executable;
+ $clientname =~ s/^(.{4}).*$/$1/;
+ $clientname = "$clientname.$node_count";
+ if ($errordir){
+ $errorfile = "$errordir/$clientname.ER";
+ $outfile = "$errordir/$clientname.OU";
+ push @errors,$errorfile;
+ push @outs,$outfile;
+ }
+ my $pid = fork;
+ if ($pid == 0) {
+ my ($fh, $scr_name) = get_temp_script();
+ print $fh $script;
+ close $fh;
+ my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
+ print STDERR "EXEC: $todo\n";
+ my $out = check_output("$todo");
+ unlink $scr_name or warn "Failed to remove $scr_name";
+ exit 0;
+ } else {
+ push @pids, $pid;
+ }
+}
+
+sub get_temp_script {
+ my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh');
+ return ($fh, $filename);
+}
+
+sub cleanup_and_die {
+ cleanup();
+ die "\n";
+}
+
+sub cleanup {
+ print STDERR "Cleaning up...\n";
+ for $cmd (@cleanup_cmds){
+ print STDERR " Cleanup command: $cmd\n";
+ eval $cmd;
+ }
+ print STDERR "outputs:\n",preview_files(\@outs,1),"\n";
+ print STDERR "errors:\n",preview_files(\@errors,1),"\n";
+ print STDERR "cmd:\n",$cmd,"\n";
+ print STDERR " cat $errordir/*.ER\nfor logs.\n";
+ print STDERR "Cleanup finished.\n";
+}
+
+sub print_help
+{
+ my $name = check_output("basename $0"); chomp $name;
+ print << "Help";
+
+usage: $name [options]
+
+ Automatic black-box parallelization of commands.
+
+options:
+
+ --use-fork
+ Instead of using qsub, use fork.
+
+ -e, --error-dir <dir>
+ Retain output files from jobs in <dir>, rather
+ than silently deleting them.
+
+ -m, --multi-line
+ Expect that command may produce multiple output
+ lines for a single input line. $name makes a
+ reasonable attempt to obtain all output before
+ processing additional inputs. However, use of this
+ option is inherently unsafe.
+
+ -v, --verbose
+ Print diagnostic informatoin on stderr.
+
+ -j, --jobs
+ Number of jobs to use.
+
+ -p, --pmem
+ pmem setting for each job.
+
+Help
+}
diff --git a/dpmert/sentclient.c b/dpmert/sentclient.c
new file mode 100644
index 00000000..91d994ab
--- /dev/null
+++ b/dpmert/sentclient.c
@@ -0,0 +1,76 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "sentserver.h"
+
+int main (int argc, char *argv[]) {
+ int sock, port;
+ char *s, *key;
+ struct hostent *hp;
+ struct sockaddr_in server;
+ int errors = 0;
+
+ if (argc < 3) {
+ fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n");
+ exit(1);
+ }
+
+ s = strchr(argv[1], ':');
+ key = NULL;
+
+ if (s == NULL) {
+ port = DEFAULT_PORT;
+ } else {
+ *s = '\0';
+ s+=1;
+ /* dumb hack */
+ key = strchr(s, ':');
+ if (key != NULL){
+ *key = '\0';
+ key += 1;
+ }
+ port = atoi(s);
+ }
+
+ sock = socket(AF_INET, SOCK_STREAM, 0);
+
+ hp = gethostbyname(argv[1]);
+ if (hp == NULL) {
+ fprintf(stderr, "unknown host %s\n", argv[1]);
+ exit(1);
+ }
+
+ bzero((char *)&server, sizeof(server));
+ bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+ server.sin_family = hp->h_addrtype;
+ server.sin_port = htons(port);
+
+ while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
+ perror("connect()");
+ sleep(1);
+ errors++;
+ if (errors > 5)
+ exit(1);
+ }
+
+ close(0);
+ close(1);
+ dup2(sock, 0);
+ dup2(sock, 1);
+
+ if (key != NULL){
+ write(1, key, strlen(key));
+ write(1, "\n", 1);
+ }
+
+ execvp(argv[2], argv+2);
+ return 0;
+}
diff --git a/dpmert/sentserver.c b/dpmert/sentserver.c
new file mode 100644
index 00000000..c20b4fa6
--- /dev/null
+++ b/dpmert/sentserver.c
@@ -0,0 +1,515 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
+
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "sentserver.h"
+
+#define MAX_CLIENTS 64
+
+struct clientinfo {
+ int s;
+ struct sockaddr_in sin;
+};
+
+struct line {
+ int id;
+ char *s;
+ int status;
+ struct line *next;
+} *head, **ptail;
+
+int n_sent = 0, n_received=0, n_flushed=0;
+
+#define STATUS_RUNNING 0
+#define STATUS_ABORTED 1
+#define STATUS_FINISHED 2
+
+pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int n_clients = 0;
+int s;
+int expect_multiline_output = 0;
+int log_mutex = 0;
+int stay_alive = 0; /* dont panic and die with zero clients */
+
+void queue_finish(struct line *node, char *s, int fid);
+char * read_line(int fd, int multiline);
+void done (int code);
+
+struct line * queue_get(int fid) {
+ struct line *cur;
+ char *s, *synch;
+
+ if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid);
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+
+ /* First, check for aborted sentences. */
+
+ if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid);
+ for (cur = head; cur != NULL; cur = cur->next) {
+ if (cur->status == STATUS_ABORTED) {
+ cur->status = STATUS_RUNNING;
+
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+ return cur;
+ }
+ }
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+ /* Otherwise, read a new one. */
+ if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+ if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
+ pthread_mutex_lock(&input_mutex);
+ s = read_line(0,0);
+
+ while (s) {
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+ if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+ pthread_mutex_unlock(&input_mutex);
+
+ cur = malloc(sizeof (struct line));
+ cur->id = n_sent;
+ cur->s = s;
+ cur->next = NULL;
+
+ *ptail = cur;
+ ptail = &cur->next;
+
+ n_sent++;
+
+ if (strcmp(s,"===SYNCH===\n")==0){
+ fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid);
+ // Note: queue_finish calls free(cur->s).
+ // Therefore we need to create a new string here.
+ synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char));
+ synch = strcpy(synch, s);
+
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+ queue_finish(cur, synch, fid); /* handles its own lock */
+
+ if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+ if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
+ pthread_mutex_lock(&input_mutex);
+
+ s = read_line(0,0);
+ } else {
+ if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid);
+ cur->status = STATUS_RUNNING;
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+ return cur;
+ }
+ }
+
+ if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+ pthread_mutex_unlock(&input_mutex);
+ /* Only way to reach this point: no more output */
+
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+ if (head == NULL) {
+ fprintf(stderr, "Reached end of file. Exiting.\n");
+ done(0);
+ } else
+ ptail = NULL; /* This serves as a signal that there is no more input */
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+ return NULL;
+}
+
+void queue_panic() {
+ struct line *next;
+ while (head && head->status == STATUS_FINISHED) {
+ /* Write out finished sentences */
+ if (head->status == STATUS_FINISHED) {
+ fputs(head->s, stdout);
+ fflush(stdout);
+ }
+ /* Write out blank line for unfinished sentences */
+ if (head->status == STATUS_ABORTED) {
+ fputs("\n", stdout);
+ fflush(stdout);
+ }
+ /* By defition, there cannot be any RUNNING sentences, since
+ function is only called when n_clients == 0 */
+ free(head->s);
+ next = head->next;
+ free(head);
+ head = next;
+ n_flushed++;
+ }
+ fclose(stdout);
+ fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n");
+ done(1);
+}
+
+void queue_abort(struct line *node, int fid) {
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+ node->status = STATUS_ABORTED;
+ if (n_clients == 0) {
+ if (stay_alive) {
+ fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n");
+ } else {
+ queue_panic();
+ }
+ }
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+}
+
+
+void queue_print() {
+ struct line *cur;
+
+ fprintf(stderr, " Queue\n");
+
+ for (cur = head; cur != NULL; cur = cur->next) {
+ switch(cur->status) {
+ case STATUS_RUNNING:
+ fprintf(stderr, " %d running ", cur->id); break;
+ case STATUS_ABORTED:
+ fprintf(stderr, " %d aborted ", cur->id); break;
+ case STATUS_FINISHED:
+ fprintf(stderr, " %d finished ", cur->id); break;
+
+ }
+ fprintf(stderr, "\n");
+ //fprintf(stderr, cur->s);
+ }
+}
+
+void queue_finish(struct line *node, char *s, int fid) {
+ struct line *next;
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+
+ free(node->s);
+ node->s = s;
+ node->status = STATUS_FINISHED;
+ n_received++;
+
+ /* Flush out finished nodes */
+ while (head && head->status == STATUS_FINISHED) {
+
+ if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id);
+
+ fputs(head->s, stdout);
+ fflush(stdout);
+ if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id);
+ free(head->s);
+
+ next = head->next;
+ free(head);
+
+ head = next;
+
+ n_flushed++;
+
+ if (head == NULL) { /* empty queue */
+ if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */
+ fprintf(stderr, "All sentences finished. Exiting.\n");
+ done(0);
+ } else /* ptail pointed at something which was just popped off the stack -- reset to head*/
+ ptail = &head;
+ }
+ }
+
+ if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id);
+ fflush(stdout);
+ fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed);
+
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+}
+
+char * read_line(int fd, int multiline) {
+ int size = 80;
+ char errorbuf[100];
+ char *s = malloc(size+2);
+ int result, errors=0;
+ int i = 0;
+
+ result = read(fd, s+i, 1);
+
+ while (1) {
+ if (result < 0) {
+ perror("read()");
+ sprintf(errorbuf, "Error code: %d\n", errno);
+ fprintf(stderr, errorbuf);
+ errors++;
+ if (errors > 5) {
+ free(s);
+ return NULL;
+ } else {
+ sleep(1); /* retry after delay */
+ }
+ } else if (result == 0) {
+ break;
+ } else if (multiline==0 && s[i] == '\n') {
+ break;
+ } else {
+ if (s[i] == '\n'){
+ /* if we've reached this point,
+ then multiline must be 1, and we're
+ going to poll the fd for an additional
+ line of data. The basic design is to
+ run a select on the filedescriptor fd.
+ Select will return under two conditions:
+ if there is data on the fd, or if a
+ timeout is reached. We'll select on this
+ fd. If select returns because there's data
+ ready, keep going; else assume there's no
+ more and return the data we already have.
+ */
+
+ fd_set set;
+ FD_ZERO(&set);
+ FD_SET(fd, &set);
+
+ struct timeval timeout;
+ timeout.tv_sec = 3; // number of seconds for timeout
+ timeout.tv_usec = 0;
+
+ int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+ if (ready<1){
+ break; // no more data, stop looping
+ }
+ }
+ i++;
+
+ if (i == size) {
+ size = size*2;
+ s = realloc(s, size+2);
+ }
+ }
+
+ result = read(fd, s+i, 1);
+ }
+
+ if (result == 0 && i == 0) { /* end of file */
+ free(s);
+ return NULL;
+ }
+
+ s[i] = '\n';
+ s[i+1] = '\0';
+
+ return s;
+}
+
+void * new_client(void *arg) {
+ struct clientinfo *client = (struct clientinfo *)arg;
+ struct line *cur;
+ int result;
+ char *s;
+ char errorbuf[100];
+
+ pthread_mutex_lock(&clients_mutex);
+ n_clients++;
+ pthread_mutex_unlock(&clients_mutex);
+
+ fprintf(stderr, "Client connected (%d connected)\n", n_clients);
+
+ for (;;) {
+
+ cur = queue_get(client->s);
+
+ if (cur) {
+ /* fprintf(stderr, "Sending to client: %s", cur->s); */
+ fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s);
+ result = write(client->s, cur->s, strlen(cur->s));
+ if (result < strlen(cur->s)){
+ perror("write()");
+ sprintf(errorbuf, "Error code: %d\n", errno);
+ fprintf(stderr, errorbuf);
+
+ pthread_mutex_lock(&clients_mutex);
+ n_clients--;
+ pthread_mutex_unlock(&clients_mutex);
+
+ fprintf(stderr, "Client died (%d connected)\n", n_clients);
+ queue_abort(cur, client->s);
+
+ close(client->s);
+ free(client);
+
+ pthread_exit(NULL);
+ }
+ } else {
+ close(client->s);
+ pthread_mutex_lock(&clients_mutex);
+ n_clients--;
+ pthread_mutex_unlock(&clients_mutex);
+ fprintf(stderr, "Client dismissed (%d connected)\n", n_clients);
+ pthread_exit(NULL);
+ }
+
+ s = read_line(client->s,expect_multiline_output);
+ if (s) {
+ /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */
+ fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id);
+// queue_print();
+ queue_finish(cur, s, client->s);
+ } else {
+ pthread_mutex_lock(&clients_mutex);
+ n_clients--;
+ pthread_mutex_unlock(&clients_mutex);
+
+ fprintf(stderr, "Client died (%d connected)\n", n_clients);
+ queue_abort(cur, client->s);
+
+ close(client->s);
+ free(client);
+
+ pthread_exit(NULL);
+ }
+
+ }
+ return 0;
+}
+
+void done (int code) {
+ close(s);
+ exit(code);
+}
+
+
+
+int main (int argc, char *argv[]) {
+ struct sockaddr_in sin, from;
+ int g;
+ socklen_t len;
+ struct clientinfo *client;
+ int port;
+ int opt;
+ int errors = 0;
+ int argi;
+ char *key = NULL, *client_key;
+ int use_key = 0;
+ /* the key stuff here doesn't provide any
+ real measure of security, it's mainly to keep
+ jobs from bumping into each other. */
+
+ pthread_t tid;
+ port = DEFAULT_PORT;
+
+ for (argi=1; argi < argc; argi++){
+ if (strcmp(argv[argi], "-m")==0){
+ expect_multiline_output = 1;
+ } else if (strcmp(argv[argi], "-k")==0){
+ argi++;
+ if (argi == argc){
+ fprintf(stderr, "Key must be specified after -k\n");
+ exit(1);
+ }
+ key = argv[argi];
+ use_key = 1;
+ } else if (strcmp(argv[argi], "--stay-alive")==0){
+ stay_alive = 1; /* dont panic and die with zero clients */
+ } else {
+ port = atoi(argv[argi]);
+ }
+ }
+
+ /* Initialize data structures */
+ head = NULL;
+ ptail = &head;
+
+ /* Set up listener */
+ s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+ opt = 1;
+ setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(port);
+ while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
+ perror("bind()");
+ sleep(1);
+ errors++;
+ if (errors > 100)
+ exit(1);
+ }
+
+ len = sizeof(sin);
+ getsockname(s, (struct sockaddr *) &sin, &len);
+
+ fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port));
+
+ while (listen(s, MAX_CLIENTS) < 0) {
+ perror("listen()");
+ sleep(1);
+ errors++;
+ if (errors > 100)
+ exit(1);
+ }
+
+ for (;;) {
+ len = sizeof(from);
+ g = accept(s, (struct sockaddr *)&from, &len);
+ if (g < 0) {
+ perror("accept()");
+ sleep(1);
+ continue;
+ }
+ client = malloc(sizeof(struct clientinfo));
+ client->s = g;
+ bcopy(&from, &client->sin, len);
+
+ if (use_key){
+ fd_set set;
+ FD_ZERO(&set);
+ FD_SET(client->s, &set);
+
+ struct timeval timeout;
+ timeout.tv_sec = 3; // number of seconds for timeout
+ timeout.tv_usec = 0;
+
+ int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+ if (ready<1){
+ fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+ close(client->s);
+ free(client);
+ } else {
+ client_key = read_line(client->s,0);
+ client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */
+ if (strcmp(key, client_key)==0){
+ pthread_create(&tid, NULL, new_client, client);
+ } else {
+ fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+ close(client->s);
+ free(client);
+ }
+ free(client_key);
+ }
+ } else {
+ pthread_create(&tid, NULL, new_client, client);
+ }
+ }
+
+}
+
+
+
diff --git a/dpmert/sentserver.h b/dpmert/sentserver.h
new file mode 100644
index 00000000..cd17a546
--- /dev/null
+++ b/dpmert/sentserver.h
@@ -0,0 +1,6 @@
+#ifndef SENTSERVER_H
+#define SENTSERVER_H
+
+#define DEFAULT_PORT 50000
+
+#endif
diff --git a/dpmert/test_aer/README b/dpmert/test_aer/README
new file mode 100644
index 00000000..819b2e32
--- /dev/null
+++ b/dpmert/test_aer/README
@@ -0,0 +1,8 @@
+To run the test:
+
+../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights
+
+This will optimize the parameters of the tiny lexical translation model
+so as to minimize the AER of the Viterbi alignment on the development
+set in corpus.src according to the reference alignments in ref.0.
+
diff --git a/dpmert/test_aer/cdec.ini b/dpmert/test_aer/cdec.ini
new file mode 100644
index 00000000..08187848
--- /dev/null
+++ b/dpmert/test_aer/cdec.ini
@@ -0,0 +1,3 @@
+formalism=lextrans
+grammar=grammar
+aligner=true
diff --git a/dpmert/test_aer/corpus.src b/dpmert/test_aer/corpus.src
new file mode 100644
index 00000000..31b23971
--- /dev/null
+++ b/dpmert/test_aer/corpus.src
@@ -0,0 +1,3 @@
+el gato negro ||| the black cat
+el gato ||| the cat
+el libro ||| the book
diff --git a/dpmert/test_aer/grammar b/dpmert/test_aer/grammar
new file mode 100644
index 00000000..9d857824
--- /dev/null
+++ b/dpmert/test_aer/grammar
@@ -0,0 +1,12 @@
+el ||| cat ||| F1=1
+el ||| the ||| F2=1
+el ||| black ||| F3=1
+el ||| book ||| F11=1
+gato ||| cat ||| F4=1 NN=1
+gato ||| black ||| F5=1
+gato ||| the ||| F6=1
+negro ||| the ||| F7=1
+negro ||| cat ||| F8=1
+negro ||| black ||| F9=1
+libro ||| the ||| F10=1
+libro ||| book ||| F12=1 NN=1
diff --git a/dpmert/test_aer/ref.0 b/dpmert/test_aer/ref.0
new file mode 100644
index 00000000..734a9c5b
--- /dev/null
+++ b/dpmert/test_aer/ref.0
@@ -0,0 +1,3 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 1-1
diff --git a/dpmert/test_aer/weights b/dpmert/test_aer/weights
new file mode 100644
index 00000000..afc9282e
--- /dev/null
+++ b/dpmert/test_aer/weights
@@ -0,0 +1,13 @@
+F1 0.1
+F2 -.5980815
+F3 0.24235
+F4 0.625
+F5 0.4514
+F6 0.112316
+F7 -0.123415
+F8 -0.25390285
+F9 -0.23852
+F10 0.646
+F11 0.413141
+F12 0.343216
+NN -0.1215
diff --git a/dpmert/test_data/0.json.gz b/dpmert/test_data/0.json.gz
new file mode 100644
index 00000000..30f8dd77
--- /dev/null
+++ b/dpmert/test_data/0.json.gz
Binary files differ
diff --git a/dpmert/test_data/1.json.gz b/dpmert/test_data/1.json.gz
new file mode 100644
index 00000000..c82cc179
--- /dev/null
+++ b/dpmert/test_data/1.json.gz
Binary files differ
diff --git a/dpmert/test_data/c2e.txt.0 b/dpmert/test_data/c2e.txt.0
new file mode 100644
index 00000000..12c4abe9
--- /dev/null
+++ b/dpmert/test_data/c2e.txt.0
@@ -0,0 +1,2 @@
+australia reopens embassy in manila
+( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .
diff --git a/dpmert/test_data/c2e.txt.1 b/dpmert/test_data/c2e.txt.1
new file mode 100644
index 00000000..4ac12df1
--- /dev/null
+++ b/dpmert/test_data/c2e.txt.1
@@ -0,0 +1,2 @@
+australia reopened manila embassy
+( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .
diff --git a/dpmert/test_data/c2e.txt.2 b/dpmert/test_data/c2e.txt.2
new file mode 100644
index 00000000..2f67b72f
--- /dev/null
+++ b/dpmert/test_data/c2e.txt.2
@@ -0,0 +1,2 @@
+australia to reopen embassy in manila
+( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats .
diff --git a/dpmert/test_data/c2e.txt.3 b/dpmert/test_data/c2e.txt.3
new file mode 100644
index 00000000..5483cef6
--- /dev/null
+++ b/dpmert/test_data/c2e.txt.3
@@ -0,0 +1,2 @@
+australia to re - open its embassy to manila
+( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago .
diff --git a/dpmert/test_data/re.txt.0 b/dpmert/test_data/re.txt.0
new file mode 100644
index 00000000..86eff087
--- /dev/null
+++ b/dpmert/test_data/re.txt.0
@@ -0,0 +1,5 @@
+erdogan states turkey to reject any pressures to urge it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened .
+erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus .
+we will discuss this dossier in the course of membership negotiations . "
+he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . "
diff --git a/dpmert/test_data/re.txt.1 b/dpmert/test_data/re.txt.1
new file mode 100644
index 00000000..2140f198
--- /dev/null
+++ b/dpmert/test_data/re.txt.1
@@ -0,0 +1,5 @@
+erdogan confirms turkey will resist any pressure to recognize cyprus
+ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara .
+erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus .
+we shall discuss this issue in the course of the membership negotiations . "
+he added : " let me be clear - i cannot confine turkey . this is something we do not accept . "
diff --git a/dpmert/test_data/re.txt.2 b/dpmert/test_data/re.txt.2
new file mode 100644
index 00000000..94e46286
--- /dev/null
+++ b/dpmert/test_data/re.txt.2
@@ -0,0 +1,5 @@
+erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus
+ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara .
+erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus .
+we shall discuss this dossier during the negotiations on joining . "
+and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . "
diff --git a/dpmert/test_data/re.txt.3 b/dpmert/test_data/re.txt.3
new file mode 100644
index 00000000..f87c3308
--- /dev/null
+++ b/dpmert/test_data/re.txt.3
@@ -0,0 +1,5 @@
+erdogan stresses that turkey will reject all pressures to force it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not .
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus .
+we will discuss this file during the negotiations on joining . "
+he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . "