summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am2
-rw-r--r--configure.ac2
-rw-r--r--decoder/ff_ngrams.cc23
-rw-r--r--decoder/ff_spans.cc39
-rw-r--r--mteval/scorer.cc12
-rw-r--r--pro-train/Makefile.am13
-rw-r--r--pro-train/README.shared-mem9
-rwxr-xr-xpro-train/dist-pro.pl657
-rwxr-xr-xpro-train/mr_pro_generate_mapper_input.pl18
-rw-r--r--pro-train/mr_pro_map.cc351
-rw-r--r--pro-train/mr_pro_reduce.cc277
-rw-r--r--utils/fast_sparse_vector.h19
-rw-r--r--utils/filelib.cc12
-rw-r--r--utils/filelib.h1
14 files changed, 1405 insertions, 30 deletions
diff --git a/Makefile.am b/Makefile.am
index f5397d0b..98b4bac7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,7 @@
# warning - the subdirectories in the following list should
# be kept in topologically sorted order. Also, DO NOT introduce
# cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest extools
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest pro-train extools
#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
diff --git a/configure.ac b/configure.ac
index db2b4afc..4e708073 100644
--- a/configure.ac
+++ b/configure.ac
@@ -92,4 +92,4 @@ then
AM_CONDITIONAL([GLC], true)
fi
-AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
+AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index d52667cd..04dd1906 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -46,6 +46,17 @@ struct State {
};
}
+namespace {
+ string Escape(const string& x) {
+ string y = x;
+ for (int i = 0; i < y.size(); ++i) {
+ if (y[i] == '=') y[i]='_';
+ if (y[i] == ';') y[i]='_';
+ }
+ return y;
+ }
+}
+
class NgramDetectorImpl {
// returns the number of unscored words at the left edge of a span
@@ -114,11 +125,17 @@ class NgramDetectorImpl {
int& fid = ft->fids[curword];
++n;
if (!fid) {
- const char* code="_UBT456789";
+ const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.)
ostringstream os;
os << code[n] << ':';
- for (int i = n-1; i >= 0; --i)
- os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]);
+ for (int i = n-1; i >= 0; --i) {
+ os << (i != n-1 ? "_" : "");
+ const string& tok = TD::Convert(buf[i]);
+ if (tok.find('=') == string::npos)
+ os << tok;
+ else
+ os << Escape(tok);
+ }
fid = FD::Convert(os.str());
}
feats->set_value(fid, 1);
diff --git a/decoder/ff_spans.cc b/decoder/ff_spans.cc
index e1da088d..bc23974d 100644
--- a/decoder/ff_spans.cc
+++ b/decoder/ff_spans.cc
@@ -13,6 +13,17 @@
using namespace std;
+namespace {
+ string Escape(const string& x) {
+ string y = x;
+ for (int i = 0; i < y.size(); ++i) {
+ if (y[i] == '=') y[i]='_';
+ if (y[i] == ';') y[i]='_';
+ }
+ return y;
+ }
+}
+
// log transform to make long spans cluster together
// but preserve differences
int SpanSizeTransform(unsigned span_size) {
@@ -140,19 +151,19 @@ void SpanFeatures::PrepareForInput(const SentenceMetadata& smeta) {
word = MapIfNecessary(word);
ostringstream sfid;
sfid << "ES:" << TD::Convert(word);
- end_span_ids_[i] = FD::Convert(sfid.str());
+ end_span_ids_[i] = FD::Convert(Escape(sfid.str()));
ostringstream esbiid;
esbiid << "EBI:" << TD::Convert(bword) << "_" << TD::Convert(word);
- end_bigram_ids_[i] = FD::Convert(esbiid.str());
+ end_bigram_ids_[i] = FD::Convert(Escape(esbiid.str()));
ostringstream bsbiid;
bsbiid << "BBI:" << TD::Convert(bword) << "_" << TD::Convert(word);
- beg_bigram_ids_[i] = FD::Convert(bsbiid.str());
+ beg_bigram_ids_[i] = FD::Convert(Escape(bsbiid.str()));
ostringstream bfid;
bfid << "BS:" << TD::Convert(bword);
- beg_span_ids_[i] = FD::Convert(bfid.str());
+ beg_span_ids_[i] = FD::Convert(Escape(bfid.str()));
if (use_collapsed_features_) {
- end_span_vals_[i] = feat2val_[sfid.str()] + feat2val_[esbiid.str()];
- beg_span_vals_[i] = feat2val_[bfid.str()] + feat2val_[bsbiid.str()];
+ end_span_vals_[i] = feat2val_[Escape(sfid.str())] + feat2val_[Escape(esbiid.str())];
+ beg_span_vals_[i] = feat2val_[Escape(bfid.str())] + feat2val_[Escape(bsbiid.str())];
}
}
for (int i = 0; i <= lattice.size(); ++i) {
@@ -167,16 +178,16 @@ void SpanFeatures::PrepareForInput(const SentenceMetadata& smeta) {
word = MapIfNecessary(word);
ostringstream pf;
pf << "S:" << TD::Convert(bword) << "_" << TD::Convert(word);
- span_feats_(i,j).first = FD::Convert(pf.str());
- span_feats_(i,j).second = FD::Convert("S_" + pf.str());
+ span_feats_(i,j).first = FD::Convert(Escape(pf.str()));
+ span_feats_(i,j).second = FD::Convert(Escape("S_" + pf.str()));
ostringstream lf;
const unsigned span_size = (i < j ? j - i : i - j);
lf << "LS:" << SpanSizeTransform(span_size) << "_" << TD::Convert(bword) << "_" << TD::Convert(word);
- len_span_feats_(i,j).first = FD::Convert(lf.str());
- len_span_feats_(i,j).second = FD::Convert("S_" + lf.str());
+ len_span_feats_(i,j).first = FD::Convert(Escape(lf.str()));
+ len_span_feats_(i,j).second = FD::Convert(Escape("S_" + lf.str()));
if (use_collapsed_features_) {
- span_vals_(i,j).first = feat2val_[pf.str()] + feat2val_[lf.str()];
- span_vals_(i,j).second = feat2val_["S_" + pf.str()] + feat2val_["S_" + lf.str()];
+ span_vals_(i,j).first = feat2val_[Escape(pf.str())] + feat2val_[Escape(lf.str())];
+ span_vals_(i,j).second = feat2val_[Escape("S_" + pf.str())] + feat2val_[Escape("S_" + lf.str())];
}
}
}
@@ -209,14 +220,14 @@ void RuleNgramFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
const string& cur = TD::Convert(w);
ostringstream os;
os << "RB:" << prev << '_' << cur;
- const int fid = FD::Convert(os.str());
+ const int fid = FD::Convert(Escape(os.str()));
if (fid <= 0) return;
f.add_value(fid, 1.0);
prev = cur;
}
ostringstream os;
os << "RB:" << prev << '_' << "</r>";
- f.set_value(FD::Convert(os.str()), 1.0);
+ f.set_value(FD::Convert(Escape(os.str())), 1.0);
}
(*features) += it->second;
}
diff --git a/mteval/scorer.cc b/mteval/scorer.cc
index 2daa0daa..a83b9e2f 100644
--- a/mteval/scorer.cc
+++ b/mteval/scorer.cc
@@ -430,6 +430,7 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
float log_bleu = 0;
if (precs) precs->clear();
int count = 0;
+ vector<float> total_precs(N());
for (int i = 0; i < N(); ++i) {
if (hyp_ngram_counts[i] > 0) {
float cor_count = correct_ngram_hit_counts[i];
@@ -440,14 +441,21 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
log_bleu += lprec;
++count;
}
+ total_precs[i] = log_bleu;
}
- log_bleu /= static_cast<float>(count);
+ vector<float> bleus(N());
float lbp = 0.0;
if (hyp_len < ref_len)
lbp = (hyp_len - ref_len) / hyp_len;
log_bleu += lbp;
if (bp) *bp = exp(lbp);
- return exp(log_bleu);
+ float wb = 0;
+ for (int i = 0; i < N(); ++i) {
+ bleus[i] = exp(total_precs[i] / (i+1) + lbp);
+ wb += bleus[i] / pow(2.0, 4.0 - i);
+ }
+ //return wb;
+ return bleus.back();
}
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
new file mode 100644
index 00000000..fdaf43e2
--- /dev/null
+++ b/pro-train/Makefile.am
@@ -0,0 +1,13 @@
+bin_PROGRAMS = \
+ mr_pro_map \
+ mr_pro_reduce
+
+TESTS = lo_test
+
+mr_pro_map_SOURCES = mr_pro_map.cc
+mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_pro_reduce_SOURCES = mr_pro_reduce.cc
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/README.shared-mem b/pro-train/README.shared-mem
new file mode 100644
index 00000000..7728efc0
--- /dev/null
+++ b/pro-train/README.shared-mem
@@ -0,0 +1,9 @@
+If you want to run dist-vest.pl on a very large shared memory machine, do the
+following:
+
+ ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
+
+This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
+decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
+on the system you are running on and the complexity of the models used for decoding.
+
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
new file mode 100755
index 00000000..dbfa329a
--- /dev/null
+++ b/pro-train/dist-pro.pl
@@ -0,0 +1,657 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+
+my $VEST_DIR="$SCRIPT_DIR/../vest";
+require "$VEST_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
+my $MAPPER = "$bin_dir/mr_pro_map";
+my $REDUCER = "$bin_dir/mr_pro_reduce";
+my $parallelize = "$VEST_DIR/parallelize.pl";
+my $libcall = "$VEST_DIR/libcall.pl";
+my $sentserver = "$VEST_DIR/sentserver";
+my $sentclient = "$VEST_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $run_local = 0;
+my $best_weights;
+my $max_iterations = 30;
+my $decode_nodes = 15; # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make; # use make to parallelize
+my $usefork;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 1e-2;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "decode-nodes=i" => \$decode_nodes,
+ "dont-clean" => \$disable_clean,
+ "pass-suffix=s" => \$pass_suffix,
+ "use-fork" => \$usefork,
+ "dry-run" => \$dryrun,
+ "epsilon=s" => \$epsilon,
+ "help" => \$help,
+ "weights=s" => \$initial_weights,
+ "tune-regularizer" => \$tune_regularizer,
+ "reg=f" => \$reg,
+ "local" => \$run_local,
+ "use-make=i" => \$use_make,
+ "max-iterations=i" => \$max_iterations,
+ "pmem=s" => \$pmem,
+ "cpbin!" => \$cpbin,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+ $dir = "protrain";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {
+ local $_;
+ my $bindir=shift;
+ check_call("mkdir -p $bindir");
+ -d $bindir || die "couldn't make bindir $bindir";
+ for (@_) {
+ my $src=$$_;
+ $$_="$bindir/".basename($src);
+ check_call("cp -p $src $$_");
+ }
+}
+sub dirsize {
+ opendir ISEMPTY,$_[0];
+ return scalar(readdir(ISEMPTY))-1;
+}
+my @allweights;
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
+ die "ERROR: working dir $dir already exists\n\n";
+ } else {
+ -e $dir || mkdir $dir;
+ mkdir "$dir/hgs";
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+ mkdir "$dir/scripts";
+ my $cmdfile="$dir/rerun-pro.sh";
+ open CMD,'>',$cmdfile;
+ print CMD "cd ",&getcwd,"\n";
+# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+ my $cline=&cmdline."\n";
+ print CMD $cline;
+ close CMD;
+ print STDERR $cline;
+ chmod(0755,$cmdfile);
+ check_call("cp $initial_weights $dir/weights.0");
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ push @allweights, "-w $dir/weights.$im1";
+ `rm -f $dir/hgs/*.gz`;
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ my $pcmd;
+ if ($run_local) {
+ $pcmd = "cat $srcFile |";
+ } elsif ($use_make) {
+ # TODO: Throw error when decode_nodes is specified along with use_make
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ check_call("mkdir -p $dir/splag.$im1");
+ $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+ my @shards = grep { /^mapinput\./ } readdir(DIR);
+ closedir DIR;
+ die "No shards!" unless scalar @shards > 0;
+ my $joblist = "";
+ my $nmappers = 0;
+ @cleanupcmds = ();
+ my %o2i = ();
+ my $first_shard = 1;
+ my $mkfile; # only used with makefiles
+ my $mkfilename;
+ if ($use_make) {
+ $mkfilename = "$dir/splag.$im1/domap.mk";
+ open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+ print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+ }
+ my @mkouts = (); # only used with makefiles
+ my @mapoutputs = ();
+ for my $shard (@shards) {
+ my $mapoutput = $shard;
+ my $client_name = $shard;
+ $client_name =~ s/mapinput.//;
+ $client_name = "pro.$client_name";
+ $mapoutput =~ s/mapinput/mapoutput/;
+ push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+ $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+ my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+ if ($run_local) {
+ print STDERR "COMMAND:\n$script\n";
+ check_bash_call($script);
+ } elsif ($use_make) {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "#!/bin/bash\n";
+ print F "$script\n";
+ close F;
+ my $output = "$dir/splag.$im1/$mapoutput";
+ push @mkouts, $output;
+ chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+ print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+ } else {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+ $nmappers++;
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $jobid = check_output("$qcmd");
+ chomp $jobid;
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+ print STDERR " $jobid";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ }
+ }
+ my @dev_outs = ();
+ my @devtest_outs = ();
+ if ($tune_regularizer) {
+ for (my $i = 0; $i < scalar @mapoutputs; $i++) {
+ if ($i % 3 == 1) {
+ push @devtest_outs, $mapoutputs[$i];
+ } else {
+ push @dev_outs, $mapoutputs[$i];
+ }
+ }
+ if (scalar @devtest_outs == 0) {
+ die "Not enough training instances for regularization tuning! Rerun without --tune-regularizer\n";
+ }
+ } else {
+ @dev_outs = @mapoutputs;
+ }
+ if ($run_local) {
+ print STDERR "\nCompleted extraction of training exemplars.\n";
+ } elsif ($use_make) {
+ print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+ close $mkfile;
+ my $mcmd = "make -j $use_make -f $mkfilename";
+ print STDERR "\nExecuting: $mcmd\n";
+ check_call($mcmd);
+ } else {
+ print STDERR "\nLaunched $nmappers mappers.\n";
+ sleep 8;
+ print STDERR "Waiting for mappers to complete...\n";
+ while ($nmappers > 0) {
+ sleep 5;
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+ $nmappers = scalar @livejobs;
+ }
+ print STDERR "All mappers complete.\n";
+ }
+ my $tol = 0;
+ my $til = 0;
+ my $dev_test_file = "$dir/splag.$im1/devtest.gz";
+ if ($tune_regularizer) {
+ my $cmd = "cat @devtest_outs | gzip > $dev_test_file";
+ check_bash_call($cmd);
+ die "Can't find file $dev_test_file" unless -f $dev_test_file;
+ }
+ #print STDERR "MO: @mapoutputs\n";
+ for my $mo (@mapoutputs) {
+ #my $olines = get_lines($mo);
+ #my $ilines = get_lines($o2i{$mo});
+ #die "$mo: no training instances generated!" if $olines == 0;
+ }
+ print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
+ print STDERR unchecked_output("date");
+ $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+ if ($tune_regularizer) {
+ $cmd .= " -T -t $dev_test_file";
+ }
+ $cmd .= " > $dir/weights.$iteration";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ $lastWeightsFile = "$dir/weights.$iteration";
+ if ($tune_regularizer) {
+ open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!";
+ my $line = <W>;
+ close W;
+ my ($sharp, $label, $nreg) = split /\s|=/, $line;
+ print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n";
+ $reg = $nreg;
+ # only tune regularizer on first iteration?
+ $tune_regularizer = 0;
+ }
+ $lastPScore = $score;
+ $iteration++;
+ print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+ return $lc;
+}
+
+sub get_comma_sep_refs {
+ my ($r,$p) = @_;
+ my $o = check_output("echo $p");
+ chomp $o;
+ my @files = split /\s+/, $o;
+ return "-$r " . join(" -$r ", @files);
+}
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+# subs
+sub write_config {
+ my $fh = shift;
+ my $cleanup = "yes";
+ if ($disable_clean) {$cleanup = "no";}
+
+ print $fh "\n";
+ print $fh "DECODER: $decoder\n";
+ print $fh "INI FILE: $iniFile\n";
+ print $fh "WORKING DIR: $dir\n";
+ print $fh "SOURCE (DEV): $srcFile\n";
+ print $fh "REFS (DEV): $refFiles\n";
+ print $fh "EVAL METRIC: $metric\n";
+ print $fh "MAX ITERATIONS: $max_iterations\n";
+ print $fh "DECODE NODES: $decode_nodes\n";
+ print $fh "HEAD NODE: $host\n";
+ print $fh "PMEM (DECODING): $pmem\n";
+ print $fh "CLEANUP: $cleanup\n";
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+ die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+
+ my $executable = check_output("basename $0"); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable [options] <ini file>
+ Runs a complete MERT optimization and test set decoding, using
+ the decoder configuration in ini file. Note that many of the
+ options have default values that are inferred automatically
+ based on certain conventions. For details, refer to descriptions
+ of the options --decoder, --weights, and --workdir.
+
+Required:
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted.
+
+ --source-file <file>
+ Dev set source file.
+
+ --weights <file>
+ Initial weights file (use empty file to start from 0)
+
+General options:
+
+ --local
+ Run the decoder and optimizer locally with a single thread.
+
+ --decode-nodes <I>
+ Number of decoder processes to run in parallel. [default=15]
+
+ --help
+ Print this message and exit.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to 10.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs.
+
+ --use-make <I>
+ Use make -j <I> to run the optimizer commands (useful on large
+ shared-memory machines where qsub is unavailable).
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, the
+ name is derived from the ini filename. Assuming that the ini
+ filename begins with the decoder name and ends with ini, the default
+ name of the working directory is inferred from the middle part of
+ the filename. E.g. an ini file named decoder.foo.ini would have
+ a default working directory name foo.
+
+Regularization options:
+
+ --tune-regularizer
+ Hold out one third of the tuning data and used this to tune the
+ regularization parameter.
+
+ --reg <F>
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro-train/mr_pro_generate_mapper_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/pro-train/mr_pro_generate_mapper_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+for my $hg (@hgs) {
+ my $file = $hg;
+ my $id = $hg;
+ $id =~ s/(\.json)?\.gz//;
+ print "$d/$file $id\n";
+}
+
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
new file mode 100644
index 00000000..4324e8de
--- /dev/null
+++ b/pro-train/mr_pro_map.cc
@@ -0,0 +1,351 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <tr1/unordered_map>
+
+#include <boost/functional/hash.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sampler.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "scorer.h"
+#include "inside_outside.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+
+// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
+
+using namespace std;
+namespace po = boost::program_options;
+
+struct ApproxVectorHasher {
+ static const size_t MASK = 0xFFFFFFFFull;
+ union UType {
+ double f;
+ size_t i;
+ };
+ static inline double round(const double x) {
+ UType t;
+ t.f = x;
+ size_t r = t.i & MASK;
+ if ((r << 1) > MASK)
+ t.i += MASK - r + 1;
+ else
+ t.i &= (1ull - MASK);
+ return t.f;
+ }
+ size_t operator()(const SparseVector<double>& x) const {
+ size_t h = 0x573915839;
+ for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+ UType t;
+ t.f = it->second;
+ if (t.f) {
+ size_t z = (t.i >> 32);
+ boost::hash_combine(h, it->first);
+ boost::hash_combine(h, z);
+ }
+ }
+ return h;
+ }
+};
+
+struct ApproxVectorEquals {
+ bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
+ SparseVector<double>::const_iterator bit = b.begin();
+ for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
+ if (bit == b.end() ||
+ ait->first != bit->first ||
+ ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
+ return false;
+ ++bit;
+ }
+ if (bit != b.end()) return false;
+ return true;
+ }
+};
+
+boost::shared_ptr<MT19937> rng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+ ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
+ ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+ ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
+ ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
+ ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
+ ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (!conf->count("weights")) {
+ cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+struct HypInfo {
+ HypInfo() : g_(-100.0) {}
+ HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
+
+ // lazy evaluation
+ double g(const SentenceScorer& scorer) const {
+ if (g_ == -100.0)
+ g_ = scorer.ScoreCandidate(hyp)->ComputeScore();
+ return g_;
+ }
+ vector<WordID> hyp;
+ mutable double g_;
+ SparseVector<double> x;
+};
+
+struct HypInfoCompare {
+ bool operator()(const HypInfo& a, const HypInfo& b) const {
+ ApproxVectorEquals comp;
+ return (a.hyp == b.hyp && comp(a.x,b.x));
+ }
+};
+
+struct HypInfoHasher {
+ size_t operator()(const HypInfo& x) const {
+ boost::hash<vector<WordID> > hhasher;
+ ApproxVectorHasher vhasher;
+ size_t ha = hhasher(x.hyp);
+ boost::hash_combine(ha, vhasher(x.x));
+ return ha;
+ }
+};
+
+void WriteKBest(const string& file, const vector<HypInfo>& kbest) {
+ WriteFile wf(file);
+ ostream& out = *wf.stream();
+ out.precision(10);
+ for (int i = 0; i < kbest.size(); ++i) {
+ out << TD::GetString(kbest[i].hyp) << endl;
+ out << kbest[i].x << endl;
+ }
+}
+
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+ SparseVector<double>& x = *out;
+ size_t last_start = cur;
+ size_t last_comma = string::npos;
+ while(cur <= line.size()) {
+ if (line[cur] == ' ' || cur == line.size()) {
+ if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+ cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+ exit(1);
+ }
+ const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+ if (cur < line.size()) line[cur] = 0;
+ const double val = strtod(&line[last_comma + 1], NULL);
+ x.set_value(fid, val);
+
+ last_comma = string::npos;
+ last_start = cur+1;
+ } else {
+ if (line[cur] == '=')
+ last_comma = cur;
+ }
+ ++cur;
+ }
+}
+
+void ReadKBest(const string& file, vector<HypInfo>* kbest) {
+ cerr << "Reading from " << file << endl;
+ ReadFile rf(file);
+ istream& in = *rf.stream();
+ string cand;
+ string feats;
+ while(getline(in, cand)) {
+ getline(in, feats);
+ assert(in);
+ kbest->push_back(HypInfo());
+ TD::ConvertSentence(cand, &kbest->back().hyp);
+ ParseSparseVector(feats, 0, &kbest->back().x);
+ }
+ cerr << " read " << kbest->size() << " hypotheses\n";
+}
+
+void Dedup(vector<HypInfo>* h) {
+ cerr << "Dedup in=" << h->size();
+ tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare> u;
+ while(h->size() > 0) {
+ u.insert(h->back());
+ h->pop_back();
+ }
+ tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare>::iterator it = u.begin();
+ while (it != u.end()) {
+ h->push_back(*it);
+ it = u.erase(it);
+ }
+ cerr << " out=" << h->size() << endl;
+}
+
+struct ThresholdAlpha {
+ explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
+ double operator()(double mag) const {
+ if (mag < threshold) return 0.0; else return 1.0;
+ }
+ const double threshold;
+};
+
+struct TrainingInstance {
+ TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
+ SparseVector<double> x;
+#undef DEBUGGING_PRO
+#ifdef DEBUGGING_PRO
+ vector<WordID> a;
+ vector<WordID> b;
+#endif
+ bool y;
+ double gdiff;
+};
+#ifdef DEBUGGING_PRO
+ostream& operator<<(ostream& os, const TrainingInstance& d) {
+ return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
+}
+#endif
+
+struct DiffOrder {
+ bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
+ return a.gdiff > b.gdiff;
+ }
+};
+
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
+ vector<TrainingInstance> v1, v2;
+ double avg_diff = 0;
+ for (unsigned i = 0; i < gamma; ++i) {
+ const size_t a = rng->inclusive(0, J_i.size() - 1)();
+ const size_t b = rng->inclusive(0, J_i.size() - 1)();
+ if (a == b) continue;
+ double ga = J_i[a].g(scorer);
+ double gb = J_i[b].g(scorer);
+ bool positive = gb < ga;
+ if (invert_score) positive = !positive;
+ const double gdiff = fabs(ga - gb);
+ if (!gdiff) continue;
+ avg_diff += gdiff;
+ SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
+ if (xdiff.empty()) {
+ cerr << "Empty diff:\n " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl;
+ cerr << " " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl;
+ continue;
+ }
+ v1.push_back(TrainingInstance(xdiff, positive, gdiff));
+#ifdef DEBUGGING_PRO
+ v1.back().a = J_i[a].hyp;
+ v1.back().b = J_i[b].hyp;
+ cerr << "N: " << v1.back() << endl;
+#endif
+ }
+ avg_diff /= v1.size();
+
+ for (unsigned i = 0; i < v1.size(); ++i) {
+ double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+ // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl;
+ if (rng->next() < p) v2.push_back(v1[i]);
+ }
+ vector<TrainingInstance>::iterator mid = v2.begin() + xi;
+ if (xi > v2.size()) mid = v2.end();
+ partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+ copy(v2.begin(), mid, back_inserter(*pv));
+#ifdef DEBUGGING_PRO
+ if (v2.size() >= 5) {
+ for (int i =0; i < (mid - v2.begin()); ++i) {
+ cerr << v2[i] << endl;
+ }
+ cerr << pv->back() << endl;
+ }
+#endif
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ if (conf.count("random_seed"))
+ rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ rng.reset(new MT19937);
+ const string loss_function = conf["loss_function"].as<string>();
+
+ ScoreType type = ScoreTypeFromString(loss_function);
+ DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ istream &in=*in_read.stream();
+ const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+ const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
+ const unsigned xi = conf["best_pairs"].as<unsigned>();
+ string weightsf = conf["weights"].as<string>();
+ vector<double> weights;
+ {
+ Weights w;
+ w.InitFromFile(weightsf);
+ w.InitVector(&weights);
+ }
+ string kbest_repo = conf["kbest_repository"].as<string>();
+ MkDirP(kbest_repo);
+ while(in) {
+ vector<TrainingInstance> v;
+ string line;
+ getline(in, line);
+ if (line.empty()) continue;
+ istringstream is(line);
+ int sent_id;
+ string file;
+ // path-to-file (JSON) sent_id
+ is >> file >> sent_id;
+ ReadFile rf(file);
+ ostringstream os;
+ vector<HypInfo> J_i;
+ os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+ const string kbest_file = os.str();
+ if (FileExists(kbest_file))
+ ReadKBest(kbest_file, &J_i);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(weights);
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+ for (int i = 0; i < kbest_size; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ J_i.push_back(HypInfo(d->yield, d->feature_values));
+ }
+ Dedup(&J_i);
+ WriteKBest(kbest_file, J_i);
+
+ Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v);
+ for (unsigned i = 0; i < v.size(); ++i) {
+ const TrainingInstance& vi = v[i];
+ cout << vi.y << "\t" << vi.x << endl;
+ cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;
+ }
+ }
+ return 0;
+}
+
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
new file mode 100644
index 00000000..9b422f33
--- /dev/null
+++ b/pro-train/mr_pro_reduce.cc
@@ -0,0 +1,277 @@
+#include <cstdlib>
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "optimize.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples, so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
+ ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
+ ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
+ ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
+ ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght")
+ ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght")
+ ("testset,t",po::value<string>(), "Optional held-out test set")
+ ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+ SparseVector<double>& x = *out;
+ size_t last_start = cur;
+ size_t last_comma = string::npos;
+ while(cur <= line.size()) {
+ if (line[cur] == ' ' || cur == line.size()) {
+ if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+ cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+ exit(1);
+ }
+ const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+ if (cur < line.size()) line[cur] = 0;
+ const double val = strtod(&line[last_comma + 1], NULL);
+ x.set_value(fid, val);
+
+ last_comma = string::npos;
+ last_start = cur+1;
+ } else {
+ if (line[cur] == '=')
+ last_comma = cur;
+ }
+ ++cur;
+ }
+}
+
+void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<double> > >* corpus) {
+ istream& in = *pin;
+ corpus->clear();
+ bool flag = false;
+ int lc = 0;
+ string line;
+ SparseVector<double> x;
+ while(getline(in, line)) {
+ ++lc;
+ if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+ if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+ if (line.empty()) continue;
+ const size_t ks = line.find("\t");
+ assert(string::npos != ks);
+ assert(ks == 1);
+ const bool y = line[0] == '1';
+ x.clear();
+ ParseSparseVector(line, ks + 1, &x);
+ corpus->push_back(make_pair(y, x));
+ }
+ if (flag) cerr << endl;
+}
+
+void GradAdd(const SparseVector<double>& v, const double scale, vector<double>* acc) {
+ for (SparseVector<double>::const_iterator it = v.begin();
+ it != v.end(); ++it) {
+ (*acc)[it->first] += it->second * scale;
+ }
+}
+
+double TrainingInference(const vector<double>& x,
+ const vector<pair<bool, SparseVector<double> > >& corpus,
+ vector<double>* g = NULL) {
+ double cll = 0;
+ for (int i = 0; i < corpus.size(); ++i) {
+ const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
+ double lp_false = dotprod;
+ double lp_true = -dotprod;
+ if (0 < lp_true) {
+ lp_true += log1p(exp(-lp_true));
+ lp_false = log1p(exp(lp_false));
+ } else {
+ lp_true = log1p(exp(lp_true));
+ lp_false += log1p(exp(-lp_false));
+ }
+ lp_true*=-1;
+ lp_false*=-1;
+ if (corpus[i].first) { // true label
+ cll -= lp_true;
+ if (g) {
+ // g -= corpus[i].second * exp(lp_false);
+ GradAdd(corpus[i].second, -exp(lp_false), g);
+ (*g)[0] -= exp(lp_false); // bias
+ }
+ } else { // false label
+ cll -= lp_false;
+ if (g) {
+ // g += corpus[i].second * exp(lp_true);
+ GradAdd(corpus[i].second, exp(lp_true), g);
+ (*g)[0] += exp(lp_true); // bias
+ }
+ }
+ }
+ return cll;
+}
+
+// return held-out log likelihood
+double LearnParameters(const vector<pair<bool, SparseVector<double> > >& training,
+ const vector<pair<bool, SparseVector<double> > >& testing,
+ const double sigsq,
+ const unsigned memory_buffers,
+ vector<double>* px) {
+ vector<double>& x = *px;
+ vector<double> vg(FD::NumFeats(), 0.0);
+ bool converged = false;
+ LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
+ double tppl = 0.0;
+ while(!converged) {
+ fill(vg.begin(), vg.end(), 0.0);
+ double cll = TrainingInference(x, training, &vg);
+ double ppl = cll / log(2);
+ ppl /= training.size();
+ ppl = pow(2.0, ppl);
+
+ // evaluate optional held-out test set
+ if (testing.size()) {
+ tppl = TrainingInference(x, testing) / log(2);
+ tppl /= testing.size();
+ tppl = pow(2.0, tppl);
+ }
+
+ // handle regularizer
+#if 1
+ double norm = 0;
+ for (int i = 1; i < x.size(); ++i) {
+ const double mean_i = 0.0;
+ const double param = (x[i] - mean_i);
+ norm += param * param;
+ vg[i] += param / sigsq;
+ }
+ const double reg = norm / (2.0 * sigsq);
+#else
+ double reg = 0;
+#endif
+ cll += reg;
+ cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t";
+ try {
+ vector<double> old_x = x;
+ do {
+ opt.Optimize(cll, vg, &x);
+ converged = opt.HasConverged();
+ } while (!converged && x == old_x);
+ } catch (...) {
+ cerr << "Exception caught, assuming convergence is close enough...\n";
+ converged = true;
+ }
+ if (fabs(x[0]) > MAX_BIAS) {
+ cerr << "Biased model learned. Are your training instances wrong?\n";
+ cerr << " BIAS: " << x[0] << endl;
+ }
+ }
+ return tppl;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ string line;
+ vector<pair<bool, SparseVector<double> > > training, testing;
+ SparseVector<double> old_weights;
+ const bool tune_regularizer = conf.count("tune_regularizer");
+ if (tune_regularizer && !conf.count("testset")) {
+ cerr << "--tune_regularizer requires --testset to be set\n";
+ return 1;
+ }
+ const double min_reg = conf["min_reg"].as<double>();
+ const double max_reg = conf["max_reg"].as<double>();
+ double sigsq = conf["sigma_squared"].as<double>();
+ assert(sigsq > 0.0);
+ assert(min_reg > 0.0);
+ assert(max_reg > 0.0);
+ assert(max_reg > min_reg);
+ const double psi = conf["interpolation"].as<double>();
+ if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+ if (conf.count("weights")) {
+ Weights w;
+ w.InitFromFile(conf["weights"].as<string>());
+ w.InitSparseVector(&old_weights);
+ }
+ ReadCorpus(&cin, &training);
+ if (conf.count("testset")) {
+ ReadFile rf(conf["testset"].as<string>());
+ ReadCorpus(rf.stream(), &testing);
+ }
+ cerr << "Number of features: " << FD::NumFeats() << endl;
+ vector<double> x(FD::NumFeats(), 0.0); // x[0] is bias
+ for (SparseVector<double>::const_iterator it = old_weights.begin();
+ it != old_weights.end(); ++it)
+ x[it->first] = it->second;
+ double tppl = 0.0;
+ vector<pair<double,double> > sp;
+ vector<double> smoothed;
+ if (tune_regularizer) {
+ sigsq = min_reg;
+ const double steps = 18;
+ double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
+ cerr << "SWEEP FACTOR: " << sweep_factor << endl;
+ while(sigsq < max_reg) {
+ tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+ sp.push_back(make_pair(sigsq, tppl));
+ sigsq *= sweep_factor;
+ }
+ smoothed.resize(sp.size(), 0);
+ smoothed[0] = sp[0].second;
+ smoothed.back() = sp.back().second;
+ for (int i = 1; i < sp.size()-1; ++i) {
+ double prev = sp[i-1].second;
+ double next = sp[i+1].second;
+ double cur = sp[i].second;
+ smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
+ }
+ double best_ppl = 9999999;
+ unsigned best_i = 0;
+ for (unsigned i = 0; i < sp.size(); ++i) {
+ if (smoothed[i] < best_ppl) {
+ best_ppl = smoothed[i];
+ best_i = i;
+ }
+ }
+ sigsq = sp[best_i].first;
+ tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+ }
+ Weights w;
+ if (conf.count("weights")) {
+ for (int i = 1; i < x.size(); ++i)
+ x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
+ }
+ cout.precision(15);
+ cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+ if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
+ if (sp.size()) {
+ cout << "# Parameter sweep:\n";
+ for (int i = 0; i < sp.size(); ++i) {
+ cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
+ }
+ }
+ w.InitFromVector(x);
+ w.WriteToFile("-");
+ return 0;
+}
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 4aae2039..9d72cb87 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -235,6 +235,13 @@ class FastSparseVector {
}
return *this;
}
+ FastSparseVector<T> erase_zeros(const T& EPSILON = 1e-4) const {
+ FastSparseVector<T> o;
+ for (const_iterator it = begin(); it != end(); ++it) {
+ if (fabs(it->second) > EPSILON) o.set_value(it->first, it->second);
+ }
+ return o;
+ }
const_iterator begin() const {
return const_iterator(*this, false);
}
@@ -344,15 +351,9 @@ const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSpar
template <typename T>
const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
- if (x.size() > y.size()) {
- FastSparseVector<T> res(x);
- res -= y;
- return res;
- } else {
- FastSparseVector<T> res(y);
- res -= x;
- return res;
- }
+ FastSparseVector<T> res(x);
+ res -= y;
+ return res;
}
template <class T>
diff --git a/utils/filelib.cc b/utils/filelib.cc
index 79ad2847..a0969b1a 100644
--- a/utils/filelib.cc
+++ b/utils/filelib.cc
@@ -20,3 +20,15 @@ bool DirectoryExists(const string& dir) {
return false;
}
+void MkDirP(const string& dir) {
+ if (DirectoryExists(dir)) return;
+ if (mkdir(dir.c_str(), 0777)) {
+ perror(dir.c_str());
+ abort();
+ }
+ if (chmod(dir.c_str(), 07777)) {
+ perror(dir.c_str());
+ abort();
+ }
+}
+
diff --git a/utils/filelib.h b/utils/filelib.h
index dda98671..a8622246 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -12,6 +12,7 @@
bool FileExists(const std::string& file_name);
bool DirectoryExists(const std::string& dir_name);
+void MkDirP(const std::string& dir_name);
// reads from standard in if filename is -
// uncompresses if file ends with .gz