From 7ff60b69cc21c90695ca20829375e6bf9b5f452d Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sun, 10 Jul 2011 23:00:21 -0400
Subject: starting implementation of Hopkins&May (2011) optimizer

---
 pro-train/mr_pro_reduce.cc | 81 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 pro-train/mr_pro_reduce.cc

(limited to 'pro-train/mr_pro_reduce.cc')
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
new file mode 100644
index 00000000..3df52020
--- /dev/null
+++ b/pro-train/mr_pro_reduce.cc
@@ -0,0 +1,81 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "b64tools.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  bool flag = conf->count("loss_function") == 0;
+  if (flag || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string loss_function = conf["loss_function"].as<string>();
+  ScoreType type = ScoreTypeFromString(loss_function);
+  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
+  if (type == TER || type == AER) {
+    opt_type = LineOptimizer::MINIMIZE_SCORE;
+  }
+  string last_key;
+  vector<ErrorSurface> esv;
+  while(cin) {
+    string line;
+    getline(cin, line);
+    if (line.empty()) continue;
+    size_t ks = line.find("\t");
+    assert(string::npos != ks);
+    assert(ks > 2);
+    string key = line.substr(2, ks - 2);
+    string val = line.substr(ks + 1);
+    if (key != last_key) {
+      if (!last_key.empty()) {
+	float score;
+        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+	cout << last_key << "|" << x << "|" << score << endl;
+      }
+      last_key = key;
+      esv.clear();
+    }
+    if (val.size() % 4 != 0) {
+      cerr << "B64 encoding error 1! Skipping.\n";
+      continue;
+    }
+    string encoded(val.size() / 4 * 3, '\0');
+    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+      cerr << "B64 encoding error 2! Skipping.\n";
+      continue;
+    }
+    esv.push_back(ErrorSurface());
+    esv.back().Deserialize(type, encoded);
+  }
+  if (!esv.empty()) {
+    // cerr << "ESV=" << esv.size() << endl;
+    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
+    float score;
+    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+    cout << last_key << "|" << x << "|" << score << endl;
+  }
+  return 0;
+}
-- 
cgit v1.2.3


From a8a8aeba08d5c0f6841394087bb4ec0b6ade0694 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 11 Jul 2011 20:39:45 -0400
Subject: sort of working hopkins&may optimizer

---
 pro-train/Makefile.am                     |   4 +-
 pro-train/dist-pro.pl                     | 308 ++++++++++--------------------
 pro-train/mr_pro_generate_mapper_input.pl |  18 ++
 pro-train/mr_pro_map.cc                   | 118 ++++++++++--
 pro-train/mr_pro_reduce.cc                | 167 ++++++++++++----
 5 files changed, 349 insertions(+), 266 deletions(-)
 create mode 100755 pro-train/mr_pro_generate_mapper_input.pl

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index 945ed5c3..fdaf43e2 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc
 mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 35bccea4..55d7f1fa 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -21,7 +21,7 @@ my $bin_dir = $SCRIPT_DIR;
 die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
 my $FAST_SCORE="$bin_dir/../mteval/fast_score";
 die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input";
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
 my $MAPPER = "$bin_dir/mr_pro_map";
 my $REDUCER = "$bin_dir/mr_pro_reduce";
 my $parallelize = "$VEST_DIR/parallelize.pl";
@@ -37,8 +37,7 @@ die "Can't find decoder in $cdec" unless -x $cdec;
 die "Can't find $parallelize" unless -x $parallelize;
 die "Can't find $libcall" unless -e $libcall;
 my $decoder = $cdec;
-my $lines_per_mapper = 400;
-my $rand_directions = 15;
+my $lines_per_mapper = 100;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
@@ -58,7 +57,6 @@ my $metric = "ibm_bleu";
 my $dir;
 my $iniFile;
 my $weights;
-my $initialWeights;
 my $decoderOpt;
 my $noprimary;
 my $maxsim=0;
@@ -67,7 +65,6 @@ my $oracleb=20;
 my $bleu_weight=1;
 my $use_make;  # use make to parallelize line search
 my $dirargs='';
-my $density_prune;
 my $usefork;
 my $pass_suffix = '';
 my $cpbin=1;
@@ -76,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
 	"decoder=s" => \$decoderOpt,
 	"decode-nodes=i" => \$decode_nodes,
-	"density-prune=f" => \$density_prune,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
         "use-fork" => \$usefork,
@@ -91,8 +87,6 @@ if (GetOptions(
 	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
         "cpbin!" => \$cpbin,
-	"rand-directions=i" => \$rand_directions,
-	"random_directions=i" => \$rand_directions,
         "bleu_weight=s" => \$bleu_weight,
         "no-primary!" => \$noprimary,
         "max-similarity=s" => \$maxsim,
@@ -103,18 +97,12 @@ if (GetOptions(
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
-	"weights=s" => \$initialWeights,
 	"workdir=s" => \$dir,
-    "opt-iterations=i" => \$optimization_iters,
 ) == 0 || @ARGV!=1 || $help) {
 	print_help();
 	exit;
 }
 
-if (defined $density_prune) {
-  die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
@@ -146,7 +134,7 @@ if ($metric =~ /^ter$|^aer$/i) {
 my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
 
 unless ($dir){
-	$dir = "vest";
+	$dir = "protrain";
 }
 unless ($dir =~ /^\//){  # convert relative path to absolute path
 	my $basedir = check_output("pwd");
@@ -203,18 +191,19 @@ sub dirsize {
     opendir ISEMPTY,$_[0];
     return scalar(readdir(ISEMPTY))-1;
 }
+my @allweights;
 if ($dryrun){
 	write_config(*STDERR);
 	exit 0;
 } else {
-	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
 	  die "ERROR: working dir $dir already exists\n\n";
 	} else {
 		-e $dir || mkdir $dir;
 		mkdir "$dir/hgs";
         modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
     mkdir "$dir/scripts";
-        my $cmdfile="$dir/rerun-vest.sh";
+        my $cmdfile="$dir/rerun-pro.sh";
         open CMD,'>',$cmdfile;
         print CMD "cd ",&getcwd,"\n";
 #        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
@@ -223,13 +212,8 @@ if ($dryrun){
         close CMD;
         print STDERR $cline;
         chmod(0755,$cmdfile);
-		unless (-e $initialWeights) {
-			print STDERR "Please specify an initial weights file with --initial-weights\n";
-			print_help();
-			exit;
-		}
-		check_call("cp $initialWeights $dir/weights.0");
-		die "Can't find weights.0" unless (-e "$dir/weights.0");
+	check_call("touch $dir/weights.0");
+	die "Can't find weights.0" unless (-e "$dir/weights.0");
 	}
 	write_config(*STDERR);
 }
@@ -255,6 +239,7 @@ my $random_seed = int(time / 1000);
 my $lastWeightsFile;
 my $lastPScore = 0;
 # main optimization loop
+my @mapoutputs = (); # aggregate map outputs over all iters
 while (1){
 	print STDERR "\n\nITERATION $iteration\n==========\n";
 
@@ -276,10 +261,8 @@ while (1){
 	print STDERR unchecked_output("date");
 	my $im1 = $iteration - 1;
 	my $weightsFile="$dir/weights.$im1";
+        push @allweights, "-w $dir/weights.$im1";
 	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
-	if ($density_prune) {
-		$decoder_cmd .= " --density_prune $density_prune";
-	}
 	my $pcmd;
 	if ($run_local) {
 		$pcmd = "cat $srcFile |";
@@ -320,163 +303,111 @@ while (1){
 	# run optimizer
 	print STDERR "RUNNING OPTIMIZER AT ";
 	print STDERR unchecked_output("date");
+	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
 	my $mergeLog="$logdir/prune-merge.log.$iteration";
 
 	my $score = 0;
 	my $icc = 0;
 	my $inweights="$dir/weights.$im1";
-	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
-		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
-		print STDERR unchecked_output("date");
-		$icc++;
-		my $nop=$noprimary?"--no_primary":"";
-		my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
-		my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
-		$cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
-		print STDERR "COMMAND:\n$cmd\n";
-		check_call($cmd);
-		check_call("mkdir -p $dir/splag.$im1");
-		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
-		print STDERR "COMMAND:\n$cmd\n";
-		check_call($cmd);
-		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
-		my @shards = grep { /^mapinput\./ } readdir(DIR);
-		closedir DIR;
-		die "No shards!" unless scalar @shards > 0;
-		my $joblist = "";
-		my $nmappers = 0;
-		my @mapoutputs = ();
-		@cleanupcmds = ();
-		my %o2i = ();
-		my $first_shard = 1;
-		my $mkfile; # only used with makefiles
-		my $mkfilename;
-		if ($use_make) {
-			$mkfilename = "$dir/splag.$im1/domap.mk";
-			open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
-			print $mkfile "all: $dir/splag.$im1/map.done\n\n";
-		}
-		my @mkouts = ();  # only used with makefiles
-		for my $shard (@shards) {
-			my $mapoutput = $shard;
-			my $client_name = $shard;
-			$client_name =~ s/mapinput.//;
-			$client_name = "vest.$client_name";
-			$mapoutput =~ s/mapinput/mapoutput/;
-			push @mapoutputs, "$dir/splag.$im1/$mapoutput";
-			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-			my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
-			if ($run_local) {
-				print STDERR "COMMAND:\n$script\n";
-				check_bash_call($script);
-			} elsif ($use_make) {
-				my $script_file = "$dir/scripts/map.$shard";
-				open F, ">$script_file" or die "Can't write $script_file: $!";
-				print F "#!/bin/bash\n";
-				print F "$script\n";
-				close F;
-				my $output = "$dir/splag.$im1/$mapoutput";
-				push @mkouts, $output;
-				chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
-				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-				print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
-			} else {
-				my $script_file = "$dir/scripts/map.$shard";
-				open F, ">$script_file" or die "Can't write $script_file: $!";
-				print F "$script\n";
-				close F;
-				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-
-				$nmappers++;
-				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
-				my $jobid = check_output("$qcmd");
-				chomp $jobid;
-				$jobid =~ s/^(\d+)(.*?)$/\1/g;
-				$jobid =~ s/^Your job (\d+) .*$/\1/;
-		 	 	push(@cleanupcmds, "qdel $jobid 2> /dev/null");
-				print STDERR " $jobid";
-				if ($joblist == "") { $joblist = $jobid; }
-				else {$joblist = $joblist . "\|" . $jobid; }
-			}
-		}
+	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	check_call("mkdir -p $dir/splag.$im1");
+	$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+	my @shards = grep { /^mapinput\./ } readdir(DIR);
+	closedir DIR;
+	die "No shards!" unless scalar @shards > 0;
+	my $joblist = "";
+	my $nmappers = 0;
+	@cleanupcmds = ();
+	my %o2i = ();
+	my $first_shard = 1;
+	my $mkfile; # only used with makefiles
+	my $mkfilename;
+	if ($use_make) {
+		$mkfilename = "$dir/splag.$im1/domap.mk";
+		open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+		print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+	}
+	my @mkouts = ();  # only used with makefiles
+	for my $shard (@shards) {
+		my $mapoutput = $shard;
+		my $client_name = $shard;
+		$client_name =~ s/mapinput.//;
+		$client_name = "pro.$client_name";
+		$mapoutput =~ s/mapinput/mapoutput/;
+		push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
 		if ($run_local) {
-			print STDERR "\nProcessing line search complete.\n";
+			print STDERR "COMMAND:\n$script\n";
+			check_bash_call($script);
 		} elsif ($use_make) {
-			print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
-			close $mkfile;
-			my $mcmd = "make -j $use_make -f $mkfilename";
-			print STDERR "\nExecuting: $mcmd\n";
-			check_call($mcmd);
+			my $script_file = "$dir/scripts/map.$shard";
+			open F, ">$script_file" or die "Can't write $script_file: $!";
+			print F "#!/bin/bash\n";
+			print F "$script\n";
+			close F;
+			my $output = "$dir/splag.$im1/$mapoutput";
+			push @mkouts, $output;
+			chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+			print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
 		} else {
-			print STDERR "\nLaunched $nmappers mappers.\n";
-      			sleep 8;
-			print STDERR "Waiting for mappers to complete...\n";
-			while ($nmappers > 0) {
-			  sleep 5;
-			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
-			  $nmappers = scalar @livejobs;
-			}
-			print STDERR "All mappers complete.\n";
+			my $script_file = "$dir/scripts/map.$shard";
+			open F, ">$script_file" or die "Can't write $script_file: $!";
+			print F "$script\n";
+			close F;
+			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+			$nmappers++;
+			my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+			my $jobid = check_output("$qcmd");
+			chomp $jobid;
+			$jobid =~ s/^(\d+)(.*?)$/\1/g;
+			$jobid =~ s/^Your job (\d+) .*$/\1/;
+		 	push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+			print STDERR " $jobid";
+			if ($joblist == "") { $joblist = $jobid; }
+			else {$joblist = $joblist . "\|" . $jobid; }
 		}
-		my $tol = 0;
-		my $til = 0;
-		for my $mo (@mapoutputs) {
-		  my $olines = get_lines($mo);
-		  my $ilines = get_lines($o2i{$mo});
-		  $tol += $olines;
-		  $til += $ilines;
-		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
-		}
-		print STDERR "Results for $tol/$til lines\n";
-		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
-		print STDERR unchecked_output("date");
-		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
-		print STDERR "COMMAND:\n$cmd\n";
-		check_bash_call($cmd);
-		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
-		# sort returns failure even when it doesn't fail for some reason
-		my $best=unchecked_output("$cmd"); chomp $best;
-		print STDERR "$best\n";
-		my ($oa, $x, $xscore) = split /\|/, $best;
-		$score = $xscore;
-		print STDERR "PROJECTED SCORE: $score\n";
-		if (abs($x) < $epsilon) {
-			print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
-			last;
-		}
-                my $psd = $score - $last_score;
-                $last_score = $score;
-		if (abs($psd) < $epsilon) {
-			print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
-			last;
-		}
-		my ($origin, $axis) = split /\s+/, $oa;
-
-		my %ori = convert($origin);
-		my %axi = convert($axis);
-
-		my $finalFile="$dir/weights.$im1-$opt_iter";
-		open W, ">$finalFile" or die "Can't write: $finalFile: $!";
-                my $norm = 0;
-		for my $k (sort keys %ori) {
-			my $dd = $ori{$k} + $axi{$k} * $x;
-                        $norm += $dd * $dd;
-		}
-                $norm = sqrt($norm);
-		$norm = 1;
-		for my $k (sort keys %ori) {
-			my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
-			print W "$k $v\n";
+	}
+	if ($run_local) {
+		print STDERR "\nCompleted extraction of training exemplars.\n";
+	} elsif ($use_make) {
+		print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+		close $mkfile;
+		my $mcmd = "make -j $use_make -f $mkfilename";
+		print STDERR "\nExecuting: $mcmd\n";
+		check_call($mcmd);
+	} else {
+		print STDERR "\nLaunched $nmappers mappers.\n";
+      		sleep 8;
+		print STDERR "Waiting for mappers to complete...\n";
+		while ($nmappers > 0) {
+		  sleep 5;
+		  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+		  $nmappers = scalar @livejobs;
 		}
-		check_call("rm $dir/splag.$im1/*");
-		$inweights = $finalFile;
+		print STDERR "All mappers complete.\n";
 	}
-	$lastWeightsFile = "$dir/weights.$iteration";
-	check_call("cp $inweights $lastWeightsFile");
-	if ($icc < 2) {
-		print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
-		last;
+	my $tol = 0;
+	my $til = 0;
+        print STDERR "MO: @mapoutputs\n";
+	for my $mo (@mapoutputs) {
+		#my $olines = get_lines($mo);
+		#my $ilines = get_lines($o2i{$mo});
+		#die "$mo: no training instances generated!" if $olines == 0;
 	}
+	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
+	print STDERR unchecked_output("date");
+	$cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+	$lastWeightsFile = "$dir/weights.$iteration";
 	$lastPScore = $score;
 	$iteration++;
 	print STDERR "\n==========\n";
@@ -488,24 +419,6 @@ print STDOUT "$lastWeightsFile\n";
 
 exit 0;
 
-sub normalize_weights {
-  my ($rfn, $rpts, $feat) = @_;
-  my @feat_names = @$rfn;
-  my @pts = @$rpts;
-  my $z = 1.0;
-  for (my $i=0; $i < scalar @feat_names; $i++) {
-    if ($feat_names[$i] eq $feat) {
-      $z = $pts[$i];
-      last;
-    }
-  }
-  for (my $i=0; $i < scalar @feat_names; $i++) {
-    $pts[$i] /= $z;
-  }
-  print STDERR " NORM WEIGHTS: @pts\n";
-  return @pts;
-}
-
 sub get_lines {
   my $fn = shift @_;
   open FL, "<$fn" or die "Couldn't read $fn: $!";
@@ -563,7 +476,6 @@ sub write_config {
 	print $fh "HEAD NODE:        $host\n";
 	print $fh "PMEM (DECODING):  $pmem\n";
 	print $fh "CLEANUP:          $cleanup\n";
-	print $fh "INITIAL WEIGHTS:  $initialWeights\n";
 }
 
 sub update_weights_file {
@@ -603,6 +515,7 @@ sub enseg {
 	}
 	close SRC;
 	close NEWSRC;
+	die "Empty dev set!" if ($i == 0);
 }
 
 sub print_help {
@@ -634,10 +547,6 @@ Options:
 	--decoder <decoder path>
 		Decoder binary to use.
 
-	--density-prune <N>
-		Limit the density of the hypergraph on each iteration to N times
-		the number of edges on the Viterbi path.
-
 	--help
 		Print this message and exit.
 
@@ -668,18 +577,9 @@ Options:
 		After each iteration, rescale all feature weights such that feature-
 		name has a weight of 1.0.
 
-	--rand-directions <num>
-		MERT will attempt to optimize along all of the principle directions,
-		set this parameter to explore other directions. Defaults to 5.
-
 	--source-file <file>
 		Dev set source file.
 
-	--weights <file>
-		A file specifying initial feature weights.  The format is
-		FeatureName_1 value1
-		FeatureName_2 value2
-
 	--workdir <dir>
 		Directory for intermediate and output files.  If not specified, the
 		name is derived from the ini filename.  Assuming that the ini
diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro-train/mr_pro_generate_mapper_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/pro-train/mr_pro_generate_mapper_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+# Prints one "HG_DIR/<file> <id>" line for every *.gz entry found in HG_DIR.
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+# The id is the file name minus its trailing ".gz" (or ".json.gz") extension.
+for my $hg (@hgs) {
+  my $file = $hg;
+  my $id = $hg;
+  $id =~ s/(\.json)?\.gz$//;  # anchored so a ".gz" earlier in the name is left alone
+  print "$d/$file $id\n";
+}
+
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index b046cdea..128d93ce 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -10,6 +10,7 @@
 #include "sampler.h"
 #include "filelib.h"
 #include "stringlib.h"
+#include "weights.h"
 #include "scorer.h"
 #include "inside_outside.h"
 #include "hg_io.h"
@@ -27,10 +28,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
-        ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+        ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
         ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
         ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-        ("weights,w",po::value<string>(), "[REQD] Current weights file")
+        ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
         ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
         ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
         ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -44,6 +45,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     cerr << "Please specify one or more references using -r <REF.TXT>\n";
     flag = true;
   }
+  if (!conf->count("weights")) {
+    cerr << "Please specify one or more weights using -w <WEIGHTS.TXT>\n";
+    flag = true;
+  }
   if (flag || conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
@@ -51,18 +56,78 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
-  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-1), x(feats) {}
-  double g() {
+  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
+  // g_ caches the score of hyp; the constructor's -100.0 marks it as not yet computed.
+  // lazy evaluation: the scorer is only invoked while g_ still equals that -100.0 marker
+  double g(const SentenceScorer& scorer) const {
+    if (g_ == -100.0)
+      g_ = scorer.ScoreCandidate(hyp)->ComputeScore();
     return g_;
   }
- private:
-  int sent_id;
   vector<WordID> hyp;
-  double g_;
+  mutable double g_;  // mutable so the const accessor g() can store the cached score
  public:
   SparseVector<double> x;
 };
 
+struct ThresholdAlpha {  // step function: 0.0 below the threshold, 1.0 otherwise
+  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
+  double operator()(double mag) const {  // mag is compared against threshold unchanged
+    return (mag < threshold) ? 0.0 : 1.0;
+  }
+  const double threshold;  // cut-off, fixed at construction
+};
+
+struct TrainingInstance {  // one labelled example: feature vector x, label y, score gap gdiff
+  TrainingInstance(const SparseVector<double>& features, bool is_positive, double score_gap) : x(features), y(is_positive), gdiff(score_gap) {}
+  SparseVector<double> x;  // feature vector handed to the constructor
+#ifdef DEBUGGING_PRO
+  vector<WordID> a;  // debug builds only: left unset by the constructor
+  vector<WordID> b;  // debug builds only: left unset by the constructor
+#endif
+  bool y;  // label passed to the constructor
+  double gdiff;  // the value DiffOrder ranks instances by
+};
+
+struct DiffOrder {  // strict ordering that puts the larger gdiff first
+  bool operator()(const TrainingInstance& lhs, const TrainingInstance& rhs) const {
+    return rhs.gdiff < lhs.gdiff;
+  }
+};
+
+// Draws up to gamma random hypothesis pairs from J_i and appends to *pv the (at most) xi accepted pairs with the largest score gap.
+template<typename Alpha>
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
+  if (J_i.empty()) return;  // nothing to pair up; J_i.size() - 1 below would wrap around
+  vector<TrainingInstance> v;
+  for (unsigned i = 0; i < gamma; ++i) {
+    size_t a = rng->inclusive(0, J_i.size() - 1)();
+    size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) continue;
+    double ga = J_i[a].g(scorer);
+    double gb = J_i[b].g(scorer);
+    bool positive = ga < gb;
+    if (invert_score) positive = !positive;
+    double gdiff = fabs(ga - gb);
+    if (!gdiff) continue;  // identical scores carry no ranking information
+    if (rng->next() < alpha_i(gdiff)) {
+      v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+#ifdef DEBUGGING_PRO
+      v.back().a = J_i[a].hyp;
+      v.back().b = J_i[b].hyp;
+#endif
+    }
+  }
+  vector<TrainingInstance>::iterator mid = (xi < v.size()) ? v.begin() + xi : v.end();  // clamp first: begin() + xi past end() is undefined
+  partial_sort(v.begin(), mid, v.end(), DiffOrder());
+  copy(v.begin(), mid, back_inserter(*pv));
+#ifdef DEBUGGING_PRO
+  for (int i = 0; v.size() >= 5 && i < 5; ++i) {
+    cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -81,7 +146,15 @@ int main(int argc, char** argv) {
   const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
+  vector<string> weights_files = conf["weights"].as<vector<string> >();
+  vector<vector<double> > weights(weights_files.size());
+  for (int i = 0; i < weights.size(); ++i) {
+    Weights w;
+    w.InitFromFile(weights_files[i]);
+    w.InitVector(&weights[i]);
+  }
   while(in) {
+    vector<TrainingInstance> v;
     string line;
     getline(in, line);
     if (line.empty()) continue;
@@ -92,18 +165,27 @@ int main(int argc, char** argv) {
     is >> file >> sent_id;
     ReadFile rf(file);
     HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
     vector<HypInfo> J_i;
-    for (int i = 0; i < kbest_size; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-      if (!d) break;
-      float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
-      // if (invert_score) sentscore *= -1.0;
-      // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
-      d->feature_values;
-      sentscore;
+    int start = weights.size();
+    start -= 4;
+    if (start < 0) start = 0;
+    for (int i = start; i < weights.size(); ++i) {
+      hg.Reweight(weights[i]);
+      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+      for (int i = 0; i < kbest_size; ++i) {
+        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+          kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+        if (!d) break;
+        J_i.push_back(HypInfo(d->yield, d->feature_values));
+      }
+    }
+
+    Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+    for (unsigned i = 0; i < v.size(); ++i) {
+      const TrainingInstance& vi = v[i];
+      cout << vi.y << "\t" << vi.x << endl;
+      cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;
     }
   }
   return 0;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 3df52020..2b9c5ce7 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -1,3 +1,4 @@
+#include <cstdlib>
 #include <sstream>
 #include <iostream>
 #include <fstream>
@@ -6,24 +7,29 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "weights.h"
 #include "sparse_vector.h"
-#include "error_surface.h"
-#include "line_optimizer.h"
-#include "b64tools.h"
+#include "optimize.h"
 
 using namespace std;
 namespace po = boost::program_options;
 
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
+        ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
+        ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
+        ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
   po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  bool flag = conf->count("loss_function") == 0;
-  if (flag || conf->count("help")) {
+  if (conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -32,50 +38,127 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
-  if (type == TER || type == AER) {
-    opt_type = LineOptimizer::MINIMIZE_SCORE;
+  string line;
+  vector<pair<bool, SparseVector<double> > > training;
+  int lc = 0;
+  bool flag = false;
+  SparseVector<double> old_weights;
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
   }
-  string last_key;
-  vector<ErrorSurface> esv;
-  while(cin) {
-    string line;
-    getline(cin, line);
+  while(getline(cin, line)) {
+    ++lc;
+    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
     if (line.empty()) continue;
-    size_t ks = line.find("\t");
+    const size_t ks = line.find("\t");
     assert(string::npos != ks);
-    assert(ks > 2);
-    string key = line.substr(2, ks - 2);
-    string val = line.substr(ks + 1);
-    if (key != last_key) {
-      if (!last_key.empty()) {
-	float score;
-        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-	cout << last_key << "|" << x << "|" << score << endl;
+    assert(ks == 1);
+    const bool y = line[0] == '1';
+    SparseVector<double> x;
+    size_t last_start = ks + 1;
+    size_t last_comma = string::npos;
+    size_t cur = last_start;
+    while(cur <= line.size()) {
+      if (line[cur] == ' ' || cur == line.size()) {
+        if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+          cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+          exit(1);
+        }
+        const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+        if (cur < line.size()) line[cur] = 0;
+        const double val = strtod(&line[last_comma + 1], NULL);
+        x.set_value(fid, val);
+
+        last_comma = string::npos;
+        last_start = cur+1;
+      } else {
+        if (line[cur] == '=')
+          last_comma = cur;
+      }
+      ++cur;
+    }
+    training.push_back(make_pair(y, x));
+  }
+  if (flag) cerr << endl;
+
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  vector<double> vg(FD::NumFeats(), 0.0);
+  SparseVector<double> g;
+  bool converged = false;
+  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  while(!converged) {
+    double cll = 0;
+    double dbias = 0;
+    g.clear();
+    for (int i = 0; i < training.size(); ++i) {
+      const double dotprod = training[i].second.dot(x) + x[0]; // x[0] is bias
+      double lp_false = dotprod;
+      double lp_true = -dotprod;
+      if (0 < lp_true) {
+        lp_true += log1p(exp(-lp_true));
+        lp_false = log1p(exp(lp_false));
+      } else {
+        lp_true = log1p(exp(lp_true));
+        lp_false += log1p(exp(-lp_false));
+      }
+      lp_true*=-1;
+      lp_false*=-1;
+      if (training[i].first) {  // true label
+        cll -= lp_true;
+        g -= training[i].second * exp(lp_false);
+        dbias -= exp(lp_false);
+      } else {                  // false label
+        cll -= lp_false;
+        g += training[i].second * exp(lp_true);
+        dbias += exp(lp_true);
       }
-      last_key = key;
-      esv.clear();
     }
-    if (val.size() % 4 != 0) {
-      cerr << "B64 encoding error 1! Skipping.\n";
-      continue;
+    vg.clear();
+    g.init_vector(&vg);
+    vg[0] = dbias;
+#if 1
+    const double sigsq = conf["sigma_squared"].as<double>();
+    double norm = 0;
+    for (int i = 1; i < x.size(); ++i) {
+      const double mean_i = 0.0;
+      const double param = (x[i] - mean_i);
+      norm += param * param;
+      vg[i] += param / sigsq;
+    } 
+    const double reg = norm / (2.0 * sigsq);
+#else
+    double reg = 0;
+#endif
+    cll += reg;
+    cerr << cll << " (REG=" << reg << ")\t";
+    bool failed = false;
+    try {
+      opt.Optimize(cll, vg, &x);
+    } catch (...) {
+      cerr << "Exception caught, assuming convergence is close enough...\n";
+      failed = true;
     }
-    string encoded(val.size() / 4 * 3, '\0');
-    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
-      cerr << "B64 encoding error 2! Skipping.\n";
-      continue;
+    if (fabs(x[0]) > MAX_BIAS) {
+      cerr << "Biased model learned. Are your training instances wrong?\n";
+      cerr << "  BIAS: " << x[0] << endl;
     }
-    esv.push_back(ErrorSurface());
-    esv.back().Deserialize(type, encoded);
+    converged = failed || opt.HasConverged();
   }
-  if (!esv.empty()) {
-    // cerr << "ESV=" << esv.size() << endl;
-    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
-    float score;
-    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-    cout << last_key << "|" << x << "|" << score << endl;
+  Weights w;
+  if (conf.count("weights")) {
+    for (int i = 1; i < x.size(); ++i)
+      x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  w.InitFromVector(x);
+  w.WriteToFile("-");
   return 0;
 }
-- 
cgit v1.2.3


From 5e3c68b62dd72255db95c5822835a3931770f285 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 12 Jul 2011 22:34:34 -0400
Subject: debugged pro trainer

---
 pro-train/dist-pro.pl      |   9 +-
 pro-train/mr_pro_map.cc    | 244 +++++++++++++++++++++++++++++++++++++--------
 pro-train/mr_pro_reduce.cc |  57 ++++++-----
 utils/filelib.cc           |  12 +++
 utils/filelib.h            |   1 +
 5 files changed, 253 insertions(+), 70 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 55d7f1fa..c42e3876 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -66,6 +66,7 @@ my $bleu_weight=1;
 my $use_make;  # use make to parallelize line search
 my $dirargs='';
 my $usefork;
+my $initial_weights;
 my $pass_suffix = '';
 my $cpbin=1;
 # Process command-line options
@@ -79,6 +80,7 @@ if (GetOptions(
 	"dry-run" => \$dryrun,
 	"epsilon=s" => \$epsilon,
 	"help" => \$help,
+        "weights=s" => \$initial_weights,
 	"interval" => \$interval,
 	"iteration=i" => \$iteration,
 	"local" => \$run_local,
@@ -212,7 +214,7 @@ if ($dryrun){
         close CMD;
         print STDERR $cline;
         chmod(0755,$cmdfile);
-	check_call("touch $dir/weights.0");
+	check_call("cp $initial_weights $dir/weights.0");
 	die "Can't find weights.0" unless (-e "$dir/weights.0");
 	}
 	write_config(*STDERR);
@@ -239,7 +241,6 @@ my $random_seed = int(time / 1000);
 my $lastWeightsFile;
 my $lastPScore = 0;
 # main optimization loop
-my @mapoutputs = (); # aggregate map outputs over all iters
 while (1){
 	print STDERR "\n\nITERATION $iteration\n==========\n";
 
@@ -262,6 +263,7 @@ while (1){
 	my $im1 = $iteration - 1;
 	my $weightsFile="$dir/weights.$im1";
         push @allweights, "-w $dir/weights.$im1";
+        `rm -f $dir/hgs/*.gz`;
 	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
 	my $pcmd;
 	if ($run_local) {
@@ -333,6 +335,7 @@ while (1){
 		print $mkfile "all: $dir/splag.$im1/map.done\n\n";
 	}
 	my @mkouts = ();  # only used with makefiles
+	my @mapoutputs = ();
 	for my $shard (@shards) {
 		my $mapoutput = $shard;
 		my $client_name = $shard;
@@ -341,7 +344,7 @@ while (1){
 		$mapoutput =~ s/mapinput/mapoutput/;
 		push @mapoutputs, "$dir/splag.$im1/$mapoutput";
 		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
 		if ($run_local) {
 			print STDERR "COMMAND:\n$script\n";
 			check_bash_call($script);
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 128d93ce..4324e8de 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -2,7 +2,9 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
+#include <tr1/unordered_map>
 
+#include <boost/functional/hash.hpp>
 #include <boost/shared_ptr.hpp>
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
@@ -22,16 +24,63 @@
 using namespace std;
 namespace po = boost::program_options;
 
+struct ApproxVectorHasher {  // hashes sparse vectors by the upper 32 bits of each rounded value
+  static const size_t MASK = 0xFFFFFFFFull;  // selects the low 32 bits of the 64-bit pattern
+  union UType {  // views a double as an integer; only meaningful when size_t is 64 bits (see >> 32)
+    double f;
+    size_t i;
+  };
+  static inline double round(const double x) {  // nearest value whose low 32 bits are all zero
+    UType t;
+    t.f = x;
+    const size_t r = t.i & MASK;
+    if ((r << 1) > MASK)
+      t.i += MASK - r + 1;  // round up: the carry clears the low word
+    else
+      t.i &= ~MASK;  // round down: clear the whole low word ((1ull - MASK) left bit 1 set)
+    return t.f;
+  }
+  // Hash the rounded value: ApproxVectorEquals compares rounded values, so equal keys must collide.
+  size_t operator()(const SparseVector<double>& x) const {
+    size_t h = 0x573915839;
+    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+      UType t;
+      t.f = round(it->second);
+      if (t.f) {  // entries that round to zero do not contribute to the hash
+        boost::hash_combine(h, it->first);
+        boost::hash_combine(h, t.i >> 32);
+      }
+    }
+    return h;
+  }
+};
+
+struct ApproxVectorEquals {  // equality on sparse vectors after passing values through ApproxVectorHasher::round
+  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
+    SparseVector<double>::const_iterator bit = b.begin();
+    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {  // walk a and b in lockstep
+      if (bit == b.end() ||  // b ran out of entries before a did
+          ait->first != bit->first ||  // keys differ at this position
+          ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
+        return false;
+      ++bit;
+    }
+    if (bit != b.end()) return false;  // b has entries left over
+    return true;
+  }
+};
+
 boost::shared_ptr<MT19937> rng;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
         ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
         ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
-        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-        ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
         ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
         ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
         ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -46,7 +95,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     flag = true;
   }
   if (!conf->count("weights")) {
-    cerr << "Please specify one or more weights using -w <WEIGHTS.TXT>\n";
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
     flag = true;
   }
   if (flag || conf->count("help")) {
@@ -56,6 +105,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
+  HypInfo() : g_(-100.0) {}
   HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
 
   // lazy evaluation
@@ -66,10 +116,92 @@ struct HypInfo {
   }
   vector<WordID> hyp;
   mutable double g_;
- public:
   SparseVector<double> x;
 };
 
+struct HypInfoCompare {  // equality predicate on HypInfo (despite the name, not an ordering)
+  bool operator()(const HypInfo& a, const HypInfo& b) const {
+    ApproxVectorEquals comp;
+    return (a.hyp == b.hyp && comp(a.x,b.x));  // same word sequence and approximately equal features
+  }
+};
+
+struct HypInfoHasher {  // hash functor for HypInfo: combines the word sequence and the feature vector
+  size_t operator()(const HypInfo& x) const {
+    boost::hash<vector<WordID> > hhasher;
+    ApproxVectorHasher vhasher;
+    size_t ha = hhasher(x.hyp);  // start from the hash of the word ids
+    boost::hash_combine(ha, vhasher(x.x));  // mix in the approximate hash of the features
+    return ha;
+  }
+};
+
+// Writes every hypothesis to `file` as two lines: its string form, then its feature vector.
+void WriteKBest(const string& file, const vector<HypInfo>& kbest) {
+  WriteFile wf(file);
+  ostream& out = *wf.stream();
+  out.precision(10);
+  for (size_t i = 0; i < kbest.size(); ++i)  // '\n' instead of endl: no stream flush per line
+    out << TD::GetString(kbest[i].hyp) << '\n' << kbest[i].x << '\n';
+  out.flush();  // a single flush once everything has been written
+}
+
+// Parses space-separated name=value tokens from line[cur..] into *out; overwrites separators with NUL.
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;  // index of the current token's '=' (npos: none seen yet)
+  while(cur <= line.size()) {
+    if (cur == line.size() || line[cur] == ' ') {  // bounds test first: never index past the end
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {  // empty token or no '='
+        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      // c_str() guarantees the NUL terminator strtod needs when the value ends the line.
+      const double val = strtod(line.c_str() + last_comma + 1, NULL);
+      x.set_value(fid, val);
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else if (line[cur] == '=') {
+      last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
+// Appends the hypotheses stored in `file` (alternating candidate / feature lines) to *kbest.
+void ReadKBest(const string& file, vector<HypInfo>* kbest) {
+  cerr << "Reading from " << file << endl;
+  ReadFile rf(file);
+  istream& in = *rf.stream();
+  string cand, feats;
+  while(getline(in, cand)) {
+    // A candidate with no feature line means a truncated file; fail even when asserts are off.
+    if (!getline(in, feats)) { cerr << "[ERROR] " << file << ": missing feature line\n"; exit(1); }
+    kbest->push_back(HypInfo());
+    TD::ConvertSentence(cand, &kbest->back().hyp);
+    ParseSparseVector(feats, 0, &kbest->back().x);
+  }
+  cerr << "  read " << kbest->size() << " hypotheses\n";
+}
+
+void Dedup(vector<HypInfo>* h) {  // removes duplicates from *h in place; *h is refilled in the set's iteration order
+  cerr << "Dedup in=" << h->size();
+  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare> u;
+  while(h->size() > 0) {  // drain *h from the back into the set
+    u.insert(h->back());
+    h->pop_back();
+  }
+  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare>::iterator it = u.begin();
+  while (it != u.end()) {  // copy each survivor back, erasing it from the set as we go
+    h->push_back(*it);
+    it = u.erase(it);
+  }
+  cerr << "  out=" << h->size() << endl;
+}
+
 struct ThresholdAlpha {
   explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
   double operator()(double mag) const {
@@ -81,6 +213,7 @@ struct ThresholdAlpha {
 struct TrainingInstance {
   TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
   SparseVector<double> x;
+#undef DEBUGGING_PRO
 #ifdef DEBUGGING_PRO
   vector<WordID> a;
   vector<WordID> b;
@@ -88,6 +221,11 @@ struct TrainingInstance {
   bool y;
   double gdiff;
 };
+#ifdef DEBUGGING_PRO
+ostream& operator<<(ostream& os, const TrainingInstance& d) {  // debug dump: gdiff, label, hypotheses a and b, feature vector x
+  return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
+}
+#endif
 
 struct DiffOrder {
   bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
@@ -95,36 +233,51 @@ struct DiffOrder {
   }
 };
 
-template<typename Alpha>
-void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
-  vector<TrainingInstance> v;
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
+  vector<TrainingInstance> v1, v2;
+  double avg_diff = 0;
   for (unsigned i = 0; i < gamma; ++i) {
-    size_t a = rng->inclusive(0, J_i.size() - 1)();
-    size_t b = rng->inclusive(0, J_i.size() - 1)();
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
     if (a == b) continue;
     double ga = J_i[a].g(scorer);
     double gb = J_i[b].g(scorer);
-    bool positive = ga < gb;
+    bool positive = gb < ga;
     if (invert_score) positive = !positive;
-    double gdiff = fabs(ga - gb);
+    const double gdiff = fabs(ga - gb);
     if (!gdiff) continue;
-    if (rng->next() < alpha_i(gdiff)) {
-      v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+    avg_diff += gdiff;
+    SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
+    if (xdiff.empty()) {
+      cerr << "Empty diff:\n  " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl;
+      cerr << "  " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl;
+      continue;
+    }
+    v1.push_back(TrainingInstance(xdiff, positive, gdiff));
 #ifdef DEBUGGING_PRO
-      v.back().a = J_i[a].hyp;
-      v.back().b = J_i[b].hyp;
+    v1.back().a = J_i[a].hyp;
+    v1.back().b = J_i[b].hyp;
+    cerr << "N: " << v1.back() << endl;
 #endif
-    }
   }
-  vector<TrainingInstance>::iterator mid = v.begin() + xi;
-  if (xi > v.size()) mid = v.end();
-  partial_sort(v.begin(), mid, v.end(), DiffOrder());
-  copy(v.begin(), mid, back_inserter(*pv));
+  avg_diff /= v1.size();
+
+  for (unsigned i = 0; i < v1.size(); ++i) {
+    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+    // cerr << "avg_diff=" << avg_diff << "  gdiff=" << v1[i].gdiff << "  p=" << p << endl;
+    if (rng->next() < p) v2.push_back(v1[i]);
+  }
+  vector<TrainingInstance>::iterator mid = v2.begin() + xi;
+  if (xi > v2.size()) mid = v2.end();
+  partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+  copy(v2.begin(), mid, back_inserter(*pv));
 #ifdef DEBUGGING_PRO
-  if (v.size() >= 5)
-    for (int i =0; i < 5; ++i) {
-      cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+  if (v2.size() >= 5) {
+    for (int i =0; i < (mid - v2.begin()); ++i) {
+      cerr << v2[i] << endl;
     }
+    cerr << pv->back() << endl;
+  }
 #endif
 }
 
@@ -136,6 +289,7 @@ int main(int argc, char** argv) {
   else
     rng.reset(new MT19937);
   const string loss_function = conf["loss_function"].as<string>();
+
   ScoreType type = ScoreTypeFromString(loss_function);
   DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
   cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
@@ -146,13 +300,15 @@ int main(int argc, char** argv) {
   const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
-  vector<string> weights_files = conf["weights"].as<vector<string> >();
-  vector<vector<double> > weights(weights_files.size());
-  for (int i = 0; i < weights.size(); ++i) {
+  string weightsf = conf["weights"].as<string>();
+  vector<double> weights;
+  {
     Weights w;
-    w.InitFromFile(weights_files[i]);
-    w.InitVector(&weights[i]);
+    w.InitFromFile(weightsf);
+    w.InitVector(&weights);
   }
+  string kbest_repo = conf["kbest_repository"].as<string>();
+  MkDirP(kbest_repo);
   while(in) {
     vector<TrainingInstance> v;
     string line;
@@ -164,24 +320,26 @@ int main(int argc, char** argv) {
     // path-to-file (JSON) sent_id
     is >> file >> sent_id;
     ReadFile rf(file);
-    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    ostringstream os;
     vector<HypInfo> J_i;
-    int start = weights.size();
-    start -= 4;
-    if (start < 0) start = 0;
-    for (int i = start; i < weights.size(); ++i) {
-      hg.Reweight(weights[i]);
-      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
-      for (int i = 0; i < kbest_size; ++i) {
-        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-          kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-        if (!d) break;
-        J_i.push_back(HypInfo(d->yield, d->feature_values));
-      }
+    os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+    const string kbest_file = os.str();
+    if (FileExists(kbest_file))
+      ReadKBest(kbest_file, &J_i);
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+    for (int i = 0; i < kbest_size; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+      if (!d) break;
+      J_i.push_back(HypInfo(d->yield, d->feature_values));
     }
+    Dedup(&J_i);
+    WriteKBest(kbest_file, J_i);
 
-    Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+    Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v);
     for (unsigned i = 0; i < v.size(); ++i) {
       const TrainingInstance& vi = v[i];
       cout << vi.y << "\t" << vi.x << endl;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 2b9c5ce7..e1a7db8a 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -24,7 +24,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
         ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-        ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
+        ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -35,6 +35,31 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
+// Parses space-separated name=value tokens from line[cur..] into *out; overwrites separators with NUL.
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;  // index of the current token's '=' (npos: none seen yet)
+  while(cur <= line.size()) {
+    if (cur == line.size() || line[cur] == ' ') {  // bounds test first: never index past the end
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {  // empty token or no '='
+        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      // c_str() guarantees the NUL terminator strtod needs when the value ends the line.
+      const double val = strtod(line.c_str() + last_comma + 1, NULL);
+      x.set_value(fid, val);
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else if (line[cur] == '=') {
+      last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -60,28 +85,7 @@ int main(int argc, char** argv) {
     assert(ks == 1);
     const bool y = line[0] == '1';
     SparseVector<double> x;
-    size_t last_start = ks + 1;
-    size_t last_comma = string::npos;
-    size_t cur = last_start;
-    while(cur <= line.size()) {
-      if (line[cur] == ' ' || cur == line.size()) {
-        if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
-          cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
-          exit(1);
-        }
-        const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
-        if (cur < line.size()) line[cur] = 0;
-        const double val = strtod(&line[last_comma + 1], NULL);
-        x.set_value(fid, val);
-
-        last_comma = string::npos;
-        last_start = cur+1;
-      } else {
-        if (line[cur] == '=')
-          last_comma = cur;
-      }
-      ++cur;
-    }
+    ParseSparseVector(line, ks + 1, &x);
     training.push_back(make_pair(y, x));
   }
   if (flag) cerr << endl;
@@ -95,6 +99,7 @@ int main(int argc, char** argv) {
   SparseVector<double> g;
   bool converged = false;
   LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  double ppl = 0;
   while(!converged) {
     double cll = 0;
     double dbias = 0;
@@ -114,14 +119,18 @@ int main(int argc, char** argv) {
       lp_false*=-1;
       if (training[i].first) {  // true label
         cll -= lp_true;
+        ppl += lp_true / log(2);
         g -= training[i].second * exp(lp_false);
         dbias -= exp(lp_false);
       } else {                  // false label
         cll -= lp_false;
+        ppl += lp_false / log(2);
         g += training[i].second * exp(lp_true);
         dbias += exp(lp_true);
       }
     }
+    ppl /= training.size();
+    ppl = pow(2.0, - ppl);
     vg.clear();
     g.init_vector(&vg);
     vg[0] = dbias;
@@ -139,7 +148,7 @@ int main(int argc, char** argv) {
     double reg = 0;
 #endif
     cll += reg;
-    cerr << cll << " (REG=" << reg << ")\t";
+    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t";
     bool failed = false;
     try {
       opt.Optimize(cll, vg, &x);
diff --git a/utils/filelib.cc b/utils/filelib.cc
index 79ad2847..a0969b1a 100644
--- a/utils/filelib.cc
+++ b/utils/filelib.cc
@@ -20,3 +20,15 @@ bool DirectoryExists(const string& dir) {
   return false;
 }
 
+// Creates `dir` unless it already exists; despite the name, missing parents are not created.
+void MkDirP(const string& dir) {
+  if (DirectoryExists(dir)) return;
+  if (mkdir(dir.c_str(), 0777)) {
+    // Another process may have created it since the check above; that is not an error.
+    if (DirectoryExists(dir)) return;
+    perror(dir.c_str()); abort();
+  }
+  // NOTE(review): 07777 also sets the setuid/setgid/sticky bits -- confirm 0777 was not meant.
+  if (chmod(dir.c_str(), 07777)) { perror(dir.c_str()); abort(); }
+}
+
diff --git a/utils/filelib.h b/utils/filelib.h
index dda98671..a8622246 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -12,6 +12,7 @@
 
 bool FileExists(const std::string& file_name);
 bool DirectoryExists(const std::string& dir_name);
+void MkDirP(const std::string& dir_name);
 
 // reads from standard in if filename is -
 // uncompresses if file ends with .gz
-- 
cgit v1.2.3


From b8f7fc10e14eb07b17f1ef46f8ecd3c13f128814 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 12 Jul 2011 23:32:11 -0400
Subject: minor optimization

---
 pro-train/mr_pro_reduce.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index e1a7db8a..5382e1a5 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -149,18 +149,20 @@ int main(int argc, char** argv) {
 #endif
     cll += reg;
     cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t";
-    bool failed = false;
     try {
-      opt.Optimize(cll, vg, &x);
+      vector<double> old_x = x;
+      do {
+        opt.Optimize(cll, vg, &x);
+        converged = opt.HasConverged();
+      } while (!converged && x == old_x);
     } catch (...) {
       cerr << "Exception caught, assuming convergence is close enough...\n";
-      failed = true;
+      converged = true;
     }
     if (fabs(x[0]) > MAX_BIAS) {
       cerr << "Biased model learned. Are your training instances wrong?\n";
       cerr << "  BIAS: " << x[0] << endl;
     }
-    converged = failed || opt.HasConverged();
   }
   Weights w;
   if (conf.count("weights")) {
-- 
cgit v1.2.3


From 9b469ea153e5ae63f4524a71caf3c4518e5f775d Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 13 Jul 2011 16:25:05 -0400
Subject: faster code, optional held-out test set

---
 pro-train/mr_pro_reduce.cc | 140 ++++++++++++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 51 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 5382e1a5..491ceb3a 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -7,6 +7,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "filelib.h"
 #include "weights.h"
 #include "sparse_vector.h"
 #include "optimize.h"
@@ -25,6 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
         ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
+        ("testset,t",po::value<string>(), "Optional held-out test set to tune regularizer")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -60,13 +62,79 @@ void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
   }
 }
 
+void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<double> > >* corpus) {  // replaces *corpus with the instances read from *pin
+  istream& in = *pin;
+  corpus->clear();
+  bool flag = false;  // true while a row of progress dots still needs its closing newline
+  int lc = 0;  // lines read so far, blank ones included
+  string line;
+  SparseVector<double> x;
+  while(getline(in, line)) {
+    ++lc;
+    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+    if (line.empty()) continue;
+    const size_t ks = line.find("\t");
+    assert(string::npos != ks);
+    assert(ks == 1);  // the label is a single character followed by a tab
+    const bool y = line[0] == '1';  // '1' marks a positive instance; any other character is negative
+    x.clear();
+    ParseSparseVector(line, ks + 1, &x);  // the features start right after the tab
+    corpus->push_back(make_pair(y, x));
+  }
+  if (flag) cerr << endl;
+}
+
+void GradAdd(const SparseVector<double>& v, const double scale, vector<double>* acc) {  // (*acc) += scale * v
+  for (SparseVector<double>::const_iterator it = v.begin();
+       it != v.end(); ++it) {
+    (*acc)[it->first] += it->second * scale;  // unchecked index: *acc must cover every key of v
+  }
+}
+
+double TrainingInference(const vector<double>& x,  // returns the summed negative log-likelihood of corpus under weights x
+                         const vector<pair<bool, SparseVector<double> > >& corpus,
+                         vector<double>* g = NULL) {  // if non-NULL, *g is zeroed and receives the gradient
+  if (g) fill(g->begin(), g->end(), 0.0);
+
+  double cll = 0;
+  for (int i = 0; i < corpus.size(); ++i) {
+    const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
+    double lp_false = dotprod;
+    double lp_true = -dotprod;
+    if (0 < lp_true) {  // dotprod < 0: only exp() of a negative argument is evaluated
+      lp_true += log1p(exp(-lp_true));
+      lp_false = log1p(exp(lp_false));
+    } else {  // dotprod >= 0: same quantities, again via exp() of a non-positive argument
+      lp_true = log1p(exp(lp_true));
+      lp_false += log1p(exp(-lp_false));
+    }
+    lp_true*=-1;  // now -log(1 + exp(-dotprod)), the log-probability of the true label
+    lp_false*=-1;  // now -log(1 + exp(dotprod)), the log-probability of the false label
+    if (corpus[i].first) {  // true label
+      cll -= lp_true;
+      if (g) {
+        // g -= corpus[i].second * exp(lp_false);
+        GradAdd(corpus[i].second, -exp(lp_false), g);
+        (*g)[0] -= exp(lp_false); // bias
+      }
+    } else {                  // false label
+      cll -= lp_false;
+      if (g) {
+        // g += corpus[i].second * exp(lp_true);
+        GradAdd(corpus[i].second, exp(lp_true), g);
+        (*g)[0] += exp(lp_true); // bias
+      }
+    }
+  }
+  return cll;
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
   string line;
-  vector<pair<bool, SparseVector<double> > > training;
-  int lc = 0;
-  bool flag = false;
+  vector<pair<bool, SparseVector<double> > > training, testing;
   SparseVector<double> old_weights;
   const double psi = conf["interpolation"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
@@ -75,20 +143,11 @@ int main(int argc, char** argv) {
     w.InitFromFile(conf["weights"].as<string>());
     w.InitSparseVector(&old_weights);
   }
-  while(getline(cin, line)) {
-    ++lc;
-    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
-    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
-    if (line.empty()) continue;
-    const size_t ks = line.find("\t");
-    assert(string::npos != ks);
-    assert(ks == 1);
-    const bool y = line[0] == '1';
-    SparseVector<double> x;
-    ParseSparseVector(line, ks + 1, &x);
-    training.push_back(make_pair(y, x));
+  ReadCorpus(&cin, &training);
+  if (conf.count("testset")) {
+    ReadFile rf(conf["testset"].as<string>());
+    ReadCorpus(rf.stream(), &testing);
   }
-  if (flag) cerr << endl;
 
   cerr << "Number of features: " << FD::NumFeats() << endl;
   vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
@@ -96,44 +155,23 @@ int main(int argc, char** argv) {
        it != old_weights.end(); ++it)
     x[it->first] = it->second;
   vector<double> vg(FD::NumFeats(), 0.0);
-  SparseVector<double> g;
   bool converged = false;
   LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
-  double ppl = 0;
   while(!converged) {
-    double cll = 0;
-    double dbias = 0;
-    g.clear();
-    for (int i = 0; i < training.size(); ++i) {
-      const double dotprod = training[i].second.dot(x) + x[0]; // x[0] is bias
-      double lp_false = dotprod;
-      double lp_true = -dotprod;
-      if (0 < lp_true) {
-        lp_true += log1p(exp(-lp_true));
-        lp_false = log1p(exp(lp_false));
-      } else {
-        lp_true = log1p(exp(lp_true));
-        lp_false += log1p(exp(-lp_false));
-      }
-      lp_true*=-1;
-      lp_false*=-1;
-      if (training[i].first) {  // true label
-        cll -= lp_true;
-        ppl += lp_true / log(2);
-        g -= training[i].second * exp(lp_false);
-        dbias -= exp(lp_false);
-      } else {                  // false label
-        cll -= lp_false;
-        ppl += lp_false / log(2);
-        g += training[i].second * exp(lp_true);
-        dbias += exp(lp_true);
-      }
-    }
+    double cll = TrainingInference(x, training, &vg);
+    double ppl = cll / log(2);
     ppl /= training.size();
-    ppl = pow(2.0, - ppl);
-    vg.clear();
-    g.init_vector(&vg);
-    vg[0] = dbias;
+    ppl = pow(2.0, ppl);
+    double tppl = 0.0;
+
+    // evaluate optional held-out test set
+    if (testing.size()) {
+      tppl = TrainingInference(x, testing) / log(2);
+      tppl /= testing.size();
+      tppl = pow(2.0, tppl);
+    }
+
+    // handle regularizer
 #if 1
     const double sigsq = conf["sigma_squared"].as<double>();
     double norm = 0;
@@ -148,7 +186,7 @@ int main(int argc, char** argv) {
     double reg = 0;
 #endif
     cll += reg;
-    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t";
+    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t";
     try {
       vector<double> old_x = x;
       do {
-- 
cgit v1.2.3


From b89c1f03c89c6c30b88099e4f3e0c1753d338ea7 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 16 Jul 2011 19:13:21 -0400
Subject: tune regularizer

---
 mteval/scorer.cc           |  12 +++-
 pro-train/dist-pro.pl      | 139 ++++++++++++++++++++++++++-------------------
 pro-train/mr_pro_reduce.cc | 128 ++++++++++++++++++++++++++++++-----------
 3 files changed, 185 insertions(+), 94 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/mteval/scorer.cc b/mteval/scorer.cc
index 2daa0daa..a83b9e2f 100644
--- a/mteval/scorer.cc
+++ b/mteval/scorer.cc
@@ -430,6 +430,7 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
   float log_bleu = 0;
   if (precs) precs->clear();
   int count = 0;
+  vector<float> total_precs(N());
   for (int i = 0; i < N(); ++i) {
     if (hyp_ngram_counts[i] > 0) {
       float cor_count = correct_ngram_hit_counts[i];
@@ -440,14 +441,21 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
       log_bleu += lprec;
       ++count;
     }
+    total_precs[i] = log_bleu;
   }
-  log_bleu /= static_cast<float>(count);
+  vector<float> bleus(N());
   float lbp = 0.0;
   if (hyp_len < ref_len)
     lbp = (hyp_len - ref_len) / hyp_len;
   log_bleu += lbp;
   if (bp) *bp = exp(lbp);
-  return exp(log_bleu);
+  float wb = 0;
+  for (int i = 0; i < N(); ++i) {
+    bleus[i] = exp(total_precs[i] / (i+1) + lbp);
+    wb += bleus[i] / pow(2.0, 4.0 - i);
+  }
+  //return wb;
+  return bleus.back();
 }
 
 
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index c42e3876..dbfa329a 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -37,42 +37,36 @@ die "Can't find decoder in $cdec" unless -x $cdec;
 die "Can't find $parallelize" unless -x $parallelize;
 die "Can't find $libcall" unless -e $libcall;
 my $decoder = $cdec;
-my $lines_per_mapper = 100;
+my $lines_per_mapper = 30;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
-my $max_iterations = 15;
-my $optimization_iters = 6;
+my $max_iterations = 30;
 my $decode_nodes = 15;   # number of decode nodes
-my $pmem = "9g";
+my $pmem = "4g";
 my $disable_clean = 0;
 my %seen_weights;
-my $normalize;
 my $help = 0;
 my $epsilon = 0.0001;
-my $interval = 5;
 my $dryrun = 0;
 my $last_score = -10000000;
 my $metric = "ibm_bleu";
 my $dir;
 my $iniFile;
 my $weights;
-my $decoderOpt;
-my $noprimary;
-my $maxsim=0;
-my $oraclen=0;
-my $oracleb=20;
-my $bleu_weight=1;
-my $use_make;  # use make to parallelize line search
-my $dirargs='';
+my $use_make;  # use make to parallelize
 my $usefork;
 my $initial_weights;
 my $pass_suffix = '';
 my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 1e-2;
+
 # Process command-line options
 Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
-	"decoder=s" => \$decoderOpt,
 	"decode-nodes=i" => \$decode_nodes,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
@@ -81,21 +75,13 @@ if (GetOptions(
 	"epsilon=s" => \$epsilon,
 	"help" => \$help,
         "weights=s" => \$initial_weights,
-	"interval" => \$interval,
-	"iteration=i" => \$iteration,
+	"tune-regularizer" => \$tune_regularizer,
+	"reg=f" => \$reg,
 	"local" => \$run_local,
 	"use-make=i" => \$use_make,
 	"max-iterations=i" => \$max_iterations,
-	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
         "cpbin!" => \$cpbin,
-        "bleu_weight=s" => \$bleu_weight,
-        "no-primary!" => \$noprimary,
-        "max-similarity=s" => \$maxsim,
-        "oracle-directions=i" => \$oraclen,
-        "n-oracle=i" => \$oraclen,
-        "oracle-batch=i" => \$oracleb,
-        "directions-args=s" => \$dirargs,
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
@@ -108,9 +94,7 @@ if (GetOptions(
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
-  $lines_per_mapper = 40;
-} elsif ($metric =~ /^meteor$/i) {
-  $lines_per_mapper = 2000;   # start up time is really high
+  $lines_per_mapper = 5;
 }
 
 ($iniFile) = @ARGV;
@@ -144,8 +128,6 @@ unless ($dir =~ /^\//){  # convert relative path to absolute path
 	$dir = "$basedir/$dir";
 }
 
-if ($decoderOpt){ $decoder = $decoderOpt; }
-
 
 # Initializations and helper functions
 srand;
@@ -378,6 +360,22 @@ while (1){
 			else {$joblist = $joblist . "\|" . $jobid; }
 		}
 	}
+	my @dev_outs = ();
+	my @devtest_outs = ();
+	if ($tune_regularizer) {
+		for (my $i = 0; $i < scalar @mapoutputs; $i++) {
+			if ($i % 3 == 1) {
+				push @devtest_outs, $mapoutputs[$i];
+			} else {
+				push @dev_outs, $mapoutputs[$i];
+			}
+		}
+		if (scalar @devtest_outs == 0) {
+			die "Not enough training instances for regularization tuning! Rerun without --tune-regularizer\n";
+		}
+	} else {
+		@dev_outs = @mapoutputs;
+	}
 	if ($run_local) {
 		print STDERR "\nCompleted extraction of training exemplars.\n";
 	} elsif ($use_make) {
@@ -399,7 +397,13 @@ while (1){
 	}
 	my $tol = 0;
 	my $til = 0;
-        print STDERR "MO: @mapoutputs\n";
+	my $dev_test_file = "$dir/splag.$im1/devtest.gz";
+	if ($tune_regularizer) {
+		my $cmd = "cat @devtest_outs | gzip > $dev_test_file";
+		check_bash_call($cmd);
+		die "Can't find file $dev_test_file" unless -f $dev_test_file;
+	}
+        #print STDERR "MO: @mapoutputs\n";
 	for my $mo (@mapoutputs) {
 		#my $olines = get_lines($mo);
 		#my $ilines = get_lines($o2i{$mo});
@@ -407,10 +411,24 @@ while (1){
 	}
 	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
 	print STDERR unchecked_output("date");
-	$cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration";
+	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+	if ($tune_regularizer) {
+		$cmd .= " -T -t $dev_test_file";
+	}
+        $cmd .= " > $dir/weights.$iteration";
 	print STDERR "COMMAND:\n$cmd\n";
 	check_bash_call($cmd);
 	$lastWeightsFile = "$dir/weights.$iteration";
+	if ($tune_regularizer) {
+		open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!";
+		my $line = <W>;
+		close W;
+		my ($sharp, $label, $nreg) = split /\s|=/, $line;
+		print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n";
+		$reg = $nreg;
+		# only tune regularizer on first iteration?
+		$tune_regularizer = 0;
+	}
 	$lastPScore = $score;
 	$iteration++;
 	print STDERR "\n==========\n";
@@ -473,7 +491,6 @@ sub write_config {
 	print $fh "SOURCE (DEV):     $srcFile\n";
 	print $fh "REFS (DEV):       $refFiles\n";
 	print $fh "EVAL METRIC:      $metric\n";
-	print $fh "START ITERATION:  $iteration\n";
 	print $fh "MAX ITERATIONS:   $max_iterations\n";
 	print $fh "DECODE NODES:     $decode_nodes\n";
 	print $fh "HEAD NODE:        $host\n";
@@ -535,31 +552,38 @@ Usage: $executable [options] <ini file>
 		based on certain conventions.  For details, refer to descriptions
 		of the options --decoder, --weights, and --workdir.
 
-Options:
+Required:
+
+	--ref-files <files>
+		Dev set ref files.  This option takes only a single string argument.
+		To use multiple files (including file globbing), this argument should
+		be quoted.
+
+	--source-file <file>
+		Dev set source file.
+
+	--weights <file>
+		Initial weights file (use empty file to start from 0)
+
+General options:
 
 	--local
 		Run the decoder and optimizer locally with a single thread.
 
-	--use-make <I>
-		Use make -j <I> to run the optimizer commands (useful on large
-		shared-memory machines where qsub is unavailable).
-
 	--decode-nodes <I>
 		Number of decoder processes to run in parallel. [default=15]
 
-	--decoder <decoder path>
-		Decoder binary to use.
-
 	--help
 		Print this message and exit.
 
-	--iteration <I>
-		Starting iteration number.  If not specified, defaults to 1.
-
 	--max-iterations <M>
 		Maximum number of iterations to run.  If not specified, defaults
 		to 10.
 
+	--metric <method>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
 	--pass-suffix <S>
 		If the decoder is doing multi-pass decoding, the pass suffix "2",
 		"3", etc., is used to control what iteration of weights is set.
@@ -567,21 +591,9 @@ Options:
 	--pmem <N>
 		Amount of physical memory requested for parallel decoding jobs.
 
-	--ref-files <files>
-		Dev set ref files.  This option takes only a single string argument.
-		To use multiple files (including file globbing), this argument should
-		be quoted.
-
-	--metric <method>
-		Metric to optimize.
-		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
-	--normalize <feature-name>
-		After each iteration, rescale all feature weights such that feature-
-		name has a weight of 1.0.
-
-	--source-file <file>
-		Dev set source file.
+	--use-make <I>
+		Use make -j <I> to run the optimizer commands (useful on large
+		shared-memory machines where qsub is unavailable).
 
 	--workdir <dir>
 		Directory for intermediate and output files.  If not specified, the
@@ -591,6 +603,14 @@ Options:
 		the filename.  E.g. an ini file named decoder.foo.ini would have
 		a default working directory name foo.
 
+Regularization options:
+
+	--tune-regularizer
+		Hold out one third of the tuning data and used this to tune the
+		regularization parameter.
+
+	--reg <F>
+
 Help
 }
 
@@ -606,7 +626,6 @@ sub convert {
 }
 
 
-
 sub cmdline {
     return join ' ',($0,@ORIG_ARGV);
 }
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 491ceb3a..9b422f33 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -16,7 +16,7 @@ using namespace std;
 namespace po = boost::program_options;
 
 // since this is a ranking model, there should be equal numbers of
-// positive and negative examples so the bias should be 0
+// positive and negative examples, so the bias should be 0
 static const double MAX_BIAS = 1e-10;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -25,8 +25,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
         ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-        ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
-        ("testset,t",po::value<string>(), "Optional held-out test set to tune regularizer")
+        ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
+        ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght")
+        ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght")
+        ("testset,t",po::value<string>(), "Optional held-out test set")
+        ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -95,8 +98,6 @@ void GradAdd(const SparseVector<double>& v, const double scale, vector<double>*
 double TrainingInference(const vector<double>& x,
                          const vector<pair<bool, SparseVector<double> > >& corpus,
                          vector<double>* g = NULL) {
-  if (g) fill(g->begin(), g->end(), 0.0);
-
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
@@ -130,39 +131,23 @@ double TrainingInference(const vector<double>& x,
   return cll;
 }
 
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  string line;
-  vector<pair<bool, SparseVector<double> > > training, testing;
-  SparseVector<double> old_weights;
-  const double psi = conf["interpolation"].as<double>();
-  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
-  if (conf.count("weights")) {
-    Weights w;
-    w.InitFromFile(conf["weights"].as<string>());
-    w.InitSparseVector(&old_weights);
-  }
-  ReadCorpus(&cin, &training);
-  if (conf.count("testset")) {
-    ReadFile rf(conf["testset"].as<string>());
-    ReadCorpus(rf.stream(), &testing);
-  }
-
-  cerr << "Number of features: " << FD::NumFeats() << endl;
-  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
-  for (SparseVector<double>::const_iterator it = old_weights.begin();
-       it != old_weights.end(); ++it)
-    x[it->first] = it->second;
+// return held-out log likelihood
+double LearnParameters(const vector<pair<bool, SparseVector<double> > >& training,
+                       const vector<pair<bool, SparseVector<double> > >& testing,
+                       const double sigsq,
+                       const unsigned memory_buffers,
+                       vector<double>* px) {
+  vector<double>& x = *px;
   vector<double> vg(FD::NumFeats(), 0.0);
   bool converged = false;
-  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
+  double tppl = 0.0;
   while(!converged) {
+    fill(vg.begin(), vg.end(), 0.0);
     double cll = TrainingInference(x, training, &vg);
     double ppl = cll / log(2);
     ppl /= training.size();
     ppl = pow(2.0, ppl);
-    double tppl = 0.0;
 
     // evaluate optional held-out test set
     if (testing.size()) {
@@ -173,7 +158,6 @@ int main(int argc, char** argv) {
 
     // handle regularizer
 #if 1
-    const double sigsq = conf["sigma_squared"].as<double>();
     double norm = 0;
     for (int i = 1; i < x.size(); ++i) {
       const double mean_i = 0.0;
@@ -202,11 +186,91 @@ int main(int argc, char** argv) {
       cerr << "  BIAS: " << x[0] << endl;
     }
   }
+  return tppl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  string line;
+  vector<pair<bool, SparseVector<double> > > training, testing;
+  SparseVector<double> old_weights;
+  const bool tune_regularizer = conf.count("tune_regularizer");
+  if (tune_regularizer && !conf.count("testset")) {
+    cerr << "--tune_regularizer requires --testset to be set\n";
+    return 1;
+  }
+  const double min_reg = conf["min_reg"].as<double>();
+  const double max_reg = conf["max_reg"].as<double>();
+  double sigsq = conf["sigma_squared"].as<double>();
+  assert(sigsq > 0.0);
+  assert(min_reg > 0.0);
+  assert(max_reg > 0.0);
+  assert(max_reg > min_reg);
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
+  }
+  ReadCorpus(&cin, &training);
+  if (conf.count("testset")) {
+    ReadFile rf(conf["testset"].as<string>());
+    ReadCorpus(rf.stream(), &testing);
+  }
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  double tppl = 0.0;
+  vector<pair<double,double> > sp;
+  vector<double> smoothed;
+  if (tune_regularizer) {
+    sigsq = min_reg;
+    const double steps = 18;
+    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
+    cerr << "SWEEP FACTOR: " << sweep_factor << endl;
+    while(sigsq < max_reg) {
+      tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+      sp.push_back(make_pair(sigsq, tppl));
+      sigsq *= sweep_factor;
+    }
+    smoothed.resize(sp.size(), 0);
+    smoothed[0] = sp[0].second;
+    smoothed.back() = sp.back().second; 
+    for (int i = 1; i < sp.size()-1; ++i) {
+      double prev = sp[i-1].second;
+      double next = sp[i+1].second;
+      double cur = sp[i].second;
+      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
+    }
+    double best_ppl = 9999999;
+    unsigned best_i = 0;
+    for (unsigned i = 0; i < sp.size(); ++i) {
+      if (smoothed[i] < best_ppl) {
+        best_ppl = smoothed[i];
+        best_i = i;
+      }
+    }
+    sigsq = sp[best_i].first;
+    tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+  }
   Weights w;
   if (conf.count("weights")) {
     for (int i = 1; i < x.size(); ++i)
       x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  cout.precision(15);
+  cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
+  if (sp.size()) {
+    cout << "# Parameter sweep:\n";
+    for (int i = 0; i < sp.size(); ++i) {
+      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
+    }
+  }
   w.InitFromVector(x);
   w.WriteToFile("-");
   return 0;
-- 
cgit v1.2.3


From bb86637332d49f71c485df34576e464eaf053656 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 13 Sep 2011 17:36:23 +0100
Subject: get rid of bad Weights class so it no longer keeps a copy of a vector
 inside it

---
 decoder/decoder.cc                    |  64 ++++++++---------
 decoder/decoder.h                     |   9 ++-
 mira/kbest_mira.cc                    |  62 ++++-------------
 pro-train/mr_pro_map.cc               |   8 +--
 pro-train/mr_pro_reduce.cc            |  16 ++---
 training/Makefile.am                  |   8 ---
 training/augment_grammar.cc           |   4 +-
 training/collapse_weights.cc          |   6 +-
 training/compute_cllh.cc              |  23 +++---
 training/grammar_convert.cc           |   8 +--
 training/mpi_batch_optimize.cc        | 127 ++++++++--------------------------
 training/mpi_online_optimize.cc       |  69 +++++++-----------
 training/mr_optimize_reduce.cc        |  19 ++---
 utils/fdict.h                         |   2 +
 utils/phmt.cc                         |   8 +--
 utils/weights.cc                      |  75 ++++++++++++--------
 utils/weights.h                       |  22 +++---
 vest/mr_vest_generate_mapper_input.cc |   6 +-
 18 files changed, 201 insertions(+), 335 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 25eb2de4..4d4b6245 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -159,8 +159,7 @@ struct RescoringPass {
   shared_ptr<ModelSet> models;
   shared_ptr<IntersectionConfiguration> inter_conf;
   vector<const FeatureFunction*> ffs;
-  shared_ptr<Weights> w;      // null == use previous weights
-  vector<double> weight_vector;
+  shared_ptr<vector<weight_t> > weight_vector;
   int fid_summary;            // 0 == no summary feature
   double density_prune;       // 0 == don't density prune
   double beam_prune;          // 0 == don't beam prune
@@ -169,7 +168,7 @@ struct RescoringPass {
 ostream& operator<<(ostream& os, const RescoringPass& rp) {
   os << "[num_fn=" << rp.ffs.size();
   if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; }
-  if (rp.w) os << " new_weights";
+  //if (rp.weight_vector.size() > 0) os << " new_weights";
   if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary);
   if (rp.density_prune) os << " density_prune=" << rp.density_prune;
   if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune;
@@ -181,13 +180,8 @@ struct DecoderImpl {
   DecoderImpl(po::variables_map& conf, int argc, char** argv, istream* cfg);
   ~DecoderImpl();
   bool Decode(const string& input, DecoderObserver*);
-  void SetWeights(const vector<double>& weights) {
-    init_weights = weights;
-    for (int i = 0; i < rescoring_passes.size(); ++i) {
-      if (rescoring_passes[i].models)
-        rescoring_passes[i].models->SetWeights(weights);
-      rescoring_passes[i].weight_vector = weights;
-    }
+  vector<weight_t>& CurrentWeightVector() {
+    return *rescoring_passes.back().weight_vector;
   }
   void SetId(int next_sent_id) { sent_id = next_sent_id - 1; }
 
@@ -300,8 +294,7 @@ struct DecoderImpl {
   OracleBleu oracle;
   string formalism;
   shared_ptr<Translator> translator;
-  Weights w_init_weights;      // used with initial parse
-  vector<double> init_weights; // weights used with initial parse
+  shared_ptr<vector<weight_t> > init_weights; // weights used with initial parse
   vector<shared_ptr<FeatureFunction> > pffs;
 #ifdef FSA_RESCORING
   CFGOptions cfg_options;
@@ -557,13 +550,18 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
     exit(1);
   }
 
-  // load initial feature weights (and possibly freeze feature set)
-  if (conf.count("weights")) {
-    w_init_weights.InitFromFile(str("weights",conf));
-    w_init_weights.InitVector(&init_weights);
-    init_weights.resize(FD::NumFeats());
+  // load perfect hash function for features
+  if (conf.count("cmph_perfect_feature_hash")) {
+    cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as<string>() << " ...\n";
+    FD::EnableHash(conf["cmph_perfect_feature_hash"].as<string>());
+    cerr << "  " << FD::NumFeats() << " features in map\n";
   }
 
+  // load initial feature weights (and possibly freeze feature set)
+  init_weights.reset(new vector<weight_t>);
+  if (conf.count("weights"))
+    Weights::InitFromFile(str("weights",conf), init_weights.get());
+
   // cube pruning pop-limit: we may want to configure this on a per-pass basis
   pop_limit = conf["cubepruning_pop_limit"].as<int>();
 
@@ -582,9 +580,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
       RescoringPass& rp = rescoring_passes.back();
       // only configure new weights if pass > 0, otherwise we reuse the initial chart weights
       if (nth_pass_condition && conf.count(ws)) {
-        rp.w.reset(new Weights);
-        rp.w->InitFromFile(str(ws.c_str(), conf));
-        rp.w->InitVector(&rp.weight_vector);
+        rp.weight_vector.reset(new vector<weight_t>());
+        Weights::InitFromFile(str(ws.c_str(), conf), rp.weight_vector.get());
       }
       bool has_stateful = false;
       if (conf.count(ff)) {
@@ -624,11 +621,15 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   }
 
   // set up weight vectors since later phases may reuse weights from earlier phases
-  const vector<double>* prev = &init_weights;
+  shared_ptr<vector<weight_t> > prev_weights = init_weights;
   for (int pass = 0; pass < rescoring_passes.size(); ++pass) {
     RescoringPass& rp = rescoring_passes[pass];
-    if (!rp.w) { rp.weight_vector = *prev; } else { prev = &rp.weight_vector; }
-    rp.models.reset(new ModelSet(rp.weight_vector, rp.ffs));
+    if (!rp.weight_vector) {
+      rp.weight_vector = prev_weights;
+    } else {
+      prev_weights = rp.weight_vector;
+    }
+    rp.models.reset(new ModelSet(*rp.weight_vector, rp.ffs));
     string ps = "Pass1 "; ps[4] += pass;
     if (!SILENT) show_models(conf,*rp.models,ps.c_str());
   }
@@ -650,12 +651,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
     FD::Freeze(); // this means we can't see the feature names of not-weighted features
   }
 
-  if (conf.count("cmph_perfect_feature_hash")) {
-    cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as<string>() << " ...\n";
-    FD::EnableHash(conf["cmph_perfect_feature_hash"].as<string>());
-    cerr << "  " << FD::NumFeats() << " features in map\n";
-  }
-
   // set up translation back end
   if (formalism == "scfg")
     translator.reset(new SCFGTranslator(conf));
@@ -685,7 +680,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   }
   if (!fsa_ffs.empty()) {
     cerr<<"FSA: ";
-    show_all_features(fsa_ffs,init_weights,cerr,cerr,true,true);
+    show_all_features(fsa_ffs,*init_weights,cerr,cerr,true,true);
   }
 #endif
 
@@ -733,7 +728,8 @@ bool Decoder::Decode(const string& input, DecoderObserver* o) {
   if (del) delete o;
   return res;
 }
-void Decoder::SetWeights(const vector<double>& weights) { pimpl_->SetWeights(weights); }
+vector<weight_t>& Decoder::CurrentWeightVector() { return pimpl_->CurrentWeightVector(); }
+const vector<weight_t>& Decoder::CurrentWeightVector() const { return pimpl_->CurrentWeightVector(); }
 void Decoder::SetSupplementalGrammar(const std::string& grammar_string) {
   assert(pimpl_->translator->GetDecoderType() == "SCFG");
   static_cast<SCFGTranslator&>(*pimpl_->translator).SetSupplementalGrammar(grammar_string);
@@ -774,7 +770,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   translator->ProcessMarkupHints(smeta.sgml_);
   Timer t("Translation");
   const bool translation_successful =
-    translator->Translate(to_translate, &smeta, init_weights, &forest);
+    translator->Translate(to_translate, &smeta, *init_weights, &forest);
   translator->SentenceComplete();
 
   if (!translation_successful) {
@@ -812,7 +808,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
 
   for (int pass = 0; pass < rescoring_passes.size(); ++pass) {
     const RescoringPass& rp = rescoring_passes[pass];
-    const vector<double>& cur_weights = rp.weight_vector;
+    const vector<weight_t>& cur_weights = *rp.weight_vector;
     if (!SILENT) cerr << endl << "  RESCORING PASS #" << (pass+1) << " " << rp << endl;
 #ifdef FSA_RESCORING
     cfg_options.maybe_output_source(forest);
@@ -933,7 +929,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
 #endif
   }
 
-  const vector<double>& last_weights = (rescoring_passes.empty() ? init_weights : rescoring_passes.back().weight_vector);
+  const vector<double>& last_weights = (rescoring_passes.empty() ? *init_weights : *rescoring_passes.back().weight_vector);
 
   // Oracle Rescoring
   if(get_oracle_forest) {
diff --git a/decoder/decoder.h b/decoder/decoder.h
index 5491369f..9d009ffa 100644
--- a/decoder/decoder.h
+++ b/decoder/decoder.h
@@ -7,6 +7,8 @@
 #include <boost/shared_ptr.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "weights.h"  // weight_t
+
 #undef CP_TIME
 //#define CP_TIME
 #ifdef CP_TIME
@@ -39,7 +41,12 @@ struct Decoder {
   Decoder(int argc, char** argv);
   Decoder(std::istream* config_file);
   bool Decode(const std::string& input, DecoderObserver* observer = NULL);
-  void SetWeights(const std::vector<double>& weights);
+
+  // access this to either *read* or *write* to the decoder's last
+  // weight vector (i.e., the weights of the finest pass)
+  std::vector<weight_t>& CurrentWeightVector();
+  const std::vector<weight_t>& CurrentWeightVector() const;
+
   void SetId(int id);
   ~Decoder();
   const boost::program_options::variables_map& GetConf() const { return conf; }
diff --git a/mira/kbest_mira.cc b/mira/kbest_mira.cc
index 6918a9a1..459a5e6f 100644
--- a/mira/kbest_mira.cc
+++ b/mira/kbest_mira.cc
@@ -32,21 +32,6 @@ namespace po = boost::program_options;
 bool invert_score;
 boost::shared_ptr<MT19937> rng;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
 void RandomPermutation(int len, vector<int>* p_ids) {
   vector<int>& ids = *p_ids;
   ids.resize(len);
@@ -58,21 +43,6 @@ void RandomPermutation(int len, vector<int>* p_ids) {
   }  
 }
 
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  --mid;
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
@@ -209,14 +179,16 @@ int main(int argc, char** argv) {
     cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
     return 1;
   }
-  // load initial weights
-  Weights weights;
-  weights.InitFromFile(conf["input_weights"].as<string>());
-  SparseVector<double> lambdas;
-  weights.InitSparseVector(&lambdas);
 
   ReadFile ini_rf(conf["decoder_config"].as<string>());
   Decoder decoder(ini_rf.stream());
+
+  // load initial weights
+  vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+  SparseVector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights);
+  Weights::InitSparseVector(dense_weights, &lambdas);
+
   const double max_step_size = conf["max_step_size"].as<double>();
   const double mt_metric_scale = conf["mt_metric_scale"].as<double>();
 
@@ -230,7 +202,6 @@ int main(int argc, char** argv) {
   double tot_loss = 0;
   int dots = 0;
   int cur_pass = 0;
-  vector<double> dense_weights;
   SparseVector<double> tot;
   tot += lambdas;          // initial weights
   normalizer++;            // count for initial weights
@@ -240,27 +211,27 @@ int main(int argc, char** argv) {
   vector<int> order;
   RandomPermutation(corpus.size(), &order);
   while (lcount <= max_iteration) {
-    dense_weights.clear();
-    weights.InitFromVector(lambdas);
-    weights.InitVector(&dense_weights);
-    decoder.SetWeights(dense_weights);
+    lambdas.init_vector(&dense_weights);
     if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; }
     if (corpus.size() == cur_sent) {
       cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n";
-      ShowLargestFeatures(dense_weights);
+      Weights::ShowLargestFeatures(dense_weights);
       cur_sent = 0;
       tot_loss = 0;
       dots = 0;
       ostringstream os;
       os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz";
-      weights.WriteToFile(os.str(), true, &msg);
+      // checkpoint the current (non-averaged) weights for this pass
+      Weights::WriteToFile(os.str(), dense_weights, true, &msg);
       SparseVector<double> x = tot;
       x /= normalizer;
       ostringstream sa;
       sa << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz";
-      Weights ww;
-      ww.InitFromVector(x);
-      ww.WriteToFile(sa.str(), true, &msga);
+      // the average gets its own buffer: dense_weights is a reference to the
+      // decoder's live weight vector and must keep holding the current lambdas
+      vector<weight_t> avg_weights;
+      x.init_vector(&avg_weights);
+      Weights::WriteToFile(sa.str(), avg_weights, true, &msga);
       ++cur_pass;
       RandomPermutation(corpus.size(), &order);
     }
@@ -294,11 +265,11 @@ int main(int argc, char** argv) {
     ++cur_sent;
   }
   cerr << endl;
-  weights.WriteToFile("weights.mira-final.gz", true, &msg);
+  Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg);
   tot /= normalizer;
-  weights.InitFromVector(tot);
+  tot.init_vector(dense_weights);
   msg = "# MIRA tuned weights (averaged vector)";
-  weights.WriteToFile("weights.mira-final-avg.gz", true, &msg);
+  Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg);
   cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n";
   return 0;
 }
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 4324e8de..bc59285b 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -301,12 +301,8 @@ int main(int argc, char** argv) {
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
   string weightsf = conf["weights"].as<string>();
-  vector<double> weights;
-  {
-    Weights w;
-    w.InitFromFile(weightsf);
-    w.InitVector(&weights);
-  }
+  vector<weight_t> weights;
+  Weights::InitFromFile(weightsf, &weights);
   string kbest_repo = conf["kbest_repository"].as<string>();
   MkDirP(kbest_repo);
   while(in) {
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 9b422f33..9caaa1d1 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -194,7 +194,7 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
   string line;
   vector<pair<bool, SparseVector<double> > > training, testing;
-  SparseVector<double> old_weights;
+  SparseVector<weight_t> old_weights;
   const bool tune_regularizer = conf.count("tune_regularizer");
   if (tune_regularizer && !conf.count("testset")) {
     cerr << "--tune_regularizer requires --testset to be set\n";
@@ -210,9 +210,9 @@ int main(int argc, char** argv) {
   const double psi = conf["interpolation"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
   if (conf.count("weights")) {
-    Weights w;
-    w.InitFromFile(conf["weights"].as<string>());
-    w.InitSparseVector(&old_weights);
+    vector<weight_t> dt;
+    Weights::InitFromFile(conf["weights"].as<string>(), &dt);
+    Weights::InitSparseVector(dt, &old_weights);
   }
   ReadCorpus(&cin, &training);
   if (conf.count("testset")) {
@@ -220,8 +220,8 @@ int main(int argc, char** argv) {
     ReadCorpus(rf.stream(), &testing);
   }
   cerr << "Number of features: " << FD::NumFeats() << endl;
-  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
-  for (SparseVector<double>::const_iterator it = old_weights.begin();
+  vector<weight_t> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<weight_t>::const_iterator it = old_weights.begin();
        it != old_weights.end(); ++it)
     x[it->first] = it->second;
   double tppl = 0.0;
@@ -257,7 +257,6 @@ int main(int argc, char** argv) {
     sigsq = sp[best_i].first;
     tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
   }
-  Weights w;
   if (conf.count("weights")) {
     for (int i = 1; i < x.size(); ++i)
       x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
@@ -271,7 +270,6 @@ int main(int argc, char** argv) {
       cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
     }
   }
-  w.InitFromVector(x);
-  w.WriteToFile("-");
+  Weights::WriteToFile("-", x);
   return 0;
 }
diff --git a/training/Makefile.am b/training/Makefile.am
index e075e417..6e2c06f5 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -12,9 +12,7 @@ bin_PROGRAMS = \
   cllh_filter_grammar \
   mpi_online_optimize \
   mpi_batch_optimize \
-  mpi_em_optimize \
   compute_cllh \
-  feature_expectations \
   augment_grammar
 
 noinst_PROGRAMS = \
@@ -29,12 +27,6 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval
 mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
 mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-feature_expectations_SOURCES = feature_expectations.cc
-feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc
-mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
 compute_cllh_SOURCES = compute_cllh.cc
 compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc
index df8d4ee8..e89a92d5 100644
--- a/training/augment_grammar.cc
+++ b/training/augment_grammar.cc
@@ -134,9 +134,7 @@ int main(int argc, char** argv) {
   } else { ngram = NULL; }
   extra_feature = conf.count("extra_lex_feature") > 0;
   if (conf.count("collapse_weights")) {
-    Weights w;
-    w.InitFromFile(conf["collapse_weights"].as<string>());
-    w.InitVector(&col_weights);
+    Weights::InitFromFile(conf["collapse_weights"].as<string>(), &col_weights);
   }
   clear_features = conf.count("clear_features_after_collapse") > 0;
   gather_rules = false;
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
index 4fb742fb..dc480f6c 100644
--- a/training/collapse_weights.cc
+++ b/training/collapse_weights.cc
@@ -59,10 +59,8 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
   const string wfile = conf["weights"].as<string>();
   const string gfile = conf["grammar"].as<string>();
-  Weights wm;
-  wm.InitFromFile(wfile);
-  vector<double> w;
-  wm.InitVector(&w);
+  vector<weight_t> w;
+  Weights::InitFromFile(wfile, &w);
   MarginalMap e_tots;
   MarginalMap f_tots;
   prob_t tot;
diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc
index 332f6d0c..b496d196 100644
--- a/training/compute_cllh.cc
+++ b/training/compute_cllh.cc
@@ -148,15 +148,6 @@ int main(int argc, char** argv) {
   if (!InitCommandLine(argc, argv, &conf))
-    return false;
+    return 1;  // `false` converts to exit status 0, which would report success
 
-  // load initial weights
-  Weights weights;
-  if (conf.count("weights"))
-    weights.InitFromFile(conf["weights"].as<string>());
-
-  // freeze feature set
-  //const bool freeze_feature_set = conf.count("freeze_feature_set");
-  //if (freeze_feature_set) FD::Freeze();
-
   // load cdec.ini and set up decoder
   ReadFile ini_rf(conf["decoder_config"].as<string>());
   Decoder decoder(ini_rf.stream());
@@ -165,17 +156,22 @@ int main(int argc, char** argv) {
     abort();
   }
 
+  // load weights
+  vector<weight_t>& weights = decoder.CurrentWeightVector();
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+
+  // freeze feature set
+  //const bool freeze_feature_set = conf.count("freeze_feature_set");
+  //if (freeze_feature_set) FD::Freeze();
+
   vector<string> corpus; vector<int> ids;
   ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
   assert(corpus.size() > 0);
   assert(corpus.size() == ids.size());
 
-  vector<double> wv;
-  weights.InitVector(&wv);
-  decoder.SetWeights(wv);
   TrainingObserver observer;
   double objective = 0;
-  bool converged = false;
 
   observer.Reset();
   if (rank == 0)
@@ -197,3 +193,4 @@ int main(int argc, char** argv) {
 
   return 0;
 }
+
diff --git a/training/grammar_convert.cc b/training/grammar_convert.cc
index 8d292f8a..bf8abb26 100644
--- a/training/grammar_convert.cc
+++ b/training/grammar_convert.cc
@@ -251,12 +251,10 @@ int main(int argc, char **argv) {
   const bool is_split_input = (conf["format"].as<string>() == "split");
   const bool is_json_input = is_split_input || (conf["format"].as<string>() == "json");
   const bool collapse_weights = conf.count("collapse_weights");
-  Weights wts;
   vector<double> w;
-  if (conf.count("weights")) {
-    wts.InitFromFile(conf["weights"].as<string>());
-    wts.InitVector(&w);
-  }
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &w);
+
   if (collapse_weights && !w.size()) {
     cerr << "--collapse_weights requires a weights file to be specified!\n";
     exit(1);
diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
index 39a8af7d..cc5953f6 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/mpi_batch_optimize.cc
@@ -31,42 +31,12 @@ using namespace std;
 using boost::shared_ptr;
 namespace po = boost::program_options;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("input_weights,w",po::value<string>(),"Input feature weights file")
         ("training_data,t",po::value<string>(),"Training data")
         ("decoder_config,d",po::value<string>(),"Decoder configuration file")
-        ("sharded_input,s",po::value<string>(), "Corpus and grammar files are 'sharded' so each processor loads its own input and grammar file. Argument is the directory containing the shards.")
         ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
         ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
 	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
@@ -88,14 +58,10 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
   po::notify(*conf);
 
-  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data") | conf->count("sharded_input")) || !conf->count("decoder_config")) {
+  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
     cerr << dcmdline_options << endl;
     return false;
   }
-  if (conf->count("training_data") && conf->count("sharded_input")) {
-    cerr << "Cannot specify both --training_data and --sharded_input\n";
-    return false;
-  }
   return true;
 }
 
@@ -236,42 +202,9 @@ int main(int argc, char** argv) {
   po::variables_map conf;
   if (!InitCommandLine(argc, argv, &conf)) return 1;
 
-  string shard_dir;
-  if (conf.count("sharded_input")) {
-    shard_dir = conf["sharded_input"].as<string>();
-    if (!DirectoryExists(shard_dir)) {
-      if (rank == 0) cerr << "Can't find shard directory: " << shard_dir << endl;
-      return 1;
-    }
-    if (rank == 0)
-      cerr << "Shard directory: " << shard_dir << endl;
-  }
-
-  // load initial weights
-  Weights weights;
-  if (rank == 0) { cerr << "Loading weights...\n"; }
-  weights.InitFromFile(conf["input_weights"].as<string>());
-  if (rank == 0) { cerr << "Done loading weights.\n"; }
-
-  // freeze feature set (should be optional?)
-  const bool freeze_feature_set = true;
-  if (freeze_feature_set) FD::Freeze();
-
   // load cdec.ini and set up decoder
   vector<string> cdec_ini;
   ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
-  if (shard_dir.size()) {
-    if (rank == 0) {
-      for (int i = 0; i < cdec_ini.size(); ++i) {
-        if (cdec_ini[i].find("grammar=") == 0) {
-          cerr << "!!! using sharded input and " << conf["decoder_config"].as<string>() << " contains a grammar specification:\n" << cdec_ini[i] << "\n  VERIFY THAT THIS IS CORRECT!\n";
-        }
-      }
-    }
-    ostringstream g;
-    g << "grammar=" << shard_dir << "/grammar." << rank << "_of_" << size << ".gz";
-    cdec_ini.push_back(g.str());
-  }
   istringstream ini;
   StoreConfig(cdec_ini, &ini);
   if (rank == 0) cerr << "Loading grammar...\n";
@@ -282,22 +215,28 @@ int main(int argc, char** argv) {
   }
   if (rank == 0) cerr << "Done loading grammar!\n";
 
+  // load initial weights
+  if (rank == 0) { cerr << "Loading weights...\n"; }
+  vector<weight_t>& lambdas = decoder->CurrentWeightVector();
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
+  if (rank == 0) { cerr << "Done loading weights.\n"; }
+
+  // freeze feature set (should be optional?)
+  const bool freeze_feature_set = true;
+  if (freeze_feature_set) FD::Freeze();
+
   const int num_feats = FD::NumFeats();
   if (rank == 0) cerr << "Number of features: " << num_feats << endl;
+  lambdas.resize(num_feats);
+
   const bool gaussian_prior = conf.count("gaussian_prior");
-  vector<double> means(num_feats, 0);
+  vector<weight_t> means(num_feats, 0);
   if (conf.count("means")) {
     if (!gaussian_prior) {
       cerr << "Don't use --means without --gaussian_prior!\n";
       exit(1);
     }
-    Weights wm; 
-    wm.InitFromFile(conf["means"].as<string>());
-    if (num_feats != FD::NumFeats()) {
-      cerr << "[ERROR] Means file had unexpected features!\n";
-      exit(1);
-    }
-    wm.InitVector(&means);
+    Weights::InitFromFile(conf["means"].as<string>(), &means);
   }
   shared_ptr<BatchOptimizer> o;
   if (rank == 0) {
@@ -309,26 +248,13 @@ int main(int argc, char** argv) {
     cerr << "Optimizer: " << o->Name() << endl;
   }
   double objective = 0;
-  vector<double> lambdas(num_feats, 0.0);
-  weights.InitVector(&lambdas);
-  if (lambdas.size() != num_feats) {
-    cerr << "Initial weights file did not have all features specified!\n  feats="
-         << num_feats << "\n  weights file=" << lambdas.size() << endl;
-    lambdas.resize(num_feats, 0.0);
-  }
   vector<double> gradient(num_feats, 0.0);
-  vector<double> rcv_grad(num_feats, 0.0);
+  vector<double> rcv_grad;
+  rcv_grad.clear();
   bool converged = false;
 
   vector<string> corpus;
-  if (shard_dir.size()) {
-    ostringstream os; os << shard_dir << "/corpus." << rank << "_of_" << size;
-    ReadTrainingCorpus(os.str(), 0, 1, &corpus);
-    cerr << os.str() << " has " << corpus.size() << " training examples. " << endl;
-    if (corpus.size() > 500) { corpus.resize(500); cerr << "  TRUNCATING\n"; }
-  } else {
-    ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
-  }
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
   assert(corpus.size() > 0);
 
   TrainingObserver observer;
@@ -341,19 +267,20 @@ int main(int argc, char** argv) {
     if (rank == 0) {
       cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
     }
-    decoder->SetWeights(lambdas);
     for (int i = 0; i < corpus.size(); ++i)
       decoder->Decode(corpus[i], &observer);
     cerr << "  process " << rank << '/' << size << " done\n";
     fill(gradient.begin(), gradient.end(), 0);
-    fill(rcv_grad.begin(), rcv_grad.end(), 0);
     observer.SetLocalGradientAndObjective(&gradient, &objective);
 
     double to = 0;
 #ifdef HAVE_MPI
+    rcv_grad.resize(num_feats, 0.0);
     mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);
-    mpi::reduce(world, objective, to, plus<double>(), 0);
     swap(gradient, rcv_grad);
+    rcv_grad.clear();
+
+    mpi::reduce(world, objective, to, plus<double>(), 0);
     objective = to;
 #endif
 
@@ -378,7 +305,7 @@ int main(int argc, char** argv) {
       for (int i = 0; i < gradient.size(); ++i)
         gnorm += gradient[i] * gradient[i];
       cerr << "  GNORM=" << sqrt(gnorm) << endl;
-      vector<double> old = lambdas;
+      vector<weight_t> old = lambdas;
       int c = 0;
       while (old == lambdas) {
         ++c;
@@ -387,9 +314,8 @@ int main(int argc, char** argv) {
         assert(c < 5);
       }
       old.clear();
-      SanityCheck(lambdas);
-      ShowLargestFeatures(lambdas);
-      weights.InitFromVector(lambdas);
+      Weights::SanityCheck(lambdas);
+      Weights::ShowLargestFeatures(lambdas);
 
       converged = o->HasConverged();
       if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
@@ -399,7 +325,7 @@ int main(int argc, char** argv) {
       ostringstream vv;
       vv << "Objective = " << objective << "  (eval count=" << o->EvaluationCount() << ")";
       const string svv = vv.str();
-      weights.WriteToFile(fname, true, &svv);
+      Weights::WriteToFile(fname, lambdas, true, &svv);
     }  // rank == 0
     int cint = converged;
 #ifdef HAVE_MPI
@@ -411,3 +337,4 @@ int main(int argc, char** argv) {
   }
   return 0;
 }
+
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 32033c19..2ef4a2e7 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -31,35 +31,6 @@ namespace mpi = boost::mpi;
 using namespace std;
 namespace po = boost::program_options;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
@@ -250,10 +221,25 @@ int main(int argc, char** argv) {
   if (!InitCommandLine(argc, argv, &conf))
     return 1;
 
+  vector<pair<string, int> > agenda;
+  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
+    return 1;
+  if (rank == 0)
+    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
+
+  assert(agenda.size() > 0);
+
+  if (1) {  // hack to load the feature hash functions -- TODO this should not be in cdec.ini
+    const string& cur_config = agenda[0].first;
+    // this decoder is never used to decode; it is destroyed again at scope exit
+    ReadFile ini_rf(cur_config);
+    Decoder decoder(ini_rf.stream());
+  }
+
   // load initial weights
-  Weights weights;
+  vector<weight_t> init_weights;
   if (conf.count("input_weights"))
-    weights.InitFromFile(conf["input_weights"].as<string>());
+    Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
 
   vector<int> frozen_fids;
   if (conf.count("frozen_features")) {
@@ -310,19 +296,12 @@ int main(int argc, char** argv) {
     rng.reset(new MT19937);
 
   SparseVector<double> x;
-  weights.InitSparseVector(&x);
+  Weights::InitSparseVector(init_weights, &x);
   TrainingObserver observer;
 
   int write_weights_every_ith = 100; // TODO configure
   int titer = -1;
 
-  vector<pair<string, int> > agenda;
-  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
-    return 1;
-  if (rank == 0)
-    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
-
-  vector<double> lambdas;
   for (int ai = 0; ai < agenda.size(); ++ai) {
     const string& cur_config = agenda[ai].first;
     const unsigned max_iteration = agenda[ai].second;
@@ -331,6 +310,8 @@ int main(int argc, char** argv) {
     // load cdec.ini and set up decoder
     ReadFile ini_rf(cur_config);
     Decoder decoder(ini_rf.stream());
+    vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+    if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); }
 
     if (rank == 0)
       o->ResetEpoch(); // resets the learning rate-- TODO is this good?
@@ -341,15 +322,13 @@ int main(int argc, char** argv) {
 #ifdef HAVE_MPI
       mpi::timer timer;
 #endif
-      weights.InitFromVector(x);
-      weights.InitVector(&lambdas);
+      x.init_vector(&lambdas);
       ++iter; ++titer;
       observer.Reset();
-      decoder.SetWeights(lambdas);
       if (rank == 0) {
         converged = (iter == max_iteration);
-        SanityCheck(lambdas);
-        ShowLargestFeatures(lambdas);
+        Weights::SanityCheck(lambdas);
+        Weights::ShowLargestFeatures(lambdas);
         string fname = "weights.cur.gz";
         if (iter % write_weights_every_ith == 0) {
           ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
@@ -360,7 +339,7 @@ int main(int argc, char** argv) {
         vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
         const string svv = vv.str();
         cerr << svv << endl;
-        weights.WriteToFile(fname, true, &svv);
+        Weights::WriteToFile(fname, lambdas, true, &svv);
       }
 
       for (int i = 0; i < size_per_proc; ++i) {
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
index b931991d..15e28fa1 100644
--- a/training/mr_optimize_reduce.cc
+++ b/training/mr_optimize_reduce.cc
@@ -88,25 +88,19 @@ int main(int argc, char** argv) {
 
   const bool use_b64 = conf["input_format"].as<string>() == "b64";
 
-  Weights weights;
-  weights.InitFromFile(conf["input_weights"].as<string>());
+  vector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
   const string s_obj = "**OBJ**";
   int num_feats = FD::NumFeats();
   cerr << "Number of features: " << num_feats << endl;
   const bool gaussian_prior = conf.count("gaussian_prior");
-  vector<double> means(num_feats, 0);
+  vector<weight_t> means(num_feats, 0);
   if (conf.count("means")) {
     if (!gaussian_prior) {
       cerr << "Don't use --means without --gaussian_prior!\n";
       exit(1);
     }
-    Weights wm; 
-    wm.InitFromFile(conf["means"].as<string>());
-    if (num_feats != FD::NumFeats()) {
-      cerr << "[ERROR] Means file had unexpected features!\n";
-      exit(1);
-    }
-    wm.InitVector(&means);
+    Weights::InitFromFile(conf["means"].as<string>(), &means);
   }
   shared_ptr<BatchOptimizer> o;
   const string omethod = conf["optimization_method"].as<string>();
@@ -124,8 +118,6 @@ int main(int argc, char** argv) {
       cerr << "No state file found, assuming ITERATION 1\n";
   }
 
-  vector<double> lambdas(num_feats, 0);
-  weights.InitVector(&lambdas);
   double objective = 0;
   vector<double> gradient(num_feats, 0);
   // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
@@ -223,8 +215,7 @@ int main(int argc, char** argv) {
   old.clear();
   SanityCheck(lambdas);
   ShowLargestFeatures(lambdas);
-  weights.InitFromVector(lambdas);
-  weights.WriteToFile(conf["output_weights"].as<string>(), false);
+  Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false);
 
   const bool conv = o->HasConverged();
   if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
diff --git a/utils/fdict.h b/utils/fdict.h
index 771e8b91..f0871b9a 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -28,6 +28,8 @@ struct FD {
   }
   static void EnableHash(const std::string& cmph_file) {
 #ifdef HAVE_CMPH
+    assert(dict_.max() == 0);  // dictionary must not have
+                               // been added to
     hash_ = new PerfectHashFunction(cmph_file);
 #endif
   }
diff --git a/utils/phmt.cc b/utils/phmt.cc
index 1f59afaf..48d9f093 100644
--- a/utils/phmt.cc
+++ b/utils/phmt.cc
@@ -19,22 +19,18 @@ int main(int argc, char** argv) {
   cerr << "LexFE = " << FD::Convert("LexFE") << endl;
   cerr << "LexEF = " << FD::Convert("LexEF") << endl;
   {
-    Weights w;
     vector<weight_t> v(FD::NumFeats());
     v[FD::Convert("LexFE")] = 1.0;
     v[FD::Convert("LexEF")] = 0.5;
-    w.InitFromVector(v);
     cerr << "Writing...\n";
-    w.WriteToFile("weights.bin");
+    Weights::WriteToFile("weights.bin", v);
     cerr << "Done.\n";
   }
   {
-    Weights w;
     vector<weight_t> v(FD::NumFeats());
     cerr << "Reading...\n";
-    w.InitFromFile("weights.bin");
+    Weights::InitFromFile("weights.bin", &v);
     cerr << "Done.\n";
-    w.InitVector(&v);
     assert(v[FD::Convert("LexFE")] == 1.0);
     assert(v[FD::Convert("LexEF")] == 0.5);
   }
diff --git a/utils/weights.cc b/utils/weights.cc
index 0916b72a..c49000be 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -8,7 +8,10 @@
 
 using namespace std;
 
-void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+void Weights::InitFromFile(const string& filename,
+                           vector<weight_t>* pweights,
+                           vector<string>* feature_list) {
+  vector<weight_t>& weights = *pweights;
   if (!SILENT) cerr << "Reading weights from " << filename << endl;
   ReadFile in_file(filename);
   istream& in = *in_file.stream();
@@ -47,16 +50,16 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
       int end = 0;
       while(end < buf.size() && buf[end] != ' ') ++end;
       const int fid = FD::Convert(buf.substr(start, end - start));
+      if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); }
       while(end < buf.size() && buf[end] == ' ') ++end;
       val = strtod(&buf.c_str()[end], NULL);
       if (isnan(val)) {
         cerr << FD::Convert(fid) << " has weight NaN!\n";
         abort();
       }
-      if (wv_.size() <= fid)
-        wv_.resize(fid + 1);
-      wv_[fid] = val;
-      if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
+      if (weights.size() <= fid)
+        weights.resize(fid + 1);
+      weights[fid] = val;
       ++weight_count;
       if (!SILENT) {
         if (weight_count %   50000 == 0) { cerr << '.' << flush; fl = true; }
@@ -76,8 +79,8 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
       cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl;
       abort();
     }
-    wv_.resize(num_keys[0]);
-    in.get(reinterpret_cast<char*>(&wv_[0]), num_keys[0] * sizeof(weight_t));
+    weights.resize(num_keys[0]);
+    in.read(reinterpret_cast<char*>(&weights[0]), num_keys[0] * sizeof(weight_t));  // read(), not get(): get() stops at a '\n' byte and NUL-terminates
     if (!in.good()) {
       cerr << "Error loading weights!\n";
       abort();
@@ -85,7 +88,10 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
   }
 }
 
-void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const {
+void Weights::WriteToFile(const string& fname,
+                          const vector<weight_t>& weights,
+                          bool hide_zero_value_features,
+                          const string* extra) {
   WriteFile out(fname);
   ostream& o = *out.stream();
   assert(o);
@@ -96,41 +102,54 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature
     o.precision(17);
     const int num_feats = FD::NumFeats();
     for (int i = 1; i < num_feats; ++i) {
-      const weight_t val = (i < wv_.size() ? wv_[i] : 0.0);
+      const weight_t val = (i < weights.size() ? weights[i] : 0.0);
       if (hide_zero_value_features && val == 0.0) continue;
       o << FD::Convert(i) << ' ' << val << endl;
     }
   } else {
     o.write("_PHWf", 5);
     const size_t keys = FD::NumFeats();
-    assert(keys <= wv_.size());
+    assert(keys <= weights.size());
     o.write(reinterpret_cast<const char*>(&keys), sizeof(keys));
-    o.write(reinterpret_cast<const char*>(&wv_[0]), keys * sizeof(weight_t));
+    o.write(reinterpret_cast<const char*>(&weights[0]), keys * sizeof(weight_t));
   }
 }
 
-void Weights::InitVector(std::vector<weight_t>* w) const {
-  *w = wv_;
+void Weights::InitSparseVector(const vector<weight_t>& dv,
+                               SparseVector<weight_t>* sv) {
+  sv->clear();
+  for (unsigned i = 1; i < dv.size(); ++i) {
+    if (dv[i]) sv->set_value(i, dv[i]);
+  }
 }
 
-void Weights::InitSparseVector(SparseVector<weight_t>* w) const {
-  for (int i = 1; i < wv_.size(); ++i) {
-    const weight_t& weight = wv_[i];
-    if (weight) w->set_value(i, weight);
+void Weights::SanityCheck(const vector<weight_t>& w) {
+  for (int i = 0; i < w.size(); ++i) {
+    assert(!isnan(w[i]));
+    assert(!isinf(w[i]));
   }
 }
 
-void Weights::InitFromVector(const std::vector<weight_t>& w) {
-  wv_ = w;
-  if (wv_.size() > FD::NumFeats())
-    cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
-  wv_.resize(FD::NumFeats(), 0);
-}
+struct FComp {
+  const vector<weight_t>& w_;
+  FComp(const vector<weight_t>& w) : w_(w) {}
+  bool operator()(int a, int b) const {
+    return fabs(w_[a]) > fabs(w_[b]);
+  }
+};
 
-void Weights::InitFromVector(const SparseVector<weight_t>& w) {
-  wv_.clear();
-  wv_.resize(FD::NumFeats(), 0.0);
-  for (int i = 1; i < FD::NumFeats(); ++i)
-    wv_[i] = w.value(i);
+void Weights::ShowLargestFeatures(const vector<weight_t>& w) {
+  vector<int> fnums(w.size());
+  for (int i = 0; i < w.size(); ++i)
+    fnums[i] = i;
+  vector<int>::iterator mid = fnums.begin();
+  mid += (w.size() > 10 ? 10 : w.size());
+  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
+  cerr << "TOP FEATURES:";
+  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
+    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
+  }
+  cerr << endl;
 }
 
+
diff --git a/utils/weights.h b/utils/weights.h
index 7664810b..30f71db0 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -10,15 +10,21 @@ typedef double weight_t;
 
 class Weights {
  public:
-  Weights() {}
-  void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
-  void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const;
-  void InitVector(std::vector<weight_t>* w) const;
-  void InitSparseVector(SparseVector<weight_t>* w) const;
-  void InitFromVector(const std::vector<weight_t>& w);
-  void InitFromVector(const SparseVector<weight_t>& w);
+  static void InitFromFile(const std::string& fname,
+                           std::vector<weight_t>* weights,
+                           std::vector<std::string>* feature_list = NULL);
+  static void WriteToFile(const std::string& fname,
+                          const std::vector<weight_t>& weights,
+                          bool hide_zero_value_features = true,
+                          const std::string* extra = NULL);
+  static void InitSparseVector(const std::vector<weight_t>& dv,
+                               SparseVector<weight_t>* sv);
+  // check for infinities, NaNs, etc
+  static void SanityCheck(const std::vector<weight_t>& w);
+  // write weights with largest magnitude to cerr
+  static void ShowLargestFeatures(const std::vector<weight_t>& w);
  private:
-  std::vector<weight_t> wv_;
+  Weights();
 };
 
 #endif
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index b84c44bc..0c094fd5 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -223,16 +223,16 @@ struct oracle_directions {
     cerr << "Forest repo: " << forest_repository << endl;
     assert(DirectoryExists(forest_repository));
     vector<string> features;
-    weights.InitFromFile(weights_file, &features);
+    vector<weight_t> dorigin;
+    Weights::InitFromFile(weights_file, &dorigin, &features);
     if (optimize_features.size())
       features=optimize_features;
-    weights.InitSparseVector(&origin);
+    Weights::InitSparseVector(dorigin, &origin);
     fids.clear();
     AddFeatureIds(features);
     oracles.resize(dev_set_size);
   }
 
-  Weights weights;
   void AddFeatureIds(vector<string> const& features) {
     int i = fids.size();
     fids.resize(fids.size()+features.size());
-- 
cgit v1.2.3


From 6c8309c58dc4a6015dfb2f478a2cef5f65f92961 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 14 Sep 2011 12:17:04 +0100
Subject: weight_t refactoring

---
 pro-train/mr_pro_map.cc    | 42 +++++++++++++++++++++---------------------
 pro-train/mr_pro_reduce.cc | 34 +++++++++++++++++-----------------
 2 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index bc59285b..0a9b75d7 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -27,7 +27,7 @@ namespace po = boost::program_options;
 struct ApproxVectorHasher {
   static const size_t MASK = 0xFFFFFFFFull;
   union UType {
-    double f;
+    double f;   // leave as double
     size_t i;
   };
   static inline double round(const double x) {
@@ -40,9 +40,9 @@ struct ApproxVectorHasher {
       t.i &= (1ull - MASK);
     return t.f;
   }
-  size_t operator()(const SparseVector<double>& x) const {
+  size_t operator()(const SparseVector<weight_t>& x) const {
     size_t h = 0x573915839;
-    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+    for (SparseVector<weight_t>::const_iterator it = x.begin(); it != x.end(); ++it) {
       UType t;
       t.f = it->second;
       if (t.f) {
@@ -56,9 +56,9 @@ struct ApproxVectorHasher {
 };
 
 struct ApproxVectorEquals {
-  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
-    SparseVector<double>::const_iterator bit = b.begin();
-    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
+  bool operator()(const SparseVector<weight_t>& a, const SparseVector<weight_t>& b) const {
+    SparseVector<weight_t>::const_iterator bit = b.begin();
+    for (SparseVector<weight_t>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
       if (bit == b.end() ||
           ait->first != bit->first ||
           ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
@@ -105,18 +105,18 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
-  HypInfo() : g_(-100.0) {}
-  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
+  HypInfo() : g_(-100.0f) {}
+  HypInfo(const vector<WordID>& h, const SparseVector<weight_t>& feats) : hyp(h), g_(-100.0f), x(feats) {}
 
   // lazy evaluation
   double g(const SentenceScorer& scorer) const {
-    if (g_ == -100.0)
+    if (g_ == -100.0f)
       g_ = scorer.ScoreCandidate(hyp)->ComputeScore();
     return g_;
   }
   vector<WordID> hyp;
-  mutable double g_;
-  SparseVector<double> x;
+  mutable float g_;
+  SparseVector<weight_t> x;
 };
 
 struct HypInfoCompare {
@@ -146,8 +146,8 @@ void WriteKBest(const string& file, const vector<HypInfo>& kbest) {
   }
 }
 
-void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
-  SparseVector<double>& x = *out;
+void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) {
+  SparseVector<weight_t>& x = *out;
   size_t last_start = cur;
   size_t last_comma = string::npos;
   while(cur <= line.size()) {
@@ -211,15 +211,15 @@ struct ThresholdAlpha {
 };
 
 struct TrainingInstance {
-  TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
-  SparseVector<double> x;
+  TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff) : x(feats), y(positive), gdiff(diff) {}
+  SparseVector<weight_t> x;
 #undef DEBUGGING_PRO
 #ifdef DEBUGGING_PRO
   vector<WordID> a;
   vector<WordID> b;
 #endif
   bool y;
-  double gdiff;
+  float gdiff;
 };
 #ifdef DEBUGGING_PRO
 ostream& operator<<(ostream& os, const TrainingInstance& d) {
@@ -235,19 +235,19 @@ struct DiffOrder {
 
 void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
   vector<TrainingInstance> v1, v2;
-  double avg_diff = 0;
+  float avg_diff = 0;
   for (unsigned i = 0; i < gamma; ++i) {
     const size_t a = rng->inclusive(0, J_i.size() - 1)();
     const size_t b = rng->inclusive(0, J_i.size() - 1)();
     if (a == b) continue;
-    double ga = J_i[a].g(scorer);
-    double gb = J_i[b].g(scorer);
+    float ga = J_i[a].g(scorer);
+    float gb = J_i[b].g(scorer);
     bool positive = gb < ga;
     if (invert_score) positive = !positive;
-    const double gdiff = fabs(ga - gb);
+    const float gdiff = fabs(ga - gb);
     if (!gdiff) continue;
     avg_diff += gdiff;
-    SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
+    SparseVector<weight_t> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
     if (xdiff.empty()) {
       cerr << "Empty diff:\n  " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl;
       cerr << "  " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 9caaa1d1..239649c1 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -40,8 +40,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
-  SparseVector<double>& x = *out;
+void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) {
+  SparseVector<weight_t>& x = *out;
   size_t last_start = cur;
   size_t last_comma = string::npos;
   while(cur <= line.size()) {
@@ -52,7 +52,7 @@ void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
       }
       const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
       if (cur < line.size()) line[cur] = 0;
-      const double val = strtod(&line[last_comma + 1], NULL);
+      const weight_t val = strtod(&line[last_comma + 1], NULL);
       x.set_value(fid, val);
 
       last_comma = string::npos;
@@ -65,13 +65,13 @@ void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
   }
 }
 
-void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<double> > >* corpus) {
+void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) {
   istream& in = *pin;
   corpus->clear();
   bool flag = false;
   int lc = 0;
   string line;
-  SparseVector<double> x;
+  SparseVector<weight_t> x;
   while(getline(in, line)) {
     ++lc;
     if (lc % 1000 == 0) { cerr << '.'; flag = true; }
@@ -88,16 +88,16 @@ void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<double> > >* corpus
   if (flag) cerr << endl;
 }
 
-void GradAdd(const SparseVector<double>& v, const double scale, vector<double>* acc) {
-  for (SparseVector<double>::const_iterator it = v.begin();
+void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_t>* acc) {
+  for (SparseVector<weight_t>::const_iterator it = v.begin();
        it != v.end(); ++it) {
     (*acc)[it->first] += it->second * scale;
   }
 }
 
-double TrainingInference(const vector<double>& x,
-                         const vector<pair<bool, SparseVector<double> > >& corpus,
-                         vector<double>* g = NULL) {
+double TrainingInference(const vector<weight_t>& x,
+                         const vector<pair<bool, SparseVector<weight_t> > >& corpus,
+                         vector<weight_t>* g = NULL) {
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
@@ -132,13 +132,13 @@ double TrainingInference(const vector<double>& x,
 }
 
 // return held-out log likelihood
-double LearnParameters(const vector<pair<bool, SparseVector<double> > >& training,
-                       const vector<pair<bool, SparseVector<double> > >& testing,
+double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
+                       const vector<pair<bool, SparseVector<weight_t> > >& testing,
                        const double sigsq,
                        const unsigned memory_buffers,
-                       vector<double>* px) {
-  vector<double>& x = *px;
-  vector<double> vg(FD::NumFeats(), 0.0);
+                       vector<weight_t>* px) {
+  vector<weight_t>& x = *px;
+  vector<weight_t> vg(FD::NumFeats(), 0.0);
   bool converged = false;
   LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
   double tppl = 0.0;
@@ -172,7 +172,7 @@ double LearnParameters(const vector<pair<bool, SparseVector<double> > >& trainin
     cll += reg;
     cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t";
     try {
-      vector<double> old_x = x;
+      vector<weight_t> old_x = x;
       do {
         opt.Optimize(cll, vg, &x);
         converged = opt.HasConverged();
@@ -193,7 +193,7 @@ int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
   string line;
-  vector<pair<bool, SparseVector<double> > > training, testing;
+  vector<pair<bool, SparseVector<weight_t> > > training, testing;
   SparseVector<weight_t> old_weights;
   const bool tune_regularizer = conf.count("tune_regularizer");
   if (tune_regularizer && !conf.count("testset")) {
-- 
cgit v1.2.3


From 9ba06c6f1a7e751da245219da291e329efa2b7e5 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 14 Sep 2011 13:12:01 +0100
Subject: fix pro train bug causing it not to optimize when there is no
 held-out test set

---
 pro-train/mr_pro_reduce.cc | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 239649c1..e71347ba 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -194,7 +194,6 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
   string line;
   vector<pair<bool, SparseVector<weight_t> > > training, testing;
-  SparseVector<weight_t> old_weights;
   const bool tune_regularizer = conf.count("tune_regularizer");
   if (tune_regularizer && !conf.count("testset")) {
     cerr << "--tune_regularizer requires --testset to be set\n";
@@ -202,28 +201,28 @@ int main(int argc, char** argv) {
   }
   const double min_reg = conf["min_reg"].as<double>();
   const double max_reg = conf["max_reg"].as<double>();
-  double sigsq = conf["sigma_squared"].as<double>();
+  double sigsq = conf["sigma_squared"].as<double>(); // will be overridden if parameter is tuned
   assert(sigsq > 0.0);
   assert(min_reg > 0.0);
   assert(max_reg > 0.0);
   assert(max_reg > min_reg);
   const double psi = conf["interpolation"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
-  if (conf.count("weights")) {
-    vector<weight_t> dt;
-    Weights::InitFromFile(conf["weights"].as<string>(), &dt);
-    Weights::InitSparseVector(dt, &old_weights);
-  }
   ReadCorpus(&cin, &training);
   if (conf.count("testset")) {
     ReadFile rf(conf["testset"].as<string>());
     ReadCorpus(rf.stream(), &testing);
   }
   cerr << "Number of features: " << FD::NumFeats() << endl;
-  vector<weight_t> x(FD::NumFeats(), 0.0);  // x[0] is bias
-  for (SparseVector<weight_t>::const_iterator it = old_weights.begin();
-       it != old_weights.end(); ++it)
-    x[it->first] = it->second;
+
+  vector<weight_t> x, prev_x;  // x[0] is bias
+  if (conf.count("weights")) {
+    Weights::InitFromFile(conf["weights"].as<string>(), &x);
+    prev_x = x;
+  }
+  cerr << "         Number of features: " << x.size() << endl;
+  cerr << "Number of training examples: " << training.size() << endl;
+  cerr << "Number of  testing examples: " << testing.size() << endl;
   double tppl = 0.0;
   vector<pair<double,double> > sp;
   vector<double> smoothed;
@@ -255,11 +254,12 @@ int main(int argc, char** argv) {
       }
     }
     sigsq = sp[best_i].first;
-    tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
-  }
+  }  // tune regularizer
+  tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
   if (conf.count("weights")) {
-    for (int i = 1; i < x.size(); ++i)
-      x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
+    for (int i = 1; i < x.size(); ++i) {
+      x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
+    }
   }
   cout.precision(15);
   cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
-- 
cgit v1.2.3


From fc5c72f9c5ce60c5d9a3dcd363eb51ccdd543bc9 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 14 Sep 2011 14:43:03 +0100
Subject: fix for potential segv with no weights

---
 pro-train/mr_pro_reduce.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index e71347ba..6b491918 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -100,7 +100,7 @@ double TrainingInference(const vector<weight_t>& x,
                          vector<weight_t>* g = NULL) {
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
-    const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
+    const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
     double lp_false = dotprod;
     double lp_true = -dotprod;
     if (0 < lp_true) {
-- 
cgit v1.2.3


From 9acb1f98b698f9fd0c09f6b7c122011651dcc435 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 14 Sep 2011 14:47:06 +0100
Subject: fix for more problems caused by hash refactoring

---
 pro-train/mr_pro_reduce.cc | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'pro-train/mr_pro_reduce.cc')

diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 6b491918..aff410a0 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -218,6 +218,10 @@ int main(int argc, char** argv) {
   vector<weight_t> x, prev_x;  // x[0] is bias
   if (conf.count("weights")) {
     Weights::InitFromFile(conf["weights"].as<string>(), &x);
+    x.resize(FD::NumFeats());
+    prev_x = x;
+  } else {
+    x.resize(FD::NumFeats());
     prev_x = x;
   }
   cerr << "         Number of features: " << x.size() << endl;
-- 
cgit v1.2.3