Sort-of-working Hopkins & May optimizer

author: Chris Dyer <cdyer@cs.cmu.edu> 2011-07-11 20:39:45 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-07-11 20:39:45 -0400
commit: bde4a34bab96052570c248f7d9ccc299a9a3f097 (patch)
tree: b74180cb2d36e373eafc1fd6a74968a969287ead /pro-train
parent: 95deb840699f9b6f8fe499b374bd726bce97365c (diff)
5 files changed, 349 insertions, 266 deletions
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index 945ed5c3..fdaf43e2 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc
 mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 35bccea4..55d7f1fa 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -21,7 +21,7 @@ my $bin_dir = $SCRIPT_DIR;
 die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
 my $FAST_SCORE="$bin_dir/../mteval/fast_score";
 die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input";
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
 my $MAPPER = "$bin_dir/mr_pro_map";
 my $REDUCER = "$bin_dir/mr_pro_reduce";
 my $parallelize = "$VEST_DIR/parallelize.pl";
@@ -37,8 +37,7 @@ die "Can't find decoder in $cdec" unless -x $cdec;
 die "Can't find $parallelize" unless -x $parallelize;
 die "Can't find $libcall" unless -e $libcall;
 my $decoder = $cdec;
-my $lines_per_mapper = 400;
-my $rand_directions = 15;
+my $lines_per_mapper = 100;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
@@ -58,7 +57,6 @@ my $metric = "ibm_bleu";
 my $dir;
 my $iniFile;
 my $weights;
-my $initialWeights;
 my $decoderOpt;
 my $noprimary;
 my $maxsim=0;
@@ -67,7 +65,6 @@ my $oracleb=20;
 my $bleu_weight=1;
 my $use_make;  # use make to parallelize line search
 my $dirargs='';
-my $density_prune;
 my $usefork;
 my $pass_suffix = '';
 my $cpbin=1;
@@ -76,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
 	"decoder=s" => \$decoderOpt,
 	"decode-nodes=i" => \$decode_nodes,
-	"density-prune=f" => \$density_prune,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
         "use-fork" => \$usefork,
@@ -91,8 +87,6 @@ if (GetOptions(
 	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
         "cpbin!" => \$cpbin,
-	"rand-directions=i" => \$rand_directions,
-	"random_directions=i" => \$rand_directions,
         "bleu_weight=s" => \$bleu_weight,
         "no-primary!" => \$noprimary,
         "max-similarity=s" => \$maxsim,
@@ -103,18 +97,12 @@ if (GetOptions(
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
-	"weights=s" => \$initialWeights,
 	"workdir=s" => \$dir,
-    "opt-iterations=i" => \$optimization_iters,
 ) == 0 || @ARGV!=1 || $help) {
 	print_help();
 	exit;
 }
 
-if (defined $density_prune) {
-  die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
@@ -146,7 +134,7 @@ if ($metric =~ /^ter$|^aer$/i) {
 my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
 
 unless ($dir){
-	$dir = "vest";
+	$dir = "protrain";
 }
 unless ($dir =~ /^\//){  # convert relative path to absolute path
 	my $basedir = check_output("pwd");
@@ -203,18 +191,19 @@ sub dirsize {
     opendir ISEMPTY,$_[0];
     return scalar(readdir(ISEMPTY))-1;
 }
+my @allweights;
 if ($dryrun){
 	write_config(*STDERR);
 	exit 0;
 } else {
-	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
 	  die "ERROR: working dir $dir already exists\n\n";
 	} else {
 		-e $dir || mkdir $dir;
 		mkdir "$dir/hgs";
         modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
     mkdir "$dir/scripts";
-        my $cmdfile="$dir/rerun-vest.sh";
+        my $cmdfile="$dir/rerun-pro.sh";
         open CMD,'>',$cmdfile;
         print CMD "cd ",&getcwd,"\n";
 #        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
@@ -223,13 +212,8 @@ if ($dryrun){
         close CMD;
         print STDERR $cline;
         chmod(0755,$cmdfile);
-		unless (-e $initialWeights) {
-			print STDERR "Please specify an initial weights file with --initial-weights\n";
-			print_help();
-			exit;
-		}
-		check_call("cp $initialWeights $dir/weights.0");
-		die "Can't find weights.0" unless (-e "$dir/weights.0");
+	check_call("touch $dir/weights.0");
+	die "Can't find weights.0" unless (-e "$dir/weights.0");
 	}
 	write_config(*STDERR);
 }
@@ -255,6 +239,7 @@ my $random_seed = int(time / 1000);
 my $lastWeightsFile;
 my $lastPScore = 0;
 # main optimization loop
+my @mapoutputs = (); # aggregate map outputs over all iters
 while (1){
 	print STDERR "\n\nITERATION $iteration\n==========\n";
 
@@ -276,10 +261,8 @@ while (1){
 	print STDERR unchecked_output("date");
 	my $im1 = $iteration - 1;
 	my $weightsFile="$dir/weights.$im1";
+        push @allweights, "-w $dir/weights.$im1";
 	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
-	if ($density_prune) {
-		$decoder_cmd .= " --density_prune $density_prune";
-	}
 	my $pcmd;
 	if ($run_local) {
 		$pcmd = "cat $srcFile |";
@@ -320,163 +303,111 @@ while (1){
 	# run optimizer
 	print STDERR "RUNNING OPTIMIZER AT ";
 	print STDERR unchecked_output("date");
+	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
 	my $mergeLog="$logdir/prune-merge.log.$iteration";
 
 	my $score = 0;
 	my $icc = 0;
 	my $inweights="$dir/weights.$im1";
-	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
-		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
-		print STDERR unchecked_output("date");
-		$icc++;
-		my $nop=$noprimary?"--no_primary":"";
-		my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
-		my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
-		$cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
-		print STDERR "COMMAND:\n$cmd\n";
-		check_call($cmd);
-		check_call("mkdir -p $dir/splag.$im1");
-		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
-		print STDERR "COMMAND:\n$cmd\n";
-		check_call($cmd);
-		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
-		my @shards = grep { /^mapinput\./ } readdir(DIR);
-		closedir DIR;
-		die "No shards!" unless scalar @shards > 0;
-		my $joblist = "";
-		my $nmappers = 0;
-		my @mapoutputs = ();
-		@cleanupcmds = ();
-		my %o2i = ();
-		my $first_shard = 1;
-		my $mkfile; # only used with makefiles
-		my $mkfilename;
-		if ($use_make) {
-			$mkfilename = "$dir/splag.$im1/domap.mk";
-			open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
-			print $mkfile "all: $dir/splag.$im1/map.done\n\n";
-		}
-		my @mkouts = ();  # only used with makefiles
-		for my $shard (@shards) {
-			my $mapoutput = $shard;
-			my $client_name = $shard;
-			$client_name =~ s/mapinput.//;
-			$client_name = "vest.$client_name";
-			$mapoutput =~ s/mapinput/mapoutput/;
-			push @mapoutputs, "$dir/splag.$im1/$mapoutput";
-			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-			my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
-			if ($run_local) {
-				print STDERR "COMMAND:\n$script\n";
-				check_bash_call($script);
-			} elsif ($use_make) {
-				my $script_file = "$dir/scripts/map.$shard";
-				open F, ">$script_file" or die "Can't write $script_file: $!";
-				print F "#!/bin/bash\n";
-				print F "$script\n";
-				close F;
-				my $output = "$dir/splag.$im1/$mapoutput";
-				push @mkouts, $output;
-				chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
-				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-				print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
-			} else {
-				my $script_file = "$dir/scripts/map.$shard";
-				open F, ">$script_file" or die "Can't write $script_file: $!";
-				print F "$script\n";
-				close F;
-				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-
-				$nmappers++;
-				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
-				my $jobid = check_output("$qcmd");
-				chomp $jobid;
-				$jobid =~ s/^(\d+)(.*?)$/\1/g;
-				$jobid =~ s/^Your job (\d+) .*$/\1/;
-		 	 	push(@cleanupcmds, "qdel $jobid 2> /dev/null");
-				print STDERR " $jobid";
-				if ($joblist == "") { $joblist = $jobid; }
-				else {$joblist = $joblist . "\|" . $jobid; }
-			}
-		}
+	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	check_call("mkdir -p $dir/splag.$im1");
+	$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+	my @shards = grep { /^mapinput\./ } readdir(DIR);
+	closedir DIR;
+	die "No shards!" unless scalar @shards > 0;
+	my $joblist = "";
+	my $nmappers = 0;
+	@cleanupcmds = ();
+	my %o2i = ();
+	my $first_shard = 1;
+	my $mkfile; # only used with makefiles
+	my $mkfilename;
+	if ($use_make) {
+		$mkfilename = "$dir/splag.$im1/domap.mk";
+		open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+		print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+	}
+	my @mkouts = ();  # only used with makefiles
+	for my $shard (@shards) {
+		my $mapoutput = $shard;
+		my $client_name = $shard;
+		$client_name =~ s/mapinput.//;
+		$client_name = "pro.$client_name";
+		$mapoutput =~ s/mapinput/mapoutput/;
+		push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
 		if ($run_local) {
-			print STDERR "\nProcessing line search complete.\n";
+			print STDERR "COMMAND:\n$script\n";
+			check_bash_call($script);
 		} elsif ($use_make) {
-			print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
-			close $mkfile;
-			my $mcmd = "make -j $use_make -f $mkfilename";
-			print STDERR "\nExecuting: $mcmd\n";
-			check_call($mcmd);
+			my $script_file = "$dir/scripts/map.$shard";
+			open F, ">$script_file" or die "Can't write $script_file: $!";
+			print F "#!/bin/bash\n";
+			print F "$script\n";
+			close F;
+			my $output = "$dir/splag.$im1/$mapoutput";
+			push @mkouts, $output;
+			chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+			print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
 		} else {
-			print STDERR "\nLaunched $nmappers mappers.\n";
-      			sleep 8;
-			print STDERR "Waiting for mappers to complete...\n";
-			while ($nmappers > 0) {
-			  sleep 5;
-			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
-			  $nmappers = scalar @livejobs;
-			}
-			print STDERR "All mappers complete.\n";
+			my $script_file = "$dir/scripts/map.$shard";
+			open F, ">$script_file" or die "Can't write $script_file: $!";
+			print F "$script\n";
+			close F;
+			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+			$nmappers++;
+			my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+			my $jobid = check_output("$qcmd");
+			chomp $jobid;
+			$jobid =~ s/^(\d+)(.*?)$/\1/g;
+			$jobid =~ s/^Your job (\d+) .*$/\1/;
+		 	push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+			print STDERR " $jobid";
+			if ($joblist == "") { $joblist = $jobid; }
+			else {$joblist = $joblist . "\|" . $jobid; }
 		}
-		my $tol = 0;
-		my $til = 0;
-		for my $mo (@mapoutputs) {
-		  my $olines = get_lines($mo);
-		  my $ilines = get_lines($o2i{$mo});
-		  $tol += $olines;
-		  $til += $ilines;
-		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
-		}
-		print STDERR "Results for $tol/$til lines\n";
-		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
-		print STDERR unchecked_output("date");
-		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
-		print STDERR "COMMAND:\n$cmd\n";
-		check_bash_call($cmd);
-		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
-		# sort returns failure even when it doesn't fail for some reason
-		my $best=unchecked_output("$cmd"); chomp $best;
-		print STDERR "$best\n";
-		my ($oa, $x, $xscore) = split /\|/, $best;
-		$score = $xscore;
-		print STDERR "PROJECTED SCORE: $score\n";
-		if (abs($x) < $epsilon) {
-			print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
-			last;
-		}
-                my $psd = $score - $last_score;
-                $last_score = $score;
-		if (abs($psd) < $epsilon) {
-			print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
-			last;
-		}
-		my ($origin, $axis) = split /\s+/, $oa;
-
-		my %ori = convert($origin);
-		my %axi = convert($axis);
-
-		my $finalFile="$dir/weights.$im1-$opt_iter";
-		open W, ">$finalFile" or die "Can't write: $finalFile: $!";
-                my $norm = 0;
-		for my $k (sort keys %ori) {
-			my $dd = $ori{$k} + $axi{$k} * $x;
-                        $norm += $dd * $dd;
-		}
-                $norm = sqrt($norm);
-		$norm = 1;
-		for my $k (sort keys %ori) {
-			my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
-			print W "$k $v\n";
+	}
+	if ($run_local) {
+		print STDERR "\nCompleted extraction of training exemplars.\n";
+	} elsif ($use_make) {
+		print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+		close $mkfile;
+		my $mcmd = "make -j $use_make -f $mkfilename";
+		print STDERR "\nExecuting: $mcmd\n";
+		check_call($mcmd);
+	} else {
+		print STDERR "\nLaunched $nmappers mappers.\n";
+      		sleep 8;
+		print STDERR "Waiting for mappers to complete...\n";
+		while ($nmappers > 0) {
+		  sleep 5;
+		  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+		  $nmappers = scalar @livejobs;
 		}
-		check_call("rm $dir/splag.$im1/*");
-		$inweights = $finalFile;
+		print STDERR "All mappers complete.\n";
 	}
-	$lastWeightsFile = "$dir/weights.$iteration";
-	check_call("cp $inweights $lastWeightsFile");
-	if ($icc < 2) {
-		print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
-		last;
+	my $tol = 0;
+	my $til = 0;
+        print STDERR "MO: @mapoutputs\n";
+	for my $mo (@mapoutputs) {
+		#my $olines = get_lines($mo);
+		#my $ilines = get_lines($o2i{$mo});
+		#die "$mo: no training instances generated!" if $olines == 0;
 	}
+	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
+	print STDERR unchecked_output("date");
+	$cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+	$lastWeightsFile = "$dir/weights.$iteration";
 	$lastPScore = $score;
 	$iteration++;
 	print STDERR "\n==========\n";
@@ -488,24 +419,6 @@ print STDOUT "$lastWeightsFile\n";
 
 exit 0;
 
-sub normalize_weights {
-  my ($rfn, $rpts, $feat) = @_;
-  my @feat_names = @$rfn;
-  my @pts = @$rpts;
-  my $z = 1.0;
-  for (my $i=0; $i < scalar @feat_names; $i++) {
-    if ($feat_names[$i] eq $feat) {
-      $z = $pts[$i];
-      last;
-    }
-  }
-  for (my $i=0; $i < scalar @feat_names; $i++) {
-    $pts[$i] /= $z;
-  }
-  print STDERR " NORM WEIGHTS: @pts\n";
-  return @pts;
-}
-
 sub get_lines {
   my $fn = shift @_;
   open FL, "<$fn" or die "Couldn't read $fn: $!";
@@ -563,7 +476,6 @@ sub write_config {
 	print $fh "HEAD NODE:        $host\n";
 	print $fh "PMEM (DECODING):  $pmem\n";
 	print $fh "CLEANUP:          $cleanup\n";
-	print $fh "INITIAL WEIGHTS:  $initialWeights\n";
 }
 
 sub update_weights_file {
@@ -603,6 +515,7 @@ sub enseg {
 	}
 	close SRC;
 	close NEWSRC;
+	die "Empty dev set!" if ($i == 0);
 }
 
 sub print_help {
@@ -634,10 +547,6 @@ Options:
 	--decoder <decoder path>
 		Decoder binary to use.
 
-	--density-prune <N>
-		Limit the density of the hypergraph on each iteration to N times
-		the number of edges on the Viterbi path.
-
 	--help
 		Print this message and exit.
 
@@ -668,18 +577,9 @@ Options:
 		After each iteration, rescale all feature weights such that feature-
 		name has a weight of 1.0.
 
-	--rand-directions <num>
-		MERT will attempt to optimize along all of the principle directions,
-		set this parameter to explore other directions. Defaults to 5.
-
 	--source-file <file>
 		Dev set source file.
 
-	--weights <file>
-		A file specifying initial feature weights.  The format is
-		FeatureName_1 value1
-		FeatureName_2 value2
-
 	--workdir <dir>
 		Directory for intermediate and output files.  If not specified, the
 		name is derived from the ini filename.  Assuming that the ini
diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro-train/mr_pro_generate_mapper_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/pro-train/mr_pro_generate_mapper_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+# Print one "<HG_DIR>/<file> <sentence id>" line for every *.gz file found in HG_DIR.
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(my $dh, $d) or die "Can't read $d: $!";
+my @hgs = sort grep { /\.gz$/ } readdir($dh);  # readdir order is unspecified; sort for a reproducible agenda
+closedir $dh;
+
+for my $hg (@hgs) {
+  my $id = $hg;
+  # strip only the trailing (.json).gz extension; an unanchored match would cut an earlier ".gz" instead
+  $id =~ s/(\.json)?\.gz$//;
+  print "$d/$hg $id\n";
+}
+
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index b046cdea..128d93ce 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -10,6 +10,7 @@
 #include "sampler.h"
 #include "filelib.h"
 #include "stringlib.h"
+#include "weights.h"
 #include "scorer.h"
 #include "inside_outside.h"
 #include "hg_io.h"
@@ -27,10 +28,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
-        ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+        ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
         ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
         ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-        ("weights,w",po::value<string>(), "[REQD] Current weights file")
+        ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
         ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
         ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
         ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -44,6 +45,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     cerr << "Please specify one or more references using -r <REF.TXT>\n";
     flag = true;
   }
+  if (!conf->count("weights")) {
+    cerr << "Please specify one or more weights using -w <WEIGHTS.TXT>\n";
+    flag = true;
+  }
   if (flag || conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
@@ -51,18 +56,78 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
-  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-1), x(feats) {}
-  double g() {
+  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}  // g_ starts at the -100.0 "not scored yet" sentinel
+
+  // lazy evaluation: the first call scores hyp with the given scorer and caches the result in g_
+  double g(const SentenceScorer& scorer) const {
+    if (g_ == -100.0)  // sentinel still in place => score has not been computed
+      g_ = scorer.ScoreCandidate(hyp)->ComputeScore();
     return g_;
   }
- private:
-  int sent_id;
   vector<WordID> hyp;
-  double g_;
+  mutable double g_;  // mutable so that the const accessor g() can fill the cache
  public:
   SparseVector<double> x;
 };
 
+struct ThresholdAlpha {  // step function used as the alpha_i argument of Sample()
+  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
+  double operator()(double mag) const {
+    if (mag < threshold) return 0.0; else return 1.0;  // 0.0 below the threshold, 1.0 at or above it
+  }
+  const double threshold;
+};
+
+struct TrainingInstance {  // one sampled hypothesis pair: feature vector, binary label and score gap
+  TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
+  SparseVector<double> x;  // Sample() stores J_i[a].x - J_i[b].x here
+#ifdef DEBUGGING_PRO
+  vector<WordID> a;  // debug builds only: the two hypotheses of the pair
+  vector<WordID> b;
+#endif
+  bool y;  // Sample() sets this from ga < gb, flipped when invert_score is set
+  double gdiff;  // Sample() stores fabs(ga - gb) here
+};
+
+struct DiffOrder {  // comparator that orders TrainingInstances by descending gdiff
+  bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
+    return a.gdiff > b.gdiff;
+  }
+};
+
+template<typename Alpha>  // Alpha: callable on a score gap; its result is compared against rng->next()
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
+  vector<TrainingInstance> v;  // accepted pairs; the (at most) xi with the largest score gap are appended to *pv
+  for (unsigned i = 0; i < gamma && J_i.size() > 1; ++i) {  // with < 2 hyps no pair exists and size() - 1 would wrap
+    size_t a = rng->inclusive(0, J_i.size() - 1)();
+    size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) continue;
+    double ga = J_i[a].g(scorer);
+    double gb = J_i[b].g(scorer);
+    bool positive = ga < gb;
+    if (invert_score) positive = !positive;
+    double gdiff = fabs(ga - gb);
+    if (!gdiff) continue;  // skip pairs whose scores are exactly equal
+    if (rng->next() < alpha_i(gdiff)) {
+      v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+#ifdef DEBUGGING_PRO
+      v.back().a = J_i[a].hyp;
+      v.back().b = J_i[b].hyp;
+#endif
+    }
+  }
+  // clamp before the iterator arithmetic: v.begin() + xi is undefined when xi > v.size()
+  vector<TrainingInstance>::iterator mid = v.begin() + (xi < v.size() ? xi : v.size());
+  partial_sort(v.begin(), mid, v.end(), DiffOrder());
+  copy(v.begin(), mid, back_inserter(*pv));
+#ifdef DEBUGGING_PRO
+  if (v.size() >= 5)
+    for (int i =0; i < 5; ++i) {
+      cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+    }
+#endif
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -81,7 +146,15 @@ int main(int argc, char** argv) {
   const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
+  vector<string> weights_files = conf["weights"].as<vector<string> >();
+  vector<vector<double> > weights(weights_files.size());
+  for (int i = 0; i < weights.size(); ++i) {
+    Weights w;
+    w.InitFromFile(weights_files[i]);
+    w.InitVector(&weights[i]);
+  }
   while(in) {
+    vector<TrainingInstance> v;
     string line;
     getline(in, line);
     if (line.empty()) continue;
@@ -92,18 +165,27 @@ int main(int argc, char** argv) {
     is >> file >> sent_id;
     ReadFile rf(file);
     HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
     vector<HypInfo> J_i;
-    for (int i = 0; i < kbest_size; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-      if (!d) break;
-      float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
-      // if (invert_score) sentscore *= -1.0;
-      // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
-      d->feature_values;
-      sentscore;
+    int start = weights.size();
+    start -= 4;
+    if (start < 0) start = 0;
+    for (int i = start; i < weights.size(); ++i) {
+      hg.Reweight(weights[i]);
+      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+      for (int i = 0; i < kbest_size; ++i) {
+        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+          kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+        if (!d) break;
+        J_i.push_back(HypInfo(d->yield, d->feature_values));
+      }
+    }
+
+    Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+    for (unsigned i = 0; i < v.size(); ++i) {
+      const TrainingInstance& vi = v[i];
+      cout << vi.y << "\t" << vi.x << endl;
+      cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;
     }
   }
   return 0;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 3df52020..2b9c5ce7 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -1,3 +1,4 @@
+#include <cstdlib>
 #include <sstream>
 #include <iostream>
 #include <fstream>
@@ -6,24 +7,29 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "weights.h"
 #include "sparse_vector.h"
-#include "error_surface.h"
-#include "line_optimizer.h"
-#include "b64tools.h"
+#include "optimize.h"
 
 using namespace std;
 namespace po = boost::program_options;
 
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
+        ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
+        ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
+        ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
   po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  bool flag = conf->count("loss_function") == 0;
-  if (flag || conf->count("help")) {
+  if (conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -32,50 +38,127 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
-  if (type == TER || type == AER) {
-    opt_type = LineOptimizer::MINIMIZE_SCORE;
+  string line;
+  vector<pair<bool, SparseVector<double> > > training;  // (label, feature vector) pairs parsed from stdin
+  int lc = 0;
+  bool flag = false;  // true while a progress-dot line is still open on stderr
+  SparseVector<double> old_weights;
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }  // refuse to run with a bad weight
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
   }
-  string last_key;
-  vector<ErrorSurface> esv;
-  while(cin) {
-    string line;
-    getline(cin, line);
+  while(getline(cin, line)) {  // each line: one label character, a tab, then space-separated name=value pairs
+    ++lc;
+    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
     if (line.empty()) continue;
-    size_t ks = line.find("\t");
+    const size_t ks = line.find("\t");
     assert(string::npos != ks);
-    assert(ks > 2);
-    string key = line.substr(2, ks - 2);
-    string val = line.substr(ks + 1);
-    if (key != last_key) {
-      if (!last_key.empty()) {
-	float score;
-        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-	cout << last_key << "|" << x << "|" << score << endl;
+    assert(ks == 1);
+    const bool y = line[0] == '1';
+    SparseVector<double> x;
+    size_t last_start = ks + 1;
+    size_t last_comma = string::npos;  // position of the '=' inside the current token
+    size_t cur = last_start;
+    while(cur <= line.size()) {
+      if (cur == line.size() || line[cur] == ' ') {  // test the bound first so line[cur] is never read past the end
+        if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+          cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+          exit(1);
+        }
+        const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+        if (cur < line.size()) line[cur] = 0;  // terminate the token in place so strtod stops here
+        const double val = strtod(&line[last_comma + 1], NULL);
+        x.set_value(fid, val);
+
+        last_comma = string::npos;
+        last_start = cur+1;
+      } else {
+        if (line[cur] == '=')
+          last_comma = cur;
+      }
+      ++cur;
+    }
+    training.push_back(make_pair(y, x));
+  }
+  if (flag) cerr << endl;
+
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  vector<double> vg(FD::NumFeats(), 0.0);
+  SparseVector<double> g;
+  bool converged = false;
+  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  while(!converged) {
+    double cll = 0, dbias = 0;  // objective (negative log-likelihood plus regularizer) and bias gradient
+    g.clear();
+    for (int i = 0; i < training.size(); ++i) {
+      const double dotprod = training[i].second.dot(x) + x[0]; // x[0] is bias
+      double lp_false = dotprod;
+      double lp_true = -dotprod;
+      if (0 < lp_true) {  // both branches only call exp() on a non-positive argument
+        lp_true += log1p(exp(-lp_true));
+        lp_false = log1p(exp(lp_false));
+      } else {
+        lp_true = log1p(exp(lp_true));
+        lp_false += log1p(exp(-lp_false));
+      }
+      lp_true*=-1;  // now lp_true = log sigmoid(dotprod), lp_false = log sigmoid(-dotprod)
+      lp_false*=-1;
+      if (training[i].first) {  // true label
+        cll -= lp_true;
+        g -= training[i].second * exp(lp_false);
+        dbias -= exp(lp_false);
+      } else {                  // false label
+        cll -= lp_false;
+        g += training[i].second * exp(lp_true);
+        dbias += exp(lp_true);
       }
-      last_key = key;
-      esv.clear();
     }
-    if (val.size() % 4 != 0) {
-      cerr << "B64 encoding error 1! Skipping.\n";
-      continue;
+    vg.clear();
+    g.init_vector(&vg);
+    if (vg.size() < x.size()) vg.resize(x.size(), 0.0);  // NOTE(review): init_vector is not shown here; guarantee vg covers every index used below
+    vg[0] = dbias;
+#if 1
+    const double sigsq = conf["sigma_squared"].as<double>();
+    double norm = 0;
+    for (int i = 1; i < x.size(); ++i) {  // starts at 1: the bias x[0] is not regularized
+      const double mean_i = 0.0;
+      const double param = (x[i] - mean_i);
+      norm += param * param;
+      vg[i] += param / sigsq;
+    }
+    const double reg = norm / (2.0 * sigsq);
+#else
+    double reg = 0;
+#endif
+    cll += reg;
+    cerr << cll << " (REG=" << reg << ")\t";
+    bool failed = false;
+    try {
+      opt.Optimize(cll, vg, &x);
+    } catch (...) {
+      cerr << "Exception caught, assuming convergence is close enough...\n";
+      failed = true;
     }
-    string encoded(val.size() / 4 * 3, '\0');
-    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
-      cerr << "B64 encoding error 2! Skipping.\n";
-      continue;
+    if (fabs(x[0]) > MAX_BIAS) {
+      cerr << "Biased model learned. Are your training instances wrong?\n";
+      cerr << "  BIAS: " << x[0] << endl;
     }
-    esv.push_back(ErrorSurface());
-    esv.back().Deserialize(type, encoded);
+    converged = failed || opt.HasConverged();
   }
-  if (!esv.empty()) {
-    // cerr << "ESV=" << esv.size() << endl;
-    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
-    float score;
-    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-    cout << last_key << "|" << x << "|" << score << endl;
+  Weights w;
+  if (conf.count("weights")) {
+    for (int i = 1; i < x.size(); ++i)  // blend new and previous weights: psi * new + (1 - psi) * old
+      x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  w.InitFromVector(x);
+  w.WriteToFile("-");
   return 0;
 }
author	Chris Dyer <cdyer@cs.cmu.edu>	2011-07-11 20:39:45 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2011-07-11 20:39:45 -0400
commit	bde4a34bab96052570c248f7d9ccc299a9a3f097 (patch)
tree	b74180cb2d36e373eafc1fd6a74968a969287ead /pro-train
parent	95deb840699f9b6f8fe499b374bd726bce97365c (diff)