Diffstat (limited to 'pro-train')
 -rw-r--r--  pro-train/Makefile.am                      |  13
 -rw-r--r--  pro-train/README.shared-mem                |   9
 -rwxr-xr-x  pro-train/dist-pro.pl                      | 657
 -rwxr-xr-x  pro-train/mr_pro_generate_mapper_input.pl  |  18
 -rw-r--r--  pro-train/mr_pro_map.cc                    | 351
 -rw-r--r--  pro-train/mr_pro_reduce.cc                 | 277
 6 files changed, 1325 insertions(+), 0 deletions(-)
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am new file mode 100644 index 00000000..fdaf43e2 --- /dev/null +++ b/pro-train/Makefile.am @@ -0,0 +1,13 @@ +bin_PROGRAMS = \ +  mr_pro_map \ +  mr_pro_reduce + +TESTS = lo_test + +mr_pro_map_SOURCES = mr_pro_map.cc +mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +mr_pro_reduce_SOURCES = mr_pro_reduce.cc +mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training diff --git a/pro-train/README.shared-mem b/pro-train/README.shared-mem new file mode 100644 index 00000000..7728efc0 --- /dev/null +++ b/pro-train/README.shared-mem @@ -0,0 +1,9 @@ +If you want to run dist-vest.pl on a very large shared memory machine, do the +following: + +  ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini + +This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the +decoder must load grammars, language models, etc., J should be smaller than I, but this will depend +on the system you are running on and the complexity of the models used for decoding. + diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl new file mode 100755 index 00000000..dbfa329a --- /dev/null +++ b/pro-train/dist-pro.pl @@ -0,0 +1,657 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +my $VEST_DIR="$SCRIPT_DIR/../vest"; +require "$VEST_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl"; +my $MAPPER = "$bin_dir/mr_pro_map"; +my $REDUCER = "$bin_dir/mr_pro_reduce"; +my $parallelize = "$VEST_DIR/parallelize.pl"; +my $libcall = "$VEST_DIR/libcall.pl"; +my $sentserver = "$VEST_DIR/sentserver"; +my $sentclient = "$VEST_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $run_local = 0; +my $best_weights; +my $max_iterations = 30; +my $decode_nodes = 15;   # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make;  # use make to parallelize +my $usefork; +my $initial_weights; +my $pass_suffix = ''; +my $cpbin=1; + +# regularization 
strength +my $tune_regularizer = 0; +my $reg = 1e-2; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( +	"decode-nodes=i" => \$decode_nodes, +	"dont-clean" => \$disable_clean, +	"pass-suffix=s" => \$pass_suffix, +        "use-fork" => \$usefork, +	"dry-run" => \$dryrun, +	"epsilon=s" => \$epsilon, +	"help" => \$help, +        "weights=s" => \$initial_weights, +	"tune-regularizer" => \$tune_regularizer, +	"reg=f" => \$reg, +	"local" => \$run_local, +	"use-make=i" => \$use_make, +	"max-iterations=i" => \$max_iterations, +	"pmem=s" => \$pmem, +        "cpbin!" => \$cpbin, +	"ref-files=s" => \$refFiles, +	"metric=s" => \$metric, +	"source-file=s" => \$srcFile, +	"workdir=s" => \$dir, +) == 0 || @ARGV!=1 || $help) { +	print_help(); +	exit; +} + +if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; } + +if ($metric =~ /^(combi|ter)$/i) { +  $lines_per_mapper = 5; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { +  $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ +	$dir = "protrain"; +} +unless ($dir =~ /^\//){  # convert relative path to absolute path +	my $basedir = check_output("pwd"); +	chomp $basedir; +	$dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { +	print STDERR "Cleanup...\n"; +	for my $pid (@childpids){ unchecked_call("kill $pid"); } +	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } +	exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit =  +    sub{ cleanup(); };  +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { +    local $_; +    my $bindir=shift; +    check_call("mkdir -p $bindir"); +    -d $bindir || die "couldn't make bindir $bindir"; +    for (@_) { +        my $src=$$_; +        $$_="$bindir/".basename($src); +        check_call("cp -p $src $$_"); +    } +} +sub dirsize { +    opendir ISEMPTY,$_[0]; +    return scalar(readdir(ISEMPTY))-1; +} +my @allweights; +if ($dryrun){ +	write_config(*STDERR); +	exit 0; +} else { +	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs +	  die "ERROR: working dir $dir already exists\n\n"; +	} else { +		-e $dir || mkdir $dir; +		mkdir "$dir/hgs"; +        modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; +    mkdir "$dir/scripts"; +        my $cmdfile="$dir/rerun-pro.sh"; +        open CMD,'>',$cmdfile; +        print CMD "cd ",&getcwd,"\n"; +#        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
+        my $cline=&cmdline."\n"; +        print CMD $cline; +        close CMD; +        print STDERR $cline; +        chmod(0755,$cmdfile); +	check_call("cp $initial_weights $dir/weights.0"); +	die "Can't find weights.0" unless (-e "$dir/weights.0"); +	} +	write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ +	print STDERR "\n\nITERATION $iteration\n==========\n"; + +	if ($iteration > $max_iterations){ +		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; +		last; +	} +	# iteration-specific files +	my $runFile="$dir/run.raw.$iteration"; +	my $onebestFile="$dir/1best.$iteration"; +	my $logdir="$dir/logs.$iteration"; +	my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; +	my $scorerLog="$logdir/scorer.log.$iteration"; +	check_call("mkdir -p $logdir"); + + +	#decode +	print STDERR "RUNNING DECODER AT "; +	print STDERR unchecked_output("date"); +	my $im1 = $iteration - 1; +	my $weightsFile="$dir/weights.$im1"; +        push @allweights, "-w $dir/weights.$im1"; +        `rm -f $dir/hgs/*.gz`; +	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; +	my $pcmd; +	if ($run_local) { +		$pcmd = "cat $srcFile |"; +	} elsif ($use_make) { +	    # TODO: Throw error when decode_nodes is specified along with use_make +		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; +	} else { +		$pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --"; +	} +	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_bash_call($cmd); +        my $num_hgs; +        my $num_topbest; +        my $retries = 0; +	while($retries < 5) { +	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); +	    $num_topbest = check_output("wc -l < $runFile"); +	    print STDERR "NUMBER OF HGs: $num_hgs\n"; +	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; +	    if($devSize == $num_hgs && $devSize == $num_topbest) { +		last; +	    } else { +		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; +		sleep(3); +	    } +	    $retries++; +	} +	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); +	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); +	chomp $dec_score; +	print STDERR "DECODER SCORE: $dec_score\n"; + +	# save space +	check_call("gzip -f $runFile"); +	check_call("gzip -f $decoderLog"); + +	# run optimizer +	print STDERR "RUNNING OPTIMIZER AT "; +	print STDERR unchecked_output("date"); +	print STDERR " - GENERATE TRAINING EXEMPLARS\n"; +	my $mergeLog="$logdir/prune-merge.log.$iteration"; + +	my $score = 0; +	my $icc = 0; +	my $inweights="$dir/weights.$im1"; +	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_call($cmd); +	check_call("mkdir -p $dir/splag.$im1"); +	$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput."; +	print STDERR "COMMAND:\n$cmd\n"; +	check_call($cmd); +	opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; +	my @shards = grep { /^mapinput\./ } readdir(DIR); +	closedir DIR; +	die "No shards!" unless scalar @shards > 0; +	my $joblist = ""; +	my $nmappers = 0; +	@cleanupcmds = (); +	my %o2i = (); +	my $first_shard = 1; +	my $mkfile; # only used with makefiles +	my $mkfilename; +	if ($use_make) { +		$mkfilename = "$dir/splag.$im1/domap.mk"; +		open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; +		print $mkfile "all: $dir/splag.$im1/map.done\n\n"; +	} +	my @mkouts = ();  # only used with makefiles +	my @mapoutputs = (); +	for my $shard (@shards) { +		my $mapoutput = $shard; +		my $client_name = $shard; +		$client_name =~ s/mapinput.//; +		$client_name = "pro.$client_name"; +		$mapoutput =~ s/mapinput/mapoutput/; +		push @mapoutputs, "$dir/splag.$im1/$mapoutput"; +		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; +		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; +		if ($run_local) { +			print STDERR "COMMAND:\n$script\n"; +			check_bash_call($script); +		} elsif ($use_make) { +			my $script_file = "$dir/scripts/map.$shard"; +			open F, ">$script_file" or die "Can't write $script_file: $!"; +			print F "#!/bin/bash\n"; +			print F "$script\n"; +			close F; +			my $output = "$dir/splag.$im1/$mapoutput"; +			push @mkouts, $output; +			chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; +			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } +			print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; +		} else { +			my $script_file = "$dir/scripts/map.$shard"; +			open F, ">$script_file" or die "Can't write $script_file: $!"; +			print F "$script\n"; +			close F; +			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + +			$nmappers++; +			my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; +			my $jobid = check_output("$qcmd"); +			chomp $jobid; +			$jobid =~ s/^(\d+)(.*?)$/\1/g; +			$jobid =~ s/^Your job (\d+) .*$/\1/; +		 	push(@cleanupcmds, "qdel $jobid 2> /dev/null"); +			print STDERR " $jobid"; +			if ($joblist == "") { $joblist = $jobid; } +			else {$joblist = $joblist . "\|" . $jobid; } +		} +	} +	my @dev_outs = (); +	my @devtest_outs = (); +	if ($tune_regularizer) { +		for (my $i = 0; $i < scalar @mapoutputs; $i++) { +			if ($i % 3 == 1) { +				push @devtest_outs, $mapoutputs[$i]; +			} else { +				push @dev_outs, $mapoutputs[$i]; +			} +		} +		if (scalar @devtest_outs == 0) { +			die "Not enough training instances for regularization tuning! 
Rerun without --tune-regularizer\n"; +		} +	} else { +		@dev_outs = @mapoutputs; +	} +	if ($run_local) { +		print STDERR "\nCompleted extraction of training exemplars.\n"; +	} elsif ($use_make) { +		print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; +		close $mkfile; +		my $mcmd = "make -j $use_make -f $mkfilename"; +		print STDERR "\nExecuting: $mcmd\n"; +		check_call($mcmd); +	} else { +		print STDERR "\nLaunched $nmappers mappers.\n"; +      		sleep 8; +		print STDERR "Waiting for mappers to complete...\n"; +		while ($nmappers > 0) { +		  sleep 5; +		  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); +		  $nmappers = scalar @livejobs; +		} +		print STDERR "All mappers complete.\n"; +	} +	my $tol = 0; +	my $til = 0; +	my $dev_test_file = "$dir/splag.$im1/devtest.gz"; +	if ($tune_regularizer) { +		my $cmd = "cat @devtest_outs | gzip > $dev_test_file"; +		check_bash_call($cmd); +		die "Can't find file $dev_test_file" unless -f $dev_test_file; +	} +        #print STDERR "MO: @mapoutputs\n"; +	for my $mo (@mapoutputs) { +		#my $olines = get_lines($mo); +		#my $ilines = get_lines($o2i{$mo}); +		#die "$mo: no training instances generated!" if $olines == 0; +	} +	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; +	print STDERR unchecked_output("date"); +	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg"; +	if ($tune_regularizer) { +		$cmd .= " -T -t $dev_test_file"; +	} +        $cmd .= " > $dir/weights.$iteration"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_bash_call($cmd); +	$lastWeightsFile = "$dir/weights.$iteration"; +	if ($tune_regularizer) { +		open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!"; +		my $line = <W>; +		close W; +		my ($sharp, $label, $nreg) = split /\s|=/, $line; +		print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n"; +		$reg = $nreg; +		# only tune regularizer on first iteration? +		$tune_regularizer = 0; +	} +	$lastPScore = $score; +	$iteration++; +	print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub get_lines { +  my $fn = shift @_; +  open FL, "<$fn" or die "Couldn't read $fn: $!"; +  my $lc = 0; +  while(<FL>) { $lc++; } +  return $lc; +} + +sub get_comma_sep_refs { +  my ($r,$p) = @_; +  my $o = check_output("echo $p"); +  chomp $o; +  my @files = split /\s+/, $o; +  return "-$r " . 
join(" -$r ", @files); +} + +sub read_weights_file { +  my ($file) = @_; +  open F, "<$file" or die "Couldn't read $file: $!"; +  my @r = (); +  my $pm = -1; +  while(<F>) { +    next if /^#/; +    next if /^\s*$/; +    chomp; +    if (/^(.+)\s+(.+)$/) { +      my $m = $1; +      my $w = $2; +      die "Weights out of order: $m <= $pm" unless $m > $pm; +      push @r, $w; +    } else { +      warn "Unexpected feature name in weight file: $_"; +    } +  } +  close F; +  return join ' ', @r; +} + +# subs +sub write_config { +	my $fh = shift; +	my $cleanup = "yes"; +	if ($disable_clean) {$cleanup = "no";} + +	print $fh "\n"; +	print $fh "DECODER:          $decoder\n"; +	print $fh "INI FILE:         $iniFile\n"; +	print $fh "WORKING DIR:      $dir\n"; +	print $fh "SOURCE (DEV):     $srcFile\n"; +	print $fh "REFS (DEV):       $refFiles\n"; +	print $fh "EVAL METRIC:      $metric\n"; +	print $fh "MAX ITERATIONS:   $max_iterations\n"; +	print $fh "DECODE NODES:     $decode_nodes\n"; +	print $fh "HEAD NODE:        $host\n"; +	print $fh "PMEM (DECODING):  $pmem\n"; +	print $fh "CLEANUP:          $cleanup\n"; +} + +sub update_weights_file { +  my ($neww, $rfn, $rpts) = @_; +  my @feats = @$rfn; +  my @pts = @$rpts; +  my $num_feats = scalar @feats; +  my $num_pts = scalar @pts; +  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; +  open G, ">$neww" or die; +  for (my $i = 0; $i < $num_feats; $i++) { +    my $f = $feats[$i]; +    my $lambda = $pts[$i]; +    print G "$f $lambda\n"; +  } +  close G; +} + +sub enseg { +	my $src = shift; +	my $newsrc = shift; +	open(SRC, $src); +	open(NEWSRC, ">$newsrc"); +	my $i=0; +	while (my $line=<SRC>){ +		chomp $line; +		if ($line =~ /^\s*<seg/i) { +		    if($line =~ /id="[0-9]+"/) { +			print NEWSRC "$line\n"; +		    } else { +			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; +		    } +		} else { +			print NEWSRC "<seg id=\"$i\">$line</seg>\n"; +		} +		$i++; +	} +	close SRC; +	close NEWSRC; +	die "Empty dev set!" if ($i == 0); +} + +sub print_help { + +	my $executable = check_output("basename $0"); chomp $executable; +    print << "Help"; + +Usage: $executable [options] <ini file> + +	$executable [options] <ini file> +		Runs a complete MERT optimization and test set decoding, using +		the decoder configuration in ini file.  Note that many of the +		options have default values that are inferred automatically +		based on certain conventions.  For details, refer to descriptions +		of the options --decoder, --weights, and --workdir. + +Required: + +	--ref-files <files> +		Dev set ref files.  This option takes only a single string argument. +		To use multiple files (including file globbing), this argument should +		be quoted. + +	--source-file <file> +		Dev set source file. + +	--weights <file> +		Initial weights file (use empty file to start from 0) + +General options: + +	--local +		Run the decoder and optimizer locally with a single thread. + +	--decode-nodes <I> +		Number of decoder processes to run in parallel. [default=15] + +	--help +		Print this message and exit. + +	--max-iterations <M> +		Maximum number of iterations to run.  If not specified, defaults +		to 10. + +	--metric <method> +		Metric to optimize. +		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + +	--pass-suffix <S> +		If the decoder is doing multi-pass decoding, the pass suffix "2", +		"3", etc., is used to control what iteration of weights is set. 
+ +	--pmem <N> +		Amount of physical memory requested for parallel decoding jobs. + +	--use-make <I> +		Use make -j <I> to run the optimizer commands (useful on large +		shared-memory machines where qsub is unavailable). + +	--workdir <dir> +		Directory for intermediate and output files.  If not specified, the +		name is derived from the ini filename.  Assuming that the ini +		filename begins with the decoder name and ends with ini, the default +		name of the working directory is inferred from the middle part of +		the filename.  E.g. an ini file named decoder.foo.ini would have +		a default working directory name foo. + +Regularization options: + +	--tune-regularizer +		Hold out one third of the tuning data and used this to tune the +		regularization parameter. + +	--reg <F> + +Help +} + +sub convert { +  my ($str) = @_; +  my @ps = split /;/, $str; +  my %dict = (); +  for my $p (@ps) { +    my ($k, $v) = split /=/, $p; +    $dict{$k} = $v; +  } +  return %dict; +} + + +sub cmdline { +    return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { +    my ($arg)=@_; +    return undef unless defined $arg; +    if ($arg =~ /$is_shell_special/) { +        $arg =~ s/($shell_escape_in_quote)/\\$1/g; +        return "\"$arg\""; +    } +    return $arg; +} + +sub escaped_shell_args { +    return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { +    return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { +    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro-train/mr_pro_generate_mapper_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/pro-train/mr_pro_generate_mapper_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { +  my $file = $hg; +  my $id = $hg; +  $id =~ s/(\.json)?\.gz//; +  print "$d/$file $id\n"; +} + diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc new file mode 100644 index 00000000..4324e8de --- /dev/null +++ b/pro-train/mr_pro_map.cc @@ -0,0 +1,351 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sampler.h" +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "scorer.h" +#include "inside_outside.h" +#include "hg_io.h" +#include "kbest.h" +#include "viterbi.h" + +// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011) + +using namespace std; +namespace po = boost::program_options; + +struct ApproxVectorHasher { +  static const size_t MASK = 0xFFFFFFFFull; +  union UType { +    double f; +    size_t i; +  }; +  static inline double round(const double x) { +    UType t; +    t.f = x; +    size_t r = t.i & MASK; +    if ((r << 1) > MASK) +      t.i += MASK - r + 1; +    else +      t.i &= (1ull - MASK); +    return t.f; +  } +  size_t operator()(const SparseVector<double>& x) const { +    size_t h = 0x573915839; +    for (SparseVector<double>::const_iterator it = x.begin(); it != 
x.end(); ++it) { +      UType t; +      t.f = it->second; +      if (t.f) { +        size_t z = (t.i >> 32); +        boost::hash_combine(h, it->first); +        boost::hash_combine(h, z); +      } +    } +    return h; +  } +}; + +struct ApproxVectorEquals { +  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const { +    SparseVector<double>::const_iterator bit = b.begin(); +    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) { +      if (bit == b.end() || +          ait->first != bit->first || +          ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second)) +        return false; +      ++bit; +    } +    if (bit != b.end()) return false; +    return true; +  } +}; + +boost::shared_ptr<MT19937> rng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") +        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") +        ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)") +        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") +        ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)") +        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") +        ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract") +        ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") +        ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") +        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (!conf->count("reference")) { +    cerr << "Please specify one or more references using -r <REF.TXT>\n"; +    flag = true; +  } +  if (!conf->count("weights")) { +    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +struct HypInfo { +  HypInfo() : g_(-100.0) {} +  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {} + +  // lazy evaluation +  double g(const SentenceScorer& scorer) const { +    if (g_ == -100.0) +      g_ = scorer.ScoreCandidate(hyp)->ComputeScore(); +    return g_; +  } +  vector<WordID> hyp; +  mutable double g_; +  SparseVector<double> x; +}; + +struct HypInfoCompare { +  bool operator()(const HypInfo& a, const HypInfo& b) const { +    ApproxVectorEquals comp; +    return (a.hyp == b.hyp && comp(a.x,b.x)); +  } +}; + +struct HypInfoHasher { +  size_t operator()(const HypInfo& x) const { +    boost::hash<vector<WordID> > hhasher; +    ApproxVectorHasher vhasher; +    size_t ha = hhasher(x.hyp); +    boost::hash_combine(ha, vhasher(x.x)); +    return ha; +  } +}; + +void WriteKBest(const string& file, const vector<HypInfo>& kbest) { +  WriteFile wf(file); +  ostream& out = 
*wf.stream(); +  out.precision(10); +  for (int i = 0; i < kbest.size(); ++i) { +    out << TD::GetString(kbest[i].hyp) << endl; +    out << kbest[i].x << endl; +  } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) { +  SparseVector<double>& x = *out; +  size_t last_start = cur; +  size_t last_comma = string::npos; +  while(cur <= line.size()) { +    if (line[cur] == ' ' || cur == line.size()) { +      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { +        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl; +        exit(1); +      } +      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); +      if (cur < line.size()) line[cur] = 0; +      const double val = strtod(&line[last_comma + 1], NULL); +      x.set_value(fid, val); + +      last_comma = string::npos; +      last_start = cur+1; +    } else { +      if (line[cur] == '=') +        last_comma = cur; +    } +    ++cur; +  } +} + +void ReadKBest(const string& file, vector<HypInfo>* kbest) { +  cerr << "Reading from " << file << endl; +  ReadFile rf(file); +  istream& in = *rf.stream(); +  string cand; +  string feats; +  while(getline(in, cand)) { +    getline(in, feats); +    assert(in); +    kbest->push_back(HypInfo()); +    TD::ConvertSentence(cand, &kbest->back().hyp); +    ParseSparseVector(feats, 0, &kbest->back().x); +  } +  cerr << "  read " << kbest->size() << " hypotheses\n"; +} + +void Dedup(vector<HypInfo>* h) { +  cerr << "Dedup in=" << h->size(); +  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare> u; +  while(h->size() > 0) { +    u.insert(h->back()); +    h->pop_back(); +  } +  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare>::iterator it = u.begin(); +  while (it != u.end()) { +    h->push_back(*it); +    it = u.erase(it); +  } +  cerr << "  out=" << h->size() << endl; +} + +struct ThresholdAlpha { +  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {} +  double operator()(double mag) const { +    if (mag < threshold) return 0.0; else return 1.0; +  } +  const double threshold; +}; + +struct TrainingInstance { +  TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {} +  SparseVector<double> x; +#undef DEBUGGING_PRO +#ifdef DEBUGGING_PRO +  vector<WordID> a; +  vector<WordID> b; +#endif +  bool y; +  double gdiff; +}; +#ifdef DEBUGGING_PRO +ostream& operator<<(ostream& os, const TrainingInstance& d) { +  return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x; +} +#endif + +struct DiffOrder { +  bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { +    return a.gdiff > b.gdiff; +  } +}; + +void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) { +  vector<TrainingInstance> v1, v2; +  double avg_diff = 0; +  for (unsigned i = 0; i < gamma; ++i) { +    const size_t a = rng->inclusive(0, J_i.size() - 1)(); +    const size_t b = rng->inclusive(0, J_i.size() - 1)(); +    if (a == b) continue; +    double ga = J_i[a].g(scorer); +    double gb = J_i[b].g(scorer); +    bool positive = gb < ga; +    if (invert_score) positive = !positive; +    const double gdiff = fabs(ga - gb); +    if (!gdiff) continue; +    avg_diff += gdiff; +    SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros(); +    if (xdiff.empty()) { +      
cerr << "Empty diff:\n  " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl; +      cerr << "  " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl; +      continue; +    } +    v1.push_back(TrainingInstance(xdiff, positive, gdiff)); +#ifdef DEBUGGING_PRO +    v1.back().a = J_i[a].hyp; +    v1.back().b = J_i[b].hyp; +    cerr << "N: " << v1.back() << endl; +#endif +  } +  avg_diff /= v1.size(); + +  for (unsigned i = 0; i < v1.size(); ++i) { +    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff)); +    // cerr << "avg_diff=" << avg_diff << "  gdiff=" << v1[i].gdiff << "  p=" << p << endl; +    if (rng->next() < p) v2.push_back(v1[i]); +  } +  vector<TrainingInstance>::iterator mid = v2.begin() + xi; +  if (xi > v2.size()) mid = v2.end(); +  partial_sort(v2.begin(), mid, v2.end(), DiffOrder()); +  copy(v2.begin(), mid, back_inserter(*pv)); +#ifdef DEBUGGING_PRO +  if (v2.size() >= 5) { +    for (int i =0; i < (mid - v2.begin()); ++i) { +      cerr << v2[i] << endl; +    } +    cerr << pv->back() << endl; +  } +#endif +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  if (conf.count("random_seed")) +    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    rng.reset(new MT19937); +  const string loss_function = conf["loss_function"].as<string>(); + +  ScoreType type = ScoreTypeFromString(loss_function); +  DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; +  Hypergraph hg; +  string last_file; +  ReadFile in_read(conf["input"].as<string>()); +  istream &in=*in_read.stream(); +  const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); +  const unsigned gamma = conf["candidate_pairs"].as<unsigned>(); +  const unsigned xi = conf["best_pairs"].as<unsigned>(); +  string weightsf = conf["weights"].as<string>(); +  vector<double> weights; +  { +    Weights w; +    w.InitFromFile(weightsf); +    w.InitVector(&weights); +  } +  string kbest_repo = conf["kbest_repository"].as<string>(); +  MkDirP(kbest_repo); +  while(in) { +    vector<TrainingInstance> v; +    string line; +    getline(in, line); +    if (line.empty()) continue; +    istringstream is(line); +    int sent_id; +    string file; +    // path-to-file (JSON) sent_id +    is >> file >> sent_id; +    ReadFile rf(file); +    ostringstream os; +    vector<HypInfo> J_i; +    os << kbest_repo << "/kbest." 
<< sent_id << ".txt.gz"; +    const string kbest_file = os.str(); +    if (FileExists(kbest_file)) +      ReadKBest(kbest_file, &J_i); +    HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    hg.Reweight(weights); +    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size); + +    for (int i = 0; i < kbest_size; ++i) { +      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +        kbest.LazyKthBest(hg.nodes_.size() - 1, i); +      if (!d) break; +      J_i.push_back(HypInfo(d->yield, d->feature_values)); +    } +    Dedup(&J_i); +    WriteKBest(kbest_file, J_i); + +    Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v); +    for (unsigned i = 0; i < v.size(); ++i) { +      const TrainingInstance& vi = v[i]; +      cout << vi.y << "\t" << vi.x << endl; +      cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl; +    } +  } +  return 0; +} + diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc new file mode 100644 index 00000000..9b422f33 --- /dev/null +++ b/pro-train/mr_pro_reduce.cc @@ -0,0 +1,277 @@ +#include <cstdlib> +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "optimize.h" + +using namespace std; +namespace po = boost::program_options; + +// since this is a ranking model, there should be equal numbers of +// positive and negative examples, so the bias should be 0 +static const double MAX_BIAS = 1e-10; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation") +        ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") +        ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)") +        ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior") +        ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght") +        ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght") +        ("testset,t",po::value<string>(), "Optional held-out test set") +        ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) { +  SparseVector<double>& x = *out; +  size_t last_start = cur; +  size_t last_comma = string::npos; +  while(cur <= line.size()) { +    if (line[cur] == ' ' || cur == line.size()) { +      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { +        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl; +        exit(1); +      } +      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); +      if (cur < line.size()) line[cur] = 0; +      const double val = 
strtod(&line[last_comma + 1], NULL); +      x.set_value(fid, val); + +      last_comma = string::npos; +      last_start = cur+1; +    } else { +      if (line[cur] == '=') +        last_comma = cur; +    } +    ++cur; +  } +} + +void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<double> > >* corpus) { +  istream& in = *pin; +  corpus->clear(); +  bool flag = false; +  int lc = 0; +  string line; +  SparseVector<double> x; +  while(getline(in, line)) { +    ++lc; +    if (lc % 1000 == 0) { cerr << '.'; flag = true; } +    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } +    if (line.empty()) continue; +    const size_t ks = line.find("\t"); +    assert(string::npos != ks); +    assert(ks == 1); +    const bool y = line[0] == '1'; +    x.clear(); +    ParseSparseVector(line, ks + 1, &x); +    corpus->push_back(make_pair(y, x)); +  } +  if (flag) cerr << endl; +} + +void GradAdd(const SparseVector<double>& v, const double scale, vector<double>* acc) { +  for (SparseVector<double>::const_iterator it = v.begin(); +       it != v.end(); ++it) { +    (*acc)[it->first] += it->second * scale; +  } +} + +double TrainingInference(const vector<double>& x, +                         const vector<pair<bool, SparseVector<double> > >& corpus, +                         vector<double>* g = NULL) { +  double cll = 0; +  for (int i = 0; i < corpus.size(); ++i) { +    const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias +    double lp_false = dotprod; +    double lp_true = -dotprod; +    if (0 < lp_true) { +      lp_true += log1p(exp(-lp_true)); +      lp_false = log1p(exp(lp_false)); +    } else { +      lp_true = log1p(exp(lp_true)); +      lp_false += log1p(exp(-lp_false)); +    } +    lp_true*=-1; +    lp_false*=-1; +    if (corpus[i].first) {  // true label +      cll -= lp_true; +      if (g) { +        // g -= corpus[i].second * exp(lp_false); +        GradAdd(corpus[i].second, -exp(lp_false), g); +        (*g)[0] -= exp(lp_false); // bias +      } +    } else {                  // false label +      cll -= lp_false; +      if (g) { +        // g += corpus[i].second * exp(lp_true); +        GradAdd(corpus[i].second, exp(lp_true), g); +        (*g)[0] += exp(lp_true); // bias +      } +    } +  } +  return cll; +} + +// return held-out log likelihood +double LearnParameters(const vector<pair<bool, SparseVector<double> > >& training, +                       const vector<pair<bool, SparseVector<double> > >& testing, +                       const double sigsq, +                       const unsigned memory_buffers, +                       vector<double>* px) { +  vector<double>& x = *px; +  vector<double> vg(FD::NumFeats(), 0.0); +  bool converged = false; +  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers); +  double tppl = 0.0; +  while(!converged) { +    fill(vg.begin(), vg.end(), 0.0); +    double cll = TrainingInference(x, training, &vg); +    double ppl = cll / log(2); +    ppl /= training.size(); +    ppl = pow(2.0, ppl); + +    // evaluate optional held-out test set +    if (testing.size()) { +      tppl = TrainingInference(x, testing) / log(2); +      tppl /= testing.size(); +      tppl = pow(2.0, tppl); +    } + +    // handle regularizer +#if 1 +    double norm = 0; +    for (int i = 1; i < x.size(); ++i) { +      const double mean_i = 0.0; +      const double param = (x[i] - mean_i); +      norm += param * param; +      vg[i] += param / sigsq; +    }  +    const double reg = norm / (2.0 * sigsq); +#else +    double reg = 0; +#endif +    cll += 
reg; +    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t"; +    try { +      vector<double> old_x = x; +      do { +        opt.Optimize(cll, vg, &x); +        converged = opt.HasConverged(); +      } while (!converged && x == old_x); +    } catch (...) { +      cerr << "Exception caught, assuming convergence is close enough...\n"; +      converged = true; +    } +    if (fabs(x[0]) > MAX_BIAS) { +      cerr << "Biased model learned. Are your training instances wrong?\n"; +      cerr << "  BIAS: " << x[0] << endl; +    } +  } +  return tppl; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  string line; +  vector<pair<bool, SparseVector<double> > > training, testing; +  SparseVector<double> old_weights; +  const bool tune_regularizer = conf.count("tune_regularizer"); +  if (tune_regularizer && !conf.count("testset")) { +    cerr << "--tune_regularizer requires --testset to be set\n"; +    return 1; +  } +  const double min_reg = conf["min_reg"].as<double>(); +  const double max_reg = conf["max_reg"].as<double>(); +  double sigsq = conf["sigma_squared"].as<double>(); +  assert(sigsq > 0.0); +  assert(min_reg > 0.0); +  assert(max_reg > 0.0); +  assert(max_reg > min_reg); +  const double psi = conf["interpolation"].as<double>(); +  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } +  if (conf.count("weights")) { +    Weights w; +    w.InitFromFile(conf["weights"].as<string>()); +    w.InitSparseVector(&old_weights); +  } +  ReadCorpus(&cin, &training); +  if (conf.count("testset")) { +    ReadFile rf(conf["testset"].as<string>()); +    ReadCorpus(rf.stream(), &testing); +  } +  cerr << "Number of features: " << FD::NumFeats() << endl; +  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias +  for (SparseVector<double>::const_iterator it = old_weights.begin(); +       it != old_weights.end(); ++it) +    x[it->first] = it->second; +  double tppl = 0.0; +  vector<pair<double,double> > sp; +  vector<double> smoothed; +  if (tune_regularizer) { +    sigsq = min_reg; +    const double steps = 18; +    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); +    cerr << "SWEEP FACTOR: " << sweep_factor << endl; +    while(sigsq < max_reg) { +      tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x); +      sp.push_back(make_pair(sigsq, tppl)); +      sigsq *= sweep_factor; +    } +    smoothed.resize(sp.size(), 0); +    smoothed[0] = sp[0].second; +    smoothed.back() = sp.back().second;  +    for (int i = 1; i < sp.size()-1; ++i) { +      double prev = sp[i-1].second; +      double next = sp[i+1].second; +      double cur = sp[i].second; +      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); +    } +    double best_ppl = 9999999; +    unsigned best_i = 0; +    for (unsigned i = 0; i < sp.size(); ++i) { +      if (smoothed[i] < best_ppl) { +        best_ppl = smoothed[i]; +        best_i = i; +      } +    } +    sigsq = sp[best_i].first; +    tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x); +  } +  Weights w; +  if (conf.count("weights")) { +    for (int i = 1; i < x.size(); ++i) +      x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi); +  } +  cout.precision(15); +  cout << "# sigma^2=" << sigsq << "\theld out perplexity="; +  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } +  if (sp.size()) { +    cout << "# Parameter sweep:\n"; +    for (int i = 
0; i < sp.size(); ++i) { +      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; +    } +  } +  w.InitFromVector(x); +  w.WriteToFile("-"); +  return 0; +}
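
Notes on the core algorithms in this commit.

The comment at the top of mr_pro_map.cc says its Sample() implements Figure 4 (Algorithm Sampler) of Hopkins & May (2011). As a reading aid, here is a minimal self-contained sketch of that sampling step. The names Hyp, Pair, and sample_pairs are illustrative, not the toolkit's API, and std::mt19937 stands in for cdec's MT19937:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

struct Hyp { double g; };   // metric score of one hypothesis (e.g. sentence BLEU)
struct Pair { std::size_t a, b; double gdiff; };

// gamma: candidate pairs to draw; xi: pairs to keep, ranked by |g(a)-g(b)|.
std::vector<Pair> sample_pairs(const std::vector<Hyp>& J,
                               unsigned gamma, unsigned xi,
                               std::mt19937& rng) {
  std::vector<Pair> v1, v2;
  if (J.size() < 2) return v2;
  std::uniform_int_distribution<std::size_t> pick(0, J.size() - 1);
  double avg_diff = 0;
  for (unsigned i = 0; i < gamma; ++i) {
    const std::size_t a = pick(rng);
    const std::size_t b = pick(rng);
    const double gdiff = std::fabs(J[a].g - J[b].g);
    if (a == b || gdiff == 0) continue;          // uninformative pair
    avg_diff += gdiff;
    v1.push_back(Pair{a, b, gdiff});
  }
  if (v1.empty()) return v2;
  avg_diff /= v1.size();
  // Accept a pair with probability sigmoid(avg_diff + gdiff), the same form
  // used in Sample() above: pairs whose metric scores differ a lot survive
  // more often.
  std::uniform_real_distribution<double> coin(0.0, 1.0);
  for (std::size_t i = 0; i < v1.size(); ++i)
    if (coin(rng) < 1.0 / (1.0 + std::exp(-avg_diff - v1[i].gdiff)))
      v2.push_back(v1[i]);
  // Retain only the xi pairs with the largest metric difference.
  if (xi < v2.size()) {
    std::partial_sort(v2.begin(), v2.begin() + xi, v2.end(),
        [](const Pair& x, const Pair& y) { return x.gdiff > y.gdiff; });
    v2.resize(xi);
  }
  return v2;
}

The real Sample() additionally records the feature-vector difference (J_i[a].x - J_i[b].x) for each kept pair, and main() emits every pair twice, once per orientation (y and !y with the negated vector), which is why the reducer can assume a balanced, zero-bias training set.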
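mr_pro_reduce.cc then fits a binary logistic-regression ranker to those instances with LBFGS under a Gaussian prior (the -s / sigma_squared option), optionally sweeping the regularization strength on a held-out set (-T). Its TrainingInference() uses the standard numerically stable log1p form of the logistic log-likelihood; below is a compact restatement with illustrative names (log_sigmoid, nll_and_grad), not the file's actual API:

#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable log sigma(z) = -log(1 + exp(-z)): never exponentiate
// a positive argument, so neither branch can overflow.
double log_sigmoid(double z) {
  return z <= 0.0 ? z - std::log1p(std::exp(z))
                  : -std::log1p(std::exp(-z));
}

// One example's contribution to the negative conditional log-likelihood and
// its gradient; y is the label, x the (difference) feature vector.
double nll_and_grad(bool y, const std::vector<double>& x,
                    const std::vector<double>& w, std::vector<double>* g) {
  double z = 0.0;                                  // z = w . x
  for (std::size_t i = 0; i < x.size(); ++i) z += w[i] * x[i];
  const double lp_true  = log_sigmoid(z);          // log P(y=1 | x)
  const double lp_false = log_sigmoid(-z);         // log P(y=0 | x)
  // d(-log P)/dw is -(1 - sigma(z)) * x for y=1 and sigma(z) * x for y=0.
  const double scale = y ? -std::exp(lp_false) : std::exp(lp_true);
  for (std::size_t i = 0; i < x.size(); ++i) (*g)[i] += scale * x[i];
  return y ? -lp_true : -lp_false;
}

With z = w.x, the gradient scale -exp(lp_false) for a positive example equals -(1 - sigma(z)), which is exactly the quantity passed to GradAdd() in the file above; the bias term there is just an extra coordinate x[0] handled the same way.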