-rw-r--r--  Makefile.am                 |   2
-rw-r--r--  configure.ac                |   2
-rw-r--r--  pro-train/Makefile.am       |  13
-rw-r--r--  pro-train/README.shared-mem |   9
-rwxr-xr-x  pro-train/dist-pro.pl       | 735
-rw-r--r--  pro-train/mr_pro_map.cc     | 111
-rw-r--r--  pro-train/mr_pro_reduce.cc  |  81
7 files changed, 951 insertions(+), 2 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index f5397d0b..98b4bac7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,7 @@
 # warning - the subdirectories in the following list should
 # be kept in topologically sorted order. Also, DO NOT introduce
 # cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest extools
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest pro-train extools
 #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
diff --git a/configure.ac b/configure.ac
index db2b4afc..4e708073 100644
--- a/configure.ac
+++ b/configure.ac
@@ -92,4 +92,4 @@ then
   AM_CONDITIONAL([GLC], true)
 fi
-AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
+AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
new file mode 100644
index 00000000..945ed5c3
--- /dev/null
+++ b/pro-train/Makefile.am
@@ -0,0 +1,13 @@
+bin_PROGRAMS = \
+  mr_pro_map \
+  mr_pro_reduce
+
+TESTS = lo_test
+
+mr_pro_map_SOURCES = mr_pro_map.cc
+mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_pro_reduce_SOURCES = mr_pro_reduce.cc
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/pro-train/README.shared-mem b/pro-train/README.shared-mem
new file mode 100644
index 00000000..7728efc0
--- /dev/null
+++ b/pro-train/README.shared-mem
@@ -0,0 +1,9 @@
+If you want to run dist-vest.pl on a very large shared memory machine, do the
+following:
+
+  ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
+
+This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
+decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
+on the system you are running on and the complexity of the models used for decoding.
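(Illustrative example, not part of this patch: on a 32-core machine a reasonable split might be --use-make 24 --decode-nodes 8, keeping most cores on the cheap line-search jobs while only a handful run the memory-heavy decoder instances.)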
+ diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl new file mode 100755 index 00000000..35bccea4 --- /dev/null +++ b/pro-train/dist-pro.pl @@ -0,0 +1,735 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +my $VEST_DIR="$SCRIPT_DIR/../vest"; +require "$VEST_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_pro_map"; +my $REDUCER = "$bin_dir/mr_pro_reduce"; +my $parallelize = "$VEST_DIR/parallelize.pl"; +my $libcall = "$VEST_DIR/libcall.pl"; +my $sentserver = "$VEST_DIR/sentserver"; +my $sentclient = "$VEST_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 400; +my $rand_directions = 15; +my $iteration = 1; +my $run_local = 0; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $decode_nodes = 15;   # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $bleu_weight=1; +my $use_make;  # use make to parallelize line search +my $dirargs=''; +my $density_prune; +my $usefork; +my $pass_suffix = ''; +my $cpbin=1; +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( +	"decoder=s" => \$decoderOpt, +	"decode-nodes=i" => \$decode_nodes, +	"density-prune=f" => \$density_prune, +	"dont-clean" => \$disable_clean, +	"pass-suffix=s" => \$pass_suffix, +        "use-fork" => \$usefork, +	"dry-run" => \$dryrun, +	"epsilon=s" => \$epsilon, +	"help" => \$help, +	"interval" => \$interval, +	"iteration=i" => \$iteration, +	"local" => \$run_local, +	"use-make=i" => \$use_make, +	"max-iterations=i" => \$max_iterations, +	"normalize=s" => \$normalize, +	"pmem=s" => \$pmem, +        "cpbin!" => \$cpbin, +	"rand-directions=i" => \$rand_directions, +	"random_directions=i" => \$rand_directions, +        "bleu_weight=s" => \$bleu_weight, +        "no-primary!" 
=> \$noprimary, +        "max-similarity=s" => \$maxsim, +        "oracle-directions=i" => \$oraclen, +        "n-oracle=i" => \$oraclen, +        "oracle-batch=i" => \$oracleb, +        "directions-args=s" => \$dirargs, +	"ref-files=s" => \$refFiles, +	"metric=s" => \$metric, +	"source-file=s" => \$srcFile, +	"weights=s" => \$initialWeights, +	"workdir=s" => \$dir, +    "opt-iterations=i" => \$optimization_iters, +) == 0 || @ARGV!=1 || $help) { +	print_help(); +	exit; +} + +if (defined $density_prune) { +  die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0; +} + +if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; } + +if ($metric =~ /^(combi|ter)$/i) { +  $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { +  $lines_per_mapper = 2000;   # start up time is really high +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { +  $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ +	$dir = "vest"; +} +unless ($dir =~ /^\//){  # convert relative path to absolute path +	my $basedir = check_output("pwd"); +	chomp $basedir; +	$dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { +	print STDERR "Cleanup...\n"; +	for my $pid (@childpids){ unchecked_call("kill $pid"); } +	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } +	exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit =  +    sub{ cleanup(); };  +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { +    local $_; +    my $bindir=shift; +    check_call("mkdir -p $bindir"); +    -d $bindir || die "couldn't make bindir $bindir"; +    for (@_) { +        my $src=$$_; +        $$_="$bindir/".basename($src); +        check_call("cp -p $src $$_"); +    } +} +sub dirsize { +    opendir ISEMPTY,$_[0]; +    return scalar(readdir(ISEMPTY))-1; +} +if ($dryrun){ +	write_config(*STDERR); +	exit 0; +} else { +	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs +	  die "ERROR: working dir $dir already exists\n\n"; +	} else { +		-e $dir || mkdir $dir; +		mkdir "$dir/hgs"; +        modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; +    mkdir "$dir/scripts"; +        my $cmdfile="$dir/rerun-vest.sh"; +        open CMD,'>',$cmdfile; +        print CMD "cd ",&getcwd,"\n"; +#        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
+        my $cline=&cmdline."\n"; +        print CMD $cline; +        close CMD; +        print STDERR $cline; +        chmod(0755,$cmdfile); +		unless (-e $initialWeights) { +			print STDERR "Please specify an initial weights file with --initial-weights\n"; +			print_help(); +			exit; +		} +		check_call("cp $initialWeights $dir/weights.0"); +		die "Can't find weights.0" unless (-e "$dir/weights.0"); +	} +	write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ +	print STDERR "\n\nITERATION $iteration\n==========\n"; + +	if ($iteration > $max_iterations){ +		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; +		last; +	} +	# iteration-specific files +	my $runFile="$dir/run.raw.$iteration"; +	my $onebestFile="$dir/1best.$iteration"; +	my $logdir="$dir/logs.$iteration"; +	my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; +	my $scorerLog="$logdir/scorer.log.$iteration"; +	check_call("mkdir -p $logdir"); + + +	#decode +	print STDERR "RUNNING DECODER AT "; +	print STDERR unchecked_output("date"); +	my $im1 = $iteration - 1; +	my $weightsFile="$dir/weights.$im1"; +	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; +	if ($density_prune) { +		$decoder_cmd .= " --density_prune $density_prune"; +	} +	my $pcmd; +	if ($run_local) { +		$pcmd = "cat $srcFile |"; +	} elsif ($use_make) { +	    # TODO: Throw error when decode_nodes is specified along with use_make +		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; +	} else { +		$pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --"; +	} +	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_bash_call($cmd); +        my $num_hgs; +        my $num_topbest; +        my $retries = 0; +	while($retries < 5) { +	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); +	    $num_topbest = check_output("wc -l < $runFile"); +	    print STDERR "NUMBER OF HGs: $num_hgs\n"; +	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; +	    if($devSize == $num_hgs && $devSize == $num_topbest) { +		last; +	    } else { +		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; +		sleep(3); +	    } +	    $retries++; +	} +	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); +	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); +	chomp $dec_score; +	print STDERR "DECODER SCORE: $dec_score\n"; + +	# save space +	check_call("gzip -f $runFile"); +	check_call("gzip -f $decoderLog"); + +	# run optimizer +	print STDERR "RUNNING OPTIMIZER AT "; +	print STDERR unchecked_output("date"); +	my $mergeLog="$logdir/prune-merge.log.$iteration"; + +	my $score = 0; +	my $icc = 0; +	my $inweights="$dir/weights.$im1"; +	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { +		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; +		print STDERR unchecked_output("date"); +		$icc++; +		my $nop=$noprimary?"--no_primary":""; +		my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):""; +		my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":""; +		$cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter"; +		print STDERR "COMMAND:\n$cmd\n"; +		check_call($cmd); +		check_call("mkdir -p $dir/splag.$im1"); +		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; +		print STDERR "COMMAND:\n$cmd\n"; +		check_call($cmd); +		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; +		my @shards = grep { /^mapinput\./ } readdir(DIR); +		closedir DIR; +		die "No shards!" unless scalar @shards > 0; +		my $joblist = ""; +		my $nmappers = 0; +		my @mapoutputs = (); +		@cleanupcmds = (); +		my %o2i = (); +		my $first_shard = 1; +		my $mkfile; # only used with makefiles +		my $mkfilename; +		if ($use_make) { +			$mkfilename = "$dir/splag.$im1/domap.mk"; +			open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; +			print $mkfile "all: $dir/splag.$im1/map.done\n\n"; +		} +		my @mkouts = ();  # only used with makefiles +		for my $shard (@shards) { +			my $mapoutput = $shard; +			my $client_name = $shard; +			$client_name =~ s/mapinput.//; +			$client_name = "vest.$client_name"; +			$mapoutput =~ s/mapinput/mapoutput/; +			push @mapoutputs, "$dir/splag.$im1/$mapoutput"; +			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; +			my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; +			if ($run_local) { +				print STDERR "COMMAND:\n$script\n"; +				check_bash_call($script); +			} elsif ($use_make) { +				my $script_file = "$dir/scripts/map.$shard"; +				open F, ">$script_file" or die "Can't write $script_file: $!"; +				print F "#!/bin/bash\n"; +				print F "$script\n"; +				close F; +				my $output = "$dir/splag.$im1/$mapoutput"; +				push @mkouts, $output; +				chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; +				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } +				print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; +			} else { +				my $script_file = "$dir/scripts/map.$shard"; +				open F, ">$script_file" or die "Can't write $script_file: $!"; +				print F "$script\n"; +				close F; +				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + +				$nmappers++; +				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; +				my $jobid = check_output("$qcmd"); +				chomp $jobid; +				
$jobid =~ s/^(\d+)(.*?)$/\1/g; +				$jobid =~ s/^Your job (\d+) .*$/\1/; +		 	 	push(@cleanupcmds, "qdel $jobid 2> /dev/null"); +				print STDERR " $jobid"; +				if ($joblist == "") { $joblist = $jobid; } +				else {$joblist = $joblist . "\|" . $jobid; } +			} +		} +		if ($run_local) { +			print STDERR "\nProcessing line search complete.\n"; +		} elsif ($use_make) { +			print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; +			close $mkfile; +			my $mcmd = "make -j $use_make -f $mkfilename"; +			print STDERR "\nExecuting: $mcmd\n"; +			check_call($mcmd); +		} else { +			print STDERR "\nLaunched $nmappers mappers.\n"; +      			sleep 8; +			print STDERR "Waiting for mappers to complete...\n"; +			while ($nmappers > 0) { +			  sleep 5; +			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); +			  $nmappers = scalar @livejobs; +			} +			print STDERR "All mappers complete.\n"; +		} +		my $tol = 0; +		my $til = 0; +		for my $mo (@mapoutputs) { +		  my $olines = get_lines($mo); +		  my $ilines = get_lines($o2i{$mo}); +		  $tol += $olines; +		  $til += $ilines; +		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; +		} +		print STDERR "Results for $tol/$til lines\n"; +		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; +		print STDERR unchecked_output("date"); +		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; +		print STDERR "COMMAND:\n$cmd\n"; +		check_bash_call($cmd); +		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; +		# sort returns failure even when it doesn't fail for some reason +		my $best=unchecked_output("$cmd"); chomp $best; +		print STDERR "$best\n"; +		my ($oa, $x, $xscore) = split /\|/, $best; +		$score = $xscore; +		print STDERR "PROJECTED SCORE: $score\n"; +		if (abs($x) < $epsilon) { +			print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; +			last; +		} +                my $psd = $score - $last_score; +                $last_score = $score; +		if (abs($psd) < $epsilon) { +			print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; +			last; +		} +		my ($origin, $axis) = split /\s+/, $oa; + +		my %ori = convert($origin); +		my %axi = convert($axis); + +		my $finalFile="$dir/weights.$im1-$opt_iter"; +		open W, ">$finalFile" or die "Can't write: $finalFile: $!"; +                my $norm = 0; +		for my $k (sort keys %ori) { +			my $dd = $ori{$k} + $axi{$k} * $x; +                        $norm += $dd * $dd; +		} +                $norm = sqrt($norm); +		$norm = 1; +		for my $k (sort keys %ori) { +			my $v = ($ori{$k} + $axi{$k} * $x) / $norm; +			print W "$k $v\n"; +		} +		check_call("rm $dir/splag.$im1/*"); +		$inweights = $finalFile; +	} +	$lastWeightsFile = "$dir/weights.$iteration"; +	check_call("cp $inweights $lastWeightsFile"); +	if ($icc < 2) { +		print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; +		last; +	} +	$lastPScore = $score; +	$iteration++; +	print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub normalize_weights { +  my ($rfn, $rpts, $feat) = @_; +  my @feat_names = @$rfn; +  my @pts = @$rpts; +  my $z = 1.0; +  for (my $i=0; $i < scalar @feat_names; $i++) { +    if ($feat_names[$i] eq $feat) { +      $z = $pts[$i]; +      last; +    } +  } +  for (my $i=0; $i < scalar @feat_names; $i++) { +    $pts[$i] /= 
$z; +  } +  print STDERR " NORM WEIGHTS: @pts\n"; +  return @pts; +} + +sub get_lines { +  my $fn = shift @_; +  open FL, "<$fn" or die "Couldn't read $fn: $!"; +  my $lc = 0; +  while(<FL>) { $lc++; } +  return $lc; +} + +sub get_comma_sep_refs { +  my ($r,$p) = @_; +  my $o = check_output("echo $p"); +  chomp $o; +  my @files = split /\s+/, $o; +  return "-$r " . join(" -$r ", @files); +} + +sub read_weights_file { +  my ($file) = @_; +  open F, "<$file" or die "Couldn't read $file: $!"; +  my @r = (); +  my $pm = -1; +  while(<F>) { +    next if /^#/; +    next if /^\s*$/; +    chomp; +    if (/^(.+)\s+(.+)$/) { +      my $m = $1; +      my $w = $2; +      die "Weights out of order: $m <= $pm" unless $m > $pm; +      push @r, $w; +    } else { +      warn "Unexpected feature name in weight file: $_"; +    } +  } +  close F; +  return join ' ', @r; +} + +# subs +sub write_config { +	my $fh = shift; +	my $cleanup = "yes"; +	if ($disable_clean) {$cleanup = "no";} + +	print $fh "\n"; +	print $fh "DECODER:          $decoder\n"; +	print $fh "INI FILE:         $iniFile\n"; +	print $fh "WORKING DIR:      $dir\n"; +	print $fh "SOURCE (DEV):     $srcFile\n"; +	print $fh "REFS (DEV):       $refFiles\n"; +	print $fh "EVAL METRIC:      $metric\n"; +	print $fh "START ITERATION:  $iteration\n"; +	print $fh "MAX ITERATIONS:   $max_iterations\n"; +	print $fh "DECODE NODES:     $decode_nodes\n"; +	print $fh "HEAD NODE:        $host\n"; +	print $fh "PMEM (DECODING):  $pmem\n"; +	print $fh "CLEANUP:          $cleanup\n"; +	print $fh "INITIAL WEIGHTS:  $initialWeights\n"; +} + +sub update_weights_file { +  my ($neww, $rfn, $rpts) = @_; +  my @feats = @$rfn; +  my @pts = @$rpts; +  my $num_feats = scalar @feats; +  my $num_pts = scalar @pts; +  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; +  open G, ">$neww" or die; +  for (my $i = 0; $i < $num_feats; $i++) { +    my $f = $feats[$i]; +    my $lambda = $pts[$i]; +    print G "$f $lambda\n"; +  } +  close G; +} + +sub enseg { +	my $src = shift; +	my $newsrc = shift; +	open(SRC, $src); +	open(NEWSRC, ">$newsrc"); +	my $i=0; +	while (my $line=<SRC>){ +		chomp $line; +		if ($line =~ /^\s*<seg/i) { +		    if($line =~ /id="[0-9]+"/) { +			print NEWSRC "$line\n"; +		    } else { +			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; +		    } +		} else { +			print NEWSRC "<seg id=\"$i\">$line</seg>\n"; +		} +		$i++; +	} +	close SRC; +	close NEWSRC; +} + +sub print_help { + +	my $executable = check_output("basename $0"); chomp $executable; +    print << "Help"; + +Usage: $executable [options] <ini file> + +	$executable [options] <ini file> +		Runs a complete MERT optimization and test set decoding, using +		the decoder configuration in ini file.  Note that many of the +		options have default values that are inferred automatically +		based on certain conventions.  For details, refer to descriptions +		of the options --decoder, --weights, and --workdir. + +Options: + +	--local +		Run the decoder and optimizer locally with a single thread. + +	--use-make <I> +		Use make -j <I> to run the optimizer commands (useful on large +		shared-memory machines where qsub is unavailable). + +	--decode-nodes <I> +		Number of decoder processes to run in parallel. [default=15] + +	--decoder <decoder path> +		Decoder binary to use. + +	--density-prune <N> +		Limit the density of the hypergraph on each iteration to N times +		the number of edges on the Viterbi path. 
+ +	--help +		Print this message and exit. + +	--iteration <I> +		Starting iteration number.  If not specified, defaults to 1. + +	--max-iterations <M> +		Maximum number of iterations to run.  If not specified, defaults +		to 10. + +	--pass-suffix <S> +		If the decoder is doing multi-pass decoding, the pass suffix "2", +		"3", etc., is used to control what iteration of weights is set. + +	--pmem <N> +		Amount of physical memory requested for parallel decoding jobs. + +	--ref-files <files> +		Dev set ref files.  This option takes only a single string argument. +		To use multiple files (including file globbing), this argument should +		be quoted. + +	--metric <method> +		Metric to optimize. +		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + +	--normalize <feature-name> +		After each iteration, rescale all feature weights such that feature- +		name has a weight of 1.0. + +	--rand-directions <num> +		MERT will attempt to optimize along all of the principle directions, +		set this parameter to explore other directions. Defaults to 5. + +	--source-file <file> +		Dev set source file. + +	--weights <file> +		A file specifying initial feature weights.  The format is +		FeatureName_1 value1 +		FeatureName_2 value2 + +	--workdir <dir> +		Directory for intermediate and output files.  If not specified, the +		name is derived from the ini filename.  Assuming that the ini +		filename begins with the decoder name and ends with ini, the default +		name of the working directory is inferred from the middle part of +		the filename.  E.g. an ini file named decoder.foo.ini would have +		a default working directory name foo. + +Help +} + +sub convert { +  my ($str) = @_; +  my @ps = split /;/, $str; +  my %dict = (); +  for my $p (@ps) { +    my ($k, $v) = split /=/, $p; +    $dict{$k} = $v; +  } +  return %dict; +} + + + +sub cmdline { +    return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? 
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+sub escaped_shell_args {
+    return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+    return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
new file mode 100644
index 00000000..b046cdea
--- /dev/null
+++ b/pro-train/mr_pro_map.cc
@@ -0,0 +1,111 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sampler.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "scorer.h"
+#include "inside_outside.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+
+// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
+
+using namespace std;
+namespace po = boost::program_options;
+
+boost::shared_ptr<MT19937> rng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("weights,w",po::value<string>(), "[REQD] Current weights file")
+        ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
+        ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
+        ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  bool flag = false;
+  if (!conf->count("reference")) {
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+    flag = true;
+  }
+  if (flag || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+struct HypInfo {
+  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-1), x(feats) {}
+  double g() {
+    return g_;
+  }
+ private:
+  int sent_id;
+  vector<WordID> hyp;
+  double g_;
+ public:
+  SparseVector<double> x;
+};
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+  const string loss_function = conf["loss_function"].as<string>();
+  ScoreType type = ScoreTypeFromString(loss_function);
+  DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+  Hypergraph hg;
+  string last_file;
+  ReadFile in_read(conf["input"].as<string>());
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
+  const unsigned xi = conf["best_pairs"].as<unsigned>();
+  while(in) {
+    string line;
+    getline(in, line);
+    if (line.empty()) continue;
+    istringstream is(line);
+    int sent_id;
+    string file;
+    // path-to-file (JSON) sent_id
+    is >> file >> sent_id;
+    ReadFile rf(file);
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+    vector<HypInfo> J_i;
+    for (int i = 0; i < kbest_size; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+      if (!d) break;
+      float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
+      // if (invert_score) sentscore *= -1.0;
+      // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
+      d->feature_values;
+      sentscore;
+    }
+  }
+  return 0;
+}
+
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
new file mode 100644
index 00000000..3df52020
--- /dev/null
+++ b/pro-train/mr_pro_reduce.cc
@@ -0,0 +1,81 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "b64tools.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  bool flag = conf->count("loss_function") == 0;
+  if (flag || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string loss_function = conf["loss_function"].as<string>();
+  ScoreType type = ScoreTypeFromString(loss_function);
+  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
+  if (type == TER || type == AER) {
+    opt_type = LineOptimizer::MINIMIZE_SCORE;
+  }
+  string last_key;
+  vector<ErrorSurface> esv;
+  while(cin) {
+    string line;
+    getline(cin, line);
+    if (line.empty()) continue;
+    size_t ks = line.find("\t");
+    assert(string::npos != ks);
+    assert(ks > 2);
+    string key = line.substr(2, ks - 2);
+    string val = line.substr(ks + 1);
+    if (key != last_key) {
+      if (!last_key.empty()) {
+        float score;
+        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+        cout << last_key << "|" << x << "|" << score << endl;
+      }
+      last_key = key;
+      esv.clear();
+    }
+    if (val.size() % 4 != 0) {
+      cerr << "B64 encoding error 1! Skipping.\n";
+      continue;
+    }
+    string encoded(val.size() / 4 * 3, '\0');
+    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+      cerr << "B64 encoding error 2! Skipping.\n";
+      continue;
+    }
+    esv.push_back(ErrorSurface());
+    esv.back().Deserialize(type, encoded);
+  }
+  if (!esv.empty()) {
+    // cerr << "ESV=" << esv.size() << endl;
+    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
+    float score;
+    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+    cout << last_key << "|" << x << "|" << score << endl;
+  }
+  return 0;
+}
