From 95deb840699f9b6f8fe499b374bd726bce97365c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 10 Jul 2011 23:00:21 -0400 Subject: starting implementation of Hopkins&May (2011) optimizer --- pro-train/dist-pro.pl | 735 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 735 insertions(+) create mode 100755 pro-train/dist-pro.pl (limited to 'pro-train/dist-pro.pl') diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl new file mode 100755 index 00000000..35bccea4 --- /dev/null +++ b/pro-train/dist-pro.pl @@ -0,0 +1,735 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +my $VEST_DIR="$SCRIPT_DIR/../vest"; +require "$VEST_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_pro_map"; +my $REDUCER = "$bin_dir/mr_pro_reduce"; +my $parallelize = "$VEST_DIR/parallelize.pl"; +my $libcall = "$VEST_DIR/libcall.pl"; +my $sentserver = "$VEST_DIR/sentserver"; +my $sentclient = "$VEST_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 400; +my $rand_directions = 15; +my $iteration = 1; +my $run_local = 0; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $decode_nodes = 15; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $bleu_weight=1; +my $use_make; # use make to parallelize line search +my $dirargs=''; +my $density_prune; +my $usefork; +my $pass_suffix = ''; +my $cpbin=1; +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "decode-nodes=i" => \$decode_nodes, + "density-prune=f" => \$density_prune, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "use-fork" => \$usefork, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "interval" => \$interval, + "iteration=i" => \$iteration, + "local" => \$run_local, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "normalize=s" => \$normalize, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "rand-directions=i" => \$rand_directions, + "random_directions=i" => \$rand_directions, + "bleu_weight=s" => \$bleu_weight, + "no-primary!" => \$noprimary, + "max-similarity=s" => \$maxsim, + "oracle-directions=i" => \$oraclen, + "n-oracle=i" => \$oraclen, + "oracle-batch=i" => \$oracleb, + "directions-args=s" => \$dirargs, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "workdir=s" => \$dir, + "opt-iterations=i" => \$optimization_iters, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +if (defined $density_prune) { + die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0; +} + +if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; } + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "vest"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-vest.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + check_call("cp $initialWeights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while() { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + if ($density_prune) { + $decoder_cmd .= " --density_prune $density_prune"; + } + my $pcmd; + if ($run_local) { + $pcmd = "cat $srcFile |"; + } elsif ($use_make) { + # TODO: Throw error when decode_nodes is specified along with use_make + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; + } else { + $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + my $nop=$noprimary?"--no_primary":""; + my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):""; + my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":""; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "vest.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + if ($run_local) { + print STDERR "COMMAND:\n$script\n"; + check_bash_call($script); + } elsif ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } + } + } + if ($run_local) { + print STDERR "\nProcessing line search complete.\n"; + } elsif ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $use_make -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub normalize_weights { + my ($rfn, $rpts, $feat) = @_; + my @feat_names = @$rfn; + my @pts = @$rpts; + my $z = 1.0; + for (my $i=0; $i < scalar @feat_names; $i++) { + if ($feat_names[$i] eq $feat) { + $z = $pts[$i]; + last; + } + } + for (my $i=0; $i < scalar @feat_names; $i++) { + $pts[$i] /= $z; + } + print STDERR " NORM WEIGHTS: @pts\n"; + return @pts; +} + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while() { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while() { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=){ + chomp $line; + if ($line =~ /^\s* tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "$line\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete MERT optimization and test set decoding, using + the decoder configuration in ini file. Note that many of the + options have default values that are inferred automatically + based on certain conventions. For details, refer to descriptions + of the options --decoder, --weights, and --workdir. + +Options: + + --local + Run the decoder and optimizer locally with a single thread. + + --use-make + Use make -j to run the optimizer commands (useful on large + shared-memory machines where qsub is unavailable). + + --decode-nodes + Number of decoder processes to run in parallel. [default=15] + + --decoder + Decoder binary to use. + + --density-prune + Limit the density of the hypergraph on each iteration to N times + the number of edges on the Viterbi path. + + --help + Print this message and exit. + + --iteration + Starting iteration number. If not specified, defaults to 1. + + --max-iterations + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pass-suffix + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --pmem + Amount of physical memory requested for parallel decoding jobs. + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --normalize + After each iteration, rescale all feature weights such that feature- + name has a weight of 1.0. + + --rand-directions + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --source-file + Dev set source file. + + --weights + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + + --workdir + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} -- cgit v1.2.3 From bde4a34bab96052570c248f7d9ccc299a9a3f097 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 11 Jul 2011 20:39:45 -0400 Subject: sort of working hopkins&may optimizer --- pro-train/Makefile.am | 4 +- pro-train/dist-pro.pl | 308 ++++++++++-------------------- pro-train/mr_pro_generate_mapper_input.pl | 18 ++ pro-train/mr_pro_map.cc | 118 ++++++++++-- pro-train/mr_pro_reduce.cc | 167 ++++++++++++---- 5 files changed, 349 insertions(+), 266 deletions(-) create mode 100755 pro-train/mr_pro_generate_mapper_input.pl (limited to 'pro-train/dist-pro.pl') diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am index 945ed5c3..fdaf43e2 100644 --- a/pro-train/Makefile.am +++ b/pro-train/Makefile.am @@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz mr_pro_reduce_SOURCES = mr_pro_reduce.cc -mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz +mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index 35bccea4..55d7f1fa 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -21,7 +21,7 @@ my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; my $FAST_SCORE="$bin_dir/../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; -my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input"; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl"; my $MAPPER = "$bin_dir/mr_pro_map"; my $REDUCER = "$bin_dir/mr_pro_reduce"; my $parallelize = "$VEST_DIR/parallelize.pl"; @@ -37,8 +37,7 @@ die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; my $decoder = $cdec; -my $lines_per_mapper = 400; -my $rand_directions = 15; +my $lines_per_mapper = 100; my $iteration = 1; my $run_local = 0; my $best_weights; @@ -58,7 +57,6 @@ my $metric = "ibm_bleu"; my $dir; my $iniFile; my $weights; -my $initialWeights; my $decoderOpt; my $noprimary; my $maxsim=0; @@ -67,7 +65,6 @@ my $oracleb=20; my $bleu_weight=1; my $use_make; # use make to parallelize line search my $dirargs=''; -my $density_prune; my $usefork; my $pass_suffix = ''; my $cpbin=1; @@ -76,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( "decoder=s" => \$decoderOpt, "decode-nodes=i" => \$decode_nodes, - "density-prune=f" => \$density_prune, "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, "use-fork" => \$usefork, @@ -91,8 +87,6 @@ if (GetOptions( "normalize=s" => \$normalize, "pmem=s" => \$pmem, "cpbin!" => \$cpbin, - "rand-directions=i" => \$rand_directions, - "random_directions=i" => \$rand_directions, "bleu_weight=s" => \$bleu_weight, "no-primary!" => \$noprimary, "max-similarity=s" => \$maxsim, @@ -103,18 +97,12 @@ if (GetOptions( "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, - "weights=s" => \$initialWeights, "workdir=s" => \$dir, - "opt-iterations=i" => \$optimization_iters, ) == 0 || @ARGV!=1 || $help) { print_help(); exit; } -if (defined $density_prune) { - die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0; -} - if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; } if ($metric =~ /^(combi|ter)$/i) { @@ -146,7 +134,7 @@ if ($metric =~ /^ter$|^aer$/i) { my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); unless ($dir){ - $dir = "vest"; + $dir = "protrain"; } unless ($dir =~ /^\//){ # convert relative path to absolute path my $basedir = check_output("pwd"); @@ -203,18 +191,19 @@ sub dirsize { opendir ISEMPTY,$_[0]; return scalar(readdir(ISEMPTY))-1; } +my @allweights; if ($dryrun){ write_config(*STDERR); exit 0; } else { - if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs die "ERROR: working dir $dir already exists\n\n"; } else { -e $dir || mkdir $dir; mkdir "$dir/hgs"; modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; mkdir "$dir/scripts"; - my $cmdfile="$dir/rerun-vest.sh"; + my $cmdfile="$dir/rerun-pro.sh"; open CMD,'>',$cmdfile; print CMD "cd ",&getcwd,"\n"; # print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. @@ -223,13 +212,8 @@ if ($dryrun){ close CMD; print STDERR $cline; chmod(0755,$cmdfile); - unless (-e $initialWeights) { - print STDERR "Please specify an initial weights file with --initial-weights\n"; - print_help(); - exit; - } - check_call("cp $initialWeights $dir/weights.0"); - die "Can't find weights.0" unless (-e "$dir/weights.0"); + check_call("touch $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); } write_config(*STDERR); } @@ -255,6 +239,7 @@ my $random_seed = int(time / 1000); my $lastWeightsFile; my $lastPScore = 0; # main optimization loop +my @mapoutputs = (); # aggregate map outputs over all iters while (1){ print STDERR "\n\nITERATION $iteration\n==========\n"; @@ -276,10 +261,8 @@ while (1){ print STDERR unchecked_output("date"); my $im1 = $iteration - 1; my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; - if ($density_prune) { - $decoder_cmd .= " --density_prune $density_prune"; - } my $pcmd; if ($run_local) { $pcmd = "cat $srcFile |"; @@ -320,163 +303,111 @@ while (1){ # run optimizer print STDERR "RUNNING OPTIMIZER AT "; print STDERR unchecked_output("date"); + print STDERR " - GENERATE TRAINING EXEMPLARS\n"; my $mergeLog="$logdir/prune-merge.log.$iteration"; my $score = 0; my $icc = 0; my $inweights="$dir/weights.$im1"; - for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { - print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; - print STDERR unchecked_output("date"); - $icc++; - my $nop=$noprimary?"--no_primary":""; - my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):""; - my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":""; - $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter"; - print STDERR "COMMAND:\n$cmd\n"; - check_call($cmd); - check_call("mkdir -p $dir/splag.$im1"); - $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; - print STDERR "COMMAND:\n$cmd\n"; - check_call($cmd); - opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; - my @shards = grep { /^mapinput\./ } readdir(DIR); - closedir DIR; - die "No shards!" unless scalar @shards > 0; - my $joblist = ""; - my $nmappers = 0; - my @mapoutputs = (); - @cleanupcmds = (); - my %o2i = (); - my $first_shard = 1; - my $mkfile; # only used with makefiles - my $mkfilename; - if ($use_make) { - $mkfilename = "$dir/splag.$im1/domap.mk"; - open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; - print $mkfile "all: $dir/splag.$im1/map.done\n\n"; - } - my @mkouts = (); # only used with makefiles - for my $shard (@shards) { - my $mapoutput = $shard; - my $client_name = $shard; - $client_name =~ s/mapinput.//; - $client_name = "vest.$client_name"; - $mapoutput =~ s/mapinput/mapoutput/; - push @mapoutputs, "$dir/splag.$im1/$mapoutput"; - $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; - if ($run_local) { - print STDERR "COMMAND:\n$script\n"; - check_bash_call($script); - } elsif ($use_make) { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "#!/bin/bash\n"; - print F "$script\n"; - close F; - my $output = "$dir/splag.$im1/$mapoutput"; - push @mkouts, $output; - chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; - if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; - } else { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "$script\n"; - close F; - if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - - $nmappers++; - my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; - my $jobid = check_output("$qcmd"); - chomp $jobid; - $jobid =~ s/^(\d+)(.*?)$/\1/g; - $jobid =~ s/^Your job (\d+) .*$/\1/; - push(@cleanupcmds, "qdel $jobid 2> /dev/null"); - print STDERR " $jobid"; - if ($joblist == "") { $joblist = $jobid; } - else {$joblist = $joblist . "\|" . $jobid; } - } - } + $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "pro.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; if ($run_local) { - print STDERR "\nProcessing line search complete.\n"; + print STDERR "COMMAND:\n$script\n"; + check_bash_call($script); } elsif ($use_make) { - print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; - close $mkfile; - my $mcmd = "make -j $use_make -f $mkfilename"; - print STDERR "\nExecuting: $mcmd\n"; - check_call($mcmd); + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; } else { - print STDERR "\nLaunched $nmappers mappers.\n"; - sleep 8; - print STDERR "Waiting for mappers to complete...\n"; - while ($nmappers > 0) { - sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); - $nmappers = scalar @livejobs; - } - print STDERR "All mappers complete.\n"; + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } } - my $tol = 0; - my $til = 0; - for my $mo (@mapoutputs) { - my $olines = get_lines($mo); - my $ilines = get_lines($o2i{$mo}); - $tol += $olines; - $til += $ilines; - die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; - } - print STDERR "Results for $tol/$til lines\n"; - print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; - print STDERR unchecked_output("date"); - $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; - print STDERR "COMMAND:\n$cmd\n"; - check_bash_call($cmd); - $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; - # sort returns failure even when it doesn't fail for some reason - my $best=unchecked_output("$cmd"); chomp $best; - print STDERR "$best\n"; - my ($oa, $x, $xscore) = split /\|/, $best; - $score = $xscore; - print STDERR "PROJECTED SCORE: $score\n"; - if (abs($x) < $epsilon) { - print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; - last; - } - my $psd = $score - $last_score; - $last_score = $score; - if (abs($psd) < $epsilon) { - print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; - last; - } - my ($origin, $axis) = split /\s+/, $oa; - - my %ori = convert($origin); - my %axi = convert($axis); - - my $finalFile="$dir/weights.$im1-$opt_iter"; - open W, ">$finalFile" or die "Can't write: $finalFile: $!"; - my $norm = 0; - for my $k (sort keys %ori) { - my $dd = $ori{$k} + $axi{$k} * $x; - $norm += $dd * $dd; - } - $norm = sqrt($norm); - $norm = 1; - for my $k (sort keys %ori) { - my $v = ($ori{$k} + $axi{$k} * $x) / $norm; - print W "$k $v\n"; + } + if ($run_local) { + print STDERR "\nCompleted extraction of training exemplars.\n"; + } elsif ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $use_make -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; } - check_call("rm $dir/splag.$im1/*"); - $inweights = $finalFile; + print STDERR "All mappers complete.\n"; } - $lastWeightsFile = "$dir/weights.$iteration"; - check_call("cp $inweights $lastWeightsFile"); - if ($icc < 2) { - print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; - last; + my $tol = 0; + my $til = 0; + print STDERR "MO: @mapoutputs\n"; + for my $mo (@mapoutputs) { + #my $olines = get_lines($mo); + #my $ilines = get_lines($o2i{$mo}); + #die "$mo: no training instances generated!" if $olines == 0; } + print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; + print STDERR unchecked_output("date"); + $cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $lastWeightsFile = "$dir/weights.$iteration"; $lastPScore = $score; $iteration++; print STDERR "\n==========\n"; @@ -488,24 +419,6 @@ print STDOUT "$lastWeightsFile\n"; exit 0; -sub normalize_weights { - my ($rfn, $rpts, $feat) = @_; - my @feat_names = @$rfn; - my @pts = @$rpts; - my $z = 1.0; - for (my $i=0; $i < scalar @feat_names; $i++) { - if ($feat_names[$i] eq $feat) { - $z = $pts[$i]; - last; - } - } - for (my $i=0; $i < scalar @feat_names; $i++) { - $pts[$i] /= $z; - } - print STDERR " NORM WEIGHTS: @pts\n"; - return @pts; -} - sub get_lines { my $fn = shift @_; open FL, "<$fn" or die "Couldn't read $fn: $!"; @@ -563,7 +476,6 @@ sub write_config { print $fh "HEAD NODE: $host\n"; print $fh "PMEM (DECODING): $pmem\n"; print $fh "CLEANUP: $cleanup\n"; - print $fh "INITIAL WEIGHTS: $initialWeights\n"; } sub update_weights_file { @@ -603,6 +515,7 @@ sub enseg { } close SRC; close NEWSRC; + die "Empty dev set!" if ($i == 0); } sub print_help { @@ -634,10 +547,6 @@ Options: --decoder Decoder binary to use. - --density-prune - Limit the density of the hypergraph on each iteration to N times - the number of edges on the Viterbi path. - --help Print this message and exit. @@ -668,18 +577,9 @@ Options: After each iteration, rescale all feature weights such that feature- name has a weight of 1.0. - --rand-directions - MERT will attempt to optimize along all of the principle directions, - set this parameter to explore other directions. Defaults to 5. - --source-file Dev set source file. - --weights - A file specifying initial feature weights. The format is - FeatureName_1 value1 - FeatureName_2 value2 - --workdir Directory for intermediate and output files. If not specified, the name is derived from the ini filename. Assuming that the ini diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro-train/mr_pro_generate_mapper_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/pro-train/mr_pro_generate_mapper_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index b046cdea..128d93ce 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -10,6 +10,7 @@ #include "sampler.h" #include "filelib.h" #include "stringlib.h" +#include "weights.h" #include "scorer.h" #include "inside_outside.h" #include "hg_io.h" @@ -27,10 +28,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") - ("source,s",po::value(), "Source file (ignored, except for AER)") + ("source,s",po::value()->default_value(""), "Source file (ignored, except for AER)") ("loss_function,l",po::value()->default_value("ibm_bleu"), "Loss function being optimized") ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") - ("weights,w",po::value(), "[REQD] Current weights file") + ("weights,w",po::value >(), "[REQD] Weights files from previous and current iterations") ("kbest_size,k",po::value()->default_value(1500u), "Top k-hypotheses to extract") ("candidate_pairs,G", po::value()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") ("best_pairs,X", po::value()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") @@ -44,6 +45,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { cerr << "Please specify one or more references using -r \n"; flag = true; } + if (!conf->count("weights")) { + cerr << "Please specify one or more weights using -w \n"; + flag = true; + } if (flag || conf->count("help")) { cerr << dcmdline_options << endl; exit(1); @@ -51,18 +56,78 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } struct HypInfo { - HypInfo(const vector& h, const SparseVector& feats) : hyp(h), g_(-1), x(feats) {} - double g() { + HypInfo(const vector& h, const SparseVector& feats) : hyp(h), g_(-100.0), x(feats) {} + + // lazy evaluation + double g(const SentenceScorer& scorer) const { + if (g_ == -100.0) + g_ = scorer.ScoreCandidate(hyp)->ComputeScore(); return g_; } - private: - int sent_id; vector hyp; - double g_; + mutable double g_; public: SparseVector x; }; +struct ThresholdAlpha { + explicit ThresholdAlpha(double t = 0.05) : threshold(t) {} + double operator()(double mag) const { + if (mag < threshold) return 0.0; else return 1.0; + } + const double threshold; +}; + +struct TrainingInstance { + TrainingInstance(const SparseVector& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {} + SparseVector x; +#ifdef DEBUGGING_PRO + vector a; + vector b; +#endif + bool y; + double gdiff; +}; + +struct DiffOrder { + bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { + return a.gdiff > b.gdiff; + } +}; + +template +void Sample(const unsigned gamma, const unsigned xi, const vector& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector* pv) { + vector v; + for (unsigned i = 0; i < gamma; ++i) { + size_t a = rng->inclusive(0, J_i.size() - 1)(); + size_t b = rng->inclusive(0, J_i.size() - 1)(); + if (a == b) continue; + double ga = J_i[a].g(scorer); + double gb = J_i[b].g(scorer); + bool positive = ga < gb; + if (invert_score) positive = !positive; + double gdiff = fabs(ga - gb); + if (!gdiff) continue; + if (rng->next() < alpha_i(gdiff)) { + v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff)); +#ifdef DEBUGGING_PRO + v.back().a = J_i[a].hyp; + v.back().b = J_i[b].hyp; +#endif + } + } + vector::iterator mid = v.begin() + xi; + if (xi > v.size()) mid = v.end(); + partial_sort(v.begin(), mid, v.end(), DiffOrder()); + copy(v.begin(), mid, back_inserter(*pv)); +#ifdef DEBUGGING_PRO + if (v.size() >= 5) + for (int i =0; i < 5; ++i) { + cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl; + } +#endif +} + int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); @@ -81,7 +146,15 @@ int main(int argc, char** argv) { const unsigned kbest_size = conf["kbest_size"].as(); const unsigned gamma = conf["candidate_pairs"].as(); const unsigned xi = conf["best_pairs"].as(); + vector weights_files = conf["weights"].as >(); + vector > weights(weights_files.size()); + for (int i = 0; i < weights.size(); ++i) { + Weights w; + w.InitFromFile(weights_files[i]); + w.InitVector(&weights[i]); + } while(in) { + vector v; string line; getline(in, line); if (line.empty()) continue; @@ -92,18 +165,27 @@ int main(int argc, char** argv) { is >> file >> sent_id; ReadFile rf(file); HypergraphIO::ReadFromJSON(rf.stream(), &hg); - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, kbest_size); - vector J_i; - for (int i = 0; i < kbest_size; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); - // if (invert_score) sentscore *= -1.0; - // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; - d->feature_values; - sentscore; + int start = weights.size(); + start -= 4; + if (start < 0) start = 0; + for (int i = start; i < weights.size(); ++i) { + hg.Reweight(weights[i]); + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, kbest_size); + + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + J_i.push_back(HypInfo(d->yield, d->feature_values)); + } + } + + Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v); + for (unsigned i = 0; i < v.size(); ++i) { + const TrainingInstance& vi = v[i]; + cout << vi.y << "\t" << vi.x << endl; + cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl; } } return 0; diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 3df52020..2b9c5ce7 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -6,24 +7,29 @@ #include #include +#include "weights.h" #include "sparse_vector.h" -#include "error_surface.h" -#include "line_optimizer.h" -#include "b64tools.h" +#include "optimize.h" using namespace std; namespace po = boost::program_options; +// since this is a ranking model, there should be equal numbers of +// positive and negative examples so the bias should be 0 +static const double MAX_BIAS = 1e-10; + void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("loss_function,l",po::value(), "Loss function being optimized") + ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") + ("interpolation,p",po::value()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") + ("memory_buffers,m",po::value()->default_value(200), "Number of memory buffers (LBFGS)") + ("sigma_squared,s",po::value()->default_value(0.5), "Sigma squared for Gaussian prior") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("loss_function") == 0; - if (flag || conf->count("help")) { + if (conf->count("help")) { cerr << dcmdline_options << endl; exit(1); } @@ -32,50 +38,127 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); - LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (type == TER || type == AER) { - opt_type = LineOptimizer::MINIMIZE_SCORE; + string line; + vector > > training; + int lc = 0; + bool flag = false; + SparseVector old_weights; + const double psi = conf["interpolation"].as(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } + if (conf.count("weights")) { + Weights w; + w.InitFromFile(conf["weights"].as()); + w.InitSparseVector(&old_weights); } - string last_key; - vector esv; - while(cin) { - string line; - getline(cin, line); + while(getline(cin, line)) { + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } if (line.empty()) continue; - size_t ks = line.find("\t"); + const size_t ks = line.find("\t"); assert(string::npos != ks); - assert(ks > 2); - string key = line.substr(2, ks - 2); - string val = line.substr(ks + 1); - if (key != last_key) { - if (!last_key.empty()) { - float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); - cout << last_key << "|" << x << "|" << score << endl; + assert(ks == 1); + const bool y = line[0] == '1'; + SparseVector x; + size_t last_start = ks + 1; + size_t last_comma = string::npos; + size_t cur = last_start; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const double val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } + training.push_back(make_pair(y, x)); + } + if (flag) cerr << endl; + + cerr << "Number of features: " << FD::NumFeats() << endl; + vector x(FD::NumFeats(), 0.0); // x[0] is bias + for (SparseVector::const_iterator it = old_weights.begin(); + it != old_weights.end(); ++it) + x[it->first] = it->second; + vector vg(FD::NumFeats(), 0.0); + SparseVector g; + bool converged = false; + LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as()); + while(!converged) { + double cll = 0; + double dbias = 0; + g.clear(); + for (int i = 0; i < training.size(); ++i) { + const double dotprod = training[i].second.dot(x) + x[0]; // x[0] is bias + double lp_false = dotprod; + double lp_true = -dotprod; + if (0 < lp_true) { + lp_true += log1p(exp(-lp_true)); + lp_false = log1p(exp(lp_false)); + } else { + lp_true = log1p(exp(lp_true)); + lp_false += log1p(exp(-lp_false)); + } + lp_true*=-1; + lp_false*=-1; + if (training[i].first) { // true label + cll -= lp_true; + g -= training[i].second * exp(lp_false); + dbias -= exp(lp_false); + } else { // false label + cll -= lp_false; + g += training[i].second * exp(lp_true); + dbias += exp(lp_true); } - last_key = key; - esv.clear(); } - if (val.size() % 4 != 0) { - cerr << "B64 encoding error 1! Skipping.\n"; - continue; + vg.clear(); + g.init_vector(&vg); + vg[0] = dbias; +#if 1 + const double sigsq = conf["sigma_squared"].as(); + double norm = 0; + for (int i = 1; i < x.size(); ++i) { + const double mean_i = 0.0; + const double param = (x[i] - mean_i); + norm += param * param; + vg[i] += param / sigsq; + } + const double reg = norm / (2.0 * sigsq); +#else + double reg = 0; +#endif + cll += reg; + cerr << cll << " (REG=" << reg << ")\t"; + bool failed = false; + try { + opt.Optimize(cll, vg, &x); + } catch (...) { + cerr << "Exception caught, assuming convergence is close enough...\n"; + failed = true; } - string encoded(val.size() / 4 * 3, '\0'); - if (!B64::b64decode(reinterpret_cast(&val[0]), val.size(), &encoded[0], encoded.size())) { - cerr << "B64 encoding error 2! Skipping.\n"; - continue; + if (fabs(x[0]) > MAX_BIAS) { + cerr << "Biased model learned. Are your training instances wrong?\n"; + cerr << " BIAS: " << x[0] << endl; } - esv.push_back(ErrorSurface()); - esv.back().Deserialize(type, encoded); + converged = failed || opt.HasConverged(); } - if (!esv.empty()) { - // cerr << "ESV=" << esv.size() << endl; - // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } - float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); - cout << last_key << "|" << x << "|" << score << endl; + Weights w; + if (conf.count("weights")) { + for (int i = 1; i < x.size(); ++i) + x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi); } + w.InitFromVector(x); + w.WriteToFile("-"); return 0; } -- cgit v1.2.3 From c87835f5f94b3aa954682133c40117b3f8e26585 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 12 Jul 2011 22:34:34 -0400 Subject: debugged pro trainer --- pro-train/dist-pro.pl | 9 +- pro-train/mr_pro_map.cc | 244 +++++++++++++++++++++++++++++++++++++-------- pro-train/mr_pro_reduce.cc | 57 ++++++----- utils/filelib.cc | 12 +++ utils/filelib.h | 1 + 5 files changed, 253 insertions(+), 70 deletions(-) (limited to 'pro-train/dist-pro.pl') diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index 55d7f1fa..c42e3876 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -66,6 +66,7 @@ my $bleu_weight=1; my $use_make; # use make to parallelize line search my $dirargs=''; my $usefork; +my $initial_weights; my $pass_suffix = ''; my $cpbin=1; # Process command-line options @@ -79,6 +80,7 @@ if (GetOptions( "dry-run" => \$dryrun, "epsilon=s" => \$epsilon, "help" => \$help, + "weights=s" => \$initial_weights, "interval" => \$interval, "iteration=i" => \$iteration, "local" => \$run_local, @@ -212,7 +214,7 @@ if ($dryrun){ close CMD; print STDERR $cline; chmod(0755,$cmdfile); - check_call("touch $dir/weights.0"); + check_call("cp $initial_weights $dir/weights.0"); die "Can't find weights.0" unless (-e "$dir/weights.0"); } write_config(*STDERR); @@ -239,7 +241,6 @@ my $random_seed = int(time / 1000); my $lastWeightsFile; my $lastPScore = 0; # main optimization loop -my @mapoutputs = (); # aggregate map outputs over all iters while (1){ print STDERR "\n\nITERATION $iteration\n==========\n"; @@ -262,6 +263,7 @@ while (1){ my $im1 = $iteration - 1; my $weightsFile="$dir/weights.$im1"; push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; my $pcmd; if ($run_local) { @@ -333,6 +335,7 @@ while (1){ print $mkfile "all: $dir/splag.$im1/map.done\n\n"; } my @mkouts = (); # only used with makefiles + my @mapoutputs = (); for my $shard (@shards) { my $mapoutput = $shard; my $client_name = $shard; @@ -341,7 +344,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; if ($run_local) { print STDERR "COMMAND:\n$script\n"; check_bash_call($script); diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 128d93ce..4324e8de 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -2,7 +2,9 @@ #include #include #include +#include +#include #include #include #include @@ -22,16 +24,63 @@ using namespace std; namespace po = boost::program_options; +struct ApproxVectorHasher { + static const size_t MASK = 0xFFFFFFFFull; + union UType { + double f; + size_t i; + }; + static inline double round(const double x) { + UType t; + t.f = x; + size_t r = t.i & MASK; + if ((r << 1) > MASK) + t.i += MASK - r + 1; + else + t.i &= (1ull - MASK); + return t.f; + } + size_t operator()(const SparseVector& x) const { + size_t h = 0x573915839; + for (SparseVector::const_iterator it = x.begin(); it != x.end(); ++it) { + UType t; + t.f = it->second; + if (t.f) { + size_t z = (t.i >> 32); + boost::hash_combine(h, it->first); + boost::hash_combine(h, z); + } + } + return h; + } +}; + +struct ApproxVectorEquals { + bool operator()(const SparseVector& a, const SparseVector& b) const { + SparseVector::const_iterator bit = b.begin(); + for (SparseVector::const_iterator ait = a.begin(); ait != a.end(); ++ait) { + if (bit == b.end() || + ait->first != bit->first || + ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second)) + return false; + ++bit; + } + if (bit != b.end()) return false; + return true; + } +}; + boost::shared_ptr rng; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value(), "[REQD] Weights files from current iterations") + ("kbest_repository,K",po::value()->default_value("./kbest"),"K-best list repository (directory)") + ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") ("source,s",po::value()->default_value(""), "Source file (ignored, except for AER)") ("loss_function,l",po::value()->default_value("ibm_bleu"), "Loss function being optimized") - ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") - ("weights,w",po::value >(), "[REQD] Weights files from previous and current iterations") ("kbest_size,k",po::value()->default_value(1500u), "Top k-hypotheses to extract") ("candidate_pairs,G", po::value()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") ("best_pairs,X", po::value()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") @@ -46,7 +95,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { flag = true; } if (!conf->count("weights")) { - cerr << "Please specify one or more weights using -w \n"; + cerr << "Please specify weights using -w \n"; flag = true; } if (flag || conf->count("help")) { @@ -56,6 +105,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } struct HypInfo { + HypInfo() : g_(-100.0) {} HypInfo(const vector& h, const SparseVector& feats) : hyp(h), g_(-100.0), x(feats) {} // lazy evaluation @@ -66,10 +116,92 @@ struct HypInfo { } vector hyp; mutable double g_; - public: SparseVector x; }; +struct HypInfoCompare { + bool operator()(const HypInfo& a, const HypInfo& b) const { + ApproxVectorEquals comp; + return (a.hyp == b.hyp && comp(a.x,b.x)); + } +}; + +struct HypInfoHasher { + size_t operator()(const HypInfo& x) const { + boost::hash > hhasher; + ApproxVectorHasher vhasher; + size_t ha = hhasher(x.hyp); + boost::hash_combine(ha, vhasher(x.x)); + return ha; + } +}; + +void WriteKBest(const string& file, const vector& kbest) { + WriteFile wf(file); + ostream& out = *wf.stream(); + out.precision(10); + for (int i = 0; i < kbest.size(); ++i) { + out << TD::GetString(kbest[i].hyp) << endl; + out << kbest[i].x << endl; + } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector* out) { + SparseVector& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const double val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + +void ReadKBest(const string& file, vector* kbest) { + cerr << "Reading from " << file << endl; + ReadFile rf(file); + istream& in = *rf.stream(); + string cand; + string feats; + while(getline(in, cand)) { + getline(in, feats); + assert(in); + kbest->push_back(HypInfo()); + TD::ConvertSentence(cand, &kbest->back().hyp); + ParseSparseVector(feats, 0, &kbest->back().x); + } + cerr << " read " << kbest->size() << " hypotheses\n"; +} + +void Dedup(vector* h) { + cerr << "Dedup in=" << h->size(); + tr1::unordered_set u; + while(h->size() > 0) { + u.insert(h->back()); + h->pop_back(); + } + tr1::unordered_set::iterator it = u.begin(); + while (it != u.end()) { + h->push_back(*it); + it = u.erase(it); + } + cerr << " out=" << h->size() << endl; +} + struct ThresholdAlpha { explicit ThresholdAlpha(double t = 0.05) : threshold(t) {} double operator()(double mag) const { @@ -81,6 +213,7 @@ struct ThresholdAlpha { struct TrainingInstance { TrainingInstance(const SparseVector& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {} SparseVector x; +#undef DEBUGGING_PRO #ifdef DEBUGGING_PRO vector a; vector b; @@ -88,6 +221,11 @@ struct TrainingInstance { bool y; double gdiff; }; +#ifdef DEBUGGING_PRO +ostream& operator<<(ostream& os, const TrainingInstance& d) { + return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x; +} +#endif struct DiffOrder { bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { @@ -95,36 +233,51 @@ struct DiffOrder { } }; -template -void Sample(const unsigned gamma, const unsigned xi, const vector& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector* pv) { - vector v; +void Sample(const unsigned gamma, const unsigned xi, const vector& J_i, const SentenceScorer& scorer, const bool invert_score, vector* pv) { + vector v1, v2; + double avg_diff = 0; for (unsigned i = 0; i < gamma; ++i) { - size_t a = rng->inclusive(0, J_i.size() - 1)(); - size_t b = rng->inclusive(0, J_i.size() - 1)(); + const size_t a = rng->inclusive(0, J_i.size() - 1)(); + const size_t b = rng->inclusive(0, J_i.size() - 1)(); if (a == b) continue; double ga = J_i[a].g(scorer); double gb = J_i[b].g(scorer); - bool positive = ga < gb; + bool positive = gb < ga; if (invert_score) positive = !positive; - double gdiff = fabs(ga - gb); + const double gdiff = fabs(ga - gb); if (!gdiff) continue; - if (rng->next() < alpha_i(gdiff)) { - v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff)); + avg_diff += gdiff; + SparseVector xdiff = (J_i[a].x - J_i[b].x).erase_zeros(); + if (xdiff.empty()) { + cerr << "Empty diff:\n " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl; + cerr << " " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl; + continue; + } + v1.push_back(TrainingInstance(xdiff, positive, gdiff)); #ifdef DEBUGGING_PRO - v.back().a = J_i[a].hyp; - v.back().b = J_i[b].hyp; + v1.back().a = J_i[a].hyp; + v1.back().b = J_i[b].hyp; + cerr << "N: " << v1.back() << endl; #endif - } } - vector::iterator mid = v.begin() + xi; - if (xi > v.size()) mid = v.end(); - partial_sort(v.begin(), mid, v.end(), DiffOrder()); - copy(v.begin(), mid, back_inserter(*pv)); + avg_diff /= v1.size(); + + for (unsigned i = 0; i < v1.size(); ++i) { + double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff)); + // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl; + if (rng->next() < p) v2.push_back(v1[i]); + } + vector::iterator mid = v2.begin() + xi; + if (xi > v2.size()) mid = v2.end(); + partial_sort(v2.begin(), mid, v2.end(), DiffOrder()); + copy(v2.begin(), mid, back_inserter(*pv)); #ifdef DEBUGGING_PRO - if (v.size() >= 5) - for (int i =0; i < 5; ++i) { - cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl; + if (v2.size() >= 5) { + for (int i =0; i < (mid - v2.begin()); ++i) { + cerr << v2[i] << endl; } + cerr << pv->back() << endl; + } #endif } @@ -136,6 +289,7 @@ int main(int argc, char** argv) { else rng.reset(new MT19937); const string loss_function = conf["loss_function"].as(); + ScoreType type = ScoreTypeFromString(loss_function); DocScorer ds(type, conf["reference"].as >(), conf["source"].as()); cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; @@ -146,13 +300,15 @@ int main(int argc, char** argv) { const unsigned kbest_size = conf["kbest_size"].as(); const unsigned gamma = conf["candidate_pairs"].as(); const unsigned xi = conf["best_pairs"].as(); - vector weights_files = conf["weights"].as >(); - vector > weights(weights_files.size()); - for (int i = 0; i < weights.size(); ++i) { + string weightsf = conf["weights"].as(); + vector weights; + { Weights w; - w.InitFromFile(weights_files[i]); - w.InitVector(&weights[i]); + w.InitFromFile(weightsf); + w.InitVector(&weights); } + string kbest_repo = conf["kbest_repository"].as(); + MkDirP(kbest_repo); while(in) { vector v; string line; @@ -164,24 +320,26 @@ int main(int argc, char** argv) { // path-to-file (JSON) sent_id is >> file >> sent_id; ReadFile rf(file); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); + ostringstream os; vector J_i; - int start = weights.size(); - start -= 4; - if (start < 0) start = 0; - for (int i = start; i < weights.size(); ++i) { - hg.Reweight(weights[i]); - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, kbest_size); - - for (int i = 0; i < kbest_size; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - J_i.push_back(HypInfo(d->yield, d->feature_values)); - } + os << kbest_repo << "/kbest." << sent_id << ".txt.gz"; + const string kbest_file = os.str(); + if (FileExists(kbest_file)) + ReadKBest(kbest_file, &J_i); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, kbest_size); + + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + J_i.push_back(HypInfo(d->yield, d->feature_values)); } + Dedup(&J_i); + WriteKBest(kbest_file, J_i); - Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v); + Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v); for (unsigned i = 0; i < v.size(); ++i) { const TrainingInstance& vi = v[i]; cout << vi.y << "\t" << vi.x << endl; diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 2b9c5ce7..e1a7db8a 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -24,7 +24,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") ("interpolation,p",po::value()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") ("memory_buffers,m",po::value()->default_value(200), "Number of memory buffers (LBFGS)") - ("sigma_squared,s",po::value()->default_value(0.5), "Sigma squared for Gaussian prior") + ("sigma_squared,s",po::value()->default_value(1.0), "Sigma squared for Gaussian prior") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); @@ -35,6 +35,31 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } +void ParseSparseVector(string& line, size_t cur, SparseVector* out) { + SparseVector& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const double val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); @@ -60,28 +85,7 @@ int main(int argc, char** argv) { assert(ks == 1); const bool y = line[0] == '1'; SparseVector x; - size_t last_start = ks + 1; - size_t last_comma = string::npos; - size_t cur = last_start; - while(cur <= line.size()) { - if (line[cur] == ' ' || cur == line.size()) { - if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { - cerr << "[ERROR] " << line << endl << " position = " << cur << endl; - exit(1); - } - const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); - if (cur < line.size()) line[cur] = 0; - const double val = strtod(&line[last_comma + 1], NULL); - x.set_value(fid, val); - - last_comma = string::npos; - last_start = cur+1; - } else { - if (line[cur] == '=') - last_comma = cur; - } - ++cur; - } + ParseSparseVector(line, ks + 1, &x); training.push_back(make_pair(y, x)); } if (flag) cerr << endl; @@ -95,6 +99,7 @@ int main(int argc, char** argv) { SparseVector g; bool converged = false; LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as()); + double ppl = 0; while(!converged) { double cll = 0; double dbias = 0; @@ -114,14 +119,18 @@ int main(int argc, char** argv) { lp_false*=-1; if (training[i].first) { // true label cll -= lp_true; + ppl += lp_true / log(2); g -= training[i].second * exp(lp_false); dbias -= exp(lp_false); } else { // false label cll -= lp_false; + ppl += lp_false / log(2); g += training[i].second * exp(lp_true); dbias += exp(lp_true); } } + ppl /= training.size(); + ppl = pow(2.0, - ppl); vg.clear(); g.init_vector(&vg); vg[0] = dbias; @@ -139,7 +148,7 @@ int main(int argc, char** argv) { double reg = 0; #endif cll += reg; - cerr << cll << " (REG=" << reg << ")\t"; + cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t"; bool failed = false; try { opt.Optimize(cll, vg, &x); diff --git a/utils/filelib.cc b/utils/filelib.cc index 79ad2847..a0969b1a 100644 --- a/utils/filelib.cc +++ b/utils/filelib.cc @@ -20,3 +20,15 @@ bool DirectoryExists(const string& dir) { return false; } +void MkDirP(const string& dir) { + if (DirectoryExists(dir)) return; + if (mkdir(dir.c_str(), 0777)) { + perror(dir.c_str()); + abort(); + } + if (chmod(dir.c_str(), 07777)) { + perror(dir.c_str()); + abort(); + } +} + diff --git a/utils/filelib.h b/utils/filelib.h index dda98671..a8622246 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -12,6 +12,7 @@ bool FileExists(const std::string& file_name); bool DirectoryExists(const std::string& dir_name); +void MkDirP(const std::string& dir_name); // reads from standard in if filename is - // uncompresses if file ends with .gz -- cgit v1.2.3 From c3828b0a2deb42de5c7378e93f93f5e69efb304c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 16 Jul 2011 19:13:21 -0400 Subject: tune regularizer --- mteval/scorer.cc | 12 +++- pro-train/dist-pro.pl | 139 ++++++++++++++++++++++++++------------------- pro-train/mr_pro_reduce.cc | 128 ++++++++++++++++++++++++++++++----------- 3 files changed, 185 insertions(+), 94 deletions(-) (limited to 'pro-train/dist-pro.pl') diff --git a/mteval/scorer.cc b/mteval/scorer.cc index 2daa0daa..a83b9e2f 100644 --- a/mteval/scorer.cc +++ b/mteval/scorer.cc @@ -430,6 +430,7 @@ float BLEUScore::ComputeScore(vector* precs, float* bp) const { float log_bleu = 0; if (precs) precs->clear(); int count = 0; + vector total_precs(N()); for (int i = 0; i < N(); ++i) { if (hyp_ngram_counts[i] > 0) { float cor_count = correct_ngram_hit_counts[i]; @@ -440,14 +441,21 @@ float BLEUScore::ComputeScore(vector* precs, float* bp) const { log_bleu += lprec; ++count; } + total_precs[i] = log_bleu; } - log_bleu /= static_cast(count); + vector bleus(N()); float lbp = 0.0; if (hyp_len < ref_len) lbp = (hyp_len - ref_len) / hyp_len; log_bleu += lbp; if (bp) *bp = exp(lbp); - return exp(log_bleu); + float wb = 0; + for (int i = 0; i < N(); ++i) { + bleus[i] = exp(total_precs[i] / (i+1) + lbp); + wb += bleus[i] / pow(2.0, 4.0 - i); + } + //return wb; + return bleus.back(); } diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index c42e3876..dbfa329a 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -37,42 +37,36 @@ die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; my $decoder = $cdec; -my $lines_per_mapper = 100; +my $lines_per_mapper = 30; my $iteration = 1; my $run_local = 0; my $best_weights; -my $max_iterations = 15; -my $optimization_iters = 6; +my $max_iterations = 30; my $decode_nodes = 15; # number of decode nodes -my $pmem = "9g"; +my $pmem = "4g"; my $disable_clean = 0; my %seen_weights; -my $normalize; my $help = 0; my $epsilon = 0.0001; -my $interval = 5; my $dryrun = 0; my $last_score = -10000000; my $metric = "ibm_bleu"; my $dir; my $iniFile; my $weights; -my $decoderOpt; -my $noprimary; -my $maxsim=0; -my $oraclen=0; -my $oracleb=20; -my $bleu_weight=1; -my $use_make; # use make to parallelize line search -my $dirargs=''; +my $use_make; # use make to parallelize my $usefork; my $initial_weights; my $pass_suffix = ''; my $cpbin=1; + +# regularization strength +my $tune_regularizer = 0; +my $reg = 1e-2; + # Process command-line options Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( - "decoder=s" => \$decoderOpt, "decode-nodes=i" => \$decode_nodes, "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, @@ -81,21 +75,13 @@ if (GetOptions( "epsilon=s" => \$epsilon, "help" => \$help, "weights=s" => \$initial_weights, - "interval" => \$interval, - "iteration=i" => \$iteration, + "tune-regularizer" => \$tune_regularizer, + "reg=f" => \$reg, "local" => \$run_local, "use-make=i" => \$use_make, "max-iterations=i" => \$max_iterations, - "normalize=s" => \$normalize, "pmem=s" => \$pmem, "cpbin!" => \$cpbin, - "bleu_weight=s" => \$bleu_weight, - "no-primary!" => \$noprimary, - "max-similarity=s" => \$maxsim, - "oracle-directions=i" => \$oraclen, - "n-oracle=i" => \$oraclen, - "oracle-batch=i" => \$oracleb, - "directions-args=s" => \$dirargs, "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, @@ -108,9 +94,7 @@ if (GetOptions( if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; } if ($metric =~ /^(combi|ter)$/i) { - $lines_per_mapper = 40; -} elsif ($metric =~ /^meteor$/i) { - $lines_per_mapper = 2000; # start up time is really high + $lines_per_mapper = 5; } ($iniFile) = @ARGV; @@ -144,8 +128,6 @@ unless ($dir =~ /^\//){ # convert relative path to absolute path $dir = "$basedir/$dir"; } -if ($decoderOpt){ $decoder = $decoderOpt; } - # Initializations and helper functions srand; @@ -378,6 +360,22 @@ while (1){ else {$joblist = $joblist . "\|" . $jobid; } } } + my @dev_outs = (); + my @devtest_outs = (); + if ($tune_regularizer) { + for (my $i = 0; $i < scalar @mapoutputs; $i++) { + if ($i % 3 == 1) { + push @devtest_outs, $mapoutputs[$i]; + } else { + push @dev_outs, $mapoutputs[$i]; + } + } + if (scalar @devtest_outs == 0) { + die "Not enough training instances for regularization tuning! Rerun without --tune-regularizer\n"; + } + } else { + @dev_outs = @mapoutputs; + } if ($run_local) { print STDERR "\nCompleted extraction of training exemplars.\n"; } elsif ($use_make) { @@ -399,7 +397,13 @@ while (1){ } my $tol = 0; my $til = 0; - print STDERR "MO: @mapoutputs\n"; + my $dev_test_file = "$dir/splag.$im1/devtest.gz"; + if ($tune_regularizer) { + my $cmd = "cat @devtest_outs | gzip > $dev_test_file"; + check_bash_call($cmd); + die "Can't find file $dev_test_file" unless -f $dev_test_file; + } + #print STDERR "MO: @mapoutputs\n"; for my $mo (@mapoutputs) { #my $olines = get_lines($mo); #my $ilines = get_lines($o2i{$mo}); @@ -407,10 +411,24 @@ while (1){ } print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; print STDERR unchecked_output("date"); - $cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration"; + $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg"; + if ($tune_regularizer) { + $cmd .= " -T -t $dev_test_file"; + } + $cmd .= " > $dir/weights.$iteration"; print STDERR "COMMAND:\n$cmd\n"; check_bash_call($cmd); $lastWeightsFile = "$dir/weights.$iteration"; + if ($tune_regularizer) { + open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!"; + my $line = ; + close W; + my ($sharp, $label, $nreg) = split /\s|=/, $line; + print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n"; + $reg = $nreg; + # only tune regularizer on first iteration? + $tune_regularizer = 0; + } $lastPScore = $score; $iteration++; print STDERR "\n==========\n"; @@ -473,7 +491,6 @@ sub write_config { print $fh "SOURCE (DEV): $srcFile\n"; print $fh "REFS (DEV): $refFiles\n"; print $fh "EVAL METRIC: $metric\n"; - print $fh "START ITERATION: $iteration\n"; print $fh "MAX ITERATIONS: $max_iterations\n"; print $fh "DECODE NODES: $decode_nodes\n"; print $fh "HEAD NODE: $host\n"; @@ -535,31 +552,38 @@ Usage: $executable [options] based on certain conventions. For details, refer to descriptions of the options --decoder, --weights, and --workdir. -Options: +Required: + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --source-file + Dev set source file. + + --weights + Initial weights file (use empty file to start from 0) + +General options: --local Run the decoder and optimizer locally with a single thread. - --use-make - Use make -j to run the optimizer commands (useful on large - shared-memory machines where qsub is unavailable). - --decode-nodes Number of decoder processes to run in parallel. [default=15] - --decoder - Decoder binary to use. - --help Print this message and exit. - --iteration - Starting iteration number. If not specified, defaults to 1. - --max-iterations Maximum number of iterations to run. If not specified, defaults to 10. + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + --pass-suffix If the decoder is doing multi-pass decoding, the pass suffix "2", "3", etc., is used to control what iteration of weights is set. @@ -567,21 +591,9 @@ Options: --pmem Amount of physical memory requested for parallel decoding jobs. - --ref-files - Dev set ref files. This option takes only a single string argument. - To use multiple files (including file globbing), this argument should - be quoted. - - --metric - Metric to optimize. - Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - - --normalize - After each iteration, rescale all feature weights such that feature- - name has a weight of 1.0. - - --source-file - Dev set source file. + --use-make + Use make -j to run the optimizer commands (useful on large + shared-memory machines where qsub is unavailable). --workdir Directory for intermediate and output files. If not specified, the @@ -591,6 +603,14 @@ Options: the filename. E.g. an ini file named decoder.foo.ini would have a default working directory name foo. +Regularization options: + + --tune-regularizer + Hold out one third of the tuning data and used this to tune the + regularization parameter. + + --reg + Help } @@ -606,7 +626,6 @@ sub convert { } - sub cmdline { return join ' ',($0,@ORIG_ARGV); } diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 491ceb3a..9b422f33 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -16,7 +16,7 @@ using namespace std; namespace po = boost::program_options; // since this is a ranking model, there should be equal numbers of -// positive and negative examples so the bias should be 0 +// positive and negative examples, so the bias should be 0 static const double MAX_BIAS = 1e-10; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -25,8 +25,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") ("interpolation,p",po::value()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") ("memory_buffers,m",po::value()->default_value(200), "Number of memory buffers (LBFGS)") - ("sigma_squared,s",po::value()->default_value(1.0), "Sigma squared for Gaussian prior") - ("testset,t",po::value(), "Optional held-out test set to tune regularizer") + ("sigma_squared,s",po::value()->default_value(0.1), "Sigma squared for Gaussian prior") + ("min_reg,r",po::value()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght") + ("max_reg,R",po::value()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght") + ("testset,t",po::value(), "Optional held-out test set") + ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); @@ -95,8 +98,6 @@ void GradAdd(const SparseVector& v, const double scale, vector* double TrainingInference(const vector& x, const vector > >& corpus, vector* g = NULL) { - if (g) fill(g->begin(), g->end(), 0.0); - double cll = 0; for (int i = 0; i < corpus.size(); ++i) { const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias @@ -130,39 +131,23 @@ double TrainingInference(const vector& x, return cll; } -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - string line; - vector > > training, testing; - SparseVector old_weights; - const double psi = conf["interpolation"].as(); - if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } - if (conf.count("weights")) { - Weights w; - w.InitFromFile(conf["weights"].as()); - w.InitSparseVector(&old_weights); - } - ReadCorpus(&cin, &training); - if (conf.count("testset")) { - ReadFile rf(conf["testset"].as()); - ReadCorpus(rf.stream(), &testing); - } - - cerr << "Number of features: " << FD::NumFeats() << endl; - vector x(FD::NumFeats(), 0.0); // x[0] is bias - for (SparseVector::const_iterator it = old_weights.begin(); - it != old_weights.end(); ++it) - x[it->first] = it->second; +// return held-out log likelihood +double LearnParameters(const vector > >& training, + const vector > >& testing, + const double sigsq, + const unsigned memory_buffers, + vector* px) { + vector& x = *px; vector vg(FD::NumFeats(), 0.0); bool converged = false; - LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as()); + LBFGSOptimizer opt(FD::NumFeats(), memory_buffers); + double tppl = 0.0; while(!converged) { + fill(vg.begin(), vg.end(), 0.0); double cll = TrainingInference(x, training, &vg); double ppl = cll / log(2); ppl /= training.size(); ppl = pow(2.0, ppl); - double tppl = 0.0; // evaluate optional held-out test set if (testing.size()) { @@ -173,7 +158,6 @@ int main(int argc, char** argv) { // handle regularizer #if 1 - const double sigsq = conf["sigma_squared"].as(); double norm = 0; for (int i = 1; i < x.size(); ++i) { const double mean_i = 0.0; @@ -202,11 +186,91 @@ int main(int argc, char** argv) { cerr << " BIAS: " << x[0] << endl; } } + return tppl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector > > training, testing; + SparseVector old_weights; + const bool tune_regularizer = conf.count("tune_regularizer"); + if (tune_regularizer && !conf.count("testset")) { + cerr << "--tune_regularizer requires --testset to be set\n"; + return 1; + } + const double min_reg = conf["min_reg"].as(); + const double max_reg = conf["max_reg"].as(); + double sigsq = conf["sigma_squared"].as(); + assert(sigsq > 0.0); + assert(min_reg > 0.0); + assert(max_reg > 0.0); + assert(max_reg > min_reg); + const double psi = conf["interpolation"].as(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } + if (conf.count("weights")) { + Weights w; + w.InitFromFile(conf["weights"].as()); + w.InitSparseVector(&old_weights); + } + ReadCorpus(&cin, &training); + if (conf.count("testset")) { + ReadFile rf(conf["testset"].as()); + ReadCorpus(rf.stream(), &testing); + } + cerr << "Number of features: " << FD::NumFeats() << endl; + vector x(FD::NumFeats(), 0.0); // x[0] is bias + for (SparseVector::const_iterator it = old_weights.begin(); + it != old_weights.end(); ++it) + x[it->first] = it->second; + double tppl = 0.0; + vector > sp; + vector smoothed; + if (tune_regularizer) { + sigsq = min_reg; + const double steps = 18; + double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); + cerr << "SWEEP FACTOR: " << sweep_factor << endl; + while(sigsq < max_reg) { + tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); + sp.push_back(make_pair(sigsq, tppl)); + sigsq *= sweep_factor; + } + smoothed.resize(sp.size(), 0); + smoothed[0] = sp[0].second; + smoothed.back() = sp.back().second; + for (int i = 1; i < sp.size()-1; ++i) { + double prev = sp[i-1].second; + double next = sp[i+1].second; + double cur = sp[i].second; + smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); + } + double best_ppl = 9999999; + unsigned best_i = 0; + for (unsigned i = 0; i < sp.size(); ++i) { + if (smoothed[i] < best_ppl) { + best_ppl = smoothed[i]; + best_i = i; + } + } + sigsq = sp[best_i].first; + tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); + } Weights w; if (conf.count("weights")) { for (int i = 1; i < x.size(); ++i) x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi); } + cout.precision(15); + cout << "# sigma^2=" << sigsq << "\theld out perplexity="; + if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } + if (sp.size()) { + cout << "# Parameter sweep:\n"; + for (int i = 0; i < sp.size(); ++i) { + cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; + } + } w.InitFromVector(x); w.WriteToFile("-"); return 0; -- cgit v1.2.3