-rw-r--r--   pro-train/Makefile.am                      |   4
-rwxr-xr-x   pro-train/dist-pro.pl                      | 308
-rwxr-xr-x   pro-train/mr_pro_generate_mapper_input.pl  |  18
-rw-r--r--   pro-train/mr_pro_map.cc                    | 118
-rw-r--r--   pro-train/mr_pro_reduce.cc                 | 167
5 files changed, 349 insertions, 266 deletions
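In outline, this commit replaces the MERT-style line search with a pairwise-ranking setup (in the style of PRO, as the pro- naming suggests): mr_pro_map samples pairs of k-best hypotheses from each sentence's hypergraph, labels each pair by which member the evaluation metric prefers, and emits the feature difference as a classification exemplar; mr_pro_reduce then fits a binary logistic regression to those exemplars. Reconstructed from the reducer code below (sigma^2 is the --sigma_squared flag and each x_i is a difference of hypothesis feature vectors), the objective it minimizes is

    L(w) = -\sum_i \log p(y_i \mid x_i; w) + \frac{\|w\|^2}{2\sigma^2},
    \qquad p(y_i = 1 \mid x_i; w) = \frac{1}{1 + e^{-w \cdot x_i}}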
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index 945ed5c3..fdaf43e2 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc
 mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 35bccea4..55d7f1fa 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -21,7 +21,7 @@ my $bin_dir = $SCRIPT_DIR;
 die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
 my $FAST_SCORE="$bin_dir/../mteval/fast_score";
 die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input";
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
 my $MAPPER = "$bin_dir/mr_pro_map";
 my $REDUCER = "$bin_dir/mr_pro_reduce";
 my $parallelize = "$VEST_DIR/parallelize.pl";
@@ -37,8 +37,7 @@ die "Can't find decoder in $cdec" unless -x $cdec;
 die "Can't find $parallelize" unless -x $parallelize;
 die "Can't find $libcall" unless -e $libcall;
 my $decoder = $cdec;
-my $lines_per_mapper = 400;
-my $rand_directions = 15;
+my $lines_per_mapper = 100;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
@@ -58,7 +57,6 @@ my $metric = "ibm_bleu";
 my $dir;
 my $iniFile;
 my $weights;
-my $initialWeights;
 my $decoderOpt;
 my $noprimary;
 my $maxsim=0;
@@ -67,7 +65,6 @@ my $oracleb=20;
 my $bleu_weight=1;
 my $use_make;  # use make to parallelize line search
 my $dirargs='';
-my $density_prune;
 my $usefork;
 my $pass_suffix = '';
 my $cpbin=1;
@@ -76,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
     "decoder=s" => \$decoderOpt,
     "decode-nodes=i" => \$decode_nodes,
-    "density-prune=f" => \$density_prune,
     "dont-clean" => \$disable_clean,
     "pass-suffix=s" => \$pass_suffix,
     "use-fork" => \$usefork,
@@ -91,8 +87,6 @@ if (GetOptions(
     "normalize=s" => \$normalize,
     "pmem=s" => \$pmem,
     "cpbin!" => \$cpbin,
-    "rand-directions=i" => \$rand_directions,
-    "random_directions=i" => \$rand_directions,
     "bleu_weight=s" => \$bleu_weight,
     "no-primary!" => \$noprimary,
     "max-similarity=s" => \$maxsim,
@@ -103,18 +97,12 @@ if (GetOptions(
     "ref-files=s" => \$refFiles,
     "metric=s" => \$metric,
     "source-file=s" => \$srcFile,
-    "weights=s" => \$initialWeights,
     "workdir=s" => \$dir,
-    "opt-iterations=i" => \$optimization_iters,
 ) == 0 || @ARGV!=1 || $help) {
     print_help();
     exit;
 }
 
-if (defined $density_prune) {
-  die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
@@ -146,7 +134,7 @@ if ($metric =~ /^ter$|^aer$/i) {
 my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
 
 unless ($dir){
-    $dir = "vest";
+    $dir = "protrain";
 }
 unless ($dir =~ /^\//){  # convert relative path to absolute path
     my $basedir = check_output("pwd");
@@ -203,18 +191,19 @@ sub dirsize {
   opendir ISEMPTY,$_[0];
   return scalar(readdir(ISEMPTY))-1;
 }
+my @allweights;
 if ($dryrun){
     write_config(*STDERR);
     exit 0;
 } else {
-    if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+    if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
         die "ERROR: working dir $dir already exists\n\n";
     } else {
         -e $dir || mkdir $dir;
         mkdir "$dir/hgs";
         modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
         mkdir "$dir/scripts";
-        my $cmdfile="$dir/rerun-vest.sh";
+        my $cmdfile="$dir/rerun-pro.sh";
         open CMD,'>',$cmdfile;
         print CMD "cd ",&getcwd,"\n";
 #        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
@@ -223,13 +212,8 @@ if ($dryrun){
         close CMD;
         print STDERR $cline;
         chmod(0755,$cmdfile);
-        unless (-e $initialWeights) {
-            print STDERR "Please specify an initial weights file with --initial-weights\n";
-            print_help();
-            exit;
-        }
-        check_call("cp $initialWeights $dir/weights.0");
-        die "Can't find weights.0" unless (-e "$dir/weights.0");
+        check_call("touch $dir/weights.0");
+        die "Can't find weights.0" unless (-e "$dir/weights.0");
     }
     write_config(*STDERR);
 }
@@ -255,6 +239,7 @@ my $random_seed = int(time / 1000);
 my $lastWeightsFile;
 my $lastPScore = 0;
 # main optimization loop
+my @mapoutputs = (); # aggregate map outputs over all iters
 while (1){
     print STDERR "\n\nITERATION $iteration\n==========\n";
@@ -276,10 +261,8 @@ while (1){
     print STDERR unchecked_output("date");
     my $im1 = $iteration - 1;
     my $weightsFile="$dir/weights.$im1";
+    push @allweights, "-w $dir/weights.$im1";
     my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
-    if ($density_prune) {
-        $decoder_cmd .= " --density_prune $density_prune";
-    }
     my $pcmd;
     if ($run_local) {
         $pcmd = "cat $srcFile |";
@@ -320,163 +303,111 @@ while (1){
     # run optimizer
     print STDERR "RUNNING OPTIMIZER AT ";
     print STDERR unchecked_output("date");
+    print STDERR " - GENERATE TRAINING EXEMPLARS\n";
     my $mergeLog="$logdir/prune-merge.log.$iteration";
     my $score = 0;
     my $icc = 0;
     my $inweights="$dir/weights.$im1";
-    for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
-        print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
-        print STDERR unchecked_output("date");
-        $icc++;
-        my $nop=$noprimary?"--no_primary":"";
-        my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
-        my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
-        $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
-        print STDERR "COMMAND:\n$cmd\n";
-        check_call($cmd);
-        check_call("mkdir -p $dir/splag.$im1");
-        $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
-        print STDERR "COMMAND:\n$cmd\n";
-        check_call($cmd);
-        opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
-        my @shards = grep { /^mapinput\./ } readdir(DIR);
-        closedir DIR;
-        die "No shards!" unless scalar @shards > 0;
-        my $joblist = "";
-        my $nmappers = 0;
-        my @mapoutputs = ();
-        @cleanupcmds = ();
-        my %o2i = ();
-        my $first_shard = 1;
-        my $mkfile; # only used with makefiles
-        my $mkfilename;
-        if ($use_make) {
-            $mkfilename = "$dir/splag.$im1/domap.mk";
-            open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
-            print $mkfile "all: $dir/splag.$im1/map.done\n\n";
-        }
-        my @mkouts = (); # only used with makefiles
-        for my $shard (@shards) {
-            my $mapoutput = $shard;
-            my $client_name = $shard;
-            $client_name =~ s/mapinput.//;
-            $client_name = "vest.$client_name";
-            $mapoutput =~ s/mapinput/mapoutput/;
-            push @mapoutputs, "$dir/splag.$im1/$mapoutput";
-            $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-            my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
-            if ($run_local) {
-                print STDERR "COMMAND:\n$script\n";
-                check_bash_call($script);
-            } elsif ($use_make) {
-                my $script_file = "$dir/scripts/map.$shard";
-                open F, ">$script_file" or die "Can't write $script_file: $!";
-                print F "#!/bin/bash\n";
-                print F "$script\n";
-                close F;
-                my $output = "$dir/splag.$im1/$mapoutput";
-                push @mkouts, $output;
-                chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
-                if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-                print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
-            } else {
-                my $script_file = "$dir/scripts/map.$shard";
-                open F, ">$script_file" or die "Can't write $script_file: $!";
-                print F "$script\n";
-                close F;
-                if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-
-                $nmappers++;
-                my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
-                my $jobid = check_output("$qcmd");
-                chomp $jobid;
-                $jobid =~ s/^(\d+)(.*?)$/\1/g;
-                $jobid =~ s/^Your job (\d+) .*$/\1/;
-                push(@cleanupcmds, "qdel $jobid 2> /dev/null");
-                print STDERR " $jobid";
-                if ($joblist == "") { $joblist = $jobid; }
-                else {$joblist = $joblist . "\|" . $jobid; }
-            }
-        }
+    $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+    print STDERR "COMMAND:\n$cmd\n";
+    check_call($cmd);
+    check_call("mkdir -p $dir/splag.$im1");
+    $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
+    print STDERR "COMMAND:\n$cmd\n";
+    check_call($cmd);
+    opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+    my @shards = grep { /^mapinput\./ } readdir(DIR);
+    closedir DIR;
+    die "No shards!" unless scalar @shards > 0;
+    my $joblist = "";
+    my $nmappers = 0;
+    @cleanupcmds = ();
+    my %o2i = ();
+    my $first_shard = 1;
+    my $mkfile; # only used with makefiles
+    my $mkfilename;
+    if ($use_make) {
+        $mkfilename = "$dir/splag.$im1/domap.mk";
+        open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+        print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+    }
+    my @mkouts = (); # only used with makefiles
+    for my $shard (@shards) {
+        my $mapoutput = $shard;
+        my $client_name = $shard;
+        $client_name =~ s/mapinput.//;
+        $client_name = "pro.$client_name";
+        $mapoutput =~ s/mapinput/mapoutput/;
+        push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+        $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+        my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
         if ($run_local) {
-            print STDERR "\nProcessing line search complete.\n";
+            print STDERR "COMMAND:\n$script\n";
+            check_bash_call($script);
         } elsif ($use_make) {
-            print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
-            close $mkfile;
-            my $mcmd = "make -j $use_make -f $mkfilename";
-            print STDERR "\nExecuting: $mcmd\n";
-            check_call($mcmd);
+            my $script_file = "$dir/scripts/map.$shard";
+            open F, ">$script_file" or die "Can't write $script_file: $!";
+            print F "#!/bin/bash\n";
+            print F "$script\n";
+            close F;
+            my $output = "$dir/splag.$im1/$mapoutput";
+            push @mkouts, $output;
+            chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+            if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+            print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
         } else {
-            print STDERR "\nLaunched $nmappers mappers.\n";
-            sleep 8;
-            print STDERR "Waiting for mappers to complete...\n";
-            while ($nmappers > 0) {
-                sleep 5;
-                my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
-                $nmappers = scalar @livejobs;
-            }
-            print STDERR "All mappers complete.\n";
+            my $script_file = "$dir/scripts/map.$shard";
+            open F, ">$script_file" or die "Can't write $script_file: $!";
+            print F "$script\n";
+            close F;
+            if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+            $nmappers++;
+            my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+            my $jobid = check_output("$qcmd");
+            chomp $jobid;
+            $jobid =~ s/^(\d+)(.*?)$/\1/g;
+            $jobid =~ s/^Your job (\d+) .*$/\1/;
+            push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+            print STDERR " $jobid";
+            if ($joblist == "") { $joblist = $jobid; }
+            else {$joblist = $joblist . "\|" . $jobid; }
         }
-        my $tol = 0;
-        my $til = 0;
-        for my $mo (@mapoutputs) {
-            my $olines = get_lines($mo);
-            my $ilines = get_lines($o2i{$mo});
-            $tol += $olines;
-            $til += $ilines;
-            die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
-        }
-        print STDERR "Results for $tol/$til lines\n";
-        print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
-        print STDERR unchecked_output("date");
-        $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
-        print STDERR "COMMAND:\n$cmd\n";
-        check_bash_call($cmd);
-        $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
-        # sort returns failure even when it doesn't fail for some reason
-        my $best=unchecked_output("$cmd"); chomp $best;
-        print STDERR "$best\n";
-        my ($oa, $x, $xscore) = split /\|/, $best;
-        $score = $xscore;
-        print STDERR "PROJECTED SCORE: $score\n";
-        if (abs($x) < $epsilon) {
-            print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
-            last;
-        }
-        my $psd = $score - $last_score;
-        $last_score = $score;
-        if (abs($psd) < $epsilon) {
-            print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
-            last;
-        }
-        my ($origin, $axis) = split /\s+/, $oa;
-
-        my %ori = convert($origin);
-        my %axi = convert($axis);
-
-        my $finalFile="$dir/weights.$im1-$opt_iter";
-        open W, ">$finalFile" or die "Can't write: $finalFile: $!";
-        my $norm = 0;
-        for my $k (sort keys %ori) {
-            my $dd = $ori{$k} + $axi{$k} * $x;
-            $norm += $dd * $dd;
-        }
-        $norm = sqrt($norm);
-        $norm = 1;
-        for my $k (sort keys %ori) {
-            my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
-            print W "$k $v\n";
+    }
+    if ($run_local) {
+        print STDERR "\nCompleted extraction of training exemplars.\n";
+    } elsif ($use_make) {
+        print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+        close $mkfile;
+        my $mcmd = "make -j $use_make -f $mkfilename";
+        print STDERR "\nExecuting: $mcmd\n";
+        check_call($mcmd);
+    } else {
+        print STDERR "\nLaunched $nmappers mappers.\n";
+        sleep 8;
+        print STDERR "Waiting for mappers to complete...\n";
+        while ($nmappers > 0) {
+            sleep 5;
+            my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+            $nmappers = scalar @livejobs;
         }
-        check_call("rm $dir/splag.$im1/*");
-        $inweights = $finalFile;
+        print STDERR "All mappers complete.\n";
     }
-    $lastWeightsFile = "$dir/weights.$iteration";
-    check_call("cp $inweights $lastWeightsFile");
-    if ($icc < 2) {
-        print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
-        last;
+    my $tol = 0;
+    my $til = 0;
+    print STDERR "MO: @mapoutputs\n";
+    for my $mo (@mapoutputs) {
+        #my $olines = get_lines($mo);
+        #my $ilines = get_lines($o2i{$mo});
+        #die "$mo: no training instances generated!" if $olines == 0;
     }
+    print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
+    print STDERR unchecked_output("date");
+    $cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration";
+    print STDERR "COMMAND:\n$cmd\n";
+    check_bash_call($cmd);
+    $lastWeightsFile = "$dir/weights.$iteration";
     $lastPScore = $score;
     $iteration++;
     print STDERR "\n==========\n";
@@ -488,24 +419,6 @@ print STDOUT "$lastWeightsFile\n";
 exit 0;
 
-sub normalize_weights {
-  my ($rfn, $rpts, $feat) = @_;
-  my @feat_names = @$rfn;
-  my @pts = @$rpts;
-  my $z = 1.0;
-  for (my $i=0; $i < scalar @feat_names; $i++) {
-    if ($feat_names[$i] eq $feat) {
-      $z = $pts[$i];
-      last;
-    }
-  }
-  for (my $i=0; $i < scalar @feat_names; $i++) {
-    $pts[$i] /= $z;
-  }
-  print STDERR " NORM WEIGHTS: @pts\n";
-  return @pts;
-}
-
 sub get_lines {
   my $fn = shift @_;
   open FL, "<$fn" or die "Couldn't read $fn: $!";
@@ -563,7 +476,6 @@ sub write_config {
     print $fh "HEAD NODE: $host\n";
     print $fh "PMEM (DECODING): $pmem\n";
    print $fh "CLEANUP: $cleanup\n";
-    print $fh "INITIAL WEIGHTS: $initialWeights\n";
 }
 
 sub update_weights_file {
@@ -603,6 +515,7 @@ sub enseg {
     }
     close SRC;
     close NEWSRC;
+    die "Empty dev set!" if ($i == 0);
 }
 
 sub print_help {
@@ -634,10 +547,6 @@ Options:
     --decoder <decoder path>
         Decoder binary to use.
 
-    --density-prune <N>
-        Limit the density of the hypergraph on each iteration to N times
-        the number of edges on the Viterbi path.
-
     --help
         Print this message and exit.
@@ -668,18 +577,9 @@ Options:
         After each iteration, rescale all feature weights such that feature-
         name has a weight of 1.0.
 
-    --rand-directions <num>
-        MERT will attempt to optimize along all of the principle directions,
-        set this parameter to explore other directions. Defaults to 5.
-
     --source-file <file>
         Dev set source file.
 
-    --weights <file>
-        A file specifying initial feature weights. The format is
-        FeatureName_1 value1
-        FeatureName_2 value2
-
     --workdir <dir>
        Directory for intermediate and output files. If not specified, the
        name is derived from the ini filename. Assuming that the ini
diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro-train/mr_pro_generate_mapper_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/pro-train/mr_pro_generate_mapper_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+for my $hg (@hgs) {
+  my $file = $hg;
+  my $id = $hg;
+  $id =~ s/(\.json)?\.gz//;
+  print "$d/$file $id\n";
+}
+
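To make the new mapper-input format concrete (hypothetical file names): if the hypergraph directory passed as HG_DIR holds 0.json.gz and 1.json.gz, the script prints one line per sentence, pairing the hypergraph path with the sentence id that mr_pro_map later reads back via `is >> file >> sent_id`:

    hgs/0.json.gz 0
    hgs/1.json.gz 1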
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index b046cdea..128d93ce 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -10,6 +10,7 @@
 #include "sampler.h"
 #include "filelib.h"
 #include "stringlib.h"
+#include "weights.h"
 #include "scorer.h"
 #include "inside_outside.h"
 #include "hg_io.h"
@@ -27,10 +28,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
-       ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+       ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-       ("weights,w",po::value<string>(), "[REQD] Current weights file")
+       ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
        ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
        ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
        ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -44,6 +45,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     cerr << "Please specify one or more references using -r <REF.TXT>\n";
     flag = true;
   }
+  if (!conf->count("weights")) {
+    cerr << "Please specify one or more weights using -w <WEIGHTS.TXT>\n";
+    flag = true;
+  }
   if (flag || conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
@@ -51,18 +56,78 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
-  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-1), x(feats) {}
-  double g() {
+  HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
+
+  // lazy evaluation
+  double g(const SentenceScorer& scorer) const {
+    if (g_ == -100.0)
+      g_ = scorer.ScoreCandidate(hyp)->ComputeScore();
     return g_;
   }
- private:
-  int sent_id;
   vector<WordID> hyp;
-  double g_;
+  mutable double g_;
  public:
  SparseVector<double> x;
 };
 
+struct ThresholdAlpha {
+  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
+  double operator()(double mag) const {
+    if (mag < threshold) return 0.0; else return 1.0;
+  }
+  const double threshold;
+};
+
+struct TrainingInstance {
+  TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
+  SparseVector<double> x;
+#ifdef DEBUGGING_PRO
+  vector<WordID> a;
+  vector<WordID> b;
+#endif
+  bool y;
+  double gdiff;
+};
+
+struct DiffOrder {
+  bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
+    return a.gdiff > b.gdiff;
+  }
+};
+
+template<typename Alpha>
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
+  vector<TrainingInstance> v;
+  for (unsigned i = 0; i < gamma; ++i) {
+    size_t a = rng->inclusive(0, J_i.size() - 1)();
+    size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) continue;
+    double ga = J_i[a].g(scorer);
+    double gb = J_i[b].g(scorer);
+    bool positive = ga < gb;
+    if (invert_score) positive = !positive;
+    double gdiff = fabs(ga - gb);
+    if (!gdiff) continue;
+    if (rng->next() < alpha_i(gdiff)) {
+      v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+#ifdef DEBUGGING_PRO
+      v.back().a = J_i[a].hyp;
+      v.back().b = J_i[b].hyp;
+#endif
+    }
+  }
+  vector<TrainingInstance>::iterator mid = v.begin() + xi;
+  if (xi > v.size()) mid = v.end();
+  partial_sort(v.begin(), mid, v.end(), DiffOrder());
+  copy(v.begin(), mid, back_inserter(*pv));
+#ifdef DEBUGGING_PRO
+  if (v.size() >= 5)
+    for (int i = 0; i < 5; ++i) {
+      cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+    }
+#endif
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -81,7 +146,15 @@ int main(int argc, char** argv) {
   const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
+  vector<string> weights_files = conf["weights"].as<vector<string> >();
+  vector<vector<double> > weights(weights_files.size());
+  for (int i = 0; i < weights.size(); ++i) {
+    Weights w;
+    w.InitFromFile(weights_files[i]);
+    w.InitVector(&weights[i]);
+  }
   while(in) {
+    vector<TrainingInstance> v;
     string line;
     getline(in, line);
     if (line.empty()) continue;
@@ -92,18 +165,27 @@ int main(int argc, char** argv) {
     is >> file >> sent_id;
     ReadFile rf(file);
     HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-    vector<HypInfo> J_i;
-    for (int i = 0; i < kbest_size; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-      if (!d) break;
-      float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
-      // if (invert_score) sentscore *= -1.0;
-      // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
-      d->feature_values;
-      sentscore;
+    int start = weights.size();
+    start -= 4;
+    if (start < 0) start = 0;
+    for (int i = start; i < weights.size(); ++i) {
+      hg.Reweight(weights[i]);
+      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+      for (int i = 0; i < kbest_size; ++i) {
+        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+          kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+        if (!d) break;
+        J_i.push_back(HypInfo(d->yield, d->feature_values));
+      }
     }
+
+    Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+    for (unsigned i = 0; i < v.size(); ++i) {
+      const TrainingInstance& vi = v[i];
+      cout << vi.y << "\t" << vi.x << endl;
+      cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;
+    }
   }
   return 0;
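The Sample function added above is the heart of the mapper: draw Gamma random pairs from the merged k-best lists, keep a pair only when its metric gap clears the ThresholdAlpha cutoff, then retain the Xi largest-gap pairs. A minimal standalone sketch of the same procedure, for reference (dense vectors, <random>, and the names Hyp, Pair, and SamplePairs are illustrative stand-ins, not cdec API; unlike the hunk above, it clamps Xi before forming the iterator, since v.begin() + xi is out of range when fewer than Xi pairs survive):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Illustrative stand-ins for HypInfo and TrainingInstance.
struct Hyp  { std::vector<double> x; double g; };        // features, metric score
struct Pair { std::vector<double> dx; bool y; double gdiff; };

// Draw gamma candidate pairs, keep those whose metric gap clears the
// threshold (the ThresholdAlpha step), then retain the xi largest gaps.
// Assumes J is non-empty.
std::vector<Pair> SamplePairs(const std::vector<Hyp>& J, unsigned gamma,
                              unsigned xi, double threshold, std::mt19937& rng) {
  std::uniform_int_distribution<size_t> u(0, J.size() - 1);
  std::vector<Pair> v;
  for (unsigned i = 0; i < gamma; ++i) {
    const size_t a = u(rng), b = u(rng);
    if (a == b) continue;
    const double gdiff = std::fabs(J[a].g - J[b].g);
    if (gdiff < threshold) continue;   // step alpha: reject small gaps
    Pair p;
    p.y = J[a].g < J[b].g;             // label as in the mapper: true iff b outscores a
    p.gdiff = gdiff;
    p.dx.resize(J[a].x.size());
    for (size_t k = 0; k < p.dx.size(); ++k)
      p.dx[k] = J[a].x[k] - J[b].x[k]; // feature difference x_a - x_b
    v.push_back(p);
  }
  // Clamp before taking the iterator so it stays in range when fewer
  // than xi pairs survived the threshold.
  const size_t keep = std::min<size_t>(xi, v.size());
  std::partial_sort(v.begin(), v.begin() + keep, v.end(),
                    [](const Pair& a, const Pair& b) { return a.gdiff > b.gdiff; });
  v.resize(keep);
  return v;
}

Each retained pair is written out twice by main(), once as (y, dx) and once as (!y, -dx), so the reducer sees a perfectly balanced training set; this is why the reducer can insist that the learned bias stay near zero. One caveat visible in the hunk itself: it deletes the old vector<HypInfo> J_i; declaration while the added loop still appends to J_i, so a declaration of J_i must survive somewhere for the new code to compile.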
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 3df52020..2b9c5ce7 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -1,3 +1,4 @@
+#include <cstdlib>
 #include <sstream>
 #include <iostream>
 #include <fstream>
@@ -6,24 +7,29 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "weights.h"
 #include "sparse_vector.h"
-#include "error_surface.h"
-#include "line_optimizer.h"
-#include "b64tools.h"
+#include "optimize.h"
 
 using namespace std;
 namespace po = boost::program_options;
 
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples, so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)")
+        ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
+        ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
+        ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
        ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
   po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  bool flag = conf->count("loss_function") == 0;
-  if (flag || conf->count("help")) {
+  if (conf->count("help")) {
    cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -32,50 +38,127 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
-  if (type == TER || type == AER) {
-    opt_type = LineOptimizer::MINIMIZE_SCORE;
+  string line;
+  vector<pair<bool, SparseVector<double> > > training;
+  int lc = 0;
+  bool flag = false;
+  SparseVector<double> old_weights;
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
   }
-  string last_key;
-  vector<ErrorSurface> esv;
-  while(cin) {
-    string line;
-    getline(cin, line);
+  while(getline(cin, line)) {
+    ++lc;
+    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
     if (line.empty()) continue;
-    size_t ks = line.find("\t");
+    const size_t ks = line.find("\t");
     assert(string::npos != ks);
-    assert(ks > 2);
-    string key = line.substr(2, ks - 2);
-    string val = line.substr(ks + 1);
-    if (key != last_key) {
-      if (!last_key.empty()) {
-        float score;
-        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-        cout << last_key << "|" << x << "|" << score << endl;
+    assert(ks == 1);
+    const bool y = line[0] == '1';
+    SparseVector<double> x;
+    size_t last_start = ks + 1;
+    size_t last_comma = string::npos;
+    size_t cur = last_start;
+    while(cur <= line.size()) {
+      if (line[cur] == ' ' || cur == line.size()) {
+        if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+          cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+          exit(1);
+        }
+        const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+        if (cur < line.size()) line[cur] = 0;
+        const double val = strtod(&line[last_comma + 1], NULL);
+        x.set_value(fid, val);
+
+        last_comma = string::npos;
+        last_start = cur+1;
+      } else {
+        if (line[cur] == '=')
+          last_comma = cur;
+      }
+      ++cur;
+    }
+    training.push_back(make_pair(y, x));
+  }
+  if (flag) cerr << endl;
+
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  vector<double> vg(FD::NumFeats(), 0.0);
+  SparseVector<double> g;
+  bool converged = false;
+  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  while(!converged) {
+    double cll = 0;
+    double dbias = 0;
+    g.clear();
+    for (int i = 0; i < training.size(); ++i) {
+      const double dotprod = training[i].second.dot(x) + x[0]; // x[0] is bias
+      double lp_false = dotprod;
+      double lp_true = -dotprod;
+      if (0 < lp_true) {
+        lp_true += log1p(exp(-lp_true));
+        lp_false = log1p(exp(lp_false));
+      } else {
+        lp_true = log1p(exp(lp_true));
+        lp_false += log1p(exp(-lp_false));
+      }
+      lp_true*=-1;
+      lp_false*=-1;
+      if (training[i].first) {  // true label
+        cll -= lp_true;
+        g -= training[i].second * exp(lp_false);
+        dbias -= exp(lp_false);
+      } else {                  // false label
+        cll -= lp_false;
+        g += training[i].second * exp(lp_true);
+        dbias += exp(lp_true);
       }
-      last_key = key;
-      esv.clear();
     }
-    if (val.size() % 4 != 0) {
-      cerr << "B64 encoding error 1! Skipping.\n";
-      continue;
+    vg.clear();
+    g.init_vector(&vg);
+    vg[0] = dbias;
+#if 1
+    const double sigsq = conf["sigma_squared"].as<double>();
+    double norm = 0;
+    for (int i = 1; i < x.size(); ++i) {
+      const double mean_i = 0.0;
+      const double param = (x[i] - mean_i);
+      norm += param * param;
+      vg[i] += param / sigsq;
+    }
+    const double reg = norm / (2.0 * sigsq);
+#else
+    double reg = 0;
+#endif
+    cll += reg;
+    cerr << cll << " (REG=" << reg << ")\t";
+    bool failed = false;
+    try {
+      opt.Optimize(cll, vg, &x);
+    } catch (...) {
+      cerr << "Exception caught, assuming convergence is close enough...\n";
+      failed = true;
     }
-    string encoded(val.size() / 4 * 3, '\0');
-    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
-      cerr << "B64 encoding error 2! Skipping.\n";
-      continue;
+    if (fabs(x[0]) > MAX_BIAS) {
+      cerr << "Biased model learned. Are your training instances wrong?\n";
+      cerr << "  BIAS: " << x[0] << endl;
     }
-    esv.push_back(ErrorSurface());
-    esv.back().Deserialize(type, encoded);
+    converged = failed || opt.HasConverged();
   }
-  if (!esv.empty()) {
-    // cerr << "ESV=" << esv.size() << endl;
-    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
-    float score;
-    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-    cout << last_key << "|" << x << "|" << score << endl;
+  Weights w;
+  if (conf.count("weights")) {
+    for (int i = 1; i < x.size(); ++i)
+      x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  w.InitFromVector(x);
+  w.WriteToFile("-");
   return 0;
 }
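The training loop above is a regularized maximum-likelihood fit of a binary logistic regression, driven by cdec's LBFGSOptimizer; the lp_true/lp_false branching is a numerically stable evaluation of log-sigmoid via log1p. A compact sketch of the per-instance objective and gradient it accumulates (dense vectors; AddInstance is an illustrative name, not cdec API):

#include <cmath>
#include <vector>

// One exemplar's contribution to the negative conditional log-likelihood
// (cll) and its gradient. w[0] is the bias, exactly as in mr_pro_reduce.cc;
// x uses the same indexing with x[0] unused.
double AddInstance(const std::vector<double>& w, const std::vector<double>& x,
                   bool y, std::vector<double>* grad) {
  double dot = w[0];                                     // bias term
  for (size_t i = 1; i < x.size(); ++i) dot += w[i] * x[i];
  // Stable log-sigmoid:
  //   log p(y=1) = -log1p(exp(-dot)),  log p(y=0) = -dot - log1p(exp(-dot))
  const double lp_true  = dot > 0 ? -log1p(exp(-dot)) : dot - log1p(exp(dot));
  const double lp_false = dot > 0 ? -dot - log1p(exp(-dot)) : -log1p(exp(dot));
  // The gradient scales each feature by the probability of the wrong label.
  const double miss = exp(y ? lp_false : lp_true);
  const double sign = y ? -1.0 : 1.0;
  (*grad)[0] += sign * miss;                             // d(cll)/d(bias)
  for (size_t i = 1; i < x.size(); ++i) (*grad)[i] += sign * miss * x[i];
  return -(y ? lp_true : lp_false);                      // contribution to cll
}

The Gaussian prior adds param/sigma_squared to each gradient component and norm/(2*sigma_squared) to the objective, matching the #if 1 block in the hunk. After convergence, the final weights are interpolated with the previous iteration's, x[i] = psi*x[i] + (1-psi)*old[i], per the --interpolation flag, presumably to damp oscillation between outer iterations.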