author    | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-16 19:13:21 -0400
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-16 19:13:21 -0400
commit    | b89c1f03c89c6c30b88099e4f3e0c1753d338ea7 (patch)
tree      | 109992968d10f9d5f19e8eb34cfd981b6d0c8e8d /pro-train
parent    | 73284fa32176da9f45953055c12951cb69395a02 (diff)
tune regularizer
Diffstat (limited to 'pro-train')
-rwxr-xr-x | pro-train/dist-pro.pl      | 139
-rw-r--r-- | pro-train/mr_pro_reduce.cc | 128
2 files changed, 175 insertions, 92 deletions
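[Editor's note] The reducer touched below is PRO's pairwise classifier: a binary logistic-regression ranker trained with L-BFGS. "tune regularizer" means selecting the variance sigma^2 of its Gaussian prior on a held-out split instead of fixing it in advance. For reference (this is not part of the commit, and assumes the standard zero-mean prior that the sigma_squared option and the mean_i = 0.0 line below suggest), the objective being minimized and the perplexity being reported are:

    \hat{\mathbf{w}} = \arg\min_{\mathbf{w}}\;
        \underbrace{-\sum_{i=1}^{N} \log p(y_i \mid \mathbf{x}_i;\mathbf{w})}_{\mathrm{cll}}
        \;+\; \sum_{j \ge 1} \frac{w_j^2}{2\sigma^2},
    \qquad
    \mathrm{ppl} = 2^{\,\mathrm{cll}/(N \ln 2)}

The bias w_0 is excluded from the penalty and pinned near zero (MAX_BIAS), since a pairwise ranking corpus with balanced positive and negative examples should need no bias.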
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index c42e3876..dbfa329a 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -37,42 +37,36 @@ die "Can't find decoder in $cdec" unless -x $cdec;
 die "Can't find $parallelize" unless -x $parallelize;
 die "Can't find $libcall" unless -e $libcall;
 my $decoder = $cdec;
-my $lines_per_mapper = 100;
+my $lines_per_mapper = 30;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
-my $max_iterations = 15;
-my $optimization_iters = 6;
+my $max_iterations = 30;
 my $decode_nodes = 15;   # number of decode nodes
-my $pmem = "9g";
+my $pmem = "4g";
 my $disable_clean = 0;
 my %seen_weights;
-my $normalize;
 my $help = 0;
 my $epsilon = 0.0001;
-my $interval = 5;
 my $dryrun = 0;
 my $last_score = -10000000;
 my $metric = "ibm_bleu";
 my $dir;
 my $iniFile;
 my $weights;
-my $decoderOpt;
-my $noprimary;
-my $maxsim=0;
-my $oraclen=0;
-my $oracleb=20;
-my $bleu_weight=1;
-my $use_make;  # use make to parallelize line search
-my $dirargs='';
+my $use_make;  # use make to parallelize
 my $usefork;
 my $initial_weights;
 my $pass_suffix = '';
 my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 1e-2;
+
 # Process command-line options
 Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
-	"decoder=s" => \$decoderOpt,
 	"decode-nodes=i" => \$decode_nodes,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
@@ -81,21 +75,13 @@ if (GetOptions(
 	"epsilon=s" => \$epsilon,
 	"help" => \$help,
 	"weights=s" => \$initial_weights,
-	"interval" => \$interval,
-	"iteration=i" => \$iteration,
+	"tune-regularizer" => \$tune_regularizer,
+	"reg=f" => \$reg,
 	"local" => \$run_local,
 	"use-make=i" => \$use_make,
 	"max-iterations=i" => \$max_iterations,
-	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
 	"cpbin!" => \$cpbin,
-	"bleu_weight=s" => \$bleu_weight,
-	"no-primary!" => \$noprimary,
-	"max-similarity=s" => \$maxsim,
-	"oracle-directions=i" => \$oraclen,
-	"n-oracle=i" => \$oraclen,
-	"oracle-batch=i" => \$oracleb,
-	"directions-args=s" => \$dirargs,
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
@@ -108,9 +94,7 @@ if (GetOptions(
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
-  $lines_per_mapper = 40;
-} elsif ($metric =~ /^meteor$/i) {
-  $lines_per_mapper = 2000; # start up time is really high
+  $lines_per_mapper = 5;
 }
 
 ($iniFile) = @ARGV;
@@ -144,8 +128,6 @@ unless ($dir =~ /^\//){  # convert relative path to absolute path
 	$dir = "$basedir/$dir";
 }
 
-if ($decoderOpt){ $decoder = $decoderOpt; }
-
 # Initializations and helper functions
 srand;
 
@@ -378,6 +360,22 @@ while (1){
 	else {$joblist = $joblist . "\|" . $jobid; }
       }
     }
+    my @dev_outs = ();
+    my @devtest_outs = ();
+    if ($tune_regularizer) {
+      for (my $i = 0; $i < scalar @mapoutputs; $i++) {
+        if ($i % 3 == 1) {
+          push @devtest_outs, $mapoutputs[$i];
+        } else {
+          push @dev_outs, $mapoutputs[$i];
+        }
+      }
+      if (scalar @devtest_outs == 0) {
+        die "Not enough training instances for regularization tuning! Rerun without --tune-regularizer\n";
+      }
+    } else {
+      @dev_outs = @mapoutputs;
+    }
     if ($run_local) {
       print STDERR "\nCompleted extraction of training exemplars.\n";
     } elsif ($use_make) {
@@ -399,7 +397,13 @@ while (1){
     }
     my $tol = 0;
     my $til = 0;
-    print STDERR "MO: @mapoutputs\n";
+    my $dev_test_file = "$dir/splag.$im1/devtest.gz";
+    if ($tune_regularizer) {
+      my $cmd = "cat @devtest_outs | gzip > $dev_test_file";
+      check_bash_call($cmd);
+      die "Can't find file $dev_test_file" unless -f $dev_test_file;
+    }
+    #print STDERR "MO: @mapoutputs\n";
     for my $mo (@mapoutputs) {
       #my $olines = get_lines($mo);
       #my $ilines = get_lines($o2i{$mo});
@@ -407,10 +411,24 @@ while (1){
     }
     print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
     print STDERR unchecked_output("date");
-    $cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration";
+    $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+    if ($tune_regularizer) {
+      $cmd .= " -T -t $dev_test_file";
+    }
+    $cmd .= " > $dir/weights.$iteration";
     print STDERR "COMMAND:\n$cmd\n";
     check_bash_call($cmd);
     $lastWeightsFile = "$dir/weights.$iteration";
+    if ($tune_regularizer) {
+      open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!";
+      my $line = <W>;
+      close W;
+      my ($sharp, $label, $nreg) = split /\s|=/, $line;
+      print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n";
+      $reg = $nreg;
+      # only tune regularizer on first iteration?
+      $tune_regularizer = 0;
+    }
     $lastPScore = $score;
     $iteration++;
     print STDERR "\n==========\n";
@@ -473,7 +491,6 @@ sub write_config {
 	print $fh "SOURCE (DEV):     $srcFile\n";
 	print $fh "REFS (DEV):       $refFiles\n";
 	print $fh "EVAL METRIC:      $metric\n";
-	print $fh "START ITERATION:  $iteration\n";
 	print $fh "MAX ITERATIONS:   $max_iterations\n";
 	print $fh "DECODE NODES:     $decode_nodes\n";
 	print $fh "HEAD NODE:        $host\n";
@@ -535,31 +552,38 @@ Usage: $executable [options] <ini file>
        based on certain conventions.  For details, refer to descriptions of
        the options --decoder, --weights, and --workdir.
 
-Options:
+Required:
+
+  --ref-files <files>
+       Dev set ref files.  This option takes only a single string argument.
+       To use multiple files (including file globbing), this argument should
+       be quoted.
+
+  --source-file <file>
+       Dev set source file.
+
+  --weights <file>
+       Initial weights file (use empty file to start from 0)
+
+General options:
 
   --local
        Run the decoder and optimizer locally with a single thread.
 
-  --use-make <I>
-       Use make -j <I> to run the optimizer commands (useful on large
-       shared-memory machines where qsub is unavailable).
-
   --decode-nodes <I>
       Number of decoder processes to run in parallel. [default=15]
 
-  --decoder <decoder path>
-       Decoder binary to use.
-
   --help
        Print this message and exit.
 
-  --iteration <I>
-       Starting iteration number.  If not specified, defaults to 1.
-
   --max-iterations <M>
       Maximum number of iterations to run.  If not specified, defaults
      to 10.
 
+  --metric <method>
+       Metric to optimize.
+       Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
   --pass-suffix <S>
      If the decoder is doing multi-pass decoding, the pass suffix "2",
      "3", etc., is used to control what iteration of weights is set.
@@ -567,21 +591,9 @@ Options:
   --pmem <N>
      Amount of physical memory requested for parallel decoding jobs.
 
-  --ref-files <files>
-       Dev set ref files.  This option takes only a single string argument.
-       To use multiple files (including file globbing), this argument should
-       be quoted.
-
-  --metric <method>
-       Metric to optimize.
-       Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
-  --normalize <feature-name>
-       After each iteration, rescale all feature weights such that feature-
-       name has a weight of 1.0.
-
-  --source-file <file>
-       Dev set source file.
+  --use-make <I>
+       Use make -j <I> to run the optimizer commands (useful on large
+       shared-memory machines where qsub is unavailable).
 
   --workdir <dir>
      Directory for intermediate and output files.  If not specified, the
@@ -591,6 +603,14 @@ Options:
        the filename. E.g. an ini file named decoder.foo.ini would have
        a default working directory name foo.
 
+Regularization options:
+
+  --tune-regularizer
+       Hold out one third of the tuning data and use this to tune the
+       regularization parameter.
+
+  --reg <F>
+       Regularization strength (sigma^2 of the Gaussian prior). [default=0.01]
+
 Help
 }
 
@@ -606,7 +626,6 @@ sub convert {
   }
 }
 
-
 sub cmdline {
 	return join ' ',($0,@ORIG_ARGV);
 }
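[Editor's note] On the driver side, --tune-regularizer changes the data flow of a tuning iteration: every third mapper output (shard index i with i % 3 == 1) is held out and concatenated into splag.$im1/devtest.gz, the reducer is invoked with -s $reg plus -T -t <devtest>, and the tuned strength is read back from the first line of the resulting weights file; after the first iteration the driver sets $tune_regularizer = 0 and reuses the chosen value. A minimal sketch of just the 2:1 split, transliterated from the Perl above into C++ (the shard names are hypothetical; the real list is @mapoutputs):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>
    using namespace std;

    int main() {
      // stand-ins for @mapoutputs; real names come from dist-pro.pl
      vector<string> mapoutputs;
      for (int i = 0; i < 9; ++i) {
        ostringstream name;
        name << "shard." << i << ".pairs.gz";   // hypothetical
        mapoutputs.push_back(name.str());
      }
      // every third shard (i % 3 == 1) is held out for tuning sigma^2
      vector<string> dev_outs, devtest_outs;
      for (size_t i = 0; i < mapoutputs.size(); ++i) {
        if (i % 3 == 1) devtest_outs.push_back(mapoutputs[i]);  // held out
        else            dev_outs.push_back(mapoutputs[i]);      // training
      }
      cout << dev_outs.size() << " training shards, "
           << devtest_outs.size() << " held-out shards" << endl;
      return 0;
    }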
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 491ceb3a..9b422f33 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -16,7 +16,7 @@ using namespace std;
 namespace po = boost::program_options;
 
 // since this is a ranking model, there should be equal numbers of
-// positive and negative examples so the bias should be 0
+// positive and negative examples, so the bias should be 0
 static const double MAX_BIAS = 1e-10;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -25,8 +25,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)")
         ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-        ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
-        ("testset,t",po::value<string>(), "Optional held-out test set to tune regularizer")
+        ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
+        ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strength")
+        ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strength")
+        ("testset,t",po::value<string>(), "Optional held-out test set")
+        ("tune_regularizer,T", "Use the held-out test set (-t) to tune the regularization strength")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -95,8 +98,6 @@ void GradAdd(const SparseVector<double>& v, const double scale, vector<double>*
 double TrainingInference(const vector<double>& x,
                          const vector<pair<bool, SparseVector<double> > >& corpus,
                          vector<double>* g = NULL) {
-  if (g) fill(g->begin(), g->end(), 0.0);
-
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
@@ -130,39 +131,23 @@ double TrainingInference(const vector<double>& x,
   return cll;
 }
 
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  string line;
-  vector<pair<bool, SparseVector<double> > > training, testing;
-  SparseVector<double> old_weights;
-  const double psi = conf["interpolation"].as<double>();
-  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
-  if (conf.count("weights")) {
-    Weights w;
-    w.InitFromFile(conf["weights"].as<string>());
-    w.InitSparseVector(&old_weights);
-  }
-  ReadCorpus(&cin, &training);
-  if (conf.count("testset")) {
-    ReadFile rf(conf["testset"].as<string>());
-    ReadCorpus(rf.stream(), &testing);
-  }
-
-  cerr << "Number of features: " << FD::NumFeats() << endl;
-  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
-  for (SparseVector<double>::const_iterator it = old_weights.begin();
-       it != old_weights.end(); ++it)
-    x[it->first] = it->second;
+// returns held-out perplexity (tppl)
+double LearnParameters(const vector<pair<bool, SparseVector<double> > >& training,
+                       const vector<pair<bool, SparseVector<double> > >& testing,
+                       const double sigsq,
+                       const unsigned memory_buffers,
+                       vector<double>* px) {
+  vector<double>& x = *px;
   vector<double> vg(FD::NumFeats(), 0.0);
   bool converged = false;
-  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
+  double tppl = 0.0;
   while(!converged) {
+    fill(vg.begin(), vg.end(), 0.0);
     double cll = TrainingInference(x, training, &vg);
     double ppl = cll / log(2);
     ppl /= training.size();
     ppl = pow(2.0, ppl);
-    double tppl = 0.0;
 
     // evaluate optional held-out test set
     if (testing.size()) {
@@ -173,7 +158,6 @@ int main(int argc, char** argv) {
 
     // handle regularizer
 #if 1
-    const double sigsq = conf["sigma_squared"].as<double>();
     double norm = 0;
     for (int i = 1; i < x.size(); ++i) {
       const double mean_i = 0.0;
@@ -202,11 +186,91 @@ int main(int argc, char** argv) {
       cerr << "  BIAS: " << x[0] << endl;
     }
   }
+  return tppl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  string line;
+  vector<pair<bool, SparseVector<double> > > training, testing;
+  SparseVector<double> old_weights;
+  const bool tune_regularizer = conf.count("tune_regularizer");
+  if (tune_regularizer && !conf.count("testset")) {
+    cerr << "--tune_regularizer requires --testset to be set\n";
+    return 1;
+  }
+  const double min_reg = conf["min_reg"].as<double>();
+  const double max_reg = conf["max_reg"].as<double>();
+  double sigsq = conf["sigma_squared"].as<double>();
+  assert(sigsq > 0.0);
+  assert(min_reg > 0.0);
+  assert(max_reg > 0.0);
+  assert(max_reg > min_reg);
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
+  }
+  ReadCorpus(&cin, &training);
+  if (conf.count("testset")) {
+    ReadFile rf(conf["testset"].as<string>());
+    ReadCorpus(rf.stream(), &testing);
+  }
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  double tppl = 0.0;
+  vector<pair<double,double> > sp;
+  vector<double> smoothed;
+  if (tune_regularizer) {
+    sigsq = min_reg;
+    const double steps = 18;
+    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
+    cerr << "SWEEP FACTOR: " << sweep_factor << endl;
+    while(sigsq < max_reg) {
+      tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+      sp.push_back(make_pair(sigsq, tppl));
+      sigsq *= sweep_factor;
+    }
+    smoothed.resize(sp.size(), 0);
+    smoothed[0] = sp[0].second;
+    smoothed.back() = sp.back().second;
+    for (int i = 1; i < sp.size()-1; ++i) {
+      double prev = sp[i-1].second;
+      double next = sp[i+1].second;
+      double cur = sp[i].second;
+      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
+    }
+    double best_ppl = 9999999;
+    unsigned best_i = 0;
+    for (unsigned i = 0; i < sp.size(); ++i) {
+      if (smoothed[i] < best_ppl) {
+        best_ppl = smoothed[i];
+        best_i = i;
+      }
+    }
+    sigsq = sp[best_i].first;
+    tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+  }
   Weights w;
   if (conf.count("weights")) {
     for (int i = 1; i < x.size(); ++i)
       x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  cout.precision(15);
+  cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
+  if (sp.size()) {
+    cout << "# Parameter sweep:\n";
+    for (int i = 0; i < sp.size(); ++i) {
+      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
+    }
+  }
   w.InitFromVector(x);
   w.WriteToFile("-");
   return 0;
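[Editor's note] The search that replaces the old fixed sigma_squared is a log-space grid sweep with light smoothing, not a line search: 18 multiplicative steps between min_reg and max_reg, a 0.2/0.6/0.2 window over the held-out perplexities, then one refit at the argmin. A self-contained sketch under those assumptions, with a toy EvalAt() standing in for LearnParameters():

    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>
    using namespace std;

    // stand-in for retraining at a given sigma^2 and returning held-out
    // perplexity; here a toy curve that is convex in log(sigma^2)
    double EvalAt(double sigsq) {
      double d = log(sigsq) - log(0.03);
      return 1.2 + d * d;
    }

    int main() {
      const double min_reg = 1e-8, max_reg = 10.0, steps = 18;
      const double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
      vector<pair<double, double> > sp;           // (sigma^2, held-out ppl)
      for (double sigsq = min_reg; sigsq < max_reg; sigsq *= sweep_factor)
        sp.push_back(make_pair(sigsq, EvalAt(sigsq)));
      vector<double> smoothed(sp.size());
      smoothed.front() = sp.front().second;       // endpoints kept as-is
      smoothed.back() = sp.back().second;
      for (size_t i = 1; i + 1 < sp.size(); ++i)  // 0.2/0.6/0.2 window
        smoothed[i] = 0.2 * sp[i-1].second + 0.6 * sp[i].second
                    + 0.2 * sp[i+1].second;
      size_t best = 0;                            // argmin of smoothed curve
      for (size_t i = 1; i < sp.size(); ++i)
        if (smoothed[i] < smoothed[best]) best = i;
      printf("best sigma^2 = %g (smoothed ppl %g)\n",
             sp[best].first, smoothed[best]);
      return 0;                                   // the real code refits here
    }

One design note: because each sweep point passes the same weight vector x back into LearnParameters, every fit warm-starts from the previous one, so the 18 retrainings are cheaper than they look; the smoothing guards against choosing a sigma^2 that only looks best due to optimizer convergence noise.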