tune regularizer

author: Chris Dyer <cdyer@cs.cmu.edu> 2011-07-16 19:13:21 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-07-16 19:13:21 -0400
commit: c3828b0a2deb42de5c7378e93f93f5e69efb304c (patch)
tree: 0f7c0dc043caac5fc28e52c54da2746272bf1887 /pro-train
parent: 816bee82abc909335d4f3a300cff99afa4dd1da5 (diff)
2 files changed, 175 insertions, 92 deletions
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index c42e3876..dbfa329a 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -37,42 +37,36 @@ die "Can't find decoder in $cdec" unless -x $cdec;
 die "Can't find $parallelize" unless -x $parallelize;
 die "Can't find $libcall" unless -e $libcall;
 my $decoder = $cdec;
-my $lines_per_mapper = 100;
+my $lines_per_mapper = 30;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
-my $max_iterations = 15;
-my $optimization_iters = 6;
+my $max_iterations = 30;
 my $decode_nodes = 15;   # number of decode nodes
-my $pmem = "9g";
+my $pmem = "4g";
 my $disable_clean = 0;
 my %seen_weights;
-my $normalize;
 my $help = 0;
 my $epsilon = 0.0001;
-my $interval = 5;
 my $dryrun = 0;
 my $last_score = -10000000;
 my $metric = "ibm_bleu";
 my $dir;
 my $iniFile;
 my $weights;
-my $decoderOpt;
-my $noprimary;
-my $maxsim=0;
-my $oraclen=0;
-my $oracleb=20;
-my $bleu_weight=1;
-my $use_make;  # use make to parallelize line search
-my $dirargs='';
+my $use_make;  # use make to parallelize
 my $usefork;
 my $initial_weights;
 my $pass_suffix = '';
 my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 1e-2;
+
 # Process command-line options
 Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
-	"decoder=s" => \$decoderOpt,
 	"decode-nodes=i" => \$decode_nodes,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
@@ -81,21 +75,13 @@ if (GetOptions(
 	"epsilon=s" => \$epsilon,
 	"help" => \$help,
         "weights=s" => \$initial_weights,
-	"interval" => \$interval,
-	"iteration=i" => \$iteration,
+	"tune-regularizer" => \$tune_regularizer,
+	"reg=f" => \$reg,
 	"local" => \$run_local,
 	"use-make=i" => \$use_make,
 	"max-iterations=i" => \$max_iterations,
-	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
         "cpbin!" => \$cpbin,
-        "bleu_weight=s" => \$bleu_weight,
-        "no-primary!" => \$noprimary,
-        "max-similarity=s" => \$maxsim,
-        "oracle-directions=i" => \$oraclen,
-        "n-oracle=i" => \$oraclen,
-        "oracle-batch=i" => \$oracleb,
-        "directions-args=s" => \$dirargs,
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
@@ -108,9 +94,7 @@ if (GetOptions(
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
-  $lines_per_mapper = 40;
-} elsif ($metric =~ /^meteor$/i) {
-  $lines_per_mapper = 2000;   # start up time is really high
+  $lines_per_mapper = 5;
 }
 
 ($iniFile) = @ARGV;
@@ -144,8 +128,6 @@ unless ($dir =~ /^\//){  # convert relative path to absolute path
 	$dir = "$basedir/$dir";
 }
 
-if ($decoderOpt){ $decoder = $decoderOpt; }
-
 
 # Initializations and helper functions
 srand;
@@ -378,6 +360,22 @@ while (1){
 			else {$joblist = $joblist . "\|" . $jobid; }
 		}
 	}
+	my @dev_outs = ();
+	my @devtest_outs = ();
+	if ($tune_regularizer) {
+		for (my $i = 0; $i < scalar @mapoutputs; $i++) {
+			if ($i % 3 == 1) {
+				push @devtest_outs, $mapoutputs[$i];
+			} else {
+				push @dev_outs, $mapoutputs[$i];
+			}
+		}
+		if (scalar @devtest_outs == 0) {
+			die "Not enough training instances for regularization tuning! Rerun without --tune-regularizer\n";
+		}
+	} else {
+		@dev_outs = @mapoutputs;
+	}
 	if ($run_local) {
 		print STDERR "\nCompleted extraction of training exemplars.\n";
 	} elsif ($use_make) {
@@ -399,7 +397,13 @@ while (1){
 	}
 	my $tol = 0;
 	my $til = 0;
-        print STDERR "MO: @mapoutputs\n";
+	my $dev_test_file = "$dir/splag.$im1/devtest.gz";
+	if ($tune_regularizer) {
+		my $cmd = "cat @devtest_outs | gzip > $dev_test_file";
+		check_bash_call($cmd);
+		die "Can't find file $dev_test_file" unless -f $dev_test_file;
+	}
+        #print STDERR "MO: @mapoutputs\n";
 	for my $mo (@mapoutputs) {
 		#my $olines = get_lines($mo);
 		#my $ilines = get_lines($o2i{$mo});
@@ -407,10 +411,24 @@ while (1){
 	}
 	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
 	print STDERR unchecked_output("date");
-	$cmd="cat @mapoutputs | $REDUCER -w $dir/weights.$im1 > $dir/weights.$iteration";
+	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+	if ($tune_regularizer) {
+		$cmd .= " -T -t $dev_test_file";
+	}
+        $cmd .= " > $dir/weights.$iteration";
 	print STDERR "COMMAND:\n$cmd\n";
 	check_bash_call($cmd);
 	$lastWeightsFile = "$dir/weights.$iteration";
+	if ($tune_regularizer) {
+		open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!";
+		my $line = <W>;
+		close W;
+		my ($sharp, $label, $nreg) = split /\s|=/, $line;
+		print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n";
+		$reg = $nreg;
+		# only tune regularizer on first iteration?
+		$tune_regularizer = 0;
+	}
 	$lastPScore = $score;
 	$iteration++;
 	print STDERR "\n==========\n";
@@ -473,7 +491,6 @@ sub write_config {
 	print $fh "SOURCE (DEV):     $srcFile\n";
 	print $fh "REFS (DEV):       $refFiles\n";
 	print $fh "EVAL METRIC:      $metric\n";
-	print $fh "START ITERATION:  $iteration\n";
 	print $fh "MAX ITERATIONS:   $max_iterations\n";
 	print $fh "DECODE NODES:     $decode_nodes\n";
 	print $fh "HEAD NODE:        $host\n";
@@ -535,31 +552,38 @@ Usage: $executable [options] <ini file>
 		based on certain conventions.  For details, refer to descriptions
 		of the options --decoder, --weights, and --workdir.
 
-Options:
+Required:
+
+	--ref-files <files>
+		Dev set ref files.  This option takes only a single string argument.
+		To use multiple files (including file globbing), this argument should
+		be quoted.
+
+	--source-file <file>
+		Dev set source file.
+
+	--weights <file>
+		Initial weights file (use empty file to start from 0)
+
+General options:
 
 	--local
 		Run the decoder and optimizer locally with a single thread.
 
-	--use-make <I>
-		Use make -j <I> to run the optimizer commands (useful on large
-		shared-memory machines where qsub is unavailable).
-
 	--decode-nodes <I>
 		Number of decoder processes to run in parallel. [default=15]
 
-	--decoder <decoder path>
-		Decoder binary to use.
-
 	--help
 		Print this message and exit.
 
-	--iteration <I>
-		Starting iteration number.  If not specified, defaults to 1.
-
 	--max-iterations <M>
 		Maximum number of iterations to run.  If not specified, defaults
 		to 10.
 
+	--metric <method>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
 	--pass-suffix <S>
 		If the decoder is doing multi-pass decoding, the pass suffix "2",
 		"3", etc., is used to control what iteration of weights is set.
@@ -567,21 +591,9 @@ Options:
 	--pmem <N>
 		Amount of physical memory requested for parallel decoding jobs.
 
-	--ref-files <files>
-		Dev set ref files.  This option takes only a single string argument.
-		To use multiple files (including file globbing), this argument should
-		be quoted.
-
-	--metric <method>
-		Metric to optimize.
-		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
-	--normalize <feature-name>
-		After each iteration, rescale all feature weights such that feature-
-		name has a weight of 1.0.
-
-	--source-file <file>
-		Dev set source file.
+	--use-make <I>
+		Use make -j <I> to run the optimizer commands (useful on large
+		shared-memory machines where qsub is unavailable).
 
 	--workdir <dir>
 		Directory for intermediate and output files.  If not specified, the
@@ -591,6 +603,14 @@ Options:
 		the filename.  E.g. an ini file named decoder.foo.ini would have
 		a default working directory name foo.
 
+Regularization options:
+
+	--tune-regularizer
+		Hold out one third of the tuning data and use this to tune the
+		regularization parameter.
+
+	--reg <F>
+
 Help
 }
 
@@ -606,7 +626,6 @@ sub convert {
 }
 
 
-
 sub cmdline {
     return join ' ',($0,@ORIG_ARGV);
 }
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 491ceb3a..9b422f33 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -16,7 +16,7 @@ using namespace std;
 namespace po = boost::program_options;
 
 // since this is a ranking model, there should be equal numbers of
-// positive and negative examples so the bias should be 0
+// positive and negative examples, so the bias should be 0
 static const double MAX_BIAS = 1e-10;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -25,8 +25,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
         ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-        ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
-        ("testset,t",po::value<string>(), "Optional held-out test set to tune regularizer")
+        ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
+        ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strength")
+        ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strength")
+        ("testset,t",po::value<string>(), "Optional held-out test set")
+        ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -95,8 +98,6 @@ void GradAdd(const SparseVector<double>& v, const double scale, vector<double>*
 double TrainingInference(const vector<double>& x,
                          const vector<pair<bool, SparseVector<double> > >& corpus,
                          vector<double>* g = NULL) {
-  if (g) fill(g->begin(), g->end(), 0.0);
-
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias
@@ -130,39 +131,23 @@ double TrainingInference(const vector<double>& x,
   return cll;
 }
 
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  string line;
-  vector<pair<bool, SparseVector<double> > > training, testing;
-  SparseVector<double> old_weights;
-  const double psi = conf["interpolation"].as<double>();
-  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
-  if (conf.count("weights")) {
-    Weights w;
-    w.InitFromFile(conf["weights"].as<string>());
-    w.InitSparseVector(&old_weights);
-  }
-  ReadCorpus(&cin, &training);
-  if (conf.count("testset")) {
-    ReadFile rf(conf["testset"].as<string>());
-    ReadCorpus(rf.stream(), &testing);
-  }
-
-  cerr << "Number of features: " << FD::NumFeats() << endl;
-  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
-  for (SparseVector<double>::const_iterator it = old_weights.begin();
-       it != old_weights.end(); ++it)
-    x[it->first] = it->second;
+// return held-out log likelihood
+double LearnParameters(const vector<pair<bool, SparseVector<double> > >& training,
+                       const vector<pair<bool, SparseVector<double> > >& testing,
+                       const double sigsq,
+                       const unsigned memory_buffers,
+                       vector<double>* px) {
+  vector<double>& x = *px;
   vector<double> vg(FD::NumFeats(), 0.0);
   bool converged = false;
-  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
+  double tppl = 0.0;
   while(!converged) {
+    fill(vg.begin(), vg.end(), 0.0);
     double cll = TrainingInference(x, training, &vg);
     double ppl = cll / log(2);
     ppl /= training.size();
     ppl = pow(2.0, ppl);
-    double tppl = 0.0;
 
     // evaluate optional held-out test set
     if (testing.size()) {
@@ -173,7 +158,6 @@ int main(int argc, char** argv) {
 
     // handle regularizer
 #if 1
-    const double sigsq = conf["sigma_squared"].as<double>();
     double norm = 0;
     for (int i = 1; i < x.size(); ++i) {
       const double mean_i = 0.0;
@@ -202,11 +186,91 @@ int main(int argc, char** argv) {
       cerr << "  BIAS: " << x[0] << endl;
     }
   }
+  return tppl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  string line;
+  vector<pair<bool, SparseVector<double> > > training, testing;
+  SparseVector<double> old_weights;
+  const bool tune_regularizer = conf.count("tune_regularizer");
+  if (tune_regularizer && !conf.count("testset")) {
+    cerr << "--tune_regularizer requires --testset to be set\n";
+    return 1;
+  }
+  const double min_reg = conf["min_reg"].as<double>();
+  const double max_reg = conf["max_reg"].as<double>();
+  double sigsq = conf["sigma_squared"].as<double>();
+  assert(sigsq > 0.0);
+  assert(min_reg > 0.0);
+  assert(max_reg > 0.0);
+  assert(max_reg > min_reg);
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
+  }
+  ReadCorpus(&cin, &training);
+  if (conf.count("testset")) {
+    ReadFile rf(conf["testset"].as<string>());
+    ReadCorpus(rf.stream(), &testing);
+  }
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  double tppl = 0.0;
+  vector<pair<double,double> > sp;
+  vector<double> smoothed;
+  if (tune_regularizer) {
+    sigsq = min_reg;
+    const double steps = 18;
+    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
+    cerr << "SWEEP FACTOR: " << sweep_factor << endl;
+    while(sigsq < max_reg) {
+      tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+      sp.push_back(make_pair(sigsq, tppl));
+      sigsq *= sweep_factor;
+    }
+    smoothed.resize(sp.size(), 0);
+    smoothed[0] = sp[0].second;
+    smoothed.back() = sp.back().second; 
+    for (int i = 1; i < sp.size()-1; ++i) {
+      double prev = sp[i-1].second;
+      double next = sp[i+1].second;
+      double cur = sp[i].second;
+      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
+    }
+    double best_ppl = 9999999;
+    unsigned best_i = 0;
+    for (unsigned i = 0; i < sp.size(); ++i) {
+      if (smoothed[i] < best_ppl) {
+        best_ppl = smoothed[i];
+        best_i = i;
+      }
+    }
+    sigsq = sp[best_i].first;
+    tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+  }
   Weights w;
   if (conf.count("weights")) {
     for (int i = 1; i < x.size(); ++i)
       x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  cout.precision(15);
+  cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
+  if (sp.size()) {
+    cout << "# Parameter sweep:\n";
+    for (int i = 0; i < sp.size(); ++i) {
+      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
+    }
+  }
   w.InitFromVector(x);
   w.WriteToFile("-");
   return 0;
author	Chris Dyer <cdyer@cs.cmu.edu>	2011-07-16 19:13:21 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2011-07-16 19:13:21 -0400
commit	c3828b0a2deb42de5c7378e93f93f5e69efb304c (patch)
tree	0f7c0dc043caac5fc28e52c54da2746272bf1887 /pro-train
parent	816bee82abc909335d4f3a300cff99afa4dd1da5 (diff)