-rwxr-xr-x  pro-train/dist-pro.pl      | 22
-rw-r--r--  pro-train/mr_pro_reduce.cc | 82
2 files changed, 66 insertions, 38 deletions
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index dbfa329a..4bc9cfe3 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -41,6 +41,7 @@ my $lines_per_mapper = 30;
 my $iteration = 1;
 my $run_local = 0;
 my $best_weights;
+my $psi = 1;
 my $max_iterations = 30;
 my $decode_nodes = 15;   # number of decode nodes
 my $pmem = "4g";
@@ -62,7 +63,8 @@ my $cpbin=1;
 
 # regularization strength
 my $tune_regularizer = 0;
-my $reg = 1e-2;
+my $reg = 10;
+my $reg_previous = 0;
 
 # Process command-line options
 Getopt::Long::Configure("no_auto_abbrev");
@@ -73,10 +75,12 @@ if (GetOptions(
         "use-fork" => \$usefork,
 	"dry-run" => \$dryrun,
 	"epsilon=s" => \$epsilon,
+	"interpolate-with-weights=f" => \$psi,
 	"help" => \$help,
         "weights=s" => \$initial_weights,
 	"tune-regularizer" => \$tune_regularizer,
 	"reg=f" => \$reg,
+	"reg-previous=f" => \$reg_previous,
 	"local" => \$run_local,
 	"use-make=i" => \$use_make,
 	"max-iterations=i" => \$max_iterations,
@@ -91,6 +95,8 @@ if (GetOptions(
 	exit;
 }
 
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
@@ -411,7 +417,7 @@ while (1){
 	}
 	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
 	print STDERR unchecked_output("date");
-	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
 	if ($tune_regularizer) {
 		$cmd .= " -T -t $dev_test_file";
 	}
@@ -605,11 +611,21 @@ General options:
 
 Regularization options:
 
+	--interpolate-with-weights <F>
+		[deprecated] At each iteration the resulting weights are
+                interpolated with the weights from the previous iteration, with
+                this factor.
+
 	--tune-regularizer
 		Hold out one third of the tuning data and used this to tune the
-		regularization parameter.
+		regularization parameter. [this doesn't work well]
 
 	--reg <F>
+		l2 regularization strength
+
+	--reg-previous <F>
+		l2 penalty for moving away from the weights from the previous
+		iteration.
 
 Help
 }
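The core of this change is the combined penalty added to mr_pro_reduce.cc in the second diff below: an ordinary l2 term C * sum_i w_i^2 plus a term T * sum_i (w_i - w_prev_i)^2 that penalizes moving away from the previous iteration's weights, replacing both the old Gaussian-prior regularizer and (in spirit) the deprecated output interpolation. The following is a minimal standalone sketch that mirrors the new function; the weight_t typedef and the main() with its test values are illustrative additions, not part of the commit:

#include <cassert>
#include <cstdio>
#include <vector>

typedef double weight_t;  // in cdec this typedef comes from the weights header

// Mirror of the penalty added in this commit: returns
//   C * sum_i w_i^2  +  T * sum_i (w_i - w_prev_i)^2
// and accumulates its gradient, 2*C*w_i + 2*T*(w_i - w_prev_i), into *g.
double ApplyRegularizationTerms(const double C, const double T,
                                const std::vector<weight_t>& weights,
                                const std::vector<weight_t>& prev_weights,
                                std::vector<weight_t>* g) {
  assert(weights.size() == g->size());
  double reg = 0;
  for (size_t i = 0; i < weights.size(); ++i) {
    // Features unseen in the previous iteration count as zero.
    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
    const double w_i = weights[i];
    reg += C * w_i * w_i;
    (*g)[i] += 2 * C * w_i;
    const double diff_i = w_i - prev_w_i;
    reg += T * diff_i * diff_i;
    (*g)[i] += 2 * T * diff_i;
  }
  return reg;
}

int main() {
  std::vector<weight_t> w(2), prev(2), g(2, 0.0);
  w[0] = 0.5; w[1] = -1.0; prev[0] = 0.0; prev[1] = -2.0;
  // C=10, T=0 matches the new dist-pro.pl defaults ($reg=10, $reg_previous=0),
  // i.e. plain l2 with no pull toward the previous iteration's weights.
  const double reg = ApplyRegularizationTerms(10.0, 0.0, w, prev, &g);
  std::printf("penalty=%g  grad=[%g %g]\n", reg, g[0], g[1]);  // 12.5, [10 -20]
  return 0;
}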
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index aff410a0..98cddba2 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -23,11 +23,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
-        ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
+        ("regularize_to_weights,y",po::value<double>()->default_value(0.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
+        ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-        ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
-        ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght")
-        ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght")
+        ("regularization_strength,C",po::value<double>()->default_value(1.0), "l2 regularization strength")
+        ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght")
+        ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght")
         ("testset,t",po::value<string>(), "Optional held-out test set")
         ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
         ("help,h", "Help");
@@ -95,6 +96,27 @@ void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_
   }
 }
 
+double ApplyRegularizationTerms(const double C,
+                                const double T,
+                                const vector<weight_t>& weights,
+                                const vector<weight_t>& prev_weights,
+                                vector<weight_t>* g) {
+  assert(weights.size() == g->size());
+  double reg = 0;
+  for (size_t i = 0; i < weights.size(); ++i) {
+    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
+    const double& w_i = weights[i];
+    double& g_i = (*g)[i];
+    reg += C * w_i * w_i;
+    g_i += 2 * C * w_i;
+
+    const double diff_i = w_i - prev_w_i;
+    reg += T * diff_i * diff_i;
+    g_i += 2 * T * diff_i;
+  }
+  return reg;
+}
+
 double TrainingInference(const vector<weight_t>& x,
                          const vector<pair<bool, SparseVector<weight_t> > >& corpus,
                          vector<weight_t>* g = NULL) {
@@ -134,8 +156,10 @@ double TrainingInference(const vector<weight_t>& x,
 // return held-out log likelihood
 double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
                        const vector<pair<bool, SparseVector<weight_t> > >& testing,
-                       const double sigsq,
+                       const double C,
+                       const double T,
                        const unsigned memory_buffers,
+                       const vector<weight_t>& prev_x,
                        vector<weight_t>* px) {
   vector<weight_t>& x = *px;
   vector<weight_t> vg(FD::NumFeats(), 0.0);
@@ -157,26 +181,12 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
     }
     // handle regularizer
-#if 1
-    double norm = 0;
-    for (int i = 1; i < x.size(); ++i) {
-      const double mean_i = 0.0;
-      const double param = (x[i] - mean_i);
-      norm += param * param;
-      vg[i] += param / sigsq;
-    }
-    const double reg = norm / (2.0 * sigsq);
-#else
-    double reg = 0;
-#endif
+    double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
     cll += reg;
-    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t";
+    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
     try {
-      vector<weight_t> old_x = x;
-      do {
-        opt.Optimize(cll, vg, &x);
-        converged = opt.HasConverged();
-      } while (!converged && x == old_x);
+      opt.Optimize(cll, vg, &x);
+      converged = opt.HasConverged();
     } catch (...) {
       cerr << "Exception caught, assuming convergence is close enough...\n";
       converged = true;
@@ -201,13 +211,14 @@ int main(int argc, char** argv) {
   }
   const double min_reg = conf["min_reg"].as<double>();
   const double max_reg = conf["max_reg"].as<double>();
-  double sigsq = conf["sigma_squared"].as<double>(); // will be overridden if parameter is tuned
-  assert(sigsq > 0.0);
+  double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
+  const double T = conf["regularize_to_weights"].as<double>();
+  assert(C > 0.0);
   assert(min_reg > 0.0);
   assert(max_reg > 0.0);
   assert(max_reg > min_reg);
-  const double psi = conf["interpolation"].as<double>();
-  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  const double psi = conf["interpolate_with_weights"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
   ReadCorpus(&cin, &training);
   if (conf.count("testset")) {
     ReadFile rf(conf["testset"].as<string>());
@@ -231,14 +242,15 @@ int main(int argc, char** argv) {
   vector<pair<double,double> > sp;
   vector<double> smoothed;
   if (tune_regularizer) {
-    sigsq = min_reg;
+    C = min_reg;
     const double steps = 18;
     double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
     cerr << "SWEEP FACTOR: " << sweep_factor << endl;
-    while(sigsq < max_reg) {
-      tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
-      sp.push_back(make_pair(sigsq, tppl));
-      sigsq *= sweep_factor;
+    while(C < max_reg) {
+      cerr << "C=" << C << "\tT=" << T << endl;
+      tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+      sp.push_back(make_pair(C, tppl));
+      C *= sweep_factor;
     }
     smoothed.resize(sp.size(), 0);
     smoothed[0] = sp[0].second;
@@ -257,16 +269,16 @@ int main(int argc, char** argv) {
         best_i = i;
       }
     }
-    sigsq = sp[best_i].first;
+    C = sp[best_i].first;
   }  // tune regularizer
-  tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+  tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
   if (conf.count("weights")) {
     for (int i = 1; i < x.size(); ++i) {
       x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
     }
   }
   cout.precision(15);
-  cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+  cout << "# C=" << C << "\theld out perplexity=";
   if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
   if (sp.size()) {
     cout << "# Parameter sweep:\n";
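With -T, main() now sweeps the l2 strength C geometrically between the new min_reg/max_reg defaults and keeps the value with the best smoothed held-out perplexity; T stays fixed at whatever -y was given, so only the plain l2 term is tuned. A minimal standalone sketch of just that schedule, with the defaults from this diff hard-coded:

#include <cmath>
#include <cstdio>

int main() {
  // New defaults from this diff: min_reg=0.01, max_reg=1e6, 18 steps.
  const double min_reg = 0.01, max_reg = 1e6, steps = 18;
  const double sweep_factor = std::exp((std::log(max_reg) - std::log(min_reg)) / steps);
  std::printf("SWEEP FACTOR: %g\n", sweep_factor);  // exp(log(1e8)/18) ~= 2.78
  // Candidate strengths tried by LearnParameters: 0.01, ~0.028, ..., up to 1e6.
  for (double C = min_reg; C < max_reg; C *= sweep_factor)
    std::printf("C=%g\n", C);
  return 0;
}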
