From bcda3258ab35cba2f71e28e1c93863958f5aca8b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 7 Nov 2011 18:09:47 -0500 Subject: updates to pro to support regularization to previous weight vectors, regularization normalization, disable broken regularization tuning --- pro-train/dist-pro.pl | 22 +++++++++++-- pro-train/mr_pro_reduce.cc | 82 ++++++++++++++++++++++++++-------------------- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index dbfa329a..4bc9cfe3 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -41,6 +41,7 @@ my $lines_per_mapper = 30; my $iteration = 1; my $run_local = 0; my $best_weights; +my $psi = 1; my $max_iterations = 30; my $decode_nodes = 15; # number of decode nodes my $pmem = "4g"; @@ -62,7 +63,8 @@ my $cpbin=1; # regularization strength my $tune_regularizer = 0; -my $reg = 1e-2; +my $reg = 10; +my $reg_previous = 0; # Process command-line options Getopt::Long::Configure("no_auto_abbrev"); @@ -73,10 +75,12 @@ if (GetOptions( "use-fork" => \$usefork, "dry-run" => \$dryrun, "epsilon=s" => \$epsilon, + "interpolate-with-weights=f" => \$psi, "help" => \$help, "weights=s" => \$initial_weights, "tune-regularizer" => \$tune_regularizer, "reg=f" => \$reg, + "reg-previous=f" => \$reg_previous, "local" => \$run_local, "use-make=i" => \$use_make, "max-iterations=i" => \$max_iterations, @@ -91,6 +95,8 @@ if (GetOptions( exit; } +die "--tune-regularizer is no longer supported with --reg-previous and --reg. 
Please tune manually.\n" if $tune_regularizer; + if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; } if ($metric =~ /^(combi|ter)$/i) { @@ -411,7 +417,7 @@ while (1){ } print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; print STDERR unchecked_output("date"); - $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg"; + $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi"; if ($tune_regularizer) { $cmd .= " -T -t $dev_test_file"; } @@ -605,11 +611,21 @@ General options: Regularization options: + --interpolate-with-weights + [deprecated] At each iteration the resulting weights are + interpolated with the weights from the previous iteration, with + this factor. + --tune-regularizer Hold out one third of the tuning data and used this to tune the - regularization parameter. + regularization parameter. [this doesn't work well] --reg + l2 regularization strength + + --reg-previous + l2 penalty for moving away from the weights from the previous + iteration. 
Help } diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index aff410a0..98cddba2 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -23,11 +23,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") - ("interpolation,p",po::value()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") + ("regularize_to_weights,y",po::value()->default_value(0.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect") + ("interpolate_with_weights,p",po::value()->default_value(1.0), "Output weights are p*w + (1-p)*w_prev; 1.0 = no effect") ("memory_buffers,m",po::value()->default_value(200), "Number of memory buffers (LBFGS)") - ("sigma_squared,s",po::value()->default_value(0.1), "Sigma squared for Gaussian prior") - ("min_reg,r",po::value()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght") - ("max_reg,R",po::value()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght") + ("regularization_strength,C",po::value()->default_value(1.0), "l2 regularization strength") + ("min_reg,r",po::value()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght") + ("max_reg,R",po::value()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght") ("testset,t",po::value(), "Optional held-out test set") ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") ("help,h", "Help"); @@ -95,6 +96,27 @@ void GradAdd(const SparseVector& v, const double scale, vector& weights, + const vector& prev_weights, + vector* g) { + assert(weights.size() == g->size()); + double reg = 0; + for (size_t 
i = 0; i < weights.size(); ++i) { + const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0); + const double& w_i = weights[i]; + double& g_i = (*g)[i]; + reg += C * w_i * w_i; + g_i += 2 * C * w_i; + + const double diff_i = w_i - prev_w_i; + reg += T * diff_i * diff_i; + g_i += 2 * T * diff_i; + } + return reg; +} + double TrainingInference(const vector& x, const vector > >& corpus, vector* g = NULL) { @@ -134,8 +156,10 @@ double TrainingInference(const vector& x, // return held-out log likelihood double LearnParameters(const vector > >& training, const vector > >& testing, - const double sigsq, + const double C, + const double T, const unsigned memory_buffers, + const vector& prev_x, vector* px) { vector& x = *px; vector vg(FD::NumFeats(), 0.0); @@ -157,26 +181,12 @@ double LearnParameters(const vector > >& train } // handle regularizer -#if 1 - double norm = 0; - for (int i = 1; i < x.size(); ++i) { - const double mean_i = 0.0; - const double param = (x[i] - mean_i); - norm += param * param; - vg[i] += param / sigsq; - } - const double reg = norm / (2.0 * sigsq); -#else - double reg = 0; -#endif + double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg); cll += reg; - cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t"; + cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl; try { - vector old_x = x; - do { - opt.Optimize(cll, vg, &x); - converged = opt.HasConverged(); - } while (!converged && x == old_x); + opt.Optimize(cll, vg, &x); + converged = opt.HasConverged(); } catch (...) 
{ cerr << "Exception caught, assuming convergence is close enough...\n"; converged = true; @@ -201,13 +211,14 @@ int main(int argc, char** argv) { } const double min_reg = conf["min_reg"].as(); const double max_reg = conf["max_reg"].as(); - double sigsq = conf["sigma_squared"].as(); // will be overridden if parameter is tuned - assert(sigsq > 0.0); + double C = conf["regularization_strength"].as(); // will be overridden if parameter is tuned + const double T = conf["regularize_to_weights"].as(); + assert(C > 0.0); assert(min_reg > 0.0); assert(max_reg > 0.0); assert(max_reg > min_reg); - const double psi = conf["interpolation"].as(); - if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } + const double psi = conf["interpolate_with_weights"].as(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; } ReadCorpus(&cin, &training); if (conf.count("testset")) { ReadFile rf(conf["testset"].as()); @@ -231,14 +242,15 @@ int main(int argc, char** argv) { vector > sp; vector smoothed; if (tune_regularizer) { - sigsq = min_reg; + C = min_reg; const double steps = 18; double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); cerr << "SWEEP FACTOR: " << sweep_factor << endl; - while(sigsq < max_reg) { - tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); - sp.push_back(make_pair(sigsq, tppl)); - sigsq *= sweep_factor; + while(C < max_reg) { + cerr << "C=" << C << "\tT=" <(), prev_x, &x); + sp.push_back(make_pair(C, tppl)); + C *= sweep_factor; } smoothed.resize(sp.size(), 0); smoothed[0] = sp[0].second; @@ -257,16 +269,16 @@ int main(int argc, char** argv) { best_i = i; } } - sigsq = sp[best_i].first; + C = sp[best_i].first; } // tune regularizer - tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); + tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as(), prev_x, &x); if (conf.count("weights")) { for (int 
i = 1; i < x.size(); ++i) { x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi); } } cout.precision(15); - cout << "# sigma^2=" << sigsq << "\theld out perplexity="; + cout << "# C=" << C << "\theld out perplexity="; if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } if (sp.size()) { cout << "# Parameter sweep:\n"; -- cgit v1.2.3