From c3828b0a2deb42de5c7378e93f93f5e69efb304c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 16 Jul 2011 19:13:21 -0400 Subject: tune regularizer --- pro-train/mr_pro_reduce.cc | 128 +++++++++++++++++++++++++++++++++------------ 1 file changed, 96 insertions(+), 32 deletions(-) (limited to 'pro-train/mr_pro_reduce.cc') diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 491ceb3a..9b422f33 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -16,7 +16,7 @@ using namespace std; namespace po = boost::program_options; // since this is a ranking model, there should be equal numbers of -// positive and negative examples so the bias should be 0 +// positive and negative examples, so the bias should be 0 static const double MAX_BIAS = 1e-10; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -25,8 +25,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") ("interpolation,p",po::value()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") ("memory_buffers,m",po::value()->default_value(200), "Number of memory buffers (LBFGS)") - ("sigma_squared,s",po::value()->default_value(1.0), "Sigma squared for Gaussian prior") - ("testset,t",po::value(), "Optional held-out test set to tune regularizer") + ("sigma_squared,s",po::value()->default_value(0.1), "Sigma squared for Gaussian prior") + ("min_reg,r",po::value()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght") + ("max_reg,R",po::value()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght") + ("testset,t",po::value(), "Optional held-out test set") + ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); @@ -95,8 +98,6 @@ void GradAdd(const SparseVector& v, const double scale, vector* double TrainingInference(const vector& x, const vector > >& corpus, vector* g = NULL) { - if (g) fill(g->begin(), g->end(), 0.0); - double cll = 0; for (int i = 0; i < corpus.size(); ++i) { const double dotprod = corpus[i].second.dot(x) + x[0]; // x[0] is bias @@ -130,39 +131,23 @@ double TrainingInference(const vector& x, return cll; } -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - string line; - vector > > training, testing; - SparseVector old_weights; - const double psi = conf["interpolation"].as(); - if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } - if (conf.count("weights")) { - Weights w; - w.InitFromFile(conf["weights"].as()); - w.InitSparseVector(&old_weights); - } - ReadCorpus(&cin, &training); - if (conf.count("testset")) { - ReadFile rf(conf["testset"].as()); - ReadCorpus(rf.stream(), &testing); - } - - cerr << "Number of features: " << FD::NumFeats() << endl; - vector x(FD::NumFeats(), 0.0); // x[0] is bias - for (SparseVector::const_iterator it = old_weights.begin(); - it != old_weights.end(); ++it) - x[it->first] = it->second; +// return held-out log likelihood +double LearnParameters(const vector > >& training, + const vector > >& testing, + const double sigsq, + const unsigned memory_buffers, + vector* px) { + vector& x = *px; vector vg(FD::NumFeats(), 0.0); bool converged = false; - LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as()); + LBFGSOptimizer opt(FD::NumFeats(), memory_buffers); + double tppl = 0.0; while(!converged) { + fill(vg.begin(), vg.end(), 0.0); double cll = TrainingInference(x, training, &vg); double ppl = cll / log(2); ppl /= training.size(); ppl = pow(2.0, ppl); - double tppl = 0.0; // evaluate optional held-out test set if (testing.size()) { @@ -173,7 +158,6 @@ int main(int argc, char** argv) { // handle regularizer #if 1 - const double sigsq = conf["sigma_squared"].as(); double norm = 0; for (int i = 1; i < x.size(); ++i) { const double mean_i = 0.0; @@ -202,11 +186,91 @@ int main(int argc, char** argv) { cerr << " BIAS: " << x[0] << endl; } } + return tppl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector > > training, testing; + SparseVector old_weights; + const bool tune_regularizer = conf.count("tune_regularizer"); + if (tune_regularizer && !conf.count("testset")) { + cerr << "--tune_regularizer requires --testset to be set\n"; + return 1; + } + const double min_reg = conf["min_reg"].as(); + const double max_reg = conf["max_reg"].as(); + double sigsq = conf["sigma_squared"].as(); + assert(sigsq > 0.0); + assert(min_reg > 0.0); + assert(max_reg > 0.0); + assert(max_reg > min_reg); + const double psi = conf["interpolation"].as(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } + if (conf.count("weights")) { + Weights w; + w.InitFromFile(conf["weights"].as()); + w.InitSparseVector(&old_weights); + } + ReadCorpus(&cin, &training); + if (conf.count("testset")) { + ReadFile rf(conf["testset"].as()); + ReadCorpus(rf.stream(), &testing); + } + cerr << "Number of features: " << FD::NumFeats() << endl; + vector x(FD::NumFeats(), 0.0); // x[0] is bias + for (SparseVector::const_iterator it = old_weights.begin(); + it != old_weights.end(); ++it) + x[it->first] = it->second; + double tppl = 0.0; + vector > sp; + vector smoothed; + if (tune_regularizer) { + sigsq = min_reg; + const double steps = 18; + double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); + cerr << "SWEEP FACTOR: " << sweep_factor << endl; + while(sigsq < max_reg) { + tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); + sp.push_back(make_pair(sigsq, tppl)); + sigsq *= sweep_factor; + } + smoothed.resize(sp.size(), 0); + smoothed[0] = sp[0].second; + smoothed.back() = sp.back().second; + for (int i = 1; i < sp.size()-1; ++i) { + double prev = sp[i-1].second; + double next = sp[i+1].second; + double cur = sp[i].second; + smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); + } + double best_ppl = 9999999; + unsigned best_i = 0; + for (unsigned i = 0; i < sp.size(); ++i) { + if (smoothed[i] < best_ppl) { + best_ppl = smoothed[i]; + best_i = i; + } + } + sigsq = sp[best_i].first; + tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); + } Weights w; if (conf.count("weights")) { for (int i = 1; i < x.size(); ++i) x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi); } + cout.precision(15); + cout << "# sigma^2=" << sigsq << "\theld out perplexity="; + if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } + if (sp.size()) { + cout << "# Parameter sweep:\n"; + for (int i = 0; i < sp.size(); ++i) { + cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; + } + } w.InitFromVector(x); w.WriteToFile("-"); return 0; -- cgit v1.2.3