| author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
|---|---|---|
| committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
| commit | 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch) | |
| tree | 33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /pro/mr_pro_reduce.cc | |
| parent | 7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff) | |
major restructure of the training code
Diffstat (limited to 'pro/mr_pro_reduce.cc')
-rw-r--r-- | pro/mr_pro_reduce.cc | 286 |
1 file changed, 0 insertions, 286 deletions
diff --git a/pro/mr_pro_reduce.cc b/pro/mr_pro_reduce.cc
deleted file mode 100644
index 5ef9b470..00000000
--- a/pro/mr_pro_reduce.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-#include <cstdlib>
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "optimize.h"
-#include "liblbfgs/lbfgs++.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// since this is a ranking model, there should be equal numbers of
-// positive and negative examples, so the bias should be 0
-static const double MAX_BIAS = 1e-10;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)")
-        ("regularization_strength,C", po::value<double>()->default_value(500.0), "l2 regularization strength")
-        ("l1", po::value<double>()->default_value(0.0), "l1 regularization strength")
-        ("regularize_to_weights,y", po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
-        ("memory_buffers,m", po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)")
-        ("min_reg,r", po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strength")
-        ("max_reg,R", po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strength")
-        ("testset,t", po::value<string>(), "Optional held-out test set")
-        ("tune_regularizer,T", "Use the held-out test set (-t) to tune the regularization strength")
-        ("interpolate_with_weights,p", po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
-        ("help,h", "Help");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) {
-  SparseVector<weight_t>& x = *out;
-  size_t last_start = cur;
-  size_t last_comma = string::npos;
-  while(cur <= line.size()) {
-    if (line[cur] == ' ' || cur == line.size()) {
-      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
-        cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
-        exit(1);
-      }
-      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
-      if (cur < line.size()) line[cur] = 0;
-      const weight_t val = strtod(&line[last_comma + 1], NULL);
-      x.set_value(fid, val);
-
-      last_comma = string::npos;
-      last_start = cur+1;
-    } else {
-      if (line[cur] == '=')
-        last_comma = cur;
-    }
-    ++cur;
-  }
-}
-
-void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) {
-  istream& in = *pin;
-  corpus->clear();
-  bool flag = false;
-  int lc = 0;
-  string line;
-  SparseVector<weight_t> x;
-  while(getline(in, line)) {
-    ++lc;
-    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
-    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
-    if (line.empty()) continue;
-    const size_t ks = line.find("\t");
-    assert(string::npos != ks);
-    assert(ks == 1);
-    const bool y = line[0] == '1';
-    x.clear();
-    ParseSparseVector(line, ks + 1, &x);
-    corpus->push_back(make_pair(y, x));
-  }
-  if (flag) cerr << endl;
-}
-
-void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
-  for (SparseVector<weight_t>::const_iterator it = v.begin();
-       it != v.end(); ++it) {
-    acc[it->first] += it->second * scale;
-  }
-}
-
-double ApplyRegularizationTerms(const double C,
-                                const double T,
-                                const vector<weight_t>& weights,
-                                const vector<weight_t>& prev_weights,
-                                weight_t* g) {
-  double reg = 0;
-  for (size_t i = 0; i < weights.size(); ++i) {
-    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
-    const double& w_i = weights[i];
-    reg += C * w_i * w_i;
-    g[i] += 2 * C * w_i;
-
-    const double diff_i = w_i - prev_w_i;
-    reg += T * diff_i * diff_i;
-    g[i] += 2 * T * diff_i;
-  }
-  return reg;
-}
-
-double TrainingInference(const vector<weight_t>& x,
-                         const vector<pair<bool, SparseVector<weight_t> > >& corpus,
-                         weight_t* g = NULL) {
-  double cll = 0;
-  for (int i = 0; i < corpus.size(); ++i) {
-    const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
-    double lp_false = dotprod;
-    double lp_true = -dotprod;
-    if (0 < lp_true) {
-      lp_true += log1p(exp(-lp_true));
-      lp_false = log1p(exp(lp_false));
-    } else {
-      lp_true = log1p(exp(lp_true));
-      lp_false += log1p(exp(-lp_false));
-    }
-    lp_true *= -1;
-    lp_false *= -1;
-    if (corpus[i].first) {  // true label
-      cll -= lp_true;
-      if (g) {
-        // g -= corpus[i].second * exp(lp_false);
-        GradAdd(corpus[i].second, -exp(lp_false), g);
-        g[0] -= exp(lp_false); // bias
-      }
-    } else {  // false label
-      cll -= lp_false;
-      if (g) {
-        // g += corpus[i].second * exp(lp_true);
-        GradAdd(corpus[i].second, exp(lp_true), g);
-        g[0] += exp(lp_true); // bias
-      }
-    }
-  }
-  return cll;
-}
-
-struct ProLoss {
-  ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
-          const vector<pair<bool, SparseVector<weight_t> > >& te,
-          const double c,
-          const double t,
-          const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
-  double operator()(const vector<double>& x, double* g) const {
-    fill(g, g + x.size(), 0.0);
-    double cll = TrainingInference(x, training, g);
-    tppl = 0;
-    if (testing.size())
-      tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size()));
-    double ppl = cll / log(2);
-    ppl /= training.size();
-    ppl = pow(2.0, ppl);
-    double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
-    return cll + reg;
-  }
-  const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
-  const double C, T;
-  const vector<double>& prev_x;
-  mutable double tppl;
-};
-
-// return held-out log likelihood
-double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
-                       const vector<pair<bool, SparseVector<weight_t> > >& testing,
-                       const double C,
-                       const double C1,
-                       const double T,
-                       const unsigned memory_buffers,
-                       const vector<weight_t>& prev_x,
-                       vector<weight_t>* px) {
-  assert(px->size() == prev_x.size());
-  ProLoss loss(training, testing, C, T, prev_x);
-  LBFGS<ProLoss> lbfgs(px, loss, memory_buffers, C1);
-  lbfgs.MinimizeFunction();
-  return loss.tppl;
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  string line;
-  vector<pair<bool, SparseVector<weight_t> > > training, testing;
-  const bool tune_regularizer = conf.count("tune_regularizer");
-  if (tune_regularizer && !conf.count("testset")) {
-    cerr << "--tune_regularizer requires --testset to be set\n";
-    return 1;
-  }
-  const double min_reg = conf["min_reg"].as<double>();
-  const double max_reg = conf["max_reg"].as<double>();
-  double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
-  double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned
-  const double T = conf["regularize_to_weights"].as<double>();
-  assert(C >= 0.0);
-  assert(min_reg >= 0.0);
-  assert(max_reg >= 0.0);
-  assert(max_reg > min_reg);
-  const double psi = conf["interpolate_with_weights"].as<double>();
-  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
-  ReadCorpus(&cin, &training);
-  if (conf.count("testset")) {
-    ReadFile rf(conf["testset"].as<string>());
-    ReadCorpus(rf.stream(), &testing);
-  }
-  cerr << "Number of features: " << FD::NumFeats() << endl;
-
-  vector<weight_t> x, prev_x;  // x[0] is bias
-  if (conf.count("weights")) {
-    Weights::InitFromFile(conf["weights"].as<string>(), &x);
-    x.resize(FD::NumFeats());
-    prev_x = x;
-  } else {
-    x.resize(FD::NumFeats());
-    prev_x = x;
-  }
-  cerr << " Number of features: " << x.size() << endl;
-  cerr << "Number of training examples: " << training.size() << endl;
-  cerr << "Number of testing examples: " << testing.size() << endl;
-  double tppl = 0.0;
-  vector<pair<double,double> > sp;
-  vector<double> smoothed;
-  if (tune_regularizer) {
-    C = min_reg;
-    const double steps = 18;
-    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
-    cerr << "SWEEP FACTOR: " << sweep_factor << endl;
-    while(C < max_reg) {
-      cerr << "C=" << C << "\tT=" << T << endl;
-      tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
-      sp.push_back(make_pair(C, tppl));
-      C *= sweep_factor;
-    }
-    smoothed.resize(sp.size(), 0);
-    smoothed[0] = sp[0].second;
-    smoothed.back() = sp.back().second;
-    for (int i = 1; i < sp.size()-1; ++i) {
-      double prev = sp[i-1].second;
-      double next = sp[i+1].second;
-      double cur = sp[i].second;
-      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
-    }
-    double best_ppl = 9999999;
-    unsigned best_i = 0;
-    for (unsigned i = 0; i < sp.size(); ++i) {
-      if (smoothed[i] < best_ppl) {
-        best_ppl = smoothed[i];
-        best_i = i;
-      }
-    }
-    C = sp[best_i].first;
-  }  // tune regularizer
-  tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
-  if (conf.count("weights")) {
-    for (int i = 1; i < x.size(); ++i) {
-      x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
-    }
-  }
-  cout.precision(15);
-  cout << "# C=" << C << "\theld out perplexity=";
-  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
-  if (sp.size()) {
-    cout << "# Parameter sweep:\n";
-    for (int i = 0; i < sp.size(); ++i) {
-      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
-    }
-  }
-  Weights::WriteToFile("-", x);
-  return 0;
-}
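
A note on the input the deleted reducer consumed: `ReadCorpus` expects one example per line, a single-character label (`'1'` marks the positive class; `assert(ks == 1)` enforces that the label is exactly one character), a tab, then space-separated `name=value` feature pairs handed to `ParseSparseVector`. The feature names in this fragment are invented for illustration:

```
1	Feature_1=0.5 Feature_2=-1.25
0	Feature_1=-2 Feature_2=3.5
```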
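The core of `TrainingInference` is a numerically stable evaluation of the binary logistic log-likelihood: for a score s = w·x + b it computes log p(y=1|x) = -log(1 + exp(-s)) and log p(y=0|x) = -log(1 + exp(s)), branching on the sign of s so that `exp` is only ever called on non-positive arguments. Below is a minimal standalone sketch of that trick; the helper name `log1p_exp` and the test scores are invented here, not taken from the original file.

```cpp
#include <cmath>
#include <cstdio>

// Returns log(1 + exp(z)) without overflow for large |z|.
double log1p_exp(double z) {
  if (z > 0) return z + log1p(exp(-z));  // exp(-z) <= 1, safe
  return log1p(exp(z));                  // exp(z) <= 1, safe
}

int main() {
  const double scores[] = {-1000.0, -2.0, 0.0, 2.0, 1000.0};
  for (double s : scores) {
    const double lp_true  = -log1p_exp(-s);  // log p(y=1|x)
    const double lp_false = -log1p_exp(s);   // log p(y=0|x)
    // The two probabilities sum to 1 even at extreme scores, where a
    // naive log(1 + exp(s)) would overflow to inf.
    printf("s=%8.1f  lp_true=%12.5f  lp_false=%12.5f  sum_p=%.6f\n",
           s, lp_true, lp_false, exp(lp_true) + exp(lp_false));
  }
  return 0;
}
```

The same quantities drive the gradient in the deleted code: for a positive example the feature vector is accumulated with weight -exp(lp_false) = -p(y=0|x) via `GradAdd`, and symmetrically with +p(y=1|x) for a negative example.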
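For the `--tune_regularizer` path, the sweep is geometric: each iteration multiplies C by sweep_factor = exp((log(max_reg) - log(min_reg)) / 18). With the defaults min_reg = 0.01 and max_reg = 1e6 this factor is exp(log(1e8)/18) ≈ 2.78, so the loop evaluates roughly 18 values of C (0.01, 0.028, 0.077, ..., up to about 3.6e5), smooths the resulting held-out perplexities with a 0.2/0.6/0.2 window, and keeps the C whose smoothed perplexity is lowest before the final training run.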