From 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 18 Nov 2012 13:35:42 -0500 Subject: major restructure of the training code --- training/pro/mr_pro_reduce.cc | 286 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 training/pro/mr_pro_reduce.cc (limited to 'training/pro/mr_pro_reduce.cc') diff --git a/training/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc new file mode 100644 index 00000000..5ef9b470 --- /dev/null +++ b/training/pro/mr_pro_reduce.cc @@ -0,0 +1,286 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "optimize.h" +#include "liblbfgs/lbfgs++.h" + +using namespace std; +namespace po = boost::program_options; + +// since this is a ranking model, there should be equal numbers of +// positive and negative examples, so the bias should be 0 +static const double MAX_BIAS = 1e-10; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") + ("regularization_strength,C",po::value()->default_value(500.0), "l2 regularization strength") + ("l1",po::value()->default_value(0.0), "l1 regularization strength") + ("regularize_to_weights,y",po::value()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect") + ("memory_buffers,m",po::value()->default_value(100), "Number of memory buffers (LBFGS)") + ("min_reg,r",po::value()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght") + ("max_reg,R",po::value()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght") + ("testset,t",po::value(), "Optional held-out test set") + ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") + ("interpolate_with_weights,p",po::value()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector* out) { + SparseVector& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const weight_t val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + +void ReadCorpus(istream* pin, vector > >* corpus) { + istream& in = *pin; + corpus->clear(); + bool flag = false; + int lc = 0; + string line; + SparseVector x; + while(getline(in, line)) { + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + if (line.empty()) continue; + const size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks == 1); + const bool y = line[0] == '1'; + x.clear(); + ParseSparseVector(line, ks + 1, &x); + corpus->push_back(make_pair(y, x)); + } + if (flag) cerr << endl; +} + +void GradAdd(const SparseVector& v, const double scale, weight_t* acc) { + for (SparseVector::const_iterator it = v.begin(); + it != v.end(); ++it) { + acc[it->first] += it->second * scale; + } +} + +double ApplyRegularizationTerms(const double C, + const double T, + const vector& weights, + const vector& prev_weights, + weight_t* g) { + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { + const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0); + const double& w_i = weights[i]; + reg += C * w_i * w_i; + g[i] += 2 * C * w_i; + + const double diff_i = w_i - prev_w_i; + reg += T * diff_i * diff_i; + g[i] += 2 * T * diff_i; + } + return reg; +} + +double TrainingInference(const vector& x, + const vector > >& corpus, + weight_t* g = NULL) { + double cll = 0; + for (int i = 0; i < corpus.size(); ++i) { + const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias + double lp_false = dotprod; + double lp_true = -dotprod; + if (0 < lp_true) { + lp_true += log1p(exp(-lp_true)); + lp_false = log1p(exp(lp_false)); + } else { + lp_true = log1p(exp(lp_true)); + lp_false += log1p(exp(-lp_false)); + } + lp_true*=-1; + lp_false*=-1; + if (corpus[i].first) { // true label + cll -= lp_true; + if (g) { + // g -= corpus[i].second * exp(lp_false); + GradAdd(corpus[i].second, -exp(lp_false), g); + g[0] -= exp(lp_false); // bias + } + } else { // false label + cll -= lp_false; + if (g) { + // g += corpus[i].second * exp(lp_true); + GradAdd(corpus[i].second, exp(lp_true), g); + g[0] += exp(lp_true); // bias + } + } + } + return cll; +} + +struct ProLoss { + ProLoss(const vector > >& tr, + const vector > >& te, + const double c, + const double t, + const vector& px) : training(tr), testing(te), C(c), T(t), prev_x(px){} + double operator()(const vector& x, double* g) const { + fill(g, g + x.size(), 0.0); + double cll = TrainingInference(x, training, g); + tppl = 0; + if (testing.size()) + tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size())); + double ppl = cll / log(2); + ppl /= training.size(); + ppl = pow(2.0, ppl); + double reg = ApplyRegularizationTerms(C, T, x, prev_x, g); + return cll + reg; + } + const vector > >& training, testing; + const double C, T; + const vector& prev_x; + mutable double tppl; +}; + +// return held-out log likelihood +double LearnParameters(const vector > >& training, + const vector > >& testing, + const double C, + const double C1, + const double T, + const unsigned memory_buffers, + const vector& prev_x, + vector* px) { + assert(px->size() == prev_x.size()); + ProLoss loss(training, testing, C, T, prev_x); + LBFGS lbfgs(px, loss, memory_buffers, C1); + lbfgs.MinimizeFunction(); + return loss.tppl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector > > training, testing; + const bool tune_regularizer = conf.count("tune_regularizer"); + if (tune_regularizer && !conf.count("testset")) { + cerr << "--tune_regularizer requires --testset to be set\n"; + return 1; + } + const double min_reg = conf["min_reg"].as(); + const double max_reg = conf["max_reg"].as(); + double C = conf["regularization_strength"].as(); // will be overridden if parameter is tuned + double C1 = conf["l1"].as(); // will be overridden if parameter is tuned + const double T = conf["regularize_to_weights"].as(); + assert(C >= 0.0); + assert(min_reg >= 0.0); + assert(max_reg >= 0.0); + assert(max_reg > min_reg); + const double psi = conf["interpolate_with_weights"].as(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; } + ReadCorpus(&cin, &training); + if (conf.count("testset")) { + ReadFile rf(conf["testset"].as()); + ReadCorpus(rf.stream(), &testing); + } + cerr << "Number of features: " << FD::NumFeats() << endl; + + vector x, prev_x; // x[0] is bias + if (conf.count("weights")) { + Weights::InitFromFile(conf["weights"].as(), &x); + x.resize(FD::NumFeats()); + prev_x = x; + } else { + x.resize(FD::NumFeats()); + prev_x = x; + } + cerr << " Number of features: " << x.size() << endl; + cerr << "Number of training examples: " << training.size() << endl; + cerr << "Number of testing examples: " << testing.size() << endl; + double tppl = 0.0; + vector > sp; + vector smoothed; + if (tune_regularizer) { + C = min_reg; + const double steps = 18; + double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); + cerr << "SWEEP FACTOR: " << sweep_factor << endl; + while(C < max_reg) { + cerr << "C=" << C << "\tT=" <(), prev_x, &x); + sp.push_back(make_pair(C, tppl)); + C *= sweep_factor; + } + smoothed.resize(sp.size(), 0); + smoothed[0] = sp[0].second; + smoothed.back() = sp.back().second; + for (int i = 1; i < sp.size()-1; ++i) { + double prev = sp[i-1].second; + double next = sp[i+1].second; + double cur = sp[i].second; + smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); + } + double best_ppl = 9999999; + unsigned best_i = 0; + for (unsigned i = 0; i < sp.size(); ++i) { + if (smoothed[i] < best_ppl) { + best_ppl = smoothed[i]; + best_i = i; + } + } + C = sp[best_i].first; + } // tune regularizer + tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as(), prev_x, &x); + if (conf.count("weights")) { + for (int i = 1; i < x.size(); ++i) { + x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi); + } + } + cout.precision(15); + cout << "# C=" << C << "\theld out perplexity="; + if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } + if (sp.size()) { + cout << "# Parameter sweep:\n"; + for (int i = 0; i < sp.size(); ++i) { + cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; + } + } + Weights::WriteToFile("-", x); + return 0; +} -- cgit v1.2.3