| author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-05-08 19:45:10 -0400 |
| --- | --- | --- |
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-05-08 19:45:10 -0400 |
| commit | 33d4601da5e2e715260619a38f5899645d157952 (patch) | |
| tree | f5828cf0906e907c88dd9c3ed7643e9983f62a56 /pro-train | |
| parent | c168daa1e801a8a0be4d0c16311ae30b06a43b82 (diff) | |
switch to liblbfgs for pro
Diffstat (limited to 'pro-train')
| mode | file | lines changed |
| --- | --- | --- |
| -rw-r--r-- | pro-train/Makefile.am | 2 |
| -rw-r--r-- | pro-train/mr_pro_reduce.cc | 89 |
2 files changed, 41 insertions, 50 deletions
```diff
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index fdaf43e2..11d26211 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@
 mr_pro_map_SOURCES = mr_pro_map.cc
 mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 6362ce47..d3fb8026 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -11,6 +11,7 @@
 #include "weights.h"
 #include "sparse_vector.h"
 #include "optimize.h"
+#include "liblbfgs/lbfgs++.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -89,10 +90,10 @@ void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corp
   if (flag) cerr << endl;
 }
 
-void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_t>* acc) {
+void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
   for (SparseVector<weight_t>::const_iterator it = v.begin(); it != v.end(); ++it) {
-    (*acc)[it->first] += it->second * scale;
+    acc[it->first] += it->second * scale;
   }
 }
 
@@ -100,26 +101,24 @@ double ApplyRegularizationTerms(const double C,
                                 const double T,
                                 const vector<weight_t>& weights,
                                 const vector<weight_t>& prev_weights,
-                                vector<weight_t>* g) {
-  assert(weights.size() == g->size());
+                                weight_t* g) {
   double reg = 0;
   for (size_t i = 0; i < weights.size(); ++i) {
     const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
     const double& w_i = weights[i];
-    double& g_i = (*g)[i];
     reg += C * w_i * w_i;
-    g_i += 2 * C * w_i;
+    g[i] += 2 * C * w_i;
     const double diff_i = w_i - prev_w_i;
     reg += T * diff_i * diff_i;
-    g_i += 2 * T * diff_i;
+    g[i] += 2 * T * diff_i;
   }
   return reg;
 }
 
 double TrainingInference(const vector<weight_t>& x,
                          const vector<pair<bool, SparseVector<weight_t> > >& corpus,
-                         vector<weight_t>* g = NULL) {
+                         weight_t* g = NULL) {
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
@@ -139,20 +138,44 @@ double TrainingInference(const vector<weight_t>& x,
       if (g) {
         // g -= corpus[i].second * exp(lp_false);
         GradAdd(corpus[i].second, -exp(lp_false), g);
-        (*g)[0] -= exp(lp_false); // bias
+        g[0] -= exp(lp_false); // bias
       }
     } else { // false label
       cll -= lp_false;
       if (g) {
         // g += corpus[i].second * exp(lp_true);
         GradAdd(corpus[i].second, exp(lp_true), g);
-        (*g)[0] += exp(lp_true); // bias
+        g[0] += exp(lp_true); // bias
       }
     }
   }
   return cll;
 }
 
+struct ProLoss {
+  ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
+          const vector<pair<bool, SparseVector<weight_t> > >& te,
+          const double c,
+          const double t,
+          const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
+  double operator()(const vector<double>& x, double* g) const {
+    fill(g, g + x.size(), 0.0);
+    double cll = TrainingInference(x, training, g);
+    tppl = 0;
+    if (testing.size())
+      tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size()));
+    double ppl = cll / log(2);
+    ppl /= training.size();
+    ppl = pow(2.0, ppl);
+    double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
+    return cll + reg;
+  }
+  const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
+  const double C, T;
+  const vector<double>& prev_x;
+  mutable double tppl;
+};
+
 // return held-out log likelihood
 double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
                        const vector<pair<bool, SparseVector<weight_t> > >& testing,
@@ -161,42 +184,10 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
                        const unsigned memory_buffers,
                        const vector<weight_t>& prev_x,
                        vector<weight_t>* px) {
-  vector<weight_t>& x = *px;
-  vector<weight_t> vg(FD::NumFeats(), 0.0);
-  bool converged = false;
-  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
-  double tppl = 0.0;
-  while(!converged) {
-    fill(vg.begin(), vg.end(), 0.0);
-    double cll = TrainingInference(x, training, &vg);
-    double ppl = cll / log(2);
-    ppl /= training.size();
-    ppl = pow(2.0, ppl);
-
-    // evaluate optional held-out test set
-    if (testing.size()) {
-      tppl = TrainingInference(x, testing) / log(2);
-      tppl /= testing.size();
-      tppl = pow(2.0, tppl);
-    }
-
-    // handle regularizer
-    double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
-    cll += reg;
-    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
-    try {
-      opt.Optimize(cll, vg, &x);
-      converged = opt.HasConverged();
-    } catch (...) {
-      cerr << "Exception caught, assuming convergence is close enough...\n";
-      converged = true;
-    }
-    if (fabs(x[0]) > MAX_BIAS) {
-      cerr << "Biased model learned. Are your training instances wrong?\n";
-      cerr << " BIAS: " << x[0] << endl;
-    }
-  }
-  return tppl;
+  ProLoss loss(training, testing, C, T, prev_x);
+  LBFGS<ProLoss> lbfgs(px, loss, 0.0, memory_buffers);
+  lbfgs.MinimizeFunction();
+  return loss.tppl;
 }
 
 int main(int argc, char** argv) {
@@ -213,9 +204,9 @@ int main(int argc, char** argv) {
   const double max_reg = conf["max_reg"].as<double>();
   double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
   const double T = conf["regularize_to_weights"].as<double>();
-  assert(C > 0.0);
-  assert(min_reg > 0.0);
-  assert(max_reg > 0.0);
+  assert(C >= 0.0);
+  assert(min_reg >= 0.0);
+  assert(max_reg >= 0.0);
   assert(max_reg > min_reg);
   const double psi = conf["interpolate_with_weights"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
```
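The substance of the change is in `LearnParameters`: the hand-rolled optimization loop around `LBFGSOptimizer` is replaced by a loss functor (`ProLoss`) handed to the `LBFGS<Function>` wrapper pulled in via `liblbfgs/lbfgs++.h`. Below is a minimal sketch of that calling pattern, mirroring only what this diff shows (a functor with `double operator()(const vector<double>& x, double* g) const`, the `LBFGS<F>(px, f, 0.0, memory_buffers)` construction, and `MinimizeFunction()`). The toy quadratic objective and its names are hypothetical illustrations, and the meaning of the third constructor argument (passed as `0.0` here, as in the commit) is not documented by this diff.

```cpp
#include <vector>
#include "liblbfgs/lbfgs++.h"  // LBFGS<Function> wrapper, as included by mr_pro_reduce.cc

using namespace std;

// Hypothetical toy objective illustrating the functor contract ProLoss follows:
// operator() must fill g with the gradient at x and return the objective value.
struct ToyQuadraticLoss {
  // f(x) = sum_i (x_i - 1)^2, so g_i = 2 * (x_i - 1)
  double operator()(const vector<double>& x, double* g) const {
    double obj = 0;
    for (size_t i = 0; i < x.size(); ++i) {
      const double d = x[i] - 1.0;
      obj += d * d;
      g[i] = 2.0 * d;
    }
    return obj;
  }
};

void Example() {
  vector<double> x(10, 0.0);   // parameters, updated in place by the optimizer
  ToyQuadraticLoss loss;
  // Constructor arguments mirror the commit's usage:
  // (parameter vector*, loss functor, 0.0, number of L-BFGS memory buffers).
  LBFGS<ToyQuadraticLoss> lbfgs(&x, loss, 0.0, 10);
  lbfgs.MinimizeFunction();    // run L-BFGS to convergence
  // x now holds the minimizer (all components near 1.0 for this toy objective).
}
```

Compared with the removed loop, convergence checking, gradient buffer management, and the per-iteration progress reporting now live inside (or alongside) the wrapper rather than in `LearnParameters` itself; the held-out perplexity is carried out through the `mutable double tppl` member of `ProLoss`.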