author     Chris Dyer <cdyer@cs.cmu.edu>    2012-05-08 19:45:10 -0400
committer  Chris Dyer <cdyer@cs.cmu.edu>    2012-05-08 19:45:10 -0400
commit     33d4601da5e2e715260619a38f5899645d157952 (patch)
tree       f5828cf0906e907c88dd9c3ed7643e9983f62a56
parent     c168daa1e801a8a0be4d0c16311ae30b06a43b82 (diff)
switch to liblbfgs for pro
-rw-r--r--  pro-train/Makefile.am         |  2
-rw-r--r--  pro-train/mr_pro_reduce.cc    | 89
-rw-r--r--  training/liblbfgs/lbfgs++.h   | 36
-rw-r--r--  training/liblbfgs/ll_test.cc  |  9
4 files changed, 67 insertions, 69 deletions
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index fdaf43e2..11d26211 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc
 mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 6362ce47..d3fb8026 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -11,6 +11,7 @@
 #include "weights.h"
 #include "sparse_vector.h"
 #include "optimize.h"
+#include "liblbfgs/lbfgs++.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -89,10 +90,10 @@ void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corp
   if (flag) cerr << endl;
 }
 
-void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_t>* acc) {
+void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
   for (SparseVector<weight_t>::const_iterator it = v.begin();
        it != v.end(); ++it) {
-    (*acc)[it->first] += it->second * scale;
+    acc[it->first] += it->second * scale;
   }
 }
 
@@ -100,26 +101,24 @@ double ApplyRegularizationTerms(const double C,
                                 const double T,
                                 const vector<weight_t>& weights,
                                 const vector<weight_t>& prev_weights,
-                                vector<weight_t>* g) {
-  assert(weights.size() == g->size());
+                                weight_t* g) {
   double reg = 0;
 
   for (size_t i = 0; i < weights.size(); ++i) {
     const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
     const double& w_i = weights[i];
-    double& g_i = (*g)[i];
     reg += C * w_i * w_i;
-    g_i += 2 * C * w_i;
+    g[i] += 2 * C * w_i;
     const double diff_i = w_i - prev_w_i;
     reg += T * diff_i * diff_i;
-    g_i += 2 * T * diff_i;
+    g[i] += 2 * T * diff_i;
   }
   return reg;
 }
 
 double TrainingInference(const vector<weight_t>& x,
                          const vector<pair<bool, SparseVector<weight_t> > >& corpus,
-                         vector<weight_t>* g = NULL) {
+                         weight_t* g = NULL) {
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
@@ -139,20 +138,44 @@ double TrainingInference(const vector<weight_t>& x,
       if (g) {
         // g -= corpus[i].second * exp(lp_false);
         GradAdd(corpus[i].second, -exp(lp_false), g);
-        (*g)[0] -= exp(lp_false); // bias
+        g[0] -= exp(lp_false); // bias
       }
     } else {                  // false label
       cll -= lp_false;
       if (g) {
         // g += corpus[i].second * exp(lp_true);
         GradAdd(corpus[i].second, exp(lp_true), g);
-        (*g)[0] += exp(lp_true); // bias
+        g[0] += exp(lp_true); // bias
       }
     }
   }
   return cll;
 }
 
+struct ProLoss {
+  ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
+          const vector<pair<bool, SparseVector<weight_t> > >& te,
+          const double c,
+          const double t,
+          const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
+  double operator()(const vector<double>& x, double* g) const {
+    fill(g, g + x.size(), 0.0);
+    double cll = TrainingInference(x, training, g);
+    tppl = 0;
+    if (testing.size())
+      tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size()));
+    double ppl = cll / log(2);
+    ppl /= training.size();
+    ppl = pow(2.0, ppl);
+    double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
+    return cll + reg;
+  }
+  const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
+  const double C, T;
+  const vector<double>& prev_x;
+  mutable double tppl;
+};
+
 // return held-out log likelihood
 double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
                        const vector<pair<bool, SparseVector<weight_t> > >& testing,
@@ -161,42 +184,10 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
                        const unsigned memory_buffers,
                        const vector<weight_t>& prev_x,
                        vector<weight_t>* px) {
-  vector<weight_t>& x = *px;
-  vector<weight_t> vg(FD::NumFeats(), 0.0);
-  bool converged = false;
-  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
-  double tppl = 0.0;
-  while(!converged) {
-    fill(vg.begin(), vg.end(), 0.0);
-    double cll = TrainingInference(x, training, &vg);
-    double ppl = cll / log(2);
-    ppl /= training.size();
-    ppl = pow(2.0, ppl);
-
-    // evaluate optional held-out test set
-    if (testing.size()) {
-      tppl = TrainingInference(x, testing) / log(2);
-      tppl /= testing.size();
-      tppl = pow(2.0, tppl);
-    }
-
-    // handle regularizer
-    double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
-    cll += reg;
-    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
-    try {
-      opt.Optimize(cll, vg, &x);
-      converged = opt.HasConverged();
-    } catch (...) {
-      cerr << "Exception caught, assuming convergence is close enough...\n";
-      converged = true;
-    }
-    if (fabs(x[0]) > MAX_BIAS) {
-      cerr << "Biased model learned. Are your training instances wrong?\n";
-      cerr << "  BIAS: " << x[0] << endl;
-    }
-  }
-  return tppl;
+  ProLoss loss(training, testing, C, T, prev_x);
+  LBFGS<ProLoss> lbfgs(px, loss, 0.0, memory_buffers);
+  lbfgs.MinimizeFunction();
+  return loss.tppl;
 }
 
 int main(int argc, char** argv) {
@@ -213,9 +204,9 @@ int main(int argc, char** argv) {
   const double max_reg = conf["max_reg"].as<double>();
   double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
   const double T = conf["regularize_to_weights"].as<double>();
-  assert(C > 0.0);
-  assert(min_reg > 0.0);
-  assert(max_reg > 0.0);
+  assert(C >= 0.0);
+  assert(min_reg >= 0.0);
+  assert(max_reg >= 0.0);
   assert(max_reg > min_reg);
   const double psi = conf["interpolate_with_weights"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
diff --git a/training/liblbfgs/lbfgs++.h b/training/liblbfgs/lbfgs++.h
index 119511e5..6c4d1807 100644
--- a/training/liblbfgs/lbfgs++.h
+++ b/training/liblbfgs/lbfgs++.h
@@ -9,9 +9,10 @@
 #define __LBFGSPP_H__
 
 #include <vector>
+#include <cassert>
 #include "liblbfgs/lbfgs.h"
 
-// Function must be lbfgsfloatval_t f(const double* x_start, double* g_start)
+// Function must be double f(const vector<double>& x_start, double* g_start)
 template <typename Function>
 class LBFGS {
  public:
@@ -46,11 +47,14 @@ class LBFGS {
   lbfgsfloatval_t& operator[](size_t i) { return m_x[i]; }
   size_t size() const { return m_x.size(); }
 
-  int Optimize() {
+  int MinimizeFunction(bool s = false) {
+    silence = s;
     lbfgsfloatval_t fx;
     int ret = lbfgs(m_x.size(), &m_x[0], &fx, _evaluate, _progress, this, &param);
-    std::cerr << "L-BFGS optimization terminated with status code = " << ret << std::endl;
-    std::cerr << "  fx = " << fx << std::endl;
+    if (!silence) {
+      std::cerr << "L-BFGS optimization terminated with status code = " << ret << std::endl;
+      std::cerr << "  fx = " << fx << std::endl;
+    }
     return ret;
   }
 
@@ -62,6 +66,7 @@ class LBFGS {
       param.linesearch = LBFGS_LINESEARCH_BACKTRACKING;
       param.orthantwise_c = 1.0;
     }
+    silence = false;
   }
 
   static lbfgsfloatval_t _evaluate(
@@ -79,7 +84,8 @@ class LBFGS {
      const lbfgsfloatval_t step) {
     (void) n;
     (void) step;
-    return func(x, g);
+    assert(x == &m_x[0]);  // sanity check, ensures pass m_x is okay
+    return func(m_x, g);
   }
 
   static int _progress(
@@ -109,21 +115,23 @@ class LBFGS {
      int n,
      int k,
      int ls
-     )
-  {
-      (void) n;
-      (void) k;
-      std::cerr << "Iteration " << k << ':' << std::endl;
-      std::cerr << "  fx = " << fx << ", x[0] = " << x[0] << ", x[1] = " << x[1] << std::endl;
-      std::cerr << "  xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << std::endl << std::endl;
-      return 0;
+     ) {
+    (void) x;
+    (void) g;
+    (void) n;
+    (void) ls;
+    if (!silence) {
+      std::cerr << "Iteration " << k << ':' << "\tfx = " << fx << "\t"
+                << "  xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << std::endl;
    }
+    return 0;
+  }
 
  std::vector<lbfgsfloatval_t>* p_x;
  const bool owned;
  std::vector<lbfgsfloatval_t>& m_x;
  const Function& func;
  lbfgs_parameter_t param;
-
+  bool silence;
 };
 
 #endif
diff --git a/training/liblbfgs/ll_test.cc b/training/liblbfgs/ll_test.cc
index 058db716..43c0f214 100644
--- a/training/liblbfgs/ll_test.cc
+++ b/training/liblbfgs/ll_test.cc
@@ -4,12 +4,11 @@
 using namespace std;
 
 // Function must be lbfgsfloatval_t f(x.begin, x.end, g.begin)
-lbfgsfloatval_t func(const lbfgsfloatval_t* x, lbfgsfloatval_t* g) {
+lbfgsfloatval_t func(const vector<lbfgsfloatval_t>& x, lbfgsfloatval_t* g) {
  int i;
  lbfgsfloatval_t fx = 0.0;
 
-  int n = 4;
-  for (i = 0;i < n;i += 2) {
+  for (i = 0;i < x.size();i += 2) {
    lbfgsfloatval_t t1 = 1.0 - x[i];
    lbfgsfloatval_t t2 = 10.0 * (x[i+1] - x[i] * x[i]);
    g[i+1] = 20.0 * t2;
@@ -21,8 +20,8 @@ lbfgsfloatval_t func(const lbfgsfloatval_t* x, lbfgsfloatval_t* g) {
 
 template<typename F>
 void Opt(F& f) {
-  LBFGS<F> lbfgs(4, f, 1.0);
-  lbfgs.Optimize();
+  LBFGS<F> lbfgs(4, f);
+  lbfgs.MinimizeFunction();
 }
 
 int main(int argc, char** argv) {
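
Note: after this commit the PRO reducer drives the optimizer through the templated LBFGS<Function> wrapper instead of the hand-rolled LBFGSOptimizer loop. Below is a minimal usage sketch of that wrapper, modeled on the ll_test.cc and ProLoss code in the diff above. The functor signature, the LBFGS<F> lbfgs(4, f) constructor form, MinimizeFunction(), and operator[] all appear in the diff; the QuadLoss objective is purely illustrative, and, like ProLoss, it assumes lbfgsfloatval_t is double.

    #include <cstddef>
    #include <vector>
    #include "liblbfgs/lbfgs++.h"

    // Illustrative objective: f(x) = sum_i (x_i - 1)^2, with the gradient
    // written into g. Any functor or free function of the form
    //   double f(const std::vector<double>& x, double* g)
    // can be plugged into LBFGS<Function>.
    struct QuadLoss {
      double operator()(const std::vector<double>& x, double* g) const {
        double fx = 0.0;
        for (size_t i = 0; i < x.size(); ++i) {
          const double d = x[i] - 1.0;
          g[i] = 2.0 * d;   // gradient component
          fx += d * d;      // objective value
        }
        return fx;
      }
    };

    int main() {
      QuadLoss loss;
      LBFGS<QuadLoss> lbfgs(4, loss);  // 4-dimensional problem, as in ll_test.cc
      lbfgs.MinimizeFunction();        // replaces the old Optimize() call
      // The solution can be read back through lbfgs[i], as exposed in lbfgs++.h.
      return 0;
    }

In mr_pro_reduce.cc the same pattern is used with the other constructor shown in the diff, LBFGS<ProLoss> lbfgs(px, loss, 0.0, memory_buffers), which optimizes in place over an existing weight vector.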