author     Chris Dyer <cdyer@cs.cmu.edu>    2012-05-08 19:45:10 -0400
committer  Chris Dyer <cdyer@cs.cmu.edu>    2012-05-08 19:45:10 -0400
commit     33d4601da5e2e715260619a38f5899645d157952 (patch)
tree       f5828cf0906e907c88dd9c3ed7643e9983f62a56
parent     c168daa1e801a8a0be4d0c16311ae30b06a43b82 (diff)
switch to liblbfgs for pro
-rw-r--r--  pro-train/Makefile.am         |  2
-rw-r--r--  pro-train/mr_pro_reduce.cc    | 89
-rw-r--r--  training/liblbfgs/lbfgs++.h   | 36
-rw-r--r--  training/liblbfgs/ll_test.cc  |  9
4 files changed, 67 insertions, 69 deletions
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index fdaf43e2..11d26211 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc
 mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 6362ce47..d3fb8026 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -11,6 +11,7 @@
 #include "weights.h"
 #include "sparse_vector.h"
 #include "optimize.h"
+#include "liblbfgs/lbfgs++.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -89,10 +90,10 @@ void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corp
   if (flag) cerr << endl;
 }
 
-void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_t>* acc) {
+void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
   for (SparseVector<weight_t>::const_iterator it = v.begin();
        it != v.end(); ++it) {
-    (*acc)[it->first] += it->second * scale;
+    acc[it->first] += it->second * scale;
   }
 }
 
@@ -100,26 +101,24 @@ double ApplyRegularizationTerms(const double C,
                                 const double T,
                                 const vector<weight_t>& weights,
                                 const vector<weight_t>& prev_weights,
-                                vector<weight_t>* g) {
-  assert(weights.size() == g->size());
+                                weight_t* g) {
   double reg = 0;
 
   for (size_t i = 0; i < weights.size(); ++i) {
     const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
     const double& w_i = weights[i];
-    double& g_i = (*g)[i];
     reg += C * w_i * w_i;
-    g_i += 2 * C * w_i;
+    g[i] += 2 * C * w_i;
     const double diff_i = w_i - prev_w_i;
     reg += T * diff_i * diff_i;
-    g_i += 2 * T * diff_i;
+    g[i] += 2 * T * diff_i;
   }
   return reg;
 }
 
 double TrainingInference(const vector<weight_t>& x,
                          const vector<pair<bool, SparseVector<weight_t> > >& corpus,
-                         vector<weight_t>* g = NULL) {
+                         weight_t* g = NULL) {
   double cll = 0;
   for (int i = 0; i < corpus.size(); ++i) {
     const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
@@ -139,20 +138,44 @@ double TrainingInference(const vector<weight_t>& x,
       if (g) {
         // g -= corpus[i].second * exp(lp_false);
         GradAdd(corpus[i].second, -exp(lp_false), g);
-        (*g)[0] -= exp(lp_false); // bias
+        g[0] -= exp(lp_false); // bias
       }
     } else {                  // false label
       cll -= lp_false;
       if (g) {
         // g += corpus[i].second * exp(lp_true);
         GradAdd(corpus[i].second, exp(lp_true), g);
-        (*g)[0] += exp(lp_true); // bias
+        g[0] += exp(lp_true); // bias
       }
     }
   }
   return cll;
 }
 
+struct ProLoss {
+  ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
+          const vector<pair<bool, SparseVector<weight_t> > >& te,
+          const double c,
+          const double t,
+          const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
+  double operator()(const vector<double>& x, double* g) const {
+    fill(g, g + x.size(), 0.0);
+    double cll = TrainingInference(x, training, g);
+    tppl = 0;
+    if (testing.size())
+      tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size()));
+    double ppl = cll / log(2);
+    ppl /= training.size();
+    ppl = pow(2.0, ppl);
+    double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
+    return cll + reg;
+  }
+  const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
+  const double C, T;
+  const vector<double>& prev_x;
+  mutable double tppl;
+};
+
 // return held-out log likelihood
 double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
                        const vector<pair<bool, SparseVector<weight_t> > >& testing,
@@ -161,42 +184,10 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
                        const unsigned memory_buffers,
                        const vector<weight_t>& prev_x,
                        vector<weight_t>* px) {
-  vector<weight_t>& x = *px;
-  vector<weight_t> vg(FD::NumFeats(), 0.0);
-  bool converged = false;
-  LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
-  double tppl = 0.0;
-  while(!converged) {
-    fill(vg.begin(), vg.end(), 0.0);
-    double cll = TrainingInference(x, training, &vg);
-    double ppl = cll / log(2);
-    ppl /= training.size();
-    ppl = pow(2.0, ppl);
-
-    // evaluate optional held-out test set
-    if (testing.size()) {
-      tppl = TrainingInference(x, testing) / log(2);
-      tppl /= testing.size();
-      tppl = pow(2.0, tppl);
-    }
-
-    // handle regularizer
-    double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
-    cll += reg;
-    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
-    try {
-      opt.Optimize(cll, vg, &x);
-      converged = opt.HasConverged();
-    } catch (...) {
-      cerr << "Exception caught, assuming convergence is close enough...\n";
-      converged = true;
-    }
-    if (fabs(x[0]) > MAX_BIAS) {
-      cerr << "Biased model learned. Are your training instances wrong?\n";
-      cerr << "  BIAS: " << x[0] << endl;
-    }
-  }
-  return tppl;
+  ProLoss loss(training, testing, C, T, prev_x);
+  LBFGS<ProLoss> lbfgs(px, loss, 0.0, memory_buffers);
+  lbfgs.MinimizeFunction();
+  return loss.tppl;
 }
 
 int main(int argc, char** argv) {
@@ -213,9 +204,9 @@ int main(int argc, char** argv) {
   const double max_reg = conf["max_reg"].as<double>();
   double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
   const double T = conf["regularize_to_weights"].as<double>();
-  assert(C > 0.0);
-  assert(min_reg > 0.0);
-  assert(max_reg > 0.0);
+  assert(C >= 0.0);
+  assert(min_reg >= 0.0);
+  assert(max_reg >= 0.0);
   assert(max_reg > min_reg);
   const double psi = conf["interpolate_with_weights"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
diff --git a/training/liblbfgs/lbfgs++.h b/training/liblbfgs/lbfgs++.h
index 119511e5..6c4d1807 100644
--- a/training/liblbfgs/lbfgs++.h
+++ b/training/liblbfgs/lbfgs++.h
@@ -9,9 +9,10 @@
 #define __LBFGSPP_H__
 
 #include <vector>
+#include <cassert>
 #include "liblbfgs/lbfgs.h"
 
-// Function must be lbfgsfloatval_t f(const double* x_start, double* g_start)
+// Function must be double f(const vector<double>& x_start, double* g_start)
 template <typename Function>
 class LBFGS {
  public:
@@ -46,11 +47,14 @@ class LBFGS {
   lbfgsfloatval_t& operator[](size_t i) { return m_x[i]; }
   size_t size() const { return m_x.size(); }
 
-  int Optimize() {
+  int MinimizeFunction(bool s = false) {
+    silence = s;
     lbfgsfloatval_t fx;
     int ret = lbfgs(m_x.size(), &m_x[0], &fx, _evaluate, _progress, this, &param);
-    std::cerr << "L-BFGS optimization terminated with status code = " << ret << std::endl;
-    std::cerr << "  fx = " << fx << std::endl;
+    if (!silence) {
+      std::cerr << "L-BFGS optimization terminated with status code = " << ret << std::endl;
+      std::cerr << "  fx = " << fx << std::endl;
+    }
     return ret;
   }
 
@@ -62,6 +66,7 @@ class LBFGS {
       param.linesearch = LBFGS_LINESEARCH_BACKTRACKING;
       param.orthantwise_c = 1.0;
     }
+    silence = false;
   }
 
   static lbfgsfloatval_t _evaluate(
@@ -79,7 +84,8 @@ class LBFGS {
      const lbfgsfloatval_t step) {
     (void) n;
     (void) step;
-    return func(x, g);
+    assert(x == &m_x[0]);  // sanity check, ensures pass m_x is okay
+    return func(m_x, g);
   }
 
   static int _progress(
@@ -109,21 +115,23 @@ class LBFGS {
      int n,
      int k,
      int ls
-     )
-  {
-      (void) n;
-      (void) k;
-      std::cerr << "Iteration " << k << ':' << std::endl;
-      std::cerr << "  fx = " << fx << ", x[0] = " << x[0] << ", x[1] = " << x[1] << std::endl;
-      std::cerr << "  xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << std::endl << std::endl;
-      return 0;
+     ) {
+    (void) x;
+    (void) g;
+    (void) n;
+    (void) ls;
+    if (!silence) {
+      std::cerr << "Iteration " << k << ':' << "\tfx = " << fx << "\t"
+                << "  xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << std::endl;
    }
+    return 0;
+  }
 
  std::vector<lbfgsfloatval_t>* p_x;
  const bool owned;
  std::vector<lbfgsfloatval_t>& m_x;
  const Function& func;
  lbfgs_parameter_t param;
-
+  bool silence;
 };
 
 #endif
diff --git a/training/liblbfgs/ll_test.cc b/training/liblbfgs/ll_test.cc
index 058db716..43c0f214 100644
--- a/training/liblbfgs/ll_test.cc
+++ b/training/liblbfgs/ll_test.cc
@@ -4,12 +4,11 @@
 using namespace std;
 
 // Function must be lbfgsfloatval_t f(x.begin, x.end, g.begin)
-lbfgsfloatval_t func(const lbfgsfloatval_t* x, lbfgsfloatval_t* g) {
+lbfgsfloatval_t func(const vector<lbfgsfloatval_t>& x, lbfgsfloatval_t* g) {
  int i;
  lbfgsfloatval_t fx = 0.0;
 
-  int n = 4;
-  for (i = 0;i < n;i += 2) {
+  for (i = 0;i < x.size();i += 2) {
    lbfgsfloatval_t t1 = 1.0 - x[i];
    lbfgsfloatval_t t2 = 10.0 * (x[i+1] - x[i] * x[i]);
    g[i+1] = 20.0 * t2;
@@ -21,8 +20,8 @@ lbfgsfloatval_t func(const lbfgsfloatval_t* x, lbfgsfloatval_t* g) {
 
 template<typename F>
 void Opt(F& f) {
-  LBFGS<F> lbfgs(4, f, 1.0);
-  lbfgs.Optimize();
+  LBFGS<F> lbfgs(4, f);
+  lbfgs.MinimizeFunction();
 }
 
 int main(int argc, char** argv) {
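
Note: after this commit the PRO reducer drives the optimizer through the templated LBFGS<Function> wrapper instead of the hand-rolled LBFGSOptimizer loop. Below is a minimal usage sketch of that wrapper, modeled on the ll_test.cc and ProLoss code in the diff above. The functor signature, the LBFGS<F> lbfgs(4, f) constructor form, MinimizeFunction(), and operator[] all appear in the diff; the QuadLoss objective is purely illustrative, and, like ProLoss, it assumes lbfgsfloatval_t is double.

    #include <cstddef>
    #include <vector>
    #include "liblbfgs/lbfgs++.h"

    // Illustrative objective: f(x) = sum_i (x_i - 1)^2, with the gradient
    // written into g. Any functor or free function of the form
    //   double f(const std::vector<double>& x, double* g)
    // can be plugged into LBFGS<Function>.
    struct QuadLoss {
      double operator()(const std::vector<double>& x, double* g) const {
        double fx = 0.0;
        for (size_t i = 0; i < x.size(); ++i) {
          const double d = x[i] - 1.0;
          g[i] = 2.0 * d;   // gradient component
          fx += d * d;      // objective value
        }
        return fx;
      }
    };

    int main() {
      QuadLoss loss;
      LBFGS<QuadLoss> lbfgs(4, loss);  // 4-dimensional problem, as in ll_test.cc
      lbfgs.MinimizeFunction();        // replaces the old Optimize() call
      // The solution can be read back through lbfgs[i], as exposed in lbfgs++.h.
      return 0;
    }

In mr_pro_reduce.cc the same pattern is used with the other constructor shown in the diff, LBFGS<ProLoss> lbfgs(px, loss, 0.0, memory_buffers), which optimizes in place over an existing weight vector.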