author     Chris Dyer <cdyer@cs.cmu.edu>   2012-05-08 19:45:10 -0400
committer  Chris Dyer <cdyer@cs.cmu.edu>   2012-05-08 19:45:10 -0400
commit     33d4601da5e2e715260619a38f5899645d157952 (patch)
tree       f5828cf0906e907c88dd9c3ed7643e9983f62a56
parent     c168daa1e801a8a0be4d0c16311ae30b06a43b82 (diff)
switch to liblbfgs for pro
-rw-r--r--  pro-train/Makefile.am         |  2
-rw-r--r--  pro-train/mr_pro_reduce.cc    | 89
-rw-r--r--  training/liblbfgs/lbfgs++.h   | 36
-rw-r--r--  training/liblbfgs/ll_test.cc  |  9
4 files changed, 67 insertions(+), 69 deletions(-)
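The change drops the hand-rolled optimization loop in LearnParameters: the PRO objective is packaged as a ProLoss functor exposing double operator()(const vector<double>& x, double* g) const, and the liblbfgs-backed LBFGS<Function> wrapper drives it. A minimal sketch of that calling pattern follows; it substitutes a made-up QuadLoss for the real PRO logistic loss and assumes, as the new LearnParameters hunk below does, that lbfgsfloatval_t is double and that the wrapper provides the (x, f, l1_strength, memory_buffers) constructor.

// Sketch only: QuadLoss is illustrative and not part of this commit.
#include <iostream>
#include <vector>
#include "liblbfgs/lbfgs++.h"

struct QuadLoss {
  // f(x) = sum_i (x_i - 1)^2; the gradient is written through g,
  // matching the contract the wrapper expects of ProLoss.
  double operator()(const std::vector<double>& x, double* g) const {
    double fx = 0.0;
    for (size_t i = 0; i < x.size(); ++i) {
      const double d = x[i] - 1.0;
      fx += d * d;
      g[i] = 2.0 * d;
    }
    return fx;
  }
};

int main() {
  std::vector<double> x(4, 0.0);             // starting point, optimized in place
  QuadLoss loss;
  LBFGS<QuadLoss> lbfgs(&x, loss, 0.0, 10);  // no L1 penalty, 10 memory buffers
  lbfgs.MinimizeFunction();
  std::cerr << "x[0] after optimization = " << x[0] << std::endl;  // expect ~1.0
  return 0;
}
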
diff --git a/pro-train/Makefile.am b/pro-train/Makefile.am
index fdaf43e2..11d26211 100644
--- a/pro-train/Makefile.am
+++ b/pro-train/Makefile.am
@@ -8,6 +8,6 @@ mr_pro_map_SOURCES = mr_pro_map.cc
mr_pro_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/training/optimize.o $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 6362ce47..d3fb8026 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -11,6 +11,7 @@
#include "weights.h"
#include "sparse_vector.h"
#include "optimize.h"
+#include "liblbfgs/lbfgs++.h"
using namespace std;
namespace po = boost::program_options;
@@ -89,10 +90,10 @@ void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corp
if (flag) cerr << endl;
}
-void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_t>* acc) {
+void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
for (SparseVector<weight_t>::const_iterator it = v.begin();
it != v.end(); ++it) {
- (*acc)[it->first] += it->second * scale;
+ acc[it->first] += it->second * scale;
}
}
@@ -100,26 +101,24 @@ double ApplyRegularizationTerms(const double C,
const double T,
const vector<weight_t>& weights,
const vector<weight_t>& prev_weights,
- vector<weight_t>* g) {
- assert(weights.size() == g->size());
+ weight_t* g) {
double reg = 0;
for (size_t i = 0; i < weights.size(); ++i) {
const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
const double& w_i = weights[i];
- double& g_i = (*g)[i];
reg += C * w_i * w_i;
- g_i += 2 * C * w_i;
+ g[i] += 2 * C * w_i;
const double diff_i = w_i - prev_w_i;
reg += T * diff_i * diff_i;
- g_i += 2 * T * diff_i;
+ g[i] += 2 * T * diff_i;
}
return reg;
}
double TrainingInference(const vector<weight_t>& x,
const vector<pair<bool, SparseVector<weight_t> > >& corpus,
- vector<weight_t>* g = NULL) {
+ weight_t* g = NULL) {
double cll = 0;
for (int i = 0; i < corpus.size(); ++i) {
const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
@@ -139,20 +138,44 @@ double TrainingInference(const vector<weight_t>& x,
if (g) {
// g -= corpus[i].second * exp(lp_false);
GradAdd(corpus[i].second, -exp(lp_false), g);
- (*g)[0] -= exp(lp_false); // bias
+ g[0] -= exp(lp_false); // bias
}
} else { // false label
cll -= lp_false;
if (g) {
// g += corpus[i].second * exp(lp_true);
GradAdd(corpus[i].second, exp(lp_true), g);
- (*g)[0] += exp(lp_true); // bias
+ g[0] += exp(lp_true); // bias
}
}
}
return cll;
}
+struct ProLoss {
+ ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
+ const vector<pair<bool, SparseVector<weight_t> > >& te,
+ const double c,
+ const double t,
+ const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
+ double operator()(const vector<double>& x, double* g) const {
+ fill(g, g + x.size(), 0.0);
+ double cll = TrainingInference(x, training, g);
+ tppl = 0;
+ if (testing.size())
+ tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size()));
+ double ppl = cll / log(2);
+ ppl /= training.size();
+ ppl = pow(2.0, ppl);
+ double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
+ return cll + reg;
+ }
+ const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
+ const double C, T;
+ const vector<double>& prev_x;
+ mutable double tppl;
+};
+
// return held-out log likelihood
double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
const vector<pair<bool, SparseVector<weight_t> > >& testing,
@@ -161,42 +184,10 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
const unsigned memory_buffers,
const vector<weight_t>& prev_x,
vector<weight_t>* px) {
- vector<weight_t>& x = *px;
- vector<weight_t> vg(FD::NumFeats(), 0.0);
- bool converged = false;
- LBFGSOptimizer opt(FD::NumFeats(), memory_buffers);
- double tppl = 0.0;
- while(!converged) {
- fill(vg.begin(), vg.end(), 0.0);
- double cll = TrainingInference(x, training, &vg);
- double ppl = cll / log(2);
- ppl /= training.size();
- ppl = pow(2.0, ppl);
-
- // evaluate optional held-out test set
- if (testing.size()) {
- tppl = TrainingInference(x, testing) / log(2);
- tppl /= testing.size();
- tppl = pow(2.0, tppl);
- }
-
- // handle regularizer
- double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
- cll += reg;
- cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
- try {
- opt.Optimize(cll, vg, &x);
- converged = opt.HasConverged();
- } catch (...) {
- cerr << "Exception caught, assuming convergence is close enough...\n";
- converged = true;
- }
- if (fabs(x[0]) > MAX_BIAS) {
- cerr << "Biased model learned. Are your training instances wrong?\n";
- cerr << " BIAS: " << x[0] << endl;
- }
- }
- return tppl;
+ ProLoss loss(training, testing, C, T, prev_x);
+ LBFGS<ProLoss> lbfgs(px, loss, 0.0, memory_buffers);
+ lbfgs.MinimizeFunction();
+ return loss.tppl;
}
int main(int argc, char** argv) {
@@ -213,9 +204,9 @@ int main(int argc, char** argv) {
const double max_reg = conf["max_reg"].as<double>();
double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
const double T = conf["regularize_to_weights"].as<double>();
- assert(C > 0.0);
- assert(min_reg > 0.0);
- assert(max_reg > 0.0);
+ assert(C >= 0.0);
+ assert(min_reg >= 0.0);
+ assert(max_reg >= 0.0);
assert(max_reg > min_reg);
const double psi = conf["interpolate_with_weights"].as<double>();
if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
diff --git a/training/liblbfgs/lbfgs++.h b/training/liblbfgs/lbfgs++.h
index 119511e5..6c4d1807 100644
--- a/training/liblbfgs/lbfgs++.h
+++ b/training/liblbfgs/lbfgs++.h
@@ -9,9 +9,10 @@
#define __LBFGSPP_H__
#include <vector>
+#include <cassert>
#include "liblbfgs/lbfgs.h"
-// Function must be lbfgsfloatval_t f(const double* x_start, double* g_start)
+// Function must be double f(const vector<double>& x_start, double* g_start)
template <typename Function>
class LBFGS {
public:
@@ -46,11 +47,14 @@ class LBFGS {
lbfgsfloatval_t& operator[](size_t i) { return m_x[i]; }
size_t size() const { return m_x.size(); }
- int Optimize() {
+ int MinimizeFunction(bool s = false) {
+ silence = s;
lbfgsfloatval_t fx;
int ret = lbfgs(m_x.size(), &m_x[0], &fx, _evaluate, _progress, this, &param);
- std::cerr << "L-BFGS optimization terminated with status code = " << ret << std::endl;
- std::cerr << " fx = " << fx << std::endl;
+ if (!silence) {
+ std::cerr << "L-BFGS optimization terminated with status code = " << ret << std::endl;
+ std::cerr << " fx = " << fx << std::endl;
+ }
return ret;
}
@@ -62,6 +66,7 @@ class LBFGS {
param.linesearch = LBFGS_LINESEARCH_BACKTRACKING;
param.orthantwise_c = 1.0;
}
+ silence = false;
}
static lbfgsfloatval_t _evaluate(
@@ -79,7 +84,8 @@ class LBFGS {
const lbfgsfloatval_t step) {
(void) n;
(void) step;
- return func(x, g);
+ assert(x == &m_x[0]); // sanity check: x aliases m_x's storage, so passing m_x to func is safe
+ return func(m_x, g);
}
static int _progress(
@@ -109,21 +115,23 @@ class LBFGS {
int n,
int k,
int ls
- )
- {
- (void) n;
- (void) k;
- std::cerr << "Iteration " << k << ':' << std::endl;
- std::cerr << " fx = " << fx << ", x[0] = " << x[0] << ", x[1] = " << x[1] << std::endl;
- std::cerr << " xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << std::endl << std::endl;
- return 0;
+ ) {
+ (void) x;
+ (void) g;
+ (void) n;
+ (void) ls;
+ if (!silence) {
+ std::cerr << "Iteration " << k << ':' << "\tfx = " << fx << "\t"
+ << " xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << std::endl;
}
+ return 0;
+ }
std::vector<lbfgsfloatval_t>* p_x;
const bool owned;
std::vector<lbfgsfloatval_t>& m_x;
const Function& func;
lbfgs_parameter_t param;
-
+ bool silence;
};
#endif
diff --git a/training/liblbfgs/ll_test.cc b/training/liblbfgs/ll_test.cc
index 058db716..43c0f214 100644
--- a/training/liblbfgs/ll_test.cc
+++ b/training/liblbfgs/ll_test.cc
@@ -4,12 +4,11 @@
using namespace std;
// Function must be lbfgsfloatval_t f(x.begin, x.end, g.begin)
-lbfgsfloatval_t func(const lbfgsfloatval_t* x, lbfgsfloatval_t* g) {
+lbfgsfloatval_t func(const vector<lbfgsfloatval_t>& x, lbfgsfloatval_t* g) {
int i;
lbfgsfloatval_t fx = 0.0;
- int n = 4;
- for (i = 0;i < n;i += 2) {
+ for (i = 0;i < x.size();i += 2) {
lbfgsfloatval_t t1 = 1.0 - x[i];
lbfgsfloatval_t t2 = 10.0 * (x[i+1] - x[i] * x[i]);
g[i+1] = 20.0 * t2;
@@ -21,8 +20,8 @@ lbfgsfloatval_t func(const lbfgsfloatval_t* x, lbfgsfloatval_t* g) {
template<typename F>
void Opt(F& f) {
- LBFGS<F> lbfgs(4, f, 1.0);
- lbfgs.Optimize();
+ LBFGS<F> lbfgs(4, f);
+ lbfgs.MinimizeFunction();
}
int main(int argc, char** argv) {
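
For context, a self-contained version of the updated test might look like the sketch below. The two lines of func that the hunk above elides (the g[i] and fx updates) are filled in from the standard liblbfgs Rosenbrock sample and are an assumption, not the exact file contents.

#include <vector>
#include "liblbfgs/lbfgs++.h"

using namespace std;

// Rosenbrock-style objective in the new calling convention: the point
// arrives as a vector, the gradient is written through g.
lbfgsfloatval_t func(const vector<lbfgsfloatval_t>& x, lbfgsfloatval_t* g) {
  lbfgsfloatval_t fx = 0.0;
  for (size_t i = 0; i < x.size(); i += 2) {
    lbfgsfloatval_t t1 = 1.0 - x[i];
    lbfgsfloatval_t t2 = 10.0 * (x[i+1] - x[i] * x[i]);
    g[i+1] = 20.0 * t2;
    g[i] = -2.0 * (x[i] * g[i+1] + t1);  // filled in from the liblbfgs sample
    fx += t1 * t1 + t2 * t2;             // filled in from the liblbfgs sample
  }
  return fx;
}

template <typename F>
void Opt(F& f) {
  LBFGS<F> lbfgs(4, f);   // 4-dimensional problem, as in ll_test.cc
  lbfgs.MinimizeFunction();
}

int main() {
  Opt(func);
  return 0;
}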