| field | value | date |
|---|---|---|
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-29 00:36:09 +0000 |
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-29 00:36:09 +0000 |
| commit | 594cc714737708bb7c90c24a1ab1537b052f45ee (patch) | |
| tree | c3dca78e6c28379736f4b663e5c94b9dc40975cf /training | |
| parent | 016b376714976bbf19510e07797c9787f29daf60 (diff) | |
online optimizer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@631 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'training')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | training/Makefile.am | 8 |
| -rw-r--r-- | training/mr_optimize_reduce.cc | 6 |
| -rw-r--r-- | training/online_optimizer.cc | 14 |
| -rw-r--r-- | training/online_optimizer.h | 102 |
| -rw-r--r-- | training/online_train.cc | 8 |
| -rw-r--r-- | training/optimize.cc | 22 |
| -rw-r--r-- | training/optimize.h | 23 |
| -rw-r--r-- | training/optimize_test.cc | 19 |
8 files changed, 158 insertions, 44 deletions
```diff
diff --git a/training/Makefile.am b/training/Makefile.am
index 48b19932..a947e4a5 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -7,12 +7,16 @@ bin_PROGRAMS = \
   grammar_convert \
   atools \
   plftools \
-  collapse_weights
+  collapse_weights \
+  online_train

 noinst_PROGRAMS = \
   lbfgs_test \
   optimize_test

+online_train_SOURCES = online_train.cc online_optimizer.cc
+online_train_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+
 atools_SOURCES = atools.cc
 atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz

@@ -22,7 +26,7 @@ model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -l

 grammar_convert_SOURCES = grammar_convert.cc
 grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz

-optimize_test_SOURCES = optimize_test.cc optimize.cc
+optimize_test_SOURCES = optimize_test.cc optimize.cc online_optimizer.cc
 optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz

 collapse_weights_SOURCES = collapse_weights.cc
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
index 42727ecb..b931991d 100644
--- a/training/mr_optimize_reduce.cc
+++ b/training/mr_optimize_reduce.cc
@@ -108,11 +108,9 @@ int main(int argc, char** argv) {
     }
     wm.InitVector(&means);
   }
-  shared_ptr<Optimizer> o;
+  shared_ptr<BatchOptimizer> o;
   const string omethod = conf["optimization_method"].as<string>();
-  if (omethod == "sgd")
-    o.reset(new SGDOptimizer(conf["eta"].as<double>()));
-  else if (omethod == "rprop")
+  if (omethod == "rprop")
     o.reset(new RPropOptimizer(num_feats));  // TODO add configuration
   else
     o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>()));
diff --git a/training/online_optimizer.cc b/training/online_optimizer.cc
new file mode 100644
index 00000000..db55c95e
--- /dev/null
+++ b/training/online_optimizer.cc
@@ -0,0 +1,14 @@
+#include "online_optimizer.h"
+
+LearningRateSchedule::~LearningRateSchedule() {}
+
+double StandardLearningRate::eta(int k) const {
+  return eta_0_ / (1.0 + k / N_);
+}
+
+double ExponentialDecayLearningRate::eta(int k) const {
+  return eta_0_ * pow(alpha_, k / N_);
+}
+
+OnlineOptimizer::~OnlineOptimizer() {}
+
diff --git a/training/online_optimizer.h b/training/online_optimizer.h
new file mode 100644
index 00000000..0cd748c4
--- /dev/null
+++ b/training/online_optimizer.h
@@ -0,0 +1,102 @@
+#ifndef _ONL_OPTIMIZE_H_
+#define _ONL_OPTIMIZE_H_
+
+#include <tr1/memory>
+#include <string>
+#include <cmath>
+#include "sparse_vector.h"
+
+struct LearningRateSchedule {
+  virtual ~LearningRateSchedule();
+  // returns the learning rate for iteration k
+  virtual double eta(int k) const = 0;
+};
+
+struct StandardLearningRate : public LearningRateSchedule {
+  StandardLearningRate(
+      size_t training_instances,
+      double eta_0 = 0.2) :
+    eta_0_(eta_0),
+    N_(static_cast<double>(training_instances)) {}
+
+  virtual double eta(int k) const;
+
+ private:
+  const double eta_0_;
+  const double N_;
+};
+
+struct ExponentialDecayLearningRate : public LearningRateSchedule {
+  ExponentialDecayLearningRate(
+      size_t training_instances,
+      double eta_0 = 0.2,
+      double alpha = 0.85       // recommended by Tsuruoka et al. (ACL 2009)
+    ) : eta_0_(eta_0),
+        N_(static_cast<double>(training_instances)),
+        alpha_(alpha) {
+    assert(alpha > 0);
+    assert(alpha < 1.0);
+  }
+
+  virtual double eta(int k) const;
+
+ private:
+  const double eta_0_;
+  const double N_;
+  const double alpha_;
+};
+
+class OnlineOptimizer {
+ public:
+  virtual ~OnlineOptimizer();
+  OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
+    size_t training_instances) : schedule_(s), k_(), N_(training_instances) {}
+  void UpdateWeights(const SparseVector<double>& approx_g, SparseVector<double>* weights) {
+    ++k_;
+    const double eta = schedule_->eta(k_);
+    UpdateWeightsImpl(eta, approx_g, weights);
+  }
+
+ protected:
+  virtual void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, SparseVector<double>* weights) = 0;
+  const size_t N_; // number of training instances
+
+ private:
+  std::tr1::shared_ptr<LearningRateSchedule> schedule_;
+  int k_;  // iteration count
+};
+
+class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
+ public:
+  CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
+                              size_t training_instances, double C) :
+    OnlineOptimizer(s, training_instances), C_(C), u_() {}
+
+ protected:
+  void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, SparseVector<double>* weights) {
+    u_ += eta * C_ / N_;
+    (*weights) += eta * approx_g;
+    for (SparseVector<double>::const_iterator it = approx_g.begin(); it != approx_g.end(); ++it)
+      ApplyPenalty(it->first, weights);
+  }
+
+ private:
+  void ApplyPenalty(int i, SparseVector<double>* w) {
+    const double z = w->value(i);
+    double w_i = z;
+    double q_i = q_.value(i);
+    if (w_i > 0)
+      w_i = std::max(0.0, w_i - (u_ + q_i));
+    else
+      w_i = std::max(0.0, w_i + (u_ - q_i));
+    q_i += w_i - z;
+    q_.set_value(i, q_i);
+    w->set_value(i, w_i);
+  }
+
+  const double C_;  // regularization strength
+  double u_;
+  SparseVector<double> q_;
+};
+
+#endif
diff --git a/training/online_train.cc b/training/online_train.cc
new file mode 100644
index 00000000..2e906913
--- /dev/null
+++ b/training/online_train.cc
@@ -0,0 +1,8 @@
+#include <iostream>
+
+#include "online_optimizer.h"
+
+int main(int argc, char** argv) {
+  return 0;
+}
+
diff --git a/training/optimize.cc b/training/optimize.cc
index 5194752e..1377caa6 100644
--- a/training/optimize.cc
+++ b/training/optimize.cc
@@ -7,9 +7,9 @@

 using namespace std;

-Optimizer::~Optimizer() {}
+BatchOptimizer::~BatchOptimizer() {}

-void Optimizer::Save(ostream* out) const {
+void BatchOptimizer::Save(ostream* out) const {
   out->write((const char*)&eval_, sizeof(eval_));
   out->write((const char*)&has_converged_, sizeof(has_converged_));
   SaveImpl(out);
@@ -17,7 +17,7 @@ void Optimizer::Save(ostream* out) const {
   out->write((const char*)&magic, sizeof(magic));
 }

-void Optimizer::Load(istream* in) {
+void BatchOptimizer::Load(istream* in) {
   in->read((char*)&eval_, sizeof(eval_));
   ++eval_;
   in->read((char*)&has_converged_, sizeof(has_converged_));
@@ -28,11 +28,11 @@ void Optimizer::Load(istream* in) {
   cerr << Name() << " EVALUATION #" << eval_ << endl;
 }

-void Optimizer::SaveImpl(ostream* out) const {
+void BatchOptimizer::SaveImpl(ostream* out) const {
   (void)out;
 }

-void Optimizer::LoadImpl(istream* in) {
+void BatchOptimizer::LoadImpl(istream* in) {
   (void)in;
 }

@@ -78,18 +78,6 @@ void RPropOptimizer::LoadImpl(istream* in) {
   in->read((char*)&delta_ij_[0], sizeof(double) * n);
 }

-string SGDOptimizer::Name() const {
-  return "SGDOptimizer";
-}
-
-void SGDOptimizer::OptimizeImpl(const double& obj,
-                            const vector<double>& g,
-                            vector<double>* x) {
-  (void)obj;
-  for (int i = 0; i < g.size(); ++i)
-    (*x)[i] -= g[i] * eta_;
-}
-
 string LBFGSOptimizer::Name() const {
   return "LBFGSOptimizer";
 }
diff --git a/training/optimize.h b/training/optimize.h
index eddceaad..e2620f93 100644
--- a/training/optimize.h
+++ b/training/optimize.h
@@ -10,10 +10,10 @@

 // abstract base class for first order optimizers
 // order of invocation: new, Load(), Optimize(), Save(), delete
-class Optimizer {
+class BatchOptimizer {
  public:
-  Optimizer() : eval_(1), has_converged_(false) {}
-  virtual ~Optimizer();
+  BatchOptimizer() : eval_(1), has_converged_(false) {}
+  virtual ~BatchOptimizer();
   virtual std::string Name() const = 0;
   int EvaluationCount() const { return eval_; }
   bool HasConverged() const { return has_converged_; }
@@ -41,7 +41,7 @@ class Optimizer {
   bool has_converged_;
 };

-class RPropOptimizer : public Optimizer {
+class RPropOptimizer : public BatchOptimizer {
  public:
   explicit RPropOptimizer(int num_vars,
                           double eta_plus = 1.2,
@@ -75,20 +75,7 @@ class RPropOptimizer : public Optimizer {
   const double delta_min_;
 };

-class SGDOptimizer : public Optimizer {
- public:
-  explicit SGDOptimizer(int num_vars, double eta = 0.1) : eta_(eta) {
-    (void) num_vars;
-  }
-  std::string Name() const;
-  void OptimizeImpl(const double& obj,
-                    const std::vector<double>& g,
-                    std::vector<double>* x);
- private:
-  const double eta_;
-};
-
-class LBFGSOptimizer : public Optimizer {
+class LBFGSOptimizer : public BatchOptimizer {
  public:
   explicit LBFGSOptimizer(int num_vars, int memory_buffers = 10);
   std::string Name() const;
diff --git a/training/optimize_test.cc b/training/optimize_test.cc
index 0ada7cbb..6fa5efd4 100644
--- a/training/optimize_test.cc
+++ b/training/optimize_test.cc
@@ -3,12 +3,13 @@
 #include <sstream>
 #include <boost/program_options/variables_map.hpp>
 #include "optimize.h"
+#include "online_optimizer.h"
 #include "sparse_vector.h"
 #include "fdict.h"

 using namespace std;

-double TestOptimizer(Optimizer* opt) {
+double TestOptimizer(BatchOptimizer* opt) {
   cerr << "TESTING NON-PERSISTENT OPTIMIZER\n";
   // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
@@ -34,7 +35,7 @@
   return obj;
 }

-double TestPersistentOptimizer(Optimizer* opt) {
+double TestPersistentOptimizer(BatchOptimizer* opt) {
   cerr << "\nTESTING PERSISTENT OPTIMIZER\n";
   // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
   // df/dx1 = 8*x1 + x2
@@ -95,11 +96,23 @@ void TestOptimizerVariants(int num_vars) {
   cerr << oa.Name() << " SUCCESS\n";
 }

+using namespace std::tr1;
+
+void TestOnline() {
+  size_t N = 20;
+  double C = 1.0;
+  double eta0 = 0.2;
+  shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85));
+  //shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0));
+  CumulativeL1OnlineOptimizer opt(r, N, C);
+  assert(r->eta(10) < r->eta(1));
+}
+
 int main() {
   int n = 3;
-  TestOptimizerVariants<SGDOptimizer>(n);
   TestOptimizerVariants<LBFGSOptimizer>(n);
   TestOptimizerVariants<RPropOptimizer>(n);
+  TestOnline();
   return 0;
 }
```
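The new `online_optimizer.h` defines two learning-rate schedules (`eta_0 / (1 + k/N)` and `eta_0 * alpha^(k/N)`, the latter following Tsuruoka et al., ACL 2009) and a `CumulativeL1OnlineOptimizer` that applies the cumulative L1 penalty after each gradient step. Since `online_train.cc` is still only a stub in this commit, the following is a minimal, hypothetical sketch of how a training loop might drive the new interface. It assumes cdec's `SparseVector<double>` from `utils`; the feature id and the toy gradient are invented for illustration and are not part of the commit.

```cpp
// Hypothetical driver for the OnlineOptimizer interface added in this commit.
// Assumes it is compiled inside the cdec tree (sparse_vector.h, libutils).
#include <iostream>
#include <tr1/memory>

#include "online_optimizer.h"   // LearningRateSchedule, CumulativeL1OnlineOptimizer
#include "sparse_vector.h"      // cdec's SparseVector<double>

int main() {
  const size_t N = 100;   // number of training instances
  const double C = 1.0;   // L1 regularization strength
  std::tr1::shared_ptr<LearningRateSchedule> schedule(
      new ExponentialDecayLearningRate(N, 0.2, 0.85));
  CumulativeL1OnlineOptimizer opt(schedule, N, C);

  SparseVector<double> weights;
  for (int step = 0; step < 5; ++step) {
    // In a real trainer this would be the approximate gradient of the
    // objective for one instance or mini-batch; here it is a toy value
    // that pushes feature 1 toward 0.5.
    SparseVector<double> approx_g;
    approx_g.set_value(1, 0.5 - weights.value(1));
    opt.UpdateWeights(approx_g, &weights);   // gradient step + L1 clipping
  }
  std::cerr << "w_1 = " << weights.value(1) << std::endl;
  return 0;
}
```

Each call to `UpdateWeights` advances the iteration counter, queries the schedule for the current eta, adds `eta * approx_g` to the weights, and then clips every touched feature toward zero by at most the accumulated penalty `u_`, which is the cumulative-penalty trick that keeps the model sparse under stochastic updates.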
