summaryrefslogtreecommitdiff
path: root/training/crf
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2013-11-28 11:11:22 +0100
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2013-11-28 11:11:22 +0100
commita6b3d3417c6d8a63c36a0fdc9c7cf06f7b06df9f (patch)
treeb86142bbf686a69cc3c9913faee18e073b8eece0 /training/crf
parent0be26bb330f3a976243c9da1d6908d4aaf033ebe (diff)
parenta5b4f06838585874f200c94028cd4d41d54c4f76 (diff)
Merge branch 'master' of github.com:pks/cdec-dtrain
Diffstat (limited to 'training/crf')
-rw-r--r--training/crf/Makefile.am4
-rw-r--r--training/crf/mpi_adagrad_optimize.cc394
-rw-r--r--training/crf/mpi_batch_optimize.cc6
3 files changed, 401 insertions, 3 deletions
diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am
index 4a8c30fd..cd82161f 100644
--- a/training/crf/Makefile.am
+++ b/training/crf/Makefile.am
@@ -1,5 +1,6 @@
bin_PROGRAMS = \
mpi_batch_optimize \
+ mpi_adagrad_optimize \
mpi_compute_cllh \
mpi_extract_features \
mpi_extract_reachable \
@@ -10,6 +11,9 @@ bin_PROGRAMS = \
mpi_baum_welch_SOURCES = mpi_baum_welch.cc
mpi_baum_welch_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+mpi_adagrad_optimize_SOURCES = mpi_adagrad_optimize.cc cllh_observer.cc cllh_observer.h
+mpi_adagrad_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
mpi_online_optimize_SOURCES = mpi_online_optimize.cc
mpi_online_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
diff --git a/training/crf/mpi_adagrad_optimize.cc b/training/crf/mpi_adagrad_optimize.cc
new file mode 100644
index 00000000..af963e3a
--- /dev/null
+++ b/training/crf/mpi_adagrad_optimize.cc
@@ -0,0 +1,394 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <ctime>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "config.h"
+#include "stringlib.h"
+#include "verbose.h"
+#include "cllh_observer.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "online_optimizer.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("weights,w",po::value<string>(), "Initial feature weights")
+ ("training_data,d",po::value<string>(), "Training data corpus")
+ ("test_data,t",po::value<string>(), "(optional) Test data")
+ ("decoder_config,c",po::value<string>(), "Decoder configuration file")
+ ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(8),
+ "Number of training instances evaluated per processor in each minibatch")
+ ("max_passes", po::value<double>()->default_value(20.0), "Maximum number of passes through the data")
+ ("max_walltime", po::value<unsigned>(), "Walltime to run (in minutes)")
+ ("write_every_n_minibatches", po::value<unsigned>()->default_value(100), "Write weights every N minibatches processed")
+ ("random_seed,S", po::value<uint32_t>(), "Random seed")
+ ("regularization,r", po::value<string>()->default_value("none"),
+ "Regularization 'none', 'l1', or 'l2'")
+ ("regularization_strength,C", po::value<double>(), "Regularization strength")
+ ("eta,e", po::value<double>()->default_value(1.0), "Initial learning rate (eta)");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
+ cerr << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ int id = 0;
+ while(getline(in, line)) {
+ if (id % size == rank) {
+ c->push_back(line);
+ order->push_back(id);
+ }
+ ++id;
+ }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+ void Reset() {
+ acc_grad.clear();
+ acc_obj = 0;
+ total_complete = 0;
+ }
+
+ virtual void NotifyDecodingStart(const SentenceMetadata&) {
+ cur_model_exp.clear();
+ cur_obj = 0;
+ state = 1;
+ }
+
+ // compute model expectations, denominator of objective
+ virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+ assert(state == 1);
+ state = 2;
+ const prob_t z = InsideOutside<prob_t,
+ EdgeProb,
+ SparseVector<prob_t>,
+ EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+ cur_obj = log(z);
+ cur_model_exp /= z;
+ }
+
+ // compute "empirical" expectations, numerator of objective
+ virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) {
+ assert(state == 2);
+ state = 3;
+ SparseVector<prob_t> ref_exp;
+ const prob_t ref_z = InsideOutside<prob_t,
+ EdgeProb,
+ SparseVector<prob_t>,
+ EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
+ ref_exp /= ref_z;
+
+ double log_ref_z;
+#if 0
+ if (crf_uniform_empirical) {
+ log_ref_z = ref_exp.dot(feature_weights);
+ } else {
+ log_ref_z = log(ref_z);
+ }
+#else
+ log_ref_z = log(ref_z);
+#endif
+
+ // rounding errors means that <0 is too strict
+ if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
+ cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
+ exit(1);
+ }
+ assert(!std::isnan(log_ref_z));
+ ref_exp -= cur_model_exp;
+ acc_grad += ref_exp;
+ acc_obj += (cur_obj - log_ref_z);
+ }
+
+ virtual void NotifyDecodingComplete(const SentenceMetadata&) {
+ if (state == 3) {
+ ++total_complete;
+ } else {
+ }
+ }
+
+ void GetGradient(SparseVector<double>* g) const {
+ g->clear();
+#if HAVE_CXX11
+ for (auto& gi : acc_grad) {
+#else
+ for (FastSparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) {
+ pair<unsigned, double>& gi = *it;
+#endif
+ g->set_value(gi.first, -gi.second.as_float());
+ }
+ }
+
+ int total_complete;
+ SparseVector<prob_t> cur_model_exp;
+ SparseVector<prob_t> acc_grad;
+ double acc_obj;
+ double cur_obj;
+ int state;
+};
+
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {
+ template<>
+ struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >
+ : mpl::true_ { };
+} } // end namespace boost::mpi
+#endif
+
+class AdaGradOptimizer {
+ public:
+ explicit AdaGradOptimizer(double e) :
+ eta(e),
+ G() {}
+ void update(const SparseVector<double>& g, vector<double>* x) {
+ if (x->size() > G.size()) G.resize(x->size(), 0.0);
+#if HAVE_CXX11
+ for (auto& gi : g) {
+#else
+ for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it) {
+ const pair<unsigned,double>& gi = *it;
+#endif
+ if (gi.second) {
+ G[gi.first] += gi.second * gi.second;
+ (*x)[gi.first] -= eta / sqrt(G[gi.first]) * gi.second;
+ }
+ }
+ }
+ const double eta;
+ vector<double> G;
+};
+
+class AdaGradL1Optimizer {
+ public:
+ explicit AdaGradL1Optimizer(double e, double l) :
+ t(),
+ eta(e),
+ lambda(l),
+ G() {}
+ void update(const SparseVector<double>& g, vector<double>* x) {
+ t += 1.0;
+ if (x->size() > G.size()) {
+ G.resize(x->size(), 0.0);
+ u.resize(x->size(), 0.0);
+ }
+#if HAVE_CXX11
+ for (auto& gi : g) {
+#else
+ for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it) {
+ const pair<unsigned,double>& gi = *it;
+#endif
+ if (gi.second) {
+ u[gi.first] += gi.second;
+ G[gi.first] += gi.second * gi.second;
+ double z = fabs(u[gi.first] / t) - lambda;
+ double s = 1;
+ if (u[gi.first] > 0) s = -1;
+ if (z > 0 && G[gi.first])
+ (*x)[gi.first] = eta * s * z * t / sqrt(G[gi.first]);
+ else
+ (*x)[gi.first] = 0.0;
+ }
+ }
+ }
+ double t;
+ const double eta;
+ const double lambda;
+ vector<double> G, u;
+};
+
+unsigned non_zeros(const vector<double>& x) {
+ unsigned nz = 0;
+ for (unsigned i = 0; i < x.size(); ++i)
+ if (x[i]) ++nz;
+ return nz;
+}
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ const int size = 1;
+ const int rank = 0;
+#endif
+ if (size > 1) SetSilent(true); // turn off verbose decoder output
+ register_feature_functions();
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf))
+ return 1;
+
+ ReadFile ini_rf(conf["decoder_config"].as<string>());
+ Decoder decoder(ini_rf.stream());
+
+ // load initial weights
+ vector<weight_t> init_weights;
+ if (conf.count("input_weights"))
+ Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
+
+ vector<string> corpus, test_corpus;
+ vector<int> ids;
+ ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+ assert(corpus.size() > 0);
+ if (conf.count("test_data"))
+ ReadTrainingCorpus(conf["test_data"].as<string>(), rank, size, &corpus, &ids);
+
+ const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
+ if (size_per_proc > corpus.size()) {
+ cerr << "Minibatch size must be smaller than corpus size!\n";
+ return 1;
+ }
+ const double minibatch_size = size_per_proc * size;
+
+ size_t total_corpus_size = 0;
+#ifdef HAVE_MPI
+ reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
+#else
+ total_corpus_size = corpus.size();
+#endif
+
+ if (rank == 0)
+ cerr << "Total corpus size: " << total_corpus_size << endl;
+
+ boost::shared_ptr<MT19937> rng;
+ if (conf.count("random_seed"))
+ rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ rng.reset(new MT19937);
+
+ double passes_per_minibatch = static_cast<double>(size_per_proc) / total_corpus_size;
+
+ int write_weights_every_ith = conf["write_every_n_minibatches"].as<unsigned>();
+
+ unsigned max_iteration = conf["max_passes"].as<double>() / passes_per_minibatch;
+ ++max_iteration;
+ if (rank == 0)
+ cerr << "Max passes through data = " << conf["max_passes"].as<double>() << endl
+ << " --> max minibatches = " << max_iteration << endl;
+ unsigned timeout = 0;
+ if (conf.count("max_walltime"))
+ timeout = 60 * conf["max_walltime"].as<unsigned>();
+ vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+ if (init_weights.size()) {
+ lambdas.swap(init_weights);
+ init_weights.clear();
+ }
+
+ //AdaGradOptimizer adagrad(conf["eta"].as<double>());
+ AdaGradL1Optimizer adagrad(conf["eta"].as<double>(), conf["regularization_strength"].as<double>());
+ int iter = -1;
+ bool converged = false;
+
+ TrainingObserver observer;
+ ConditionalLikelihoodObserver cllh_observer;
+
+ const time_t start_time = time(NULL);
+ while (!converged) {
+#ifdef HAVE_MPI
+ mpi::timer timer;
+#endif
+ ++iter;
+ observer.Reset();
+ if (rank == 0) {
+ converged = (iter == max_iteration);
+ string fname = "weights.cur.gz";
+ if (iter % write_weights_every_ith == 0) {
+ ostringstream o; o << "weights." << iter << ".gz";
+ fname = o.str();
+ }
+ const time_t cur_time = time(NULL);
+ if (timeout && ((cur_time - start_time) > timeout)) {
+ converged = true;
+ fname = "weights.final.gz";
+ }
+ ostringstream vv;
+ double minutes = (cur_time - start_time) / 60.0;
+ vv << "total walltime=" << minutes << "min iter=" << iter << " minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << non_zeros(lambdas) << '/' << FD::NumFeats() << " passes_thru_data=" << (iter * size_per_proc / static_cast<double>(corpus.size()));
+ const string svv = vv.str();
+ cerr << svv << endl;
+ Weights::WriteToFile(fname, lambdas, true, &svv);
+ }
+
+ for (int i = 0; i < size_per_proc; ++i) {
+ int ei = corpus.size() * rng->next();
+ int id = ids[ei];
+ decoder.SetId(id);
+ decoder.Decode(corpus[ei], &observer);
+ }
+ SparseVector<double> local_grad, g;
+ observer.GetGradient(&local_grad);
+#ifdef HAVE_MPI
+ reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
+#else
+ g.swap(local_grad);
+#endif
+ local_grad.clear();
+ if (rank == 0) {
+ g /= minibatch_size;
+ lambdas.resize(FD::NumFeats(), 0.0); // might have seen new features
+ adagrad.update(g, &lambdas);
+ Weights::SanityCheck(lambdas);
+ Weights::ShowLargestFeatures(lambdas);
+ }
+#ifdef HAVE_MPI
+ broadcast(world, lambdas, 0);
+ broadcast(world, converged, 0);
+ world.barrier();
+ if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+ }
+ cerr << "CONVERGED = " << converged << endl;
+ cerr << "EXITING...\n";
+ return 0;
+}
+
diff --git a/training/crf/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc
index 2eff07e4..da1845b1 100644
--- a/training/crf/mpi_batch_optimize.cc
+++ b/training/crf/mpi_batch_optimize.cc
@@ -97,14 +97,14 @@ struct TrainingObserver : public DecoderObserver {
(*g)[it->first] = it->second.as_float();
}
- virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
+ virtual void NotifyDecodingStart(const SentenceMetadata&) {
cur_model_exp.clear();
cur_obj = 0;
state = 1;
}
// compute model expectations, denominator of objective
- virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
assert(state == 1);
state = 2;
const prob_t z = InsideOutside<prob_t,
@@ -149,7 +149,7 @@ struct TrainingObserver : public DecoderObserver {
trg_words += smeta.GetReference().size();
}
- virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+ virtual void NotifyDecodingComplete(const SentenceMetadata&) {
if (state == 3) {
++total_complete;
} else {