author | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100
---|---|---
committer | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100
commit | 5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch) |
tree | 9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /rst_parser/mst_train.cc |
parent | cf9994131993b40be62e90e213b1e11e6b550143 (diff) |
parent | 21825a09d97c2e0afd20512f306fb25fed55e529 (diff) |
Merge remote branch 'upstream/master'
Conflicts:
Jamroot
bjam
decoder/Jamfile
decoder/cdec.cc
dpmert/Jamfile
jam-files/sanity.jam
klm/lm/Jamfile
klm/util/Jamfile
mira/Jamfile
Diffstat (limited to 'rst_parser/mst_train.cc')
-rw-r--r-- | rst_parser/mst_train.cc | 228
1 file changed, 0 insertions, 228 deletions
```diff
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
deleted file mode 100644
index a78df600..00000000
--- a/rst_parser/mst_train.cc
+++ /dev/null
@@ -1,228 +0,0 @@
-#include "arc_factored.h"
-
-#include <vector>
-#include <iostream>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-// #define HAVE_THREAD 1
-#if HAVE_THREAD
-#include <boost/thread.hpp>
-#endif
-
-#include "arc_ff.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "tdict.h"
-#include "dep_training.h"
-#include "optimize.h"
-#include "weights.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  string cfg_file;
-  opts.add_options()
-        ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)")
-        ("weights,w",po::value<string>(), "Optional starting weights")
-        ("output_every_i_iterations,I",po::value<unsigned>()->default_value(1), "Write weights every I iterations")
-        ("regularization_strength,C",po::value<double>()->default_value(1.0), "Regularization strength")
-#ifdef HAVE_CMPH
-        ("cmph_perfect_feature_hash,h", po::value<string>(), "Load perfect hash function for features")
-#endif
-#if HAVE_THREAD
-        ("threads,T",po::value<unsigned>()->default_value(1), "Number of threads")
-#endif
-        ("correction_buffers,m", po::value<int>()->default_value(10), "LBFGS correction buffers");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config,c", po::value<string>(&cfg_file), "Configuration file")
-        ("help,?", "Print this help message and exit");
-
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(dconfig_options).add(clo);
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (cfg_file.size() > 0) {
-    ReadFile rf(cfg_file);
-    po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf);
-  }
-  if (conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-void AddFeatures(double prob, const SparseVector<double>& fmap, vector<double>* g) {
-  for (SparseVector<double>::const_iterator it = fmap.begin(); it != fmap.end(); ++it)
-    (*g)[it->first] += it->second * prob;
-}
-
-double ApplyRegularizationTerms(const double C,
-                                const vector<double>& weights,
-                                vector<double>* g) {
-  assert(weights.size() == g->size());
-  double reg = 0;
-  for (size_t i = 0; i < weights.size(); ++i) {
-//    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
-    const double& w_i = weights[i];
-    double& g_i = (*g)[i];
-    reg += C * w_i * w_i;
-    g_i += 2 * C * w_i;
-
-//    reg += T * (w_i - prev_w_i) * (w_i - prev_w_i);
-//    g_i += 2 * T * (w_i - prev_w_i);
-  }
-  return reg;
-}
-
-struct GradientWorker {
-  GradientWorker(int f,
-                 int t,
-                 vector<double>* w,
-                 vector<TrainingInstance>* c,
-                 vector<ArcFactoredForest>* fs) : obj(), weights(*w), from(f), to(t), corpus(*c), forests(*fs), g(w->size()) {}
-  void operator()() {
-    int every = (to - from) / 20;
-    if (!every) every++;
-    for (int i = from; i < to; ++i) {
-      if ((from == 0) && (i + 1) % every == 0) cerr << '.' << flush;
-      const int num_words = corpus[i].ts.words.size();
-      forests[i].Reweight(weights);
-      prob_t z;
-      forests[i].EdgeMarginals(&z);
-      obj -= log(z);
-      //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << " OO= " << (-corpus[i].features.dot(weights) - lz) << endl;
-      //cerr << " ZZ = " << zz << endl;
-      for (int h = -1; h < num_words; ++h) {
-        for (int m = 0; m < num_words; ++m) {
-          if (h == m) continue;
-          const ArcFactoredForest::Edge& edge = forests[i](h,m);
-          const SparseVector<weight_t>& fmap = edge.features;
-          double prob = edge.edge_prob.as_float();
-          if (prob < -0.000001) { cerr << "Prob < 0: " << prob << endl; prob = 0; }
-          if (prob > 1.000001) { cerr << "Prob > 1: " << prob << endl; prob = 1; }
-          AddFeatures(prob, fmap, &g);
-          //mfm += fmap * prob; // DE
-        }
-      }
-    }
-  }
-  double obj;
-  vector<double>& weights;
-  const int from, to;
-  vector<TrainingInstance>& corpus;
-  vector<ArcFactoredForest>& forests;
-  vector<double> g; // local gradient
-};
-
-int main(int argc, char** argv) {
-  int rank = 0;
-  int size = 1;
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  if (conf.count("cmph_perfect_feature_hash")) {
-    cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as<string>() << " ...\n";
-    FD::EnableHash(conf["cmph_perfect_feature_hash"].as<string>());
-    cerr << " " << FD::NumFeats() << " features in map\n";
-  }
-  ArcFeatureFunctions ffs;
-  vector<TrainingInstance> corpus;
-  TrainingInstance::ReadTrainingCorpus(conf["training_data"].as<string>(), &corpus, rank, size);
-  vector<weight_t> weights;
-  Weights::InitFromFile(conf["weights"].as<string>(), &weights);
-  vector<ArcFactoredForest> forests(corpus.size());
-  SparseVector<double> empirical;
-  cerr << "Extracting features...\n";
-  bool flag = false;
-  for (int i = 0; i < corpus.size(); ++i) {
-    TrainingInstance& cur = corpus[i];
-    if (rank == 0 && (i+1) % 10 == 0) { cerr << '.' << flush; flag = true; }
-    if (rank == 0 && (i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; }
-    ffs.PrepareForInput(cur.ts);
-    SparseVector<weight_t> efmap;
-    for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) {
-      efmap.clear();
-      ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first,
-                       cur.tree.h_m_pairs[j].second,
-                       &efmap);
-      cur.features += efmap;
-    }
-    for (int j = 0; j < cur.tree.roots.size(); ++j) {
-      efmap.clear();
-      ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap);
-      cur.features += efmap;
-    }
-    empirical += cur.features;
-    forests[i].resize(cur.ts.words.size());
-    forests[i].ExtractFeatures(cur.ts, ffs);
-  }
-  if (flag) cerr << endl;
-  //cerr << "EMP: " << empirical << endl; //DE
-  weights.resize(FD::NumFeats(), 0.0);
-  vector<weight_t> g(FD::NumFeats(), 0.0);
-  cerr << "features initialized\noptimizing...\n";
-  boost::shared_ptr<BatchOptimizer> o;
-#if HAVE_THREAD
-  unsigned threads = conf["threads"].as<unsigned>();
-  if (threads > corpus.size()) threads = corpus.size();
-#else
-  const unsigned threads = 1;
-#endif
-  int chunk = corpus.size() / threads;
-  o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as<int>()));
-  int iterations = 1000;
-  for (int iter = 0; iter < iterations; ++iter) {
-    cerr << "ITERATION " << iter << " " << flush;
-    fill(g.begin(), g.end(), 0.0);
-    for (SparseVector<double>::iterator it = empirical.begin(); it != empirical.end(); ++it)
-      g[it->first] = -it->second;
-    double obj = -empirical.dot(weights);
-    vector<boost::shared_ptr<GradientWorker> > jobs;
-    for (int from = 0; from < corpus.size(); from += chunk) {
-      int to = from + chunk;
-      if (to > corpus.size()) to = corpus.size();
-      jobs.push_back(boost::shared_ptr<GradientWorker>(new GradientWorker(from, to, &weights, &corpus, &forests)));
-    }
-#if HAVE_THREAD
-    boost::thread_group tg;
-    for (int i = 0; i < threads; ++i)
-      tg.create_thread(boost::ref(*jobs[i]));
-    tg.join_all();
-#else
-    (*jobs[0])();
-#endif
-    for (int i = 0; i < threads; ++i) {
-      obj += jobs[i]->obj;
-      vector<double>& tg = jobs[i]->g;
-      for (unsigned j = 0; j < g.size(); ++j)
-        g[j] += tg[j];
-    }
-    // SparseVector<double> mfm; //DE
-    //cerr << endl << "E: " << empirical << endl; // DE
-    //cerr << "M: " << mfm << endl; // DE
-    double r = ApplyRegularizationTerms(conf["regularization_strength"].as<double>(), weights, &g);
-    double gnorm = 0;
-    for (int i = 0; i < g.size(); ++i)
-      gnorm += g[i]*g[i];
-    ostringstream ll;
-    ll << "ITER=" << (iter+1) << "\tOBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm);
-    cerr << ' ' << ll.str().substr(ll.str().find('\t')+1) << endl;
-    obj += r;
-    assert(obj >= 0);
-    o->Optimize(obj, g, &weights);
-    Weights::ShowLargestFeatures(weights);
-    const bool converged = o->HasConverged();
-    const char* ofname = converged ? "weights.final.gz" : "weights.cur.gz";
-    if (converged || ((iter+1) % conf["output_every_i_iterations"].as<unsigned>()) == 0) {
-      cerr << "writing..." << flush;
-      const string sl = ll.str();
-      Weights::WriteToFile(ofname, weights, true, &sl);
-      cerr << "done" << endl;
-    }
-    if (converged) { cerr << "CONVERGED\n"; break; }
-  }
-  return 0;
-}
```
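The deleted trainer implements a standard regularized log-linear objective: `g` starts at the negated empirical feature counts, each `GradientWorker` adds back expected feature counts computed from edge marginals, and `ApplyRegularizationTerms` adds an L2 penalty, contributing C·w_i² to the objective and 2·C·w_i to each gradient entry. Below is a minimal, self-contained sketch of just that penalty step; the function name and toy values are illustrative, not part of cdec:

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// L2 penalty: adds C * sum_i w_i^2 to the objective and its derivative
// 2 * C * w_i to each gradient component, mirroring the deleted
// ApplyRegularizationTerms in the diff above (illustrative sketch).
double ApplyL2Penalty(double C, const std::vector<double>& w, std::vector<double>* g) {
  assert(w.size() == g->size());
  double reg = 0;
  for (size_t i = 0; i < w.size(); ++i) {
    reg += C * w[i] * w[i];   // penalty term added to the objective
    (*g)[i] += 2 * C * w[i];  // its gradient, accumulated in place
  }
  return reg;
}

int main() {
  std::vector<double> w = {0.5, -1.0, 2.0};  // toy weights
  std::vector<double> g(w.size(), 0.0);      // gradient of the unregularized loss
  const double C = 1.0;                      // matches the trainer's -C default
  const double r = ApplyL2Penalty(C, w, &g);
  std::printf("R=%g g=[%g, %g, %g]\n", r, g[0], g[1], g[2]);
  return 0;
}
```

On the toy vector this prints R=5.25 and g=[1, -2, 4]: each gradient entry picks up 2·C·w_i, which is what pulls the LBFGS step toward smaller weights.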