diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
commit | ac586bc9b156b4ae687cd5961ba1fe7b20ec57d6 (patch) | |
tree | 052473b46d7fa18d51f897cdb9e7c93a7186dafd /rst_parser/rst_train.cc | |
parent | 97b85c082b3e55c28a8b0c0eb762483ac84a1577 (diff) | |
parent | ad6d4a1b2519896f2b16a282699ce4e64041fab8 (diff) |
Merge remote branch 'upstream/master'
Conflicts:
Jamroot
bjam
decoder/Jamfile
decoder/cdec.cc
dpmert/Jamfile
jam-files/sanity.jam
klm/lm/Jamfile
klm/util/Jamfile
mira/Jamfile
Diffstat (limited to 'rst_parser/rst_train.cc')
-rw-r--r-- | rst_parser/rst_train.cc | 144 |
1 files changed, 0 insertions, 144 deletions
diff --git a/rst_parser/rst_train.cc b/rst_parser/rst_train.cc deleted file mode 100644 index a8b8dd84..00000000 --- a/rst_parser/rst_train.cc +++ /dev/null @@ -1,144 +0,0 @@ -#include "arc_factored.h" - -#include <vector> -#include <iostream> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "timing_stats.h" -#include "arc_ff.h" -#include "dep_training.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" -#include "weights.h" -#include "rst.h" -#include "global_ff.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - string cfg_file; - opts.add_options() - ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)") - ("q_weights,q",po::value<string>(), "Arc-factored weights for proposal distribution") - ("samples,n",po::value<unsigned>()->default_value(1000), "Number of samples"); - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value<string>(&cfg_file), "Configuration file") - ("help,?", "Print this help message and exit"); - - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(dconfig_options).add(clo); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (cfg_file.size() > 0) { - ReadFile rf(cfg_file); - po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); - } - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - vector<weight_t> qweights(FD::NumFeats(), 0.0); - Weights::InitFromFile(conf["q_weights"].as<string>(), &qweights); - vector<TrainingInstance> corpus; - ArcFeatureFunctions ffs; - GlobalFeatureFunctions gff; - TrainingInstance::ReadTrainingCorpus(conf["training_data"].as<string>(), &corpus); - vector<ArcFactoredForest> forests(corpus.size()); - vector<prob_t> zs(corpus.size()); - SparseVector<double> empirical; - bool flag = false; - for (int i = 0; i < corpus.size(); ++i) { - TrainingInstance& cur = corpus[i]; - if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } - if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } - SparseVector<weight_t> efmap; - ffs.PrepareForInput(cur.ts); - gff.PrepareForInput(cur.ts); - for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { - efmap.clear(); - ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, - cur.tree.h_m_pairs[j].second, - &efmap); - cur.features += efmap; - } - for (int j = 0; j < cur.tree.roots.size(); ++j) { - efmap.clear(); - ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); - cur.features += efmap; - } - efmap.clear(); - gff.Features(cur.ts, cur.tree, &efmap); - cur.features += efmap; - empirical += cur.features; - forests[i].resize(cur.ts.words.size()); - forests[i].ExtractFeatures(cur.ts, ffs); - forests[i].Reweight(qweights); - forests[i].EdgeMarginals(&zs[i]); - zs[i] = prob_t::One() / zs[i]; - // cerr << zs[i] << endl; - forests[i].Reweight(qweights); // EdgeMarginals overwrites edge_prob - } - if (flag) cerr << endl; - MT19937 rng; - SparseVector<double> model_exp; - SparseVector<double> weights; - Weights::InitSparseVector(qweights, &weights); - int samples = conf["samples"].as<unsigned>(); - for (int i = 0; i < corpus.size(); ++i) { -#if 0 - forests[i].EdgeMarginals(); - model_exp.clear(); - for (int h = -1; h < num_words; ++h) { - for (int m = 0; m < num_words; ++m) { - if (h == m) continue; - const ArcFactoredForest::Edge& edge = forests[i](h,m); - const SparseVector<weight_t>& fmap = edge.features; - double prob = edge.edge_prob.as_float(); - model_exp += fmap * prob; - } - } - cerr << "TRUE EXP: " << model_exp << endl; - forests[i].Reweight(weights); -#endif - - TreeSampler ts(forests[i]); - prob_t zhat = prob_t::Zero(); - SparseVector<prob_t> sampled_exp; - for (int n = 0; n < samples; ++n) { - EdgeSubset tree; - ts.SampleRandomSpanningTree(&tree, &rng); - SparseVector<double> qfeats, gfeats; - tree.ExtractFeatures(corpus[i].ts, ffs, &qfeats); - prob_t u; u.logeq(qfeats.dot(qweights)); - const prob_t q = u / zs[i]; // proposal mass - gff.Features(corpus[i].ts, tree, &gfeats); - SparseVector<double> tot_feats = qfeats + gfeats; - u.logeq(tot_feats.dot(weights)); - prob_t w = u / q; - zhat += w; - for (SparseVector<double>::iterator it = tot_feats.begin(); it != tot_feats.end(); ++it) - sampled_exp.add_value(it->first, w * prob_t(it->second)); - } - sampled_exp /= zhat; - SparseVector<double> tot_m; - for (SparseVector<prob_t>::iterator it = sampled_exp.begin(); it != sampled_exp.end(); ++it) - tot_m.add_value(it->first, it->second.as_float()); - //cerr << "DIFF: " << (tot_m - corpus[i].features) << endl; - const double eta = 0.03; - weights -= (tot_m - corpus[i].features) * eta; - } - cerr << "WEIGHTS.\n"; - cerr << weights << endl; - return 0; -} - |