summaryrefslogtreecommitdiff
path: root/rst_parser/rst_train.cc
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commitac586bc9b156b4ae687cd5961ba1fe7b20ec57d6 (patch)
tree052473b46d7fa18d51f897cdb9e7c93a7186dafd /rst_parser/rst_train.cc
parent97b85c082b3e55c28a8b0c0eb762483ac84a1577 (diff)
parentad6d4a1b2519896f2b16a282699ce4e64041fab8 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'rst_parser/rst_train.cc')
-rw-r--r--rst_parser/rst_train.cc144
1 files changed, 0 insertions, 144 deletions
diff --git a/rst_parser/rst_train.cc b/rst_parser/rst_train.cc
deleted file mode 100644
index a8b8dd84..00000000
--- a/rst_parser/rst_train.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-#include "arc_factored.h"
-
-#include <vector>
-#include <iostream>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "timing_stats.h"
-#include "arc_ff.h"
-#include "dep_training.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "tdict.h"
-#include "weights.h"
-#include "rst.h"
-#include "global_ff.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- string cfg_file;
- opts.add_options()
- ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)")
- ("q_weights,q",po::value<string>(), "Arc-factored weights for proposal distribution")
- ("samples,n",po::value<unsigned>()->default_value(1000), "Number of samples");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config,c", po::value<string>(&cfg_file), "Configuration file")
- ("help,?", "Print this help message and exit");
-
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(dconfig_options).add(clo);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (cfg_file.size() > 0) {
- ReadFile rf(cfg_file);
- po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf);
- }
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- vector<weight_t> qweights(FD::NumFeats(), 0.0);
- Weights::InitFromFile(conf["q_weights"].as<string>(), &qweights);
- vector<TrainingInstance> corpus;
- ArcFeatureFunctions ffs;
- GlobalFeatureFunctions gff;
- TrainingInstance::ReadTrainingCorpus(conf["training_data"].as<string>(), &corpus);
- vector<ArcFactoredForest> forests(corpus.size());
- vector<prob_t> zs(corpus.size());
- SparseVector<double> empirical;
- bool flag = false;
- for (int i = 0; i < corpus.size(); ++i) {
- TrainingInstance& cur = corpus[i];
- if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; }
- if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; }
- SparseVector<weight_t> efmap;
- ffs.PrepareForInput(cur.ts);
- gff.PrepareForInput(cur.ts);
- for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) {
- efmap.clear();
- ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first,
- cur.tree.h_m_pairs[j].second,
- &efmap);
- cur.features += efmap;
- }
- for (int j = 0; j < cur.tree.roots.size(); ++j) {
- efmap.clear();
- ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap);
- cur.features += efmap;
- }
- efmap.clear();
- gff.Features(cur.ts, cur.tree, &efmap);
- cur.features += efmap;
- empirical += cur.features;
- forests[i].resize(cur.ts.words.size());
- forests[i].ExtractFeatures(cur.ts, ffs);
- forests[i].Reweight(qweights);
- forests[i].EdgeMarginals(&zs[i]);
- zs[i] = prob_t::One() / zs[i];
- // cerr << zs[i] << endl;
- forests[i].Reweight(qweights); // EdgeMarginals overwrites edge_prob
- }
- if (flag) cerr << endl;
- MT19937 rng;
- SparseVector<double> model_exp;
- SparseVector<double> weights;
- Weights::InitSparseVector(qweights, &weights);
- int samples = conf["samples"].as<unsigned>();
- for (int i = 0; i < corpus.size(); ++i) {
-#if 0
- forests[i].EdgeMarginals();
- model_exp.clear();
- for (int h = -1; h < num_words; ++h) {
- for (int m = 0; m < num_words; ++m) {
- if (h == m) continue;
- const ArcFactoredForest::Edge& edge = forests[i](h,m);
- const SparseVector<weight_t>& fmap = edge.features;
- double prob = edge.edge_prob.as_float();
- model_exp += fmap * prob;
- }
- }
- cerr << "TRUE EXP: " << model_exp << endl;
- forests[i].Reweight(weights);
-#endif
-
- TreeSampler ts(forests[i]);
- prob_t zhat = prob_t::Zero();
- SparseVector<prob_t> sampled_exp;
- for (int n = 0; n < samples; ++n) {
- EdgeSubset tree;
- ts.SampleRandomSpanningTree(&tree, &rng);
- SparseVector<double> qfeats, gfeats;
- tree.ExtractFeatures(corpus[i].ts, ffs, &qfeats);
- prob_t u; u.logeq(qfeats.dot(qweights));
- const prob_t q = u / zs[i]; // proposal mass
- gff.Features(corpus[i].ts, tree, &gfeats);
- SparseVector<double> tot_feats = qfeats + gfeats;
- u.logeq(tot_feats.dot(weights));
- prob_t w = u / q;
- zhat += w;
- for (SparseVector<double>::iterator it = tot_feats.begin(); it != tot_feats.end(); ++it)
- sampled_exp.add_value(it->first, w * prob_t(it->second));
- }
- sampled_exp /= zhat;
- SparseVector<double> tot_m;
- for (SparseVector<prob_t>::iterator it = sampled_exp.begin(); it != sampled_exp.end(); ++it)
- tot_m.add_value(it->first, it->second.as_float());
- //cerr << "DIFF: " << (tot_m - corpus[i].features) << endl;
- const double eta = 0.03;
- weights -= (tot_m - corpus[i].features) * eta;
- }
- cerr << "WEIGHTS.\n";
- cerr << weights << endl;
- return 0;
-}
-