summary refs log tree commit diff
path: root/pro/mr_pro_map.cc
diff options
context:
space:
mode:
author Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-18 13:35:42 -0500
committer Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-18 13:35:42 -0500
commit 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch)
tree 33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /pro/mr_pro_map.cc
parent 7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff)
major restructure of the training code
Diffstat (limited to 'pro/mr_pro_map.cc')
-rw-r--r-- pro/mr_pro_map.cc 201
1 files changed, 0 insertions, 201 deletions
diff --git a/pro/mr_pro_map.cc b/pro/mr_pro_map.cc
deleted file mode 100644
index eef40b8a..00000000
--- a/pro/mr_pro_map.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <tr1/unordered_map>
-
-#include <boost/functional/hash.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "candidate_set.h"
-#include "sampler.h"
-#include "filelib.h"
-#include "stringlib.h"
-#include "weights.h"
-#include "inside_outside.h"
-#include "hg_io.h"
-#include "ns.h"
-#include "ns_docscorer.h"
-
-// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
-
-using namespace std;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> rng;
-
-// Parses argv into *conf with boost::program_options.
-//
-// The "reference" and "weights" options are mandatory. If either is missing,
-// or if "help" was given, the option summary is written to stderr and the
-// process terminates via exit(1); otherwise the function simply returns with
-// *conf populated (including the declared default values).
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-    ("reference,r", po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
-    ("weights,w", po::value<string>(), "[REQD] Weights files from current iterations")
-    ("kbest_repository,K", po::value<string>()->default_value("./kbest"), "K-best list repository (directory)")
-    ("input,i", po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-    ("source,s", po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
-    ("evaluation_metric,m", po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
-    ("kbest_size,k", po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
-    ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
-    ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
-    ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
-    ("help,h", "Help");
-
-  po::options_description cmdline;
-  cmdline.add(opts);
-  po::store(parse_command_line(argc, argv, cmdline), *conf);
-
-  // Report every missing mandatory option before printing the usage text.
-  const bool have_reference = conf->count("reference") > 0;
-  const bool have_weights = conf->count("weights") > 0;
-  if (!have_reference)
-    cerr << "Please specify one or more references using -r <REF.TXT>\n";
-  if (!have_weights)
-    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
-
-  // Happy path: everything required is present and help was not requested.
-  if (have_reference && have_weights && !conf->count("help")) return;
-
-  cerr << cmdline << endl;
-  exit(1);
-}
-
-// Step function over a magnitude: yields 0.0 for values strictly below the
-// configured threshold (default 0.05) and 1.0 for everything else.
-struct ThresholdAlpha {
-  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
-
-  double operator()(double mag) const {
-    return (mag < threshold) ? 0.0 : 1.0;
-  }
-
-  // Cut-off below which operator() returns 0.0; fixed at construction.
-  const double threshold;
-};
-
-// A single pairwise training example: a feature vector, a binary label and
-// the magnitude of the metric difference that produced it.
-struct TrainingInstance {
-  // Stores a copy of feats in x, the label in y and diff in gdiff.
-  TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff)
-      : x(feats),
-        y(positive),
-        gdiff(diff) {}
-
-  // Feature vector of the example.
-  SparseVector<weight_t> x;
-  // DEBUGGING_PRO is force-disabled from this point on; the two members
-  // below and every later DEBUGGING_PRO section compile only if this #undef
-  // is replaced by a #define.
-#undef DEBUGGING_PRO
-#ifdef DEBUGGING_PRO
-  vector<WordID> a;
-  vector<WordID> b;
-#endif
-  // Class label of the example.
-  bool y;
-  // Absolute metric-score difference associated with the example.
-  float gdiff;
-};
-#ifdef DEBUGGING_PRO
-// Debug-only printer for a TrainingInstance: writes gdiff, the label, the two
-// word sequences a and b, and finally the feature vector x.
-ostream& operator<<(ostream& os, const TrainingInstance& d) {
-  os << d.gdiff << " y=" << d.y;
-  os << "\tA:" << TD::GetString(d.a);
-  os << "\n\tB: " << TD::GetString(d.b);
-  os << "\n\tX: " << d.x;
-  return os;
-}
-#endif
-
-// Comparator that orders TrainingInstances by descending gdiff, i.e. the
-// instance with the larger metric difference sorts first.
-struct DiffOrder {
-  bool operator()(const TrainingInstance& lhs, const TrainingInstance& rhs) const {
-    return rhs.gdiff < lhs.gdiff;
-  }
-};
-
-// Pairwise sampler for PRO training (Hopkins & May 2011, "Algorithm Sampler").
-//
-// Draws up to `gamma` random candidate pairs (a, b) from J_i, turns each pair
-// whose metric scores differ into a TrainingInstance holding the feature
-// difference fmap(a) - fmap(b), stochastically filters those instances, and
-// appends the (at most) `xi` survivors with the largest score difference to
-// *pv, ordered by descending difference.
-//
-//   gamma   number of random pairs to draw
-//   xi      maximum number of instances appended to *pv
-//   J_i     candidate set for one sentence
-//   metric  used to score each candidate's eval_feats; for error metrics
-//           (IsErrorMetric()) the label is inverted
-//   pv      output vector; existing contents are kept, new ones are appended
-//
-// Uses the global `rng`. Nothing is appended when J_i is empty or when no
-// sampled pair yields a usable instance.
-void Sample(const unsigned gamma,
-            const unsigned xi,
-            const training::CandidateSet& J_i,
-            const EvaluationMetric* metric,
-            vector<TrainingInstance>* pv) {
-  // Without candidates there is nothing to index: the upper bound
-  // J_i.size() - 1 passed to the RNG below would be invalid and J_i[a]
-  // would read out of range.
-  if (J_i.size() == 0) return;
-
-  const bool invert_score = metric->IsErrorMetric();
-  vector<TrainingInstance> v1, v2;
-  float avg_diff = 0;
-  for (unsigned i = 0; i < gamma; ++i) {
-    const size_t a = rng->inclusive(0, J_i.size() - 1)();
-    const size_t b = rng->inclusive(0, J_i.size() - 1)();
-    if (a == b) continue;
-    float ga = metric->ComputeScore(J_i[a].eval_feats);
-    float gb = metric->ComputeScore(J_i[b].eval_feats);
-    // The pair is "positive" when a scores better than b; for error metrics
-    // a lower score is better, so the label flips.
-    bool positive = gb < ga;
-    if (invert_score) positive = !positive;
-    const float gdiff = fabs(ga - gb);
-    // Pairs with identical metric scores carry no ranking information.
-    if (!gdiff) continue;
-    avg_diff += gdiff;
-    SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
-    if (xdiff.empty()) {
-      cerr << "Empty diff:\n " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl;
-      cerr << " " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl;
-      continue;
-    }
-    v1.push_back(TrainingInstance(xdiff, positive, gdiff));
-#ifdef DEBUGGING_PRO
-    v1.back().a = J_i[a].hyp;
-    v1.back().b = J_i[b].hyp;
-    cerr << "N: " << v1.back() << endl;
-#endif
-  }
-
-  // No usable pair was sampled: nothing to emit, and dividing by v1.size()
-  // below would be 0/0.
-  if (v1.empty()) return;
-  avg_diff /= v1.size();
-
-  // Keep each instance with probability p. Both avg_diff and gdiff are
-  // non-negative here, so p is always at least 0.5.
-  for (unsigned i = 0; i < v1.size(); ++i) {
-    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
-    // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl;
-    if (rng->next() < p) v2.push_back(v1[i]);
-  }
-
-  // Clamp the cut-off BEFORE forming the iterator: v2.begin() + xi is
-  // undefined behaviour whenever xi exceeds v2.size().
-  const size_t keep = xi < v2.size() ? xi : v2.size();
-  vector<TrainingInstance>::iterator mid = v2.begin() + keep;
-  partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
-  copy(v2.begin(), mid, back_inserter(*pv));
-#ifdef DEBUGGING_PRO
-  if (v2.size() >= 5) {
-    for (int i =0; i < (mid - v2.begin()); ++i) {
-      cerr << v2[i] << endl;
-    }
-    cerr << pv->back() << endl;
-  }
-#endif
-}
-
-// Mapper entry point for PRO training.
-//
-// Reads lines of the form "<path-to-hypergraph-JSON> <sent_id>" from the
-// configured input. For every line it merges the hypergraph's k-best
-// candidates into the per-sentence k-best file kept in the k-best repository,
-// samples training pairs from the accumulated candidate set, and writes each
-// sampled instance to stdout twice: once as "<y>\t<x>" and once mirrored as
-// "<!y>\t<-x>".
-//
-// Returns 0 on success and 1 if an input line cannot be parsed or names a
-// sentence id for which no reference was loaded.
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  if (conf.count("random_seed"))
-    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    rng.reset(new MT19937);
-  const string evaluation_metric = conf["evaluation_metric"].as<string>();
-
-  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
-  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
-  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
-
-  Hypergraph hg;
-  ReadFile in_read(conf["input"].as<string>());
-  istream &in=*in_read.stream();
-  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
-  const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
-  const unsigned xi = conf["best_pairs"].as<unsigned>();
-  string weightsf = conf["weights"].as<string>();
-  vector<weight_t> weights;
-  Weights::InitFromFile(weightsf, &weights);
-  string kbest_repo = conf["kbest_repository"].as<string>();
-  MkDirP(kbest_repo);
-
-  string line;
-  while (getline(in, line)) {
-    if (line.empty()) continue;
-    istringstream is(line);
-    int sent_id;
-    string file;
-    // path-to-file (JSON) sent_id
-    if (!(is >> file >> sent_id)) {
-      // Continuing would use an unparsed sent_id / file name.
-      cerr << "Malformed input line (expected: <hypergraph-file> <sent_id>): " << line << endl;
-      return 1;
-    }
-    // ds[sent_id] is only meaningful for ids that have a loaded reference.
-    if (sent_id < 0 || static_cast<size_t>(sent_id) >= ds.size()) {
-      cerr << "Sentence id " << sent_id << " is out of range (" << ds.size() << " references loaded): " << line << endl;
-      return 1;
-    }
-    ReadFile rf(file);
-    ostringstream os;
-    training::CandidateSet J_i;
-    os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
-    const string kbest_file = os.str();
-    // Start from the candidates accumulated for this sentence so far, if any.
-    if (FileExists(kbest_file))
-      J_i.ReadFromFile(kbest_file);
-    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-    hg.Reweight(weights);
-    J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
-    J_i.WriteToFile(kbest_file);
-
-    vector<TrainingInstance> v;
-    Sample(gamma, xi, J_i, metric, &v);
-    for (unsigned i = 0; i < v.size(); ++i) {
-      const TrainingInstance& vi = v[i];
-      // Emit the instance and its mirror image (negated features, flipped label).
-      cout << vi.y << "\t" << vi.x << '\n';
-      cout << (!vi.y) << "\t" << (vi.x * -1.0) << '\n';
-    }
-    // One flush per sentence instead of one per output line.
-    cout.flush();
-  }
-  return 0;
-}
-