author     Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>  2012-11-18 13:35:42 -0500
committer  Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>  2012-11-18 13:35:42 -0500
commit     1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch)
tree       33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /pro
parent     7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff)
major restructure of the training code
Diffstat (limited to 'pro')
-rw-r--r--  pro/Makefile.am                        11
-rw-r--r--  pro/README.shared-mem                   9
-rwxr-xr-x  pro/mr_pro_generate_mapper_input.pl    18
-rw-r--r--  pro/mr_pro_map.cc                     201
-rw-r--r--  pro/mr_pro_reduce.cc                  286
-rwxr-xr-x  pro/pro.pl                            555
6 files changed, 0 insertions, 1080 deletions
diff --git a/pro/Makefile.am b/pro/Makefile.am
deleted file mode 100644
index 1e9d46b0..00000000
--- a/pro/Makefile.am
+++ /dev/null
@@ -1,11 +0,0 @@
-bin_PROGRAMS = \
- mr_pro_map \
- mr_pro_reduce
-
-mr_pro_map_SOURCES = mr_pro_map.cc
-mr_pro_map_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro/README.shared-mem b/pro/README.shared-mem
deleted file mode 100644
index 7728efc0..00000000
--- a/pro/README.shared-mem
+++ /dev/null
@@ -1,9 +0,0 @@
-If you want to run dist-vest.pl on a very large shared memory machine, do the
-following:
-
- ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
-
-This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
-decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
-on the system you are running on and the complexity of the models used for decoding.
-
diff --git a/pro/mr_pro_generate_mapper_input.pl b/pro/mr_pro_generate_mapper_input.pl
deleted file mode 100755
index b30fc4fd..00000000
--- a/pro/mr_pro_generate_mapper_input.pl
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
-my $d = shift @ARGV;
-die "Can't find directory $d" unless -d $d;
-
-opendir(DIR, $d) or die "Can't read $d: $!";
-my @hgs = grep { /\.gz$/ } readdir(DIR);
-closedir DIR;
-
-for my $hg (@hgs) {
- my $file = $hg;
- my $id = $hg;
- $id =~ s/(\.json)?\.gz//;
- print "$d/$file $id\n";
-}
-
diff --git a/pro/mr_pro_map.cc b/pro/mr_pro_map.cc
deleted file mode 100644
index eef40b8a..00000000
--- a/pro/mr_pro_map.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <tr1/unordered_map>
-
-#include <boost/functional/hash.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "candidate_set.h"
-#include "sampler.h"
-#include "filelib.h"
-#include "stringlib.h"
-#include "weights.h"
-#include "inside_outside.h"
-#include "hg_io.h"
-#include "ns.h"
-#include "ns_docscorer.h"
-
-// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
-
-using namespace std;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> rng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
- ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
- ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
- ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
- ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
- ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
- ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
- ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
- ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
- ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
- ("help,h", "Help");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- bool flag = false;
- if (!conf->count("reference")) {
- cerr << "Please specify one or more references using -r <REF.TXT>\n";
- flag = true;
- }
- if (!conf->count("weights")) {
- cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
- flag = true;
- }
- if (flag || conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct ThresholdAlpha {
- explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
- double operator()(double mag) const {
- if (mag < threshold) return 0.0; else return 1.0;
- }
- const double threshold;
-};
-
-struct TrainingInstance {
- TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff) : x(feats), y(positive), gdiff(diff) {}
- SparseVector<weight_t> x;
-#undef DEBUGGING_PRO
-#ifdef DEBUGGING_PRO
- vector<WordID> a;
- vector<WordID> b;
-#endif
- bool y;
- float gdiff;
-};
-#ifdef DEBUGGING_PRO
-ostream& operator<<(ostream& os, const TrainingInstance& d) {
- return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
-}
-#endif
-
-struct DiffOrder {
- bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
- return a.gdiff > b.gdiff;
- }
-};
-
-void Sample(const unsigned gamma,
- const unsigned xi,
- const training::CandidateSet& J_i,
- const EvaluationMetric* metric,
- vector<TrainingInstance>* pv) {
- const bool invert_score = metric->IsErrorMetric();
- vector<TrainingInstance> v1, v2;
- float avg_diff = 0;
- for (unsigned i = 0; i < gamma; ++i) {
- const size_t a = rng->inclusive(0, J_i.size() - 1)();
- const size_t b = rng->inclusive(0, J_i.size() - 1)();
- if (a == b) continue;
- float ga = metric->ComputeScore(J_i[a].eval_feats);
- float gb = metric->ComputeScore(J_i[b].eval_feats);
- bool positive = gb < ga;
- if (invert_score) positive = !positive;
- const float gdiff = fabs(ga - gb);
- if (!gdiff) continue;
- avg_diff += gdiff;
- SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
- if (xdiff.empty()) {
- cerr << "Empty diff:\n " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl;
- cerr << " " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl;
- continue;
- }
- v1.push_back(TrainingInstance(xdiff, positive, gdiff));
-#ifdef DEBUGGING_PRO
- v1.back().a = J_i[a].hyp;
- v1.back().b = J_i[b].hyp;
- cerr << "N: " << v1.back() << endl;
-#endif
- }
- avg_diff /= v1.size();
-
- for (unsigned i = 0; i < v1.size(); ++i) {
- double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
- // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl;
- if (rng->next() < p) v2.push_back(v1[i]);
- }
- vector<TrainingInstance>::iterator mid = v2.begin() + xi;
- if (xi > v2.size()) mid = v2.end();
- partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
- copy(v2.begin(), mid, back_inserter(*pv));
-#ifdef DEBUGGING_PRO
- if (v2.size() >= 5) {
- for (int i =0; i < (mid - v2.begin()); ++i) {
- cerr << v2[i] << endl;
- }
- cerr << pv->back() << endl;
- }
-#endif
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- if (conf.count("random_seed"))
- rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- rng.reset(new MT19937);
- const string evaluation_metric = conf["evaluation_metric"].as<string>();
-
- EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
- DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
- cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
-
- Hypergraph hg;
- string last_file;
- ReadFile in_read(conf["input"].as<string>());
- istream &in=*in_read.stream();
- const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
- const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
- const unsigned xi = conf["best_pairs"].as<unsigned>();
- string weightsf = conf["weights"].as<string>();
- vector<weight_t> weights;
- Weights::InitFromFile(weightsf, &weights);
- string kbest_repo = conf["kbest_repository"].as<string>();
- MkDirP(kbest_repo);
- while(in) {
- vector<TrainingInstance> v;
- string line;
- getline(in, line);
- if (line.empty()) continue;
- istringstream is(line);
- int sent_id;
- string file;
- // path-to-file (JSON) sent_id
- is >> file >> sent_id;
- ReadFile rf(file);
- ostringstream os;
- training::CandidateSet J_i;
- os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
- const string kbest_file = os.str();
- if (FileExists(kbest_file))
- J_i.ReadFromFile(kbest_file);
- HypergraphIO::ReadFromJSON(rf.stream(), &hg);
- hg.Reweight(weights);
- J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
- J_i.WriteToFile(kbest_file);
-
- Sample(gamma, xi, J_i, metric, &v);
- for (unsigned i = 0; i < v.size(); ++i) {
- const TrainingInstance& vi = v[i];
- cout << vi.y << "\t" << vi.x << endl;
- cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;
- }
- }
- return 0;
-}
-
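
The Sample() routine deleted above implements the pair sampler of Hopkins & May (2011), Figure 4: draw Gamma random candidate pairs per sentence, discard ties, accept a pair with a probability that grows with its metric-score gap, and keep the Xi largest-gap survivors as classifier training data. The following is a minimal, self-contained sketch of that scheme; cdec's SparseVector, MT19937, and EvaluationMetric are replaced by plain STL stand-ins, and the step-function acceptance rule with its 0.05 cutoff is illustrative only (the deleted code instead accepts with a sigmoid keyed to the average score gap).

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <random>
    #include <vector>

    struct ScoredPair { size_t a, b; double gdiff; bool a_better; };

    // g[i] is the metric score of candidate i for one sentence; gamma pairs
    // are drawn at random and at most xi are kept.
    std::vector<ScoredPair> SamplePairs(const std::vector<double>& g,
                                        unsigned gamma, unsigned xi,
                                        std::mt19937* rng) {
      std::uniform_int_distribution<size_t> pick(0, g.size() - 1);
      std::vector<ScoredPair> kept;
      for (unsigned i = 0; i < gamma; ++i) {
        const size_t a = pick(*rng), b = pick(*rng);
        if (a == b) continue;                       // same candidate twice: skip
        const double diff = std::fabs(g[a] - g[b]);
        if (diff < 0.05) continue;                  // step-function alpha(diff)
        kept.push_back(ScoredPair{a, b, diff, g[a] > g[b]});
      }
      // Retain only the xi pairs with the largest metric-score gaps.
      const size_t keep = std::min<size_t>(xi, kept.size());
      std::partial_sort(kept.begin(), kept.begin() + keep, kept.end(),
                        [](const ScoredPair& x, const ScoredPair& y)
                        { return x.gdiff > y.gdiff; });
      kept.resize(keep);
      return kept;
    }

    int main() {
      std::mt19937 rng(1);
      const std::vector<double> bleu = {0.31, 0.28, 0.44, 0.19, 0.40};  // toy scores
      for (const ScoredPair& p : SamplePairs(bleu, 5000, 50, &rng))
        std::printf("%zu %zu %.3f %d\n", p.a, p.b, p.gdiff, int(p.a_better));
    }

In the deleted file each kept pair also carries the feature difference f(a) - f(b), and main() emits every pair twice, once with the label flipped and the features negated, so the downstream classifier sees balanced positive and negative examples.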
diff --git a/pro/mr_pro_reduce.cc b/pro/mr_pro_reduce.cc
deleted file mode 100644
index 5ef9b470..00000000
--- a/pro/mr_pro_reduce.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-#include <cstdlib>
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "optimize.h"
-#include "liblbfgs/lbfgs++.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// since this is a ranking model, there should be equal numbers of
-// positive and negative examples, so the bias should be 0
-static const double MAX_BIAS = 1e-10;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
- ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength")
- ("l1",po::value<double>()->default_value(0.0), "l1 regularization strength")
- ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
- ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)")
- ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght")
- ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght")
- ("testset,t",po::value<string>(), "Optional held-out test set")
- ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
- ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
- ("help,h", "Help");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) {
- SparseVector<weight_t>& x = *out;
- size_t last_start = cur;
- size_t last_comma = string::npos;
- while(cur <= line.size()) {
- if (line[cur] == ' ' || cur == line.size()) {
- if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
- cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
- exit(1);
- }
- const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
- if (cur < line.size()) line[cur] = 0;
- const weight_t val = strtod(&line[last_comma + 1], NULL);
- x.set_value(fid, val);
-
- last_comma = string::npos;
- last_start = cur+1;
- } else {
- if (line[cur] == '=')
- last_comma = cur;
- }
- ++cur;
- }
-}
-
-void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) {
- istream& in = *pin;
- corpus->clear();
- bool flag = false;
- int lc = 0;
- string line;
- SparseVector<weight_t> x;
- while(getline(in, line)) {
- ++lc;
- if (lc % 1000 == 0) { cerr << '.'; flag = true; }
- if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
- if (line.empty()) continue;
- const size_t ks = line.find("\t");
- assert(string::npos != ks);
- assert(ks == 1);
- const bool y = line[0] == '1';
- x.clear();
- ParseSparseVector(line, ks + 1, &x);
- corpus->push_back(make_pair(y, x));
- }
- if (flag) cerr << endl;
-}
-
-void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
- for (SparseVector<weight_t>::const_iterator it = v.begin();
- it != v.end(); ++it) {
- acc[it->first] += it->second * scale;
- }
-}
-
-double ApplyRegularizationTerms(const double C,
- const double T,
- const vector<weight_t>& weights,
- const vector<weight_t>& prev_weights,
- weight_t* g) {
- double reg = 0;
- for (size_t i = 0; i < weights.size(); ++i) {
- const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
- const double& w_i = weights[i];
- reg += C * w_i * w_i;
- g[i] += 2 * C * w_i;
-
- const double diff_i = w_i - prev_w_i;
- reg += T * diff_i * diff_i;
- g[i] += 2 * T * diff_i;
- }
- return reg;
-}
-
-double TrainingInference(const vector<weight_t>& x,
- const vector<pair<bool, SparseVector<weight_t> > >& corpus,
- weight_t* g = NULL) {
- double cll = 0;
- for (int i = 0; i < corpus.size(); ++i) {
- const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
- double lp_false = dotprod;
- double lp_true = -dotprod;
- if (0 < lp_true) {
- lp_true += log1p(exp(-lp_true));
- lp_false = log1p(exp(lp_false));
- } else {
- lp_true = log1p(exp(lp_true));
- lp_false += log1p(exp(-lp_false));
- }
- lp_true*=-1;
- lp_false*=-1;
- if (corpus[i].first) { // true label
- cll -= lp_true;
- if (g) {
- // g -= corpus[i].second * exp(lp_false);
- GradAdd(corpus[i].second, -exp(lp_false), g);
- g[0] -= exp(lp_false); // bias
- }
- } else { // false label
- cll -= lp_false;
- if (g) {
- // g += corpus[i].second * exp(lp_true);
- GradAdd(corpus[i].second, exp(lp_true), g);
- g[0] += exp(lp_true); // bias
- }
- }
- }
- return cll;
-}
-
-struct ProLoss {
- ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
- const vector<pair<bool, SparseVector<weight_t> > >& te,
- const double c,
- const double t,
- const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
- double operator()(const vector<double>& x, double* g) const {
- fill(g, g + x.size(), 0.0);
- double cll = TrainingInference(x, training, g);
- tppl = 0;
- if (testing.size())
- tppl = pow(2.0, TrainingInference(x, testing) / (log(2) * testing.size()));
- double ppl = cll / log(2);
- ppl /= training.size();
- ppl = pow(2.0, ppl);
- double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
- return cll + reg;
- }
- const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
- const double C, T;
- const vector<double>& prev_x;
- mutable double tppl;
-};
-
-// return held-out log likelihood
-double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
- const vector<pair<bool, SparseVector<weight_t> > >& testing,
- const double C,
- const double C1,
- const double T,
- const unsigned memory_buffers,
- const vector<weight_t>& prev_x,
- vector<weight_t>* px) {
- assert(px->size() == prev_x.size());
- ProLoss loss(training, testing, C, T, prev_x);
- LBFGS<ProLoss> lbfgs(px, loss, memory_buffers, C1);
- lbfgs.MinimizeFunction();
- return loss.tppl;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- string line;
- vector<pair<bool, SparseVector<weight_t> > > training, testing;
- const bool tune_regularizer = conf.count("tune_regularizer");
- if (tune_regularizer && !conf.count("testset")) {
- cerr << "--tune_regularizer requires --testset to be set\n";
- return 1;
- }
- const double min_reg = conf["min_reg"].as<double>();
- const double max_reg = conf["max_reg"].as<double>();
- double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
- double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned
- const double T = conf["regularize_to_weights"].as<double>();
- assert(C >= 0.0);
- assert(min_reg >= 0.0);
- assert(max_reg >= 0.0);
- assert(max_reg > min_reg);
- const double psi = conf["interpolate_with_weights"].as<double>();
- if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
- ReadCorpus(&cin, &training);
- if (conf.count("testset")) {
- ReadFile rf(conf["testset"].as<string>());
- ReadCorpus(rf.stream(), &testing);
- }
- cerr << "Number of features: " << FD::NumFeats() << endl;
-
- vector<weight_t> x, prev_x; // x[0] is bias
- if (conf.count("weights")) {
- Weights::InitFromFile(conf["weights"].as<string>(), &x);
- x.resize(FD::NumFeats());
- prev_x = x;
- } else {
- x.resize(FD::NumFeats());
- prev_x = x;
- }
- cerr << " Number of features: " << x.size() << endl;
- cerr << "Number of training examples: " << training.size() << endl;
- cerr << "Number of testing examples: " << testing.size() << endl;
- double tppl = 0.0;
- vector<pair<double,double> > sp;
- vector<double> smoothed;
- if (tune_regularizer) {
- C = min_reg;
- const double steps = 18;
- double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
- cerr << "SWEEP FACTOR: " << sweep_factor << endl;
- while(C < max_reg) {
- cerr << "C=" << C << "\tT=" <<T << endl;
- tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
- sp.push_back(make_pair(C, tppl));
- C *= sweep_factor;
- }
- smoothed.resize(sp.size(), 0);
- smoothed[0] = sp[0].second;
- smoothed.back() = sp.back().second;
- for (int i = 1; i < sp.size()-1; ++i) {
- double prev = sp[i-1].second;
- double next = sp[i+1].second;
- double cur = sp[i].second;
- smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
- }
- double best_ppl = 9999999;
- unsigned best_i = 0;
- for (unsigned i = 0; i < sp.size(); ++i) {
- if (smoothed[i] < best_ppl) {
- best_ppl = smoothed[i];
- best_i = i;
- }
- }
- C = sp[best_i].first;
- } // tune regularizer
- tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
- if (conf.count("weights")) {
- for (int i = 1; i < x.size(); ++i) {
- x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
- }
- }
- cout.precision(15);
- cout << "# C=" << C << "\theld out perplexity=";
- if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
- if (sp.size()) {
- cout << "# Parameter sweep:\n";
- for (int i = 0; i < sp.size(); ++i) {
- cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
- }
- }
- Weights::WriteToFile("-", x);
- return 0;
-}
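
The core of the deleted reducer is TrainingInference(), a binary logistic-regression objective over the sampled pairs. Its branch on the sign of the dot product routes log(1 + e^z) through log1p with a non-positive exponent, so neither log-probability overflows for large |w.x|. Here is a compact sketch of the per-example loss and gradient under the same conventions, with dense vectors standing in for cdec's SparseVector and the original's bias term x[0] omitted for brevity:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Negative log-likelihood of one labeled example under a logistic model,
    // with the gradient accumulated into *g when g is non-null. lp_true and
    // lp_false are log sigmoid(w.x) and log sigmoid(-w.x), each computed via
    // log1p of a non-positive exponent so neither branch overflows.
    double LogLoss(const std::vector<double>& w, const std::vector<double>& x,
                   bool y, std::vector<double>* g) {
      double dot = 0;
      for (size_t i = 0; i < x.size(); ++i) dot += w[i] * x[i];
      double lp_true, lp_false;
      if (dot > 0) {
        lp_true  = -std::log1p(std::exp(-dot));
        lp_false = -dot - std::log1p(std::exp(-dot));
      } else {
        lp_true  = dot - std::log1p(std::exp(dot));
        lp_false = -std::log1p(std::exp(dot));
      }
      if (g) {  // d(-log p)/dw = (p(y=1|x) - y) * x
        const double resid = std::exp(lp_true) - (y ? 1.0 : 0.0);
        for (size_t i = 0; i < x.size(); ++i) (*g)[i] += resid * x[i];
      }
      return -(y ? lp_true : lp_false);
    }

    int main() {
      const std::vector<double> w = {0.5, -0.25}, x = {1.0, 2.0};
      std::vector<double> g(2, 0.0);
      const double loss = LogLoss(w, x, true, &g);
      std::printf("loss=%.4f grad=(%.4f, %.4f)\n", loss, g[0], g[1]);
    }

ApplyRegularizationTerms() then adds C*||w||^2 + T*||w - w_prev||^2 to this loss, which contributes 2*C*w_i + 2*T*(w_i - w_prev_i) to each component of the gradient; that is the penalty pair exposed as --regularization_strength and --regularize_to_weights.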
diff --git a/pro/pro.pl b/pro/pro.pl
deleted file mode 100755
index 891b7e4c..00000000
--- a/pro/pro.pl
+++ /dev/null
@@ -1,555 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-use File::Basename qw(basename);
-my @ORIG_ARGV=@ARGV;
-use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
-
-# Skip local config (used for distributing jobs) if we're running in local-only mode
-use LocalConfig;
-use Getopt::Long;
-use IPC::Open2;
-use POSIX ":sys_wait_h";
-my $QSUB_CMD = qsub_args(mert_memory());
-my $default_jobs = env_default_jobs();
-
-my $VEST_DIR="$SCRIPT_DIR/../dpmert";
-require "$VEST_DIR/libcall.pl";
-
-# Default settings
-my $srcFile;
-my $refFiles;
-my $bin_dir = $SCRIPT_DIR;
-die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
-die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
-my $MAPPER = "$bin_dir/mr_pro_map";
-my $REDUCER = "$bin_dir/mr_pro_reduce";
-my $parallelize = "$VEST_DIR/parallelize.pl";
-my $libcall = "$VEST_DIR/libcall.pl";
-my $sentserver = "$VEST_DIR/sentserver";
-my $sentclient = "$VEST_DIR/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
-
-my $SCORER = $FAST_SCORE;
-die "Can't find $MAPPER" unless -x $MAPPER;
-my $cdec = "$bin_dir/../decoder/cdec";
-die "Can't find decoder in $cdec" unless -x $cdec;
-die "Can't find $parallelize" unless -x $parallelize;
-die "Can't find $libcall" unless -e $libcall;
-my $decoder = $cdec;
-my $lines_per_mapper = 30;
-my $iteration = 1;
-my $best_weights;
-my $psi = 1;
-my $default_max_iter = 30;
-my $max_iterations = $default_max_iter;
-my $jobs = $default_jobs; # number of decode nodes
-my $pmem = "4g";
-my $disable_clean = 0;
-my %seen_weights;
-my $help = 0;
-my $epsilon = 0.0001;
-my $dryrun = 0;
-my $last_score = -10000000;
-my $metric = "ibm_bleu";
-my $dir;
-my $iniFile;
-my $weights;
-my $use_make = 1; # use make to parallelize
-my $useqsub = 0;
-my $initial_weights;
-my $pass_suffix = '';
-my $devset;
-
-# regularization strength
-my $reg = 500;
-my $reg_previous = 5000;
-
-# Process command-line options
-if (GetOptions(
- "config=s" => \$iniFile,
- "weights=s" => \$initial_weights,
- "devset=s" => \$devset,
- "jobs=i" => \$jobs,
- "metric=s" => \$metric,
- "pass-suffix=s" => \$pass_suffix,
- "qsub" => \$useqsub,
- "help" => \$help,
- "reg=f" => \$reg,
- "reg-previous=f" => \$reg_previous,
- "output-dir=s" => \$dir,
-) == 0 || @ARGV!=0 || $help) {
- print_help();
- exit;
-}
-
-if ($useqsub) {
- $use_make = 0;
- die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
-}
-
-my @missing_args = ();
-if (!defined $iniFile) { push @missing_args, "--config"; }
-if (!defined $devset) { push @missing_args, "--devset"; }
-if (!defined $initial_weights) { push @missing_args, "--weights"; }
-die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
-
-if ($metric =~ /^(combi|ter)$/i) {
- $lines_per_mapper = 5;
-}
-
-my $host =check_output("hostname"); chomp $host;
-my $bleu;
-my $interval_count = 0;
-my $logfile;
-my $projected_score;
-
-# used in sorting scores
-my $DIR_FLAG = '-r';
-if ($metric =~ /^ter$|^aer$/i) {
- $DIR_FLAG = '';
-}
-
-unless ($dir){
- $dir = 'pro';
-}
-unless ($dir =~ /^\//){ # convert relative path to absolute path
- my $basedir = check_output("pwd");
- chomp $basedir;
- $dir = "$basedir/$dir";
-}
-
-# Initializations and helper functions
-srand;
-
-my @childpids = ();
-my @cleanupcmds = ();
-
-sub cleanup {
- print STDERR "Cleanup...\n";
- for my $pid (@childpids){ unchecked_call("kill $pid"); }
- for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
- exit 1;
-};
-# Always call cleanup, no matter how we exit
-*CORE::GLOBAL::exit =
- sub{ cleanup(); };
-$SIG{INT} = "cleanup";
-$SIG{TERM} = "cleanup";
-$SIG{HUP} = "cleanup";
-
-my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
-my $newIniFile = "$dir/$decoderBase.ini";
-my $inputFileName = "$dir/input";
-my $user = $ENV{"USER"};
-
-
-# process ini file
--e $iniFile || die "Error: could not open $iniFile for reading\n";
-open(INI, $iniFile);
-
-if (-e $dir) {
- die "ERROR: working dir $dir already exists\n\n";
-} else {
- mkdir "$dir" or die "Can't mkdir $dir: $!";
- mkdir "$dir/hgs" or die;
- mkdir "$dir/scripts" or die;
- print STDERR <<EOT;
- DECODER: $decoder
- INI FILE: $iniFile
- WORKING DIR: $dir
- DEVSET: $devset
- EVAL METRIC: $metric
- MAX ITERATIONS: $max_iterations
- PARALLEL JOBS: $jobs
- HEAD NODE: $host
- PMEM (DECODING): $pmem
- INITIAL WEIGHTS: $initial_weights
-EOT
-}
-
-# Generate initial files and values
-check_call("cp $iniFile $newIniFile");
-check_call("cp $initial_weights $dir/weights.0");
-$iniFile = $newIniFile;
-
-my $refs = "$dir/dev.refs";
-split_devset($devset, "$dir/dev.input.raw", $refs);
-my $newsrc = "$dir/dev.input";
-enseg("$dir/dev.input.raw", $newsrc);
-$srcFile = $newsrc;
-my $devSize = 0;
-open F, "<$srcFile" or die "Can't read $srcFile: $!";
-while(<F>) { $devSize++; }
-close F;
-
-unless($best_weights){ $best_weights = $weights; }
-unless($projected_score){ $projected_score = 0.0; }
-$seen_weights{$weights} = 1;
-
-my $random_seed = int(time / 1000);
-my $lastWeightsFile;
-my $lastPScore = 0;
-# main optimization loop
-my @allweights;
-while (1){
- print STDERR "\n\nITERATION $iteration\n==========\n";
-
- if ($iteration > $max_iterations){
- print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
- last;
- }
- # iteration-specific files
- my $runFile="$dir/run.raw.$iteration";
- my $onebestFile="$dir/1best.$iteration";
- my $logdir="$dir/logs.$iteration";
- my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
- my $scorerLog="$logdir/scorer.log.$iteration";
- check_call("mkdir -p $logdir");
-
-
- #decode
- print STDERR "RUNNING DECODER AT ";
- print STDERR unchecked_output("date");
- my $im1 = $iteration - 1;
- my $weightsFile="$dir/weights.$im1";
- push @allweights, "-w $dir/weights.$im1";
- `rm -f $dir/hgs/*.gz`;
- my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
- my $pcmd;
- if ($use_make) {
- $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
- } else {
- $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
- }
- my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
- print STDERR "COMMAND:\n$cmd\n";
- check_bash_call($cmd);
- my $num_hgs;
- my $num_topbest;
- my $retries = 0;
- while($retries < 5) {
- $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
- $num_topbest = check_output("wc -l < $runFile");
- print STDERR "NUMBER OF HGs: $num_hgs\n";
- print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
- if($devSize == $num_hgs && $devSize == $num_topbest) {
- last;
- } else {
- print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
- sleep(3);
- }
- $retries++;
- }
- die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
- my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric");
- chomp $dec_score;
- print STDERR "DECODER SCORE: $dec_score\n";
-
- # save space
- check_call("gzip -f $runFile");
- check_call("gzip -f $decoderLog");
-
- # run optimizer
- print STDERR "RUNNING OPTIMIZER AT ";
- print STDERR unchecked_output("date");
- print STDERR " - GENERATE TRAINING EXEMPLARS\n";
- my $mergeLog="$logdir/prune-merge.log.$iteration";
-
- my $score = 0;
- my $icc = 0;
- my $inweights="$dir/weights.$im1";
- $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
- print STDERR "COMMAND:\n$cmd\n";
- check_call($cmd);
- check_call("mkdir -p $dir/splag.$im1");
- $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
- print STDERR "COMMAND:\n$cmd\n";
- check_call($cmd);
- opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
- my @shards = grep { /^mapinput\./ } readdir(DIR);
- closedir DIR;
- die "No shards!" unless scalar @shards > 0;
- my $joblist = "";
- my $nmappers = 0;
- @cleanupcmds = ();
- my %o2i = ();
- my $first_shard = 1;
- my $mkfile; # only used with makefiles
- my $mkfilename;
- if ($use_make) {
- $mkfilename = "$dir/splag.$im1/domap.mk";
- open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
- print $mkfile "all: $dir/splag.$im1/map.done\n\n";
- }
- my @mkouts = (); # only used with makefiles
- my @mapoutputs = ();
- for my $shard (@shards) {
- my $mapoutput = $shard;
- my $client_name = $shard;
- $client_name =~ s/mapinput.//;
- $client_name = "pro.$client_name";
- $mapoutput =~ s/mapinput/mapoutput/;
- push @mapoutputs, "$dir/splag.$im1/$mapoutput";
- $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
- my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
- if ($use_make) {
- my $script_file = "$dir/scripts/map.$shard";
- open F, ">$script_file" or die "Can't write $script_file: $!";
- print F "#!/bin/bash\n";
- print F "$script\n";
- close F;
- my $output = "$dir/splag.$im1/$mapoutput";
- push @mkouts, $output;
- chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
- if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
- print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
- } else {
- my $script_file = "$dir/scripts/map.$shard";
- open F, ">$script_file" or die "Can't write $script_file: $!";
- print F "$script\n";
- close F;
- if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-
- $nmappers++;
- my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
- my $jobid = check_output("$qcmd");
- chomp $jobid;
- $jobid =~ s/^(\d+)(.*?)$/\1/g;
- $jobid =~ s/^Your job (\d+) .*$/\1/;
- push(@cleanupcmds, "qdel $jobid 2> /dev/null");
- print STDERR " $jobid";
- if ($joblist == "") { $joblist = $jobid; }
- else {$joblist = $joblist . "\|" . $jobid; }
- }
- }
- my @dev_outs = ();
- my @devtest_outs = ();
- @dev_outs = @mapoutputs;
- if ($use_make) {
- print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
- close $mkfile;
- my $mcmd = "make -j $jobs -f $mkfilename";
- print STDERR "\nExecuting: $mcmd\n";
- check_call($mcmd);
- } else {
- print STDERR "\nLaunched $nmappers mappers.\n";
- sleep 8;
- print STDERR "Waiting for mappers to complete...\n";
- while ($nmappers > 0) {
- sleep 5;
- my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
- $nmappers = scalar @livejobs;
- }
- print STDERR "All mappers complete.\n";
- }
- my $tol = 0;
- my $til = 0;
- my $dev_test_file = "$dir/splag.$im1/devtest.gz";
- print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
- print STDERR unchecked_output("date");
- $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
- $cmd .= " > $dir/weights.$iteration";
- print STDERR "COMMAND:\n$cmd\n";
- check_bash_call($cmd);
- $lastWeightsFile = "$dir/weights.$iteration";
- $lastPScore = $score;
- $iteration++;
- print STDERR "\n==========\n";
-}
-
-
-check_call("cp $lastWeightsFile $dir/weights.final");
-print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
-print STDOUT "$dir/weights.final\n";
-
-exit 0;
-
-sub read_weights_file {
- my ($file) = @_;
- open F, "<$file" or die "Couldn't read $file: $!";
- my @r = ();
- my $pm = -1;
- while(<F>) {
- next if /^#/;
- next if /^\s*$/;
- chomp;
- if (/^(.+)\s+(.+)$/) {
- my $m = $1;
- my $w = $2;
- die "Weights out of order: $m <= $pm" unless $m > $pm;
- push @r, $w;
- } else {
- warn "Unexpected feature name in weight file: $_";
- }
- }
- close F;
- return join ' ', @r;
-}
-
-sub enseg {
- my $src = shift;
- my $newsrc = shift;
- open(SRC, $src);
- open(NEWSRC, ">$newsrc");
- my $i=0;
- while (my $line=<SRC>){
- chomp $line;
- if ($line =~ /^\s*<seg/i) {
- if($line =~ /id="[0-9]+"/) {
- print NEWSRC "$line\n";
- } else {
- die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
- }
- } else {
- print NEWSRC "<seg id=\"$i\">$line</seg>\n";
- }
- $i++;
- }
- close SRC;
- close NEWSRC;
- die "Empty dev set!" if ($i == 0);
-}
-
-sub print_help {
-
- my $executable = basename($0); chomp $executable;
- print << "Help";
-
-Usage: $executable [options]
-
- $executable [options]
- Runs a complete PRO optimization using the ini file specified.
-
-Required:
-
- --config <cdec.ini>
- Decoder configuration file.
-
- --devset <files>
- Dev set source and reference data.
-
- --weights <file>
- Initial weights file (use empty file to start from 0)
-
-General options:
-
- --help
- Print this message and exit.
-
- --max-iterations <M>
- Maximum number of iterations to run. If not specified, defaults
- to $default_max_iter.
-
- --metric <method>
- Metric to optimize.
- Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
- --pass-suffix <S>
- If the decoder is doing multi-pass decoding, the pass suffix "2",
- "3", etc., is used to control what iteration of weights is set.
-
- --workdir <dir>
- Directory for intermediate and output files. If not specified, the
- name is derived from the ini filename. Assuming that the ini
- filename begins with the decoder name and ends with ini, the default
- name of the working directory is inferred from the middle part of
- the filename. E.g. an ini file named decoder.foo.ini would have
- a default working directory name foo.
-
-Regularization options:
-
- --reg <F>
- l2 regularization strength [default=500]. The greater this value,
- the closer to zero the weights will be.
-
- --reg-previous <F>
- l2 penalty for moving away from the weights from the previous
- iteration. [default=5000]. The greater this value, the closer
- to the previous iteration's weights the next iteration's weights
- will be.
-
-Job control options:
-
- --jobs <I>
- Number of decoder processes to run in parallel. [default=$default_jobs]
-
- --qsub
- Use qsub to run jobs in parallel (qsub must be configured in
- environment/LocalEnvironment.pm)
-
- --pmem <N>
- Amount of physical memory requested for parallel decoding jobs
- (used with qsub requests only)
-
-Deprecated options:
-
- --interpolate-with-weights <F>
- [deprecated] At each iteration the resulting weights are
- interpolated with the weights from the previous iteration, with
- this factor. [default=1.0, i.e., no effect]
-
-Help
-}
-
-sub convert {
- my ($str) = @_;
- my @ps = split /;/, $str;
- my %dict = ();
- for my $p (@ps) {
- my ($k, $v) = split /=/, $p;
- $dict{$k} = $v;
- }
- return %dict;
-}
-
-
-sub cmdline {
- return join ' ',($0,@ORIG_ARGV);
-}
-
-#buggy: last arg gets quoted sometimes?
-my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
-my $shell_escape_in_quote=qr{[\\"\$`!]};
-
-sub escape_shell {
- my ($arg)=@_;
- return undef unless defined $arg;
- if ($arg =~ /$is_shell_special/) {
- $arg =~ s/($shell_escape_in_quote)/\\$1/g;
- return "\"$arg\"";
- }
- return $arg;
-}
-
-sub escaped_shell_args {
- return map {local $_=$_;chomp;escape_shell($_)} @_;
-}
-
-sub escaped_shell_args_str {
- return join ' ',&escaped_shell_args(@_);
-}
-
-sub escaped_cmdline {
- return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
-}
-
-sub split_devset {
- my ($infile, $outsrc, $outref) = @_;
- open F, "<$infile" or die "Can't read $infile: $!";
- open S, ">$outsrc" or die "Can't write $outsrc: $!";
- open R, ">$outref" or die "Can't write $outref: $!";
- while(<F>) {
- chomp;
- my ($src, @refs) = split /\s*\|\|\|\s*/;
- die "Malformed devset line: $_\n" unless scalar @refs > 0;
- print S "$src\n";
- print R join(' ||| ', @refs) . "\n";
- }
- close R;
- close S;
- close F;
-}
-
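
pro.pl expects each --devset line to carry the source sentence and one or more references joined by '|||', which split_devset() above separates into dev.input.raw and dev.refs. For readers checking their data files by hand, here is a standalone C++ equivalent of that per-line split; SplitDevsetLine is an illustrative name, not a cdec function, and a fixed " ||| " separator is assumed where the Perl tolerates variable whitespace:

    #include <iostream>
    #include <string>
    #include <vector>

    // Split "source ||| ref1 ||| ref2 ..." into the source sentence and its
    // references, mirroring what pro.pl's split_devset does for one line.
    bool SplitDevsetLine(const std::string& line, std::string* src,
                         std::vector<std::string>* refs) {
      const std::string sep = " ||| ";
      size_t end = line.find(sep);
      if (end == std::string::npos) return false;  // malformed: no reference field
      *src = line.substr(0, end);
      size_t start = end + sep.size();
      while ((end = line.find(sep, start)) != std::string::npos) {
        refs->push_back(line.substr(start, end - start));
        start = end + sep.size();
      }
      refs->push_back(line.substr(start));
      return true;
    }

    int main() {
      std::string src;
      std::vector<std::string> refs;
      if (SplitDevsetLine("la maison bleue ||| the blue house ||| the house",
                          &src, &refs))
        std::cout << src << " -> " << refs.size() << " reference(s)\n";
    }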