diff options
Diffstat (limited to 'training/pro')
| -rw-r--r-- | training/pro/Makefile.am | 13 | ||||
| -rwxr-xr-x | training/pro/mr_pro_generate_mapper_input.pl | 18 | ||||
| -rw-r--r-- | training/pro/mr_pro_map.cc | 201 | ||||
| -rw-r--r-- | training/pro/mr_pro_reduce.cc | 286 | ||||
| -rwxr-xr-x | training/pro/pro.pl | 555 | 
5 files changed, 1073 insertions, 0 deletions
| diff --git a/training/pro/Makefile.am b/training/pro/Makefile.am new file mode 100644 index 00000000..09364804 --- /dev/null +++ b/training/pro/Makefile.am @@ -0,0 +1,13 @@ +bin_PROGRAMS = \ +  mr_pro_map \ +  mr_pro_reduce + +mr_pro_map_SOURCES = mr_pro_map.cc +mr_pro_map_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +mr_pro_reduce_SOURCES = mr_pro_reduce.cc +mr_pro_reduce_LDADD = ../../training/liblbfgs/liblbfgs.a ../../utils/libutils.a + +EXTRA_DIST = mr_pro_generate_mapper_input.pl pro.pl + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training diff --git a/training/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/training/pro/mr_pro_generate_mapper_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { +  my $file = $hg; +  my $id = $hg; +  $id =~ s/(\.json)?\.gz//; +  print "$d/$file $id\n"; +} + diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc new file mode 100644 index 00000000..eef40b8a --- /dev/null +++ b/training/pro/mr_pro_map.cc @@ -0,0 +1,201 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "candidate_set.h" +#include "sampler.h" +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "inside_outside.h" +#include "hg_io.h" +#include "ns.h" +#include "ns_docscorer.h" + +// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011) + +using namespace std; +namespace po = boost::program_options; + +boost::shared_ptr<MT19937> rng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") +        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") +        ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)") +        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") +        ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)") +        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") +        ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract") +        ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") +        ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") +        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (!conf->count("reference")) { +    cerr << "Please specify one or more references using -r <REF.TXT>\n"; +    flag = true; +  } +  if (!conf->count("weights")) { +    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +struct ThresholdAlpha { +  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {} +  double operator()(double mag) const { +    if (mag < threshold) return 0.0; else return 1.0; +  } +  const double threshold; +}; + +struct TrainingInstance { +  TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff) : x(feats), y(positive), gdiff(diff) {} +  SparseVector<weight_t> x; +#undef DEBUGGING_PRO +#ifdef DEBUGGING_PRO +  vector<WordID> a; +  vector<WordID> b; +#endif +  bool y; +  float gdiff; +}; +#ifdef DEBUGGING_PRO +ostream& operator<<(ostream& os, const TrainingInstance& d) { +  return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x; +} +#endif + +struct DiffOrder { +  bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { +    return a.gdiff > b.gdiff; +  } +}; + +void Sample(const unsigned gamma, +            const unsigned xi, +            const training::CandidateSet& J_i, +            const EvaluationMetric* metric, +            vector<TrainingInstance>* pv) { +  const bool invert_score = metric->IsErrorMetric(); +  vector<TrainingInstance> v1, v2; +  float avg_diff = 0; +  for (unsigned i = 0; i < gamma; ++i) { +    const size_t a = rng->inclusive(0, J_i.size() - 1)(); +    const size_t b = rng->inclusive(0, J_i.size() - 1)(); +    if (a == b) continue; +    float ga = metric->ComputeScore(J_i[a].eval_feats); +    float gb = metric->ComputeScore(J_i[b].eval_feats); +    bool positive = gb < ga; +    if (invert_score) positive = !positive; +    const float gdiff = fabs(ga - gb); +    if (!gdiff) continue; +    avg_diff += gdiff; +    SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros(); +    if (xdiff.empty()) { +      cerr << "Empty diff:\n  " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl; +      cerr << "  " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl; +      continue; +    } +    v1.push_back(TrainingInstance(xdiff, positive, gdiff)); +#ifdef DEBUGGING_PRO +    v1.back().a = J_i[a].hyp; +    v1.back().b = J_i[b].hyp; +    cerr << "N: " << v1.back() << endl; +#endif +  } +  avg_diff /= v1.size(); + +  for (unsigned i = 0; i < v1.size(); ++i) { +    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff)); +    // cerr << "avg_diff=" << avg_diff << "  gdiff=" << v1[i].gdiff << "  p=" << p << endl; +    if (rng->next() < p) v2.push_back(v1[i]); +  } +  vector<TrainingInstance>::iterator mid = v2.begin() + xi; +  if (xi > v2.size()) mid = v2.end(); +  partial_sort(v2.begin(), mid, v2.end(), DiffOrder()); +  copy(v2.begin(), mid, back_inserter(*pv)); +#ifdef DEBUGGING_PRO +  if (v2.size() >= 5) { +    for (int i =0; i < (mid - v2.begin()); ++i) { +      cerr << v2[i] << endl; +    } +    cerr << pv->back() << endl; +  } +#endif +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  if (conf.count("random_seed")) +    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    rng.reset(new MT19937); +  const string evaluation_metric = conf["evaluation_metric"].as<string>(); + +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +  DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + +  Hypergraph hg; +  string last_file; +  ReadFile in_read(conf["input"].as<string>()); +  istream &in=*in_read.stream(); +  const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); +  const unsigned gamma = conf["candidate_pairs"].as<unsigned>(); +  const unsigned xi = conf["best_pairs"].as<unsigned>(); +  string weightsf = conf["weights"].as<string>(); +  vector<weight_t> weights; +  Weights::InitFromFile(weightsf, &weights); +  string kbest_repo = conf["kbest_repository"].as<string>(); +  MkDirP(kbest_repo); +  while(in) { +    vector<TrainingInstance> v; +    string line; +    getline(in, line); +    if (line.empty()) continue; +    istringstream is(line); +    int sent_id; +    string file; +    // path-to-file (JSON) sent_id +    is >> file >> sent_id; +    ReadFile rf(file); +    ostringstream os; +    training::CandidateSet J_i; +    os << kbest_repo << "/kbest." << sent_id << ".txt.gz"; +    const string kbest_file = os.str(); +    if (FileExists(kbest_file)) +      J_i.ReadFromFile(kbest_file); +    HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    hg.Reweight(weights); +    J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]); +    J_i.WriteToFile(kbest_file); + +    Sample(gamma, xi, J_i, metric, &v); +    for (unsigned i = 0; i < v.size(); ++i) { +      const TrainingInstance& vi = v[i]; +      cout << vi.y << "\t" << vi.x << endl; +      cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl; +    } +  } +  return 0; +} + diff --git a/training/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc new file mode 100644 index 00000000..5ef9b470 --- /dev/null +++ b/training/pro/mr_pro_reduce.cc @@ -0,0 +1,286 @@ +#include <cstdlib> +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "optimize.h" +#include "liblbfgs/lbfgs++.h" + +using namespace std; +namespace po = boost::program_options; + +// since this is a ranking model, there should be equal numbers of +// positive and negative examples, so the bias should be 0 +static const double MAX_BIAS = 1e-10; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation") +        ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength") +        ("l1",po::value<double>()->default_value(0.0), "l1 regularization strength") +        ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect") +        ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)") +        ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght") +        ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght") +        ("testset,t",po::value<string>(), "Optional held-out test set") +        ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") +        ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) { +  SparseVector<weight_t>& x = *out; +  size_t last_start = cur; +  size_t last_comma = string::npos; +  while(cur <= line.size()) { +    if (line[cur] == ' ' || cur == line.size()) { +      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { +        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl; +        exit(1); +      } +      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); +      if (cur < line.size()) line[cur] = 0; +      const weight_t val = strtod(&line[last_comma + 1], NULL); +      x.set_value(fid, val); + +      last_comma = string::npos; +      last_start = cur+1; +    } else { +      if (line[cur] == '=') +        last_comma = cur; +    } +    ++cur; +  } +} + +void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) { +  istream& in = *pin; +  corpus->clear(); +  bool flag = false; +  int lc = 0; +  string line; +  SparseVector<weight_t> x; +  while(getline(in, line)) { +    ++lc; +    if (lc % 1000 == 0) { cerr << '.'; flag = true; } +    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } +    if (line.empty()) continue; +    const size_t ks = line.find("\t"); +    assert(string::npos != ks); +    assert(ks == 1); +    const bool y = line[0] == '1'; +    x.clear(); +    ParseSparseVector(line, ks + 1, &x); +    corpus->push_back(make_pair(y, x)); +  } +  if (flag) cerr << endl; +} + +void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) { +  for (SparseVector<weight_t>::const_iterator it = v.begin(); +       it != v.end(); ++it) { +    acc[it->first] += it->second * scale; +  } +} + +double ApplyRegularizationTerms(const double C, +                                const double T, +                                const vector<weight_t>& weights, +                                const vector<weight_t>& prev_weights, +                                weight_t* g) { +  double reg = 0; +  for (size_t i = 0; i < weights.size(); ++i) { +    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0); +    const double& w_i = weights[i]; +    reg += C * w_i * w_i; +    g[i] += 2 * C * w_i; + +    const double diff_i = w_i - prev_w_i; +    reg += T * diff_i * diff_i; +    g[i] += 2 * T * diff_i; +  } +  return reg; +} + +double TrainingInference(const vector<weight_t>& x, +                         const vector<pair<bool, SparseVector<weight_t> > >& corpus, +                         weight_t* g = NULL) { +  double cll = 0; +  for (int i = 0; i < corpus.size(); ++i) { +    const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias +    double lp_false = dotprod; +    double lp_true = -dotprod; +    if (0 < lp_true) { +      lp_true += log1p(exp(-lp_true)); +      lp_false = log1p(exp(lp_false)); +    } else { +      lp_true = log1p(exp(lp_true)); +      lp_false += log1p(exp(-lp_false)); +    } +    lp_true*=-1; +    lp_false*=-1; +    if (corpus[i].first) {  // true label +      cll -= lp_true; +      if (g) { +        // g -= corpus[i].second * exp(lp_false); +        GradAdd(corpus[i].second, -exp(lp_false), g); +        g[0] -= exp(lp_false); // bias +      } +    } else {                  // false label +      cll -= lp_false; +      if (g) { +        // g += corpus[i].second * exp(lp_true); +        GradAdd(corpus[i].second, exp(lp_true), g); +        g[0] += exp(lp_true); // bias +      } +    } +  } +  return cll; +} + +struct ProLoss { +  ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr, +          const vector<pair<bool, SparseVector<weight_t> > >& te, +          const double c, +          const double t, +          const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){} +  double operator()(const vector<double>& x, double* g) const { +    fill(g, g + x.size(), 0.0); +    double cll = TrainingInference(x, training, g); +    tppl = 0; +    if (testing.size()) +      tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size())); +    double ppl = cll / log(2); +    ppl /= training.size(); +    ppl = pow(2.0, ppl); +    double reg = ApplyRegularizationTerms(C, T, x, prev_x, g); +    return cll + reg; +  } +  const vector<pair<bool, SparseVector<weight_t> > >& training, testing; +  const double C, T; +  const vector<double>& prev_x; +  mutable double tppl; +}; + +// return held-out log likelihood +double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training, +                       const vector<pair<bool, SparseVector<weight_t> > >& testing, +                       const double C, +                       const double C1, +                       const double T, +                       const unsigned memory_buffers, +                       const vector<weight_t>& prev_x, +                       vector<weight_t>* px) { +  assert(px->size() == prev_x.size()); +  ProLoss loss(training, testing, C, T, prev_x); +  LBFGS<ProLoss> lbfgs(px, loss, memory_buffers, C1); +  lbfgs.MinimizeFunction(); +  return loss.tppl; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  string line; +  vector<pair<bool, SparseVector<weight_t> > > training, testing; +  const bool tune_regularizer = conf.count("tune_regularizer"); +  if (tune_regularizer && !conf.count("testset")) { +    cerr << "--tune_regularizer requires --testset to be set\n"; +    return 1; +  } +  const double min_reg = conf["min_reg"].as<double>(); +  const double max_reg = conf["max_reg"].as<double>(); +  double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned +  double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned +  const double T = conf["regularize_to_weights"].as<double>(); +  assert(C >= 0.0); +  assert(min_reg >= 0.0); +  assert(max_reg >= 0.0); +  assert(max_reg > min_reg); +  const double psi = conf["interpolate_with_weights"].as<double>(); +  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; } +  ReadCorpus(&cin, &training); +  if (conf.count("testset")) { +    ReadFile rf(conf["testset"].as<string>()); +    ReadCorpus(rf.stream(), &testing); +  } +  cerr << "Number of features: " << FD::NumFeats() << endl; + +  vector<weight_t> x, prev_x;  // x[0] is bias +  if (conf.count("weights")) { +    Weights::InitFromFile(conf["weights"].as<string>(), &x); +    x.resize(FD::NumFeats()); +    prev_x = x; +  } else { +    x.resize(FD::NumFeats()); +    prev_x = x; +  } +  cerr << "         Number of features: " << x.size() << endl; +  cerr << "Number of training examples: " << training.size() << endl; +  cerr << "Number of  testing examples: " << testing.size() << endl; +  double tppl = 0.0; +  vector<pair<double,double> > sp; +  vector<double> smoothed; +  if (tune_regularizer) { +    C = min_reg; +    const double steps = 18; +    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); +    cerr << "SWEEP FACTOR: " << sweep_factor << endl; +    while(C < max_reg) { +      cerr << "C=" << C << "\tT=" <<T << endl; +      tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x); +      sp.push_back(make_pair(C, tppl)); +      C *= sweep_factor; +    } +    smoothed.resize(sp.size(), 0); +    smoothed[0] = sp[0].second; +    smoothed.back() = sp.back().second;  +    for (int i = 1; i < sp.size()-1; ++i) { +      double prev = sp[i-1].second; +      double next = sp[i+1].second; +      double cur = sp[i].second; +      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); +    } +    double best_ppl = 9999999; +    unsigned best_i = 0; +    for (unsigned i = 0; i < sp.size(); ++i) { +      if (smoothed[i] < best_ppl) { +        best_ppl = smoothed[i]; +        best_i = i; +      } +    } +    C = sp[best_i].first; +  }  // tune regularizer +  tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x); +  if (conf.count("weights")) { +    for (int i = 1; i < x.size(); ++i) { +      x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi); +    } +  } +  cout.precision(15); +  cout << "# C=" << C << "\theld out perplexity="; +  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } +  if (sp.size()) { +    cout << "# Parameter sweep:\n"; +    for (int i = 0; i < sp.size(); ++i) { +      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; +    } +  } +  Weights::WriteToFile("-", x); +  return 0; +} diff --git a/training/pro/pro.pl b/training/pro/pro.pl new file mode 100755 index 00000000..3b30c379 --- /dev/null +++ b/training/pro/pro.pl @@ -0,0 +1,555 @@ +#!/usr/bin/env perl +use strict; +use File::Basename qw(basename); +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl"; +my $MAPPER = "$bin_dir/mr_pro_map"; +my $REDUCER = "$bin_dir/mr_pro_reduce"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs;   # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1;  # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $devset; + +# regularization strength +my $reg = 500; +my $reg_previous = 5000; + +# Process command-line options +if (GetOptions( +	"config=s" => \$iniFile, +	"weights=s" => \$initial_weights, +        "devset=s" => \$devset, +	"jobs=i" => \$jobs, +	"metric=s" => \$metric, +	"pass-suffix=s" => \$pass_suffix, +        "qsub" => \$useqsub, +	"help" => \$help, +	"reg=f" => \$reg, +	"reg-previous=f" => \$reg_previous, +	"output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { +	print_help(); +	exit; +} + +if ($useqsub) { +  $use_make = 0; +  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { +  $lines_per_mapper = 5; +} + +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { +  $DIR_FLAG = ''; +} + +unless ($dir){ +	$dir = 'pro'; +} +unless ($dir =~ /^\//){  # convert relative path to absolute path +	my $basedir = check_output("pwd"); +	chomp $basedir; +	$dir = "$basedir/$dir"; +} + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { +	print STDERR "Cleanup...\n"; +	for my $pid (@childpids){ unchecked_call("kill $pid"); } +	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } +	exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit =  +    sub{ cleanup(); };  +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +if (-e $dir) { +	die "ERROR: working dir $dir already exists\n\n"; +} else { +	mkdir "$dir" or die "Can't mkdir $dir: $!"; +	mkdir "$dir/hgs" or die; +	mkdir "$dir/scripts" or die; +	print STDERR <<EOT; +	DECODER:          $decoder +	INI FILE:         $iniFile +	WORKING DIR:      $dir +	DEVSET:           $devset +	EVAL METRIC:      $metric +	MAX ITERATIONS:   $max_iterations +	PARALLEL JOBS:    $jobs +	HEAD NODE:        $host +	PMEM (DECODING):  $pmem +	INITIAL WEIGHTS:  $initial_weights +EOT +} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initial_weights $dir/weights.0"); +$iniFile = $newIniFile; + +my $refs = "$dir/dev.refs"; +split_devset($devset, "$dir/dev.input.raw", $refs); +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +my @allweights; +while (1){ +	print STDERR "\n\nITERATION $iteration\n==========\n"; + +	if ($iteration > $max_iterations){ +		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; +		last; +	} +	# iteration-specific files +	my $runFile="$dir/run.raw.$iteration"; +	my $onebestFile="$dir/1best.$iteration"; +	my $logdir="$dir/logs.$iteration"; +	my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; +	my $scorerLog="$logdir/scorer.log.$iteration"; +	check_call("mkdir -p $logdir"); + + +	#decode +	print STDERR "RUNNING DECODER AT "; +	print STDERR unchecked_output("date"); +	my $im1 = $iteration - 1; +	my $weightsFile="$dir/weights.$im1"; +        push @allweights, "-w $dir/weights.$im1"; +        `rm -f $dir/hgs/*.gz`; +	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; +	my $pcmd; +	if ($use_make) { +		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; +	} else { +		$pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; +	} +	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_bash_call($cmd); +        my $num_hgs; +        my $num_topbest; +        my $retries = 0; +	while($retries < 5) { +	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); +	    $num_topbest = check_output("wc -l < $runFile"); +	    print STDERR "NUMBER OF HGs: $num_hgs\n"; +	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; +	    if($devSize == $num_hgs && $devSize == $num_topbest) { +		last; +	    } else { +		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; +		sleep(3); +	    } +	    $retries++; +	} +	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); +	my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric"); +	chomp $dec_score; +	print STDERR "DECODER SCORE: $dec_score\n"; + +	# save space +	check_call("gzip -f $runFile"); +	check_call("gzip -f $decoderLog"); + +	# run optimizer +	print STDERR "RUNNING OPTIMIZER AT "; +	print STDERR unchecked_output("date"); +	print STDERR " - GENERATE TRAINING EXEMPLARS\n"; +	my $mergeLog="$logdir/prune-merge.log.$iteration"; + +	my $score = 0; +	my $icc = 0; +	my $inweights="$dir/weights.$im1"; +	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_call($cmd); +	check_call("mkdir -p $dir/splag.$im1"); +	$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput."; +	print STDERR "COMMAND:\n$cmd\n"; +	check_call($cmd); +	opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; +	my @shards = grep { /^mapinput\./ } readdir(DIR); +	closedir DIR; +	die "No shards!" unless scalar @shards > 0; +	my $joblist = ""; +	my $nmappers = 0; +	@cleanupcmds = (); +	my %o2i = (); +	my $first_shard = 1; +	my $mkfile; # only used with makefiles +	my $mkfilename; +	if ($use_make) { +		$mkfilename = "$dir/splag.$im1/domap.mk"; +		open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; +		print $mkfile "all: $dir/splag.$im1/map.done\n\n"; +	} +	my @mkouts = ();  # only used with makefiles +	my @mapoutputs = (); +	for my $shard (@shards) { +		my $mapoutput = $shard; +		my $client_name = $shard; +		$client_name =~ s/mapinput.//; +		$client_name = "pro.$client_name"; +		$mapoutput =~ s/mapinput/mapoutput/; +		push @mapoutputs, "$dir/splag.$im1/$mapoutput"; +		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; +		my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; +		if ($use_make) { +			my $script_file = "$dir/scripts/map.$shard"; +			open F, ">$script_file" or die "Can't write $script_file: $!"; +			print F "#!/bin/bash\n"; +			print F "$script\n"; +			close F; +			my $output = "$dir/splag.$im1/$mapoutput"; +			push @mkouts, $output; +			chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; +			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } +			print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; +		} else { +			my $script_file = "$dir/scripts/map.$shard"; +			open F, ">$script_file" or die "Can't write $script_file: $!"; +			print F "$script\n"; +			close F; +			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + +			$nmappers++; +			my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; +			my $jobid = check_output("$qcmd"); +			chomp $jobid; +			$jobid =~ s/^(\d+)(.*?)$/\1/g; +			$jobid =~ s/^Your job (\d+) .*$/\1/; +		 	push(@cleanupcmds, "qdel $jobid 2> /dev/null"); +			print STDERR " $jobid"; +			if ($joblist == "") { $joblist = $jobid; } +			else {$joblist = $joblist . "\|" . $jobid; } +		} +	} +	my @dev_outs = (); +	my @devtest_outs = (); +	@dev_outs = @mapoutputs; +	if ($use_make) { +		print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; +		close $mkfile; +		my $mcmd = "make -j $jobs -f $mkfilename"; +		print STDERR "\nExecuting: $mcmd\n"; +		check_call($mcmd); +	} else { +		print STDERR "\nLaunched $nmappers mappers.\n"; +      		sleep 8; +		print STDERR "Waiting for mappers to complete...\n"; +		while ($nmappers > 0) { +		  sleep 5; +		  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); +		  $nmappers = scalar @livejobs; +		} +		print STDERR "All mappers complete.\n"; +	} +	my $tol = 0; +	my $til = 0; +	my $dev_test_file = "$dir/splag.$im1/devtest.gz"; +	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; +	print STDERR unchecked_output("date"); +	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi"; +        $cmd .= " > $dir/weights.$iteration"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_bash_call($cmd); +	$lastWeightsFile = "$dir/weights.$iteration"; +	$lastPScore = $score; +	$iteration++; +	print STDERR "\n==========\n"; +} + + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; + +exit 0; + +sub read_weights_file { +  my ($file) = @_; +  open F, "<$file" or die "Couldn't read $file: $!"; +  my @r = (); +  my $pm = -1; +  while(<F>) { +    next if /^#/; +    next if /^\s*$/; +    chomp; +    if (/^(.+)\s+(.+)$/) { +      my $m = $1; +      my $w = $2; +      die "Weights out of order: $m <= $pm" unless $m > $pm; +      push @r, $w; +    } else { +      warn "Unexpected feature name in weight file: $_"; +    } +  } +  close F; +  return join ' ', @r; +} + +sub enseg { +	my $src = shift; +	my $newsrc = shift; +	open(SRC, $src); +	open(NEWSRC, ">$newsrc"); +	my $i=0; +	while (my $line=<SRC>){ +		chomp $line; +		if ($line =~ /^\s*<seg/i) { +		    if($line =~ /id="[0-9]+"/) { +			print NEWSRC "$line\n"; +		    } else { +			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; +		    } +		} else { +			print NEWSRC "<seg id=\"$i\">$line</seg>\n"; +		} +		$i++; +	} +	close SRC; +	close NEWSRC; +	die "Empty dev set!" if ($i == 0); +} + +sub print_help { + +	my $executable = basename($0); chomp $executable; +	print << "Help"; + +Usage: $executable [options] + +	$executable [options] +		Runs a complete PRO optimization using the ini file specified. + +Required: + +	--config <cdec.ini> +		Decoder configuration file. + +	--devset <files> +		Dev set source and reference data. + +	--weights <file> +		Initial weights file (use empty file to start from 0) + +General options: + +	--help +		Print this message and exit. + +	--max-iterations <M> +		Maximum number of iterations to run.  If not specified, defaults +		to $default_max_iter. + +	--metric <method> +		Metric to optimize. +		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + +	--pass-suffix <S> +		If the decoder is doing multi-pass decoding, the pass suffix "2", +		"3", etc., is used to control what iteration of weights is set. + +	--workdir <dir> +		Directory for intermediate and output files.  If not specified, the +		name is derived from the ini filename.  Assuming that the ini +		filename begins with the decoder name and ends with ini, the default +		name of the working directory is inferred from the middle part of +		the filename.  E.g. an ini file named decoder.foo.ini would have +		a default working directory name foo. + +Regularization options: + +	--reg <F> +		l2 regularization strength [default=500]. The greater this value, +		the closer to zero the weights will be. + +	--reg-previous <F> +		l2 penalty for moving away from the weights from the previous +		iteration. [default=5000]. The greater this value, the closer +		to the previous iteration's weights the next iteration's weights +		will be. + +Job control options: + +	--jobs <I> +		Number of decoder processes to run in parallel. [default=$default_jobs] + +	--qsub +		Use qsub to run jobs in parallel (qsub must be configured in +		environment/LocalEnvironment.pm) + +	--pmem <N> +		Amount of physical memory requested for parallel decoding jobs +		(used with qsub requests only) + +Deprecated options: + +	--interpolate-with-weights <F> +		[deprecated] At each iteration the resulting weights are +		interpolated with the weights from the previous iteration, with +		this factor. [default=1.0, i.e., no effect] + +Help +} + +sub convert { +  my ($str) = @_; +  my @ps = split /;/, $str; +  my %dict = (); +  for my $p (@ps) { +    my ($k, $v) = split /=/, $p; +    $dict{$k} = $v; +  } +  return %dict; +} + + +sub cmdline { +    return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { +    my ($arg)=@_; +    return undef unless defined $arg; +    if ($arg =~ /$is_shell_special/) { +        $arg =~ s/($shell_escape_in_quote)/\\$1/g; +        return "\"$arg\""; +    } +    return $arg; +} + +sub escaped_shell_args { +    return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { +    return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { +    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { +  my ($infile, $outsrc, $outref) = @_; +  open F, "<$infile" or die "Can't read $infile: $!"; +  open S, ">$outsrc" or die "Can't write $outsrc: $!"; +  open R, ">$outref" or die "Can't write $outref: $!"; +  while(<F>) { +    chomp; +    my ($src, @refs) = split /\s*\|\|\|\s*/; +    die "Malformed devset line: $_\n" unless scalar @refs > 0; +    print S "$src\n"; +    print R join(' ||| ', @refs) . "\n"; +  } +  close R; +  close S; +  close F; +} + | 
