Diffstat (limited to 'pro-train')
-rwxr-xr-x  pro-train/dist-pro.pl        9
-rw-r--r--  pro-train/mr_pro_map.cc    244
-rw-r--r--  pro-train/mr_pro_reduce.cc  57
3 files changed, 240 insertions, 70 deletions
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 55d7f1fa..c42e3876 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -66,6 +66,7 @@ my $bleu_weight=1;
 my $use_make;  # use make to parallelize line search
 my $dirargs='';
 my $usefork;
+my $initial_weights;
 my $pass_suffix = '';
 my $cpbin=1;
 # Process command-line options
@@ -79,6 +80,7 @@ if (GetOptions(
 	"dry-run" => \$dryrun,
 	"epsilon=s" => \$epsilon,
 	"help" => \$help,
+        "weights=s" => \$initial_weights,
 	"interval" => \$interval,
 	"iteration=i" => \$iteration,
 	"local" => \$run_local,
@@ -212,7 +214,7 @@ if ($dryrun){
         close CMD;
         print STDERR $cline;
         chmod(0755,$cmdfile);
-	check_call("touch $dir/weights.0");
+	check_call("cp $initial_weights $dir/weights.0");
 	die "Can't find weights.0" unless (-e "$dir/weights.0");
 }
 write_config(*STDERR);
@@ -239,7 +241,6 @@ my $random_seed = int(time / 1000);
 my $lastWeightsFile;
 my $lastPScore = 0;
 # main optimization loop
-my @mapoutputs = (); # aggregate map outputs over all iters
 
 while (1){
 	print STDERR "\n\nITERATION $iteration\n==========\n";
@@ -262,6 +263,7 @@ while (1){
 	my $im1 = $iteration - 1;
 	my $weightsFile="$dir/weights.$im1";
         push @allweights, "-w $dir/weights.$im1";
+        `rm -f $dir/hgs/*.gz`;
 	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
 	my $pcmd;
 	if ($run_local) {
@@ -333,6 +335,7 @@ while (1){
 		print $mkfile "all: $dir/splag.$im1/map.done\n\n";
 	}
 	my @mkouts = ();  # only used with makefiles
+	my @mapoutputs = ();
 	for my $shard (@shards) {
 		my $mapoutput = $shard;
 		my $client_name = $shard;
@@ -341,7 +344,7 @@ while (1){
 		$mapoutput =~ s/mapinput/mapoutput/;
 		push @mapoutputs, "$dir/splag.$im1/$mapoutput";
 		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
 		if ($run_local) {
			print STDERR "COMMAND:\n$script\n";
			check_bash_call($script);
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 128d93ce..4324e8de 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -2,7 +2,9 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
 #include <boost/shared_ptr.hpp>
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
@@ -22,16 +24,63 @@
 using namespace std;
 namespace po = boost::program_options;
 
+struct ApproxVectorHasher {
+  static const size_t MASK = 0xFFFFFFFFull;
+  union UType {
+    double f;
+    size_t i;
+  };
+  static inline double round(const double x) {
+    UType t;
+    t.f = x;
+    size_t r = t.i & MASK;
+    if ((r << 1) > MASK)
+      t.i += MASK - r + 1;
+    else
+      t.i &= (1ull - MASK);
+    return t.f;
+  }
+  size_t operator()(const SparseVector<double>& x) const {
+    size_t h = 0x573915839;
+    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+      UType t;
+      t.f = it->second;
+      if (t.f) {
+        size_t z = (t.i >> 32);
+        boost::hash_combine(h, it->first);
+        boost::hash_combine(h, z);
+      }
+    }
+    return h;
+  }
+};
+
+struct ApproxVectorEquals {
+  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
+    SparseVector<double>::const_iterator bit = b.begin();
+    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
+      if (bit == b.end() ||
+          ait->first != bit->first ||
+          ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
+        return false;
+      ++bit;
+    }
+    if (bit != b.end()) return false;
+    return true;
+  }
+};
+
 boost::shared_ptr<MT19937> rng;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
         ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
         ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
-        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-        ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
         ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
         ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
         ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
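The new ApproxVectorHasher treats each double's bit pattern as an integer and rounds away its low 32 bits, so feature vectors that differ only by floating-point noise hash (and, via ApproxVectorEquals, compare) as equal. One aside: in the else branch, (1ull - MASK) wraps around to 0xFFFFFFFF00000002 rather than the complement of MASK, which is presumably what was intended. Below is a minimal sketch of the rounding idea, not part of the patch: approx_round is a hypothetical name, it truncates instead of rounding to nearest, and it reinterprets the bits with memcpy rather than the patch's union.

    #include <cstring>
    #include <iostream>
    #include <stdint.h>

    // Clear the low 32 bits of a double's bit pattern, so values that
    // agree in their upper bits become bitwise identical.
    double approx_round(double x) {
      uint64_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits &= ~0xFFFFFFFFull;
      std::memcpy(&x, &bits, sizeof(x));
      return x;
    }

    int main() {
      const double a = 0.1 + 0.2;  // differs from 0.3 in the last bit
      const double b = 0.3;
      std::cout << (a == b) << std::endl;                              // 0
      std::cout << (approx_round(a) == approx_round(b)) << std::endl;  // 1
      return 0;
    }

After rounding, near-identical vectors collide in the hash container used by Dedup below, which is how duplicate derivations get folded together.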
@@ -46,7 +95,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     flag = true;
   }
   if (!conf->count("weights")) {
-    cerr << "Please specify one or more weights using -w <WEIGHTS.TXT>\n";
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
     flag = true;
   }
   if (flag || conf->count("help")) {
@@ -56,6 +105,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
+  HypInfo() : g_(-100.0) {}
   HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
 
   // lazy evaluation
@@ -66,10 +116,92 @@ struct HypInfo {
   }
   vector<WordID> hyp;
   mutable double g_;
- public:
   SparseVector<double> x;
 };
 
+struct HypInfoCompare {
+  bool operator()(const HypInfo& a, const HypInfo& b) const {
+    ApproxVectorEquals comp;
+    return (a.hyp == b.hyp && comp(a.x,b.x));
+  }
+};
+
+struct HypInfoHasher {
+  size_t operator()(const HypInfo& x) const {
+    boost::hash<vector<WordID> > hhasher;
+    ApproxVectorHasher vhasher;
+    size_t ha = hhasher(x.hyp);
+    boost::hash_combine(ha, vhasher(x.x));
+    return ha;
+  }
+};
+
+void WriteKBest(const string& file, const vector<HypInfo>& kbest) {
+  WriteFile wf(file);
+  ostream& out = *wf.stream();
+  out.precision(10);
+  for (int i = 0; i < kbest.size(); ++i) {
+    out << TD::GetString(kbest[i].hyp) << endl;
+    out << kbest[i].x << endl;
+  }
+}
+
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;
+  while(cur <= line.size()) {
+    if (line[cur] == ' ' || cur == line.size()) {
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      const double val = strtod(&line[last_comma + 1], NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else {
+      if (line[cur] == '=')
+        last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
+void ReadKBest(const string& file, vector<HypInfo>* kbest) {
+  cerr << "Reading from " << file << endl;
+  ReadFile rf(file);
+  istream& in = *rf.stream();
+  string cand;
+  string feats;
+  while(getline(in, cand)) {
+    getline(in, feats);
+    assert(in);
+    kbest->push_back(HypInfo());
+    TD::ConvertSentence(cand, &kbest->back().hyp);
+    ParseSparseVector(feats, 0, &kbest->back().x);
+  }
+  cerr << "  read " << kbest->size() << " hypotheses\n";
+}
+
+void Dedup(vector<HypInfo>* h) {
+  cerr << "Dedup in=" << h->size();
+  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare> u;
+  while(h->size() > 0) {
+    u.insert(h->back());
+    h->pop_back();
+  }
+  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare>::iterator it = u.begin();
+  while (it != u.end()) {
+    h->push_back(*it);
+    it = u.erase(it);
+  }
+  cerr << "  out=" << h->size() << endl;
+}
+
 struct ThresholdAlpha {
   explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
   double operator()(double mag) const {
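WriteKBest and ReadKBest round-trip each cached hypothesis as two text lines: the tokenized sentence, then its features as space-separated Name=value pairs. ParseSparseVector scans that second line, splitting at spaces and remembering the most recent '=' (the variable is named last_comma, but it tracks '='). A standalone sketch of the same format, with std::map standing in for cdec's SparseVector<double> and FD::Convert interning; parse_features is a hypothetical helper, not the patch's parser.

    #include <cstdlib>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    // Parse a feature line such as "LanguageModel=-12.5 WordPenalty=-4.34294".
    std::map<std::string, double> parse_features(const std::string& line) {
      std::map<std::string, double> x;
      std::istringstream in(line);
      std::string tok;
      while (in >> tok) {
        // the value follows the last '=' in the token, as in the patch
        const std::string::size_type eq = tok.rfind('=');
        if (eq == std::string::npos) {
          std::cerr << "[ERROR] bad pair: " << tok << std::endl;
          std::exit(1);
        }
        x[tok.substr(0, eq)] = std::strtod(tok.c_str() + eq + 1, NULL);
      }
      return x;
    }

    int main() {
      std::map<std::string, double> f =
          parse_features("LanguageModel=-12.5 WordPenalty=-4.34294");
      std::cout << f["WordPenalty"] << std::endl;  // -4.34294
      return 0;
    }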
@@ -81,6 +213,7 @@
 struct TrainingInstance {
   TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
   SparseVector<double> x;
+#undef DEBUGGING_PRO
 #ifdef DEBUGGING_PRO
   vector<WordID> a;
   vector<WordID> b;
@@ -88,6 +221,11 @@ struct TrainingInstance {
   bool y;
   double gdiff;
 };
+#ifdef DEBUGGING_PRO
+ostream& operator<<(ostream& os, const TrainingInstance& d) {
+  return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
+}
+#endif
 
 struct DiffOrder {
   bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
@@ -95,36 +233,51 @@ struct DiffOrder {
   }
 };
 
-template<typename Alpha>
-void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
-  vector<TrainingInstance> v;
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
+  vector<TrainingInstance> v1, v2;
+  double avg_diff = 0;
   for (unsigned i = 0; i < gamma; ++i) {
-    size_t a = rng->inclusive(0, J_i.size() - 1)();
-    size_t b = rng->inclusive(0, J_i.size() - 1)();
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
     if (a == b) continue;
     double ga = J_i[a].g(scorer);
     double gb = J_i[b].g(scorer);
-    bool positive = ga < gb;
+    bool positive = gb < ga;
     if (invert_score) positive = !positive;
-    double gdiff = fabs(ga - gb);
+    const double gdiff = fabs(ga - gb);
     if (!gdiff) continue;
-    if (rng->next() < alpha_i(gdiff)) {
-      v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+    avg_diff += gdiff;
+    SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
+    if (xdiff.empty()) {
+      cerr << "Empty diff:\n  " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl;
+      cerr << "  " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl;
+      continue;
+    }
+    v1.push_back(TrainingInstance(xdiff, positive, gdiff));
 #ifdef DEBUGGING_PRO
-      v.back().a = J_i[a].hyp;
-      v.back().b = J_i[b].hyp;
+    v1.back().a = J_i[a].hyp;
+    v1.back().b = J_i[b].hyp;
+    cerr << "N: " << v1.back() << endl;
 #endif
-    }
   }
-  vector<TrainingInstance>::iterator mid = v.begin() + xi;
-  if (xi > v.size()) mid = v.end();
-  partial_sort(v.begin(), mid, v.end(), DiffOrder());
-  copy(v.begin(), mid, back_inserter(*pv));
+  avg_diff /= v1.size();
+
+  for (unsigned i = 0; i < v1.size(); ++i) {
+    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+    // cerr << "avg_diff=" << avg_diff << "  gdiff=" << v1[i].gdiff << "  p=" << p << endl;
+    if (rng->next() < p) v2.push_back(v1[i]);
+  }
+  vector<TrainingInstance>::iterator mid = v2.begin() + xi;
+  if (xi > v2.size()) mid = v2.end();
+  partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+  copy(v2.begin(), mid, back_inserter(*pv));
 #ifdef DEBUGGING_PRO
-  if (v.size() >= 5)
-    for (int i =0; i < 5; ++i) {
-      cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+  if (v2.size() >= 5) {
+    for (int i =0; i < (mid - v2.begin()); ++i) {
+      cerr << v2[i] << endl;
     }
+    cerr << pv->back() << endl;
+  }
 #endif
 }
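The rewritten Sample drops the fixed ThresholdAlpha(0.05) acceptance test. Each sampled pair whose metric gap is gdiff is now kept with probability sigmoid(avg_diff + gdiff), where avg_diff is the mean gap over the pairs sampled for the sentence; larger-than-average gaps survive more often, and since the argument is always positive every pair survives with probability above one half, so the hard cap remains the partial_sort down to the top xi. A sketch of just that acceptance rule, with made-up gap values and rand01 standing in for MT19937::next():

    #include <cmath>
    #include <cstdlib>
    #include <iostream>
    #include <vector>

    double rand01() { return std::rand() / (RAND_MAX + 1.0); }

    int main() {
      // |g(a) - g(b)| for five hypothetical candidate pairs
      const double gaps_arr[] = {0.01, 0.02, 0.05, 0.20, 0.40};
      std::vector<double> gaps(gaps_arr, gaps_arr + 5);

      double avg_diff = 0;
      for (unsigned i = 0; i < gaps.size(); ++i) avg_diff += gaps[i];
      avg_diff /= gaps.size();

      unsigned kept = 0;
      for (unsigned i = 0; i < gaps.size(); ++i) {
        const double p = 1.0 / (1.0 + std::exp(-avg_diff - gaps[i]));
        if (rand01() < p) ++kept;  // large-gap pairs are kept more often
      }
      std::cout << "kept " << kept << " of " << gaps.size() << " pairs" << std::endl;
      return 0;
    }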
@@ -136,6 +289,7 @@ int main(int argc, char** argv) {
   else
     rng.reset(new MT19937);
   const string loss_function = conf["loss_function"].as<string>();
+
   ScoreType type = ScoreTypeFromString(loss_function);
   DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
   cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
@@ -146,13 +300,15 @@ int main(int argc, char** argv) {
   const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
-  vector<string> weights_files = conf["weights"].as<vector<string> >();
-  vector<vector<double> > weights(weights_files.size());
-  for (int i = 0; i < weights.size(); ++i) {
+  string weightsf = conf["weights"].as<string>();
+  vector<double> weights;
+  {
     Weights w;
-    w.InitFromFile(weights_files[i]);
-    w.InitVector(&weights[i]);
+    w.InitFromFile(weightsf);
+    w.InitVector(&weights);
   }
+  string kbest_repo = conf["kbest_repository"].as<string>();
+  MkDirP(kbest_repo);
   while(in) {
     vector<TrainingInstance> v;
     string line;
@@ -164,24 +320,26 @@ int main(int argc, char** argv) {
     // path-to-file (JSON) sent_id
     is >> file >> sent_id;
     ReadFile rf(file);
-    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    ostringstream os;
     vector<HypInfo> J_i;
-    int start = weights.size();
-    start -= 4;
-    if (start < 0) start = 0;
-    for (int i = start; i < weights.size(); ++i) {
-      hg.Reweight(weights[i]);
-      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
-      for (int i = 0; i < kbest_size; ++i) {
-        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-          kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-        if (!d) break;
-        J_i.push_back(HypInfo(d->yield, d->feature_values));
-      }
+    os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+    const string kbest_file = os.str();
+    if (FileExists(kbest_file))
+      ReadKBest(kbest_file, &J_i);
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+    for (int i = 0; i < kbest_size; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+      if (!d) break;
+      J_i.push_back(HypInfo(d->yield, d->feature_values));
     }
+    Dedup(&J_i);
+    WriteKBest(kbest_file, J_i);
 
-    Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+    Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v);
     for (unsigned i = 0; i < v.size(); ++i) {
       const TrainingInstance& vi = v[i];
       cout << vi.y << "\t" << vi.x << endl;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 2b9c5ce7..e1a7db8a 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -24,7 +24,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
         ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
         ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-        ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
+        ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -35,6 +35,31 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;
+  while(cur <= line.size()) {
+    if (line[cur] == ' ' || cur == line.size()) {
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      const double val = strtod(&line[last_comma + 1], NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else {
+      if (line[cur] == '=')
+        last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -60,28 +85,7 @@ int main(int argc, char** argv) {
     assert(ks == 1);
     const bool y = line[0] == '1';
     SparseVector<double> x;
-    size_t last_start = ks + 1;
-    size_t last_comma = string::npos;
-    size_t cur = last_start;
-    while(cur <= line.size()) {
-      if (line[cur] == ' ' || cur == line.size()) {
-        if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
-          cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
-          exit(1);
-        }
-        const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
-        if (cur < line.size()) line[cur] = 0;
-        const double val = strtod(&line[last_comma + 1], NULL);
-        x.set_value(fid, val);
-
-        last_comma = string::npos;
-        last_start = cur+1;
-      } else {
-        if (line[cur] == '=')
-          last_comma = cur;
-      }
-      ++cur;
-    }
+    ParseSparseVector(line, ks + 1, &x);
     training.push_back(make_pair(y, x));
   }
   if (flag) cerr << endl;
@@ -95,6 +99,7 @@ int main(int argc, char** argv) {
   SparseVector<double> g;
   bool converged = false;
   LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  double ppl = 0;
   while(!converged) {
     double cll = 0;
     double dbias = 0;
@@ -114,14 +119,18 @@ int main(int argc, char** argv) {
       lp_false*=-1;
       if (training[i].first) {  // true label
         cll -= lp_true;
+        ppl += lp_true / log(2);
         g -= training[i].second * exp(lp_false);
         dbias -= exp(lp_false);
       } else {                  // false label
         cll -= lp_false;
+        ppl += lp_false / log(2);
         g += training[i].second * exp(lp_true);
         dbias += exp(lp_true);
       }
     }
+    ppl /= training.size();
+    ppl = pow(2.0, - ppl);
     vg.clear();
     g.init_vector(&vg);
     vg[0] = dbias;
@@ -139,7 +148,7 @@ int main(int argc, char** argv) {
     double reg = 0;
 #endif
     cll += reg;
-    cerr << cll << " (REG=" << reg << ")\t";
+    cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t";
     bool failed = false;
     try {
       opt.Optimize(cll, vg, &x);
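The reducer now reports a perplexity next to the regularized log-loss: it accumulates each example's log2-probability of its true label (lp / log(2)), averages over the training set, and prints 2 raised to the negative average, so PPL=1.0 means every label gets probability one and larger values are worse. Note that as committed, ppl is declared outside the while(!converged) loop and never reset to zero, so readings after the first iteration fold the previous iteration's perplexity into the running sum. A self-contained sketch of the computation, with made-up true-label probabilities standing in for the model's sigmoid outputs:

    #include <cmath>
    #include <iostream>
    #include <vector>

    int main() {
      // p(true label | x) for four hypothetical training examples
      const double probs_arr[] = {0.9, 0.75, 0.6, 0.95};
      std::vector<double> probs(probs_arr, probs_arr + 4);

      double ppl = 0;
      for (unsigned i = 0; i < probs.size(); ++i)
        ppl += std::log(probs[i]) / std::log(2.0);  // log2 p(y_i | x_i)
      ppl /= probs.size();
      ppl = std::pow(2.0, -ppl);

      std::cout << "PPL=" << ppl << std::endl;  // about 1.27 here
      return 0;
    }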
