From c87835f5f94b3aa954682133c40117b3f8e26585 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 12 Jul 2011 22:34:34 -0400
Subject: debugged pro trainer

---
 pro-train/dist-pro.pl      |    9 +-
 pro-train/mr_pro_map.cc    |  244 +++++++++++++++++++++++++++++++++++++--------
 pro-train/mr_pro_reduce.cc |   57 ++++++-----
 utils/filelib.cc           |   12 +++
 utils/filelib.h            |    1 +
 5 files changed, 253 insertions(+), 70 deletions(-)

diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 55d7f1fa..c42e3876 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -66,6 +66,7 @@ my $bleu_weight=1;
 my $use_make;  # use make to parallelize line search
 my $dirargs='';
 my $usefork;
+my $initial_weights;
 my $pass_suffix = '';
 my $cpbin=1;
 # Process command-line options
@@ -79,6 +80,7 @@ if (GetOptions(
   "dry-run" => \$dryrun,
   "epsilon=s" => \$epsilon,
   "help" => \$help,
+  "weights=s" => \$initial_weights,
   "interval" => \$interval,
   "iteration=i" => \$iteration,
   "local" => \$run_local,
@@ -212,7 +214,7 @@ if ($dryrun){
   close CMD;
   print STDERR $cline;
   chmod(0755,$cmdfile);
-  check_call("touch $dir/weights.0");
+  check_call("cp $initial_weights $dir/weights.0");
   die "Can't find weights.0" unless (-e "$dir/weights.0");
 }
 write_config(*STDERR);
@@ -239,7 +241,6 @@ my $random_seed = int(time / 1000);
 my $lastWeightsFile;
 my $lastPScore = 0;
 
 # main optimization loop
-my @mapoutputs = ();  # aggregate map outputs over all iters
 while (1){
   print STDERR "\n\nITERATION $iteration\n==========\n";
@@ -262,6 +263,7 @@ while (1){
   my $im1 = $iteration - 1;
   my $weightsFile="$dir/weights.$im1";
   push @allweights, "-w $dir/weights.$im1";
+  `rm -f $dir/hgs/*.gz`;
   my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
   my $pcmd;
   if ($run_local) {
@@ -333,6 +335,7 @@ while (1){
     print $mkfile "all: $dir/splag.$im1/map.done\n\n";
   }
   my @mkouts = ();  # only used with makefiles
+  my @mapoutputs = ();
   for my $shard (@shards) {
     my $mapoutput = $shard;
     my $client_name = $shard;
@@ -341,7 +344,7 @@ while (1){
     $mapoutput =~ s/mapinput/mapoutput/;
     push @mapoutputs, "$dir/splag.$im1/$mapoutput";
     $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-    my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+    my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
     if ($run_local) {
       print STDERR "COMMAND:\n$script\n";
       check_bash_call($script);
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 128d93ce..4324e8de 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -2,7 +2,9 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
+#include <tr1/unordered_set>
+#include <boost/functional/hash.hpp>
 #include <string>
 
 #include <boost/shared_ptr.hpp>
 #include <boost/program_options.hpp>
@@ -22,16 +24,63 @@
 using namespace std;
 namespace po = boost::program_options;
 
+struct ApproxVectorHasher {
+  static const size_t MASK = 0xFFFFFFFFull;
+  union UType {
+    double f;
+    size_t i;
+  };
+  static inline double round(const double x) {
+    UType t;
+    t.f = x;
+    size_t r = t.i & MASK;
+    if ((r << 1) > MASK)
+      t.i += MASK - r + 1;
+    else
+      t.i &= (1ull - MASK);
+    return t.f;
+  }
+  size_t operator()(const SparseVector<double>& x) const {
+    size_t h = 0x573915839;
+    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+      UType t;
+      t.f = it->second;
+      if (t.f) {
+        size_t z = (t.i >> 32);
+        boost::hash_combine(h, it->first);
+        boost::hash_combine(h, z);
+      }
+    }
+    return h;
+  }
+};
+
+struct ApproxVectorEquals {
+  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
+    SparseVector<double>::const_iterator bit = b.begin();
+    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
+      if (bit == b.end() ||
+          ait->first != bit->first ||
+          ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
+        return false;
+      ++bit;
+    }
+    if (bit != b.end()) return false;
+    return true;
+  }
+};
+
 boost::shared_ptr<MT19937> rng;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
     ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+    ("weights,w",po::value<string>(), "[REQD] Weights file from current iteration")
+    ("kbest_repository,K",po::value<string>()->default_value("./kbest"), "K-best list repository (directory)")
+    ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
     ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
     ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
-    ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-    ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
     ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
     ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
     ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -46,7 +95,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     flag = true;
   }
   if (!conf->count("weights")) {
-    cerr << "Please specify one or more weights using -w <weights.txt>\n";
+    cerr << "Please specify weights using -w <weights.txt>\n";
     flag = true;
   }
   if (flag || conf->count("help")) {
@@ -56,6 +105,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct HypInfo {
+  HypInfo() : g_(-100.0) {}
   HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
 
   // lazy evaluation
@@ -66,10 +116,92 @@ struct HypInfo {
   }
   vector<WordID> hyp;
   mutable double g_;
- public:
   SparseVector<double> x;
 };
 
+struct HypInfoCompare {
+  bool operator()(const HypInfo& a, const HypInfo& b) const {
+    ApproxVectorEquals comp;
+    return (a.hyp == b.hyp && comp(a.x,b.x));
+  }
+};
+
+struct HypInfoHasher {
+  size_t operator()(const HypInfo& x) const {
+    boost::hash<vector<WordID> > hhasher;
+    ApproxVectorHasher vhasher;
+    size_t ha = hhasher(x.hyp);
+    boost::hash_combine(ha, vhasher(x.x));
+    return ha;
+  }
+};
+
+void WriteKBest(const string& file, const vector<HypInfo>& kbest) {
+  WriteFile wf(file);
+  ostream& out = *wf.stream();
+  out.precision(10);
+  for (int i = 0; i < kbest.size(); ++i) {
+    out << TD::GetString(kbest[i].hyp) << endl;
+    out << kbest[i].x << endl;
+  }
+}
+
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;
+  while(cur <= line.size()) {
+    if (line[cur] == ' ' || cur == line.size()) {
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      const double val = strtod(&line[last_comma + 1], NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else {
+      if (line[cur] == '=')
+        last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
+void ReadKBest(const string& file, vector<HypInfo>* kbest) {
+  cerr << "Reading from " << file << endl;
+  ReadFile rf(file);
+  istream& in = *rf.stream();
+  string cand;
+  string feats;
+  while(getline(in, cand)) {
+    getline(in, feats);
+    assert(in);
+    kbest->push_back(HypInfo());
+    TD::ConvertSentence(cand, &kbest->back().hyp);
+    ParseSparseVector(feats, 0, &kbest->back().x);
+  }
+  cerr << " read " << kbest->size() << " hypotheses\n";
+}
+
+void Dedup(vector<HypInfo>* h) {
+  cerr << "Dedup in=" << h->size();
+  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare> u;
+  while(h->size() > 0) {
+    u.insert(h->back());
+    h->pop_back();
+  }
+  tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare>::iterator it = u.begin();
+  while (it != u.end()) {
+    h->push_back(*it);
+    it = u.erase(it);
+  }
+  cerr << " out=" << h->size() << endl;
+}
+
 struct ThresholdAlpha {
   explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
   double operator()(double mag) const {
@@ -81,6 +213,7 @@ struct ThresholdAlpha {
 
 struct TrainingInstance {
   TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
   SparseVector<double> x;
+#undef DEBUGGING_PRO
 #ifdef DEBUGGING_PRO
   vector<WordID> a;
   vector<WordID> b;
@@ -88,6 +221,11 @@ struct TrainingInstance {
   bool y;
   double gdiff;
 };
+#ifdef DEBUGGING_PRO
+ostream& operator<<(ostream& os, const TrainingInstance& d) {
+  return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
+}
+#endif
 
 struct DiffOrder {
   bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
@@ -95,36 +233,51 @@ struct DiffOrder {
     return a.gdiff > b.gdiff;
   }
 };
 
-template <typename Alpha>
-void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
-  vector<TrainingInstance> v;
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
+  vector<TrainingInstance> v1, v2;
+  double avg_diff = 0;
   for (unsigned i = 0; i < gamma; ++i) {
-    size_t a = rng->inclusive(0, J_i.size() - 1)();
-    size_t b = rng->inclusive(0, J_i.size() - 1)();
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
     if (a == b) continue;
     double ga = J_i[a].g(scorer);
     double gb = J_i[b].g(scorer);
-    bool positive = ga < gb;
+    bool positive = gb < ga;
     if (invert_score) positive = !positive;
-    double gdiff = fabs(ga - gb);
+    const double gdiff = fabs(ga - gb);
     if (!gdiff) continue;
-    if (rng->next() < alpha_i(gdiff)) {
-      v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+    avg_diff += gdiff;
+    SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
+    if (xdiff.empty()) {
+      cerr << "Empty diff:\n " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl;
+      cerr << " " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl;
+      continue;
+    }
+    v1.push_back(TrainingInstance(xdiff, positive, gdiff));
 #ifdef DEBUGGING_PRO
-      v.back().a = J_i[a].hyp;
-      v.back().b = J_i[b].hyp;
+    v1.back().a = J_i[a].hyp;
+    v1.back().b = J_i[b].hyp;
+    cerr << "N: " << v1.back() << endl;
 #endif
-    }
   }
-  vector<TrainingInstance>::iterator mid = v.begin() + xi;
-  if (xi > v.size()) mid = v.end();
-  partial_sort(v.begin(), mid, v.end(), DiffOrder());
-  copy(v.begin(), mid, back_inserter(*pv));
+  avg_diff /= v1.size();
+
+  for (unsigned i = 0; i < v1.size(); ++i) {
+    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+    // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl;
+    if (rng->next() < p)
+      v2.push_back(v1[i]);
+  }
+  vector<TrainingInstance>::iterator mid = v2.begin() + xi;
+  if (xi > v2.size()) mid = v2.end();
+  partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+  copy(v2.begin(), mid, back_inserter(*pv));
 #ifdef DEBUGGING_PRO
-  if (v.size() >= 5)
-    for (int i =0; i < 5; ++i) {
-      cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+  if (v2.size() >= 5) {
+    for (int i =0; i < (mid - v2.begin()); ++i) {
+      cerr << v2[i] << endl;
     }
+    cerr << pv->back() << endl;
+  }
 #endif
 }
 
@@ -136,6 +289,7 @@ int main(int argc, char** argv) {
   else
     rng.reset(new MT19937);
   const string loss_function = conf["loss_function"].as<string>();
+
   ScoreType type = ScoreTypeFromString(loss_function);
   DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
   cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
@@ -146,13 +300,15 @@ int main(int argc, char** argv) {
   const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
-  vector<string> weights_files = conf["weights"].as<vector<string> >();
-  vector<vector<double> > weights(weights_files.size());
-  for (int i = 0; i < weights.size(); ++i) {
+  string weightsf = conf["weights"].as<string>();
+  vector<double> weights;
+  {
     Weights w;
-    w.InitFromFile(weights_files[i]);
-    w.InitVector(&weights[i]);
+    w.InitFromFile(weightsf);
+    w.InitVector(&weights);
   }
+  string kbest_repo = conf["kbest_repository"].as<string>();
+  MkDirP(kbest_repo);
   while(in) {
     vector<TrainingInstance> v;
     string line;
@@ -164,24 +320,26 @@ int main(int argc, char** argv) {
       // path-to-file (JSON) sent_id
       is >> file >> sent_id;
       ReadFile rf(file);
-      HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+      ostringstream os;
       vector<HypInfo> J_i;
-      int start = weights.size();
-      start -= 4;
-      if (start < 0) start = 0;
-      for (int i = start; i < weights.size(); ++i) {
-        hg.Reweight(weights[i]);
-        KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
-        for (int i = 0; i < kbest_size; ++i) {
-          const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-            kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-          if (!d) break;
-          J_i.push_back(HypInfo(d->yield, d->feature_values));
-        }
+      os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+      const string kbest_file = os.str();
+      if (FileExists(kbest_file))
+        ReadKBest(kbest_file, &J_i);
+      HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+      hg.Reweight(weights);
+      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+      for (int i = 0; i < kbest_size; ++i) {
+        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+          kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+        if (!d) break;
+        J_i.push_back(HypInfo(d->yield, d->feature_values));
       }
+      Dedup(&J_i);
+      WriteKBest(kbest_file, J_i);
 
-      Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+      Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v);
       for (unsigned i = 0; i < v.size(); ++i) {
         const TrainingInstance& vi = v[i];
         cout << vi.y << "\t" << vi.x << endl;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 2b9c5ce7..e1a7db8a 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -24,7 +24,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)")
     ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
     ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
-    ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
+    ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
     ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
@@ -35,6 +35,31 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;
+  while(cur <= line.size()) {
+    if (line[cur] == ' ' || cur == line.size()) {
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      const double val = strtod(&line[last_comma + 1], NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else {
+      if (line[cur] == '=')
+        last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -60,28 +85,7 @@ int main(int argc, char** argv) {
       assert(ks == 1);
       const bool y = line[0] == '1';
       SparseVector<double> x;
-      size_t last_start = ks + 1;
-      size_t last_comma = string::npos;
-      size_t cur = last_start;
-      while(cur <= line.size()) {
-        if (line[cur] == ' ' || cur == line.size()) {
-          if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
-            cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
-            exit(1);
-          }
-          const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
-          if (cur < line.size()) line[cur] = 0;
-          const double val = strtod(&line[last_comma + 1], NULL);
-          x.set_value(fid, val);
-
-          last_comma = string::npos;
-          last_start = cur+1;
-        } else {
-          if (line[cur] == '=')
-            last_comma = cur;
-        }
-        ++cur;
-      }
+      ParseSparseVector(line, ks + 1, &x);
       training.push_back(make_pair(y, x));
     }
     if (flag) cerr << endl;
@@ -95,6 +99,7 @@ int main(int argc, char** argv) {
   SparseVector<double> g;
   bool converged = false;
conf["memory_buffers"].as()); + double ppl = 0; while(!converged) { double cll = 0; double dbias = 0; @@ -114,14 +119,18 @@ int main(int argc, char** argv) { lp_false*=-1; if (training[i].first) { // true label cll -= lp_true; + ppl += lp_true / log(2); g -= training[i].second * exp(lp_false); dbias -= exp(lp_false); } else { // false label cll -= lp_false; + ppl += lp_false / log(2); g += training[i].second * exp(lp_true); dbias += exp(lp_true); } } + ppl /= training.size(); + ppl = pow(2.0, - ppl); vg.clear(); g.init_vector(&vg); vg[0] = dbias; @@ -139,7 +148,7 @@ int main(int argc, char** argv) { double reg = 0; #endif cll += reg; - cerr << cll << " (REG=" << reg << ")\t"; + cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t"; bool failed = false; try { opt.Optimize(cll, vg, &x); diff --git a/utils/filelib.cc b/utils/filelib.cc index 79ad2847..a0969b1a 100644 --- a/utils/filelib.cc +++ b/utils/filelib.cc @@ -20,3 +20,15 @@ bool DirectoryExists(const string& dir) { return false; } +void MkDirP(const string& dir) { + if (DirectoryExists(dir)) return; + if (mkdir(dir.c_str(), 0777)) { + perror(dir.c_str()); + abort(); + } + if (chmod(dir.c_str(), 07777)) { + perror(dir.c_str()); + abort(); + } +} + diff --git a/utils/filelib.h b/utils/filelib.h index dda98671..a8622246 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -12,6 +12,7 @@ bool FileExists(const std::string& file_name); bool DirectoryExists(const std::string& dir_name); +void MkDirP(const std::string& dir_name); // reads from standard in if filename is - // uncompresses if file ends with .gz -- cgit v1.2.3