From 1da72bb6211f196a210302f18c1ef020c0c84f12 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 5 Jul 2011 23:19:43 -0400 Subject: fast phrasinator that uses DPs instead of PYPs --- utils/sampler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'utils') diff --git a/utils/sampler.h b/utils/sampler.h index a14f6e2f..153e7ef1 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -105,7 +105,7 @@ class SampleSet { const F& operator[](int i) const { return m_scores[i]; } F& operator[](int i) { return m_scores[i]; } bool empty() const { return m_scores.empty(); } - void add(const prob_t& s) { m_scores.push_back(s); } + void add(const F& s) { m_scores.push_back(s); } void clear() { m_scores.clear(); } size_t size() const { return m_scores.size(); } void resize(int size) { m_scores.resize(size); } -- cgit v1.2.3 From 79899fe98e2a7e4d250a8518b8a4cd8f2bb93522 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 12 Jul 2011 21:39:44 -0400 Subject: nasty bug in operator- in sparse vector --- utils/fast_sparse_vector.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'utils') diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 4aae2039..9d72cb87 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -235,6 +235,13 @@ class FastSparseVector { } return *this; } + FastSparseVector erase_zeros(const T& EPSILON = 1e-4) const { + FastSparseVector o; + for (const_iterator it = begin(); it != end(); ++it) { + if (fabs(it->second) > EPSILON) o.set_value(it->first, it->second); + } + return o; + } const_iterator begin() const { return const_iterator(*this, false); } @@ -344,15 +351,9 @@ const FastSparseVector operator+(const FastSparseVector& x, const FastSpar template const FastSparseVector operator-(const FastSparseVector& x, const FastSparseVector& y) { - if (x.size() > y.size()) { - FastSparseVector res(x); - res -= y; - return res; - } else { - FastSparseVector res(y); - res -= x; - return res; - } + FastSparseVector res(x); + res -= y; + return res; } template -- cgit v1.2.3 From 5e3c68b62dd72255db95c5822835a3931770f285 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 12 Jul 2011 22:34:34 -0400 Subject: debugged pro trainer --- pro-train/dist-pro.pl | 9 +- pro-train/mr_pro_map.cc | 244 +++++++++++++++++++++++++++++++++++++-------- pro-train/mr_pro_reduce.cc | 57 ++++++----- utils/filelib.cc | 12 +++ utils/filelib.h | 1 + 5 files changed, 253 insertions(+), 70 deletions(-) (limited to 'utils') diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index 55d7f1fa..c42e3876 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -66,6 +66,7 @@ my $bleu_weight=1; my $use_make; # use make to parallelize line search my $dirargs=''; my $usefork; +my $initial_weights; my $pass_suffix = ''; my $cpbin=1; # Process command-line options @@ -79,6 +80,7 @@ if (GetOptions( "dry-run" => \$dryrun, "epsilon=s" => \$epsilon, "help" => \$help, + "weights=s" => \$initial_weights, "interval" => \$interval, "iteration=i" => \$iteration, "local" => \$run_local, @@ -212,7 +214,7 @@ if ($dryrun){ close CMD; print STDERR $cline; chmod(0755,$cmdfile); - check_call("touch $dir/weights.0"); + check_call("cp $initial_weights $dir/weights.0"); die "Can't find weights.0" unless (-e "$dir/weights.0"); } write_config(*STDERR); @@ -239,7 +241,6 @@ my $random_seed = int(time / 1000); my $lastWeightsFile; my $lastPScore = 0; # main optimization loop -my @mapoutputs = (); # aggregate map outputs over all iters while (1){ 
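An aside on the operator- fix in the sparse-vector commit above, since the one-line subject undersells it: the old code copied whichever operand had more entries and subtracted the other from it, so whenever the right-hand side was denser the result silently came out as y - x. A minimal sketch of the behavior, with illustrative values:

    FastSparseVector<double> x, y;
    x.set_value(1, 1.0);
    y.set_value(1, 5.0);
    y.set_value(2, 2.0);
    FastSparseVector<double> d = x - y;
    // before the fix: d == {1: 4.0, 2: 2.0}, i.e. y - x with the sign flipped
    // after the fix:  d == {1: -4.0, 2: -2.0}, i.e. x - y as intended

In the PRO mapper later in this series, the difference vector (J_i[a].x - J_i[b].x) is paired with the sign of the metric gap between the two hypotheses, so the flipped sign corrupted exactly those training pairs whose subtrahend happened to be the denser vector.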
print STDERR "\n\nITERATION $iteration\n==========\n"; @@ -262,6 +263,7 @@ while (1){ my $im1 = $iteration - 1; my $weightsFile="$dir/weights.$im1"; push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; my $pcmd; if ($run_local) { @@ -333,6 +335,7 @@ while (1){ print $mkfile "all: $dir/splag.$im1/map.done\n\n"; } my @mkouts = (); # only used with makefiles + my @mapoutputs = (); for my $shard (@shards) { my $mapoutput = $shard; my $client_name = $shard; @@ -341,7 +344,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; if ($run_local) { print STDERR "COMMAND:\n$script\n"; check_bash_call($script); diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 128d93ce..4324e8de 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -2,7 +2,9 @@ #include #include #include +#include +#include #include #include #include @@ -22,16 +24,63 @@ using namespace std; namespace po = boost::program_options; +struct ApproxVectorHasher { + static const size_t MASK = 0xFFFFFFFFull; + union UType { + double f; + size_t i; + }; + static inline double round(const double x) { + UType t; + t.f = x; + size_t r = t.i & MASK; + if ((r << 1) > MASK) + t.i += MASK - r + 1; + else + t.i &= (1ull - MASK); + return t.f; + } + size_t operator()(const SparseVector& x) const { + size_t h = 0x573915839; + for (SparseVector::const_iterator it = x.begin(); it != x.end(); ++it) { + UType t; + t.f = it->second; + if (t.f) { + size_t z = (t.i >> 32); + boost::hash_combine(h, it->first); + boost::hash_combine(h, z); + } + } + return h; + } +}; + +struct ApproxVectorEquals { + bool operator()(const SparseVector& a, const SparseVector& b) const { + SparseVector::const_iterator bit = b.begin(); + for (SparseVector::const_iterator ait = a.begin(); ait != a.end(); ++ait) { + if (bit == b.end() || + ait->first != bit->first || + ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second)) + return false; + ++bit; + } + if (bit != b.end()) return false; + return true; + } +}; + boost::shared_ptr rng; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value(), "[REQD] Weights files from current iterations") + ("kbest_repository,K",po::value()->default_value("./kbest"),"K-best list repository (directory)") + ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") ("source,s",po::value()->default_value(""), "Source file (ignored, except for AER)") ("loss_function,l",po::value()->default_value("ibm_bleu"), "Loss function being optimized") - ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") - ("weights,w",po::value >(), "[REQD] Weights files from previous and current iterations") ("kbest_size,k",po::value()->default_value(1500u), "Top k-hypotheses to extract") ("candidate_pairs,G", po::value()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") 
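A note on ApproxVectorHasher above: it hashes each feature on the high 32 bits of the value's IEEE-754 representation, so feature vectors that differ only in floating-point noise (agreement to roughly six significant digits) deliberately collide, which is what lets the dedup machinery below treat them as the same hypothesis. A self-contained sketch, assuming a 64-bit size_t as UType itself does:

    #include <cassert>
    #include <cstdint>
    int main() {
      union { double f; uint64_t i; } a, b;  // mirrors UType above
      a.f = 0.125;
      b.f = 0.125 + 1e-12;                   // differs far below the kept precision
      assert((a.i >> 32) == (b.i >> 32));    // identical contribution to the hash
    }

One caveat: in round(), t.i &= (1ull - MASK); looks like a typo for t.i &= ~MASK; as written it clears the low word except bit 1, slightly perturbing the rounded value that ApproxVectorEquals compares.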
("best_pairs,X", po::value()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") @@ -46,7 +95,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { flag = true; } if (!conf->count("weights")) { - cerr << "Please specify one or more weights using -w \n"; + cerr << "Please specify weights using -w \n"; flag = true; } if (flag || conf->count("help")) { @@ -56,6 +105,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } struct HypInfo { + HypInfo() : g_(-100.0) {} HypInfo(const vector& h, const SparseVector& feats) : hyp(h), g_(-100.0), x(feats) {} // lazy evaluation @@ -66,10 +116,92 @@ struct HypInfo { } vector hyp; mutable double g_; - public: SparseVector x; }; +struct HypInfoCompare { + bool operator()(const HypInfo& a, const HypInfo& b) const { + ApproxVectorEquals comp; + return (a.hyp == b.hyp && comp(a.x,b.x)); + } +}; + +struct HypInfoHasher { + size_t operator()(const HypInfo& x) const { + boost::hash > hhasher; + ApproxVectorHasher vhasher; + size_t ha = hhasher(x.hyp); + boost::hash_combine(ha, vhasher(x.x)); + return ha; + } +}; + +void WriteKBest(const string& file, const vector& kbest) { + WriteFile wf(file); + ostream& out = *wf.stream(); + out.precision(10); + for (int i = 0; i < kbest.size(); ++i) { + out << TD::GetString(kbest[i].hyp) << endl; + out << kbest[i].x << endl; + } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector* out) { + SparseVector& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const double val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + +void ReadKBest(const string& file, vector* kbest) { + cerr << "Reading from " << file << endl; + ReadFile rf(file); + istream& in = *rf.stream(); + string cand; + string feats; + while(getline(in, cand)) { + getline(in, feats); + assert(in); + kbest->push_back(HypInfo()); + TD::ConvertSentence(cand, &kbest->back().hyp); + ParseSparseVector(feats, 0, &kbest->back().x); + } + cerr << " read " << kbest->size() << " hypotheses\n"; +} + +void Dedup(vector* h) { + cerr << "Dedup in=" << h->size(); + tr1::unordered_set u; + while(h->size() > 0) { + u.insert(h->back()); + h->pop_back(); + } + tr1::unordered_set::iterator it = u.begin(); + while (it != u.end()) { + h->push_back(*it); + it = u.erase(it); + } + cerr << " out=" << h->size() << endl; +} + struct ThresholdAlpha { explicit ThresholdAlpha(double t = 0.05) : threshold(t) {} double operator()(double mag) const { @@ -81,6 +213,7 @@ struct ThresholdAlpha { struct TrainingInstance { TrainingInstance(const SparseVector& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {} SparseVector x; +#undef DEBUGGING_PRO #ifdef DEBUGGING_PRO vector a; vector b; @@ -88,6 +221,11 @@ struct TrainingInstance { bool y; double gdiff; }; +#ifdef DEBUGGING_PRO +ostream& operator<<(ostream& os, const TrainingInstance& d) { + return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " 
<< d.x; +} +#endif struct DiffOrder { bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { @@ -95,36 +233,51 @@ struct DiffOrder { } }; -template -void Sample(const unsigned gamma, const unsigned xi, const vector& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector* pv) { - vector v; +void Sample(const unsigned gamma, const unsigned xi, const vector& J_i, const SentenceScorer& scorer, const bool invert_score, vector* pv) { + vector v1, v2; + double avg_diff = 0; for (unsigned i = 0; i < gamma; ++i) { - size_t a = rng->inclusive(0, J_i.size() - 1)(); - size_t b = rng->inclusive(0, J_i.size() - 1)(); + const size_t a = rng->inclusive(0, J_i.size() - 1)(); + const size_t b = rng->inclusive(0, J_i.size() - 1)(); if (a == b) continue; double ga = J_i[a].g(scorer); double gb = J_i[b].g(scorer); - bool positive = ga < gb; + bool positive = gb < ga; if (invert_score) positive = !positive; - double gdiff = fabs(ga - gb); + const double gdiff = fabs(ga - gb); if (!gdiff) continue; - if (rng->next() < alpha_i(gdiff)) { - v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff)); + avg_diff += gdiff; + SparseVector xdiff = (J_i[a].x - J_i[b].x).erase_zeros(); + if (xdiff.empty()) { + cerr << "Empty diff:\n " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl; + cerr << " " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl; + continue; + } + v1.push_back(TrainingInstance(xdiff, positive, gdiff)); #ifdef DEBUGGING_PRO - v.back().a = J_i[a].hyp; - v.back().b = J_i[b].hyp; + v1.back().a = J_i[a].hyp; + v1.back().b = J_i[b].hyp; + cerr << "N: " << v1.back() << endl; #endif - } } - vector::iterator mid = v.begin() + xi; - if (xi > v.size()) mid = v.end(); - partial_sort(v.begin(), mid, v.end(), DiffOrder()); - copy(v.begin(), mid, back_inserter(*pv)); + avg_diff /= v1.size(); + + for (unsigned i = 0; i < v1.size(); ++i) { + double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff)); + // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl; + if (rng->next() < p) v2.push_back(v1[i]); + } + vector::iterator mid = v2.begin() + xi; + if (xi > v2.size()) mid = v2.end(); + partial_sort(v2.begin(), mid, v2.end(), DiffOrder()); + copy(v2.begin(), mid, back_inserter(*pv)); #ifdef DEBUGGING_PRO - if (v.size() >= 5) - for (int i =0; i < 5; ++i) { - cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl; + if (v2.size() >= 5) { + for (int i =0; i < (mid - v2.begin()); ++i) { + cerr << v2[i] << endl; } + cerr << pv->back() << endl; + } #endif } @@ -136,6 +289,7 @@ int main(int argc, char** argv) { else rng.reset(new MT19937); const string loss_function = conf["loss_function"].as(); + ScoreType type = ScoreTypeFromString(loss_function); DocScorer ds(type, conf["reference"].as >(), conf["source"].as()); cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; @@ -146,13 +300,15 @@ int main(int argc, char** argv) { const unsigned kbest_size = conf["kbest_size"].as(); const unsigned gamma = conf["candidate_pairs"].as(); const unsigned xi = conf["best_pairs"].as(); - vector weights_files = conf["weights"].as >(); - vector > weights(weights_files.size()); - for (int i = 0; i < weights.size(); ++i) { + string weightsf = conf["weights"].as(); + vector weights; + { Weights w; - w.InitFromFile(weights_files[i]); - w.InitVector(&weights[i]); + w.InitFromFile(weightsf); + w.InitVector(&weights); } + 
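Some arithmetic on the new acceptance rule above, which replaces the old hard ThresholdAlpha(0.05) cutoff: with p = 1/(1 + exp(-avg_diff - gdiff)) and both terms non-negative, every sampled pair survives with probability above 0.5, and pairs with a larger metric gap are merely favored rather than required. For example, with avg_diff = 0.5:

    p(gdiff = 0.1) = 1/(1 + e^-0.6) ~ 0.65
    p(gdiff = 2.0) = 1/(1 + e^-2.5) ~ 0.92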
string kbest_repo = conf["kbest_repository"].as(); + MkDirP(kbest_repo); while(in) { vector v; string line; @@ -164,24 +320,26 @@ int main(int argc, char** argv) { // path-to-file (JSON) sent_id is >> file >> sent_id; ReadFile rf(file); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); + ostringstream os; vector J_i; - int start = weights.size(); - start -= 4; - if (start < 0) start = 0; - for (int i = start; i < weights.size(); ++i) { - hg.Reweight(weights[i]); - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, kbest_size); - - for (int i = 0; i < kbest_size; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - J_i.push_back(HypInfo(d->yield, d->feature_values)); - } + os << kbest_repo << "/kbest." << sent_id << ".txt.gz"; + const string kbest_file = os.str(); + if (FileExists(kbest_file)) + ReadKBest(kbest_file, &J_i); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, kbest_size); + + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + J_i.push_back(HypInfo(d->yield, d->feature_values)); } + Dedup(&J_i); + WriteKBest(kbest_file, J_i); - Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v); + Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v); for (unsigned i = 0; i < v.size(); ++i) { const TrainingInstance& vi = v[i]; cout << vi.y << "\t" << vi.x << endl; diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 2b9c5ce7..e1a7db8a 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -24,7 +24,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w", po::value(), "Weights from previous iteration (used as initialization and interpolation") ("interpolation,p",po::value()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev") ("memory_buffers,m",po::value()->default_value(200), "Number of memory buffers (LBFGS)") - ("sigma_squared,s",po::value()->default_value(0.5), "Sigma squared for Gaussian prior") + ("sigma_squared,s",po::value()->default_value(1.0), "Sigma squared for Gaussian prior") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); @@ -35,6 +35,31 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } +void ParseSparseVector(string& line, size_t cur, SparseVector* out) { + SparseVector& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const double val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); @@ -60,28 +85,7 @@ int main(int argc, char** argv) { assert(ks == 1); const bool y = line[0] == '1'; SparseVector x; - size_t last_start = ks + 1; - size_t last_comma = string::npos; - size_t cur = last_start; - 
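For reference, the k-best repository that ReadKBest, Dedup, and WriteKBest maintain above is a plain two-lines-per-hypothesis text format: the token string, then the feature vector in the space-separated Name=value shape that ParseSparseVector parses. A hypothetical entry from kbest/kbest.12.txt.gz (sentence and feature names illustrative):

    the cat sat on the mat
    LanguageModel=-3.21 PhraseModel_0=1.5 WordPenalty=-2.6057

Because the existing file is read back before new derivations are appended, and Dedup runs before WriteKBest, the repository accumulates the union of the k-best lists across iterations without duplicates.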
while(cur <= line.size()) { - if (line[cur] == ' ' || cur == line.size()) { - if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { - cerr << "[ERROR] " << line << endl << " position = " << cur << endl; - exit(1); - } - const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); - if (cur < line.size()) line[cur] = 0; - const double val = strtod(&line[last_comma + 1], NULL); - x.set_value(fid, val); - - last_comma = string::npos; - last_start = cur+1; - } else { - if (line[cur] == '=') - last_comma = cur; - } - ++cur; - } + ParseSparseVector(line, ks + 1, &x); training.push_back(make_pair(y, x)); } if (flag) cerr << endl; @@ -95,6 +99,7 @@ int main(int argc, char** argv) { SparseVector g; bool converged = false; LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as()); + double ppl = 0; while(!converged) { double cll = 0; double dbias = 0; @@ -114,14 +119,18 @@ int main(int argc, char** argv) { lp_false*=-1; if (training[i].first) { // true label cll -= lp_true; + ppl += lp_true / log(2); g -= training[i].second * exp(lp_false); dbias -= exp(lp_false); } else { // false label cll -= lp_false; + ppl += lp_false / log(2); g += training[i].second * exp(lp_true); dbias += exp(lp_true); } } + ppl /= training.size(); + ppl = pow(2.0, - ppl); vg.clear(); g.init_vector(&vg); vg[0] = dbias; @@ -139,7 +148,7 @@ int main(int argc, char** argv) { double reg = 0; #endif cll += reg; - cerr << cll << " (REG=" << reg << ")\t"; + cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t"; bool failed = false; try { opt.Optimize(cll, vg, &x); diff --git a/utils/filelib.cc b/utils/filelib.cc index 79ad2847..a0969b1a 100644 --- a/utils/filelib.cc +++ b/utils/filelib.cc @@ -20,3 +20,15 @@ bool DirectoryExists(const string& dir) { return false; } +void MkDirP(const string& dir) { + if (DirectoryExists(dir)) return; + if (mkdir(dir.c_str(), 0777)) { + perror(dir.c_str()); + abort(); + } + if (chmod(dir.c_str(), 07777)) { + perror(dir.c_str()); + abort(); + } +} + diff --git a/utils/filelib.h b/utils/filelib.h index dda98671..a8622246 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -12,6 +12,7 @@ bool FileExists(const std::string& file_name); bool DirectoryExists(const std::string& dir_name); +void MkDirP(const std::string& dir_name); // reads from standard in if filename is - // uncompresses if file ends with .gz -- cgit v1.2.3 From b6a2a068e552de6d3c1e2507213d0161e1c7e34b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Sep 2011 17:14:18 +0100 Subject: fix sparse vector to work with boost serialization --- utils/fast_sparse_vector.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ utils/sparse_vector.h | 38 -------------------------------------- 2 files changed, 46 insertions(+), 38 deletions(-) (limited to 'utils') diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 9d72cb87..b3f9588d 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -7,6 +7,8 @@ // important: indexes are integers // important: iterators may return elements in any order +#include "config.h" + #include #include #include @@ -16,6 +18,13 @@ #include +#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP +#include +#include +#endif + +#include "fdict.h" + // this is architecture dependent, it should be // detected in some way but it's probably easiest (for me) // to just set it @@ -334,8 +343,45 @@ class FastSparseVector { } data_; unsigned char local_size_; bool is_remote_; + +#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP + private: + friend class 
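Two notes on the reducer changes above. First, on sigma_squared moving from 0.5 to 1.0: with a Gaussian prior the penalty added to the negative log-likelihood is, in the usual formulation (the reg computation itself is elided from this diff),

    reg         = sum_i w_i^2 / (2 * sigma^2)
    d reg/d w_i = w_i / sigma^2

so doubling sigma^2 halves the pull of every weight toward zero. Second, the new perplexity is the geometric-mean inverse likelihood of the training pairs, ppl = 2^-(mean of log2 p_i): a model assigning every pair probability 0.5 reports ppl = 2, a perfect one reports 1. As written, though, ppl is declared outside the while(!converged) loop and never reset inside it, so from the second L-BFGS iteration on the previous value leaks into the running sum.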
boost::serialization::access; + template + void save(Archive & ar, const unsigned int version) const { + (void) version; + int eff_size = size(); + const_iterator it = this->begin(); + if (eff_size > 0) { + // 0 index is reserved as empty + if (it->first == 0) { ++it; --eff_size; } + } + ar & eff_size; + while (it != this->end()) { + const std::pair wire_pair(FD::Convert(it->first), it->second); + ar & wire_pair; + ++it; + } + } + template + void load(Archive & ar, const unsigned int version) { + (void) version; + this->clear(); + int sz; ar & sz; + for (int i = 0; i < sz; ++i) { + std::pair wire_pair; + ar & wire_pair; + this->set_value(FD::Convert(wire_pair.first), wire_pair.second); + } + } + BOOST_SERIALIZATION_SPLIT_MEMBER() +#endif }; +#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP +BOOST_CLASS_TRACKING(FastSparseVector,track_never) +#endif + template const FastSparseVector operator+(const FastSparseVector& x, const FastSparseVector& y) { if (x.size() > y.size()) { diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index a55436fb..049151f7 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -1,44 +1,6 @@ #ifndef _SPARSE_VECTOR_H_ #define _SPARSE_VECTOR_H_ -#if 0 - -#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP - friend class boost::serialization::access; - template - void save(Archive & ar, const unsigned int version) const { - (void) version; - int eff_size = values_.size(); - const_iterator it = this->begin(); - if (values_.find(0) != values_.end()) { ++it; --eff_size; } - ar & eff_size; - while (it != this->end()) { - const std::pair wire_pair(FD::Convert(it->first), it->second); - ar & wire_pair; - ++it; - } - } - template - void load(Archive & ar, const unsigned int version) { - (void) version; - this->clear(); - int sz; ar & sz; - for (int i = 0; i < sz; ++i) { - std::pair wire_pair; - ar & wire_pair; - this->set_value(FD::Convert(wire_pair.first), wire_pair.second); - } - } - BOOST_SERIALIZATION_SPLIT_MEMBER() -#endif -}; - -#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP -BOOST_CLASS_TRACKING(SparseVector,track_never) -#endif - -#endif /// FIX - #include "fast_sparse_vector.h" #define SparseVector FastSparseVector -- cgit v1.2.3 From 078f434e044ba6d98d96fa4a24121528eaaa69d0 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Sep 2011 17:27:12 +0100 Subject: fix header problem when serializing sparse vector with boost --- utils/fast_sparse_vector.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'utils') diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index b3f9588d..1301581a 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -19,8 +19,7 @@ #include #if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP -#include -#include +#include #endif #include "fdict.h" -- cgit v1.2.3 From f826e90538dc15fbde4152b0e2f7d30fd6e56784 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 12 Sep 2011 19:22:59 +0100 Subject: source syntax features ~ blunsom emnlp 2008 --- decoder/Makefile.am | 1 + decoder/cdec_ff.cc | 2 + decoder/ff_source_syntax.cc | 157 ++++++++++++++++++++++++++++++++++++++++++++ decoder/ff_source_syntax.h | 24 +++++++ utils/stringlib.cc | 7 +- 5 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 decoder/ff_source_syntax.cc create mode 100644 decoder/ff_source_syntax.h (limited to 'utils') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index e5f7505f..ede1cff0 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -72,6 +72,7 @@ libcdec_a_SOURCES = \ ff_wordalign.cc \ ff_csplit.cc \ 
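The serialization support added to FastSparseVector two commits above writes entries as (feature name, value) pairs via FD::Convert, so an archive survives a process that assigns feature ids in a different order; BOOST_CLASS_TRACKING(..., track_never) is what lets non-const, repeatedly mutated vectors be saved without tripping Boost's object-tracking checks. A minimal round-trip sketch (not part of the patch; feature name illustrative):

    #include <cassert>
    #include <sstream>
    #include <boost/archive/text_oarchive.hpp>
    #include <boost/archive/text_iarchive.hpp>
    #include "fast_sparse_vector.h"

    int main() {
      FastSparseVector<double> v;
      v.set_value(FD::Convert("SomeFeature"), -2.5);
      std::ostringstream os;
      { boost::archive::text_oarchive oa(os); oa << v; }  // stores ("SomeFeature", -2.5)
      FastSparseVector<double> u;
      std::istringstream is(os.str());
      { boost::archive::text_iarchive ia(is); ia >> u; }
      assert(u.value(FD::Convert("SomeFeature")) == -2.5);
    }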
ff_tagger.cc \ + ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ freqdict.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 588842f1..d562bc3a 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -14,6 +14,7 @@ #include "ff_bleu.h" #include "ff_lm_fsa.h" #include "ff_sample_fsa.h" +#include "ff_source_syntax.h" #include "ff_register.h" #include "ff_charset.h" #include "ff_wordset.h" @@ -55,6 +56,7 @@ void register_feature_functions() { ff_registry.Register("SpanFeatures", new FFFactory()); ff_registry.Register("NgramFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); + ff_registry.Register("SourceSyntaxFeatures", new FFFactory); ff_registry.Register("RuleNgramFeatures", new FFFactory()); ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory()); ff_registry.Register("KLanguageModel", new KLanguageModelFactory()); diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc new file mode 100644 index 00000000..99acbd87 --- /dev/null +++ b/decoder/ff_source_syntax.cc @@ -0,0 +1,157 @@ +#include "ff_source_syntax.h" + +#include +#include + +#include "sentence_metadata.h" +#include "array2d.h" +#include "filelib.h" + +using namespace std; + +// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) +// source trees must be represented in Penn Treebank format, e.g. +// (S (NP John) (VP (V left))) + +struct SourceSyntaxFeaturesImpl { + SourceSyntaxFeaturesImpl() {} + + void InitializeGrids(const string& tree, unsigned src_len) { + assert(tree.size() > 0); + fids_cat.clear(); + fids_fonly.clear(); + fids_ef.clear(); + src_tree.clear(); + fids_cat.resize(src_len, src_len + 1); + fids_fonly.resize(src_len, src_len + 1); + fids_ef.resize(src_len, src_len + 1); + src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); + ParseTreeString(tree, src_len); + } + + void ParseTreeString(const string& tree, unsigned src_len) { + stack > stk; // first = i, second = category + pair cur_cat; cur_cat.first = -1; + unsigned i = 0; + unsigned p = 0; + while(p < tree.size()) { + const char cur = tree[p]; + if (cur == '(') { + stk.push(cur_cat); + ++p; + unsigned k = p + 1; + while (k < tree.size() && tree[k] != ' ') { ++k; } + cur_cat.first = i; + cur_cat.second = TD::Convert(tree.substr(p, k - p)); + // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; + p = k + 1; + } else if (cur == ')') { + unsigned k = p; + while (k < tree.size() && tree[k] == ')') { ++k; } + const unsigned num_closes = k - p; + for (unsigned ci = 0; ci < num_closes; ++ci) { + // cur_cat.second spans from cur_cat.first to i + // cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; + // NOTE: unary rule chains end up being labeled with the top-most category + src_tree(cur_cat.first, i) = cur_cat.second; + cur_cat = stk.top(); + stk.pop(); + } + p = k; + while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } + } else if (cur == ' ' || cur == '\t') { + cerr << "Unexpected whitespace in: " << tree << endl; + abort(); + } else { // terminal symbol + unsigned k = p + 1; + do { + while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } + // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; + ++i; + assert(i <= src_len); + while (k < tree.size() && tree[k] == ' ') { ++k; } + p = k; + } while (p < tree.size() && tree[p] != ')'); + } + } + // cerr << "i=" << i << " src_len=" << src_len << endl; + assert(i == src_len); // make sure tree specified 
in src_tree is + // the same length as the source sentence + } + + WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { + //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; + const WordID lhs = src_tree(i,j); + int& fid_cat = fids_cat(i,j); + int& fid_fonly = fids_fonly(i,j)[&rule]; + int& fid_ef = fids_ef(i,j)[&rule]; + if (fid_ef <= 0) { + ostringstream os; + os << "SYN:" << TD::Convert(lhs); + fid_cat = FD::Convert(os.str()); + os << ':'; + unsigned ntc = 0; + for (unsigned k = 0; k < rule.f_.size(); ++k) { + if (k > 0) os << '_'; + int fj = rule.f_[k]; + if (fj <= 0) { + os << '[' << TD::Convert(ants[ntc++]) << ']'; + } else { + os << TD::Convert(fj); + } + } + fid_fonly = FD::Convert(os.str()); + os << ':'; + for (unsigned k = 0; k < rule.e_.size(); ++k) { + const int ei = rule.e_[k]; + if (k > 0) os << '_'; + if (ei <= 0) + os << '[' << (1-ei) << ']'; + else + os << TD::Convert(ei); + } + fid_ef = FD::Convert(os.str()); + } + if (fid_cat > 0) + feats->set_value(fid_cat, 1.0); + if (fid_fonly > 0) + feats->set_value(fid_fonly, 1.0); + if (fid_ef > 0) + feats->set_value(fid_ef, 1.0); + return lhs; + } + + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D fids_cat; // fires for an LHS match + mutable Array2D > fids_fonly; // fires for an f-string + mutable Array2D > fids_ef; // fires for fully lexicalized +}; + +SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) : + FeatureFunction(sizeof(WordID)) { + impl = new SourceSyntaxFeaturesImpl; +} + +SourceSyntaxFeatures::~SourceSyntaxFeatures() { + delete impl; + impl = NULL; +} + +void SourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + WordID ants[8]; + for (unsigned i = 0; i < ant_contexts.size(); ++i) + ants[i] = *static_cast(ant_contexts[i]); + + *static_cast(context) = + impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); +} + +void SourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { + impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); +} + diff --git a/decoder/ff_source_syntax.h b/decoder/ff_source_syntax.h new file mode 100644 index 00000000..1e890736 --- /dev/null +++ b/decoder/ff_source_syntax.h @@ -0,0 +1,24 @@ +#ifndef _FF_SOURCE_TOOLS_H_ +#define _FF_SOURCE_TOOLS_H_ + +#include "ff.h" + +struct SourceSyntaxFeaturesImpl; + +class SourceSyntaxFeatures : public FeatureFunction { + public: + SourceSyntaxFeatures(const std::string& param); + ~SourceSyntaxFeatures(); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + SourceSyntaxFeaturesImpl* impl; +}; + +#endif diff --git a/utils/stringlib.cc b/utils/stringlib.cc index 7aaee9f0..ade02ca9 100644 --- a/utils/stringlib.cc +++ b/utils/stringlib.cc @@ -32,7 +32,12 @@ void ParseTranslatorInput(const string& line, string* input, string* ref) { void ProcessAndStripSGML(string* pline, map* out) { map& meta = *out; string& line = *pline; - string lline = LowercaseString(line); + string lline = *pline; + if (lline.find(" must be lowercase!\n"; + cerr << " " << *pline << endl; + abort(); + } if 
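A worked example for the machinery above: given the source sentence "John left" with the annotation

    <seg id="0" src_tree="(S (NP John) (VP (V left)))"> John left </seg>

(the stringlib.cc change in this commit rejects uppercase sgml tags), ParseTreeString fills the chart as

    src_tree(0,1) = NP
    src_tree(1,2) = VP   (the unary chain V -> VP keeps the top-most label)
    src_tree(0,2) = S

and FireFeatures for a rule over span (0,2) whose f-side is a nonterminal followed by the word left emits the three nested templates

    SYN:S
    SYN:S:[NP]_left
    SYN:S:[NP]_left:[1]_left

that is: category match only, f-side with antecedent spans replaced by their labels, and fully lexicalized.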
(lline.find(""); if (close == string::npos) return; // error -- cgit v1.2.3 From e7993fb83537105a56c274d78ed9d51a79a8a854 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 13 Sep 2011 13:25:46 +0100 Subject: optional support for doing perfect hashing of feature strings to save lots of memory --- decoder/decoder.cc | 22 ++++++++- utils/Makefile.am | 9 +++- utils/fdict.cc | 4 ++ utils/fdict.h | 36 ++++++++++++++ utils/perfect_hash.cc | 37 ++++++++++++++ utils/perfect_hash.h | 24 +++++++++ utils/phmt.cc | 44 +++++++++++++++++ utils/weights.cc | 132 ++++++++++++++++++++++++++++++++++---------------- utils/weights.h | 14 +++--- 9 files changed, 269 insertions(+), 53 deletions(-) create mode 100644 utils/perfect_hash.cc create mode 100644 utils/perfect_hash.h create mode 100644 utils/phmt.cc (limited to 'utils') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 76f31352..25eb2de4 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -328,6 +328,7 @@ struct DecoderImpl { bool write_gradient; // TODO Observer bool feature_expectations; // TODO Observer bool output_training_vector; // TODO Observer + bool remove_intersected_rule_annotations; static void ConvertSV(const SparseVector& src, SparseVector* trg) { for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it) @@ -361,6 +362,9 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("grammar,g",po::value >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") ("per_sentence_grammar_file", po::value(), "Optional (and possibly not implemented) per sentence grammar file enables all per sentence grammars to be stored in a single large file and accessed by offset") ("list_feature_functions,L","List available feature functions") +#ifdef HAVE_CMPH + ("cmph_perfect_feature_hash,h", po::value(), "Load perfect hash function for features") +#endif ("weights,w",po::value(),"Feature weights file (initial forest / pass 1)") ("feature_function,F",po::value >()->composing(), "Pass 1 additional feature function(s) (-L for list)") @@ -433,7 +437,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") ("vector_format",po::value()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") ("combine_size,C",po::value()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") - ("forest_output,O",po::value(),"Directory to write forests to"); + ("forest_output,O",po::value(),"Directory to write forests to") + ("remove_intersected_rule_annotations", "After forced decoding is completed, remove nonterminal annotations (i.e., the source side spans)"); // ob.AddOptions(&opts); #ifdef FSA_RESCORING @@ -443,7 +448,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream po::options_description clo("Command line options"); clo.add_options() ("config,c", po::value >(&cfg_files), "Configuration file(s) - latest has priority") - ("help,h", "Print this help message and exit") + ("help,?", "Print this help message and exit") ("usage,u", po::value(), "Describe a feature function type") ("compgen", "Print just option names suitable for bash command line completion builtin 'compgen'") ; @@ -645,6 +650,12 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream 
FD::Freeze(); // this means we can't see the feature names of not-weighted features } + if (conf.count("cmph_perfect_feature_hash")) { + cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as() << " ...\n"; + FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); + cerr << " " << FD::NumFeats() << " features in map\n"; + } + // set up translation back end if (formalism == "scfg") translator.reset(new SCFGTranslator(conf)); @@ -695,6 +706,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream unique_kbest = conf.count("unique_k_best"); get_oracle_forest = conf.count("get_oracle_forest"); oracle.show_derivation=conf.count("show_derivations"); + remove_intersected_rule_annotations = conf.count("remove_intersected_rule_annotations"); #ifdef FSA_RESCORING cfg_options.Validate(); @@ -1010,6 +1022,12 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { // if (!SILENT) cerr << " USING UNIFORM WEIGHTS\n"; // for (int i = 0; i < forest.edges_.size(); ++i) // forest.edges_[i].edge_prob_=prob_t::One(); } + if (remove_intersected_rule_annotations) { + for (unsigned i = 0; i < forest.edges_.size(); ++i) + if (forest.edges_[i].rule_ && + forest.edges_[i].rule_->parent_rule_) + forest.edges_[i].rule_ = forest.edges_[i].rule_->parent_rule_; + } forest.Reweight(last_weights); if (!SILENT) forest_stats(forest," Constr. forest",show_tree_structure,oracle.show_derivation); if (!SILENT) cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl; diff --git a/utils/Makefile.am b/utils/Makefile.am index 94f9be30..c50747bf 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,5 +1,5 @@ -noinst_PROGRAMS = ts -TESTS = ts +noinst_PROGRAMS = ts phmt +TESTS = ts phmt if HAVE_GTEST noinst_PROGRAMS += \ @@ -27,6 +27,11 @@ libutils_a_SOURCES = \ verbose.cc \ weights.cc +if HAVE_CMPH + libutils_a_SOURCES += perfect_hash.cc +endif + +phmt_SOURCES = phmt.cc ts_SOURCES = ts.cc dict_test_SOURCES = dict_test.cc dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) diff --git a/utils/fdict.cc b/utils/fdict.cc index baa0b552..676c951c 100644 --- a/utils/fdict.cc +++ b/utils/fdict.cc @@ -9,6 +9,10 @@ using namespace std; Dict FD::dict_; bool FD::frozen_ = false; +#ifdef HAVE_CMPH +PerfectHashFunction* FD::hash_ = NULL; +#endif + std::string FD::Convert(std::vector const& v) { return Convert(&*v.begin(),&*v.end()); } diff --git a/utils/fdict.h b/utils/fdict.h index f9673023..771e8b91 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -1,23 +1,56 @@ #ifndef _FDICT_H_ #define _FDICT_H_ +#include "config.h" + +#include #include #include #include "dict.h" +#ifdef HAVE_CMPH +#include "perfect_hash.h" +#include "string_to.h" +#endif + struct FD { // once the FD is frozen, new features not already in the // dictionary will return 0 static void Freeze() { frozen_ = true; } + static bool UsingPerfectHashFunction() { +#ifdef HAVE_CMPH + return hash_; +#else + return false; +#endif + } + static void EnableHash(const std::string& cmph_file) { +#ifdef HAVE_CMPH + hash_ = new PerfectHashFunction(cmph_file); +#endif + } static inline int NumFeats() { +#ifdef HAVE_CMPH + if (hash_) return hash_->number_of_keys(); +#endif return dict_.max() + 1; } static inline WordID Convert(const std::string& s) { +#ifdef HAVE_CMPH + if (hash_) return (*hash_)(s); +#endif return dict_.Convert(s, frozen_); } static inline const std::string& Convert(const WordID& w) { +#ifdef HAVE_CMPH + if (hash_) { + static std::string tls; + tls = to_string(w); + return tls; + } +#endif return 
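One property of the perfect-hash path above worth spelling out: cmph stores no key strings, so the mapping becomes one-way once a hash is loaded; string to id goes through cmph_search, but id to string can only return the id printed as text. A sketch, hash file name illustrative:

    FD::EnableHash("features.mphf");
    int id = FD::Convert("LexFE");           // cmph_search: name -> dense id
    const std::string& s = FD::Convert(id);  // yields to_string(id), *not* "LexFE"

This is also why Weights::WriteToFile below switches to a binary format whenever the perfect hash is active: a text weights file would need recoverable feature names.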
dict_.Convert(w); } static std::string Convert(WordID const *i,WordID const* e); @@ -29,6 +62,9 @@ struct FD { static Dict dict_; private: static bool frozen_; +#ifdef HAVE_CMPH + static PerfectHashFunction* hash_; +#endif }; #endif diff --git a/utils/perfect_hash.cc b/utils/perfect_hash.cc new file mode 100644 index 00000000..706e2741 --- /dev/null +++ b/utils/perfect_hash.cc @@ -0,0 +1,37 @@ +#include "config.h" + +#ifdef HAVE_CMPH + +#include "perfect_hash.h" + +#include +#include + +using namespace std; + +PerfectHashFunction::~PerfectHashFunction() { + cmph_destroy(mphf_); +} + +PerfectHashFunction::PerfectHashFunction(const string& fname) { + FILE* f = fopen(fname.c_str(), "r"); + if (!f) { + cerr << "Failed to open file " << fname << " for reading: cannot load hash function.\n"; + abort(); + } + mphf_ = cmph_load(f); + if (!mphf_) { + cerr << "cmph_load failed on " << fname << "!\n"; + abort(); + } +} + +size_t PerfectHashFunction::operator()(const string& key) const { + return cmph_search(mphf_, &key[0], key.size()); +} + +size_t PerfectHashFunction::number_of_keys() const { + return cmph_size(mphf_); +} + +#endif diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h new file mode 100644 index 00000000..8ac11f18 --- /dev/null +++ b/utils/perfect_hash.h @@ -0,0 +1,24 @@ +#ifndef _PERFECT_HASH_MAP_H_ +#define _PERFECT_HASH_MAP_H_ + +#include "config.h" + +#ifndef HAVE_CMPH +#error libcmph is required to use PerfectHashFunction +#endif + +#include +#include +#include "cmph.h" + +class PerfectHashFunction : boost::noncopyable { + public: + explicit PerfectHashFunction(const std::string& fname); + ~PerfectHashFunction(); + size_t operator()(const std::string& key) const; + size_t number_of_keys() const; + private: + cmph_t *mphf_; +}; + +#endif diff --git a/utils/phmt.cc b/utils/phmt.cc new file mode 100644 index 00000000..1f59afaf --- /dev/null +++ b/utils/phmt.cc @@ -0,0 +1,44 @@ +#include "config.h" + +#ifndef HAVE_CMPH +int main() { + return 0; +} +#else + +#include +#include "weights.h" +#include "fdict.h" + +using namespace std; + +int main(int argc, char** argv) { + if (argc != 2) { cerr << "Usage: " << argv[0] << " file.mphf\n"; return 1; } + FD::EnableHash(argv[1]); + cerr << "Number of keys: " << FD::NumFeats() << endl; + cerr << "LexFE = " << FD::Convert("LexFE") << endl; + cerr << "LexEF = " << FD::Convert("LexEF") << endl; + { + Weights w; + vector v(FD::NumFeats()); + v[FD::Convert("LexFE")] = 1.0; + v[FD::Convert("LexEF")] = 0.5; + w.InitFromVector(v); + cerr << "Writing...\n"; + w.WriteToFile("weights.bin"); + cerr << "Done.\n"; + } + { + Weights w; + vector v(FD::NumFeats()); + cerr << "Reading...\n"; + w.InitFromFile("weights.bin"); + cerr << "Done.\n"; + w.InitVector(&v); + assert(v[FD::Convert("LexFE")] == 1.0); + assert(v[FD::Convert("LexEF")] == 0.5); + } +} + +#endif + diff --git a/utils/weights.cc b/utils/weights.cc index b994a2fe..0916b72a 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -13,40 +13,75 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ ReadFile in_file(filename); istream& in = *in_file.stream(); assert(in); - int weight_count = 0; - bool fl = false; - string buf; - double val = 0; - while (in) { - getline(in, buf); - if (buf.size() == 0) continue; - if (buf[0] == '#') continue; - for (int i = 0; i < buf.size(); ++i) - if (buf[i] == '=') buf[i] = ' '; - int start = 0; - while(start < buf.size() && buf[start] == ' ') ++start; - int end = 0; - while(end < buf.size() && buf[end] != ' ') ++end; - const int fid = 
FD::Convert(buf.substr(start, end - start)); - while(end < buf.size() && buf[end] == ' ') ++end; - val = strtod(&buf.c_str()[end], NULL); - if (isnan(val)) { - cerr << FD::Convert(fid) << " has weight NaN!\n"; - abort(); + + bool read_text = true; + if (1) { + ReadFile hdrrf(filename); + istream& hi = *hdrrf.stream(); + assert(hi); + char buf[10]; + hi.get(buf, 6); + assert(hi.good()); + if (strncmp(buf, "_PHWf", 5) == 0) { + read_text = false; + } + } + + if (read_text) { + int weight_count = 0; + bool fl = false; + string buf; + weight_t val = 0; + while (in) { + getline(in, buf); + if (buf.size() == 0) continue; + if (buf[0] == '#') continue; + if (buf[0] == ' ') { + cerr << "Weights file lines may not start with whitespace.\n" << buf << endl; + abort(); + } + for (int i = buf.size() - 1; i > 0; --i) + if (buf[i] == '=' || buf[i] == '\t') { buf[i] = ' '; break; } + int start = 0; + while(start < buf.size() && buf[start] == ' ') ++start; + int end = 0; + while(end < buf.size() && buf[end] != ' ') ++end; + const int fid = FD::Convert(buf.substr(start, end - start)); + while(end < buf.size() && buf[end] == ' ') ++end; + val = strtod(&buf.c_str()[end], NULL); + if (isnan(val)) { + cerr << FD::Convert(fid) << " has weight NaN!\n"; + abort(); + } + if (wv_.size() <= fid) + wv_.resize(fid + 1); + wv_[fid] = val; + if (feature_list) { feature_list->push_back(FD::Convert(fid)); } + ++weight_count; + if (!SILENT) { + if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } + if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } + } } - if (wv_.size() <= fid) - wv_.resize(fid + 1); - wv_[fid] = val; - if (feature_list) { feature_list->push_back(FD::Convert(fid)); } - ++weight_count; if (!SILENT) { - if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } - if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } + if (fl) { cerr << endl; } + cerr << "Loaded " << weight_count << " feature weights\n"; + } + } else { // !read_text + char buf[6]; + in.get(buf, 6); + size_t num_keys[2]; + in.get(reinterpret_cast(&num_keys[0]), sizeof(size_t) + 1); + if (num_keys[0] != FD::NumFeats()) { + cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl; + abort(); + } + wv_.resize(num_keys[0]); + in.get(reinterpret_cast(&wv_[0]), num_keys[0] * sizeof(weight_t)); + if (!in.good()) { + cerr << "Error loading weights!\n"; + abort(); } - } - if (!SILENT) { - if (fl) { cerr << endl; } - cerr << "Loaded " << weight_count << " feature weights\n"; } } @@ -54,37 +89,48 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature WriteFile out(fname); ostream& o = *out.stream(); assert(o); - if (extra) { o << "# " << *extra << endl; } - o.precision(17); - const int num_feats = FD::NumFeats(); - for (int i = 1; i < num_feats; ++i) { - const double val = (i < wv_.size() ? wv_[i] : 0.0); - if (hide_zero_value_features && val == 0.0) continue; - o << FD::Convert(i) << ' ' << val << endl; + bool write_text = !FD::UsingPerfectHashFunction(); + + if (write_text) { + if (extra) { o << "# " << *extra << endl; } + o.precision(17); + const int num_feats = FD::NumFeats(); + for (int i = 1; i < num_feats; ++i) { + const weight_t val = (i < wv_.size() ? 
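The binary weights file this commit introduces has the layout

    bytes 0..4 : "_PHWf" magic
    next bytes : the feature count as a raw size_t (8 bytes on the usual 64-bit build)
    remainder  : count * sizeof(weight_t) raw values, indexed by feature id

One caution on the reader above: istream::get(char*, n) stops early at '\n' and NUL-terminates, so a 0x0A byte inside the count or a weight can truncate the load without tripping the in.good() check; istream::read() is the call that treats the bytes as opaque.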
wv_[i] : 0.0); + if (hide_zero_value_features && val == 0.0) continue; + o << FD::Convert(i) << ' ' << val << endl; + } + } else { + o.write("_PHWf", 5); + const size_t keys = FD::NumFeats(); + assert(keys <= wv_.size()); + o.write(reinterpret_cast(&keys), sizeof(keys)); + o.write(reinterpret_cast(&wv_[0]), keys * sizeof(weight_t)); } } -void Weights::InitVector(std::vector* w) const { +void Weights::InitVector(std::vector* w) const { *w = wv_; } -void Weights::InitSparseVector(SparseVector* w) const { +void Weights::InitSparseVector(SparseVector* w) const { for (int i = 1; i < wv_.size(); ++i) { - const double& weight = wv_[i]; + const weight_t& weight = wv_[i]; if (weight) w->set_value(i, weight); } } -void Weights::InitFromVector(const std::vector& w) { +void Weights::InitFromVector(const std::vector& w) { wv_ = w; if (wv_.size() > FD::NumFeats()) cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; wv_.resize(FD::NumFeats(), 0); } -void Weights::InitFromVector(const SparseVector& w) { +void Weights::InitFromVector(const SparseVector& w) { wv_.clear(); wv_.resize(FD::NumFeats(), 0.0); for (int i = 1; i < FD::NumFeats(); ++i) wv_[i] = w.value(i); } + diff --git a/utils/weights.h b/utils/weights.h index cc20283c..7664810b 100644 --- a/utils/weights.h +++ b/utils/weights.h @@ -2,21 +2,23 @@ #define _WEIGHTS_H_ #include -#include #include #include "sparse_vector.h" +// warning: in the future this will become float +typedef double weight_t; + class Weights { public: Weights() {} void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const; - void InitVector(std::vector* w) const; - void InitSparseVector(SparseVector* w) const; - void InitFromVector(const std::vector& w); - void InitFromVector(const SparseVector& w); + void InitVector(std::vector* w) const; + void InitSparseVector(SparseVector* w) const; + void InitFromVector(const std::vector& w); + void InitFromVector(const SparseVector& w); private: - std::vector wv_; + std::vector wv_; }; #endif -- cgit v1.2.3 From bb86637332d49f71c485df34576e464eaf053656 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 13 Sep 2011 17:36:23 +0100 Subject: get rid of bad Weights class so it no longer keeps a copy of a vector inside it --- decoder/decoder.cc | 64 ++++++++--------- decoder/decoder.h | 9 ++- mira/kbest_mira.cc | 62 ++++------------- pro-train/mr_pro_map.cc | 8 +-- pro-train/mr_pro_reduce.cc | 16 ++--- training/Makefile.am | 8 --- training/augment_grammar.cc | 4 +- training/collapse_weights.cc | 6 +- training/compute_cllh.cc | 23 +++--- training/grammar_convert.cc | 8 +-- training/mpi_batch_optimize.cc | 127 ++++++++-------------------------- training/mpi_online_optimize.cc | 69 +++++++----------- training/mr_optimize_reduce.cc | 19 ++--- utils/fdict.h | 2 + utils/phmt.cc | 8 +-- utils/weights.cc | 75 ++++++++++++-------- utils/weights.h | 22 +++--- vest/mr_vest_generate_mapper_input.cc | 6 +- 18 files changed, 201 insertions(+), 335 deletions(-) (limited to 'utils') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 25eb2de4..4d4b6245 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -159,8 +159,7 @@ struct RescoringPass { shared_ptr models; shared_ptr inter_conf; vector ffs; - shared_ptr w; // null == use previous weights - vector weight_vector; + shared_ptr > weight_vector; int fid_summary; // 0 == no summary feature double 
density_prune; // 0 == don't density prune double beam_prune; // 0 == don't beam prune @@ -169,7 +168,7 @@ struct RescoringPass { ostream& operator<<(ostream& os, const RescoringPass& rp) { os << "[num_fn=" << rp.ffs.size(); if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; } - if (rp.w) os << " new_weights"; + //if (rp.weight_vector.size() > 0) os << " new_weights"; if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary); if (rp.density_prune) os << " density_prune=" << rp.density_prune; if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune; @@ -181,13 +180,8 @@ struct DecoderImpl { DecoderImpl(po::variables_map& conf, int argc, char** argv, istream* cfg); ~DecoderImpl(); bool Decode(const string& input, DecoderObserver*); - void SetWeights(const vector& weights) { - init_weights = weights; - for (int i = 0; i < rescoring_passes.size(); ++i) { - if (rescoring_passes[i].models) - rescoring_passes[i].models->SetWeights(weights); - rescoring_passes[i].weight_vector = weights; - } + vector& CurrentWeightVector() { + return *rescoring_passes.back().weight_vector; } void SetId(int next_sent_id) { sent_id = next_sent_id - 1; } @@ -300,8 +294,7 @@ struct DecoderImpl { OracleBleu oracle; string formalism; shared_ptr translator; - Weights w_init_weights; // used with initial parse - vector init_weights; // weights used with initial parse + shared_ptr > init_weights; // weights used with initial parse vector > pffs; #ifdef FSA_RESCORING CFGOptions cfg_options; @@ -557,13 +550,18 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream exit(1); } - // load initial feature weights (and possibly freeze feature set) - if (conf.count("weights")) { - w_init_weights.InitFromFile(str("weights",conf)); - w_init_weights.InitVector(&init_weights); - init_weights.resize(FD::NumFeats()); + // load perfect hash function for features + if (conf.count("cmph_perfect_feature_hash")) { + cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as() << " ...\n"; + FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); + cerr << " " << FD::NumFeats() << " features in map\n"; } + // load initial feature weights (and possibly freeze feature set) + init_weights.reset(new vector); + if (conf.count("weights")) + Weights::InitFromFile(str("weights",conf), init_weights.get()); + // cube pruning pop-limit: we may want to configure this on a per-pass basis pop_limit = conf["cubepruning_pop_limit"].as(); @@ -582,9 +580,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream RescoringPass& rp = rescoring_passes.back(); // only configure new weights if pass > 0, otherwise we reuse the initial chart weights if (nth_pass_condition && conf.count(ws)) { - rp.w.reset(new Weights); - rp.w->InitFromFile(str(ws.c_str(), conf)); - rp.w->InitVector(&rp.weight_vector); + rp.weight_vector.reset(new vector()); + Weights::InitFromFile(str(ws.c_str(), conf), rp.weight_vector.get()); } bool has_stateful = false; if (conf.count(ff)) { @@ -624,11 +621,15 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } // set up weight vectors since later phases may reuse weights from earlier phases - const vector* prev = &init_weights; + shared_ptr > prev_weights = init_weights; for (int pass = 0; pass < rescoring_passes.size(); ++pass) { RescoringPass& rp = rescoring_passes[pass]; - if (!rp.w) { rp.weight_vector = *prev; } else { prev = &rp.weight_vector; } - rp.models.reset(new ModelSet(rp.weight_vector, 
rp.ffs)); + if (!rp.weight_vector) { + rp.weight_vector = prev_weights; + } else { + prev_weights = rp.weight_vector; + } + rp.models.reset(new ModelSet(*rp.weight_vector, rp.ffs)); string ps = "Pass1 "; ps[4] += pass; if (!SILENT) show_models(conf,*rp.models,ps.c_str()); } @@ -650,12 +651,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream FD::Freeze(); // this means we can't see the feature names of not-weighted features } - if (conf.count("cmph_perfect_feature_hash")) { - cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as() << " ...\n"; - FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); - cerr << " " << FD::NumFeats() << " features in map\n"; - } - // set up translation back end if (formalism == "scfg") translator.reset(new SCFGTranslator(conf)); @@ -685,7 +680,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } if (!fsa_ffs.empty()) { cerr<<"FSA: "; - show_all_features(fsa_ffs,init_weights,cerr,cerr,true,true); + show_all_features(fsa_ffs,*init_weights,cerr,cerr,true,true); } #endif @@ -733,7 +728,8 @@ bool Decoder::Decode(const string& input, DecoderObserver* o) { if (del) delete o; return res; } -void Decoder::SetWeights(const vector& weights) { pimpl_->SetWeights(weights); } +vector& Decoder::CurrentWeightVector() { return pimpl_->CurrentWeightVector(); } +const vector& Decoder::CurrentWeightVector() const { return pimpl_->CurrentWeightVector(); } void Decoder::SetSupplementalGrammar(const std::string& grammar_string) { assert(pimpl_->translator->GetDecoderType() == "SCFG"); static_cast(*pimpl_->translator).SetSupplementalGrammar(grammar_string); @@ -774,7 +770,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { translator->ProcessMarkupHints(smeta.sgml_); Timer t("Translation"); const bool translation_successful = - translator->Translate(to_translate, &smeta, init_weights, &forest); + translator->Translate(to_translate, &smeta, *init_weights, &forest); translator->SentenceComplete(); if (!translation_successful) { @@ -812,7 +808,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { for (int pass = 0; pass < rescoring_passes.size(); ++pass) { const RescoringPass& rp = rescoring_passes[pass]; - const vector& cur_weights = rp.weight_vector; + const vector& cur_weights = *rp.weight_vector; if (!SILENT) cerr << endl << " RESCORING PASS #" << (pass+1) << " " << rp << endl; #ifdef FSA_RESCORING cfg_options.maybe_output_source(forest); @@ -933,7 +929,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { #endif } - const vector& last_weights = (rescoring_passes.empty() ? init_weights : rescoring_passes.back().weight_vector); + const vector& last_weights = (rescoring_passes.empty() ? 
*init_weights : *rescoring_passes.back().weight_vector); // Oracle Rescoring if(get_oracle_forest) { diff --git a/decoder/decoder.h b/decoder/decoder.h index 5491369f..9d009ffa 100644 --- a/decoder/decoder.h +++ b/decoder/decoder.h @@ -7,6 +7,8 @@ #include #include +#include "weights.h" // weight_t + #undef CP_TIME //#define CP_TIME #ifdef CP_TIME @@ -39,7 +41,12 @@ struct Decoder { Decoder(int argc, char** argv); Decoder(std::istream* config_file); bool Decode(const std::string& input, DecoderObserver* observer = NULL); - void SetWeights(const std::vector& weights); + + // access this to either *read* or *write* to the decoder's last + // weight vector (i.e., the weights of the finest past) + std::vector& CurrentWeightVector(); + const std::vector& CurrentWeightVector() const; + void SetId(int id); ~Decoder(); const boost::program_options::variables_map& GetConf() const { return conf; } diff --git a/mira/kbest_mira.cc b/mira/kbest_mira.cc index 6918a9a1..459a5e6f 100644 --- a/mira/kbest_mira.cc +++ b/mira/kbest_mira.cc @@ -32,21 +32,6 @@ namespace po = boost::program_options; bool invert_score; boost::shared_ptr rng; -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - void RandomPermutation(int len, vector* p_ids) { vector& ids = *p_ids; ids.resize(len); @@ -58,21 +43,6 @@ void RandomPermutation(int len, vector* p_ids) { } } -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - --mid; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -209,14 +179,16 @@ int main(int argc, char** argv) { cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; return 1; } - // load initial weights - Weights weights; - weights.InitFromFile(conf["input_weights"].as()); - SparseVector lambdas; - weights.InitSparseVector(&lambdas); ReadFile ini_rf(conf["decoder_config"].as()); Decoder decoder(ini_rf.stream()); + + // load initial weights + vector& dense_weights = decoder.CurrentWeightVector(); + SparseVector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + const double max_step_size = conf["max_step_size"].as(); const double mt_metric_scale = conf["mt_metric_scale"].as(); @@ -230,7 +202,6 @@ int main(int argc, char** argv) { double tot_loss = 0; int dots = 0; int cur_pass = 0; - vector dense_weights; SparseVector tot; tot += lambdas; // initial weights normalizer++; // count for initial weights @@ -240,27 +211,22 @@ int main(int argc, char** argv) { vector order; RandomPermutation(corpus.size(), &order); while (lcount <= max_iteration) { - dense_weights.clear(); - weights.InitFromVector(lambdas); - weights.InitVector(&dense_weights); - decoder.SetWeights(dense_weights); + lambdas.init_vector(&dense_weights); if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; } if 
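The pattern this commit moves the trainers to, as kbest_mira.cc above now shows; a sketch with illustrative file and variable names:

    ReadFile ini_rf("cdec.ini");
    Decoder decoder(ini_rf.stream());
    vector<weight_t>& w = decoder.CurrentWeightVector();  // final-pass weights, by reference
    Weights::InitFromFile("weights.init", &w);            // fills the decoder's own storage
    // each update: write new weights in place, then decode; no per-iteration copies
    lambdas.init_vector(&w);
    decoder.Decode(input, &observer);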
(corpus.size() == cur_sent) { cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n"; - ShowLargestFeatures(dense_weights); + Weights::ShowLargestFeatures(dense_weights); cur_sent = 0; tot_loss = 0; dots = 0; ostringstream os; os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; - weights.WriteToFile(os.str(), true, &msg); SparseVector x = tot; x /= normalizer; ostringstream sa; sa << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz"; - Weights ww; - ww.InitFromVector(x); - ww.WriteToFile(sa.str(), true, &msga); + x.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); ++cur_pass; RandomPermutation(corpus.size(), &order); } @@ -294,11 +260,11 @@ int main(int argc, char** argv) { ++cur_sent; } cerr << endl; - weights.WriteToFile("weights.mira-final.gz", true, &msg); + Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg); tot /= normalizer; - weights.InitFromVector(tot); + tot.init_vector(dense_weights); msg = "# MIRA tuned weights (averaged vector)"; - weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); + Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg); cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; return 0; } diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 4324e8de..bc59285b 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -301,12 +301,8 @@ int main(int argc, char** argv) { const unsigned gamma = conf["candidate_pairs"].as(); const unsigned xi = conf["best_pairs"].as(); string weightsf = conf["weights"].as(); - vector weights; - { - Weights w; - w.InitFromFile(weightsf); - w.InitVector(&weights); - } + vector weights; + Weights::InitFromFile(weightsf, &weights); string kbest_repo = conf["kbest_repository"].as(); MkDirP(kbest_repo); while(in) { diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 9b422f33..9caaa1d1 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -194,7 +194,7 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); string line; vector > > training, testing; - SparseVector old_weights; + SparseVector old_weights; const bool tune_regularizer = conf.count("tune_regularizer"); if (tune_regularizer && !conf.count("testset")) { cerr << "--tune_regularizer requires --testset to be set\n"; @@ -210,9 +210,9 @@ int main(int argc, char** argv) { const double psi = conf["interpolation"].as(); if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } if (conf.count("weights")) { - Weights w; - w.InitFromFile(conf["weights"].as()); - w.InitSparseVector(&old_weights); + vector dt; + Weights::InitFromFile(conf["weights"].as(), &dt); + Weights::InitSparseVector(dt, &old_weights); } ReadCorpus(&cin, &training); if (conf.count("testset")) { @@ -220,8 +220,8 @@ int main(int argc, char** argv) { ReadCorpus(rf.stream(), &testing); } cerr << "Number of features: " << FD::NumFeats() << endl; - vector x(FD::NumFeats(), 0.0); // x[0] is bias - for (SparseVector::const_iterator it = old_weights.begin(); + vector x(FD::NumFeats(), 0.0); // x[0] is bias + for (SparseVector::const_iterator it = old_weights.begin(); it != old_weights.end(); ++it) x[it->first] = it->second; double tppl = 0.0; @@ -257,7 +257,6 @@ int main(int argc, char** argv) { sigsq = sp[best_i].first; tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); } - Weights w; if 
(conf.count("weights")) { for (int i = 1; i < x.size(); ++i) x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi); @@ -271,7 +270,6 @@ int main(int argc, char** argv) { cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; } } - w.InitFromVector(x); - w.WriteToFile("-"); + Weights::WriteToFile("-", x); return 0; } diff --git a/training/Makefile.am b/training/Makefile.am index e075e417..6e2c06f5 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -12,9 +12,7 @@ bin_PROGRAMS = \ cllh_filter_grammar \ mpi_online_optimize \ mpi_batch_optimize \ - mpi_em_optimize \ compute_cllh \ - feature_expectations \ augment_grammar noinst_PROGRAMS = \ @@ -29,12 +27,6 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -feature_expectations_SOURCES = feature_expectations.cc -feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc -mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - compute_cllh_SOURCES = compute_cllh.cc compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc index df8d4ee8..e89a92d5 100644 --- a/training/augment_grammar.cc +++ b/training/augment_grammar.cc @@ -134,9 +134,7 @@ int main(int argc, char** argv) { } else { ngram = NULL; } extra_feature = conf.count("extra_lex_feature") > 0; if (conf.count("collapse_weights")) { - Weights w; - w.InitFromFile(conf["collapse_weights"].as()); - w.InitVector(&col_weights); + Weights::InitFromFile(conf["collapse_weights"].as(), &col_weights); } clear_features = conf.count("clear_features_after_collapse") > 0; gather_rules = false; diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc index 4fb742fb..dc480f6c 100644 --- a/training/collapse_weights.cc +++ b/training/collapse_weights.cc @@ -59,10 +59,8 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); const string wfile = conf["weights"].as(); const string gfile = conf["grammar"].as(); - Weights wm; - wm.InitFromFile(wfile); - vector w; - wm.InitVector(&w); + vector w; + Weights::InitFromFile(wfile, &w); MarginalMap e_tots; MarginalMap f_tots; prob_t tot; diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc index 332f6d0c..b496d196 100644 --- a/training/compute_cllh.cc +++ b/training/compute_cllh.cc @@ -148,15 +148,6 @@ int main(int argc, char** argv) { if (!InitCommandLine(argc, argv, &conf)) return false; - // load initial weights - Weights weights; - if (conf.count("weights")) - weights.InitFromFile(conf["weights"].as()); - - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - // load cdec.ini and set up decoder ReadFile ini_rf(conf["decoder_config"].as()); Decoder decoder(ini_rf.stream()); @@ -165,17 +156,22 @@ int main(int argc, char** argv) { abort(); } + // load weights + vector& weights 
= decoder.CurrentWeightVector(); + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &weights); + + // freeze feature set + //const bool freeze_feature_set = conf.count("freeze_feature_set"); + //if (freeze_feature_set) FD::Freeze(); + vector corpus; vector ids; ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); assert(corpus.size() > 0); assert(corpus.size() == ids.size()); - vector wv; - weights.InitVector(&wv); - decoder.SetWeights(wv); TrainingObserver observer; double objective = 0; - bool converged = false; observer.Reset(); if (rank == 0) @@ -197,3 +193,4 @@ int main(int argc, char** argv) { return 0; } + diff --git a/training/grammar_convert.cc b/training/grammar_convert.cc index 8d292f8a..bf8abb26 100644 --- a/training/grammar_convert.cc +++ b/training/grammar_convert.cc @@ -251,12 +251,10 @@ int main(int argc, char **argv) { const bool is_split_input = (conf["format"].as() == "split"); const bool is_json_input = is_split_input || (conf["format"].as() == "json"); const bool collapse_weights = conf.count("collapse_weights"); - Weights wts; vector w; - if (conf.count("weights")) { - wts.InitFromFile(conf["weights"].as()); - wts.InitVector(&w); - } + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &w); + if (collapse_weights && !w.size()) { cerr << "--collapse_weights requires a weights file to be specified!\n"; exit(1); diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index 39a8af7d..cc5953f6 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -31,42 +31,12 @@ using namespace std; using boost::shared_ptr; namespace po = boost::program_options; -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("input_weights,w",po::value(),"Input feature weights file") ("training_data,t",po::value(),"Training data") ("decoder_config,d",po::value(),"Decoder configuration file") - ("sharded_input,s",po::value(), "Corpus and grammar files are 'sharded' so each processor loads its own input and grammar file. 
Argument is the directory containing the shards.") ("output_weights,o",po::value()->default_value("-"),"Output feature weights file") ("optimization_method,m", po::value()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)") ("correction_buffers,M", po::value()->default_value(10), "Number of gradients for LBFGS to maintain in memory") @@ -88,14 +58,10 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { } po::notify(*conf); - if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data") | conf->count("sharded_input")) || !conf->count("decoder_config")) { + if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) { cerr << dcmdline_options << endl; return false; } - if (conf->count("training_data") && conf->count("sharded_input")) { - cerr << "Cannot specify both --training_data and --sharded_input\n"; - return false; - } return true; } @@ -236,42 +202,9 @@ int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; - string shard_dir; - if (conf.count("sharded_input")) { - shard_dir = conf["sharded_input"].as(); - if (!DirectoryExists(shard_dir)) { - if (rank == 0) cerr << "Can't find shard directory: " << shard_dir << endl; - return 1; - } - if (rank == 0) - cerr << "Shard directory: " << shard_dir << endl; - } - - // load initial weights - Weights weights; - if (rank == 0) { cerr << "Loading weights...\n"; } - weights.InitFromFile(conf["input_weights"].as()); - if (rank == 0) { cerr << "Done loading weights.\n"; } - - // freeze feature set (should be optional?) - const bool freeze_feature_set = true; - if (freeze_feature_set) FD::Freeze(); - // load cdec.ini and set up decoder vector cdec_ini; ReadConfig(conf["decoder_config"].as(), &cdec_ini); - if (shard_dir.size()) { - if (rank == 0) { - for (int i = 0; i < cdec_ini.size(); ++i) { - if (cdec_ini[i].find("grammar=") == 0) { - cerr << "!!! using sharded input and " << conf["decoder_config"].as() << " contains a grammar specification:\n" << cdec_ini[i] << "\n VERIFY THAT THIS IS CORRECT!\n"; - } - } - } - ostringstream g; - g << "grammar=" << shard_dir << "/grammar." << rank << "_of_" << size << ".gz"; - cdec_ini.push_back(g.str()); - } istringstream ini; StoreConfig(cdec_ini, &ini); if (rank == 0) cerr << "Loading grammar...\n"; @@ -282,22 +215,28 @@ int main(int argc, char** argv) { } if (rank == 0) cerr << "Done loading grammar!\n"; + // load initial weights + if (rank == 0) { cerr << "Loading weights...\n"; } + vector& lambdas = decoder->CurrentWeightVector(); + Weights::InitFromFile(conf["input_weights"].as(), &lambdas); + if (rank == 0) { cerr << "Done loading weights.\n"; } + + // freeze feature set (should be optional?) 
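// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this patch.] The lines
// above show the core API change of this commit series: the old stateful
// Weights object is gone, and trainers now write directly into the vector
// owned by the decoder. Decoder, DecoderObserver, ReadFile, Weights, and
// weight_t are real names from this tree; the file names, the sentence, and
// the observer are hypothetical.
//
//   void ExampleLoadAndDecode(DecoderObserver* observer) {
//     ReadFile ini_rf("cdec.ini");
//     Decoder decoder(ini_rf.stream());
//     vector<weight_t>& lambdas = decoder.CurrentWeightVector();
//     Weights::InitFromFile("weights.init", &lambdas); // fills lambdas in place
//     decoder.Decode("source sentence", observer);     // no SetWeights() needed
//   }
// ---------------------------------------------------------------------------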
+ const bool freeze_feature_set = true; + if (freeze_feature_set) FD::Freeze(); + const int num_feats = FD::NumFeats(); if (rank == 0) cerr << "Number of features: " << num_feats << endl; + lambdas.resize(num_feats); + const bool gaussian_prior = conf.count("gaussian_prior"); - vector means(num_feats, 0); + vector means(num_feats, 0); if (conf.count("means")) { if (!gaussian_prior) { cerr << "Don't use --means without --gaussian_prior!\n"; exit(1); } - Weights wm; - wm.InitFromFile(conf["means"].as()); - if (num_feats != FD::NumFeats()) { - cerr << "[ERROR] Means file had unexpected features!\n"; - exit(1); - } - wm.InitVector(&means); + Weights::InitFromFile(conf["means"].as(), &means); } shared_ptr o; if (rank == 0) { @@ -309,26 +248,13 @@ int main(int argc, char** argv) { cerr << "Optimizer: " << o->Name() << endl; } double objective = 0; - vector lambdas(num_feats, 0.0); - weights.InitVector(&lambdas); - if (lambdas.size() != num_feats) { - cerr << "Initial weights file did not have all features specified!\n feats=" - << num_feats << "\n weights file=" << lambdas.size() << endl; - lambdas.resize(num_feats, 0.0); - } vector gradient(num_feats, 0.0); - vector rcv_grad(num_feats, 0.0); + vector rcv_grad; + rcv_grad.clear(); bool converged = false; vector corpus; - if (shard_dir.size()) { - ostringstream os; os << shard_dir << "/corpus." << rank << "_of_" << size; - ReadTrainingCorpus(os.str(), 0, 1, &corpus); - cerr << os.str() << " has " << corpus.size() << " training examples. " << endl; - if (corpus.size() > 500) { corpus.resize(500); cerr << " TRUNCATING\n"; } - } else { - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); - } + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); TrainingObserver observer; @@ -341,19 +267,20 @@ int main(int argc, char** argv) { if (rank == 0) { cerr << "Starting decoding... 
(~" << corpus.size() << " sentences / proc)\n"; } - decoder->SetWeights(lambdas); for (int i = 0; i < corpus.size(); ++i) decoder->Decode(corpus[i], &observer); cerr << " process " << rank << '/' << size << " done\n"; fill(gradient.begin(), gradient.end(), 0); - fill(rcv_grad.begin(), rcv_grad.end(), 0); observer.SetLocalGradientAndObjective(&gradient, &objective); double to = 0; #ifdef HAVE_MPI + rcv_grad.resize(num_feats, 0.0); mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus(), 0); - mpi::reduce(world, objective, to, plus(), 0); swap(gradient, rcv_grad); + rcv_grad.clear(); + + mpi::reduce(world, objective, to, plus(), 0); objective = to; #endif @@ -378,7 +305,7 @@ int main(int argc, char** argv) { for (int i = 0; i < gradient.size(); ++i) gnorm += gradient[i] * gradient[i]; cerr << " GNORM=" << sqrt(gnorm) << endl; - vector old = lambdas; + vector old = lambdas; int c = 0; while (old == lambdas) { ++c; @@ -387,9 +314,8 @@ int main(int argc, char** argv) { assert(c < 5); } old.clear(); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); - weights.InitFromVector(lambdas); + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); converged = o->HasConverged(); if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } @@ -399,7 +325,7 @@ int main(int argc, char** argv) { ostringstream vv; vv << "Objective = " << objective << " (eval count=" << o->EvaluationCount() << ")"; const string svv = vv.str(); - weights.WriteToFile(fname, true, &svv); + Weights::WriteToFile(fname, lambdas, true, &svv); } // rank == 0 int cint = converged; #ifdef HAVE_MPI @@ -411,3 +337,4 @@ int main(int argc, char** argv) { } return 0; } + diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 32033c19..2ef4a2e7 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -31,35 +31,6 @@ namespace mpi = boost::mpi; using namespace std; namespace po = boost::program_options; -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 
10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -250,10 +221,25 @@ int main(int argc, char** argv) { if (!InitCommandLine(argc, argv, &conf)) return 1; + vector > agenda; + if (!LoadAgenda(conf["training_agenda"].as(), &agenda)) + return 1; + if (rank == 0) + cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n"; + + assert(agenda.size() > 0); + + if (1) { // hack to load the feature hash functions -- TODO this should not be in cdec.ini + const string& cur_config = agenda[0].first; + const unsigned max_iteration = agenda[0].second; + ReadFile ini_rf(cur_config); + Decoder decoder(ini_rf.stream()); + } + // load initial weights - Weights weights; + vector init_weights; if (conf.count("input_weights")) - weights.InitFromFile(conf["input_weights"].as()); + Weights::InitFromFile(conf["input_weights"].as(), &init_weights); vector frozen_fids; if (conf.count("frozen_features")) { @@ -310,19 +296,12 @@ int main(int argc, char** argv) { rng.reset(new MT19937); SparseVector x; - weights.InitSparseVector(&x); + Weights::InitSparseVector(init_weights, &x); TrainingObserver observer; int write_weights_every_ith = 100; // TODO configure int titer = -1; - vector > agenda; - if (!LoadAgenda(conf["training_agenda"].as(), &agenda)) - return 1; - if (rank == 0) - cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n"; - - vector lambdas; for (int ai = 0; ai < agenda.size(); ++ai) { const string& cur_config = agenda[ai].first; const unsigned max_iteration = agenda[ai].second; @@ -331,6 +310,8 @@ int main(int argc, char** argv) { // load cdec.ini and set up decoder ReadFile ini_rf(cur_config); Decoder decoder(ini_rf.stream()); + vector& lambdas = decoder.CurrentWeightVector(); + if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); } if (rank == 0) o->ResetEpoch(); // resets the learning rate-- TODO is this good? @@ -341,15 +322,13 @@ int main(int argc, char** argv) { #ifdef HAVE_MPI mpi::timer timer; #endif - weights.InitFromVector(x); - weights.InitVector(&lambdas); + x.init_vector(&lambdas); ++iter; ++titer; observer.Reset(); - decoder.SetWeights(lambdas); if (rank == 0) { converged = (iter == max_iteration); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); string fname = "weights.cur.gz"; if (iter % write_weights_every_ith == 0) { ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz"; @@ -360,7 +339,7 @@ int main(int argc, char** argv) { vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. 
num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast(corpus.size())) << " eta=" << lr->eta(titer); const string svv = vv.str(); cerr << svv << endl; - weights.WriteToFile(fname, true, &svv); + Weights::WriteToFile(fname, lambdas, true, &svv); } for (int i = 0; i < size_per_proc; ++i) { diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc index b931991d..15e28fa1 100644 --- a/training/mr_optimize_reduce.cc +++ b/training/mr_optimize_reduce.cc @@ -88,25 +88,19 @@ int main(int argc, char** argv) { const bool use_b64 = conf["input_format"].as() == "b64"; - Weights weights; - weights.InitFromFile(conf["input_weights"].as()); + vector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &lambdas); const string s_obj = "**OBJ**"; int num_feats = FD::NumFeats(); cerr << "Number of features: " << num_feats << endl; const bool gaussian_prior = conf.count("gaussian_prior"); - vector means(num_feats, 0); + vector means(num_feats, 0); if (conf.count("means")) { if (!gaussian_prior) { cerr << "Don't use --means without --gaussian_prior!\n"; exit(1); } - Weights wm; - wm.InitFromFile(conf["means"].as()); - if (num_feats != FD::NumFeats()) { - cerr << "[ERROR] Means file had unexpected features!\n"; - exit(1); - } - wm.InitVector(&means); + Weights::InitFromFile(conf["means"].as(), &means); } shared_ptr o; const string omethod = conf["optimization_method"].as(); @@ -124,8 +118,6 @@ int main(int argc, char** argv) { cerr << "No state file found, assuming ITERATION 1\n"; } - vector lambdas(num_feats, 0); - weights.InitVector(&lambdas); double objective = 0; vector gradient(num_feats, 0); // 0**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; @@ -223,8 +215,7 @@ int main(int argc, char** argv) { old.clear(); SanityCheck(lambdas); ShowLargestFeatures(lambdas); - weights.InitFromVector(lambdas); - weights.WriteToFile(conf["output_weights"].as(), false); + Weights::WriteToFile(conf["output_weights"].as(), lambdas, false); const bool conv = o->HasConverged(); if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } diff --git a/utils/fdict.h b/utils/fdict.h index 771e8b91..f0871b9a 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -28,6 +28,8 @@ struct FD { } static void EnableHash(const std::string& cmph_file) { #ifdef HAVE_CMPH + assert(dict_.max() == 0); // dictionary must not have + // been added to hash_ = new PerfectHashFunction(cmph_file); #endif } diff --git a/utils/phmt.cc b/utils/phmt.cc index 1f59afaf..48d9f093 100644 --- a/utils/phmt.cc +++ b/utils/phmt.cc @@ -19,22 +19,18 @@ int main(int argc, char** argv) { cerr << "LexFE = " << FD::Convert("LexFE") << endl; cerr << "LexEF = " << FD::Convert("LexEF") << endl; { - Weights w; vector v(FD::NumFeats()); v[FD::Convert("LexFE")] = 1.0; v[FD::Convert("LexEF")] = 0.5; - w.InitFromVector(v); cerr << "Writing...\n"; - w.WriteToFile("weights.bin"); + Weights::WriteToFile("weights.bin", v); cerr << "Done.\n"; } { - Weights w; vector v(FD::NumFeats()); cerr << "Reading...\n"; - w.InitFromFile("weights.bin"); + Weights::InitFromFile("weights.bin", &v); cerr << "Done.\n"; - w.InitVector(&v); assert(v[FD::Convert("LexFE")] == 1.0); assert(v[FD::Convert("LexEF")] == 0.5); } diff --git a/utils/weights.cc b/utils/weights.cc index 0916b72a..c49000be 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -8,7 +8,10 @@ using namespace std; -void Weights::InitFromFile(const std::string& filename, vector* feature_list) { +void Weights::InitFromFile(const string& filename, + vector* pweights, + 
vector* feature_list) { + vector& weights = *pweights; if (!SILENT) cerr << "Reading weights from " << filename << endl; ReadFile in_file(filename); istream& in = *in_file.stream(); @@ -47,16 +50,16 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ int end = 0; while(end < buf.size() && buf[end] != ' ') ++end; const int fid = FD::Convert(buf.substr(start, end - start)); + if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); } while(end < buf.size() && buf[end] == ' ') ++end; val = strtod(&buf.c_str()[end], NULL); if (isnan(val)) { cerr << FD::Convert(fid) << " has weight NaN!\n"; abort(); } - if (wv_.size() <= fid) - wv_.resize(fid + 1); - wv_[fid] = val; - if (feature_list) { feature_list->push_back(FD::Convert(fid)); } + if (weights.size() <= fid) + weights.resize(fid + 1); + weights[fid] = val; ++weight_count; if (!SILENT) { if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } @@ -76,8 +79,8 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl; abort(); } - wv_.resize(num_keys[0]); - in.get(reinterpret_cast(&wv_[0]), num_keys[0] * sizeof(weight_t)); + weights.resize(num_keys[0]); + in.get(reinterpret_cast(&weights[0]), num_keys[0] * sizeof(weight_t)); if (!in.good()) { cerr << "Error loading weights!\n"; abort(); @@ -85,7 +88,10 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ } } -void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const { +void Weights::WriteToFile(const string& fname, + const vector& weights, + bool hide_zero_value_features, + const string* extra) { WriteFile out(fname); ostream& o = *out.stream(); assert(o); @@ -96,41 +102,54 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature o.precision(17); const int num_feats = FD::NumFeats(); for (int i = 1; i < num_feats; ++i) { - const weight_t val = (i < wv_.size() ? wv_[i] : 0.0); + const weight_t val = (i < weights.size() ? 
weights[i] : 0.0); if (hide_zero_value_features && val == 0.0) continue; o << FD::Convert(i) << ' ' << val << endl; } } else { o.write("_PHWf", 5); const size_t keys = FD::NumFeats(); - assert(keys <= wv_.size()); + assert(keys <= weights.size()); o.write(reinterpret_cast(&keys), sizeof(keys)); - o.write(reinterpret_cast(&wv_[0]), keys * sizeof(weight_t)); + o.write(reinterpret_cast(&weights[0]), keys * sizeof(weight_t)); } } -void Weights::InitVector(std::vector* w) const { - *w = wv_; +void Weights::InitSparseVector(const vector& dv, + SparseVector* sv) { + sv->clear(); + for (unsigned i = 1; i < dv.size(); ++i) { + if (dv[i]) sv->set_value(i, dv[i]); + } } -void Weights::InitSparseVector(SparseVector* w) const { - for (int i = 1; i < wv_.size(); ++i) { - const weight_t& weight = wv_[i]; - if (weight) w->set_value(i, weight); +void Weights::SanityCheck(const vector& w) { + for (int i = 0; i < w.size(); ++i) { + assert(!isnan(w[i])); + assert(!isinf(w[i])); } } -void Weights::InitFromVector(const std::vector& w) { - wv_ = w; - if (wv_.size() > FD::NumFeats()) - cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; - wv_.resize(FD::NumFeats(), 0); -} +struct FComp { + const vector& w_; + FComp(const vector& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; -void Weights::InitFromVector(const SparseVector& w) { - wv_.clear(); - wv_.resize(FD::NumFeats(), 0.0); - for (int i = 1; i < FD::NumFeats(); ++i) - wv_[i] = w.value(i); +void Weights::ShowLargestFeatures(const vector& w) { + vector fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + vector::iterator mid = fnums.begin(); + mid += (w.size() > 10 ? 10 : w.size()); + partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); + cerr << "TOP FEATURES:"; + for (vector::iterator i = fnums.begin(); i != mid; ++i) { + cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; + } + cerr << endl; } + diff --git a/utils/weights.h b/utils/weights.h index 7664810b..30f71db0 100644 --- a/utils/weights.h +++ b/utils/weights.h @@ -10,15 +10,21 @@ typedef double weight_t; class Weights { public: - Weights() {} - void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); - void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const; - void InitVector(std::vector* w) const; - void InitSparseVector(SparseVector* w) const; - void InitFromVector(const std::vector& w); - void InitFromVector(const SparseVector& w); + static void InitFromFile(const std::string& fname, + std::vector* weights, + std::vector* feature_list = NULL); + static void WriteToFile(const std::string& fname, + const std::vector& weights, + bool hide_zero_value_features = true, + const std::string* extra = NULL); + static void InitSparseVector(const std::vector& dv, + SparseVector* sv); + // check for infinities, NaNs, etc + static void SanityCheck(const std::vector& w); + // write weights with largest magnitude to cerr + static void ShowLargestFeatures(const std::vector& w); private: - std::vector wv_; + Weights(); }; #endif diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index b84c44bc..0c094fd5 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -223,16 +223,16 @@ struct oracle_directions { cerr << "Forest repo: " << forest_repository << endl; assert(DirectoryExists(forest_repository)); vector features; - 
weights.InitFromFile(weights_file, &features); + vector dorigin; + Weights::InitFromFile(weights_file, &dorigin, &features); if (optimize_features.size()) features=optimize_features; - weights.InitSparseVector(&origin); + Weights::InitSparseVector(dorigin, &origin); fids.clear(); AddFeatureIds(features); oracles.resize(dev_set_size); } - Weights weights; void AddFeatureIds(vector const& features) { int i = fids.size(); fids.resize(fids.size()+features.size()); -- cgit v1.2.3 From deb054ab7fb92e18fb1e3a7adb96577ad8e91a86 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 13 Sep 2011 18:33:08 +0100 Subject: fix weight serialization bug --- utils/weights.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'utils') diff --git a/utils/weights.cc b/utils/weights.cc index c49000be..ac407dfb 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -23,7 +23,7 @@ void Weights::InitFromFile(const string& filename, istream& hi = *hdrrf.stream(); assert(hi); char buf[10]; - hi.get(buf, 6); + hi.read(buf, 5); assert(hi.good()); if (strncmp(buf, "_PHWf", 5) == 0) { read_text = false; @@ -72,18 +72,20 @@ void Weights::InitFromFile(const string& filename, } } else { // !read_text char buf[6]; - in.get(buf, 6); - size_t num_keys[2]; - in.get(reinterpret_cast(&num_keys[0]), sizeof(size_t) + 1); - if (num_keys[0] != FD::NumFeats()) { - cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl; + in.read(buf, 5); + size_t num_keys; + in.read(reinterpret_cast(&num_keys), sizeof(size_t)); + if (num_keys != FD::NumFeats()) { + cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys << endl; abort(); } - weights.resize(num_keys[0]); - in.get(reinterpret_cast(&weights[0]), num_keys[0] * sizeof(weight_t)); + weights.resize(num_keys); + in.read(reinterpret_cast(&weights.front()), num_keys * sizeof(weight_t)); if (!in.good()) { cerr << "Error loading weights!\n"; abort(); + } else { + cerr << " Successfully loaded " << (num_keys * sizeof(weight_t)) << " bytes\n"; } } } -- cgit v1.2.3 From a882bb5879638329d7c1cd5486f47bb3cee0d198 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 13 Sep 2011 20:16:17 +0100 Subject: tool to reconstruct text weights from a hash function, key file, and (binary) weights file --- utils/Makefile.am | 5 ++++ utils/reconstruct_weights.cc | 68 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 utils/reconstruct_weights.cc (limited to 'utils') diff --git a/utils/Makefile.am b/utils/Makefile.am index c50747bf..df667655 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,3 +1,6 @@ + +bin_PROGRAMS = reconstruct_weights + noinst_PROGRAMS = ts phmt TESTS = ts phmt @@ -11,6 +14,8 @@ noinst_PROGRAMS += \ TESTS += small_vector_test logval_test weights_test dict_test endif +reconstruct_weights_SOURCES = reconstruct_weights.cc + noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ diff --git a/utils/reconstruct_weights.cc b/utils/reconstruct_weights.cc new file mode 100644 index 00000000..d32e4f67 --- /dev/null +++ b/utils/reconstruct_weights.cc @@ -0,0 +1,68 @@ +#include +#include +#include + +#include +#include + +#include "filelib.h" +#include "fdict.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + 
("weights,w",po::value(),"Input feature weights file") + ("keys,k",po::value(),"Keys file (list of features with dummy value at start)") + ("cmph_perfect_hash_file,h",po::value(),"cmph perfect hash function file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,?", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("cmph_perfect_hash_file") || !conf->count("weights") || !conf->count("keys")) { + cerr << "Generate a text format weights file. Options -w -k and -h are required.\n"; + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +int main(int argc, char** argv) { + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return false; + + FD::EnableHash(conf["cmph_perfect_hash_file"].as()); + + // load weights + vector weights; + Weights::InitFromFile(conf["weights"].as(), &weights); + + ReadFile rf(conf["keys"].as()); + istream& in = *rf.stream(); + string key; + size_t lc = 0; + while(getline(in, key)) { + ++lc; + if (lc == 1) continue; + assert(lc <= weights.size()); + cout << key << " " << weights[lc - 1] << endl; + } + + return 0; +} + -- cgit v1.2.3 From 33a891dbaf7c77e822f4692c9ac4056f5b29e88c Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Thu, 15 Sep 2011 12:52:59 +0100 Subject: script to filter reachable sentences, weight cleanup --- decoder/apply_models.cc | 3 +- decoder/hg.h | 8 +- training/Makefile.am | 10 +- training/cllh_filter_grammar.cc | 197 -------------------------------------- training/mpi_extract_reachable.cc | 163 +++++++++++++++++++++++++++++++ utils/feature_vector.h | 4 +- 6 files changed, 174 insertions(+), 211 deletions(-) delete mode 100644 training/cllh_filter_grammar.cc create mode 100644 training/mpi_extract_reachable.cc (limited to 'utils') diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 26cdb881..40fd27e4 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -276,8 +276,7 @@ public: make_heap(cand.begin(), cand.end(), HeapCandCompare()); State2Node state2node; // "buf" in Figure 2 int pops = 0; - int pop_limit_eff=max(1,int(v.promise*pop_limit_)); - while(!cand.empty() && pops < pop_limit_eff) { + while(!cand.empty() && pops < pop_limit_) { pop_heap(cand.begin(), cand.end(), HeapCandCompare()); Candidate* item = cand.back(); cand.pop_back(); diff --git a/decoder/hg.h b/decoder/hg.h index e5ef05f8..f0ddbb76 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -49,16 +49,14 @@ public: // TODO get rid of cat_? // TODO keep cat_ and add span and/or state? :) struct Node { - Node() : id_(), cat_(), promise(1) {} + Node() : id_(), cat_() {} int id_; // equal to this object's position in the nodes_ vector WordID cat_; // non-terminal category if <0, 0 if not set WordID NT() const { return -cat_; } EdgesVector in_edges_; // an in edge is an edge with this node as its head. (in edges come from the bottom up to us) indices in edges_ EdgesVector out_edges_; // an out edge is an edge with this node as its tail. (out edges leave us up toward the top/goal). 
indices in edges_ - double promise; // set in global pruning; in [0,infty) so that mean is 1. use: e.g. scale cube poplimit. //TODO: appears to be useless, compile without this? on the other hand, pretty cheap. void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting cat_=o.cat_; - promise=o.promise; } void copy_reindex(Node const& o,indices_after const& n2,indices_after const& e2) { copy_fixed(o); @@ -81,7 +79,7 @@ public: int head_node_; // refers to a position in nodes_ TailNodeVector tail_nodes_; // contents refer to positions in nodes_ TRulePtr rule_; - FeatureVector feature_values_; + SparseVector feature_values_; prob_t edge_prob_; // dot product of weights and feat_values int id_; // equal to this object's position in the edges_ vector @@ -468,7 +466,7 @@ public: /// drop edge i if edge_margin[i] < prune_below, unless preserve_mask[i] void MarginPrune(EdgeProbs const& edge_margin,prob_t prune_below,EdgeMask const* preserve_mask=0,bool safe_inside=false,bool verbose=false); - //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another. at least make that an option. i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs. this would also make me more comfortable about allocating Node.promise + //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another. at least make that an option. i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs. 
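// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this patch.] The TODO
// above contrasts two pruning tests. The beam_alpha rule described below uses
// an absolute difference of log scores, which changes when every weight is
// scaled by a constant c (all log scores scale by c); a ratio of log scores
// cancels c and is scale-invariant. Hypothetical predicates (names made up):
//
//   bool drop_abs   = (log(best) - log(edge_margin[i])) > beam_alpha;  // current rule
//   bool drop_ratio = (log(edge_margin[i]) / log(best)) > ratio_limit; // TODO's idea
// ---------------------------------------------------------------------------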
// beam_alpha=0 means don't beam prune, otherwise drop things that are e^beam_alpha times worse than best - // prunes any edge whose prob_t on the best path taking that edge is more than e^alpha times //density=0 means don't density prune: // for density>=1.0, keep this many times the edges needed for the 1best derivation diff --git a/training/Makefile.am b/training/Makefile.am index 7ceeda34..5752859e 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -9,9 +9,9 @@ bin_PROGRAMS = \ atools \ plftools \ collapse_weights \ - cllh_filter_grammar \ - mpi_online_optimize \ + mpi_extract_reachable \ mpi_extract_features \ + mpi_online_optimize \ mpi_batch_optimize \ compute_cllh \ augment_grammar @@ -25,6 +25,9 @@ TESTS = lbfgs_test optimize_test mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc +mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + mpi_extract_features_SOURCES = mpi_extract_features.cc mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz @@ -34,9 +37,6 @@ mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/ compute_cllh_SOURCES = compute_cllh.cc compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -cllh_filter_grammar_SOURCES = cllh_filter_grammar.cc -cllh_filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - augment_grammar_SOURCES = augment_grammar.cc augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/cllh_filter_grammar.cc b/training/cllh_filter_grammar.cc deleted file mode 100644 index 6998ec2b..00000000 --- a/training/cllh_filter_grammar.cc +++ /dev/null @@ -1,197 +0,0 @@ -#include -#include -#include -#include // fork -#include // waitpid - -#include -#include - -#include "tdict.h" -#include "ff_register.h" -#include "verbose.h" -#include "hg.h" -#include "decoder.h" -#include "filelib.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("training_data,t",po::value(),"Training data corpus") - ("decoder_config,c",po::value(),"Decoder configuration file") - ("shards,s",po::value()->default_value(1),"Number of shards") - ("starting_shard,S",po::value()->default_value(0), "In this invocation only process shards >= S") - ("work_limit,l",po::value()->default_value(9999), "Process maximially this many shards") - ("ncpus,C",po::value()->default_value(1),"Number of CPUs to use"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - 
dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - assert(size > 0); - assert(rank < size); - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } - ++lc; - } -} - -struct TrainingObserver : public DecoderObserver { - TrainingObserver() : s_lhs(-TD::Convert("S")), goal_lhs(-TD::Convert("Goal")) {} - - void Reset() { - total_complete = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - state = 1; - used.clear(); - failed = true; - } - - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - for (int i = 0; i < hg->edges_.size(); ++i) { - const TRule* rule = hg->edges_[i].rule_.get(); - if (rule->lhs_ == s_lhs || rule->lhs_ == goal_lhs) // fragile hack to filter out glue rules - continue; - used.insert(rule); - } - state = 2; - } - - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - if (state == 3) { - failed = false; - } else { - failed = true; - } - } - - set used; - - const int s_lhs; - const int goal_lhs; - bool failed; - int total_complete; - int state; -}; - -void work(const string& fname, int rank, int size, Decoder* decoder) { - cerr << "Worker " << rank << '/' << size << " starting.\n"; - vector corpus; - vector ids; - ReadTrainingCorpus(fname, rank, size, &corpus, &ids); - assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - cerr << " " << rank << '/' << size << ": has " << corpus.size() << " sentences to process\n"; - ostringstream oc; oc << "corpus." << rank << "_of_" << size; - WriteFile foc(oc.str()); - ostringstream og; og << "grammar." 
<< rank << "_of_" << size << ".gz"; - WriteFile fog(og.str()); - - set all_used; - TrainingObserver observer; - for (int i = 0; i < corpus.size(); ++i) { - const int sent_id = ids[i]; - const string& input = corpus[i]; - decoder->SetId(sent_id); - decoder->Decode(input, &observer); - if (observer.failed) { - // do nothing - } else { - (*foc.stream()) << input << endl; - for (set::iterator it = observer.used.begin(); it != observer.used.end(); ++it) { - if (all_used.insert(*it).second) - (*fog.stream()) << **it << endl; - } - } - } -} - -int main(int argc, char** argv) { - register_feature_functions(); - - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string fname = conf["training_data"].as(); - const unsigned ncpus = conf["ncpus"].as(); - const unsigned shards = conf["shards"].as(); - const unsigned start = conf["starting_shard"].as(); - const unsigned work_limit = conf["work_limit"].as(); - const unsigned eff_shards = min(start + work_limit, shards); - cerr << "Processing shards " << start << "/" << shards << " to " << eff_shards << "/" << shards << endl; - assert(ncpus > 0); - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - if (decoder.GetConf()["input"].as() != "-") { - cerr << "cdec.ini must not set an input file\n"; - abort(); - } - SetSilent(true); // turn off verbose decoder output - cerr << "Forking " << ncpus << " time(s)\n"; - vector children; - for (int i = 0; i < ncpus; ++i) { - pid_t pid = fork(); - if (pid < 0) { - cerr << "Fork failed!\n"; - exit(1); - } - if (pid > 0) { - children.push_back(pid); - } else { - for (int j = start; j < eff_shards; ++j) { - if (j % ncpus == i) { - cerr << " CPU " << i << " processing shard " << j << endl; - work(fname, j, shards, &decoder); - cerr << " Shard " << j << "/" << shards << " finished.\n"; - } - } - _exit(0); - } - } - for (int i = 0; i < children.size(); ++i) { - int status; - int w = waitpid(children[i], &status, 0); - if (w < 0) { cerr << "Error while waiting for children!"; return 1; } - if (WIFSIGNALED(status)) { - cerr << "Child " << i << " received signal " << WTERMSIG(status) << endl; - if (WTERMSIG(status) == 11) { cerr << " this is a SEGV- you may be trying to print temporarily created rules\n"; } - } - } - return 0; -} diff --git a/training/mpi_extract_reachable.cc b/training/mpi_extract_reachable.cc new file mode 100644 index 00000000..2a7c2b9d --- /dev/null +++ b/training/mpi_extract_reachable.cc @@ -0,0 +1,163 @@ +#include +#include +#include +#include + +#include "config.h" +#ifdef HAVE_MPI +#include +#endif +#include +#include + +#include "ff_register.h" +#include "verbose.h" +#include "filelib.h" +#include "fdict.h" +#include "decoder.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("training_data,t",po::value(),"Training data corpus") + ("decoder_config,c",po::value(),"Decoder configuration file") + ("weights,w", po::value(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations") + ("output_prefix,o",po::value()->default_value("reachable"),"Output path prefix"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + 
dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { + cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the inputs that produce reachable parallel parses.\n"; + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int lc = 0; + while(in) { + getline(in, line); + if (!in) break; + if (lc % size == rank) c->push_back(line); + ++lc; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct ReachabilityObserver : public DecoderObserver { + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + reachable = false; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + reachable = true; + } + + bool reachable; +}; + +#ifdef HAVE_MPI +namespace mpi = boost::mpi; +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return false; + + // load cdec.ini and set up decoder + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + abort(); + } + + if (FD::UsingPerfectHashFunction()) { + cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n"; + return 1; + } + + // load optional weights + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &decoder.CurrentWeightVector()); + + vector corpus; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); + assert(corpus.size() > 0); + + + if (rank == 0) + cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n"; + + size_t num_reached = 0; + { + ostringstream os; + os << conf["output_prefix"].as() << '.' 
<< rank << "_of_" << size; + WriteFile wf(os.str()); + ostream& out = *wf.stream(); + ReachabilityObserver observer; + for (int i = 0; i < corpus.size(); ++i) { + decoder.Decode(corpus[i], &observer); + if (observer.reachable) { + out << corpus[i] << endl; + ++num_reached; + } + corpus[i].clear(); + } + cerr << "Shard " << rank << '/' << size << " finished, wrote " + << num_reached << " instances to " << os.str() << endl; + } + + size_t total = 0; +#ifdef HAVE_MPI + reduce(world, num_reached, total, std::plus(), 0); +#else + total = num_reached; +#endif + if (rank == 0) { + cerr << "-----------------------------------------\n"; + cerr << "TOTAL = " << total << " instances\n"; + } + return 0; +} + diff --git a/utils/feature_vector.h b/utils/feature_vector.h index 733aa99e..a7b61a66 100755 --- a/utils/feature_vector.h +++ b/utils/feature_vector.h @@ -3,9 +3,9 @@ #include #include "sparse_vector.h" -#include "fdict.h" +#include "weights.h" -typedef double Featval; +typedef weight_t Featval; typedef SparseVector FeatureVector; typedef SparseVector WeightVector; typedef std::vector DenseWeightVector; -- cgit v1.2.3 From 445c62444ba3db95b7fdee80b03d313b686a6419 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 17 Sep 2011 00:20:52 +0100 Subject: add md5 hash function --- utils/stringlib.cc | 363 +++++++++++++++++++++++++++++++++++++++++++++++++++++ utils/stringlib.h | 3 + utils/ts.cc | 6 + 3 files changed, 372 insertions(+) (limited to 'utils') diff --git a/utils/stringlib.cc b/utils/stringlib.cc index ade02ca9..3a56965c 100644 --- a/utils/stringlib.cc +++ b/utils/stringlib.cc @@ -90,3 +90,366 @@ void ProcessAndStripSGML(string* pline, map* out) { } } +string SGMLOpenSegTag(const map& attr) { + ostringstream os; + os << "<seg"; + for (map::const_iterator it = attr.begin(); it != attr.end(); ++it) + os << ' ' << it->first << '=' << '"' << it->second << '"'; + os << '>'; + return os.str(); +} + +class MD5 { +public: + typedef unsigned int size_type; // must be 32bit + + MD5(); + MD5(const std::string& text); + void update(const unsigned char *buf, size_type length); + void update(const char *buf, size_type length); + MD5& finalize(); + std::string hexdigest() const; + +private: + void init(); + typedef unsigned char uint1; // 8bit + typedef unsigned int uint4; // 32bit + enum {blocksize = 64}; // VC6 won't eat a const static int here + + void transform(const uint1 block[blocksize]); + static void decode(uint4 output[], const uint1 input[], size_type len); + static void encode(uint1 output[], const uint4 input[], size_type len); + + bool finalized; + uint1 buffer[blocksize]; // bytes that didn't fit in last 64 byte chunk + uint4 count[2]; // 64bit counter for number of bits (lo, hi) + uint4 state[4]; // digest so far + uint1 digest[16]; // the result + + // low level logic operations + static inline uint4 F(uint4 x, uint4 y, uint4 z); + static inline uint4 G(uint4 x, uint4 y, uint4 z); + static inline uint4 H(uint4 x, uint4 y, uint4 z); + static inline uint4 I(uint4 x, uint4 y, uint4 z); + static inline uint4 rotate_left(uint4 x, int n); + static inline void FF(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac); + static inline void GG(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac); + static inline void HH(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac); + static inline void II(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac); +}; + +// Constants for MD5Transform routine.
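// [Editor's note -- added commentary, not in the patch: S11..S44 below are the
// per-round left-rotation amounts from RFC 1321, Section 3.4. Round 1 cycles
// through {7,12,17,22}, round 2 through {5,9,14,20}, round 3 through
// {4,11,16,23}, and round 4 through {6,10,15,21}, matching the FF/GG/HH/II
// calls in transform().]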
+#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + +/////////////////////////////////////////////// + +// F, G, H and I are basic MD5 functions. +inline MD5::uint4 MD5::F(uint4 x, uint4 y, uint4 z) { + return (x&y) | (~x&z); +} + +inline MD5::uint4 MD5::G(uint4 x, uint4 y, uint4 z) { + return (x&z) | (y&~z); +} + +inline MD5::uint4 MD5::H(uint4 x, uint4 y, uint4 z) { + return x^y^z; +} + +inline MD5::uint4 MD5::I(uint4 x, uint4 y, uint4 z) { + return y ^ (x | ~z); +} + +// rotate_left rotates x left n bits. +inline MD5::uint4 MD5::rotate_left(uint4 x, int n) { + return (x << n) | (x >> (32-n)); +} + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +// Rotation is separate from addition to prevent recomputation. +inline void MD5::FF(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) { + a = rotate_left(a+ F(b,c,d) + x + ac, s) + b; +} + +inline void MD5::GG(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) { + a = rotate_left(a + G(b,c,d) + x + ac, s) + b; +} + +inline void MD5::HH(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) { + a = rotate_left(a + H(b,c,d) + x + ac, s) + b; +} + +inline void MD5::II(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) { + a = rotate_left(a + I(b,c,d) + x + ac, s) + b; +} + +////////////////////////////// + +// default ctor, just initialize +MD5::MD5() +{ + init(); +} + +////////////////////////////// + +// nifty shortcut ctor, compute MD5 for string and finalize it right away +MD5::MD5(const std::string &text) +{ + init(); + update(text.c_str(), text.length()); + finalize(); +} + +////////////////////////////// + +void MD5::init() +{ + finalized=false; + + count[0] = 0; + count[1] = 0; + + // load magic initialization constants. + state[0] = 0x67452301; + state[1] = 0xefcdab89; + state[2] = 0x98badcfe; + state[3] = 0x10325476; +} + +////////////////////////////// + +// decodes input (unsigned char) into output (uint4). Assumes len is a multiple of 4. +void MD5::decode(uint4 output[], const uint1 input[], size_type len) +{ + for (unsigned int i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((uint4)input[j]) | (((uint4)input[j+1]) << 8) | + (((uint4)input[j+2]) << 16) | (((uint4)input[j+3]) << 24); +} + +////////////////////////////// + +// encodes input (uint4) into output (unsigned char). Assumes len is +// a multiple of 4.
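// [Editor's note -- added commentary, not in the patch: decode()/encode() pin
// down MD5's little-endian byte order. For example, the initial word
// state[0] = 0x67452301 serializes to the bytes 01 23 45 67, which is why a
// correct implementation yields the familiar empty-string digest
// d41d8cd98f00b204e9800998ecf8427e.]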
+void MD5::encode(uint1 output[], const uint4 input[], size_type len) +{ + for (size_type i = 0, j = 0; j < len; i++, j += 4) { + output[j] = input[i] & 0xff; + output[j+1] = (input[i] >> 8) & 0xff; + output[j+2] = (input[i] >> 16) & 0xff; + output[j+3] = (input[i] >> 24) & 0xff; + } +} + +////////////////////////////// + +// apply MD5 algo on a block +void MD5::transform(const uint1 block[blocksize]) +{ + uint4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + decode (x, block, blocksize); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, 
x[15], S42, 0xfe2ce6e0); /* 58 */
+  II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+  II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+  II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+  II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+  II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+  II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+  state[0] += a;
+  state[1] += b;
+  state[2] += c;
+  state[3] += d;
+
+  // Zeroize sensitive information.
+  memset(x, 0, sizeof x);
+}
+
+//////////////////////////////
+
+// MD5 block update operation. Continues an MD5 message-digest
+// operation, processing another message block
+void MD5::update(const unsigned char input[], size_type length)
+{
+  // compute number of bytes mod 64
+  size_type index = count[0] / 8 % blocksize;
+
+  // Update number of bits
+  if ((count[0] += (length << 3)) < (length << 3))
+    count[1]++;
+  count[1] += (length >> 29);
+
+  // number of bytes we need to fill in buffer
+  size_type firstpart = 64 - index;
+
+  size_type i;
+
+  // transform as many times as possible.
+  if (length >= firstpart)
+  {
+    // fill buffer first, transform
+    memcpy(&buffer[index], input, firstpart);
+    transform(buffer);
+
+    // transform chunks of blocksize (64 bytes)
+    for (i = firstpart; i + blocksize <= length; i += blocksize)
+      transform(&input[i]);
+
+    index = 0;
+  }
+  else
+    i = 0;
+
+  // buffer remaining input
+  memcpy(&buffer[index], &input[i], length-i);
+}
+
+//////////////////////////////
+
+// for convenience provide a version with signed char
+void MD5::update(const char input[], size_type length)
+{
+  update((const unsigned char*)input, length);
+}
+
+//////////////////////////////
+
+// MD5 finalization. Ends an MD5 message-digest operation, writing
+// the message digest and zeroizing the context.
+MD5& MD5::finalize()
+{
+  static unsigned char padding[64] = {
+    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  if (!finalized) {
+    // Save number of bits
+    unsigned char bits[8];
+    encode(bits, count, 8);
+
+    // pad out to 56 mod 64.
+    size_type index = count[0] / 8 % 64;
+    size_type padLen = (index < 56) ? (56 - index) : (120 - index);
+    update(padding, padLen);
+
+    // Append length (before padding)
+    update(bits, 8);
+
+    // Store state in digest
+    encode(digest, state, 16);
+
+    // Zeroize sensitive information.
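+    // (buffer holds any not-yet-hashed input bytes and count[] the running
+    //  bit length of the message; both are wiped below so no trace of the
+    //  plaintext survives finalization)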
+    memset(buffer, 0, sizeof buffer);
+    memset(count, 0, sizeof count);
+
+    finalized=true;
+  }
+
+  return *this;
+}
+
+//////////////////////////////
+
+// return hex representation of digest as string
+std::string MD5::hexdigest() const
+{
+  if (!finalized)
+    return "";
+
+  char buf[33];
+  for (int i=0; i<16; i++)
+    sprintf(buf+i*2, "%02x", digest[i]);
+  buf[32]=0;
+
+  return std::string(buf);
+}
+
+//////////////////////////////
+
+std::string md5(const std::string& str) {
+  MD5 md5 = MD5(str);
+  return md5.hexdigest();
+}
+
diff --git a/utils/stringlib.h b/utils/stringlib.h
index 8022bb88..cafbdac3 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -249,6 +249,7 @@ inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::s
 }
 
 void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
+std::string SGMLOpenSegTag(const std::map<std::string, std::string>& attr);
 
 // given the first character of a UTF8 block, find out how wide it is
 // see http://en.wikipedia.org/wiki/UTF-8 for more info
@@ -260,4 +261,6 @@ inline unsigned int UTF8Len(unsigned char x) {
   else return 0;
 }
 
+std::string md5(const std::string& in);
+
 #endif
diff --git a/utils/ts.cc b/utils/ts.cc
index 3694e076..bf4f8f69 100644
--- a/utils/ts.cc
+++ b/utils/ts.cc
@@ -7,6 +7,7 @@
 #include "prob.h"
 #include "sparse_vector.h"
 #include "fast_sparse_vector.h"
+#include "stringlib.h"
 
 using namespace std;
 
@@ -79,6 +80,11 @@ int main() {
     y -= y;
   }
   cerr << "Counted " << c << " times\n";
+
+  cerr << md5("this is a test") << endl;
+  cerr << md5("some other ||| string is") << endl;
+  map<string, string> x; x["id"] = "12"; x["grammar"] = "/path/to/grammar.gz";
+  cerr << SGMLOpenSegTag(x) << endl;
   return 0;
 }
 
-- cgit v1.2.3

From ed2b1ce7181fb21d2363b0d5ce04d7fa52aef0fa Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 17 Sep 2011 01:08:45 +0100
Subject: enable ramdisk scratch for per-sentence-grammars

---
 training/mpi_batch_optimize.cc | 35 +++++++++++++++++++++++++++++++++++
 utils/filelib.cc               | 19 +++++++++++++++++++
 utils/filelib.h                |  5 +----
 3 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'utils')

diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
index cc5953f6..0ba8c530 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/mpi_batch_optimize.cc
@@ -22,6 +22,7 @@ namespace mpi = boost::mpi;
 #include "ff_register.h"
 #include "decoder.h"
 #include "filelib.h"
+#include "stringlib.h"
 #include "optimize.h"
 #include "fdict.h"
 #include "weights.h"
@@ -42,6 +43,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
         ("gaussian_prior,p","Use a Gaussian prior on the weights")
         ("means,u", po::value<string>(), "File containing the means for Gaussian prior")
+        ("per_sentence_grammar_scratch,P", po::value<string>(), "(Optional) location of scratch space to copy per-sentence grammars for fast access, useful if a RAM disk is available")
         ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -186,6 +188,36 @@ struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> > {
   }
 };
 
+void MovePerSentenceGrammars(const string& root, int size, int rank, vector<string>* c) {
+  if (!DirectoryExists(root)) {
+    cerr << "Can't find scratch space at " << root << endl;
+    abort();
+  }
+  ostringstream os;
+  os << root << "/psg."
<< size << "_of_" << rank; + const string path = os.str(); + MkDirP(path); + string sent; + map attr; + for (unsigned i = 0; i < c->size(); ++i) { + sent = (*c)[i]; + attr.clear(); + ProcessAndStripSGML(&sent, &attr); + map::iterator it = attr.find("grammar"); + if (it != attr.end()) { + string src_file = it->second; + bool is_gzipped = (src_file.size() > 3) && (src_file.rfind(".gz") == (src_file.size() - 3)); + string new_name = path + "/" + md5(sent); + if (is_gzipped) new_name += ".gz"; + CopyFile(src_file, new_name); + it->second = new_name; + } + ostringstream ns; + ns << SGMLOpenSegTag(attr) << ' ' << sent << " "; + (*c)[i] = ns.str(); + } +} + int main(int argc, char** argv) { #ifdef HAVE_MPI mpi::environment env(argc, argv); @@ -257,6 +289,9 @@ int main(int argc, char** argv) { ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); + if (conf.count("per_sentence_grammar_scratch")) + MovePerSentenceGrammars(conf["per_sentence_grammar_scratch"].as(), rank, size, &corpus); + TrainingObserver observer; while (!converged) { observer.Reset(); diff --git a/utils/filelib.cc b/utils/filelib.cc index a0969b1a..d206fc19 100644 --- a/utils/filelib.cc +++ b/utils/filelib.cc @@ -2,6 +2,12 @@ #include #include +#include +#include +#include +#include +#include +#include using namespace std; @@ -32,3 +38,16 @@ void MkDirP(const string& dir) { } } +#if 0 +void CopyFile(const string& inf, const string& outf) { + WriteFile w(outf); + CopyFile(inf,*w); +} +#else +void CopyFile(const string& inf, const string& outf) { + ofstream of(outf.c_str(), fstream::trunc|fstream::binary); + ifstream in(inf.c_str(), fstream::binary); + of << in.rdbuf(); +} +#endif + diff --git a/utils/filelib.h b/utils/filelib.h index a8622246..bb6e7415 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -113,9 +113,6 @@ inline void CopyFile(std::string const& inf,std::ostream &out) { CopyFile(*r,out); } -inline void CopyFile(std::string const& inf,std::string const& outf) { - WriteFile w(outf); - CopyFile(inf,*w); -} +void CopyFile(std::string const& inf,std::string const& outf); #endif -- cgit v1.2.3 From 41b28681f9a286d2ee98dab7915c0e735704286e Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Sat, 17 Sep 2011 01:39:07 +0100 Subject: add dep --- training/cluster-em.pl | 114 ---------------- training/cluster-ptrain.pl | 206 ----------------------------- training/compute_cllh.cc | 196 --------------------------- training/make-lexcrf-grammar.pl | 285 ---------------------------------------- training/mpi_compute_cllh.cc | 196 +++++++++++++++++++++++++++ utils/stringlib.cc | 14 +- 6 files changed, 203 insertions(+), 808 deletions(-) delete mode 100755 training/cluster-em.pl delete mode 100755 training/cluster-ptrain.pl delete mode 100644 training/compute_cllh.cc delete mode 100755 training/make-lexcrf-grammar.pl create mode 100644 training/mpi_compute_cllh.cc (limited to 'utils') diff --git a/training/cluster-em.pl b/training/cluster-em.pl deleted file mode 100755 index 267ab642..00000000 --- a/training/cluster-em.pl +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } -use Getopt::Long; -my $parallel = 0; - -my $CWD=`pwd`; chomp $CWD; -my $BIN_DIR = "$CWD/.."; -my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce"; -my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights"; -my $ADAPTER = 
"$BIN_DIR/training/mr_em_map_adapter"; -my $DECODER = "$BIN_DIR/decoder/cdec"; -my $COMBINER_CACHE_SIZE = 10000000; -my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl"; -die "Can't find $REDUCER" unless -f $REDUCER; -die "Can't execute $REDUCER" unless -x $REDUCER; -die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS; -die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS; -die "Can't find $ADAPTER" unless -f $ADAPTER; -die "Can't execute $ADAPTER" unless -x $ADAPTER; -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -my $restart = ''; -if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } - -die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2); - -my $training_corpus = shift @ARGV; -my $config = shift @ARGV; -my $pmem="2500mb"; -my $nodes = 40; -my $max_iteration = 1000; -my $CFLAG = "-C 1"; -if ($parallel) { - die "Can't find $PARALLEL" unless -f $PARALLEL; - die "Can't execute $PARALLEL" unless -x $PARALLEL; -} else { $CFLAG = "-C 500"; } - -my $initial_weights = ''; - -print STDERR < \$DECODER, - "distributed" => \$DISTRIBUTED, - "sigma_squared=f" => \$sigsq, - "lbfgs_memory_buffers=i" => \$mem_buffers, - "max_iteration=i" => \$max_iteration, - "means=s" => \$means_file, - "optimizer=s" => \$OALG, - "gaussian_prior" => \$PRIOR, - "restart_if_necessary" => \$RESTART_IF_NECESSARY, - "jobs=i" => \$nodes, - "pmem=s" => \$pmem - ) or usage(); -usage() unless scalar @ARGV==3; -my $config_file = shift @ARGV; -my $training_corpus = shift @ARGV; -my $initial_weights = shift @ARGV; -unless ($DISTRIBUTED) { $LOCAL = 1; } -die "Can't find $config_file" unless -f $config_file; -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } -if ($PRIOR) { - $PRIOR_FLAG="-p --sigma_squared $sigsq"; - if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } -} - -if ($parallel) { - die "Can't find $PARALLEL" unless -f $PARALLEL; - die "Can't execute $PARALLEL" unless -x $PARALLEL; -} -unless ($parallel) { $CFLAG = "-C 500"; } -unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } -my $clines = num_lines($training_corpus); -my $dir = "$CWD/ptrain"; - -if ($RESTART_IF_NECESSARY && -d $dir) { - $restart = 1; -} - -print STDERR <$dir/training.in"; - my $lc = 0; - while() { - chomp; - s/^\s+//; - s/\s+$//; - die "Expected A ||| B in input file" unless / \|\|\| /; - print TO "$_\n"; - $lc++; - } - close T; - close TO; -} -$training_corpus = "$dir/training.in"; - -my $iter_attempts = 1; -while ($iter < $max_iteration) { - my $cur_time = `date`; chomp $cur_time; - print STDERR "\nStarting iteration $iter...\n"; - print STDERR " time: $cur_time\n"; - my $start = time; - my $next_iter = $iter + 1; - my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; - my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; - my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; - my $cmd = ""; - if ($parallel) { $cmd = $pcmd; } - $cmd .= "$dec_cmd | $opt_cmd"; - - print STDERR "EXECUTING: $cmd\n"; - my $result = `$cmd`; - my $exit_code = $? 
>> 8; - if ($exit_code == 99) { - $iter_attempts++; - if ($iter_attempts > $MAX_ITER_ATTEMPTS) { - die "Received restart request $iter_attempts times from optimizer, giving up\n"; - } - print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n"; - next; - } - if ($? != 0) { - die "Error running iteration $iter: $!"; - } - chomp $result; - my $end = time; - my $diff = ($end - $start); - print STDERR " ITERATION $iter TOOK $diff SECONDS\n"; - $iter = $next_iter; - if ($result =~ /1$/) { - print STDERR "Training converged.\n"; - last; - } - $iter_attempts = 1; -} - -print "FINAL WEIGHTS: $dir/weights.$iter\n"; -`mv $dir/weights.$iter.gz $dir/weights.final.gz`; - -sub usage { - die <) { $lines++; } - close $fh; - return $lines; -} diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc deleted file mode 100644 index b496d196..00000000 --- a/training/compute_cllh.cc +++ /dev/null @@ -1,196 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" -#ifdef HAVE_MPI -#include -#endif -#include -#include - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "weights.h" - -using namespace std; -namespace po = boost::program_options; - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("weights,w",po::value(),"Input feature weights file") - ("training_data,t",po::value(),"Training data corpus") - ("decoder_config,c",po::value(),"Decoder configuration file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } - ++lc; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_obj = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata&) { - cur_obj = 0; - state = 1; - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { - assert(state == 1); - state = 2; - SparseVector cur_model_exp; - const prob_t z = InsideOutside, - EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); - cur_obj = log(z); - } - - // compute "empirical" expectations, numerator of objective - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - SparseVector ref_exp; - const prob_t ref_z = InsideOutside, - EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); - - 
double log_ref_z; -#if 0 - if (crf_uniform_empirical) { - log_ref_z = ref_exp.dot(feature_weights); - } else { - log_ref_z = log(ref_z); - } -#else - log_ref_z = log(ref_z); -#endif - - // rounding errors means that <0 is too strict - if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { - cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; - exit(1); - } - assert(!isnan(log_ref_z)); - acc_obj += (cur_obj - log_ref_z); - } - - double acc_obj; - double cur_obj; - int state; -}; - -#ifdef HAVE_MPI -namespace mpi = boost::mpi; -#endif - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - mpi::environment env(argc, argv); - mpi::communicator world; - const int size = world.size(); - const int rank = world.rank(); -#else - const int size = 1; - const int rank = 0; -#endif - if (size > 1) SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) - return false; - - // load cdec.ini and set up decoder - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - if (decoder.GetConf()["input"].as() != "-") { - cerr << "cdec.ini must not set an input file\n"; - abort(); - } - - // load weights - vector& weights = decoder.CurrentWeightVector(); - if (conf.count("weights")) - Weights::InitFromFile(conf["weights"].as(), &weights); - - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - - vector corpus; vector ids; - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); - assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - - TrainingObserver observer; - double objective = 0; - - observer.Reset(); - if (rank == 0) - cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; - - for (int i = 0; i < corpus.size(); ++i) { - decoder.SetId(ids[i]); - decoder.Decode(corpus[i], &observer); - } - -#ifdef HAVE_MPI - reduce(world, observer.acc_obj, objective, std::plus(), 0); -#else - objective = observer.acc_obj; -#endif - - if (rank == 0) - cout << "OBJECTIVE: " << objective << endl; - - return 0; -} - diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl deleted file mode 100755 index 8cdf7718..00000000 --- a/training/make-lexcrf-grammar.pl +++ /dev/null @@ -1,285 +0,0 @@ -#!/usr/bin/perl -w -use utf8; -use strict; -my ($effile, $model1) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1; - -open EF, "<$effile" or die; -open M1, "<$model1" or die; -binmode(EF,":utf8"); -binmode(M1,":utf8"); -binmode(STDOUT,":utf8"); -my %model1; -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - $model1{$f}->{$e} = $lp; -} - -my $ADD_MODEL1 = 0; # found that model1 hurts performance -my $IS_FRENCH_F = 1; # indicates that the f language is french -my $IS_ARABIC_F = 0; # indicates that the f language is arabic -my $IS_URDU_F = 0; # indicates that the f language is arabic -my $ADD_PREFIX_ID = 0; -my $ADD_LEN = 1; -my $ADD_SIM = 1; -my $ADD_DICE = 1; -my $ADD_111 = 1; -my $ADD_ID = 1; -my $ADD_PUNC = 1; -my $ADD_NUM_MM = 1; -my $ADD_NULL = 1; -my $ADD_STEM_ID = 1; -my $BEAM_RATIO = 50; - -my %fdict; -my %fcounts; -my %ecounts; - -my %sdict; - -while() { - chomp; - my ($f, $e) = split /\s*\|\|\|\s*/; - my @es = split /\s+/, $e; - my @fs = split /\s+/, $f; - for my $ew (@es){ $ecounts{$ew}++; } - push @fs, '' if $ADD_NULL; - for my $fw (@fs){ 
$fcounts{$fw}++; } - for my $fw (@fs){ - for my $ew (@es){ - $fdict{$fw}->{$ew}++; - } - } -} - -print STDERR "Dice 0\n" if $ADD_DICE; -print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111; -print STDERR "Identical 0\n" if $ADD_ID; -print STDERR "PuncMiss 0\n" if $ADD_PUNC; -print STDERR "IsNull 0\n" if $ADD_NULL; -print STDERR "Model1 0\n" if $ADD_MODEL1; -print STDERR "DLen 0\n" if $ADD_LEN; -print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM; -print STDERR "OrthoSim 0\n" if $ADD_SIM; -print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID); -my $fc = 1000000; -my $sids = 1000000; -for my $f (sort keys %fdict) { - my $re = $fdict{$f}; - my $max; - for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { - my $efcount = $re->{$e}; - unless (defined $max) { $max = $efcount; } - my $m1 = $model1{$f}->{$e}; - unless (defined $m1) { next; } - $fc++; - my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); - my $feats = "F$fc=1"; - my $oe = $e; - my $of = $f; # normalized form - if ($IS_FRENCH_F) { - # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French - $of =~ s/â/as/g; - $of =~ s/ê/es/g; - $of =~ s/î/is/g; - $of =~ s/ô/os/g; - $of =~ s/û/us/g; - } elsif ($IS_ARABIC_F) { - if (length($of) > 1 && !($of =~ /\d/)) { - $of =~ s/\$/sh/g; - } - } elsif ($IS_URDU_F) { - if (length($of) > 1 && !($of =~ /\d/)) { - $of =~ s/\$/sh/g; - } - $oe =~ s/^-e-//; - $oe =~ s/^al-/al/; - $of =~ s/([a-z])\~/$1$1/g; - $of =~ s/E/'/g; - $of =~ s/^Aw/o/g; - $of =~ s/\|/a/g; - $of =~ s/@/h/g; - $of =~ s/c/ch/g; - $of =~ s/x/kh/g; - $of =~ s/\*/dh/g; - $of =~ s/w/o/g; - $of =~ s/Z/dh/g; - $of =~ s/y/i/g; - $of =~ s/Y/a/g; - $of = lc $of; - } - my $len_e = length($oe); - my $len_f = length($of); - $feats .= " Model1=$m1" if ($ADD_MODEL1); - $feats .= " Dice=$dice" if $ADD_DICE; - my $is_null = undef; - if ($ADD_NULL && $f eq '') { - $feats .= " IsNull=1"; - $is_null = 1; - } - if ($ADD_LEN) { - if (!$is_null) { - my $dlen = abs($len_e - $len_f); - $feats .= " DLen=$dlen"; - } - } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); - my $both_non_numeric = (!$e_num && !$f_num); - if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) { - $feats .= " NumMM=1"; - } - if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) { - $feats .= " NumMatch=1"; - } - if ($ADD_STEM_ID) { - my $el = 4; - my $fl = 4; - if ($oe =~ /^al|re|co/) { $el++; } - if ($of =~ /^al|re|co/) { $fl++; } - if ($oe =~ /^trans|inter/) { $el+=2; } - if ($of =~ /^trans|inter/) { $fl+=2; } - if ($fl > length($of)) { $fl = length($of); } - if ($el > length($oe)) { $el = length($oe); } - my $sf = substr $of, 0, $fl; - my $se = substr $oe, 0, $el; - my $id = $sdict{$sf}->{$se}; - if (!$id) { - $sids++; - $sdict{$sf}->{$se} = $sids; - $id = $sids; - print STDERR "S$sids 0\n" - } - $feats .= " S$id=1"; - } - if ($ADD_PREFIX_ID) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { $feats .= " PfxIdentical=1"; } - } - } - if ($ADD_SIM) { - my $ld = 0; - my $eff = $len_e; - if ($eff < $len_f) { $eff = $len_f; } - if (!$is_null) { - $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); - } - $feats .= " OrthoSim=$ld"; - } - my $ident = ($e eq $f); - if ($ident && $ADD_ID) { $feats .= " Identical=1"; } - if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) { - if ($ident && $ADD_ID) { - $feats .= " Id_OneOneOne=1"; - } - $feats .= " OneOneOne=1"; - } - if 
($ADD_PUNC) { - if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) || - ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) { - $feats .= " PuncMiss=1"; - } - } - my $r = (0.5 - rand)/5; - print STDERR "F$fc $r\n"; - print "$f ||| $e ||| $feats\n"; - } -} - -sub levenshtein -{ - # $s1 and $s2 are the two strings - # $len1 and $len2 are their respective lengths - # - my ($s1, $s2) = @_; - my ($len1, $len2) = (length $s1, length $s2); - - # If one of the strings is empty, the distance is the length - # of the other string - # - return $len2 if ($len1 == 0); - return $len1 if ($len2 == 0); - - my %mat; - - # Init the distance matrix - # - # The first row to 0..$len1 - # The first column to 0..$len2 - # The rest to 0 - # - # The first row and column are initialized so to denote distance - # from the empty string - # - for (my $i = 0; $i <= $len1; ++$i) - { - for (my $j = 0; $j <= $len2; ++$j) - { - $mat{$i}{$j} = 0; - $mat{0}{$j} = $j; - } - - $mat{$i}{0} = $i; - } - - # Some char-by-char processing is ahead, so prepare - # array of chars from the strings - # - my @ar1 = split(//, $s1); - my @ar2 = split(//, $s2); - - for (my $i = 1; $i <= $len1; ++$i) - { - for (my $j = 1; $j <= $len2; ++$j) - { - # Set the cost to 1 iff the ith char of $s1 - # equals the jth of $s2 - # - # Denotes a substitution cost. When the char are equal - # there is no need to substitute, so the cost is 0 - # - my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - - # Cell $mat{$i}{$j} equals the minimum of: - # - # - The cell immediately above plus 1 - # - The cell immediately to the left plus 1 - # - The cell diagonally above and to the left plus the cost - # - # We can either insert a new char, delete a char or - # substitute an existing char (with an associated cost) - # - $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, - $mat{$i}{$j-1} + 1, - $mat{$i-1}{$j-1} + $cost]); - } - } - - # Finally, the Levenshtein distance equals the rightmost bottom cell - # of the matrix - # - # Note that $mat{$x}{$y} denotes the distance between the substrings - # 1..$x and 1..$y - # - return $mat{$len1}{$len2}; -} - - -# minimal element of a list -# -sub min -{ - my @list = @{$_[0]}; - my $min = $list[0]; - - foreach my $i (@list) - { - $min = $i if ($i < $min); - } - - return $min; -} - diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc new file mode 100644 index 00000000..b496d196 --- /dev/null +++ b/training/mpi_compute_cllh.cc @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include + +#include "config.h" +#ifdef HAVE_MPI +#include +#endif +#include +#include + +#include "verbose.h" +#include "hg.h" +#include "prob.h" +#include "inside_outside.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w",po::value(),"Input feature weights file") + ("training_data,t",po::value(),"Training data corpus") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), 
*conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int lc = 0; + while(in) { + getline(in, line); + if (!in) break; + if (lc % size == rank) { + c->push_back(line); + ids->push_back(lc); + } + ++lc; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct TrainingObserver : public DecoderObserver { + void Reset() { + acc_obj = 0; + } + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + cur_obj = 0; + state = 1; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + assert(state == 1); + state = 2; + SparseVector cur_model_exp; + const prob_t z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); + cur_obj = log(z); + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + assert(state == 2); + state = 3; + SparseVector ref_exp; + const prob_t ref_z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); + + double log_ref_z; +#if 0 + if (crf_uniform_empirical) { + log_ref_z = ref_exp.dot(feature_weights); + } else { + log_ref_z = log(ref_z); + } +#else + log_ref_z = log(ref_z); +#endif + + // rounding errors means that <0 is too strict + if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { + cerr << "DIFF. ERR! 
log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; + exit(1); + } + assert(!isnan(log_ref_z)); + acc_obj += (cur_obj - log_ref_z); + } + + double acc_obj; + double cur_obj; + int state; +}; + +#ifdef HAVE_MPI +namespace mpi = boost::mpi; +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return false; + + // load cdec.ini and set up decoder + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + abort(); + } + + // load weights + vector& weights = decoder.CurrentWeightVector(); + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &weights); + + // freeze feature set + //const bool freeze_feature_set = conf.count("freeze_feature_set"); + //if (freeze_feature_set) FD::Freeze(); + + vector corpus; vector ids; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + assert(corpus.size() > 0); + assert(corpus.size() == ids.size()); + + TrainingObserver observer; + double objective = 0; + + observer.Reset(); + if (rank == 0) + cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; + + for (int i = 0; i < corpus.size(); ++i) { + decoder.SetId(ids[i]); + decoder.Decode(corpus[i], &observer); + } + +#ifdef HAVE_MPI + reduce(world, observer.acc_obj, objective, std::plus(), 0); +#else + objective = observer.acc_obj; +#endif + + if (rank == 0) + cout << "OBJECTIVE: " << objective << endl; + + return 0; +} + diff --git a/utils/stringlib.cc b/utils/stringlib.cc index 3a56965c..1a152985 100644 --- a/utils/stringlib.cc +++ b/utils/stringlib.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -104,11 +105,11 @@ public: typedef unsigned int size_type; // must be 32bit MD5(); - MD5(const std::string& text); + MD5(const string& text); void update(const unsigned char *buf, size_type length); void update(const char *buf, size_type length); MD5& finalize(); - std::string hexdigest() const; + string hexdigest() const; private: void init(); @@ -209,7 +210,7 @@ MD5::MD5() ////////////////////////////////////////////// // nifty shortcut ctor, compute MD5 for string and finalize it right away -MD5::MD5(const std::string &text) +MD5::MD5(const string &text) { init(); update(text.c_str(), text.length()); @@ -433,8 +434,7 @@ MD5& MD5::finalize() ////////////////////////////// // return hex representation of digest as string -std::string MD5::hexdigest() const -{ +string MD5::hexdigest() const { if (!finalized) return ""; @@ -443,12 +443,12 @@ std::string MD5::hexdigest() const sprintf(buf+i*2, "%02x", digest[i]); buf[32]=0; - return std::string(buf); + return string(buf); } ////////////////////////////// -std::string md5(const std::string& str) { +string md5(const string& str) { MD5 md5 = MD5(str); return md5.hexdigest(); } -- cgit v1.2.3 From ce75d2ce4e405b6c5ddec069fc9844bed64d548d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 28 Sep 2011 16:04:41 +0100 Subject: fix broken compile on weights test --- utils/weights_test.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'utils') diff 
--git a/utils/weights_test.cc b/utils/weights_test.cc index 8a4c26ef..938b311f 100644 --- a/utils/weights_test.cc +++ b/utils/weights_test.cc @@ -14,11 +14,10 @@ class WeightsTest : public testing::Test { virtual void TearDown() { } }; - TEST_F(WeightsTest,Load) { - Weights w; - w.InitFromFile("test_data/weights"); - w.WriteToFile("-"); + vector v; + Weights::InitFromFile("test_data/weights", &v); + Weights::WriteToFile("-", v); } int main(int argc, char **argv) { -- cgit v1.2.3 From af159e4c7066ea9a96f077e7e9265c8571f02053 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 11 Oct 2011 12:06:32 +0100 Subject: check in some experimental particle filtering code, some gitignore fixes --- .gitignore | 24 +- Makefile.am | 2 +- configure.ac | 2 +- gi/markov_al/Makefile.am | 6 + gi/markov_al/README | 2 + gi/markov_al/ml.cc | 470 +++++++++++++++++++++++++++++ gi/pf/Makefile.am | 21 ++ gi/pf/README | 2 + gi/pf/base_measures.cc | 112 +++++++ gi/pf/base_measures.h | 116 +++++++ gi/pf/brat.cc | 554 ++++++++++++++++++++++++++++++++++ gi/pf/cbgi.cc | 340 +++++++++++++++++++++ gi/pf/cfg_wfst_composer.cc | 730 +++++++++++++++++++++++++++++++++++++++++++++ gi/pf/cfg_wfst_composer.h | 46 +++ gi/pf/dpnaive.cc | 349 ++++++++++++++++++++++ gi/pf/itg.cc | 224 ++++++++++++++ gi/pf/pfbrat.cc | 554 ++++++++++++++++++++++++++++++++++ gi/pf/pfdist.cc | 621 ++++++++++++++++++++++++++++++++++++++ gi/pf/pfdist.new.cc | 620 ++++++++++++++++++++++++++++++++++++++ gi/pf/pfnaive.cc | 385 ++++++++++++++++++++++++ gi/pf/reachability.cc | 64 ++++ gi/pf/reachability.h | 28 ++ gi/pf/tpf.cc | 99 ++++++ m4/acx_pthread.m4 | 363 ++++++++++++++++++++++ utils/ccrp_nt.h | 169 +++++++++++ utils/ccrp_onetable.h | 241 +++++++++++++++ 26 files changed, 6141 insertions(+), 3 deletions(-) create mode 100644 gi/markov_al/Makefile.am create mode 100644 gi/markov_al/README create mode 100644 gi/markov_al/ml.cc create mode 100644 gi/pf/Makefile.am create mode 100644 gi/pf/README create mode 100644 gi/pf/base_measures.cc create mode 100644 gi/pf/base_measures.h create mode 100644 gi/pf/brat.cc create mode 100644 gi/pf/cbgi.cc create mode 100644 gi/pf/cfg_wfst_composer.cc create mode 100644 gi/pf/cfg_wfst_composer.h create mode 100644 gi/pf/dpnaive.cc create mode 100644 gi/pf/itg.cc create mode 100644 gi/pf/pfbrat.cc create mode 100644 gi/pf/pfdist.cc create mode 100644 gi/pf/pfdist.new.cc create mode 100644 gi/pf/pfnaive.cc create mode 100644 gi/pf/reachability.cc create mode 100644 gi/pf/reachability.h create mode 100644 gi/pf/tpf.cc create mode 100644 m4/acx_pthread.m4 create mode 100644 utils/ccrp_nt.h create mode 100644 utils/ccrp_onetable.h (limited to 'utils') diff --git a/.gitignore b/.gitignore index 2a287bbc..5efe37b0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,27 @@ +pro-train/.deps +pro-train/mr_pro_map +pro-train/mr_pro_reduce +utils/reconstruct_weights +decoder/.libs +training/augment_grammar +training/mpi_batch_optimize +training/mpi_compute_cllh +training/mpi_em_optimize +training/mpi_extract_features +training/mpi_extract_reachable klm/lm/build_binary extools/extractor_monolingual +gi/pf/.deps +gi/pf/brat +gi/pf/cbgi +gi/pf/dpnaive +gi/pf/itg +gi/pf/libpf.a +gi/pf/pfbrat +gi/pf/pfdist +gi/pf/pfnaive +gi/markov_al/.deps +gi/markov_al/ml gi/posterior-regularisation/prjava/lib/*.jar klm/lm/libklm.a klm/util/.deps @@ -120,4 +142,4 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar *.dvi *.ps *.toc -*~ \ No newline at end of file +*~ diff --git a/Makefile.am b/Makefile.am index 98b4bac7..59c2fc0a 100644 --- 
a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,7 @@ # warning - the subdirectories in the following list should # be kept in topologically sorted order. Also, DO NOT introduce # cyclic dependencies between these directories! -SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest pro-train extools +SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest pro-train extools gi/pf gi/markov_al #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index 2e9cc36d..131a1705 100644 --- a/configure.ac +++ b/configure.ac @@ -113,4 +113,4 @@ then AM_CONDITIONAL([GLC], true) fi -AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/cbgi/Makefile gi/ml/Makefile) +AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) diff --git a/gi/markov_al/Makefile.am b/gi/markov_al/Makefile.am new file mode 100644 index 00000000..fe3e3349 --- /dev/null +++ b/gi/markov_al/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = ml + +ml_SOURCES = ml.cc + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/markov_al/README b/gi/markov_al/README new file mode 100644 index 00000000..9c10f7cd --- /dev/null +++ b/gi/markov_al/README @@ -0,0 +1,2 @@ +Experimental translation models with Markovian dependencies. 
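[A note on the CRP machinery that ml.cc below is built on: the sampler uses the
CCRP_OneTable class from utils/ccrp_onetable.h, added by this commit. A minimal
sketch of the interface exactly as the code that follows uses it; the four
constructor arguments are the hyperparameter-prior values passed in ml.cc, whose
precise meaning lives in ccrp_onetable.h:

    CCRP_OneTable<WordID> crp(1, 1, 1, 1);  // CRP over target words
    bool opened = crp.increment(w);    // seat a customer for w; true if a new table opened
    double p = crp.prob(w, p0);        // predictive probability of w given base probability p0
    bool emptied = crp.decrement(w);   // remove a customer; true if w's last table emptied
    double llh = crp.log_crp_prob();   // log probability of the current seating arrangement
    crp.resample_hyperparameters(&rng);

The bool returns are what BigramModel in ml.cc uses to propagate increments and
decrements down to its unigram back-off level.]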
+ diff --git a/gi/markov_al/ml.cc b/gi/markov_al/ml.cc new file mode 100644 index 00000000..1e71edd6 --- /dev/null +++ b/gi/markov_al/ml.cc @@ -0,0 +1,470 @@ +#include +#include + +#include +#include +#include +#include + +#include "tdict.h" +#include "filelib.h" +#include "sampler.h" +#include "ccrp_onetable.h" +#include "array2d.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +void PrintTopCustomers(const CCRP_OneTable& crp) { + for (CCRP_OneTable::const_iterator it = crp.begin(); it != crp.end(); ++it) { + cerr << " " << TD::Convert(it->first) << " = " << it->second << endl; + } +} + +void PrintAlignment(const vector& src, const vector& trg, const vector& a) { + cerr << TD::GetString(src) << endl << TD::GetString(trg) << endl; + Array2D al(src.size(), trg.size()); + for (int i = 0; i < a.size(); ++i) + if (a[i] != 255) al(a[i], i) = true; + cerr << al << endl; +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct Unigram; +struct Bigram { + Bigram() : trg(), cond() {} + Bigram(WordID prev, WordID cur, WordID t) : trg(t) { cond.first = prev; cond.second = cur; } + const pair& ConditioningPair() const { + return cond; + } + WordID& prev_src() { return cond.first; } + WordID& cur_src() { return cond.second; } + const WordID& prev_src() const { return cond.first; } + const WordID& cur_src() const { return cond.second; } + WordID trg; + private: + pair cond; +}; + +struct Unigram { + Unigram() : cur_src(), trg() {} + Unigram(WordID s, WordID t) : cur_src(s), trg(t) {} + WordID cur_src; + WordID trg; +}; + +ostream& operator<<(ostream& os, const Bigram& b) { + os << "( " << TD::Convert(b.trg) << " | " << TD::Convert(b.prev_src()) << " , " << TD::Convert(b.cur_src()) << " )"; + return os; +} + +ostream& operator<<(ostream& os, const Unigram& u) { + os << "( " << TD::Convert(u.trg) << " | " << TD::Convert(u.cur_src) << " )"; + return os; +} + +bool operator==(const Bigram& a, const Bigram& b) { + return a.trg == b.trg && a.cur_src() == b.cur_src() && a.prev_src() == b.prev_src(); +} + +bool operator==(const Unigram& a, const Unigram& b) { + return a.trg == b.trg && a.cur_src == b.cur_src; +} + +size_t hash_value(const Bigram& b) { + size_t h = boost::hash_value(b.prev_src()); + boost::hash_combine(h, boost::hash_value(b.cur_src())); + boost::hash_combine(h, boost::hash_value(b.trg)); + return h; +} + +size_t hash_value(const Unigram& u) { + size_t h = boost::hash_value(u.cur_src); + boost::hash_combine(h, boost::hash_value(u.trg)); + return h; +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector 
>* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct UnigramModel { + UnigramModel(size_t src_voc_size, size_t trg_voc_size) : + unigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), + p0(1.0 / trg_voc_size) {} + + void increment(const Bigram& b) { + unigrams[b.cur_src()].increment(b.trg); + } + + void decrement(const Bigram& b) { + unigrams[b.cur_src()].decrement(b.trg); + } + + double prob(const Bigram& b) const { + const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); + return q0; + } + + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < unigrams.size(); ++i) { + const CCRP_OneTable& crp = unigrams[i]; + if (crp.num_customers() > 0) { + llh += crp.log_crp_prob(); + llh += crp.num_tables() * log(p0); + } + } + return llh; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < unigrams.size(); ++i) + unigrams[i].resample_hyperparameters(rng); + } + + vector > unigrams; // unigrams[src].prob(trg, p0) = p(trg|src) + + const double p0; +}; + +struct BigramModel { + BigramModel(size_t src_voc_size, size_t trg_voc_size) : + unigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), + p0(1.0 / trg_voc_size) {} + + void increment(const Bigram& b) { + BigramMap::iterator it = bigrams.find(b.ConditioningPair()); + if (it == bigrams.end()) { + it = bigrams.insert(make_pair(b.ConditioningPair(), CCRP_OneTable(1,1,1,1))).first; + } + if (it->second.increment(b.trg)) + unigrams[b.cur_src()].increment(b.trg); + } + + void decrement(const Bigram& b) { + BigramMap::iterator it = bigrams.find(b.ConditioningPair()); + assert(it != bigrams.end()); + if (it->second.decrement(b.trg)) { + unigrams[b.cur_src()].decrement(b.trg); + if (it->second.num_customers() == 0) + bigrams.erase(it); + } + } + + double prob(const Bigram& b) const { + const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); + const BigramMap::const_iterator it = bigrams.find(b.ConditioningPair()); + if (it == bigrams.end()) return q0; + return it->second.prob(b.trg, q0); + } + + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < unigrams.size(); ++i) { + const CCRP_OneTable& crp = unigrams[i]; + if (crp.num_customers() > 0) { + llh += crp.log_crp_prob(); + llh += crp.num_tables() * log(p0); + } + } + for (BigramMap::const_iterator it = bigrams.begin(); it != bigrams.end(); ++it) { + const CCRP_OneTable& crp = it->second; + const WordID cur_src = it->first.second; + llh += crp.log_crp_prob(); + for (CCRP_OneTable::const_iterator bit = crp.begin(); bit != crp.end(); ++bit) { + llh += log(unigrams[cur_src].prob(bit->second, p0)); + } + } + return llh; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < unigrams.size(); ++i) + 
unigrams[i].resample_hyperparameters(rng); + for (BigramMap::iterator it = bigrams.begin(); it != bigrams.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + typedef unordered_map, CCRP_OneTable, boost::hash > > BigramMap; + BigramMap bigrams; // bigrams[(src-1,src)].prob(trg, q0) = p(trg|src,src-1) + vector > unigrams; // unigrams[src].prob(trg, p0) = p(trg|src) + + const double p0; +}; + +struct BigramAlignmentModel { + BigramAlignmentModel(size_t src_voc_size, size_t trg_voc_size) : bigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), p0(1.0 / src_voc_size) {} + void increment(WordID prev, WordID next) { + bigrams[prev].increment(next); // hierarchy? + } + void decrement(WordID prev, WordID next) { + bigrams[prev].decrement(next); // hierarchy? + } + double prob(WordID prev, WordID next) { + return bigrams[prev].prob(next, p0); + } + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < bigrams.size(); ++i) { + const CCRP_OneTable& crp = bigrams[i]; + if (crp.num_customers() > 0) { + llh += crp.log_crp_prob(); + llh += crp.num_tables() * log(p0); + } + } + return llh; + } + + vector > bigrams; // bigrams[prev].prob(next, p0) = p(next|prev) + const double p0; +}; + +struct Alignment { + vector a; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + + boost::shared_ptr prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + const size_t corpus_len = corpusf.size(); + const WordID kNULL = TD::Convert(""); + const WordID kBOS = TD::Convert(""); + const WordID kEOS = TD::Convert(""); + Bigram TT(kBOS, TD::Convert("我"), TD::Convert("i")); + Bigram TT2(kBOS, TD::Convert("要"), TD::Convert("i")); + + UnigramModel model(vocabf.size(), vocabe.size()); + vector alignments(corpus_len); + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + alg.resize(trg.size()); + int lenp1 = src.size() + 1; + WordID prev_src = kBOS; + for (int j = 0; j < trg.size(); ++j) { + int samp = lenp1 * rng.next(); + --samp; + if (samp < 0) samp = 255; + alg[j] = samp; + WordID cur_src = (samp == 255 ? kNULL : src[alg[j]]); + Bigram b(prev_src, cur_src, trg[j]); + model.increment(b); + prev_src = cur_src; + } + Bigram b(prev_src, kEOS, kEOS); + model.increment(b); + } + cerr << "Initial LLH: " << model.LogLikelihood() << endl; + + SampleSet ss; + for (unsigned si = 0; si < 50; ++si) { + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + WordID prev_src = kBOS; + for (unsigned j = 0; j < trg.size(); ++j) { + unsigned char& a_j = alg[j]; + WordID cur_e_a_j = (a_j == 255 ? 
kNULL : src[a_j]); + Bigram b(prev_src, cur_e_a_j, trg[j]); + //cerr << "DEC: " << b << "\t" << nextb << endl; + model.decrement(b); + ss.clear(); + for (unsigned i = 0; i <= src.size(); ++i) { + const WordID cur_src = (i ? src[i-1] : kNULL); + b.cur_src() = cur_src; + ss.add(model.prob(b)); + } + int sampled_a_j = rng.SelectSample(ss); + a_j = (sampled_a_j ? sampled_a_j - 1 : 255); + cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + b.cur_src() = cur_e_a_j; + //cerr << "INC: " << b << "\t" << nextb << endl; + model.increment(b); + prev_src = cur_e_a_j; + } + } + cerr << '.' << flush; + if (si % 10 == 9) { + cerr << "[LLH prev=" << model.LogLikelihood(); + //model.ResampleHyperparameters(&rng); + cerr << " new=" << model.LogLikelihood() << "]\n"; + //pair xx = make_pair(kBOS, TD::Convert("我")); + //PrintTopCustomers(model.bigrams.find(xx)->second); + cerr << "p(" << TT << ") = " << model.prob(TT) << endl; + cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; + PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); + } + } + { + // MODEL 2 + BigramModel model(vocabf.size(), vocabe.size()); + BigramAlignmentModel amodel(vocabf.size(), vocabe.size()); + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + WordID prev_src = kBOS; + for (int j = 0; j < trg.size(); ++j) { + WordID cur_src = (alg[j] == 255 ? kNULL : src[alg[j]]); + Bigram b(prev_src, cur_src, trg[j]); + model.increment(b); + amodel.increment(prev_src, cur_src); + prev_src = cur_src; + } + amodel.increment(prev_src, kEOS); + Bigram b(prev_src, kEOS, kEOS); + model.increment(b); + } + cerr << "Initial LLH: " << model.LogLikelihood() << " " << amodel.LogLikelihood() << endl; + + SampleSet ss; + for (unsigned si = 0; si < samples; ++si) { + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + WordID prev_src = kBOS; + for (unsigned j = 0; j < trg.size(); ++j) { + unsigned char& a_j = alg[j]; + WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + Bigram b(prev_src, cur_e_a_j, trg[j]); + WordID next_src = kEOS; + WordID next_trg = kEOS; + if (j < (trg.size() - 1)) { + next_src = (alg[j+1] == 255 ? kNULL : src[alg[j + 1]]); + next_trg = trg[j + 1]; + } + Bigram nextb(cur_e_a_j, next_src, next_trg); + //cerr << "DEC: " << b << "\t" << nextb << endl; + model.decrement(b); + model.decrement(nextb); + amodel.decrement(prev_src, cur_e_a_j); + amodel.decrement(cur_e_a_j, next_src); + ss.clear(); + for (unsigned i = 0; i <= src.size(); ++i) { + const WordID cur_src = (i ? src[i-1] : kNULL); + b.cur_src() = cur_src; + ss.add(model.prob(b) * model.prob(nextb) * amodel.prob(prev_src, cur_src) * amodel.prob(cur_src, next_src)); + //cerr << log(ss[ss.size() - 1]) << "\t" << b << endl; + } + int sampled_a_j = rng.SelectSample(ss); + a_j = (sampled_a_j ? sampled_a_j - 1 : 255); + cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + b.cur_src() = cur_e_a_j; + nextb.prev_src() = cur_e_a_j; + //cerr << "INC: " << b << "\t" << nextb << endl; + //exit(1); + model.increment(b); + model.increment(nextb); + amodel.increment(prev_src, cur_e_a_j); + amodel.increment(cur_e_a_j, next_src); + prev_src = cur_e_a_j; + } + } + cerr << '.' 
<< flush;
+ if (si % 10 == 9) {
+ cerr << "[LLH prev=" << (model.LogLikelihood() + amodel.LogLikelihood());
+ //model.ResampleHyperparameters(&rng);
+ cerr << " new=" << model.LogLikelihood() << "]\n";
+ pair<WordID,WordID> xx = make_pair(kBOS, TD::Convert("我"));
+ cerr << "p(" << TT << ") = " << model.prob(TT) << endl;
+ cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl;
+ pair<WordID,WordID> xx2 = make_pair(kBOS, TD::Convert("要"));
+ PrintTopCustomers(model.bigrams.find(xx)->second);
+ //PrintTopCustomers(amodel.bigrams[TD::Convert("")]);
+ //PrintTopCustomers(model.unigrams[TD::Convert("")]);
+ PrintAlignment(corpusf[0], corpuse[0], alignments[0].a);
+ }
+ }
+ }
+ return 0;
+}
+
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
new file mode 100644
index 00000000..c9764ad5
--- /dev/null
+++ b/gi/pf/Makefile.am
@@ -0,0 +1,21 @@
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
+
+noinst_LIBRARIES = libpf.a
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc
+
+itg_SOURCES = itg.cc
+
+dpnaive_SOURCES = dpnaive.cc
+
+pfdist_SOURCES = pfdist.cc
+
+pfnaive_SOURCES = pfnaive.cc
+
+cbgi_SOURCES = cbgi.cc
+
+brat_SOURCES = brat.cc
+
+pfbrat_SOURCES = pfbrat.cc
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/README b/gi/pf/README
new file mode 100644
index 00000000..62e47541
--- /dev/null
+++ b/gi/pf/README
@@ -0,0 +1,2 @@
+Experimental Bayesian alignment tools. Nothing to see here.
+
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
new file mode 100644
index 00000000..f8ddfd32
--- /dev/null
+++ b/gi/pf/base_measures.cc
@@ -0,0 +1,112 @@
+#include "base_measures.h"
+
+#include <iostream>
+
+#include "filelib.h"
+
+using namespace std;
+
+void Model1::LoadModel1(const string& fname) {
+ cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ while(getline(in, line)) {
+ ++lc;
+ int cur = 0;
+ int start = 0;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ const WordID src = TD::Convert(&line[0]);
+ ++cur;
+ start = cur;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ WordID trg = TD::Convert(&line[start]);
+ const double logprob = strtod(&line[cur + 1], NULL);
+ if (src >= ttable.size()) ttable.resize(src + 1);
+ ttable[src][trg].logeq(logprob);
+ }
+ cerr << " read " << lc << " parameters.\n";
+}
+
+prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
+ const WordID trg = vtrg[i + start_trg];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+ tp += kM1MIXTURE * model1(src, trg);
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ if (p.is_0()) {
+ cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+ abort();
+ }
+ return p;
+}
+
+prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1)
+ // elen | flen ~Pois(flen + 0.01)
+ prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+ p *= ptrglen;
+ p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform
+ for (int i = 0; i < elen; ++i) { // for each position i in E
+ const WordID trg = vtrg[i + start_trg];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+ tp += kM1MIXTURE * model1(src, trg);
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ if (p.is_0()) {
+ cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+ abort();
+ }
+ return p;
+}
+
+JumpBase::JumpBase() : p(200) {
+ for (unsigned src_len = 1; src_len < 200; ++src_len) {
+ map<int, prob_t>& cpd = p[src_len];
+ int min_jump = 1 - static_cast<int>(src_len);
+ int max_jump = src_len;
+ prob_t z;
+ for (int j = min_jump; j <= max_jump; ++j) {
+ prob_t& cp = cpd[j];
+ if (j < 0)
+ cp.logeq(log_poisson(1.5-j, 1));
+ else if (j > 0)
+ cp.logeq(log_poisson(j, 1));
+ cp.poweq(0.2);
+ z += cp;
+ }
+ for (int j = min_jump; j <= max_jump; ++j) {
+ cpd[j] /= z;
+ }
+ }
+}
+
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
new file mode 100644
index 00000000..df17aa62
--- /dev/null
+++ b/gi/pf/base_measures.h
@@ -0,0 +1,116 @@
+#ifndef _BASE_MEASURES_H_
+#define _BASE_MEASURES_H_
+
+#include <vector>
+#include <map>
+#include <string>
+#include <cmath>
+#include <cassert>
+#include <iostream>
+
+#include "trule.h"
+#include "prob.h"
+#include "tdict.h"
+
+inline double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
+ os << '[';
+ for (int i = 0; i < p.size(); ++i)
+ os << (i==0 ? "" : " ") << TD::Convert(p[i]);
+ return os << ']';
+}
+
+struct Model1 {
+ explicit Model1(const std::string& fname) :
+ kNULL(TD::Convert("<eps>")),
+ kZERO() {
+ LoadModel1(fname);
+ }
+
+ void LoadModel1(const std::string& fname);
+
+ // returns prob 0 if src or trg is not found
+ const prob_t& operator()(WordID src, WordID trg) const {
+ if (src == 0) src = kNULL;
+ if (src < ttable.size()) {
+ const std::map<WordID, prob_t>& cpd = ttable[src];
+ const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg);
+ if (it != cpd.end())
+ return it->second;
+ }
+ return kZERO;
+ }
+
+ const WordID kNULL;
+ const prob_t kZERO;
+ std::vector<std::map<WordID, prob_t> > ttable;
+};
+
+struct PhraseConditionalBase {
+ explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
+ model1(m1),
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_TARGET(1.0 / vocab_e_size) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ }
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const Model1& model1;
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_TARGET;
+};
+
+struct PhraseJointBase {
+ explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) :
+ model1(m1),
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_SOURCE(1.0 / vocab_f_size),
+ kUNIFORM_TARGET(1.0 / vocab_e_size) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ }
+
+ // return joint p0 of (rule.f_, rule.e_)
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const Model1& model1;
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_SOURCE;
+ const prob_t kUNIFORM_TARGET;
+};
+
+// base distribution for jump size multinomials
+// basically p(0) = 0, p(+/-1) is largest, and the probability decays
+// as you move toward the maximum jump distance
+struct JumpBase {
+ JumpBase();
+
+ const prob_t& operator()(int jump, unsigned src_len) const {
+ assert(jump != 0);
+ const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump);
+ assert(it != p[src_len].end());
+ return it->second;
+ }
+ std::vector<std::map<int, prob_t> > p;
+};
+
+
+#endif
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
new file mode 100644
index 00000000..4c6ba3ef
--- /dev/null
+++ b/gi/pf/brat.cc
@@ -0,0 +1,554 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "cfg_wfst_composer.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+static unsigned kMAX_SRC_PHRASE;
+static unsigned kMAX_TRG_PHRASE;
+struct FSTState;
+
+size_t hash_value(const TRule& r) {
+ size_t h = 2 - r.lhs_;
+ boost::hash_combine(h, boost::hash_value(r.e_));
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+struct ConditionalBase {
+ explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) :
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_TARGET(1.0 / vocab_e_size),
+ kNULL(TD::Convert("<eps>")) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ LoadModel1(model1fname);
+ }
+
+ void LoadModel1(const string& fname) {
+ cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ while(getline(in, line)) {
+ ++lc;
+ int cur = 0;
+ int start = 0;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ const WordID src = TD::Convert(&line[0]);
+ ++cur;
+ start = cur;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ WordID trg = TD::Convert(&line[start]);
+ const double logprob = strtod(&line[cur + 1], NULL);
+ if (src >= ttable.size()) ttable.resize(src + 1);
+ ttable[src][trg].logeq(logprob);
+ }
+ cerr << " read " << lc << " parameters.\n";
+ }
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ const int flen = rule.f_.size();
+ const int elen = rule.e_.size();
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
+ const WordID trg = rule.e_[i];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? kNULL : rule.f_[j];
+ if (src < ttable.size()) { // guard against words absent from the model 1 table
+ const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg);
+ if (it != ttable[src].end()) {
+ tp += kM1MIXTURE * it->second;
+ }
+ }
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ return p;
+ }
+
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_TARGET;
+ const WordID kNULL;
+ vector<map<WordID, prob_t> > ttable;
+};
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
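+ // typical invocation (flags as defined above): brat -i corpus.txt -m model1.params -s 1000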
+ if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct UniphraseLM { + UniphraseLM(const vector >& corpus, + const set& vocab, + const po::variables_map& conf) : + phrases_(1,1), + gen_(1,1), + corpus_(corpus), + uniform_word_(1.0 / vocab.size()), + gen_p0_(0.5), + p_end_(0.5), + use_poisson_(conf.count("poisson_length") > 0) {} + + void ResampleHyperparameters(MT19937* rng) { + phrases_.resample_hyperparameters(rng); + gen_.resample_hyperparameters(rng); + cerr << " " << phrases_.concentration(); + } + + CCRP_NoTable > phrases_; + CCRP_NoTable gen_; + vector > z_; // z_[i] is there a phrase boundary after the ith word + const vector >& corpus_; + const double uniform_word_; + const double gen_p0_; + const double p_end_; // in base length distribution, p of the end of a phrase + const bool use_poisson_; +}; + +struct Reachability { + boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? 
+ boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+
+ Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+ edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
+ max_src_delta(boost::extents[srclen][trglen]) {
+ ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+ }
+
+ private:
+ struct SState {
+ SState() : prev_src_covered(), prev_trg_covered() {}
+ SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
+ int prev_src_covered;
+ int prev_trg_covered;
+ };
+
+ struct NState {
+ NState() : next_src_covered(), next_trg_covered() {}
+ NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {}
+ int next_src_covered;
+ int next_trg_covered;
+ };
+
+ void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
+ typedef boost::multi_array<vector<SState>, 2> array_type;
+ array_type a(boost::extents[srclen + 1][trglen + 1]);
+ a[0][0].push_back(SState());
+ for (int i = 0; i < srclen; ++i) {
+ for (int j = 0; j < trglen; ++j) {
+ if (a[i][j].size() == 0) continue;
+ const SState prev(i,j);
+ for (int k = 1; k <= src_max_phrase_len; ++k) {
+ if ((i + k) > srclen) continue;
+ for (int l = 1; l <= trg_max_phrase_len; ++l) {
+ if ((j + l) > trglen) continue;
+ a[i + k][j + l].push_back(prev);
+ }
+ }
+ }
+ }
+ a[0][0].clear();
+ cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+ assert(a[srclen][trglen].size() > 0);
+
+ typedef boost::multi_array<bool, 2> rarray_type;
+ rarray_type r(boost::extents[srclen + 1][trglen + 1]);
+// typedef boost::multi_array<vector<NState>, 2> narray_type;
+// narray_type b(boost::extents[srclen + 1][trglen + 1]);
+ r[srclen][trglen] = true;
+ for (int i = srclen; i >= 0; --i) {
+ for (int j = trglen; j >= 0; --j) {
+ vector<SState>& prevs = a[i][j];
+ if (!r[i][j]) { prevs.clear(); }
+// const NState nstate(i,j);
+ for (int k = 0; k < prevs.size(); ++k) {
+ r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
+ int src_delta = i - prevs[k].prev_src_covered;
+ edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+ short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
+ if (src_delta > msd) msd = src_delta;
+// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate);
+ }
+ }
+ }
+ assert(!edges[0][0][1][0]);
+ assert(!edges[0][0][0][1]);
+ assert(!edges[0][0][0][0]);
+ cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl;
+ assert(max_src_delta[0][0] > 0);
+ //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
+ //for (int i = 0; i < b[0][0].size(); ++i) {
+ // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
+ //}
+ }
+};
+
+ostream& operator<<(ostream& os, const FSTState& q);
+struct FSTState {
+ explicit FSTState(int src_size) :
+ trg_covered_(),
+ src_covered_(),
+ src_coverage_(src_size) {}
+
+ FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) :
+ trg_covered_(trg_covered),
+ src_covered_(src_covered),
+ src_coverage_(src_coverage),
+ src_prefix_(src_prefix) {
+ if (src_coverage_.size() == src_covered) {
+ assert(src_prefix.size() == 0);
+ }
+ }
+
+ // if we extend by the word at src_position, what are
+ // the next states that are reachable and lie on a valid
+ // path to the final state?
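+ // (two kinds of extension are possible: close the open source phrase, with
+ // src_position as its last word, at a target span licensed by r.edges and
+ // emit a rule, or keep the phrase open by appending src_position to
+ // src_prefix_ while r.max_src_delta allows it)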
+ vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const {
+ assert(src_position < src_coverage_.size());
+ if (src_coverage_[src_position]) {
+ cerr << "Trying to extend " << *this << " with position " << src_position << endl;
+ abort();
+ }
+ vector<bool> ncvg = src_coverage_;
+ ncvg[src_position] = true;
+
+ vector<FSTState> res;
+ const int trg_remaining = trg_len - trg_covered_;
+ if (trg_remaining <= 0) {
+ cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl;
+ abort();
+ }
+ const int src_remaining = src_len - src_covered_;
+ if (src_remaining <= 0) {
+ cerr << "Source appears to have been covered: " << *this << endl;
+ abort();
+ }
+
+ for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) {
+ if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) {
+ int nc = src_prefix_.size() + 1 + src_covered_;
+ res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>()));
+ }
+ }
+
+ if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) {
+ vector<short> nsp = src_prefix_;
+ nsp.push_back(src_position);
+ res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp));
+ }
+
+ if (res.size() == 0) {
+ cerr << *this << " can't be extended!\n";
+ abort();
+ }
+ return res;
+ }
+
+ short trg_covered_, src_covered_;
+ vector<bool> src_coverage_;
+ vector<short> src_prefix_;
+};
+bool operator<(const FSTState& q, const FSTState& r) {
+ if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_;
+ if (q.src_covered_ != r.src_covered_) return q.src_covered_ < r.src_covered_;
+ if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_;
+ return q.src_prefix_ < r.src_prefix_;
+}
+
+ostream& operator<<(ostream& os, const FSTState& q) {
+ os << "[" << q.trg_covered_ << " : ";
+ for (int i = 0; i < q.src_coverage_.size(); ++i)
+ os << q.src_coverage_[i];
+ os << " : <";
+ for (int i = 0; i < q.src_prefix_.size(); ++i) {
+ if (i != 0) os << ' ';
+ os << q.src_prefix_[i];
+ }
+ return os << ">]";
+}
+
+struct MyModel {
+ MyModel(ConditionalBase& rcp0) : rp0(rcp0) {}
+ typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap;
+
+ void DecrementRule(const TRule& rule) {
+ SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
+ assert(it != rules.end());
+ it->second.decrement(rule);
+ if (it->second.num_customers() == 0) rules.erase(it);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
+ if (it == rules.end()) {
+ CCRP_NoTable<TRule> crp(1,1);
+ it = rules.insert(make_pair(rule.f_, crp)).first;
+ }
+ it->second.increment(rule);
+ }
+
+ // conditioned on rule.f_
+ prob_t RuleConditionalProbability(const TRule& rule) const {
+ const prob_t base = rp0(rule);
+ SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_);
+ if (it == rules.end()) {
+ return base;
+ } else {
+ const double lp = it->second.logprob(rule, log(base));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+ }
+
+ const ConditionalBase& rp0;
+ SrcToRuleCRPMap rules;
+};
+
+struct MyFST : public WFST {
+ MyFST(const vector<int>& ssrc, const vector<int>& strg, MyModel* m) :
+ src(ssrc), trg(strg),
+ r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE),
+ model(m) {
+ FSTState in(src.size());
+ cerr << " INIT: " << in << endl;
+ init = GetNode(in);
+ for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true;
+ in.src_covered_ = src.size();
+ in.trg_covered_ = trg.size();
+ cerr << "FINAL: " << in << endl;
+ final = GetNode(in);
+ }
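+ // WFST interface; states are FSTStates, and their WFSTNode wrappers are
+ // created lazily by GetNode and owned by the map m below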
+ virtual const WFSTNode* Final() const;
+ virtual const WFSTNode* Initial() const;
+
+ const WFSTNode* GetNode(const FSTState& q);
+ map<FSTState, boost::shared_ptr<WFSTNode> > m;
+ const vector<int>& src;
+ const vector<int>& trg;
+ Reachability r;
+ const WFSTNode* init;
+ const WFSTNode* final;
+ MyModel* model;
+};
+
+struct MyNode : public WFSTNode {
+ MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {}
+ virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const;
+ const FSTState state;
+ mutable MyFST* container;
+};
+
+vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const {
+ cerr << "EXTEND " << state << " with " << srcindex << endl;
+ vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r);
+ vector<pair<const WFSTNode*, TRulePtr> > res(ext.size());
+ for (unsigned i = 0; i < ext.size(); ++i) {
+ res[i].first = container->GetNode(ext[i]);
+ if (ext[i].src_prefix_.size() == 0) {
+ const unsigned trg_from = state.trg_covered_;
+ const unsigned trg_to = ext[i].trg_covered_;
+ const unsigned prev_prfx_size = state.src_prefix_.size();
+ res[i].second.reset(new TRule);
+ res[i].second->lhs_ = -TD::Convert("X");
+ vector<WordID>& src = res[i].second->f_;
+ vector<WordID>& trg = res[i].second->e_;
+ src.resize(prev_prfx_size + 1);
+ for (unsigned j = 0; j < prev_prfx_size; ++j)
+ src[j] = container->src[state.src_prefix_[j]];
+ src[prev_prfx_size] = container->src[srcindex];
+ for (unsigned j = trg_from; j < trg_to; ++j)
+ trg.push_back(container->trg[j]);
+ res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second)));
+ }
+ }
+ return res;
+}
+
+const WFSTNode* MyFST::GetNode(const FSTState& q) {
+ boost::shared_ptr<WFSTNode>& res = m[q];
+ if (!res) {
+ res.reset(new MyNode(q, this));
+ }
+ return &*res;
+}
+
+const WFSTNode* MyFST::Final() const {
+ return final;
+}
+
+const WFSTNode* MyFST::Initial() const {
+ return init;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+ cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ shared_ptr<MT19937> prng;
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
+ vocabe.size(),
+ conf["model1"].as<string>());
+ MyModel m(lp0);
+
+ TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
+ m.IncrementRule(x);
+ TRule y("[X] ||| nY dyN ||| gave ||| 0");
+ m.IncrementRule(y);
+
+
+ MyFST fst(corpusf[0], corpuse[0], &m);
+ ifstream in("./kimura.g");
+ assert(in);
+ CFG_WFSTComposer comp(fst);
+ Hypergraph hg;
+ bool succeed = comp.Compose(&in, &hg);
+ hg.PrintGraphviz();
+ if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+
+#if 0
+ ifstream in2("./amnabooks.g");
+ assert(in2);
+ MyFST fst2(corpusf[1], corpuse[1], &m);
+ CFG_WFSTComposer comp2(fst2);
+ Hypergraph hg2;
+ bool succeed2 = comp2.Compose(&in2, &hg2);
+ if
(succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } +#endif + + SparseVector w; w.set_value(FD::Convert("Proposal"), 1.0); + hg.Reweight(w); + cerr << ViterbiFTree(hg) << endl; + return 0; +} + diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc new file mode 100644 index 00000000..20204e8a --- /dev/null +++ b/gi/pf/cbgi.cc @@ -0,0 +1,340 @@ +#include +#include +#include + +#include +#include + +#include "sampler.h" +#include "filelib.h" +#include "hg_io.h" +#include "hg.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "inside_outside.h" + +using namespace std; +using namespace std::tr1; + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +double log_decay(unsigned x, const double& b) { + assert(b > 1.0); + assert(x > 0); + return log(b - 1) - x * log(b); +} + +size_t hash_value(const TRule& r) { + // TODO fix hash function + size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_; + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +struct SimpleBase { + SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : + uniform_e(-log(esize)), + uniform_f(-log(fsize)), + uniform_nt(-log(ntsize)) { + } + + // binomial coefficient + static double choose(unsigned n, unsigned k) { + return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)); + } + + // count the number of patterns of terminals and NTs in the rule, given elen and flen + static double log_number_of_patterns(const unsigned flen, const unsigned elen) { + static vector > counts; + if (elen >= counts.size()) counts.resize(elen + 1); + if (flen >= counts[elen].size()) counts[elen].resize(flen + 1); + double& count = counts[elen][flen]; + if (count) return log(count); + const unsigned max_arity = min(elen, flen); + for (unsigned a = 0; a <= max_arity; ++a) + count += choose(elen, a) * choose(flen, a); + return log(count); + } + + // return logp0 of rule | LHS + double operator()(const TRule& rule) const { + const unsigned flen = rule.f_.size(); + const unsigned elen = rule.e_.size(); +#if 0 + double p = 0; + p += log_poisson(flen, 0.5); // flen ~Pois(0.5) + p += log_poisson(elen, flen); // elen | flen ~Pois(flen) + p -= log_number_of_patterns(flen, elen); // pattern | flen,elen ~Uniform + for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS + if (rule.f_[i] <= 0) // according to pattern + p += uniform_nt; // draw NT ~Uniform + else + p += uniform_f; // draw f terminal ~Uniform + } + p -= lgamma(rule.Arity() + 1); // draw permutation ~Uniform + for (unsigned i = 0; i < elen; ++i) { // for each position in e-RHS + if (rule.e_[i] > 0) // according to pattern + p += uniform_e; // draw e|f term ~Uniform + // TODO this should prob be model 1 + } +#else + double p = 0; + bool is_abstract = rule.f_[0] <= 0; + p += log(0.5); + if (is_abstract) { + if (flen == 2) p += log(0.99); else p += log(0.01); + } else { + p += log_decay(flen, 3); + } + + for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS + if (rule.f_[i] <= 0) // according to pattern + p += uniform_nt; // draw NT ~Uniform + else + p += uniform_f; // draw f terminal ~Uniform + } +#endif + return p; + } + const double uniform_e; + const double uniform_f; + const double uniform_nt; + vector arities; +}; + +MT19937* rng = NULL; + +template +struct MHSamplerEdgeProb { + MHSamplerEdgeProb(const Hypergraph& hg, + const map >& rdp, + const Base& logp0, + 
const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) { + for (int i = 0; i < edge_probs.size(); ++i) { + const TRule& rule = *hg.edges_[i].rule_; + const map >::const_iterator it = rdp.find(rule.lhs_); + assert(it != rdp.end()); + const CCRP_NoTable& crp = it->second; + edge_probs[i].logeq(crp.logprob(rule, logp0(rule))); + if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1) + edge_probs[i] = prob_t::Zero(); + } + } + inline prob_t operator()(const Hypergraph::Edge& e) const { + return edge_probs[e.id_]; + } + prob_t DerivationProb(const vector& d) const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < d.size(); ++i) + p *= edge_probs[d[i]]; + return p; + } + vector edge_probs; +}; + +template +struct ModelAndData { + ModelAndData() : + base_lh(prob_t::One()), + logp0(10000, 10000), + mh_samples(), + mh_rejects() {} + + void SampleCorpus(const string& hgpath, int i); + void ResampleHyperparameters() { + for (map >::iterator it = rules.begin(); it != rules.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + CCRP_NoTable& RuleCRP(int lhs) { + map >::iterator it = rules.find(lhs); + if (it == rules.end()) { + rules.insert(make_pair(lhs, CCRP_NoTable(1,1))); + it = rules.find(lhs); + } + return it->second; + } + + void IncrementRule(const TRule& rule) { + CCRP_NoTable& crp = RuleCRP(rule.lhs_); + if (crp.increment(rule)) { + prob_t p; p.logeq(logp0(rule)); + base_lh *= p; + } + } + + void DecrementRule(const TRule& rule) { + CCRP_NoTable& crp = RuleCRP(rule.lhs_); + if (crp.decrement(rule)) { + prob_t p; p.logeq(logp0(rule)); + base_lh /= p; + } + } + + void DecrementDerivation(const Hypergraph& hg, const vector& d) { + for (unsigned i = 0; i < d.size(); ++i) { + const TRule& rule = *hg.edges_[d[i]].rule_; + DecrementRule(rule); + } + } + + void IncrementDerivation(const Hypergraph& hg, const vector& d) { + for (unsigned i = 0; i < d.size(); ++i) { + const TRule& rule = *hg.edges_[d[i]].rule_; + IncrementRule(rule); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (map >::const_iterator it = rules.begin(); it != rules.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + p *= base_lh; + return p; + } + + void ResampleDerivation(const Hypergraph& hg, vector* sampled_derivation); + + map > rules; // [lhs] -> distribution over RHSs + prob_t base_lh; + SimpleBase logp0; + vector > samples; // sampled derivations + unsigned int mh_samples; + unsigned int mh_rejects; +}; + +template +void ModelAndData::SampleCorpus(const string& hgpath, int n) { + vector hgs(n); hgs.clear(); + boost::unordered_map acc; + map tot; + for (int i = 0; i < n; ++i) { + ostringstream os; + os << hgpath << '/' << i << ".json.gz"; + if (!FileExists(os.str())) continue; + hgs.push_back(Hypergraph()); + ReadFile rf(os.str()); + HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back()); + } + cerr << "Read " << hgs.size() << " alignment hypergraphs.\n"; + samples.resize(hgs.size()); + const unsigned SAMPLES = 2000; + const unsigned burnin = 3 * SAMPLES / 4; + const unsigned every = 20; + for (unsigned s = 0; s < SAMPLES; ++s) { + if (s % 10 == 0) { + if (s > 0) { cerr << endl; ResampleHyperparameters(); } + cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] "; + } + cerr << '.'; + for (unsigned i = 0; i < hgs.size(); ++i) { + ResampleDerivation(hgs[i], &samples[i]); + if (s > burnin && s % every == 0) { + for 
(unsigned j = 0; j < samples[i].size(); ++j) { + const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_; + ++acc[rule]; + ++tot[rule.lhs_]; + } + } + } + } + cerr << endl; + for (boost::unordered_map::iterator it = acc.begin(); it != acc.end(); ++it) { + cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl; + } +} + +template +void ModelAndData::ResampleDerivation(const Hypergraph& hg, vector* sampled_deriv) { + vector cur; + cur.swap(*sampled_deriv); + + const prob_t p_cur = Likelihood(); + DecrementDerivation(hg, cur); + if (cur.empty()) { + // first iteration, create restaurants + for (int i = 0; i < hg.edges_.size(); ++i) + RuleCRP(hg.edges_[i].rule_->lhs_); + } + MHSamplerEdgeProb wf(hg, rules, logp0, cur.empty()); +// MHSamplerEdgeProb wf(hg, rules, logp0, false); + const prob_t q_cur = wf.DerivationProb(cur); + vector node_probs; + Inside >(hg, &node_probs, wf); + queue q; + q.push(hg.nodes_.size() - 3); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = wf.edge_probs[edge.id_]; // edge proposal prob + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + IncrementDerivation(hg, *sampled_deriv); + +// cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n"; +// cerr << "DERIV:\n"; +// for (int i = 0; i < sampled_deriv->size(); ++i) { +// cerr << " " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl; +// } + + if (cur.empty()) return; // accept first sample + + ++mh_samples; + // only need to do MH if proposal is different to current state + if (cur != *sampled_deriv) { + const prob_t q_prop = wf.DerivationProb(*sampled_deriv); + const prob_t p_prop = Likelihood(); + if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) { + ++mh_rejects; + DecrementDerivation(hg, *sampled_deriv); + IncrementDerivation(hg, cur); + swap(cur, *sampled_deriv); + } + } +} + +int main(int argc, char** argv) { + rng = new MT19937; + ModelAndData m; + m.SampleCorpus("./hgs", 50); + // m.SampleCorpus("./btec/hgs", 5000); + return 0; +} + diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc new file mode 100644 index 00000000..a31b5be8 --- /dev/null +++ b/gi/pf/cfg_wfst_composer.cc @@ -0,0 +1,730 @@ +#include "cfg_wfst_composer.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "fast_lexical_cast.hpp" + +#include "phrasetable_fst.h" +#include "sparse_vector.h" +#include "tdict.h" +#include "hg.h" + +using 
boost::shared_ptr; +namespace po = boost::program_options; +using namespace std; +using namespace std::tr1; + +WFSTNode::~WFSTNode() {} +WFST::~WFST() {} + +// Define the following macro if you want to see lots of debugging output +// when you run the chart parser +#undef DEBUG_CHART_PARSER + +// A few constants used by the chart parser /////////////// +static const int kMAX_NODES = 2000000; +static const string kPHRASE_STRING = "X"; +static bool constants_need_init = true; +static WordID kUNIQUE_START; +static WordID kPHRASE; +static TRulePtr kX1X2; +static TRulePtr kX1; +static WordID kEPS; +static TRulePtr kEPSRule; + +static void InitializeConstants() { + if (constants_need_init) { + kPHRASE = TD::Convert(kPHRASE_STRING) * -1; + kUNIQUE_START = TD::Convert("S") * -1; + kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); + kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); + kEPSRule.reset(new TRule("[X] ||| ||| ")); + kEPS = TD::Convert(""); + constants_need_init = false; + } +} +//////////////////////////////////////////////////////////// + +class EGrammarNode { + friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); + friend void AddGrammarRule(const string& r, map* g); + public: +#ifdef DEBUG_CHART_PARSER + string hint; +#endif + EGrammarNode() : is_some_rule_complete(false), is_root(false) {} + const map& GetTerminals() const { return tptr; } + const map& GetNonTerminals() const { return ntptr; } + bool HasNonTerminals() const { return (!ntptr.empty()); } + bool HasTerminals() const { return (!tptr.empty()); } + bool RuleCompletes() const { + return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); + } + bool GrammarContinues() const { + return !(ntptr.empty() && tptr.empty()); + } + bool IsRoot() const { + return is_root; + } + // these are the features associated with the rule from the start + // node up to this point. If you use these features, you must + // not Extend() this rule. 
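+ // (an EGrammarNode is a trie over rule right-hand sides: tptr indexes
+ // terminal extensions, ntptr indexes non-terminal extensions, and a
+ // completed rule deposits its features at the node where it ends)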
+ const SparseVector& GetCFGProductionFeatures() const { + return input_features; + } + + const EGrammarNode* Extend(const WordID& t) const { + if (t < 0) { + map::const_iterator it = ntptr.find(t); + if (it == ntptr.end()) return NULL; + return &it->second; + } else { + map::const_iterator it = tptr.find(t); + if (it == tptr.end()) return NULL; + return &it->second; + } + } + + private: + map tptr; + map ntptr; + SparseVector input_features; + bool is_some_rule_complete; + bool is_root; +}; +typedef map EGrammar; // indexed by the rule LHS + +// edges are immutable once created +struct Edge { +#ifdef DEBUG_CHART_PARSER + static int id_count; + const int id; +#endif + const WordID cat; // lhs side of rule proved/being proved + const EGrammarNode* const dot; // dot position + const WFSTNode* const q; // start of span + const WFSTNode* const r; // end of span + const Edge* const active_parent; // back pointer, NULL for PREDICT items + const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items + TRulePtr tps; // translations + shared_ptr > features; // features from CFG rule + + bool IsPassive() const { + // when a rule is completed, this value will be set + return static_cast(features); + } + bool IsActive() const { return !IsPassive(); } + bool IsInitial() const { + return !(active_parent || passive_parent); + } + bool IsCreatedByScan() const { + return active_parent && !passive_parent && !dot->IsRoot(); + } + bool IsCreatedByPredict() const { + return dot->IsRoot(); + } + bool IsCreatedByComplete() const { + return active_parent && passive_parent; + } + + // constructor for PREDICT + Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {} + Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {} + + // constructors for SCAN + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const TRulePtr& translations) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} + + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const TRulePtr& translations, + const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), + features(new SparseVector(feats)) {} + + // constructors for COMPLETE + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const Edge *pas_par) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() { + assert(pas_par->IsPassive()); + assert(act_par->IsActive()); + } + + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const Edge *pas_par, const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(), + features(new SparseVector(feats)) { + assert(pas_par->IsPassive()); + assert(act_par->IsActive()); + } + + // constructor for COMPLETE query + 
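// (the query constructors below build skeleton edges that carry only q or r;
+ // they serve as probe keys for the hashed active/passive edge multisets and
+ // are never added to the agenda)
+ 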
Edge(const WFSTNode* _r) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(NULL), + r(_r), active_parent(NULL), passive_parent(NULL), tps() {} + // constructor for MERGE quere + Edge(const WFSTNode* _q, int) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(_q), + r(NULL), active_parent(NULL), passive_parent(NULL), tps() {} +}; +#ifdef DEBUG_CHART_PARSER +int Edge::id_count = 0; +#endif + +ostream& operator<<(ostream& os, const Edge& e) { + string type = "PREDICT"; + if (e.IsCreatedByScan()) + type = "SCAN"; + else if (e.IsCreatedByComplete()) + type = "COMPLETE"; + os << "[" +#ifdef DEBUG_CHART_PARSER + << '(' << e.id << ") " +#else + << '(' << &e << ") " +#endif + << "q=" << e.q << ", r=" << e.r + << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" + << e.dot +#ifdef DEBUG_CHART_PARSER + << e.dot->hint +#endif + << (e.IsActive() ? ", Active" : ", Passive") + << ", " << type; +#ifdef DEBUG_CHART_PARSER + if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } + if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } +#endif + if (e.tps) { os << ", tps=" << e.tps->AsString(); } + return os << ']'; +} + +struct Traversal { + const Edge* const edge; // result from the active / passive combination + const Edge* const active; + const Edge* const passive; + Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} +}; + +struct UniqueTraversalHash { + size_t operator()(const Traversal* t) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(t->active); + x = ((x << 5) + x) ^ reinterpret_cast(t->passive); + x = ((x << 5) + x) ^ t->edge->IsActive(); + return x; + } +}; + +struct UniqueTraversalEquals { + size_t operator()(const Traversal* a, const Traversal* b) const { + return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); + } +}; + +struct UniqueEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + if (e->IsActive()) { + x = ((x << 5) + x) ^ reinterpret_cast(e->dot); + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + x += 13; + } else { // with passive edges, we don't care about the dot + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + } + return x; + } +}; + +struct UniqueEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + if (a->IsActive() != b->IsActive()) return false; + if (a->IsActive()) { + return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); + } else { + return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); + } + } +}; + +struct REdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + return x; + } +}; + +struct REdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->r == b->r); + } +}; + +struct QEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + return x; + } +}; + +struct QEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->q == b->q); + } +}; + +struct EdgeQueue { + queue q; + EdgeQueue() {} + void clear() { while(!q.empty()) q.pop(); } + bool HasWork() const { return !q.empty(); } + const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } + 
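// plain FIFO work list; edge deduplication happens in
+ // CFG_WFSTComposerImpl::FinishEdge, not here
+ 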
void AddEdge(const Edge* s) { q.push(s); } +}; + +class CFG_WFSTComposerImpl { + public: + CFG_WFSTComposerImpl(WordID start_cat, + const WFSTNode* q_0, + const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {} + + // returns false if the intersection is empty + bool Compose(const EGrammar& g, Hypergraph* forest) { + goal_node = NULL; + EGrammar::const_iterator sit = g.find(start_cat_); + forest->ReserveNodes(kMAX_NODES); + assert(sit != g.end()); + Edge* init = new Edge(start_cat_, &sit->second, q_0_); + assert(IncorporateNewEdge(init)); + while (exp_agenda.HasWork() || agenda.HasWork()) { + while(exp_agenda.HasWork()) { + const Edge* edge = exp_agenda.Next(); + FinishEdge(edge, forest); + } + if (agenda.HasWork()) { + const Edge* edge = agenda.Next(); +#ifdef DEBUG_CHART_PARSER + cerr << "processing (" << edge->id << ')' << endl; +#endif + if (edge->IsActive()) { + if (edge->dot->HasTerminals()) + DoScan(edge); + if (edge->dot->HasNonTerminals()) { + DoMergeWithPassives(edge); + DoPredict(edge, g); + } + } else { + DoComplete(edge); + } + } + } + if (goal_node) { + forest->PruneUnreachable(goal_node->id_); + forest->EpsilonRemove(kEPS); + } + FreeAll(); + return goal_node; + } + + void FreeAll() { + for (int i = 0; i < free_list_.size(); ++i) + delete free_list_[i]; + free_list_.clear(); + for (int i = 0; i < traversal_free_list_.size(); ++i) + delete traversal_free_list_[i]; + traversal_free_list_.clear(); + all_traversals.clear(); + exp_agenda.clear(); + agenda.clear(); + tps2node.clear(); + edge2node.clear(); + all_edges.clear(); + passive_edges.clear(); + active_edges.clear(); + } + + ~CFG_WFSTComposerImpl() { + FreeAll(); + } + + // returns the total number of edges created during composition + int EdgesCreated() const { + return free_list_.size(); + } + + private: + void DoScan(const Edge* edge) { + // here, we assume that the FST will potentially have many more outgoing + // edges than the grammar, which will be just a couple. If you want to + // efficiently handle the case where both are relatively large, this code + // will need to change how the intersection is done. The best general + // solution would probably be the Baeza-Yates double binary search. + + const EGrammarNode* dot = edge->dot; + const WFSTNode* r = edge->r; + const map& terms = dot->GetTerminals(); + for (map::const_iterator git = terms.begin(); + git != terms.end(); ++git) { + + if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) { + std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl; + abort(); + } + std::vector > extensions = r->ExtendInput(atoi(TD::Convert(git->first))); + for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) { + const WFSTNode* next_r = extensions[nsi].first; + const EGrammarNode* next_dot = &git->second; + const bool grammar_continues = next_dot->GrammarContinues(); + const bool rule_completes = next_dot->RuleCompletes(); + if (extensions[nsi].second) + cerr << "!!! 
" << extensions[nsi].second->AsString() << endl; + // cerr << " rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl; + assert(grammar_continues || rule_completes); + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + if (rule_completes) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features)); + if (grammar_continues) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second)); + } + } + } + + void DoPredict(const Edge* edge, const EGrammar& g) { + const EGrammarNode* dot = edge->dot; + const map& non_terms = dot->GetNonTerminals(); + for (map::const_iterator git = non_terms.begin(); + git != non_terms.end(); ++git) { + const WordID nt_to_predict = git->first; + //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; + EGrammar::const_iterator egi = g.find(nt_to_predict); + if (egi == g.end()) { + cerr << "[ERROR] Can't find any grammar rules with a LHS of type " + << TD::Convert(-1*nt_to_predict) << '!' << endl; + continue; + } + assert(edge->IsActive()); + const EGrammarNode* new_dot = &egi->second; + Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); + IncorporateNewEdge(new_edge); + } + } + + void DoComplete(const Edge* passive) { +#ifdef DEBUG_CHART_PARSER + cerr << " complete: " << *passive << endl; +#endif + const WordID completed_nt = passive->cat; + const WFSTNode* q = passive->q; + const WFSTNode* next_r = passive->r; + const Edge query(q); + const pair::iterator, + unordered_multiset::iterator > p = + active_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* active = *it; +#ifdef DEBUG_CHART_PARSER + cerr << " pos: " << *active << endl; +#endif + const EGrammarNode* next_dot = active->dot->Extend(completed_nt); + if (!next_dot) continue; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + // add up to 2 rules + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + void DoMergeWithPassives(const Edge* active) { + // edge is active, has non-terminals, we need to find the passives that can extend it + assert(active->IsActive()); + assert(active->dot->HasNonTerminals()); +#ifdef DEBUG_CHART_PARSER + cerr << " merge active with passives: ACT=" << *active << endl; +#endif + const Edge query(active->r, 1); + const pair::iterator, + unordered_multiset::iterator > p = + passive_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* passive = *it; + const EGrammarNode* next_dot = active->dot->Extend(passive->cat); + if (!next_dot) continue; + const WFSTNode* next_r = passive->r; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + // take ownership of edge memory, add to various indexes, etc + // returns true if this edge is new + bool IncorporateNewEdge(Edge* edge) { + free_list_.push_back(edge); + if (edge->passive_parent && 
edge->active_parent) { + Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); + traversal_free_list_.push_back(t); + if (all_traversals.find(t) != all_traversals.end()) { + return false; + } else { + all_traversals.insert(t); + } + } + exp_agenda.AddEdge(edge); + return true; + } + + bool FinishEdge(const Edge* edge, Hypergraph* hg) { + bool is_new = false; + if (all_edges.find(edge) == all_edges.end()) { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NEW\n"; +#endif + all_edges.insert(edge); + is_new = true; + if (edge->IsPassive()) passive_edges.insert(edge); + if (edge->IsActive()) active_edges.insert(edge); + agenda.AddEdge(edge); + } else { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NOT NEW.\n"; +#endif + } + AddEdgeToTranslationForest(edge, hg); + return is_new; + } + + // build the translation forest + void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { + assert(hg->nodes_.size() < kMAX_NODES); + Hypergraph::Node* tps = NULL; + // first add any target language rules + if (edge->tps) { + Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()]; + if (!node) { + // cerr << "Creating phrases for " << edge->tps << endl; + const TRulePtr& rule = edge->tps; + node = hg->AddNode(kPHRASE); + Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector()); + hg_edge->feature_values_ += rule->GetFeatureValues(); + hg->ConnectEdgeToHeadNode(hg_edge, node); + } + tps = node; + } + Hypergraph::Node*& head_node = edge2node[edge]; + if (!head_node) + head_node = hg->AddNode(kPHRASE); + if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) { + assert(goal_node == NULL || goal_node == head_node); + goal_node = head_node; + } + Hypergraph::TailNodeVector tail; + SparseVector extra; + if (edge->IsCreatedByPredict()) { + // extra.set_value(FD::Convert("predict"), 1); + } else if (edge->IsCreatedByScan()) { + tail.push_back(edge2node[edge->active_parent]->id_); + if (tps) { + tail.push_back(tps->id_); + } + //extra.set_value(FD::Convert("scan"), 1); + } else if (edge->IsCreatedByComplete()) { + tail.push_back(edge2node[edge->active_parent]->id_); + tail.push_back(edge2node[edge->passive_parent]->id_); + //extra.set_value(FD::Convert("complete"), 1); + } else { + assert(!"unexpected edge type!"); + } + //cerr << head_node->id_ << "<--" << *edge << endl; + +#ifdef DEBUG_CHART_PARSER + for (int i = 0; i < tail.size(); ++i) + if (tail[i] == head_node->id_) { + cerr << "ERROR: " << *edge << "\n i=" << i << endl; + if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } + if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } + assert(!"self-loop found!"); + } +#endif + Hypergraph::Edge* hg_edge = NULL; + if (tail.size() == 0) { + hg_edge = hg->AddEdge(kEPSRule, tail); + } else if (tail.size() == 1) { + hg_edge = hg->AddEdge(kX1, tail); + } else if (tail.size() == 2) { + hg_edge = hg->AddEdge(kX1X2, tail); + } + if (edge->features) + hg_edge->feature_values_ += *edge->features; + hg_edge->feature_values_ += extra; + hg->ConnectEdgeToHeadNode(hg_edge, head_node); + } + + Hypergraph::Node* goal_node; + EdgeQueue exp_agenda; + EdgeQueue agenda; + unordered_map tps2node; + unordered_map edge2node; + unordered_set all_traversals; + unordered_set all_edges; + unordered_multiset passive_edges; + unordered_multiset active_edges; + vector free_list_; + vector traversal_free_list_; + const WordID start_cat_; + const WFSTNode* const q_0_; + const WFSTNode* const q_final_; +}; + +#ifdef 
DEBUG_CHART_PARSER +static string TrimRule(const string& r) { + size_t start = r.find(" |||") + 5; + size_t end = r.rfind(" |||"); + return r.substr(start, end - start); +} +#endif + +void AddGrammarRule(const string& r, EGrammar* g) { + const size_t pos = r.find(" ||| "); + if (pos == string::npos || r[0] != '[') { + cerr << "Bad rule: " << r << endl; + return; + } + const size_t rpos = r.rfind(" ||| "); + string feats; + string rs = r; + if (rpos != pos) { + feats = r.substr(rpos + 5); + rs = r.substr(0, rpos); + } + string rhs = rs.substr(pos + 5); + string trule = rs + " ||| " + rhs + " ||| " + feats; + TRule tr(trule); + cerr << "X: " << tr.e_[0] << endl; +#ifdef DEBUG_CHART_PARSER + string hint_last_rule; +#endif + EGrammarNode* cur = &(*g)[tr.GetLHS()]; + cur->is_root = true; + for (int i = 0; i < tr.FLength(); ++i) { + WordID sym = tr.f()[i]; +#ifdef DEBUG_CHART_PARSER + hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); + cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); +#endif + if (sym < 0) + cur = &cur->ntptr[sym]; + else + cur = &cur->tptr[sym]; + } +#ifdef DEBUG_CHART_PARSER + cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); +#endif + cur->is_some_rule_complete = true; + cur->input_features = tr.GetFeatureValues(); +} + +CFG_WFSTComposer::~CFG_WFSTComposer() { + delete pimpl_; +} + +CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) { + InitializeConstants(); + pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final()); +} + +bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { + // first, convert the src forest into an EGrammar + EGrammar g; + const int nedges = src_forest.edges_.size(); + const int nnodes = src_forest.nodes_.size(); + vector cats(nnodes); + bool assign_cats = false; + for (int i = 0; i < nnodes; ++i) + if (assign_cats) { + cats[i] = TD::Convert("CAT_" + boost::lexical_cast(i)) * -1; + } else { + cats[i] = src_forest.nodes_[i].cat_; + } + // construct the grammar + for (int i = 0; i < nedges; ++i) { + const Hypergraph::Edge& edge = src_forest.edges_[i]; + const vector& src = edge.rule_->f(); + EGrammarNode* cur = &g[cats[edge.head_node_]]; + cur->is_root = true; + int ntc = 0; + for (int j = 0; j < src.size(); ++j) { + WordID sym = src[j]; + if (sym <= 0) { + sym = cats[edge.tail_nodes_[ntc]]; + ++ntc; + cur = &cur->ntptr[sym]; + } else { + cur = &cur->tptr[sym]; + } + } + cur->is_some_rule_complete = true; + cur->input_features = edge.feature_values_; + } + EGrammarNode& goal_rule = g[kUNIQUE_START]; + assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || + (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); + + return pimpl_->Compose(g, trg_forest); +} + +bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) { + EGrammar g; + while(*in) { + string line; + getline(*in, line); + if (line.empty()) continue; + AddGrammarRule(line, &g); + } + + return pimpl_->Compose(g, trg_forest); +} diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h new file mode 100644 index 00000000..cf47f459 --- /dev/null +++ b/gi/pf/cfg_wfst_composer.h @@ -0,0 +1,46 @@ +#ifndef _CFG_WFST_COMPOSER_H_ +#define _CFG_WFST_COMPOSER_H_ + +#include +#include +#include + +#include "trule.h" +#include "wordid.h" + +class CFG_WFSTComposerImpl; +class Hypergraph; + +struct WFSTNode { + virtual ~WFSTNode(); + // returns the next states reachable by consuming srcindex (which identifies a word) + // paired with the output string generated 
by taking that transition. + virtual std::vector > ExtendInput(unsigned srcindex) const = 0; +}; + +struct WFST { + virtual ~WFST(); + virtual const WFSTNode* Final() const = 0; + virtual const WFSTNode* Initial() const = 0; +}; + +class CFG_WFSTComposer { + public: + ~CFG_WFSTComposer(); + explicit CFG_WFSTComposer(const WFST& wfst); + bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest); + + // reads the grammar from a file. There must be a single top-level + // S -> X rule. Anything else is possible. Format is: + // [S] ||| [SS,1] + // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 + // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 + // [NP] ||| [DET,1] [N,2] ||| Feature3=2 + // ... + bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); + + private: + CFG_WFSTComposerImpl* pimpl_; +}; + +#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc new file mode 100644 index 00000000..582d1be7 --- /dev/null +++ b/gi/pf/dpnaive.cc @@ -0,0 +1,349 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +size_t hash_value(const TRule& r) { + size_t h = 2 - r.lhs_; + boost::hash_combine(h, boost::hash_value(r.e_)); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(4),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(4),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_e, + set* vocab_f) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + 
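// input is one sentence pair per line, e.g. "das haus ||| the house"
+ // (hypothetical pair, purely illustrative)
+ 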
vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +shared_ptr prng; + +template +struct ModelAndData { + explicit ModelAndData(const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + rng(&*prng), + p0(b), + baseprob(prob_t::One()), + corpuse(ce), + corpusf(cf), + vocabe(ve), + vocabf(vf), + rules(1,1), + mh_samples(), + mh_rejects(), + kX(-TD::Convert("X")), + derivations(corpuse.size()) {} + + void ResampleHyperparameters() { + rules.resample_hyperparameters(&*prng); + } + + void InstantiateRule(const pair& from, + const pair& to, + const vector& sentf, + const vector& sente, + TRule* rule) const { + rule->f_.clear(); + rule->e_.clear(); + rule->lhs_ = kX; + for (short i = from.first; i < to.first; ++i) + rule->f_.push_back(sentf[i]); + for (short i = from.second; i < to.second; ++i) + rule->e_.push_back(sente[i]); + } + + void DecrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + //cerr << "REMOVE: " << x.AsString() << endl; + if (rules.decrement(x)) { + baseprob /= p0(x); + //cerr << " (REMOVED ONLY INSTANCE)\n"; + } + } + } + + void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + cerr << i << '/' << (d.size() - 1) << ": " << x << endl; + } + } + + void IncrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + if (rules.increment(x)) { + baseprob *= p0(x); + } + } + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(rules.log_crp_prob()); + return p * baseprob; + } + + prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { + prob_t p = prob_t::One(); + if (d.size() < 2) return p; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + prob_t rp; rp.logeq(rules.logprob(x, log(p0(x)))); + p *= rp; + } + return p; + } + + void Sample(); + + MT19937* rng; + const Base& p0; + prob_t baseprob; // cached value of generating the table table labels from p0 + // this can't be used if we go to a hierarchical prior! 
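+ // (baseprob is multiplied by p0(x) exactly when a rule type first enters
+ // the CRP and divided out when its last instance leaves, so Likelihood()
+ // never has to rescan the restaurant)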
+ const vector >& corpuse, corpusf; + const set& vocabe, vocabf; + CCRP_NoTable rules; + unsigned mh_samples, mh_rejects; + const int kX; + vector > > derivations; +}; + +template +void ModelAndData::Sample() { + unsigned MAXK = 4; + unsigned MAXL = 4; + TRule x; + x.lhs_ = -TD::Convert("X"); + for (int samples = 0; samples < 1000; ++samples) { + if (samples % 1 == 0 && samples > 0) { + //ResampleHyperparameters(); + cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; + for (int i = 0; i < 10; ++i) { + cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; + PrintDerivation(derivations[i], corpusf[i], corpuse[i]); + } + } + cerr << '.' << flush; + for (int s = 0; s < corpuse.size(); ++s) { + const vector& sentf = corpusf[s]; + const vector& sente = corpuse[s]; +// cerr << " CUSTOMERS: " << rules.num_customers() << endl; +// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + + vector >& deriv = derivations[s]; + const prob_t p_cur = Likelihood(); + DecrementDerivation(deriv, sentf, sente); + + boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); + boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); + a[0][0] = prob_t::One(); + for (int i = 0; i < sentf.size(); ++i) { + for (int j = 0; j < sente.size(); ++j) { + const prob_t src_a = a[i][j]; + x.f_.clear(); + for (int k = 1; k <= MAXK; ++k) { + if (i + k > sentf.size()) break; + x.f_.push_back(sentf[i + k - 1]); + x.e_.clear(); + for (int l = 1; l <= MAXL; ++l) { + if (j + l > sente.size()) break; + x.e_.push_back(sente[j + l - 1]); + trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x)))); + a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; + } + } + } + } +// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; + const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + + vector > newderiv; + int cur_i = sentf.size(); + int cur_j = sente.size(); + while(cur_i > 0 && cur_j > 0) { + newderiv.push_back(pair(cur_i, cur_j)); +// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; + SampleSet ss; + vector > nexts; + for (int k = 1; k <= MAXK; ++k) { + const int hyp_i = cur_i - k; + if (hyp_i < 0) break; + for (int l = 1; l <= MAXL; ++l) { + const int hyp_j = cur_j - l; + if (hyp_j < 0) break; + const prob_t& inside = a[hyp_i][hyp_j]; + if (inside == prob_t::Zero()) continue; + const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; + if (transp == prob_t::Zero()) continue; + const prob_t p = inside * transp; + ss.add(p); + nexts.push_back(pair(hyp_i, hyp_j)); +// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl; + } + } +// cerr << " sample set has " << nexts.size() << " elements.\n"; + const int selected = rng->SelectSample(ss); + cur_i = nexts[selected].first; + cur_j = nexts[selected].second; + } + newderiv.push_back(pair(0,0)); + const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); + IncrementDerivation(newderiv, sentf, sente); +// cerr << "SANITY: " << q_new << " " <(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + 
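// Sample() above proposes a fresh derivation for each sentence by backward-
+  // sampling through the inside chart a[][]; the mh_samples/mh_rejects counters
+  // suggest the proposal is then corrected with a Metropolis-Hastings test,
+  // acceptance ratio min(1, p(d')q(d) / (p(d)q(d'))), where q is
+  // DerivationProposalProbability.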
+  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  Model1 m1(conf["model1"].as<string>());
+  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+
+  ModelAndData<PhraseJointBase> posterior(lp0, corpuse, corpusf, vocabe, vocabf);
+  posterior.Sample();
+
+  return 0;
+}
+
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
new file mode 100644
index 00000000..2c2a86f9
--- /dev/null
+++ b/gi/pf/itg.cc
@@ -0,0 +1,224 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+ostream& operator<<(ostream& os, const vector<WordID>& p) {
+  os << '[';
+  for (int i = 0; i < p.size(); ++i)
+    os << (i==0 ? "" : " ") << TD::Convert(p[i]);
+  return os << ']';
+}
+
+size_t hash_value(const TRule& r) {
+  size_t h = boost::hash_value(r.e_);
+  boost::hash_combine(h, -r.lhs_);
+  boost::hash_combine(h, boost::hash_value(r.f_));
+  return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+double log_poisson(unsigned x, const double& lambda) {
+  assert(lambda > 0.0);
+  return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+struct Model1 {
+  explicit Model1(const string& fname) :
+      kNULL(TD::Convert("<eps>")),
+      kZERO() {
+    LoadModel1(fname);
+  }
+
+  void LoadModel1(const string& fname) {
+    cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+    ReadFile rf(fname);
+    istream& in = *rf.stream();
+    string line;
+    unsigned lc = 0;
+    while(getline(in, line)) {
+      ++lc;
+      int cur = 0;
+      int start = 0;
+      while(cur < line.size() && line[cur] != ' ') { ++cur; }
+      assert(cur != line.size());
+      line[cur] = 0;
+      const WordID src = TD::Convert(&line[0]);
+      ++cur;
+      start = cur;
+      while(cur < line.size() && line[cur] != ' ') { ++cur; }
+      assert(cur != line.size());
+      line[cur] = 0;
+      WordID trg = TD::Convert(&line[start]);
+      const double logprob = strtod(&line[cur + 1], NULL);
+      if (src >= ttable.size()) ttable.resize(src + 1);
+      ttable[src][trg].logeq(logprob);
+    }
+    cerr << " read " << lc << " parameters.\n";
+  }
+
+  // returns prob 0 if src or trg is not found!
+ const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const map& cpd = ttable[src]; + const map::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + vector > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(25),"Number of particles") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(7),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(7),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + shared_ptr prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + 
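// note: the sweep below enumerates every candidate (source span, target span)
+  // pair per sentence under the eff_max_src / eff_max_trg length limits; the
+  // innermost loop body is still unimplemented.
+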
ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + for (int si = 0; si < conf["samples"].as(); ++si) { + cerr << '.' << flush; + for (int ci = 0; ci < corpusf.size(); ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + for (int i = 0; i < src.size(); ++i) { + for (int j = 0; j < trg.size(); ++j) { + const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); + for (int k = 0; k < eff_max_src; ++k) { + const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); + for (int l = 0; l < eff_max_trg; ++l) { + } + } + } + } + } + } +} + diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc new file mode 100644 index 00000000..4c6ba3ef --- /dev/null +++ b/gi/pf/pfbrat.cc @@ -0,0 +1,554 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "cfg_wfst_composer.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +size_t hash_value(const TRule& r) { + size_t h = 2 - r.lhs_; + boost::hash_combine(h, boost::hash_value(r.e_)); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct ConditionalBase { + explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size), + kNULL(TD::Convert("")) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + LoadModel1(model1fname); + } + + void LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; + } + + // return logp0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + const int flen = rule.f_.size(); + const int elen = rule.e_.size(); + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = rule.e_[i]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? kNULL : rule.f_[j]; + const map::const_iterator it = ttable[src].find(trg); + if (it != ttable[src].end()) { + tp += kM1MIXTURE * it->second; + } + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + return p; + } + + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; + const WordID kNULL; + vector > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(3),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(3),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + 
e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct UniphraseLM { + UniphraseLM(const vector >& corpus, + const set& vocab, + const po::variables_map& conf) : + phrases_(1,1), + gen_(1,1), + corpus_(corpus), + uniform_word_(1.0 / vocab.size()), + gen_p0_(0.5), + p_end_(0.5), + use_poisson_(conf.count("poisson_length") > 0) {} + + void ResampleHyperparameters(MT19937* rng) { + phrases_.resample_hyperparameters(rng); + gen_.resample_hyperparameters(rng); + cerr << " " << phrases_.concentration(); + } + + CCRP_NoTable > phrases_; + CCRP_NoTable gen_; + vector > z_; // z_[i] is there a phrase boundary after the ith word + const vector >& corpus_; + const double uniform_word_; + const double gen_p0_; + const double p_end_; // in base length distribution, p of the end of a phrase + const bool use_poisson_; +}; + +struct Reachability { + boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), + max_src_delta(boost::extents[srclen][trglen]) { + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + } + + private: + struct SState { + SState() : prev_src_covered(), prev_trg_covered() {} + SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} + int prev_src_covered; + int prev_trg_covered; + }; + + struct NState { + NState() : next_src_covered(), next_trg_covered() {} + NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} + int next_src_covered; + int next_trg_covered; + }; + + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { + typedef boost::multi_array, 2> array_type; + array_type a(boost::extents[srclen + 1][trglen + 1]); + a[0][0].push_back(SState()); + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (a[i][j].size() == 0) continue; + const SState prev(i,j); + for (int k = 1; k <= src_max_phrase_len; ++k) { + if ((i + k) > srclen) continue; + for (int l = 1; l <= trg_max_phrase_len; ++l) { + if ((j + l) > trglen) continue; + a[i + k][j + l].push_back(prev); + } + } + } + } + a[0][0].clear(); + cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + assert(a[srclen][trglen].size() > 0); + + typedef boost::multi_array rarray_type; + rarray_type r(boost::extents[srclen + 1][trglen + 1]); +// typedef boost::multi_array, 2> narray_type; +// narray_type b(boost::extents[srclen + 1][trglen + 1]); + r[srclen][trglen] = true; + for (int i = srclen; i >= 0; --i) { + for (int j = trglen; j >= 0; --j) { + vector& prevs = a[i][j]; + if (!r[i][j]) { prevs.clear(); } +// const NState nstate(i,j); + for (int k = 0; k < prevs.size(); ++k) { + r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; + int src_delta = i - prevs[k].prev_src_covered; + 
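// the step from the predecessor cell to (i,j) lies on a path that completes
+          // at (srclen, trglen): mark that (src_delta, trg_delta) edge explorable
+          // and track the widest valid source jump in max_src_delta
+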
edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; + if (src_delta > msd) msd = src_delta; +// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); + } + } + } + assert(!edges[0][0][1][0]); + assert(!edges[0][0][0][1]); + assert(!edges[0][0][0][0]); + cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; + assert(max_src_delta[0][0] > 0); + //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; + //for (int i = 0; i < b[0][0].size(); ++i) { + // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; + //} + } +}; + +ostream& operator<<(ostream& os, const FSTState& q); +struct FSTState { + explicit FSTState(int src_size) : + trg_covered_(), + src_covered_(), + src_coverage_(src_size) {} + + FSTState(short trg_covered, short src_covered, const vector& src_coverage, const vector& src_prefix) : + trg_covered_(trg_covered), + src_covered_(src_covered), + src_coverage_(src_coverage), + src_prefix_(src_prefix) { + if (src_coverage_.size() == src_covered) { + assert(src_prefix.size() == 0); + } + } + + // if we extend by the word at src_position, what are + // the next states that are reachable and lie on a valid + // path to the final state? + vector Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { + assert(src_position < src_coverage_.size()); + if (src_coverage_[src_position]) { + cerr << "Trying to extend " << *this << " with position " << src_position << endl; + abort(); + } + vector ncvg = src_coverage_; + ncvg[src_position] = true; + + vector res; + const int trg_remaining = trg_len - trg_covered_; + if (trg_remaining <= 0) { + cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; + abort(); + } + const int src_remaining = src_len - src_covered_; + if (src_remaining <= 0) { + cerr << "Source appears to have been covered: " << *this << endl; + abort(); + } + + for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { + if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { + int nc = src_prefix_.size() + 1 + src_covered_; + res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector())); + } + } + + if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { + vector nsp = src_prefix_; + nsp.push_back(src_position); + res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); + } + + if (res.size() == 0) { + cerr << *this << " can't be extended!\n"; + abort(); + } + return res; + } + + short trg_covered_, src_covered_; + vector src_coverage_; + vector src_prefix_; +}; +bool operator<(const FSTState& q, const FSTState& r) { + if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; + if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; + if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; + return q.src_prefix_ < r.src_prefix_; +} + +ostream& operator<<(ostream& os, const FSTState& q) { + os << "[" << q.trg_covered_ << " : "; + for (int i = 0; i < q.src_coverage_.size(); ++i) + os << q.src_coverage_[i]; + os << " : <"; + for (int i = 0; i < q.src_prefix_.size(); ++i) { + if (i != 0) os << ' '; + os << q.src_prefix_[i]; + } + return os << ">]"; +} + +struct MyModel { + MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} + typedef unordered_map, 
CCRP_NoTable, boost::hash > > SrcToRuleCRPMap; + + void DecrementRule(const TRule& rule) { + SrcToRuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + it->second.decrement(rule); + if (it->second.num_customers() == 0) rules.erase(it); + } + + void IncrementRule(const TRule& rule) { + SrcToRuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) { + CCRP_NoTable crp(1,1); + it = rules.insert(make_pair(rule.f_, crp)).first; + } + it->second.increment(rule); + } + + // conditioned on rule.f_ + prob_t RuleConditionalProbability(const TRule& rule) const { + const prob_t base = rp0(rule); + SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) { + return base; + } else { + const double lp = it->second.logprob(rule, log(base)); + prob_t q; q.logeq(lp); + return q; + } + } + + const ConditionalBase& rp0; + SrcToRuleCRPMap rules; +}; + +struct MyFST : public WFST { + MyFST(const vector& ssrc, const vector& strg, MyModel* m) : + src(ssrc), trg(strg), + r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), + model(m) { + FSTState in(src.size()); + cerr << " INIT: " << in << endl; + init = GetNode(in); + for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; + in.src_covered_ = src.size(); + in.trg_covered_ = trg.size(); + cerr << "FINAL: " << in << endl; + final = GetNode(in); + } + virtual const WFSTNode* Final() const; + virtual const WFSTNode* Initial() const; + + const WFSTNode* GetNode(const FSTState& q); + map > m; + const vector& src; + const vector& trg; + Reachability r; + const WFSTNode* init; + const WFSTNode* final; + MyModel* model; +}; + +struct MyNode : public WFSTNode { + MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} + virtual vector > ExtendInput(unsigned srcindex) const; + const FSTState state; + mutable MyFST* container; +}; + +vector > MyNode::ExtendInput(unsigned srcindex) const { + cerr << "EXTEND " << state << " with " << srcindex << endl; + vector ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); + vector > res(ext.size()); + for (unsigned i = 0; i < ext.size(); ++i) { + res[i].first = container->GetNode(ext[i]); + if (ext[i].src_prefix_.size() == 0) { + const unsigned trg_from = state.trg_covered_; + const unsigned trg_to = ext[i].trg_covered_; + const unsigned prev_prfx_size = state.src_prefix_.size(); + res[i].second.reset(new TRule); + res[i].second->lhs_ = -TD::Convert("X"); + vector& src = res[i].second->f_; + vector& trg = res[i].second->e_; + src.resize(prev_prfx_size + 1); + for (unsigned j = 0; j < prev_prfx_size; ++j) + src[j] = container->src[state.src_prefix_[j]]; + src[prev_prfx_size] = container->src[srcindex]; + for (unsigned j = trg_from; j < trg_to; ++j) + trg.push_back(container->trg[j]); + res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); + } + } + return res; +} + +const WFSTNode* MyFST::GetNode(const FSTState& q) { + boost::shared_ptr& res = m[q]; + if (!res) { + res.reset(new MyNode(q, this)); + } + return &*res; +} + +const WFSTNode* MyFST::Final() const { + return final; +} + +const WFSTNode* MyFST::Initial() const { + return init; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify 
model 1 parameters\n";
+    return 1;
+  }
+  shared_ptr<MT19937> prng;
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+
+  vector<vector<WordID> > corpuse, corpusf;
+  set<WordID> vocabe, vocabf;
+  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
+                      vocabe.size(),
+                      conf["model1"].as<string>());
+  MyModel m(lp0);
+
+  TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
+  m.IncrementRule(x);
+  TRule y("[X] ||| nY dyN ||| gave ||| 0");
+  m.IncrementRule(y);
+
+  MyFST fst(corpusf[0], corpuse[0], &m);
+  ifstream in("./kimura.g");
+  assert(in);
+  CFG_WFSTComposer comp(fst);
+  Hypergraph hg;
+  bool succeed = comp.Compose(&in, &hg);
+  hg.PrintGraphviz();
+  if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+
+#if 0
+  ifstream in2("./amnabooks.g");
+  assert(in2);
+  MyFST fst2(corpusf[1], corpuse[1], &m);
+  CFG_WFSTComposer comp2(fst2);
+  Hypergraph hg2;
+  bool succeed2 = comp2.Compose(&in2, &hg2);
+  if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+#endif
+
+  SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0);
+  hg.Reweight(w);
+  cerr << ViterbiFTree(hg) << endl;
+  return 0;
+}
+
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
new file mode 100644
index 00000000..18dfd03b
--- /dev/null
+++ b/gi/pf/pfdist.cc
@@ -0,0 +1,621 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_measures.h"
+#include "reachability.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+size_t hash_value(const TRule& r) {
+  size_t h = boost::hash_value(r.e_);
+  boost::hash_combine(h, -r.lhs_);
+  boost::hash_combine(h, boost::hash_value(r.f_));
+  return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles")
+        ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
+        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
+        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
+        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(),
"Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +#if 0 +struct MyConditionalModel { + MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + prob_t srcp0(const vector& src) const { + prob_t p(1.0 / 3000.0); + p.poweq(src.size()); + prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); + p *= lenp; + return p; + } + + void DecrementRule(const TRule& rule) { + const RuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + if (it->second.decrement(rule)) { + base /= (*rp0)(rule); + if (it->second.num_customers() == 0) + rules.erase(it); + } + if (src_phrases.decrement(rule.f_)) + base /= srcp0(rule.f_); + } + + void IncrementRule(const TRule& rule) { + RuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) + it = rules.insert(make_pair(rule.f_, CCRP_NoTable(1,1))).first; + if (it->second.increment(rule)) { + base *= (*rp0)(rule); + } + if (src_phrases.increment(rule.f_)) + base *= srcp0(rule.f_); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + 
+ // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + const prob_t p0 = (*rp0)(rule); + prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); + const RuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) return srcp * p0; + const double lp = it->second.logprob(rule, log(p0)); + prob_t q; q.logeq(lp); + return q * srcp; + } + + prob_t Likelihood() const { + prob_t p = base; + for (RuleCRPMap::const_iterator it = rules.begin(); + it != rules.end(); ++it) { + prob_t cl; cl.logeq(it->second.log_crp_prob()); + p *= cl; + } + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseConditionalBase* rp0; + prob_t base; + typedef unordered_map, CCRP_NoTable, boost::hash > > RuleCRPMap; + RuleCRPMap rules; + CCRP_NoTable > src_phrases; + vector > src_jumps; +}; + +#endif + +struct MyJointModel { + MyJointModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + vector > src_jumps; +}; + +struct BackwardEstimate { + BackwardEstimate(const Model1& m1, const vector& src, const vector& trg) : + model1_(m1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + r.push_back(0); // NULL word + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) 
r.push_back(src_[i]); + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + } + return e; + } + const Model1& model1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct BackwardEstimateSym { + BackwardEstimateSym(const Model1& m1, + const Model1& invm1, const vector& src, const vector& trg) : + model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + r.push_back(0); // NULL word + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + r.pop_back(); + const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); + prob_t inv; + inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + for (unsigned i = 0; i < r.size(); ++i) { + prob_t p; + for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) + p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; + abort(); + } + p *= inv_uniform; + inv *= p; + } + prob_t x = pow(e * inv, 0.5); + e = x; + //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; + } + return e; + } + const Model1& model1_; + const Model1& invmodel1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct Particle { + Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} + prob_t weight; + prob_t gamma_last; + vector src_jumps; + vector rules; + vector src_cv; + int src_cov; + int trg_cov; + int prev_pos; +}; + +ostream& operator<<(ostream& o, const vector& v) { + for (int i = 0; i < v.size(); ++i) + o << (v[i] ? 
'1' : '0'); + return o; +} +ostream& operator<<(ostream& o, const Particle& p) { + o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; + return o; +} + +void FilterCrapParticlesAndReweight(vector* pps) { + vector& ps = *pps; + SampleSet ss; + for (int i = 0; i < ps.size(); ++i) + ss.add(ps[i].weight); + vector nps; nps.reserve(ps.size()); + const prob_t uniform_weight(1.0 / ps.size()); + for (int i = 0; i < ps.size(); ++i) { + nps.push_back(ps[prng->SelectSample(ss)]); + nps[i].weight = uniform_weight; + } + nps.swap(ps); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + const unsigned rejuv_freq = conf["filter_frequency"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + +#if 0 + PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + MyConditionalModel m(lp0); +#else + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MyJointModel m(lp0); +#endif + + cerr << "Initializing reachability limits...\n"; + vector ps(corpusf.size()); + vector reaches; reaches.reserve(corpusf.size()); + for (int ci = 0; ci < corpusf.size(); ++ci) + reaches.push_back(Reachability(corpusf[ci].size(), + corpuse[ci].size(), + kMAX_SRC_PHRASE, + kMAX_TRG_PHRASE)); + cerr << "Sampling...\n"; + vector tmp_p(10000); // work space + SampleSet pfss; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpusf.size(); ++ci) { + vector& src = corpusf[ci]; + vector& trg = corpuse[ci]; + m.DecrementRules(ps[ci].rules); + m.DecrementJumps(ps[ci].src_jumps, src.size()); + + //BackwardEstimate be(m1, src, trg); + BackwardEstimateSym be(m1, invm1, src, trg); + const Reachability& r = reaches[ci]; + vector lps(particles); + + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + p.src_cv.resize(src.size(), false); + } + + bool all_complete = false; + while(!all_complete) { + SampleSet ss; + + // all particles have now been extended a bit, we will reweight them now + if (lps[0].trg_cov > 0) + FilterCrapParticlesAndReweight(&lps); + + // loop over all particles and extend them + bool done_nothing = true; + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + int tic = 0; + while(p.trg_cov < trg.size() && tic < rejuv_freq) { + ++tic; + done_nothing = false; + ss.clear(); + TRule x; x.lhs_ = kLHS; + prob_t z; + int first_uncovered = src.size(); + int 
last_uncovered = -1; + for (int i = 0; i < src.size(); ++i) { + const bool is_uncovered = !p.src_cv[i]; + if (i < first_uncovered && is_uncovered) first_uncovered = i; + if (is_uncovered && i > last_uncovered) last_uncovered = i; + } + assert(last_uncovered > -1); + assert(first_uncovered < src.size()); + + for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { + x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); + for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { + if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + + const int last_possible_start = last_uncovered - src_len + 1; + assert(last_possible_start >= 0); + //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl; + //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl; + for (int i = first_uncovered; i <= last_possible_start; ++i) { + if (p.src_cv[i]) continue; + assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size + Particle& np = tmp_p[ss.size()]; + np = p; + x.f_.clear(); + int gap_add = 0; + bool bad = false; + prob_t jp = prob_t::One(); + int prev_pos = p.prev_pos; + for (int j = 0; j < src_len; ++j) { + if ((j + i + gap_add) == src.size()) { bad = true; break; } + while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } + if ((j + i + gap_add) == src.size()) { bad = true; break; } + np.src_cv[i + j + gap_add] = true; + x.f_.push_back(src[i + j + gap_add]); + jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); + int jump = i + j + gap_add - prev_pos; + assert(jump != 0); + np.src_jumps.push_back(jump); + prev_pos = i + j + gap_add; + } + if (bad) continue; + np.prev_pos = prev_pos; + np.src_cov += x.f_.size(); + np.trg_cov += x.e_.size(); + if (x.f_.size() != src_len) continue; + prob_t rp = m.RuleProbability(x); + np.gamma_last = rp * jp; + const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); + //cerr << "**rule=" << x << endl; + //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl; + ss.add(u); + np.rules.push_back(TRulePtr(new TRule(x))); + z += u; + + const bool completed = (p.trg_cov == trg.size()); + if (completed) { + int last_jump = src.size() - p.prev_pos; + assert(last_jump > 0); + p.src_jumps.push_back(last_jump); + p.weight *= m.JumpProbability(last_jump, src.size()); + } + } + } + } + cerr << "number of edges to consider: " << ss.size() << endl; + const int sampled = rng.SelectSample(ss); + prob_t q_n = ss[sampled] / z; + p = tmp_p[sampled]; + //m.IncrementRule(*p.rules.back()); + p.weight *= p.gamma_last / q_n; + cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; + cerr << p << endl; + } + } // loop over particles (pi = 0 .. 
particles) + if (done_nothing) all_complete = true; + } + pfss.clear(); + for (int i = 0; i < lps.size(); ++i) + pfss.add(lps[i].weight); + const int sampled = rng.SelectSample(pfss); + ps[ci] = lps[sampled]; + m.IncrementRules(lps[sampled].rules); + m.IncrementJumps(lps[sampled].src_jumps, src.size()); + for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } + cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; + } + cerr << "LLH: " << log(m.Likelihood()) << endl; + for (int sni = 0; sni < 5; ++sni) { + for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } + } + } + return 0; +} + diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc new file mode 100644 index 00000000..3169eb75 --- /dev/null +++ b/gi/pf/pfdist.new.cc @@ -0,0 +1,620 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(25),"Number of particles") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) 
break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +#if 0 +struct MyConditionalModel { + MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + prob_t srcp0(const vector& src) const { + prob_t p(1.0 / 3000.0); + p.poweq(src.size()); + prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); + p *= lenp; + return p; + } + + void DecrementRule(const TRule& rule) { + const RuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + if (it->second.decrement(rule)) { + base /= (*rp0)(rule); + if (it->second.num_customers() == 0) + rules.erase(it); + } + if (src_phrases.decrement(rule.f_)) + base /= srcp0(rule.f_); + } + + void IncrementRule(const TRule& rule) { + RuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) + it = rules.insert(make_pair(rule.f_, CCRP_NoTable(1,1))).first; + if (it->second.increment(rule)) { + base *= (*rp0)(rule); + } + if (src_phrases.increment(rule.f_)) + base *= srcp0(rule.f_); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + const prob_t p0 = (*rp0)(rule); + prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); + const RuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) return srcp * p0; + const double lp = it->second.logprob(rule, log(p0)); + prob_t q; q.logeq(lp); + return q * srcp; + } + + prob_t Likelihood() const { + prob_t p = base; + for (RuleCRPMap::const_iterator it = rules.begin(); + it != rules.end(); ++it) { + prob_t cl; cl.logeq(it->second.log_crp_prob()); + p *= cl; + } + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseConditionalBase* rp0; + prob_t base; + typedef unordered_map, CCRP_NoTable, boost::hash > > RuleCRPMap; + RuleCRPMap rules; + CCRP_NoTable > src_phrases; + vector > 
src_jumps; +}; + +#endif + +struct MyJointModel { + MyJointModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + vector > src_jumps; +}; + +struct BackwardEstimate { + BackwardEstimate(const Model1& m1, const vector& src, const vector& trg) : + model1_(m1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + r.push_back(0); // NULL word + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + } + return e; + } + const Model1& model1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct BackwardEstimateSym { + BackwardEstimateSym(const Model1& m1, + const Model1& invm1, const vector& src, const vector& trg) : + model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = 
cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + r.push_back(0); // NULL word + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + r.pop_back(); + const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); + prob_t inv; + inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + for (unsigned i = 0; i < r.size(); ++i) { + prob_t p; + for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) + p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; + abort(); + } + p *= inv_uniform; + inv *= p; + } + prob_t x = pow(e * inv, 0.5); + e = x; + //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; + } + return e; + } + const Model1& model1_; + const Model1& invmodel1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct Particle { + Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} + prob_t weight; + prob_t gamma_last; + vector src_jumps; + vector rules; + vector src_cv; + int src_cov; + int trg_cov; + int prev_pos; +}; + +ostream& operator<<(ostream& o, const vector& v) { + for (int i = 0; i < v.size(); ++i) + o << (v[i] ? 
'1' : '0'); + return o; +} +ostream& operator<<(ostream& o, const Particle& p) { + o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; + return o; +} + +void FilterCrapParticlesAndReweight(vector<Particle>* pps) { + vector<Particle>& ps = *pps; + SampleSet<prob_t> ss; + for (int i = 0; i < ps.size(); ++i) + ss.add(ps[i].weight); + vector<Particle> nps; nps.reserve(ps.size()); + const prob_t uniform_weight(1.0 / ps.size()); + for (int i = 0; i < ps.size(); ++i) { + nps.push_back(ps[prng->SelectSample(ss)]); + nps[i].weight = uniform_weight; + } + nps.swap(ps); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); + const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + const unsigned particles = conf["particles"].as<unsigned>(); + const unsigned samples = conf["samples"].as<unsigned>(); + + if (!conf.count("model1")) { + cerr << argv[0] << ": please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector<vector<WordID> > corpuse, corpusf; + set<WordID> vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as<string>()); + Model1 invm1(conf["inverse_model1"].as<string>()); + +#if 0 + PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); + MyConditionalModel m(lp0); +#else + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); + MyJointModel m(lp0); +#endif + + cerr << "Initializing reachability limits...\n"; + vector<Particle> ps(corpusf.size()); + vector<Reachability> reaches; reaches.reserve(corpusf.size()); + for (int ci = 0; ci < corpusf.size(); ++ci) + reaches.push_back(Reachability(corpusf[ci].size(), + corpuse[ci].size(), + kMAX_SRC_PHRASE, + kMAX_TRG_PHRASE)); + cerr << "Sampling...\n"; + vector<Particle> tmp_p(10000); // work space + SampleSet<prob_t> pfss; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpusf.size(); ++ci) { + vector<WordID>& src = corpusf[ci]; + vector<WordID>& trg = corpuse[ci]; + m.DecrementRules(ps[ci].rules); + m.DecrementJumps(ps[ci].src_jumps, src.size()); + + //BackwardEstimate be(m1, src, trg); + BackwardEstimateSym be(m1, invm1, src, trg); + const Reachability& r = reaches[ci]; + vector<Particle> lps(particles); + + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + p.src_cv.resize(src.size(), false); + } + + bool all_complete = false; + while(!all_complete) { + SampleSet<prob_t> ss; + + // all particles have now been extended a bit, we will reweight them now + if (lps[0].trg_cov > 0) + FilterCrapParticlesAndReweight(&lps); + + // loop over all particles and extend them + bool done_nothing = true; + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + int tic = 0; + const int rejuv_freq = 1; + while(p.trg_cov < trg.size() && tic < rejuv_freq) { + ++tic; + done_nothing = false; + ss.clear(); + TRule x; x.lhs_ = kLHS; + prob_t z; + int first_uncovered = src.size(); + int last_uncovered = -1; + for (int i = 0; i <
src.size(); ++i) { + const bool is_uncovered = !p.src_cv[i]; + if (i < first_uncovered && is_uncovered) first_uncovered = i; + if (is_uncovered && i > last_uncovered) last_uncovered = i; + } + assert(last_uncovered > -1); + assert(first_uncovered < src.size()); + + for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { + x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); + for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { + if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + + const int last_possible_start = last_uncovered - src_len + 1; + assert(last_possible_start >= 0); + //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl; + //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl; + for (int i = first_uncovered; i <= last_possible_start; ++i) { + if (p.src_cv[i]) continue; + assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size + Particle& np = tmp_p[ss.size()]; + np = p; + x.f_.clear(); + int gap_add = 0; + bool bad = false; + prob_t jp = prob_t::One(); + int prev_pos = p.prev_pos; + for (int j = 0; j < src_len; ++j) { + if ((j + i + gap_add) == src.size()) { bad = true; break; } + while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } + if ((j + i + gap_add) == src.size()) { bad = true; break; } + np.src_cv[i + j + gap_add] = true; + x.f_.push_back(src[i + j + gap_add]); + jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); + int jump = i + j + gap_add - prev_pos; + assert(jump != 0); + np.src_jumps.push_back(jump); + prev_pos = i + j + gap_add; + } + if (bad) continue; + np.prev_pos = prev_pos; + np.src_cov += x.f_.size(); + np.trg_cov += x.e_.size(); + if (x.f_.size() != src_len) continue; + prob_t rp = m.RuleProbability(x); + np.gamma_last = rp * jp; + const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); + //cerr << "**rule=" << x << endl; + //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl; + ss.add(u); + np.rules.push_back(TRulePtr(new TRule(x))); + z += u; + + const bool completed = (p.trg_cov == trg.size()); + if (completed) { + int last_jump = src.size() - p.prev_pos; + assert(last_jump > 0); + p.src_jumps.push_back(last_jump); + p.weight *= m.JumpProbability(last_jump, src.size()); + } + } + } + } + cerr << "number of edges to consider: " << ss.size() << endl; + const int sampled = rng.SelectSample(ss); + prob_t q_n = ss[sampled] / z; + p = tmp_p[sampled]; + //m.IncrementRule(*p.rules.back()); + p.weight *= p.gamma_last / q_n; + cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; + cerr << p << endl; + } + } // loop over particles (pi = 0 .. 
particles) + if (done_nothing) all_complete = true; + } + pfss.clear(); + for (int i = 0; i < lps.size(); ++i) + pfss.add(lps[i].weight); + const int sampled = rng.SelectSample(pfss); + ps[ci] = lps[sampled]; + m.IncrementRules(lps[sampled].rules); + m.IncrementJumps(lps[sampled].src_jumps, src.size()); + for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } + cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; + } + cerr << "LLH: " << log(m.Likelihood()) << endl; + for (int sni = 0; sni < 5; ++sni) { + for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } + } + } + return 0; +} + diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc new file mode 100644 index 00000000..43c604c3 --- /dev/null +++ b/gi/pf/pfnaive.cc @@ -0,0 +1,385 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(30),"Number of particles") + ("filter_frequency,f",po::value()->default_value(5),"Number of time steps between filterings") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = 
TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct MyJointModel { + MyJointModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + vector > src_jumps; +}; + +struct BackwardEstimateSym { + BackwardEstimateSym(const Model1& m1, + const Model1& invm1, const vector& src, const vector& trg) : + model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { + } + const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const { + assert(src_cov <= src_.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + for (int i = src_cov; i < src_.size(); ++i) + r.push_back(src_[i]); + r.push_back(0); // NULL word + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + r.pop_back(); + const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); + prob_t inv; + inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + for (unsigned i = 0; i < r.size(); ++i) { + prob_t p; + for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) + p += invmodel1_(j < trg_cov ? 
0 : trg_[j], r[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; + abort(); + } + p *= inv_uniform; + inv *= p; + } + prob_t x = pow(e * inv, 0.5); + e = x; + //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; + } + return e; + } + const Model1& model1_; + const Model1& invmodel1_; + const vector& src_; + const vector& trg_; + mutable unordered_map > cache_; +}; + +struct Particle { + Particle() : weight(prob_t::One()), src_cov(), trg_cov() {} + prob_t weight; + prob_t gamma_last; + vector rules; + int src_cov; + int trg_cov; +}; + +ostream& operator<<(ostream& o, const vector& v) { + for (int i = 0; i < v.size(); ++i) + o << (v[i] ? '1' : '0'); + return o; +} +ostream& operator<<(ostream& o, const Particle& p) { + o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; + return o; +} + +void FilterCrapParticlesAndReweight(vector* pps) { + vector& ps = *pps; + SampleSet ss; + for (int i = 0; i < ps.size(); ++i) + ss.add(ps[i].weight); + vector nps; nps.reserve(ps.size()); + const prob_t uniform_weight(1.0 / ps.size()); + for (int i = 0; i < ps.size(); ++i) { + nps.push_back(ps[prng->SelectSample(ss)]); + nps[i].weight = uniform_weight; + } + nps.swap(ps); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + const unsigned rejuv_freq = conf["filter_frequency"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + +#if 0 + PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + MyConditionalModel m(lp0); +#else + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MyJointModel m(lp0); +#endif + + cerr << "Initializing reachability limits...\n"; + vector ps(corpusf.size()); + vector reaches; reaches.reserve(corpusf.size()); + for (int ci = 0; ci < corpusf.size(); ++ci) + reaches.push_back(Reachability(corpusf[ci].size(), + corpuse[ci].size(), + kMAX_SRC_PHRASE, + kMAX_TRG_PHRASE)); + cerr << "Sampling...\n"; + vector tmp_p(10000); // work space + SampleSet pfss; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpusf.size(); ++ci) { + vector& src = corpusf[ci]; + vector& trg = corpuse[ci]; + m.DecrementRules(ps[ci].rules); + + BackwardEstimateSym be(m1, invm1, src, trg); + const Reachability& r = reaches[ci]; + vector lps(particles); + + bool all_complete = false; + while(!all_complete) { + 
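+        // One pass of sequential importance resampling (SIR): rejuvenation
+        // first resets the particle weights to uniform by resampling, then
+        // each incomplete particle is extended by a rule drawn from the
+        // proposal in ss and reweighted by gamma_last / q_n, the ratio of
+        // the (unnormalized) target to the proposal probability.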
SampleSet<prob_t> ss; + + // all particles have now been extended a bit, we will reweight them now + if (lps[0].trg_cov > 0) + FilterCrapParticlesAndReweight(&lps); + + // loop over all particles and extend them + bool done_nothing = true; + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + int tic = 0; + while(p.trg_cov < trg.size() && tic < rejuv_freq) { + ++tic; + done_nothing = false; + ss.clear(); + TRule x; x.lhs_ = kLHS; + prob_t z; + + for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { + x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); + for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { + if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + + int i = p.src_cov; + assert(ss.size() < tmp_p.size()); // if this fails, increase tmp_p size + Particle& np = tmp_p[ss.size()]; + np = p; + x.f_.clear(); + for (int j = 0; j < src_len; ++j) + x.f_.push_back(src[i + j]); + np.src_cov += x.f_.size(); + np.trg_cov += x.e_.size(); + prob_t rp = m.RuleProbability(x); + np.gamma_last = rp; + const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1); + //cerr << "**rule=" << x << endl; + //cerr << " u=" << log(u) << " rule=" << rp << endl; + ss.add(u); + np.rules.push_back(TRulePtr(new TRule(x))); + z += u; + } + } + //cerr << "number of edges to consider: " << ss.size() << endl; + const int sampled = rng.SelectSample(ss); + prob_t q_n = ss[sampled] / z; + p = tmp_p[sampled]; + //m.IncrementRule(*p.rules.back()); + p.weight *= p.gamma_last / q_n; + //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; + //cerr << p << endl; + } + } // loop over particles (pi = 0 .. particles) + if (done_nothing) all_complete = true; + } + pfss.clear(); + for (int i = 0; i < lps.size(); ++i) + pfss.add(lps[i].weight); + const int sampled = rng.SelectSample(pfss); + ps[ci] = lps[sampled]; + m.IncrementRules(lps[sampled].rules); + for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } + cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; + } + cerr << "LLH: " << log(m.Likelihood()) << endl; + } + return 0; +} + diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc new file mode 100644 index 00000000..73dd8d39 --- /dev/null +++ b/gi/pf/reachability.cc @@ -0,0 +1,64 @@ +#include "reachability.h" + +#include <vector> +#include <iostream> + +using namespace std; + +struct SState { + SState() : prev_src_covered(), prev_trg_covered() {} + SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} + int prev_src_covered; + int prev_trg_covered; +}; + +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { + typedef boost::multi_array<vector<SState>, 2> array_type; + array_type a(boost::extents[srclen + 1][trglen + 1]); + a[0][0].push_back(SState()); + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (a[i][j].size() == 0) continue; + const SState prev(i,j); + for (int k = 1; k <= src_max_phrase_len; ++k) { + if ((i + k) > srclen) continue; + for (int l = 1; l <= trg_max_phrase_len; ++l) { + if ((j + l) > trglen) continue; + a[i + k][j + l].push_back(prev); + } + } + } + } + a[0][0].clear(); + //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + if (a[srclen][trglen].size() == 0) { + cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + return; + } + + typedef boost::multi_array<bool, 2> rarray_type; + rarray_type
r(boost::extents[srclen + 1][trglen + 1]); + r[srclen][trglen] = true; + for (int i = srclen; i >= 0; --i) { + for (int j = trglen; j >= 0; --j) { + vector<SState>& prevs = a[i][j]; + if (!r[i][j]) { prevs.clear(); } + for (int k = 0; k < prevs.size(); ++k) { + r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; + int src_delta = i - prevs[k].prev_src_covered; + edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; + if (src_delta > msd) msd = src_delta; + } + } + } + assert(!edges[0][0][1][0]); + assert(!edges[0][0][0][1]); + assert(!edges[0][0][0][0]); + assert(max_src_delta[0][0] > 0); + //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; + //for (int i = 0; i < b[0][0].size(); ++i) { + // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; + //} + } + diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h new file mode 100644 index 00000000..98450ec1 --- /dev/null +++ b/gi/pf/reachability.h @@ -0,0 +1,28 @@ +#ifndef _REACHABILITY_H_ +#define _REACHABILITY_H_ + +#include "boost/multi_array.hpp" + +// determines minimum and maximum lengths of outgoing edges from all +// coverage positions such that the alignment path respects src and +// trg maximum phrase sizes +// +// runs in O(n^2 * src_max * trg_max) time but should be relatively fast +// +// currently forbids 0 -> n and n -> 0 alignments + +struct Reachability { + boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? + boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), + max_src_delta(boost::extents[srclen][trglen]) { + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + } + + private: + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); +}; + +#endif diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc new file mode 100644 index 00000000..7348d21c --- /dev/null +++ b/gi/pf/tpf.cc @@ -0,0 +1,99 @@ +#include <iostream> +#include <tr1/memory> +#include <vector> + +#include "sampler.h" + +using namespace std; +using namespace tr1; + +shared_ptr<MT19937> prng; + +struct Particle { + Particle() : weight(prob_t::One()) {} + vector<int> states; + prob_t weight; + prob_t gamma_last; +}; + +ostream& operator<<(ostream& os, const Particle& p) { + os << "["; + for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' '; + os << "| w=" << log(p.weight) << ']'; + return os; +} + +void Rejuvenate(vector<Particle>& pps) { + SampleSet<prob_t> ss; + vector<Particle> nps(pps.size()); + for (int i = 0; i < pps.size(); ++i) { +// cerr << pps[i] << endl; + ss.add(pps[i].weight); + } +// cerr << "REJUVENATING...\n"; + for (int i = 0; i < pps.size(); ++i) { + nps[i] = pps[prng->SelectSample(ss)]; + nps[i].weight = prob_t(1.0 / pps.size()); +// cerr << nps[i] << endl; + } + nps.swap(pps); +// exit(1); +} + +int main(int argc, char** argv) { + const unsigned particles = 100; + prng.reset(new MT19937); + MT19937& rng = *prng; + + // q(a) = 0.4 + // q(b) = 0.6 + // q(c) = 0 + SampleSet<double> ssq; + ssq.add(0.4); + ssq.add(0.6); + ssq.add(0); + double qz = 1; + + // p(a) = 0.2 + // p(b) = 0.8 + // p(c) = 0 + vector<double> p(3); + p[0] = 0.2; + p[1] = 0.8; + p[2] = 0; + + vector<int> counts(3); + int tot = 0; +
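+  // Minimal self-check of the particle-filter machinery: states are drawn
+  // from the proposal q (ssq), weighted by p/q, and resampled with
+  // Rejuvenate() after every extension; proposals that are impossible under
+  // p are rejected and redrawn, so the finally sampled particle should be
+  // distributed approximately according to p at every position.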
+ vector<Particle> pps(particles); + SampleSet<prob_t> ppss; + int LEN = 12; + int PP = 1; + while (pps[0].states.size() < LEN) { + for (int pi = 0; pi < particles; ++pi) { + Particle& prt = pps[pi]; + + bool redo = true; + const Particle savedp = prt; + while (redo) { + redo = false; + for (int i = 0; i < PP; ++i) { + int s = rng.SelectSample(ssq); + double gamma_last = p[s]; + if (!gamma_last) { redo = true; break; } + double q = ssq[s] / qz; + prt.states.push_back(s); + prt.weight *= prob_t(gamma_last / q); + } + if (redo) { prt = savedp; continue; } + } + } + Rejuvenate(pps); + } + ppss.clear(); + for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); } + int sp = rng.SelectSample(ppss); + cerr << pps[sp] << endl; + + return 0; +} + diff --git a/m4/acx_pthread.m4 b/m4/acx_pthread.m4 new file mode 100644 index 00000000..2cf20de1 --- /dev/null +++ b/m4/acx_pthread.m4 @@ -0,0 +1,363 @@ +# This was retrieved from +# http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?revision=1277&root=avahi +# See also (perhaps for new versions?) +# http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?root=avahi +# +# We've rewritten the inconsistency check code (from avahi), to work +# more broadly. In particular, it no longer assumes ld accepts -zdefs. +# This caused a restructuring of the code, but the functionality has only +# changed a little. + +dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) +dnl +dnl @summary figure out how to build C programs using POSIX threads +dnl +dnl This macro figures out how to build C programs using POSIX threads. +dnl It sets the PTHREAD_LIBS output variable to the threads library and +dnl linker flags, and the PTHREAD_CFLAGS output variable to any special +dnl C compiler flags that are needed. (The user can also force certain +dnl compiler flags/libs to be tested by setting these environment +dnl variables.) +dnl +dnl Also sets PTHREAD_CC to any special C compiler that is needed for +dnl multi-threaded programs (defaults to the value of CC otherwise). +dnl (This is necessary on AIX to use the special cc_r compiler alias.) +dnl +dnl NOTE: You are assumed to not only compile your program with these +dnl flags, but also link it with them as well. e.g. you should link +dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS +dnl $LIBS +dnl +dnl If you are only building threads programs, you may wish to use +dnl these variables in your default LIBS, CFLAGS, and CC: +dnl +dnl LIBS="$PTHREAD_LIBS $LIBS" +dnl CFLAGS="$CFLAGS $PTHREAD_CFLAGS" +dnl CC="$PTHREAD_CC" +dnl +dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute +dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to +dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). +dnl +dnl ACTION-IF-FOUND is a list of shell commands to run if a threads +dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to +dnl run if it is not found. If ACTION-IF-FOUND is not specified, the +dnl default action will define HAVE_PTHREAD. +dnl +dnl Please let the authors know if this macro fails on any platform, or +dnl if you have any other suggestions or comments. This macro was based +dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with +dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros +dnl posted by Alejandro Forero Cuervo to the autoconf macro repository. +dnl We are also grateful for the helpful feedback of numerous users. +dnl +dnl @category InstalledPackages +dnl @author Steven G.
Johnson +dnl @version 2006-05-29 +dnl @license GPLWithACException +dnl +dnl Checks for GCC shared/pthread inconsistency based on work by +dnl Marcin Owsiany + + +AC_DEFUN([ACX_PTHREAD], [ +AC_REQUIRE([AC_CANONICAL_HOST]) +AC_LANG_SAVE +AC_LANG_C +acx_pthread_ok=no + +# We used to check for pthread.h first, but this fails if pthread.h +# requires special compiler flags (e.g. on True64 or Sequent). +# It gets checked for in the link test anyway. + +# First of all, check if the user has set any of the PTHREAD_LIBS, +# etcetera environment variables, and if threads linking works using +# them: +if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) + AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes) + AC_MSG_RESULT($acx_pthread_ok) + if test x"$acx_pthread_ok" = xno; then + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" + fi + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" +fi + +# We must check for the threads library under a number of different +# names; the ordering is very important because some systems +# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# libraries is broken (non-POSIX). + +# Create a list of thread flags to try. Items starting with a "-" are +# C compiler flags, and other items are library names, except for "none" +# which indicates that we try without any flags at all, and "pthread-config" +# which is a program returning the flags for the Pth emulation library. + +acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" + +# The ordering *is* (sometimes) important. Some notes on the +# individual items follow: + +# pthreads: AIX (must check this before -lpthread) +# none: in case threads are in libc; should be tried before -Kthread and +# other compiler flags to prevent continual compiler warnings +# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) +# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) +# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) +# -pthreads: Solaris/gcc +# -mthreads: Mingw32/gcc, Lynx/gcc +# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it +# doesn't hurt to check since this sometimes defines pthreads too; +# also defines -D_REENTRANT) +# ... -mt is also the pthreads flag for HP/aCC +# pthread: Linux, etcetera +# --thread-safe: KAI C++ +# pthread-config: use pthread-config program (for GNU Pth library) + +case "${host_cpu}-${host_os}" in + *solaris*) + + # On Solaris (at least, for some versions), libc contains stubbed + # (non-functional) versions of the pthreads routines, so link-based + # tests will erroneously succeed. (We need to link with -pthreads/-mt/ + # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather + # a function called by this macro, so we could check for that, but + # who knows whether they'll stub that too in a future libc.) 
So, + # we'll just look for -pthreads and -lpthread first: + + acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags" + ;; +esac + +if test x"$acx_pthread_ok" = xno; then +for flag in $acx_pthread_flags; do + + case $flag in + none) + AC_MSG_CHECKING([whether pthreads work without any flags]) + ;; + + -*) + AC_MSG_CHECKING([whether pthreads work with $flag]) + PTHREAD_CFLAGS="$flag" + ;; + + pthread-config) + AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no) + if test x"$acx_pthread_config" = xno; then continue; fi + PTHREAD_CFLAGS="`pthread-config --cflags`" + PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" + ;; + + *) + AC_MSG_CHECKING([for the pthreads library -l$flag]) + PTHREAD_LIBS="-l$flag" + ;; + esac + + save_LIBS="$LIBS" + save_CFLAGS="$CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Check for various functions. We must include pthread.h, + # since some functions may be macros. (On the Sequent, we + # need a special flag -Kthread to make this header compile.) + # We check for pthread_join because it is in -lpthread on IRIX + # while pthread_create is in libc. We check for pthread_attr_init + # due to DEC craziness with -lpthreads. We check for + # pthread_cleanup_push because it is one of the few pthread + # functions on Solaris that doesn't have a non-functional libc stub. + # We try pthread_create on general principles. + AC_TRY_LINK([#include ], + [pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], + [acx_pthread_ok=yes]) + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + AC_MSG_RESULT($acx_pthread_ok) + if test "x$acx_pthread_ok" = xyes; then + break; + fi + + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" +done +fi + +# Various other checks: +if test "x$acx_pthread_ok" = xyes; then + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. + AC_MSG_CHECKING([for joinable pthread attribute]) + attr_name=unknown + for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do + AC_TRY_LINK([#include ], [int attr=$attr; return attr;], + [attr_name=$attr; break]) + done + AC_MSG_RESULT($attr_name) + if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then + AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name, + [Define to necessary symbol if this constant + uses a non-standard name on your system.]) + fi + + AC_MSG_CHECKING([if more special flags are required for pthreads]) + flag=no + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";; + *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";; + esac + AC_MSG_RESULT(${flag}) + if test "x$flag" != xno; then + PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" + fi + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + # More AIX lossage: must compile with xlc_r or cc_r + if test x"$GCC" != xyes; then + AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC}) + else + PTHREAD_CC=$CC + fi + + # The next part tries to detect GCC inconsistency with -shared on some + # architectures and systems. The problem is that in certain + # configurations, when -shared is specified, GCC "forgets" to + # internally use various flags which are still necessary. + + # + # Prepare the flags + # + save_CFLAGS="$CFLAGS" + save_LIBS="$LIBS" + save_CC="$CC" + + # Try with the flags determined by the earlier checks. 
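+ # (Illustratively, the probe below amounts to attempting something like
+ #    $PTHREAD_CC -shared -fPIC -Wl,-z,defs conftest.c $PTHREAD_LIBS
+ # first with an empty program, then with the pthread test program.)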
+ # + # -Wl,-z,defs forces link-time symbol resolution, so that the + # linking checks with -shared actually have any value + # + # FIXME: -fPIC is required for -shared on many architectures, + # so we specify it here, but the right way would probably be to + # properly detect whether it is actually required. + CFLAGS="-shared -fPIC -Wl,-z,defs $CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + CC="$PTHREAD_CC" + + # In order not to create several levels of indentation, we test + # the value of "$done" until we find the cure or run out of ideas. + done="no" + + # First, make sure the CFLAGS we added are actually accepted by our + # compiler. If not (and OS X's ld, for instance, does not accept -z), + # then we can't do this test. + if test x"$done" = xno; then + AC_MSG_CHECKING([whether to check for GCC pthread/shared inconsistencies]) + AC_TRY_LINK(,, , [done=yes]) + + if test "x$done" = xyes ; then + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + fi + + if test x"$done" = xno; then + AC_MSG_CHECKING([whether -pthread is sufficient with -shared]) + AC_TRY_LINK([#include ], + [pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], + [done=yes]) + + if test "x$done" = xyes; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + fi + + # + # Linux gcc on some architectures such as mips/mipsel forgets + # about -lpthread + # + if test x"$done" = xno; then + AC_MSG_CHECKING([whether -lpthread fixes that]) + LIBS="-lpthread $PTHREAD_LIBS $save_LIBS" + AC_TRY_LINK([#include ], + [pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], + [done=yes]) + + if test "x$done" = xyes; then + AC_MSG_RESULT([yes]) + PTHREAD_LIBS="-lpthread $PTHREAD_LIBS" + else + AC_MSG_RESULT([no]) + fi + fi + # + # FreeBSD 4.10 gcc forgets to use -lc_r instead of -lc + # + if test x"$done" = xno; then + AC_MSG_CHECKING([whether -lc_r fixes that]) + LIBS="-lc_r $PTHREAD_LIBS $save_LIBS" + AC_TRY_LINK([#include ], + [pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], + [done=yes]) + + if test "x$done" = xyes; then + AC_MSG_RESULT([yes]) + PTHREAD_LIBS="-lc_r $PTHREAD_LIBS" + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$done" = xno; then + # OK, we have run out of ideas + AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries]) + + # so it's not safe to assume that we may use pthreads + acx_pthread_ok=no + fi + + CFLAGS="$save_CFLAGS" + LIBS="$save_LIBS" + CC="$save_CC" +else + PTHREAD_CC="$CC" +fi + +AC_SUBST(PTHREAD_LIBS) +AC_SUBST(PTHREAD_CFLAGS) +AC_SUBST(PTHREAD_CC) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$acx_pthread_ok" = xyes; then + ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1]) + : +else + acx_pthread_ok=no + $2 +fi +AC_LANG_RESTORE +])dnl ACX_PTHREAD diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h new file mode 100644 index 00000000..63b6f4c2 --- /dev/null +++ b/utils/ccrp_nt.h @@ -0,0 +1,169 @@ +#ifndef _CCRP_NT_H_ +#define _CCRP_NT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sampler.h" +#include "slice_sampler.h" + +// Chinese restaurant process (1 parameter) +template > +class CCRP_NoTable { + public: + explicit CCRP_NoTable(double conc) : + num_customers_(), + 
concentration_(conc), + concentration_prior_shape_(std::numeric_limits::quiet_NaN()), + concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : + num_customers_(), + concentration_(c), + concentration_prior_shape_(c_shape), + concentration_prior_rate_(c_rate) {} + + double concentration() const { return concentration_; } + + bool has_concentration_prior() const { + return !std::isnan(concentration_prior_shape_); + } + + void clear() { + num_customers_ = 0; + custs_.clear(); + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = custs_.find(dish); + if (it == custs_.end()) return 0; + return it->second; + } + + int increment(const Dish& dish) { + int table_diff = 0; + if (++custs_[dish] == 1) + table_diff = 1; + ++num_customers_; + return table_diff; + } + + int decrement(const Dish& dish) { + int table_diff = 0; + int nc = --custs_[dish]; + if (nc == 0) { + custs_.erase(dish); + table_diff = -1; + } else if (nc < 0) { + std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; + abort(); + } + --num_customers_; + return table_diff; + } + + double prob(const Dish& dish, const double& p0) const { + const unsigned at_table = num_customers(dish); + return (at_table + p0 * concentration_) / (num_customers_ + concentration_); + } + + double logprob(const Dish& dish, const double& logp0) const { + const unsigned at_table = num_customers(dish); + return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + } + + double log_crp_prob() const { + return log_crp_prob(concentration_); + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include P_0's + double log_crp_prob(const double& concentration) const { + double lp = 0.0; + if (has_concentration_prior()) + lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + lp += lgamma(concentration) - lgamma(concentration + num_customers_) + + custs_.size() * log(concentration); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = custs_.begin(); + it != custs_.end(); ++it) { + lp += lgamma(it->second); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_concentration_prior()); + ConcentrationResampler cr(*this); + for (int iter = 0; iter < nloop; ++iter) { + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + } + + struct ConcentrationResampler { + ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} + const CCRP_NoTable& crp_; + double operator()(const double& proposed_concentration) const { + return crp_.log_crp_prob(proposed_concentration); + } + }; + + void Print(std::ostream* out) const { + (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl; + int cc = 0; + for (typename std::tr1::unordered_map::const_iterator it = custs_.begin(); 
+ it != custs_.end(); ++it) { + (*out) << " " << it->first << "(" << it->second << " eating)"; + ++cc; + if (cc > 10) { (*out) << " ..."; break; } + } + (*out) << std::endl; + } + + unsigned num_customers_; + std::tr1::unordered_map custs_; + + typedef typename std::tr1::unordered_map::const_iterator const_iterator; + const_iterator begin() const { + return custs_.begin(); + } + const_iterator end() const { + return custs_.end(); + } + + double concentration_; + + // optional gamma prior on concentration_ (NaN if no prior) + double concentration_prior_shape_; + double concentration_prior_rate_; +}; + +template +std::ostream& operator<<(std::ostream& o, const CCRP_NoTable& c) { + c.Print(&o); + return o; +} + +#endif diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h new file mode 100644 index 00000000..a868af9a --- /dev/null +++ b/utils/ccrp_onetable.h @@ -0,0 +1,241 @@ +#ifndef _CCRP_ONETABLE_H_ +#define _CCRP_ONETABLE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "sampler.h" +#include "slice_sampler.h" + +// Chinese restaurant process (Pitman-Yor parameters) with one table approximation + +template > +class CCRP_OneTable { + typedef std::tr1::unordered_map DishMapType; + public: + CCRP_OneTable(double disc, double conc) : + num_tables_(), + num_customers_(), + discount_(disc), + concentration_(conc), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), + concentration_prior_shape_(std::numeric_limits::quiet_NaN()), + concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + num_tables_(), + num_customers_(), + discount_(d), + concentration_(c), + discount_prior_alpha_(d_alpha), + discount_prior_beta_(d_beta), + concentration_prior_shape_(c_shape), + concentration_prior_rate_(c_rate) {} + + double discount() const { return discount_; } + double concentration() const { return concentration_; } + void set_concentration(double c) { concentration_ = c; } + void set_discount(double d) { discount_ = d; } + + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); + } + + bool has_concentration_prior() const { + return !std::isnan(concentration_prior_shape_); + } + + void clear() { + num_tables_ = 0; + num_customers_ = 0; + dish_counts_.clear(); + } + + unsigned num_tables() const { + return num_tables_; + } + + unsigned num_tables(const Dish& dish) const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + if (it == dish_counts_.end()) return 0; + return 1; + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + if (it == dish_counts_.end()) return 0; + return it->second; + } + + // returns +1 or 0 indicating whether a new table was opened + int increment(const Dish& dish) { + unsigned& dc = dish_counts_[dish]; + ++dc; + ++num_customers_; + if (dc == 1) { + ++num_tables_; + return 1; + } else { + return 0; + } + } + + // returns -1 or 0, indicating whether a table was closed + int decrement(const Dish& dish) { + unsigned& dc = dish_counts_[dish]; + assert(dc > 0); + if (dc == 1) { + dish_counts_.erase(dish); + --num_tables_; + --num_customers_; + return -1; + } else { + assert(dc > 1); + --dc; + --num_customers_; + return 0; + } + } + + double prob(const Dish& dish, const 
double& p0) const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + const double r = num_tables_ * discount_ + concentration_; + if (it == dish_counts_.end()) { + return r * p0 / (num_customers_ + concentration_); + } else { + return (it->second - discount_ + r * p0) / + (num_customers_ + concentration_); + } + } + + double log_crp_prob() const { + return log_crp_prob(discount_, concentration_); + } + + static double log_beta_density(const double& x, const double& alpha, const double& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); + return lp; + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include P_0's + double log_crp_prob(const double& discount, const double& concentration) const { + double lp = 0.0; + if (has_discount_prior()) + lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_concentration_prior()) + lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + lp += lgamma(concentration) - lgamma(concentration + num_customers_) + + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) + - lgamma(concentration / discount); + assert(std::isfinite(lp)); + for (typename DishMapType::const_iterator it = dish_counts_.begin(); + it != dish_counts_.end(); ++it) { + const unsigned& cur = it->second; + lp += lgamma(cur - discount) - r; + } + } else { + assert(!"not implemented yet"); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_discount_prior() || has_concentration_prior()); + DiscountResampler dr(*this); + ConcentrationResampler cr(*this); + for (int iter = 0; iter < nloop; ++iter) { + if (has_concentration_prior()) { + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + } + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + + struct DiscountResampler { + DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {} + const CCRP_OneTable& crp_; + double operator()(const double& proposed_discount) const { + return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + } + }; + + struct ConcentrationResampler { + ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {} + const CCRP_OneTable& crp_; + double operator()(const double& proposed_concentration) const { + return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + } + }; + + void Print(std::ostream* out) const { + (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + for (typename DishMapType::const_iterator it = 
dish_counts_.begin(); it != dish_counts_.end(); ++it) { + (*out) << " " << it->first << " = " << it->second << std::endl; + } + } + + typedef typename DishMapType::const_iterator const_iterator; + const_iterator begin() const { + return dish_counts_.begin(); + } + const_iterator end() const { + return dish_counts_.end(); + } + + unsigned num_tables_; + unsigned num_customers_; + DishMapType dish_counts_; + + double discount_; + double concentration_; + + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; + + // optional gamma prior on concentration_ (NaN if no prior) + double concentration_prior_shape_; + double concentration_prior_rate_; +}; + +template +std::ostream& operator<<(std::ostream& o, const CCRP_OneTable& c) { + c.Print(&o); + return o; +} + +#endif -- cgit v1.2.3 From ffaae62e4f1cedabbc6eb1982af129e7294d33eb Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 11 Oct 2011 12:56:49 +0100 Subject: missing numwords impl --- utils/tdict.cc | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'utils') diff --git a/utils/tdict.cc b/utils/tdict.cc index c21b2b48..de234323 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -13,6 +13,10 @@ using namespace std; Dict TD::dict_; +unsigned int TD::NumWords() { + return dict_.max(); +} + WordID TD::Convert(const std::string& s) { return dict_.Convert(s); } -- cgit v1.2.3 From 08c4a7fae8f0bec4f76c4e0928e357100eb7a1ca Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Tue, 11 Oct 2011 16:16:53 +0100 Subject: remove implicit conversion-to-double operator from LogVal that caused overflow errors, clean up some pf code --- decoder/aligner.cc | 2 +- decoder/cfg.cc | 2 +- decoder/cfg_format.h | 2 +- decoder/decoder.cc | 10 ++++---- decoder/hg.cc | 4 ++-- decoder/rule_lexer.l | 2 ++ decoder/trule.h | 15 +++++++++++- gi/pf/brat.cc | 11 --------- gi/pf/cbgi.cc | 10 -------- gi/pf/dpnaive.cc | 12 ---------- gi/pf/itg.cc | 11 --------- gi/pf/pfbrat.cc | 11 --------- gi/pf/pfdist.cc | 11 --------- gi/pf/pfnaive.cc | 11 --------- mteval/mbr_kbest.cc | 4 ++-- phrasinator/ccrp_nt.h | 24 +++++++++++++++---- training/mpi_batch_optimize.cc | 2 +- training/mpi_compute_cllh.cc | 51 +++++++++++++++++++---------------------- training/mpi_online_optimize.cc | 4 ++-- utils/logval.h | 10 ++++---- 20 files changed, 78 insertions(+), 131 deletions(-) (limited to 'utils') diff --git a/decoder/aligner.cc b/decoder/aligner.cc index 292ee123..53e059fb 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -165,7 +165,7 @@ inline void WriteProbGrid(const Array2D& m, ostream* pos) { if (m(i,j) == prob_t::Zero()) { os << "\t---X---"; } else { - snprintf(b, 1024, "%0.5f", static_cast(m(i,j))); + snprintf(b, 1024, "%0.5f", m(i,j).as_float()); os << '\t' << b; } } diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 651978d2..cd7e66e9 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -639,7 +639,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const { o << '['<& src, SparseVector* trg) { for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it) - trg->set_value(it->first, it->second); + trg->set_value(it->first, it->second.as_float()); } }; @@ -788,10 +788,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { const bool show_tree_structure=conf.count("show_tree_structure"); if (!SILENT) forest_stats(forest," Init. 
forest",show_tree_structure,oracle.show_derivation); if (conf.count("show_expected_length")) { - const PRPair res = - Inside, - PRWeightFunction >(forest); - cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; + const PRPair res = + Inside, + PRWeightFunction >(forest); + cerr << " Expected length (words): " << (res.r / res.p).as_float() << "\t" << res << endl; } if (conf.count("show_partition")) { diff --git a/decoder/hg.cc b/decoder/hg.cc index 3ad17f1a..180986d7 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -157,14 +157,14 @@ prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector* posts) co const ScaledEdgeProb weight(scale); const ScaledTransitionEventWeightFunction w2(scale); SparseVector pv; - const double inside = InsideOutside, ScaledTransitionEventWeightFunction>(*this, &pv, weight, w2); posts->resize(edges_.size()); for (int i = 0; i < edges_.size(); ++i) (*posts)[i] = prob_t(pv.value(i)); - return prob_t(inside); + return inside; } prob_t Hypergraph::ComputeBestPathThroughEdges(vector* post) const { diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l index 9331d8ed..083a5bb1 100644 --- a/decoder/rule_lexer.l +++ b/decoder/rule_lexer.l @@ -220,6 +220,8 @@ NT [^\t \[\],]+ std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n"; abort(); } + // const bool ignore_grammar_features = false; + // if (ignore_grammar_features) scfglex_num_feats = 0; TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity, scfglex_als, scfglex_num_als)); check_and_update_ctf_stack(rp); TRulePtr coarse_rp = ((ctf_level == 0) ? TRulePtr() : ctf_rule_stack.top()); diff --git a/decoder/trule.h b/decoder/trule.h index 4df4ec90..8eb2a059 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -5,7 +5,9 @@ #include #include #include -#include + +#include "boost/shared_ptr.hpp" +#include "boost/functional/hash.hpp" #include "sparse_vector.h" #include "wordid.h" @@ -162,4 +164,15 @@ class TRule { bool SanityCheck() const; }; +inline size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +inline bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + #endif diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 4c6ba3ef..7b60ef23 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; struct FSTState; -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc index 20204e8a..97f1ba34 100644 --- a/gi/pf/cbgi.cc +++ b/gi/pf/cbgi.cc @@ -27,16 +27,6 @@ double log_decay(unsigned x, const double& b) { return log(b - 1) - x * log(b); } -size_t hash_value(const TRule& r) { - // TODO fix hash function - size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_; - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == 
b.f_); -} - struct SimpleBase { SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : uniform_e(-log(esize)), diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index 582d1be7..608f73d5 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -20,18 +20,6 @@ namespace po = boost::program_options; static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index 2c2a86f9..ac3c16a3 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,17 +27,6 @@ ostream& operator<<(ostream& os, const vector& p) { return os << ']'; } -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 4c6ba3ef..7b60ef23 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; struct FSTState; -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index 18dfd03b..81abd61b 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -24,17 +24,6 @@ namespace po = boost::program_options; shared_ptr prng; -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 43c604c3..c30e7c4f 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -24,17 +24,6 @@ namespace po = boost::program_options; shared_ptr prng; -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 2867b36b..64a6a8bf 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -32,7 +32,7 @@ void InitCommandLine(int 
argc, char** argv, po::variables_map* conf) { } struct LossComparer { - bool operator()(const pair, double>& a, const pair, double>& b) const { + bool operator()(const pair, prob_t>& a, const pair, prob_t>& b) const { return a.second < b.second; } }; @@ -108,7 +108,7 @@ int main(int argc, char** argv) { ScoreP s = scorer->ScoreCandidate(list[j].first); double loss = 1.0 - s->ComputeScore(); if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); + double weighted_loss = loss * (joints[j] / marginal).as_float(); wl_acc += weighted_loss; if ((!output_list) && wl_acc > mbr_loss) break; } diff --git a/phrasinator/ccrp_nt.h b/phrasinator/ccrp_nt.h index 163b643a..811bce73 100644 --- a/phrasinator/ccrp_nt.h +++ b/phrasinator/ccrp_nt.h @@ -50,15 +50,26 @@ class CCRP_NoTable { return it->second; } - void increment(const Dish& dish) { - ++custs_[dish]; + int increment(const Dish& dish) { + int table_diff = 0; + if (++custs_[dish] == 1) + table_diff = 1; ++num_customers_; + return table_diff; } - void decrement(const Dish& dish) { - if ((--custs_[dish]) == 0) + int decrement(const Dish& dish) { + int table_diff = 0; + int nc = --custs_[dish]; + if (nc == 0) { custs_.erase(dish); + table_diff = -1; + } else if (nc < 0) { + std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; + abort(); + } --num_customers_; + return table_diff; } double prob(const Dish& dish, const double& p0) const { @@ -66,6 +77,11 @@ class CCRP_NoTable { return (at_table + p0 * concentration_) / (num_customers_ + concentration_); } + double logprob(const Dish& dish, const double& logp0) const { + const unsigned at_table = num_customers(dish); + return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + } + double log_crp_prob() const { return log_crp_prob(concentration_); } diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index 0ba8c530..046e921c 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -92,7 +92,7 @@ struct TrainingObserver : public DecoderObserver { void SetLocalGradientAndObjective(vector* g, double* o) const { *o = acc_obj; for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - (*g)[it->first] = it->second; + (*g)[it->first] = it->second.as_float(); } virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc index b496d196..d5caa745 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/mpi_compute_cllh.cc @@ -1,6 +1,4 @@ -#include #include -#include #include #include #include @@ -12,6 +10,7 @@ #include #include +#include "sentence_metadata.h" #include "verbose.h" #include "hg.h" #include "prob.h" @@ -52,7 +51,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { return true; } -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { +void ReadInstances(const string& fname, int rank, int size, vector* c) { + assert(fname != "-"); ReadFile rf(fname); istream& in = *rf.stream(); string line; @@ -60,20 +60,16 @@ void ReadTrainingCorpus(const string& fname, int rank, int size, vector* while(in) { getline(in, line); if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } + if (lc % size == rank) c->push_back(line); ++lc; } } static const double kMINUS_EPSILON = -1e-6; -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_obj = 0; 
- } +struct ConditionalLikelihoodObserver : public DecoderObserver { + + ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {} virtual void NotifyDecodingStart(const SentenceMetadata&) { cur_obj = 0; @@ -120,8 +116,10 @@ struct TrainingObserver : public DecoderObserver { } assert(!isnan(log_ref_z)); acc_obj += (cur_obj - log_ref_z); + trg_words += smeta.GetReference().size(); } + unsigned trg_words; double acc_obj; double cur_obj; int state; @@ -161,35 +159,32 @@ int main(int argc, char** argv) { if (conf.count("weights")) Weights::InitFromFile(conf["weights"].as(), &weights); - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - - vector corpus; vector ids; - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + vector corpus; + ReadInstances(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - - TrainingObserver observer; - double objective = 0; - observer.Reset(); if (rank == 0) - cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; + cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n"; - for (int i = 0; i < corpus.size(); ++i) { - decoder.SetId(ids[i]); + ConditionalLikelihoodObserver observer; + for (int i = 0; i < corpus.size(); ++i) decoder.Decode(corpus[i], &observer); - } + double objective = 0; + unsigned total_words = 0; #ifdef HAVE_MPI reduce(world, observer.acc_obj, objective, std::plus(), 0); + reduce(world, observer.trg_words, total_words, std::plus(), 0); #else objective = observer.acc_obj; #endif - if (rank == 0) - cout << "OBJECTIVE: " << objective << endl; + if (rank == 0) { + cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl; + cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl; + cout << " CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl; + cout << " PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl; + } return 0; } diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 2ef4a2e7..f87b7274 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -94,7 +94,7 @@ struct TrainingObserver : public DecoderObserver { void SetLocalGradientAndObjective(vector* g, double* o) const { *o = acc_obj; for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - (*g)[it->first] = it->second; + (*g)[it->first] = it->second.as_float(); } virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { @@ -158,7 +158,7 @@ struct TrainingObserver : public DecoderObserver { void GetGradient(SparseVector* g) const { g->clear(); for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - g->set_value(it->first, it->second); + g->set_value(it->first, it->second.as_float()); } int total_complete; diff --git a/utils/logval.h b/utils/logval.h index 6fdc2c42..8a59d0b1 100644 --- a/utils/logval.h +++ b/utils/logval.h @@ -25,12 +25,13 @@ class LogVal { typedef LogVal Self; LogVal() : s_(), v_(LOGVAL_LOG0) {} - explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + const Self& operator=(double x) { s_ = std::signbit(x); v_ = s_ ? 
std::log(-x) : std::log(x); return *this; } LogVal(init_minus_1) : s_(true),v_(0) { } LogVal(init_1) : s_(),v_(0) { } LogVal(init_0) : s_(),v_(LOGVAL_LOG0) { } - LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} - LogVal(unsigned x) : s_(0), v_(std::log(x)) { } + explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} + explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { } LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} LogVal(double lnx,init_lnx) : s_(),v_(lnx) {} static Self exp(T lnx) { return Self(lnx,false); } @@ -141,9 +142,6 @@ class LogVal { return pow(1/root); } - operator T() const { - if (s_) return -std::exp(v_); else return std::exp(v_); - } T as_float() const { if (s_) return -std::exp(v_); else return std::exp(v_); } -- cgit v1.2.3 From 575cd9f9102596b396ac8e7ab4f0c74b35d369e6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 12 Oct 2011 14:57:15 +0100 Subject: model length properly, clean up --- gi/pf/Makefile.am | 2 +- gi/pf/corpus.cc | 57 ++++++++++++++++++++++++ gi/pf/corpus.h | 19 ++++++++ gi/pf/dpnaive.cc | 95 +++++++++++----------------------------- gi/pf/monotonic_pseg.h | 88 +++++++++++++++++++++++++++++++++++++ gi/pf/pfnaive.cc | 116 +++++-------------------------------------- utils/logval_test.cc | 14 +++--- 7 files changed, 209 insertions(+), 182 deletions(-) create mode 100644 gi/pf/corpus.cc create mode 100644 gi/pf/corpus.h create mode 100644 gi/pf/monotonic_pseg.h (limited to 'utils') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index c9764ad5..42758939 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc itg_SOURCES = itg.cc diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc new file mode 100644 index 00000000..a408e7cf --- /dev/null +++ b/gi/pf/corpus.cc @@ -0,0 +1,57 @@ +#include "corpus.h" + +#include +#include +#include + +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +namespace corpus { + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } +} + +} + diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h new file mode 100644 index 00000000..e7febdb7 --- /dev/null +++ b/gi/pf/corpus.h @@ -0,0 +1,19 @@ +#ifndef _CORPUS_H_ +#define _CORPUS_H_ + +#include +#include +#include +#include "wordid.h" + +namespace corpus { + +void ReadParallelCorpus(const std::string& filename, + std::vector >* f, + std::vector >* e, + std::set* vocab_f, + std::set* vocab_e); + +} + +#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index 582d1be7..608f73d5 100644 ---
a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -7,12 +7,14 @@ #include #include "base_measures.h" +#include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" #include "filelib.h" #include "dict.h" #include "sampler.h" #include "ccrp_nt.h" +#include "corpus.h" using namespace std; using namespace std::tr1; @@ -52,57 +54,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_e, - set* vocab_f) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - shared_ptr prng; template struct ModelAndData { - explicit ModelAndData(const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + model(m), rng(&*prng), p0(b), baseprob(prob_t::One()), @@ -110,14 +67,12 @@ struct ModelAndData { corpusf(cf), vocabe(ve), vocabf(vf), - rules(1,1), mh_samples(), mh_rejects(), kX(-TD::Convert("X")), derivations(corpuse.size()) {} void ResampleHyperparameters() { - rules.resample_hyperparameters(&*prng); } void InstantiateRule(const pair& from, @@ -139,12 +94,10 @@ struct ModelAndData { TRule x; for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - //cerr << "REMOVE: " << x.AsString() << endl; - if (rules.decrement(x)) { - baseprob /= p0(x); - //cerr << " (REMOVED ONLY INSTANCE)\n"; - } + model.DecrementRule(x); + model.DecrementContinue(); } + model.DecrementStop(); } void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { @@ -161,39 +114,38 @@ struct ModelAndData { TRule x; for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - if (rules.increment(x)) { - baseprob *= p0(x); - } + model.IncrementRule(x); + model.IncrementContinue(); } + model.IncrementStop(); } prob_t Likelihood() const { - prob_t p; - p.logeq(rules.log_crp_prob()); - return p * baseprob; + return model.Likelihood(); } prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { - prob_t p = prob_t::One(); + prob_t p = model.StopProbability(); if (d.size() < 2) return p; TRule x; + const prob_t p_cont = model.ContinueProbability(); for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - prob_t rp; rp.logeq(rules.logprob(x, log(p0(x)))); - p *= rp; + p *= p_cont; + p *= model.RuleProbability(x); } return p; } void Sample(); + MonotonicParallelSegementationModel& model; MT19937* rng; const Base& p0; prob_t baseprob; // cached value of generating the table table labels from p0 // this can't be used if we go to a hierarchical prior! 
const vector >& corpuse, corpusf; const set& vocabe, vocabf; - CCRP_NoTable rules; unsigned mh_samples, mh_rejects; const int kX; vector > > derivations; @@ -201,8 +153,8 @@ struct ModelAndData { template void ModelAndData::Sample() { - unsigned MAXK = 4; - unsigned MAXL = 4; + unsigned MAXK = kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; TRule x; x.lhs_ = -TD::Convert("X"); for (int samples = 0; samples < 1000; ++samples) { @@ -228,6 +180,8 @@ void ModelAndData::Sample() { boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); a[0][0] = prob_t::One(); + const prob_t q_stop = model.StopProbability(); + const prob_t q_cont = model.ContinueProbability(); for (int i = 0; i < sentf.size(); ++i) { for (int j = 0; j < sente.size(); ++j) { const prob_t src_a = a[i][j]; @@ -239,7 +193,9 @@ void ModelAndData::Sample() { for (int l = 1; l <= MAXL; ++l) { if (j + l > sente.size()) break; x.e_.push_back(sente[j + l - 1]); - trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x)))); + const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size()); + const prob_t& cp = stop_now ? q_stop : q_cont; + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp; a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; } } @@ -319,7 +275,7 @@ int main(int argc, char** argv) { vector > corpuse, corpusf; set vocabe, vocabf; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; @@ -328,8 +284,9 @@ int main(int argc, char** argv) { Model1 m1(conf["model1"].as()); PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MonotonicParallelSegementationModel m(lp0); - ModelAndData posterior(lp0, corpuse, corpusf, vocabe, vocabf); + ModelAndData posterior(m, lp0, corpuse, corpusf, vocabe, vocabf); posterior.Sample(); return 0; diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h new file mode 100644 index 00000000..7e6af3fc --- /dev/null +++ b/gi/pf/monotonic_pseg.h @@ -0,0 +1,88 @@ +#ifndef _MONOTONIC_PSEG_H_ +#define _MONOTONIC_PSEG_H_ + +#include + +#include "prob.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "base_measures.h" + +struct MonotonicParallelSegementationModel { + explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRulesAndStops(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + if (rules.size()) IncrementContinue(rules.size() - 1); + IncrementStop(); + } + + void DecrementRulesAndStops(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + if (rules.size()) { + DecrementContinue(rules.size() - 1); + DecrementStop(); + } + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p 
*= q; + q.logeq(stop.log_crp_prob()); + p *= q; + return p; + } + + void IncrementStop() { + stop.increment(true); + } + + void IncrementContinue(int n = 1) { + for (int i = 0; i < n; ++i) + stop.increment(false); + } + + void DecrementStop() { + stop.decrement(true); + } + + void DecrementContinue(int n = 1) { + for (int i = 0; i < n; ++i) + stop.decrement(false); + } + + prob_t StopProbability() const { + return prob_t(stop.prob(true, 0.5)); + } + + prob_t ContinueProbability() const { + return prob_t(stop.prob(false, 0.5)); + } + + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + CCRP_NoTable stop; +}; + +#endif + diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index c30e7c4f..33dc08c3 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,6 +7,7 @@ #include #include "base_measures.h" +#include "monotonic_pseg.h" #include "reachability.h" #include "viterbi.h" #include "hg.h" @@ -17,6 +18,7 @@ #include "sampler.h" #include "ccrp_nt.h" #include "ccrp_onetable.h" +#include "corpus.h" using namespace std; using namespace tr1; @@ -58,101 +60,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -struct MyJointModel { - MyJointModel(PhraseJointBase& rcp0) : - rp0(rcp0), base(prob_t::One()), rules(1,1) {} - - void DecrementRule(const TRule& rule) { - if (rules.decrement(rule)) - base /= rp0(rule); - } - - void IncrementRule(const TRule& rule) { - if (rules.increment(rule)) - base *= rp0(rule); - } - - void IncrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - prob_t RuleProbability(const TRule& rule) const { - prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); - return p; - } - - prob_t Likelihood() const { - prob_t p = base; - prob_t q; q.logeq(rules.log_crp_prob()); - p *= q; - for (unsigned l = 1; l < src_jumps.size(); ++l) { - if (src_jumps[l].num_customers() > 0) { - prob_t q; - q.logeq(src_jumps[l].log_crp_prob()); - p *= q; - } - } - return p; - } - - const PhraseJointBase& rp0; - prob_t base; - CCRP_NoTable rules; - vector > src_jumps; -}; - struct BackwardEstimateSym { BackwardEstimateSym(const Model1& m1, const Model1& invm1, const vector& src, const vector& trg) : @@ -264,7 +171,7 @@ int main(int argc, char** argv) { vector > corpuse, corpusf; set vocabe, vocabf; cerr << "Reading corpus...\n"; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, 
&corpuse, &vocabf, &vocabe); cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); @@ -273,13 +180,8 @@ int main(int argc, char** argv) { Model1 m1(conf["model1"].as()); Model1 invm1(conf["inverse_model1"].as()); -#if 0 - PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); - MyConditionalModel m(lp0); -#else PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); - MyJointModel m(lp0); -#endif + MonotonicParallelSegementationModel m(lp0); cerr << "Initializing reachability limits...\n"; vector ps(corpusf.size()); @@ -296,7 +198,10 @@ int main(int argc, char** argv) { for (int ci = 0; ci < corpusf.size(); ++ci) { vector& src = corpusf[ci]; vector& trg = corpuse[ci]; - m.DecrementRules(ps[ci].rules); + m.DecrementRulesAndStops(ps[ci].rules); + const prob_t q_stop = m.StopProbability(); + const prob_t q_cont = m.ContinueProbability(); + cerr << "P(stop)=" << q_stop << "\tP(continue)=" <AsString() << "\n"; } cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; } diff --git a/utils/logval_test.cc b/utils/logval_test.cc index 4aa452f2..6133f5ce 100644 --- a/utils/logval_test.cc +++ b/utils/logval_test.cc @@ -30,13 +30,13 @@ TEST_F(LogValTest,Negate) { LogVal x(-2.4); LogVal y(2.4); y.negate(); - EXPECT_FLOAT_EQ(x,y); + EXPECT_FLOAT_EQ(x.as_float(),y.as_float()); } TEST_F(LogValTest,Inverse) { LogVal x(1/2.4); LogVal y(2.4); - EXPECT_FLOAT_EQ(x,y.inverse()); + EXPECT_FLOAT_EQ(x.as_float(),y.inverse().as_float()); } TEST_F(LogValTest,Minus) { @@ -45,9 +45,9 @@ TEST_F(LogValTest,Minus) { LogVal z1 = x - y; LogVal z2 = x; z2 -= y; - EXPECT_FLOAT_EQ(z1, z2); - EXPECT_FLOAT_EQ(z1, 10.0); - EXPECT_FLOAT_EQ(y - x, -10.0); + EXPECT_FLOAT_EQ(z1.as_float(), z2.as_float()); + EXPECT_FLOAT_EQ(z1.as_float(), 10.0); + EXPECT_FLOAT_EQ((y - x).as_float(), -10.0); } TEST_F(LogValTest,TestOps) { @@ -62,8 +62,8 @@ TEST_F(LogValTest,TestOps) { LogVal bb(-0.3); cerr << (aa + bb) << endl; cerr << (bb + aa) << endl; - EXPECT_FLOAT_EQ((aa + bb), (bb + aa)); - EXPECT_FLOAT_EQ((aa + bb), -0.1); + EXPECT_FLOAT_EQ((aa + bb).as_float(), (bb + aa).as_float()); + EXPECT_FLOAT_EQ((aa + bb).as_float(), -0.1); } TEST_F(LogValTest,TestSizes) { -- cgit v1.2.3
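
A note on the TRule hashing consolidation in the patches above: the trule.h hunk replaces per-file copies of hash_value(const TRule&) and operator==(const TRule&, const TRule&), previously duplicated with mutually inconsistent hash functions in brat.cc, cbgi.cc, dpnaive.cc, itg.cc, pfbrat.cc, pfdist.cc and pfnaive.cc, with a single pair of inline definitions in the header. Non-inline duplicates with external linkage are a duplicate-symbol error the moment two of those translation units are linked together, and the inconsistent hashes (cbgi.cc even carried a "TODO fix hash function") meant identical rules could hash differently depending on the file. A minimal, self-contained sketch of the pattern; RuleSketch is a hypothetical stand-in for TRule, reduced to the three fields the real hash touches:

#include <cstddef>
#include <vector>
#include <boost/functional/hash.hpp>
#include <boost/unordered_map.hpp>

struct RuleSketch {              // hypothetical stand-in for TRule
  int lhs_;
  std::vector<int> e_, f_;
};

// boost::hash<RuleSketch> finds this via argument-dependent lookup, so one
// inline definition in a header serves every container and translation unit.
inline std::size_t hash_value(const RuleSketch& r) {
  std::size_t h = boost::hash_value(r.e_);
  boost::hash_combine(h, -r.lhs_);
  boost::hash_combine(h, boost::hash_value(r.f_));
  return h;
}

inline bool operator==(const RuleSketch& a, const RuleSketch& b) {
  return a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_;
}

int main() {
  boost::unordered_map<RuleSketch, int> counts;  // e.g. rule -> customer count
  RuleSketch r;
  r.lhs_ = -1;
  r.e_.push_back(7);
  r.f_.push_back(9);
  ++counts[r];
  ++counts[r];
  return counts[r] == 2 ? 0 : 1;
}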
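The ccrp_nt.h change makes increment() and decrement() report how the number of occupied "tables" (distinct dish types) changed: +1 when a dish is seated for the first time, -1 when its last customer leaves, 0 otherwise, aborting if a count ever drops below zero. Callers use that return value to keep a cached product of base-measure probabilities in sync, which is exactly what MonotonicParallelSegementationModel::IncrementRule/DecrementRule do later in the series. A hedged sketch of the consuming pattern; CrpSketch, p0 and base are illustrative names, not the library's:

#include <cstdio>
#include <map>
#include <string>

// Minimal stand-in for CCRP_NoTable<std::string>: increment/decrement return
// the change in the number of distinct dishes, matching the patched interface.
struct CrpSketch {
  std::map<std::string, int> custs;
  int increment(const std::string& dish) {
    return (++custs[dish] == 1) ? 1 : 0;      // new dish type created?
  }
  int decrement(const std::string& dish) {    // caller must have incremented first
    std::map<std::string, int>::iterator it = custs.find(dish);
    if (--(it->second) == 0) { custs.erase(it); return -1; }
    return 0;                                 // real class aborts on negatives
  }
};

int main() {
  CrpSketch crp;
  double base = 1.0;        // cached product of base-measure probabilities
  const double p0 = 0.125;  // illustrative base probability of "rule"
  if (crp.increment("rule") == 1) base *= p0;   // first token of a new type
  if (crp.increment("rule") == 1) base *= p0;   // type already seated: no change
  if (crp.decrement("rule") == -1) base /= p0;  // one customer still seated
  if (crp.decrement("rule") == -1) base /= p0;  // last one out: undo the factor
  std::printf("base = %g (back to 1)\n", base);
  return 0;
}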
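The rewritten mpi_compute_cllh output derives all four printed numbers from two reduced quantities: objective, the summed conditional negative log likelihood in nats (acc_obj accumulates cur_obj - log_ref_z per sentence), and total_words, the summed reference lengths. The rest is unit conversion; a sketch of the arithmetic with illustrative inputs in place of the MPI reduce results:

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative values; in the trainer these come out of the MPI reduce.
  double objective = 6931.5;     // summed -log_e p(ref | src), in nats
  unsigned total_words = 2000;   // summed reference token counts

  double log2_likelihood = objective / std::log(2.0);  // nats -> bits
  double entropy = log2_likelihood / total_words;      // bits per target word
  double perplexity = std::pow(2.0, entropy);          // == exp(objective / total_words)

  std::printf("CONDITIONAL LOG_2 LIKELIHOOD: %g\n", log2_likelihood);
  std::printf("CONDITIONAL ENTROPY: %g bits/word\n", entropy);
  std::printf("PERPLEXITY: %g\n", perplexity);  // here: 2^5 = 32
  return 0;
}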
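The logval.h hunk tightens conversions in both directions: construction from double becomes implicit, construction from the integer types becomes explicit, and the implicit operator T() is removed outright, which is why so many call sites in the same patch gain .as_float() (the gradient observers, mbr_kbest, the logval tests). The hazard being removed is a silent round-trip through linear space that destroys the whole point of log-domain arithmetic. A minimal sketch, assuming a stripped-down stand-in; LogSketch is not the real LogVal:

#include <cmath>
#include <cstdio>

// Stripped-down stand-in for LogVal<double>: the value is kept as log(x).
struct LogSketch {
  double v;  // stores log(x)
  LogSketch(double x) : v(std::log(x)) {}
  LogSketch& operator*=(const LogSketch& o) { v += o.v; return *this; }
  double as_float() const { return std::exp(v); }
  // Deliberately no "operator double()": with it, "double d = p;" would
  // compile silently and underflow for tiny probabilities.
};

int main() {
  LogSketch p(1e-200);
  p *= LogSketch(1e-200);  // fine in log space: v is about -921
  // double d = p;         // no longer compiles, and that is the point
  std::printf("as_float() = %g (1e-400 underflows to 0 in linear space)\n",
              p.as_float());
  return 0;
}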
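The new corpus::ReadParallelCorpus, extracted from the duplicated copies in dpnaive.cc and pfnaive.cc, expects one sentence pair per line with a literal ||| token separating source from target; its asserts require the separator to appear exactly once. A self-contained sketch of the same split, using plain strings instead of the TD/WordID machinery:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // One line of the expected input format: source tokens ||| target tokens.
  std::string line = "das Haus ist klein ||| the house is small";
  std::istringstream in(line);
  std::vector<std::string> f, e;  // f = source side, e = target side
  std::string tok;
  bool isf = true;                // reading the source side until |||
  while (in >> tok) {
    if (tok == "|||") { isf = false; continue; }
    (isf ? f : e).push_back(tok);
  }
  std::cout << f.size() << " source / " << e.size() << " target tokens\n";
  return isf ? 1 : 0;  // mirrors assert(isf == false): separator must appear
}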
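Finally, the stop/continue CRP in MonotonicParallelSegementationModel is what the last commit's subject means by modeling length properly: each derivation pays for one stop event plus one continue event per additional rule, so with a fixed stop probability q the prior over the number of rules in a segmentation is geometric rather than unnormalized, and the CRP lets q adapt to the observed derivation lengths. A tiny sketch of that prior, with q = 0.5 matching the stop.prob(true, 0.5) base probability in monotonic_pseg.h:

#include <cmath>
#include <cstdio>

int main() {
  const double q = 0.5;  // P(stop); the CRP smooths this toward observed lengths
  double total = 0;
  for (int k = 1; k <= 8; ++k) {
    double p = q * std::pow(1.0 - q, k - 1);  // k-1 continues, then one stop
    total += p;
    std::printf("P(%d rules) = %g\n", k, p);
  }
  std::printf("mass on lengths 1..8: %g\n", total);  // approaches 1 as k grows
  return 0;
}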