Diffstat (limited to 'gi/pf')
-rw-r--r--  gi/pf/Makefile.am            21
-rw-r--r--  gi/pf/README                  2
-rw-r--r--  gi/pf/base_measures.cc      112
-rw-r--r--  gi/pf/base_measures.h       116
-rw-r--r--  gi/pf/brat.cc               554
-rw-r--r--  gi/pf/cbgi.cc               340
-rw-r--r--  gi/pf/cfg_wfst_composer.cc  730
-rw-r--r--  gi/pf/cfg_wfst_composer.h    46
-rw-r--r--  gi/pf/dpnaive.cc            349
-rw-r--r--  gi/pf/itg.cc                224
-rw-r--r--  gi/pf/pfbrat.cc             554
-rw-r--r--  gi/pf/pfdist.cc             621
-rw-r--r--  gi/pf/pfdist.new.cc         620
-rw-r--r--  gi/pf/pfnaive.cc            385
-rw-r--r--  gi/pf/reachability.cc        64
-rw-r--r--  gi/pf/reachability.h         28
-rw-r--r--  gi/pf/tpf.cc                 99
17 files changed, 4865 insertions, 0 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
new file mode 100644
index 00000000..c9764ad5
--- /dev/null
+++ b/gi/pf/Makefile.am
@@ -0,0 +1,21 @@
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
+
+noinst_LIBRARIES = libpf.a
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc
+
+itg_SOURCES = itg.cc
+
+dpnaive_SOURCES = dpnaive.cc
+
+pfdist_SOURCES = pfdist.cc
+
+pfnaive_SOURCES = pfnaive.cc
+
+cbgi_SOURCES = cbgi.cc
+
+brat_SOURCES = brat.cc
+
+pfbrat_SOURCES = pfbrat.cc
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/README b/gi/pf/README
new file mode 100644
index 00000000..62e47541
--- /dev/null
+++ b/gi/pf/README
@@ -0,0 +1,2 @@
+Experimental Bayesian alignment tools. Nothing to see here.
+
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
new file mode 100644
index 00000000..f8ddfd32
--- /dev/null
+++ b/gi/pf/base_measures.cc
@@ -0,0 +1,112 @@
+#include "base_measures.h"
+
+#include <iostream>
+
+#include "filelib.h"
+
+using namespace std;
+
+void Model1::LoadModel1(const string& fname) {
+ cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ while(getline(in, line)) {
+ ++lc;
+ int cur = 0;
+ int start = 0;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ const WordID src = TD::Convert(&line[0]);
+ ++cur;
+ start = cur;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ WordID trg = TD::Convert(&line[start]);
+ const double logprob = strtod(&line[cur + 1], NULL);
+ if (src >= ttable.size()) ttable.resize(src + 1);
+ ttable[src][trg].logeq(logprob);
+ }
+ cerr << " read " << lc << " parameters.\n";
+}
+
+prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
+ const WordID trg = vtrg[i + start_trg];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+ tp += kM1MIXTURE * model1(src, trg);
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ if (p.is_0()) {
+ cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+ abort();
+ }
+ return p;
+}
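+// Generative story implemented above (f_0 denotes the NULL word, lambda is
+// kM1MIXTURE, and 1/|V_e| is kUNIFORM_TARGET):
+//   p0(e | f) = Pois(|e|; |f| + 0.01)
+//               * prod_i [ 1/(|f|+1) * sum_{j=0..|f|} ( lambda * M1(e_i | f_j)
+//                                                       + (1 - lambda) / |V_e| ) ]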
+
+prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1)
+ // elen | flen ~Pois(flen + 0.01)
+ prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+ p *= ptrglen;
+ p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform
+ for (int i = 0; i < elen; ++i) { // for each position i in E
+ const WordID trg = vtrg[i + start_trg];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+ tp += kM1MIXTURE * model1(src, trg);
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ if (p.is_0()) {
+ cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+ abort();
+ }
+ return p;
+}
+
+JumpBase::JumpBase() : p(200) {
+ for (unsigned src_len = 1; src_len < 200; ++src_len) {
+ map<int, prob_t>& cpd = p[src_len];
+ int min_jump = 1 - src_len;
+ int max_jump = src_len;
+ prob_t z;
+ for (int j = min_jump; j <= max_jump; ++j) {
+ prob_t& cp = cpd[j];
+ if (j < 0)
+ cp.logeq(log_poisson(1.5-j, 1));
+ else if (j > 0)
+ cp.logeq(log_poisson(j, 1));
+ cp.poweq(0.2);
+ z += cp;
+ }
+ for (int j = min_jump; j <= max_jump; ++j) {
+ cpd[j] /= z;
+ }
+ }
+}
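+// Shape of the resulting distribution: for source length L, jumps j in
+// [1-L, L] \ {0} get unnormalized mass Pois(j; 1)^0.2 for forward jumps and
+// Pois(1.5 - j; 1)^0.2 for backward jumps (j = 0 keeps zero mass), so short
+// forward jumps are favored and the 0.2 exponent flattens the curve before
+// normalization by z.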
+
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
new file mode 100644
index 00000000..df17aa62
--- /dev/null
+++ b/gi/pf/base_measures.h
@@ -0,0 +1,116 @@
+#ifndef _BASE_MEASURES_H_
+#define _BASE_MEASURES_H_
+
+#include <vector>
+#include <map>
+#include <string>
+#include <cmath>
+#include <iostream>
+
+#include "trule.h"
+#include "prob.h"
+#include "tdict.h"
+
+inline double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
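+// i.e. the log of the Poisson pmf, log P(x | lambda) = x*log(lambda) - log(x!) - lambda,
+// with log(x!) computed as lgamma(x + 1) so non-integer x is also handled.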
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
+ os << '[';
+ for (int i = 0; i < p.size(); ++i)
+ os << (i==0 ? "" : " ") << TD::Convert(p[i]);
+ return os << ']';
+}
+
+struct Model1 {
+ explicit Model1(const std::string& fname) :
+ kNULL(TD::Convert("<eps>")),
+ kZERO() {
+ LoadModel1(fname);
+ }
+
+ void LoadModel1(const std::string& fname);
+
+ // returns prob 0 if src or trg is not found
+ const prob_t& operator()(WordID src, WordID trg) const {
+ if (src == 0) src = kNULL;
+ if (src < ttable.size()) {
+ const std::map<WordID, prob_t>& cpd = ttable[src];
+ const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg);
+ if (it != cpd.end())
+ return it->second;
+ }
+ return kZERO;
+ }
+
+ const WordID kNULL;
+ const prob_t kZERO;
+ std::vector<std::map<WordID, prob_t> > ttable;
+};
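+// Minimal usage sketch (file name and word pair are hypothetical; LoadModel1
+// expects one "src trg logprob" triple per line):
+//   Model1 m1("model1.params");
+//   prob_t p  = m1(TD::Convert("chat"), TD::Convert("cat"));  // 0 if unseen
+//   prob_t pn = m1(0, TD::Convert("cat"));                    // src 0 maps to <eps> (NULL)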
+
+struct PhraseConditionalBase {
+ explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
+ model1(m1),
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_TARGET(1.0 / vocab_e_size) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ }
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const Model1& model1;
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_TARGET;
+};
+
+struct PhraseJointBase {
+ explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) :
+ model1(m1),
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_SOURCE(1.0 / vocab_f_size),
+ kUNIFORM_TARGET(1.0 / vocab_e_size) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ }
+
+  // return joint p0 of (rule.f_, rule.e_)
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const Model1& model1;
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_SOURCE;
+ const prob_t kUNIFORM_TARGET;
+};
+
+// base distribution for jump size multinomials
+// basically p(0) = 0, p(1) is the mode, and the probability
+// decays as the jump size approaches the maximum jump distance
+struct JumpBase {
+ JumpBase();
+
+ const prob_t& operator()(int jump, unsigned src_len) const {
+ assert(jump != 0);
+ const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump);
+ assert(it != p[src_len].end());
+ return it->second;
+ }
+ std::vector<std::map<int, prob_t> > p;
+};
+
+
+#endif
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
new file mode 100644
index 00000000..4c6ba3ef
--- /dev/null
+++ b/gi/pf/brat.cc
@@ -0,0 +1,554 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "cfg_wfst_composer.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+static unsigned kMAX_SRC_PHRASE;
+static unsigned kMAX_TRG_PHRASE;
+struct FSTState;
+
+size_t hash_value(const TRule& r) {
+ size_t h = 2 - r.lhs_;
+ boost::hash_combine(h, boost::hash_value(r.e_));
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+struct ConditionalBase {
+ explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) :
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_TARGET(1.0 / vocab_e_size),
+ kNULL(TD::Convert("<eps>")) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ LoadModel1(model1fname);
+ }
+
+ void LoadModel1(const string& fname) {
+ cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ while(getline(in, line)) {
+ ++lc;
+ int cur = 0;
+ int start = 0;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ const WordID src = TD::Convert(&line[0]);
+ ++cur;
+ start = cur;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ WordID trg = TD::Convert(&line[start]);
+ const double logprob = strtod(&line[cur + 1], NULL);
+ if (src >= ttable.size()) ttable.resize(src + 1);
+ ttable[src][trg].logeq(logprob);
+ }
+ cerr << " read " << lc << " parameters.\n";
+ }
+
+  // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ const int flen = rule.f_.size();
+ const int elen = rule.e_.size();
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
+ const WordID trg = rule.e_[i];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? kNULL : rule.f_[j];
+ const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg);
+ if (it != ttable[src].end()) {
+ tp += kM1MIXTURE * it->second;
+ }
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ return p;
+ }
+
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_TARGET;
+ const WordID kNULL;
+ vector<map<WordID, prob_t> > ttable;
+};
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<int> >* e,
+ set<int>* vocab_f,
+ set<int>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
+struct UniphraseLM {
+ UniphraseLM(const vector<vector<int> >& corpus,
+ const set<int>& vocab,
+ const po::variables_map& conf) :
+ phrases_(1,1),
+ gen_(1,1),
+ corpus_(corpus),
+ uniform_word_(1.0 / vocab.size()),
+ gen_p0_(0.5),
+ p_end_(0.5),
+ use_poisson_(conf.count("poisson_length") > 0) {}
+
+ void ResampleHyperparameters(MT19937* rng) {
+ phrases_.resample_hyperparameters(rng);
+ gen_.resample_hyperparameters(rng);
+ cerr << " " << phrases_.concentration();
+ }
+
+ CCRP_NoTable<vector<int> > phrases_;
+ CCRP_NoTable<bool> gen_;
+  vector<vector<bool> > z_; // z_[i]: is there a phrase boundary after the ith word?
+ const vector<vector<int> >& corpus_;
+ const double uniform_word_;
+ const double gen_p0_;
+ const double p_end_; // in base length distribution, p of the end of a phrase
+ const bool use_poisson_;
+};
+
+struct Reachability {
+  boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta]: is this edge worth exploring?
+ boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+
+ Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+ edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
+ max_src_delta(boost::extents[srclen][trglen]) {
+ ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+ }
+
+ private:
+ struct SState {
+ SState() : prev_src_covered(), prev_trg_covered() {}
+ SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
+ int prev_src_covered;
+ int prev_trg_covered;
+ };
+
+ struct NState {
+ NState() : next_src_covered(), next_trg_covered() {}
+ NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {}
+ int next_src_covered;
+ int next_trg_covered;
+ };
+
+ void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
+ typedef boost::multi_array<vector<SState>, 2> array_type;
+ array_type a(boost::extents[srclen + 1][trglen + 1]);
+ a[0][0].push_back(SState());
+ for (int i = 0; i < srclen; ++i) {
+ for (int j = 0; j < trglen; ++j) {
+ if (a[i][j].size() == 0) continue;
+ const SState prev(i,j);
+ for (int k = 1; k <= src_max_phrase_len; ++k) {
+ if ((i + k) > srclen) continue;
+ for (int l = 1; l <= trg_max_phrase_len; ++l) {
+ if ((j + l) > trglen) continue;
+ a[i + k][j + l].push_back(prev);
+ }
+ }
+ }
+ }
+ a[0][0].clear();
+ cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+ assert(a[srclen][trglen].size() > 0);
+
+ typedef boost::multi_array<bool, 2> rarray_type;
+ rarray_type r(boost::extents[srclen + 1][trglen + 1]);
+// typedef boost::multi_array<vector<NState>, 2> narray_type;
+// narray_type b(boost::extents[srclen + 1][trglen + 1]);
+ r[srclen][trglen] = true;
+ for (int i = srclen; i >= 0; --i) {
+ for (int j = trglen; j >= 0; --j) {
+ vector<SState>& prevs = a[i][j];
+ if (!r[i][j]) { prevs.clear(); }
+// const NState nstate(i,j);
+ for (int k = 0; k < prevs.size(); ++k) {
+ r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
+ int src_delta = i - prevs[k].prev_src_covered;
+ edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+ short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
+ if (src_delta > msd) msd = src_delta;
+// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate);
+ }
+ }
+ }
+ assert(!edges[0][0][1][0]);
+ assert(!edges[0][0][0][1]);
+ assert(!edges[0][0][0][0]);
+ cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl;
+ assert(max_src_delta[0][0] > 0);
+ //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
+ //for (int i = 0; i < b[0][0].size(); ++i) {
+ // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
+ //}
+ }
+};
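+// Illustration of the tables built above: with srclen = trglen = 2 and maximum
+// phrase length 1, the only path is two (1,1) steps, so edges[0][0][1][1] and
+// edges[1][1][1][1] are true, every other edges[0][0][*][*] entry is false,
+// and max_src_delta[0][0] = 1.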
+
+ostream& operator<<(ostream& os, const FSTState& q);
+struct FSTState {
+ explicit FSTState(int src_size) :
+ trg_covered_(),
+ src_covered_(),
+ src_coverage_(src_size) {}
+
+ FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) :
+ trg_covered_(trg_covered),
+ src_covered_(src_covered),
+ src_coverage_(src_coverage),
+ src_prefix_(src_prefix) {
+ if (src_coverage_.size() == src_covered) {
+ assert(src_prefix.size() == 0);
+ }
+ }
+
+ // if we extend by the word at src_position, what are
+ // the next states that are reachable and lie on a valid
+ // path to the final state?
+ vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const {
+ assert(src_position < src_coverage_.size());
+ if (src_coverage_[src_position]) {
+ cerr << "Trying to extend " << *this << " with position " << src_position << endl;
+ abort();
+ }
+ vector<bool> ncvg = src_coverage_;
+ ncvg[src_position] = true;
+
+ vector<FSTState> res;
+ const int trg_remaining = trg_len - trg_covered_;
+ if (trg_remaining <= 0) {
+ cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl;
+ abort();
+ }
+ const int src_remaining = src_len - src_covered_;
+ if (src_remaining <= 0) {
+ cerr << "Source appears to have been covered: " << *this << endl;
+ abort();
+ }
+
+ for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) {
+ if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) {
+ int nc = src_prefix_.size() + 1 + src_covered_;
+ res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>()));
+ }
+ }
+
+ if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) {
+ vector<short> nsp = src_prefix_;
+ nsp.push_back(src_position);
+ res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp));
+ }
+
+ if (res.size() == 0) {
+ cerr << *this << " can't be extended!\n";
+ abort();
+ }
+ return res;
+ }
+
+ short trg_covered_, src_covered_;
+ vector<bool> src_coverage_;
+ vector<short> src_prefix_;
+};
+bool operator<(const FSTState& q, const FSTState& r) {
+ if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_;
+ if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_;
+ if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_;
+ return q.src_prefix_ < r.src_prefix_;
+}
+
+ostream& operator<<(ostream& os, const FSTState& q) {
+ os << "[" << q.trg_covered_ << " : ";
+ for (int i = 0; i < q.src_coverage_.size(); ++i)
+ os << q.src_coverage_[i];
+ os << " : <";
+ for (int i = 0; i < q.src_prefix_.size(); ++i) {
+ if (i != 0) os << ' ';
+ os << q.src_prefix_[i];
+ }
+ return os << ">]";
+}
+
+struct MyModel {
+ MyModel(ConditionalBase& rcp0) : rp0(rcp0) {}
+ typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap;
+
+ void DecrementRule(const TRule& rule) {
+ SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
+ assert(it != rules.end());
+ it->second.decrement(rule);
+ if (it->second.num_customers() == 0) rules.erase(it);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
+ if (it == rules.end()) {
+ CCRP_NoTable<TRule> crp(1,1);
+ it = rules.insert(make_pair(rule.f_, crp)).first;
+ }
+ it->second.increment(rule);
+ }
+
+ // conditioned on rule.f_
+ prob_t RuleConditionalProbability(const TRule& rule) const {
+ const prob_t base = rp0(rule);
+ SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_);
+ if (it == rules.end()) {
+ return base;
+ } else {
+ const double lp = it->second.logprob(rule, log(base));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+ }
+
+ const ConditionalBase& rp0;
+ SrcToRuleCRPMap rules;
+};
+
+struct MyFST : public WFST {
+ MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) :
+ src(ssrc), trg(strg),
+ r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE),
+ model(m) {
+ FSTState in(src.size());
+ cerr << " INIT: " << in << endl;
+ init = GetNode(in);
+ for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true;
+ in.src_covered_ = src.size();
+ in.trg_covered_ = trg.size();
+ cerr << "FINAL: " << in << endl;
+ final = GetNode(in);
+ }
+ virtual const WFSTNode* Final() const;
+ virtual const WFSTNode* Initial() const;
+
+ const WFSTNode* GetNode(const FSTState& q);
+ map<FSTState, boost::shared_ptr<WFSTNode> > m;
+ const vector<WordID>& src;
+ const vector<WordID>& trg;
+ Reachability r;
+ const WFSTNode* init;
+ const WFSTNode* final;
+ MyModel* model;
+};
+
+struct MyNode : public WFSTNode {
+ MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {}
+ virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const;
+ const FSTState state;
+ mutable MyFST* container;
+};
+
+vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const {
+ cerr << "EXTEND " << state << " with " << srcindex << endl;
+ vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r);
+ vector<pair<const WFSTNode*,TRulePtr> > res(ext.size());
+ for (unsigned i = 0; i < ext.size(); ++i) {
+ res[i].first = container->GetNode(ext[i]);
+ if (ext[i].src_prefix_.size() == 0) {
+ const unsigned trg_from = state.trg_covered_;
+ const unsigned trg_to = ext[i].trg_covered_;
+ const unsigned prev_prfx_size = state.src_prefix_.size();
+ res[i].second.reset(new TRule);
+ res[i].second->lhs_ = -TD::Convert("X");
+ vector<WordID>& src = res[i].second->f_;
+ vector<WordID>& trg = res[i].second->e_;
+ src.resize(prev_prfx_size + 1);
+ for (unsigned j = 0; j < prev_prfx_size; ++j)
+ src[j] = container->src[state.src_prefix_[j]];
+ src[prev_prfx_size] = container->src[srcindex];
+ for (unsigned j = trg_from; j < trg_to; ++j)
+ trg.push_back(container->trg[j]);
+ res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second)));
+ }
+ }
+ return res;
+}
+
+const WFSTNode* MyFST::GetNode(const FSTState& q) {
+ boost::shared_ptr<WFSTNode>& res = m[q];
+ if (!res) {
+ res.reset(new MyNode(q, this));
+ }
+ return &*res;
+}
+
+const WFSTNode* MyFST::Final() const {
+ return final;
+}
+
+const WFSTNode* MyFST::Initial() const {
+ return init;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+    cerr << argv[0] << ": please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ shared_ptr<MT19937> prng;
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
+ vocabe.size(),
+ conf["model1"].as<string>());
+ MyModel m(lp0);
+
+ TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
+ m.IncrementRule(x);
+ TRule y("[X] ||| nY dyN ||| gave ||| 0");
+ m.IncrementRule(y);
+
+
+ MyFST fst(corpusf[0], corpuse[0], &m);
+ ifstream in("./kimura.g");
+ assert(in);
+ CFG_WFSTComposer comp(fst);
+ Hypergraph hg;
+ bool succeed = comp.Compose(&in, &hg);
+ hg.PrintGraphviz();
+ if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+
+#if 0
+ ifstream in2("./amnabooks.g");
+ assert(in2);
+ MyFST fst2(corpusf[1], corpuse[1], &m);
+ CFG_WFSTComposer comp2(fst2);
+ Hypergraph hg2;
+ bool succeed2 = comp2.Compose(&in2, &hg2);
+ if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+#endif
+
+ SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0);
+ hg.Reweight(w);
+ cerr << ViterbiFTree(hg) << endl;
+ return 0;
+}
+
diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc
new file mode 100644
index 00000000..20204e8a
--- /dev/null
+++ b/gi/pf/cbgi.cc
@@ -0,0 +1,340 @@
+#include <queue>
+#include <sstream>
+#include <iostream>
+
+#include <boost/unordered_map.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "sampler.h"
+#include "filelib.h"
+#include "hg_io.h"
+#include "hg.h"
+#include "ccrp_nt.h"
+#include "trule.h"
+#include "inside_outside.h"
+
+using namespace std;
+using namespace std::tr1;
+
+double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+double log_decay(unsigned x, const double& b) {
+ assert(b > 1.0);
+ assert(x > 0);
+ return log(b - 1) - x * log(b);
+}
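+// i.e. the log of the geometric-style density P(x | b) = (b - 1) / b^x for x >= 1,
+// which sums to 1 over x = 1, 2, 3, ...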
+
+size_t hash_value(const TRule& r) {
+ // TODO fix hash function
+ size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_;
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+struct SimpleBase {
+ SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) :
+ uniform_e(-log(esize)),
+ uniform_f(-log(fsize)),
+ uniform_nt(-log(ntsize)) {
+ }
+
+ // binomial coefficient
+ static double choose(unsigned n, unsigned k) {
+ return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1));
+ }
+
+ // count the number of patterns of terminals and NTs in the rule, given elen and flen
+ static double log_number_of_patterns(const unsigned flen, const unsigned elen) {
+ static vector<vector<double> > counts;
+ if (elen >= counts.size()) counts.resize(elen + 1);
+ if (flen >= counts[elen].size()) counts[elen].resize(flen + 1);
+ double& count = counts[elen][flen];
+ if (count) return log(count);
+ const unsigned max_arity = min(elen, flen);
+ for (unsigned a = 0; a <= max_arity; ++a)
+ count += choose(elen, a) * choose(flen, a);
+ return log(count);
+ }
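+  // e.g. flen = 2, elen = 3: sum_a C(3,a)*C(2,a) = 1 + 6 + 3 = 10 patterns
+  // (a = number of nonterminal positions shared by both sides)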
+
+ // return logp0 of rule | LHS
+ double operator()(const TRule& rule) const {
+ const unsigned flen = rule.f_.size();
+ const unsigned elen = rule.e_.size();
+#if 0
+ double p = 0;
+ p += log_poisson(flen, 0.5); // flen ~Pois(0.5)
+ p += log_poisson(elen, flen); // elen | flen ~Pois(flen)
+ p -= log_number_of_patterns(flen, elen); // pattern | flen,elen ~Uniform
+ for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS
+ if (rule.f_[i] <= 0) // according to pattern
+ p += uniform_nt; // draw NT ~Uniform
+ else
+ p += uniform_f; // draw f terminal ~Uniform
+ }
+ p -= lgamma(rule.Arity() + 1); // draw permutation ~Uniform
+ for (unsigned i = 0; i < elen; ++i) { // for each position in e-RHS
+ if (rule.e_[i] > 0) // according to pattern
+ p += uniform_e; // draw e|f term ~Uniform
+ // TODO this should prob be model 1
+ }
+#else
+ double p = 0;
+ bool is_abstract = rule.f_[0] <= 0;
+ p += log(0.5);
+ if (is_abstract) {
+ if (flen == 2) p += log(0.99); else p += log(0.01);
+ } else {
+ p += log_decay(flen, 3);
+ }
+
+ for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS
+ if (rule.f_[i] <= 0) // according to pattern
+ p += uniform_nt; // draw NT ~Uniform
+ else
+ p += uniform_f; // draw f terminal ~Uniform
+ }
+#endif
+ return p;
+ }
+ const double uniform_e;
+ const double uniform_f;
+ const double uniform_nt;
+ vector<double> arities;
+};
+
+MT19937* rng = NULL;
+
+template <typename Base>
+struct MHSamplerEdgeProb {
+ MHSamplerEdgeProb(const Hypergraph& hg,
+ const map<int, CCRP_NoTable<TRule> >& rdp,
+ const Base& logp0,
+ const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) {
+ for (int i = 0; i < edge_probs.size(); ++i) {
+ const TRule& rule = *hg.edges_[i].rule_;
+ const map<int, CCRP_NoTable<TRule> >::const_iterator it = rdp.find(rule.lhs_);
+ assert(it != rdp.end());
+ const CCRP_NoTable<TRule>& crp = it->second;
+ edge_probs[i].logeq(crp.logprob(rule, logp0(rule)));
+ if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1)
+ edge_probs[i] = prob_t::Zero();
+ }
+ }
+ inline prob_t operator()(const Hypergraph::Edge& e) const {
+ return edge_probs[e.id_];
+ }
+ prob_t DerivationProb(const vector<int>& d) const {
+ prob_t p = prob_t::One();
+ for (unsigned i = 0; i < d.size(); ++i)
+ p *= edge_probs[d[i]];
+ return p;
+ }
+ vector<prob_t> edge_probs;
+};
+
+template <typename Base>
+struct ModelAndData {
+ ModelAndData() :
+ base_lh(prob_t::One()),
+ logp0(10000, 10000),
+ mh_samples(),
+ mh_rejects() {}
+
+ void SampleCorpus(const string& hgpath, int i);
+ void ResampleHyperparameters() {
+ for (map<int, CCRP_NoTable<TRule> >::iterator it = rules.begin(); it != rules.end(); ++it)
+ it->second.resample_hyperparameters(rng);
+ }
+
+ CCRP_NoTable<TRule>& RuleCRP(int lhs) {
+ map<int, CCRP_NoTable<TRule> >::iterator it = rules.find(lhs);
+ if (it == rules.end()) {
+ rules.insert(make_pair(lhs, CCRP_NoTable<TRule>(1,1)));
+ it = rules.find(lhs);
+ }
+ return it->second;
+ }
+
+ void IncrementRule(const TRule& rule) {
+ CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_);
+ if (crp.increment(rule)) {
+ prob_t p; p.logeq(logp0(rule));
+ base_lh *= p;
+ }
+ }
+
+ void DecrementRule(const TRule& rule) {
+ CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_);
+ if (crp.decrement(rule)) {
+ prob_t p; p.logeq(logp0(rule));
+ base_lh /= p;
+ }
+ }
+
+ void DecrementDerivation(const Hypergraph& hg, const vector<int>& d) {
+ for (unsigned i = 0; i < d.size(); ++i) {
+ const TRule& rule = *hg.edges_[d[i]].rule_;
+ DecrementRule(rule);
+ }
+ }
+
+ void IncrementDerivation(const Hypergraph& hg, const vector<int>& d) {
+ for (unsigned i = 0; i < d.size(); ++i) {
+ const TRule& rule = *hg.edges_[d[i]].rule_;
+ IncrementRule(rule);
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = prob_t::One();
+ for (map<int, CCRP_NoTable<TRule> >::const_iterator it = rules.begin(); it != rules.end(); ++it) {
+ prob_t q; q.logeq(it->second.log_crp_prob());
+ p *= q;
+ }
+ p *= base_lh;
+ return p;
+ }
+
+ void ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_derivation);
+
+ map<int, CCRP_NoTable<TRule> > rules; // [lhs] -> distribution over RHSs
+ prob_t base_lh;
+ SimpleBase logp0;
+ vector<vector<int> > samples; // sampled derivations
+ unsigned int mh_samples;
+ unsigned int mh_rejects;
+};
+
+template <typename Base>
+void ModelAndData<Base>::SampleCorpus(const string& hgpath, int n) {
+ vector<Hypergraph> hgs(n); hgs.clear();
+ boost::unordered_map<TRule, unsigned> acc;
+ map<int, unsigned> tot;
+ for (int i = 0; i < n; ++i) {
+ ostringstream os;
+ os << hgpath << '/' << i << ".json.gz";
+ if (!FileExists(os.str())) continue;
+ hgs.push_back(Hypergraph());
+ ReadFile rf(os.str());
+ HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back());
+ }
+ cerr << "Read " << hgs.size() << " alignment hypergraphs.\n";
+ samples.resize(hgs.size());
+ const unsigned SAMPLES = 2000;
+ const unsigned burnin = 3 * SAMPLES / 4;
+ const unsigned every = 20;
+ for (unsigned s = 0; s < SAMPLES; ++s) {
+ if (s % 10 == 0) {
+ if (s > 0) { cerr << endl; ResampleHyperparameters(); }
+ cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] ";
+ }
+ cerr << '.';
+ for (unsigned i = 0; i < hgs.size(); ++i) {
+ ResampleDerivation(hgs[i], &samples[i]);
+ if (s > burnin && s % every == 0) {
+ for (unsigned j = 0; j < samples[i].size(); ++j) {
+ const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_;
+ ++acc[rule];
+ ++tot[rule.lhs_];
+ }
+ }
+ }
+ }
+ cerr << endl;
+ for (boost::unordered_map<TRule,unsigned>::iterator it = acc.begin(); it != acc.end(); ++it) {
+ cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl;
+ }
+}
+
+template <typename Base>
+void ModelAndData<Base>::ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_deriv) {
+ vector<int> cur;
+ cur.swap(*sampled_deriv);
+
+ const prob_t p_cur = Likelihood();
+ DecrementDerivation(hg, cur);
+ if (cur.empty()) {
+ // first iteration, create restaurants
+ for (int i = 0; i < hg.edges_.size(); ++i)
+ RuleCRP(hg.edges_[i].rule_->lhs_);
+ }
+ MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, cur.empty());
+// MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, false);
+ const prob_t q_cur = wf.DerivationProb(cur);
+ vector<prob_t> node_probs;
+ Inside<prob_t, MHSamplerEdgeProb<SimpleBase> >(hg, &node_probs, wf);
+ queue<unsigned> q;
+ q.push(hg.nodes_.size() - 3);
+ while(!q.empty()) {
+ unsigned cur_node_id = q.front();
+// cerr << "NODE=" << cur_node_id << endl;
+ q.pop();
+ const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+ const unsigned num_in_edges = node.in_edges_.size();
+ unsigned sampled_edge = 0;
+ if (num_in_edges == 1) {
+ sampled_edge = node.in_edges_[0];
+ } else {
+ prob_t z;
+ assert(num_in_edges > 1);
+ SampleSet<prob_t> ss;
+ for (unsigned j = 0; j < num_in_edges; ++j) {
+ const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+ prob_t p = wf.edge_probs[edge.id_]; // edge proposal prob
+ for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+ p *= node_probs[edge.tail_nodes_[k]];
+ ss.add(p);
+// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
+ z += p;
+ }
+// for (unsigned j = 0; j < num_in_edges; ++j) {
+// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
+// }
+// cerr << " --- \n";
+ sampled_edge = node.in_edges_[rng->SelectSample(ss)];
+ }
+ sampled_deriv->push_back(sampled_edge);
+ const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
+ for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
+ q.push(edge.tail_nodes_[j]);
+ }
+ }
+ IncrementDerivation(hg, *sampled_deriv);
+
+// cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n";
+// cerr << "DERIV:\n";
+// for (int i = 0; i < sampled_deriv->size(); ++i) {
+// cerr << " " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl;
+// }
+
+ if (cur.empty()) return; // accept first sample
+
+ ++mh_samples;
+ // only need to do MH if proposal is different to current state
+ if (cur != *sampled_deriv) {
+ const prob_t q_prop = wf.DerivationProb(*sampled_deriv);
+ const prob_t p_prop = Likelihood();
+ if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) {
+ ++mh_rejects;
+ DecrementDerivation(hg, *sampled_deriv);
+ IncrementDerivation(hg, cur);
+ swap(cur, *sampled_deriv);
+ }
+ }
+}
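+// The accept/reject step above is the usual Metropolis-Hastings correction for
+// an independence proposal: q is the probability of a derivation under the
+// fixed per-edge proposal weights, p is the full CRP likelihood, and the new
+// derivation d' is (presumably, via AcceptMetropolisHastings) accepted with
+//   min(1, [p(d') * q(d)] / [p(d) * q(d')]),
+// with counts rolled back to the old derivation on rejection.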
+
+int main(int argc, char** argv) {
+ rng = new MT19937;
+ ModelAndData<SimpleBase> m;
+ m.SampleCorpus("./hgs", 50);
+ // m.SampleCorpus("./btec/hgs", 5000);
+ return 0;
+}
+
diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc
new file mode 100644
index 00000000..a31b5be8
--- /dev/null
+++ b/gi/pf/cfg_wfst_composer.cc
@@ -0,0 +1,730 @@
+#include "cfg_wfst_composer.h"
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <queue>
+#include <tr1/unordered_set>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include "fast_lexical_cast.hpp"
+
+#include "phrasetable_fst.h"
+#include "sparse_vector.h"
+#include "tdict.h"
+#include "hg.h"
+
+using boost::shared_ptr;
+namespace po = boost::program_options;
+using namespace std;
+using namespace std::tr1;
+
+WFSTNode::~WFSTNode() {}
+WFST::~WFST() {}
+
+// Define the following macro if you want to see lots of debugging output
+// when you run the chart parser
+#undef DEBUG_CHART_PARSER
+
+// A few constants used by the chart parser ///////////////
+static const int kMAX_NODES = 2000000;
+static const string kPHRASE_STRING = "X";
+static bool constants_need_init = true;
+static WordID kUNIQUE_START;
+static WordID kPHRASE;
+static TRulePtr kX1X2;
+static TRulePtr kX1;
+static WordID kEPS;
+static TRulePtr kEPSRule;
+
+static void InitializeConstants() {
+ if (constants_need_init) {
+ kPHRASE = TD::Convert(kPHRASE_STRING) * -1;
+ kUNIQUE_START = TD::Convert("S") * -1;
+ kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]"));
+ kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]"));
+ kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>"));
+ kEPS = TD::Convert("<eps>");
+ constants_need_init = false;
+ }
+}
+////////////////////////////////////////////////////////////
+
+class EGrammarNode {
+ friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest);
+ friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g);
+ public:
+#ifdef DEBUG_CHART_PARSER
+ string hint;
+#endif
+ EGrammarNode() : is_some_rule_complete(false), is_root(false) {}
+ const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; }
+ const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; }
+ bool HasNonTerminals() const { return (!ntptr.empty()); }
+ bool HasTerminals() const { return (!tptr.empty()); }
+ bool RuleCompletes() const {
+ return (is_some_rule_complete || (ntptr.empty() && tptr.empty()));
+ }
+ bool GrammarContinues() const {
+ return !(ntptr.empty() && tptr.empty());
+ }
+ bool IsRoot() const {
+ return is_root;
+ }
+ // these are the features associated with the rule from the start
+ // node up to this point. If you use these features, you must
+ // not Extend() this rule.
+ const SparseVector<double>& GetCFGProductionFeatures() const {
+ return input_features;
+ }
+
+ const EGrammarNode* Extend(const WordID& t) const {
+ if (t < 0) {
+ map<WordID, EGrammarNode>::const_iterator it = ntptr.find(t);
+ if (it == ntptr.end()) return NULL;
+ return &it->second;
+ } else {
+ map<WordID, EGrammarNode>::const_iterator it = tptr.find(t);
+ if (it == tptr.end()) return NULL;
+ return &it->second;
+ }
+ }
+
+ private:
+ map<WordID, EGrammarNode> tptr;
+ map<WordID, EGrammarNode> ntptr;
+ SparseVector<double> input_features;
+ bool is_some_rule_complete;
+ bool is_root;
+};
+typedef map<WordID, EGrammarNode> EGrammar; // indexed by the rule LHS
+
+// edges are immutable once created
+struct Edge {
+#ifdef DEBUG_CHART_PARSER
+ static int id_count;
+ const int id;
+#endif
+ const WordID cat; // lhs side of rule proved/being proved
+ const EGrammarNode* const dot; // dot position
+ const WFSTNode* const q; // start of span
+ const WFSTNode* const r; // end of span
+ const Edge* const active_parent; // back pointer, NULL for PREDICT items
+ const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items
+ TRulePtr tps; // translations
+ shared_ptr<SparseVector<double> > features; // features from CFG rule
+
+ bool IsPassive() const {
+ // when a rule is completed, this value will be set
+ return static_cast<bool>(features);
+ }
+ bool IsActive() const { return !IsPassive(); }
+ bool IsInitial() const {
+ return !(active_parent || passive_parent);
+ }
+ bool IsCreatedByScan() const {
+ return active_parent && !passive_parent && !dot->IsRoot();
+ }
+ bool IsCreatedByPredict() const {
+ return dot->IsRoot();
+ }
+ bool IsCreatedByComplete() const {
+ return active_parent && passive_parent;
+ }
+
+ // constructor for PREDICT
+ Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {}
+ Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {}
+
+ // constructors for SCAN
+ Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
+ const Edge* act_par, const TRulePtr& translations) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {}
+
+ Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
+ const Edge* act_par, const TRulePtr& translations,
+ const SparseVector<double>& feats) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations),
+ features(new SparseVector<double>(feats)) {}
+
+ // constructors for COMPLETE
+ Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
+ const Edge* act_par, const Edge *pas_par) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() {
+ assert(pas_par->IsPassive());
+ assert(act_par->IsActive());
+ }
+
+ Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
+ const Edge* act_par, const Edge *pas_par, const SparseVector<double>& feats) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(),
+ features(new SparseVector<double>(feats)) {
+ assert(pas_par->IsPassive());
+ assert(act_par->IsActive());
+ }
+
+ // constructor for COMPLETE query
+ Edge(const WFSTNode* _r) :
+#ifdef DEBUG_CHART_PARSER
+ id(0),
+#endif
+ cat(0), dot(NULL), q(NULL),
+ r(_r), active_parent(NULL), passive_parent(NULL), tps() {}
+  // constructor for MERGE query
+ Edge(const WFSTNode* _q, int) :
+#ifdef DEBUG_CHART_PARSER
+ id(0),
+#endif
+ cat(0), dot(NULL), q(_q),
+ r(NULL), active_parent(NULL), passive_parent(NULL), tps() {}
+};
+#ifdef DEBUG_CHART_PARSER
+int Edge::id_count = 0;
+#endif
+
+ostream& operator<<(ostream& os, const Edge& e) {
+ string type = "PREDICT";
+ if (e.IsCreatedByScan())
+ type = "SCAN";
+ else if (e.IsCreatedByComplete())
+ type = "COMPLETE";
+ os << "["
+#ifdef DEBUG_CHART_PARSER
+ << '(' << e.id << ") "
+#else
+ << '(' << &e << ") "
+#endif
+ << "q=" << e.q << ", r=" << e.r
+ << ", cat="<< TD::Convert(e.cat*-1) << ", dot="
+ << e.dot
+#ifdef DEBUG_CHART_PARSER
+ << e.dot->hint
+#endif
+ << (e.IsActive() ? ", Active" : ", Passive")
+ << ", " << type;
+#ifdef DEBUG_CHART_PARSER
+ if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; }
+ if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; }
+#endif
+ if (e.tps) { os << ", tps=" << e.tps->AsString(); }
+ return os << ']';
+}
+
+struct Traversal {
+ const Edge* const edge; // result from the active / passive combination
+ const Edge* const active;
+ const Edge* const passive;
+ Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {}
+};
+
+struct UniqueTraversalHash {
+ size_t operator()(const Traversal* t) const {
+ size_t x = 5381;
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->active);
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->passive);
+ x = ((x << 5) + x) ^ t->edge->IsActive();
+ return x;
+ }
+};
+
+struct UniqueTraversalEquals {
+ size_t operator()(const Traversal* a, const Traversal* b) const {
+ return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive());
+ }
+};
+
+struct UniqueEdgeHash {
+ size_t operator()(const Edge* e) const {
+ size_t x = 5381;
+ if (e->IsActive()) {
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->dot);
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
+ x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
+ x += 13;
+ } else { // with passive edges, we don't care about the dot
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
+ x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
+ }
+ return x;
+ }
+};
+
+struct UniqueEdgeEquals {
+ bool operator()(const Edge* a, const Edge* b) const {
+ if (a->IsActive() != b->IsActive()) return false;
+ if (a->IsActive()) {
+ return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r);
+ } else {
+ return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r);
+ }
+ }
+};
+
+struct REdgeHash {
+ size_t operator()(const Edge* e) const {
+ size_t x = 5381;
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
+ return x;
+ }
+};
+
+struct REdgeEquals {
+ bool operator()(const Edge* a, const Edge* b) const {
+ return (a->r == b->r);
+ }
+};
+
+struct QEdgeHash {
+ size_t operator()(const Edge* e) const {
+ size_t x = 5381;
+ x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
+ return x;
+ }
+};
+
+struct QEdgeEquals {
+ bool operator()(const Edge* a, const Edge* b) const {
+ return (a->q == b->q);
+ }
+};
+
+struct EdgeQueue {
+ queue<const Edge*> q;
+ EdgeQueue() {}
+ void clear() { while(!q.empty()) q.pop(); }
+ bool HasWork() const { return !q.empty(); }
+ const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; }
+ void AddEdge(const Edge* s) { q.push(s); }
+};
+
+class CFG_WFSTComposerImpl {
+ public:
+ CFG_WFSTComposerImpl(WordID start_cat,
+ const WFSTNode* q_0,
+ const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {}
+
+ // returns false if the intersection is empty
+ bool Compose(const EGrammar& g, Hypergraph* forest) {
+ goal_node = NULL;
+ EGrammar::const_iterator sit = g.find(start_cat_);
+ forest->ReserveNodes(kMAX_NODES);
+ assert(sit != g.end());
+ Edge* init = new Edge(start_cat_, &sit->second, q_0_);
+ assert(IncorporateNewEdge(init));
+ while (exp_agenda.HasWork() || agenda.HasWork()) {
+ while(exp_agenda.HasWork()) {
+ const Edge* edge = exp_agenda.Next();
+ FinishEdge(edge, forest);
+ }
+ if (agenda.HasWork()) {
+ const Edge* edge = agenda.Next();
+#ifdef DEBUG_CHART_PARSER
+ cerr << "processing (" << edge->id << ')' << endl;
+#endif
+ if (edge->IsActive()) {
+ if (edge->dot->HasTerminals())
+ DoScan(edge);
+ if (edge->dot->HasNonTerminals()) {
+ DoMergeWithPassives(edge);
+ DoPredict(edge, g);
+ }
+ } else {
+ DoComplete(edge);
+ }
+ }
+ }
+ if (goal_node) {
+ forest->PruneUnreachable(goal_node->id_);
+ forest->EpsilonRemove(kEPS);
+ }
+ FreeAll();
+ return goal_node;
+ }
+
+ void FreeAll() {
+ for (int i = 0; i < free_list_.size(); ++i)
+ delete free_list_[i];
+ free_list_.clear();
+ for (int i = 0; i < traversal_free_list_.size(); ++i)
+ delete traversal_free_list_[i];
+ traversal_free_list_.clear();
+ all_traversals.clear();
+ exp_agenda.clear();
+ agenda.clear();
+ tps2node.clear();
+ edge2node.clear();
+ all_edges.clear();
+ passive_edges.clear();
+ active_edges.clear();
+ }
+
+ ~CFG_WFSTComposerImpl() {
+ FreeAll();
+ }
+
+ // returns the total number of edges created during composition
+ int EdgesCreated() const {
+ return free_list_.size();
+ }
+
+ private:
+ void DoScan(const Edge* edge) {
+ // here, we assume that the FST will potentially have many more outgoing
+ // edges than the grammar, which will be just a couple. If you want to
+ // efficiently handle the case where both are relatively large, this code
+ // will need to change how the intersection is done. The best general
+ // solution would probably be the Baeza-Yates double binary search.
+
+ const EGrammarNode* dot = edge->dot;
+ const WFSTNode* r = edge->r;
+ const map<WordID, EGrammarNode>& terms = dot->GetTerminals();
+ for (map<WordID, EGrammarNode>::const_iterator git = terms.begin();
+ git != terms.end(); ++git) {
+
+ if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) {
+ std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl;
+ abort();
+ }
+ std::vector<std::pair<const WFSTNode*, TRulePtr> > extensions = r->ExtendInput(atoi(TD::Convert(git->first)));
+ for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) {
+ const WFSTNode* next_r = extensions[nsi].first;
+ const EGrammarNode* next_dot = &git->second;
+ const bool grammar_continues = next_dot->GrammarContinues();
+ const bool rule_completes = next_dot->RuleCompletes();
+ if (extensions[nsi].second)
+ cerr << "!!! " << extensions[nsi].second->AsString() << endl;
+ // cerr << " rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl;
+ assert(grammar_continues || rule_completes);
+ const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
+ if (rule_completes)
+ IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features));
+ if (grammar_continues)
+ IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second));
+ }
+ }
+ }
+
+ void DoPredict(const Edge* edge, const EGrammar& g) {
+ const EGrammarNode* dot = edge->dot;
+ const map<WordID, EGrammarNode>& non_terms = dot->GetNonTerminals();
+ for (map<WordID, EGrammarNode>::const_iterator git = non_terms.begin();
+ git != non_terms.end(); ++git) {
+ const WordID nt_to_predict = git->first;
+ //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl;
+ EGrammar::const_iterator egi = g.find(nt_to_predict);
+ if (egi == g.end()) {
+ cerr << "[ERROR] Can't find any grammar rules with a LHS of type "
+ << TD::Convert(-1*nt_to_predict) << '!' << endl;
+ continue;
+ }
+ assert(edge->IsActive());
+ const EGrammarNode* new_dot = &egi->second;
+ Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge);
+ IncorporateNewEdge(new_edge);
+ }
+ }
+
+ void DoComplete(const Edge* passive) {
+#ifdef DEBUG_CHART_PARSER
+ cerr << " complete: " << *passive << endl;
+#endif
+ const WordID completed_nt = passive->cat;
+ const WFSTNode* q = passive->q;
+ const WFSTNode* next_r = passive->r;
+ const Edge query(q);
+ const pair<unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator,
+ unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator > p =
+ active_edges.equal_range(&query);
+ for (unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator it = p.first;
+ it != p.second; ++it) {
+ const Edge* active = *it;
+#ifdef DEBUG_CHART_PARSER
+ cerr << " pos: " << *active << endl;
+#endif
+ const EGrammarNode* next_dot = active->dot->Extend(completed_nt);
+ if (!next_dot) continue;
+ const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
+ // add up to 2 rules
+ if (next_dot->RuleCompletes())
+ IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features));
+ if (next_dot->GrammarContinues())
+ IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive));
+ }
+ }
+
+ void DoMergeWithPassives(const Edge* active) {
+ // edge is active, has non-terminals, we need to find the passives that can extend it
+ assert(active->IsActive());
+ assert(active->dot->HasNonTerminals());
+#ifdef DEBUG_CHART_PARSER
+ cerr << " merge active with passives: ACT=" << *active << endl;
+#endif
+ const Edge query(active->r, 1);
+ const pair<unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator,
+ unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator > p =
+ passive_edges.equal_range(&query);
+ for (unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator it = p.first;
+ it != p.second; ++it) {
+ const Edge* passive = *it;
+ const EGrammarNode* next_dot = active->dot->Extend(passive->cat);
+ if (!next_dot) continue;
+ const WFSTNode* next_r = passive->r;
+ const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
+ if (next_dot->RuleCompletes())
+ IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features));
+ if (next_dot->GrammarContinues())
+ IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive));
+ }
+ }
+
+ // take ownership of edge memory, add to various indexes, etc
+ // returns true if this edge is new
+ bool IncorporateNewEdge(Edge* edge) {
+ free_list_.push_back(edge);
+ if (edge->passive_parent && edge->active_parent) {
+ Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent);
+ traversal_free_list_.push_back(t);
+ if (all_traversals.find(t) != all_traversals.end()) {
+ return false;
+ } else {
+ all_traversals.insert(t);
+ }
+ }
+ exp_agenda.AddEdge(edge);
+ return true;
+ }
+
+ bool FinishEdge(const Edge* edge, Hypergraph* hg) {
+ bool is_new = false;
+ if (all_edges.find(edge) == all_edges.end()) {
+#ifdef DEBUG_CHART_PARSER
+ cerr << *edge << " is NEW\n";
+#endif
+ all_edges.insert(edge);
+ is_new = true;
+ if (edge->IsPassive()) passive_edges.insert(edge);
+ if (edge->IsActive()) active_edges.insert(edge);
+ agenda.AddEdge(edge);
+ } else {
+#ifdef DEBUG_CHART_PARSER
+ cerr << *edge << " is NOT NEW.\n";
+#endif
+ }
+ AddEdgeToTranslationForest(edge, hg);
+ return is_new;
+ }
+
+ // build the translation forest
+ void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) {
+ assert(hg->nodes_.size() < kMAX_NODES);
+ Hypergraph::Node* tps = NULL;
+ // first add any target language rules
+ if (edge->tps) {
+ Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()];
+ if (!node) {
+ // cerr << "Creating phrases for " << edge->tps << endl;
+ const TRulePtr& rule = edge->tps;
+ node = hg->AddNode(kPHRASE);
+ Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector());
+ hg_edge->feature_values_ += rule->GetFeatureValues();
+ hg->ConnectEdgeToHeadNode(hg_edge, node);
+ }
+ tps = node;
+ }
+ Hypergraph::Node*& head_node = edge2node[edge];
+ if (!head_node)
+ head_node = hg->AddNode(kPHRASE);
+ if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) {
+ assert(goal_node == NULL || goal_node == head_node);
+ goal_node = head_node;
+ }
+ Hypergraph::TailNodeVector tail;
+ SparseVector<double> extra;
+ if (edge->IsCreatedByPredict()) {
+ // extra.set_value(FD::Convert("predict"), 1);
+ } else if (edge->IsCreatedByScan()) {
+ tail.push_back(edge2node[edge->active_parent]->id_);
+ if (tps) {
+ tail.push_back(tps->id_);
+ }
+ //extra.set_value(FD::Convert("scan"), 1);
+ } else if (edge->IsCreatedByComplete()) {
+ tail.push_back(edge2node[edge->active_parent]->id_);
+ tail.push_back(edge2node[edge->passive_parent]->id_);
+ //extra.set_value(FD::Convert("complete"), 1);
+ } else {
+ assert(!"unexpected edge type!");
+ }
+ //cerr << head_node->id_ << "<--" << *edge << endl;
+
+#ifdef DEBUG_CHART_PARSER
+ for (int i = 0; i < tail.size(); ++i)
+ if (tail[i] == head_node->id_) {
+ cerr << "ERROR: " << *edge << "\n i=" << i << endl;
+ if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; }
+ if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; }
+ assert(!"self-loop found!");
+ }
+#endif
+ Hypergraph::Edge* hg_edge = NULL;
+ if (tail.size() == 0) {
+ hg_edge = hg->AddEdge(kEPSRule, tail);
+ } else if (tail.size() == 1) {
+ hg_edge = hg->AddEdge(kX1, tail);
+ } else if (tail.size() == 2) {
+ hg_edge = hg->AddEdge(kX1X2, tail);
+ }
+ if (edge->features)
+ hg_edge->feature_values_ += *edge->features;
+ hg_edge->feature_values_ += extra;
+ hg->ConnectEdgeToHeadNode(hg_edge, head_node);
+ }
+
+ Hypergraph::Node* goal_node;
+ EdgeQueue exp_agenda;
+ EdgeQueue agenda;
+ unordered_map<size_t, Hypergraph::Node*> tps2node;
+ unordered_map<const Edge*, Hypergraph::Node*, UniqueEdgeHash, UniqueEdgeEquals> edge2node;
+ unordered_set<const Traversal*, UniqueTraversalHash, UniqueTraversalEquals> all_traversals;
+ unordered_set<const Edge*, UniqueEdgeHash, UniqueEdgeEquals> all_edges;
+ unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals> passive_edges;
+ unordered_multiset<const Edge*, REdgeHash, REdgeEquals> active_edges;
+ vector<Edge*> free_list_;
+ vector<Traversal*> traversal_free_list_;
+ const WordID start_cat_;
+ const WFSTNode* const q_0_;
+ const WFSTNode* const q_final_;
+};
+
+#ifdef DEBUG_CHART_PARSER
+static string TrimRule(const string& r) {
+ size_t start = r.find(" |||") + 5;
+ size_t end = r.rfind(" |||");
+ return r.substr(start, end - start);
+}
+#endif
+
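+// Parse a grammar rule of the form "[LHS] ||| RHS" (optionally followed by
+// "||| feats"), build a TRule from it, and walk/extend the grammar trie from
+// the LHS root node one RHS symbol at a time, marking the final node as
+// completing a rule and attaching the rule's feature values.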
+void AddGrammarRule(const string& r, EGrammar* g) {
+ const size_t pos = r.find(" ||| ");
+ if (pos == string::npos || r[0] != '[') {
+ cerr << "Bad rule: " << r << endl;
+ return;
+ }
+ const size_t rpos = r.rfind(" ||| ");
+ string feats;
+ string rs = r;
+ if (rpos != pos) {
+ feats = r.substr(rpos + 5);
+ rs = r.substr(0, rpos);
+ }
+ string rhs = rs.substr(pos + 5);
+ string trule = rs + " ||| " + rhs + " ||| " + feats;
+ TRule tr(trule);
+ cerr << "X: " << tr.e_[0] << endl;
+#ifdef DEBUG_CHART_PARSER
+ string hint_last_rule;
+#endif
+ EGrammarNode* cur = &(*g)[tr.GetLHS()];
+ cur->is_root = true;
+ for (int i = 0; i < tr.FLength(); ++i) {
+ WordID sym = tr.f()[i];
+#ifdef DEBUG_CHART_PARSER
+ hint_last_rule = TD::Convert(sym < 0 ? -sym : sym);
+ cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString());
+#endif
+ if (sym < 0)
+ cur = &cur->ntptr[sym];
+ else
+ cur = &cur->tptr[sym];
+ }
+#ifdef DEBUG_CHART_PARSER
+ cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString());
+#endif
+ cur->is_some_rule_complete = true;
+ cur->input_features = tr.GetFeatureValues();
+}
+
+CFG_WFSTComposer::~CFG_WFSTComposer() {
+ delete pimpl_;
+}
+
+CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) {
+ InitializeConstants();
+ pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final());
+}
+
+bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) {
+ // first, convert the src forest into an EGrammar
+ EGrammar g;
+ const int nedges = src_forest.edges_.size();
+ const int nnodes = src_forest.nodes_.size();
+ vector<int> cats(nnodes);
+ bool assign_cats = false;
+ for (int i = 0; i < nnodes; ++i)
+ if (assign_cats) {
+ cats[i] = TD::Convert("CAT_" + boost::lexical_cast<string>(i)) * -1;
+ } else {
+ cats[i] = src_forest.nodes_[i].cat_;
+ }
+ // construct the grammar
+ for (int i = 0; i < nedges; ++i) {
+ const Hypergraph::Edge& edge = src_forest.edges_[i];
+ const vector<WordID>& src = edge.rule_->f();
+ EGrammarNode* cur = &g[cats[edge.head_node_]];
+ cur->is_root = true;
+ int ntc = 0;
+ for (int j = 0; j < src.size(); ++j) {
+ WordID sym = src[j];
+ if (sym <= 0) {
+ sym = cats[edge.tail_nodes_[ntc]];
+ ++ntc;
+ cur = &cur->ntptr[sym];
+ } else {
+ cur = &cur->tptr[sym];
+ }
+ }
+ cur->is_some_rule_complete = true;
+ cur->input_features = edge.feature_values_;
+ }
+ EGrammarNode& goal_rule = g[kUNIQUE_START];
+ assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) ||
+ (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1));
+
+ return pimpl_->Compose(g, trg_forest);
+}
+
+bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) {
+ EGrammar g;
+ while(*in) {
+ string line;
+ getline(*in, line);
+ if (line.empty()) continue;
+ AddGrammarRule(line, &g);
+ }
+
+ return pimpl_->Compose(g, trg_forest);
+}
diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h
new file mode 100644
index 00000000..cf47f459
--- /dev/null
+++ b/gi/pf/cfg_wfst_composer.h
@@ -0,0 +1,46 @@
+#ifndef _CFG_WFST_COMPOSER_H_
+#define _CFG_WFST_COMPOSER_H_
+
+#include <iostream>
+#include <vector>
+#include <utility>
+
+#include "trule.h"
+#include "wordid.h"
+
+class CFG_WFSTComposerImpl;
+class Hypergraph;
+
+struct WFSTNode {
+ virtual ~WFSTNode();
+ // returns the next states reachable by consuming srcindex (which identifies a word)
+ // paired with the output string generated by taking that transition.
+ virtual std::vector<std::pair<const WFSTNode*,TRulePtr> > ExtendInput(unsigned srcindex) const = 0;
+};
+
+struct WFST {
+ virtual ~WFST();
+ virtual const WFSTNode* Final() const = 0;
+ virtual const WFSTNode* Initial() const = 0;
+};
+
+class CFG_WFSTComposer {
+ public:
+ ~CFG_WFSTComposer();
+ explicit CFG_WFSTComposer(const WFST& wfst);
+ bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest);
+
+ // reads the grammar from a file. There must be a single top-level
+ // S -> X rule. Anything else is possible. Format is:
+ // [S] ||| [SS,1]
+ // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3
+ // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8
+ // [NP] ||| [DET,1] [N,2] ||| Feature3=2
+ // ...
+ bool Compose(std::istream* grammar_file, Hypergraph* trg_forest);
+
+ private:
+ CFG_WFSTComposerImpl* pimpl_;
+};
+
+#endif
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
new file mode 100644
index 00000000..582d1be7
--- /dev/null
+++ b/gi/pf/dpnaive.cc
@@ -0,0 +1,349 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_measures.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+
+using namespace std;
+using namespace std::tr1;
+namespace po = boost::program_options;
+
+static unsigned kMAX_SRC_PHRASE;
+static unsigned kMAX_TRG_PHRASE;
+struct FSTState;
+
+size_t hash_value(const TRule& r) {
+ size_t h = 2 - r.lhs_;
+ boost::hash_combine(h, boost::hash_value(r.e_));
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<int> >* e,
+                        set<int>* vocab_f,
+                        set<int>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
+shared_ptr<MT19937> prng;
+
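+// Model state for the naive block sampler: a CRP over phrase-pair rules with
+// base distribution p0, the cached probability of generating the current
+// table labels from p0, and one derivation per sentence pair, stored as a
+// monotone sequence of (source, target) cut points.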
+template <typename Base>
+struct ModelAndData {
+ explicit ModelAndData(const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+ rng(&*prng),
+ p0(b),
+ baseprob(prob_t::One()),
+ corpuse(ce),
+ corpusf(cf),
+ vocabe(ve),
+ vocabf(vf),
+ rules(1,1),
+ mh_samples(),
+ mh_rejects(),
+ kX(-TD::Convert("X")),
+ derivations(corpuse.size()) {}
+
+ void ResampleHyperparameters() {
+ rules.resample_hyperparameters(&*prng);
+ }
+
+ void InstantiateRule(const pair<short,short>& from,
+ const pair<short,short>& to,
+ const vector<int>& sentf,
+ const vector<int>& sente,
+ TRule* rule) const {
+ rule->f_.clear();
+ rule->e_.clear();
+ rule->lhs_ = kX;
+ for (short i = from.first; i < to.first; ++i)
+ rule->f_.push_back(sentf[i]);
+ for (short i = from.second; i < to.second; ++i)
+ rule->e_.push_back(sente[i]);
+ }
+
+ void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+ if (d.size() < 2) return;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ //cerr << "REMOVE: " << x.AsString() << endl;
+ if (rules.decrement(x)) {
+ baseprob /= p0(x);
+ //cerr << " (REMOVED ONLY INSTANCE)\n";
+ }
+ }
+ }
+
+ void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+ if (d.size() < 2) return;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ cerr << i << '/' << (d.size() - 1) << ": " << x << endl;
+ }
+ }
+
+ void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+ if (d.size() < 2) return;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ if (rules.increment(x)) {
+ baseprob *= p0(x);
+ }
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p;
+ p.logeq(rules.log_crp_prob());
+ return p * baseprob;
+ }
+
+ prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
+ prob_t p = prob_t::One();
+ if (d.size() < 2) return p;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ prob_t rp; rp.logeq(rules.logprob(x, log(p0(x))));
+ p *= rp;
+ }
+ return p;
+ }
+
+ void Sample();
+
+ MT19937* rng;
+ const Base& p0;
+  prob_t baseprob; // cached value of generating the table labels from p0
+ // this can't be used if we go to a hierarchical prior!
+  const vector<vector<int> >& corpuse;
+  const vector<vector<int> >& corpusf;
+  const set<int>& vocabe;
+  const set<int>& vocabf;
+ CCRP_NoTable<TRule> rules;
+ unsigned mh_samples, mh_rejects;
+ const int kX;
+ vector<vector<pair<short, short> > > derivations;
+};
+
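+// One sweep of sentence-level block resampling. For each sentence pair: remove
+// the current derivation from the CRP, compute forward sums a[i][j] over all
+// monotone segmentations under the current rule probabilities, sample a new
+// derivation backwards from the final cell, and accept or reject the move with
+// a Metropolis-Hastings test (the forward pass uses a fixed snapshot of the
+// CRP, so it only defines a proposal distribution).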
+template <typename Base>
+void ModelAndData<Base>::Sample() {
+ unsigned MAXK = 4;
+ unsigned MAXL = 4;
+ TRule x;
+ x.lhs_ = -TD::Convert("X");
+ for (int samples = 0; samples < 1000; ++samples) {
+ if (samples % 1 == 0 && samples > 0) {
+ //ResampleHyperparameters();
+ cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n";
+      for (int i = 0; i < 10 && i < corpuse.size(); ++i) {
+ cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl;
+ PrintDerivation(derivations[i], corpusf[i], corpuse[i]);
+ }
+ }
+ cerr << '.' << flush;
+ for (int s = 0; s < corpuse.size(); ++s) {
+ const vector<int>& sentf = corpusf[s];
+ const vector<int>& sente = corpuse[s];
+// cerr << " CUSTOMERS: " << rules.num_customers() << endl;
+// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl;
+
+ vector<pair<short, short> >& deriv = derivations[s];
+ const prob_t p_cur = Likelihood();
+ DecrementDerivation(deriv, sentf, sente);
+
+ boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
+ boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
+ a[0][0] = prob_t::One();
+ for (int i = 0; i < sentf.size(); ++i) {
+ for (int j = 0; j < sente.size(); ++j) {
+ const prob_t src_a = a[i][j];
+ x.f_.clear();
+ for (int k = 1; k <= MAXK; ++k) {
+ if (i + k > sentf.size()) break;
+ x.f_.push_back(sentf[i + k - 1]);
+ x.e_.clear();
+ for (int l = 1; l <= MAXL; ++l) {
+ if (j + l > sente.size()) break;
+ x.e_.push_back(sente[j + l - 1]);
+ trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x))));
+ a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
+ }
+ }
+ }
+ }
+// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl;
+ const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
+
+ vector<pair<short,short> > newderiv;
+ int cur_i = sentf.size();
+ int cur_j = sente.size();
+ while(cur_i > 0 && cur_j > 0) {
+ newderiv.push_back(pair<short,short>(cur_i, cur_j));
+// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
+ SampleSet<prob_t> ss;
+ vector<pair<short,short> > nexts;
+ for (int k = 1; k <= MAXK; ++k) {
+ const int hyp_i = cur_i - k;
+ if (hyp_i < 0) break;
+ for (int l = 1; l <= MAXL; ++l) {
+ const int hyp_j = cur_j - l;
+ if (hyp_j < 0) break;
+ const prob_t& inside = a[hyp_i][hyp_j];
+ if (inside == prob_t::Zero()) continue;
+ const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
+ if (transp == prob_t::Zero()) continue;
+ const prob_t p = inside * transp;
+ ss.add(p);
+ nexts.push_back(pair<short,short>(hyp_i, hyp_j));
+// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl;
+ }
+ }
+// cerr << " sample set has " << nexts.size() << " elements.\n";
+ const int selected = rng->SelectSample(ss);
+ cur_i = nexts[selected].first;
+ cur_j = nexts[selected].second;
+ }
+ newderiv.push_back(pair<short,short>(0,0));
+ const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
+ IncrementDerivation(newderiv, sentf, sente);
+// cerr << "SANITY: " << q_new << " " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl;
+ if (deriv.empty()) { deriv = newderiv; continue; }
+ ++mh_samples;
+
+ if (deriv != newderiv) {
+ const prob_t p_new = Likelihood();
+// cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl;
+// cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl;
+ if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
+ ++mh_rejects;
+ DecrementDerivation(newderiv, sentf, sente);
+ IncrementDerivation(deriv, sentf, sente);
+ } else {
+// cerr << " ACCEPT\n";
+ deriv = newderiv;
+ }
+ }
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+// MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ Model1 m1(conf["model1"].as<string>());
+ PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+
+ ModelAndData<PhraseJointBase> posterior(lp0, corpuse, corpusf, vocabe, vocabf);
+ posterior.Sample();
+
+ return 0;
+}
+
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
new file mode 100644
index 00000000..2c2a86f9
--- /dev/null
+++ b/gi/pf/itg.cc
@@ -0,0 +1,224 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+ostream& operator<<(ostream& os, const vector<WordID>& p) {
+ os << '[';
+ for (int i = 0; i < p.size(); ++i)
+ os << (i==0 ? "" : " ") << TD::Convert(p[i]);
+ return os << ']';
+}
+
+size_t hash_value(const TRule& r) {
+ size_t h = boost::hash_value(r.e_);
+ boost::hash_combine(h, -r.lhs_);
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+struct Model1 {
+ explicit Model1(const string& fname) :
+ kNULL(TD::Convert("<eps>")),
+ kZERO() {
+ LoadModel1(fname);
+ }
+
+ void LoadModel1(const string& fname) {
+ cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ while(getline(in, line)) {
+ ++lc;
+ int cur = 0;
+ int start = 0;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ const WordID src = TD::Convert(&line[0]);
+ ++cur;
+ start = cur;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ WordID trg = TD::Convert(&line[start]);
+ const double logprob = strtod(&line[cur + 1], NULL);
+ if (src >= ttable.size()) ttable.resize(src + 1);
+ ttable[src][trg].logeq(logprob);
+ }
+ cerr << " read " << lc << " parameters.\n";
+ }
+
+ // returns prob 0 if src or trg is not found!
+ const prob_t& operator()(WordID src, WordID trg) const {
+ if (src == 0) src = kNULL;
+ if (src < ttable.size()) {
+ const map<WordID, prob_t>& cpd = ttable[src];
+ const map<WordID, prob_t>::const_iterator it = cpd.find(trg);
+ if (it != cpd.end())
+ return it->second;
+ }
+ return kZERO;
+ }
+
+ const WordID kNULL;
+ const prob_t kZERO;
+ vector<map<WordID, prob_t> > ttable;
+};
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+ const unsigned particles = conf["particles"].as<unsigned>();
+ const unsigned samples = conf["samples"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ shared_ptr<MT19937> prng;
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<WordID> > corpuse, corpusf;
+ set<WordID> vocabe, vocabf;
+ cerr << "Reading corpus...\n";
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
+ cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+ assert(corpusf.size() == corpuse.size());
+
+ const int kLHS = -TD::Convert("X");
+ Model1 m1(conf["model1"].as<string>());
+ Model1 invm1(conf["inverse_model1"].as<string>());
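+  // Sweep over the corpus and enumerate candidate phrase-pair spans up to the
+  // maximum phrase lengths (single-word source phrases are restricted to
+  // single-word target phrases); the innermost body is currently a stub.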
+  for (unsigned si = 0; si < samples; ++si) {
+ cerr << '.' << flush;
+ for (int ci = 0; ci < corpusf.size(); ++ci) {
+ const vector<WordID>& src = corpusf[ci];
+ const vector<WordID>& trg = corpuse[ci];
+ for (int i = 0; i < src.size(); ++i) {
+ for (int j = 0; j < trg.size(); ++j) {
+ const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE);
+ for (int k = 0; k < eff_max_src; ++k) {
+ const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE));
+ for (int l = 0; l < eff_max_trg; ++l) {
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc
new file mode 100644
index 00000000..4c6ba3ef
--- /dev/null
+++ b/gi/pf/pfbrat.cc
@@ -0,0 +1,554 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "cfg_wfst_composer.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+static unsigned kMAX_SRC_PHRASE;
+static unsigned kMAX_TRG_PHRASE;
+struct FSTState;
+
+size_t hash_value(const TRule& r) {
+ size_t h = 2 - r.lhs_;
+ boost::hash_combine(h, boost::hash_value(r.e_));
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+double log_poisson(unsigned x, const double& lambda) {
+ assert(lambda > 0.0);
+ return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+struct ConditionalBase {
+ explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) :
+ kM1MIXTURE(m1mixture),
+ kUNIFORM_MIXTURE(1.0 - m1mixture),
+ kUNIFORM_TARGET(1.0 / vocab_e_size),
+ kNULL(TD::Convert("<eps>")) {
+ assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+ assert(vocab_e_size > 0);
+ LoadModel1(model1fname);
+ }
+
+ void LoadModel1(const string& fname) {
+ cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ while(getline(in, line)) {
+ ++lc;
+ int cur = 0;
+ int start = 0;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ const WordID src = TD::Convert(&line[0]);
+ ++cur;
+ start = cur;
+ while(cur < line.size() && line[cur] != ' ') { ++cur; }
+ assert(cur != line.size());
+ line[cur] = 0;
+ WordID trg = TD::Convert(&line[start]);
+ const double logprob = strtod(&line[cur + 1], NULL);
+ if (src >= ttable.size()) ttable.resize(src + 1);
+ ttable[src][trg].logeq(logprob);
+ }
+ cerr << " read " << lc << " parameters.\n";
+ }
+
+ // return logp0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ const int flen = rule.f_.size();
+ const int elen = rule.e_.size();
+ prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+ prob_t p;
+ p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
+ const WordID trg = rule.e_[i];
+ prob_t tp = prob_t::Zero();
+ for (int j = -1; j < flen; ++j) {
+ const WordID src = j < 0 ? kNULL : rule.f_[j];
+        if (src < ttable.size()) {
+          const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg);
+          if (it != ttable[src].end())
+            tp += kM1MIXTURE * it->second;
+        }
+ tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+ }
+ tp *= uniform_src_alignment; // draw a_i ~uniform
+ p *= tp; // draw e_i ~Model1(f_a_i) / uniform
+ }
+ return p;
+ }
+
+ const prob_t kM1MIXTURE; // Model 1 mixture component
+ const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+ const prob_t kUNIFORM_TARGET;
+ const WordID kNULL;
+ vector<map<WordID, prob_t> > ttable;
+};
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<int> >* e,
+ set<int>* vocab_f,
+ set<int>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
+struct UniphraseLM {
+ UniphraseLM(const vector<vector<int> >& corpus,
+ const set<int>& vocab,
+ const po::variables_map& conf) :
+ phrases_(1,1),
+ gen_(1,1),
+ corpus_(corpus),
+ uniform_word_(1.0 / vocab.size()),
+ gen_p0_(0.5),
+ p_end_(0.5),
+ use_poisson_(conf.count("poisson_length") > 0) {}
+
+ void ResampleHyperparameters(MT19937* rng) {
+ phrases_.resample_hyperparameters(rng);
+ gen_.resample_hyperparameters(rng);
+ cerr << " " << phrases_.concentration();
+ }
+
+ CCRP_NoTable<vector<int> > phrases_;
+ CCRP_NoTable<bool> gen_;
+  vector<vector<bool> > z_; // z_[s][i]: is there a phrase boundary after the ith word of sentence s
+ const vector<vector<int> >& corpus_;
+ const double uniform_word_;
+ const double gen_p0_;
+ const double p_end_; // in base length distribution, p of the end of a phrase
+ const bool use_poisson_;
+};
+
+struct Reachability {
+  boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta]: is this edge worth exploring?
+ boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+
+ Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+ edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
+ max_src_delta(boost::extents[srclen][trglen]) {
+ ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+ }
+
+ private:
+ struct SState {
+ SState() : prev_src_covered(), prev_trg_covered() {}
+ SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
+ int prev_src_covered;
+ int prev_trg_covered;
+ };
+
+ struct NState {
+ NState() : next_src_covered(), next_trg_covered() {}
+ NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {}
+ int next_src_covered;
+ int next_trg_covered;
+ };
+
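+  // Forward pass: from (0,0), enumerate every (src_covered, trg_covered) cell
+  // reachable by monotone phrase steps up to the maximum phrase lengths,
+  // recording back pointers. Backward pass: starting from the final cell, keep
+  // only cells that lie on a complete path, mark the surviving
+  // (src_delta, trg_delta) extensions in edges, and record the largest valid
+  // source delta per cell.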
+ void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
+ typedef boost::multi_array<vector<SState>, 2> array_type;
+ array_type a(boost::extents[srclen + 1][trglen + 1]);
+ a[0][0].push_back(SState());
+ for (int i = 0; i < srclen; ++i) {
+ for (int j = 0; j < trglen; ++j) {
+ if (a[i][j].size() == 0) continue;
+ const SState prev(i,j);
+ for (int k = 1; k <= src_max_phrase_len; ++k) {
+ if ((i + k) > srclen) continue;
+ for (int l = 1; l <= trg_max_phrase_len; ++l) {
+ if ((j + l) > trglen) continue;
+ a[i + k][j + l].push_back(prev);
+ }
+ }
+ }
+ }
+ a[0][0].clear();
+ cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+ assert(a[srclen][trglen].size() > 0);
+
+ typedef boost::multi_array<bool, 2> rarray_type;
+ rarray_type r(boost::extents[srclen + 1][trglen + 1]);
+// typedef boost::multi_array<vector<NState>, 2> narray_type;
+// narray_type b(boost::extents[srclen + 1][trglen + 1]);
+ r[srclen][trglen] = true;
+ for (int i = srclen; i >= 0; --i) {
+ for (int j = trglen; j >= 0; --j) {
+ vector<SState>& prevs = a[i][j];
+ if (!r[i][j]) { prevs.clear(); }
+// const NState nstate(i,j);
+ for (int k = 0; k < prevs.size(); ++k) {
+ r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
+ int src_delta = i - prevs[k].prev_src_covered;
+ edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+ short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
+ if (src_delta > msd) msd = src_delta;
+// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate);
+ }
+ }
+ }
+ assert(!edges[0][0][1][0]);
+ assert(!edges[0][0][0][1]);
+ assert(!edges[0][0][0][0]);
+ cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl;
+ assert(max_src_delta[0][0] > 0);
+ //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
+ //for (int i = 0; i < b[0][0].size(); ++i) {
+ // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
+ //}
+ }
+};
+
+ostream& operator<<(ostream& os, const FSTState& q);
+struct FSTState {
+ explicit FSTState(int src_size) :
+ trg_covered_(),
+ src_covered_(),
+ src_coverage_(src_size) {}
+
+ FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) :
+ trg_covered_(trg_covered),
+ src_covered_(src_covered),
+ src_coverage_(src_coverage),
+ src_prefix_(src_prefix) {
+ if (src_coverage_.size() == src_covered) {
+ assert(src_prefix.size() == 0);
+ }
+ }
+
+ // if we extend by the word at src_position, what are
+ // the next states that are reachable and lie on a valid
+ // path to the final state?
+ vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const {
+ assert(src_position < src_coverage_.size());
+ if (src_coverage_[src_position]) {
+ cerr << "Trying to extend " << *this << " with position " << src_position << endl;
+ abort();
+ }
+ vector<bool> ncvg = src_coverage_;
+ ncvg[src_position] = true;
+
+ vector<FSTState> res;
+ const int trg_remaining = trg_len - trg_covered_;
+ if (trg_remaining <= 0) {
+ cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl;
+ abort();
+ }
+ const int src_remaining = src_len - src_covered_;
+ if (src_remaining <= 0) {
+ cerr << "Source appears to have been covered: " << *this << endl;
+ abort();
+ }
+
+ for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) {
+ if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) {
+ int nc = src_prefix_.size() + 1 + src_covered_;
+ res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>()));
+ }
+ }
+
+ if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) {
+ vector<short> nsp = src_prefix_;
+ nsp.push_back(src_position);
+ res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp));
+ }
+
+ if (res.size() == 0) {
+ cerr << *this << " can't be extended!\n";
+ abort();
+ }
+ return res;
+ }
+
+ short trg_covered_, src_covered_;
+ vector<bool> src_coverage_;
+ vector<short> src_prefix_;
+};
+bool operator<(const FSTState& q, const FSTState& r) {
+ if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_;
+ if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_;
+ if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_;
+ return q.src_prefix_ < r.src_prefix_;
+}
+
+ostream& operator<<(ostream& os, const FSTState& q) {
+ os << "[" << q.trg_covered_ << " : ";
+ for (int i = 0; i < q.src_coverage_.size(); ++i)
+ os << q.src_coverage_[i];
+ os << " : <";
+ for (int i = 0; i < q.src_prefix_.size(); ++i) {
+ if (i != 0) os << ' ';
+ os << q.src_prefix_[i];
+ }
+ return os << ">]";
+}
+
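+// Conditional phrase translation model: one CRP over rules per distinct source
+// phrase (keyed by rule.f_), with ConditionalBase supplying the base
+// probability of the target side given the source side.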
+struct MyModel {
+ MyModel(ConditionalBase& rcp0) : rp0(rcp0) {}
+ typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap;
+
+ void DecrementRule(const TRule& rule) {
+ SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
+ assert(it != rules.end());
+ it->second.decrement(rule);
+ if (it->second.num_customers() == 0) rules.erase(it);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
+ if (it == rules.end()) {
+ CCRP_NoTable<TRule> crp(1,1);
+ it = rules.insert(make_pair(rule.f_, crp)).first;
+ }
+ it->second.increment(rule);
+ }
+
+ // conditioned on rule.f_
+ prob_t RuleConditionalProbability(const TRule& rule) const {
+ const prob_t base = rp0(rule);
+ SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_);
+ if (it == rules.end()) {
+ return base;
+ } else {
+ const double lp = it->second.logprob(rule, log(base));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+ }
+
+ const ConditionalBase& rp0;
+ SrcToRuleCRPMap rules;
+};
+
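+// Lazily constructed WFST over FSTStates for a single sentence pair: nodes are
+// created on demand in GetNode, and ExtendInput consumes one source word at a
+// time, emitting a phrase-pair rule (scored with the model's conditional rule
+// probability as the "Proposal" feature) whenever a source phrase is closed.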
+struct MyFST : public WFST {
+ MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) :
+ src(ssrc), trg(strg),
+ r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE),
+ model(m) {
+ FSTState in(src.size());
+ cerr << " INIT: " << in << endl;
+ init = GetNode(in);
+ for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true;
+ in.src_covered_ = src.size();
+ in.trg_covered_ = trg.size();
+ cerr << "FINAL: " << in << endl;
+ final = GetNode(in);
+ }
+ virtual const WFSTNode* Final() const;
+ virtual const WFSTNode* Initial() const;
+
+ const WFSTNode* GetNode(const FSTState& q);
+ map<FSTState, boost::shared_ptr<WFSTNode> > m;
+ const vector<WordID>& src;
+ const vector<WordID>& trg;
+ Reachability r;
+ const WFSTNode* init;
+ const WFSTNode* final;
+ MyModel* model;
+};
+
+struct MyNode : public WFSTNode {
+ MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {}
+ virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const;
+ const FSTState state;
+ mutable MyFST* container;
+};
+
+vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const {
+ cerr << "EXTEND " << state << " with " << srcindex << endl;
+ vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r);
+ vector<pair<const WFSTNode*,TRulePtr> > res(ext.size());
+ for (unsigned i = 0; i < ext.size(); ++i) {
+ res[i].first = container->GetNode(ext[i]);
+ if (ext[i].src_prefix_.size() == 0) {
+ const unsigned trg_from = state.trg_covered_;
+ const unsigned trg_to = ext[i].trg_covered_;
+ const unsigned prev_prfx_size = state.src_prefix_.size();
+ res[i].second.reset(new TRule);
+ res[i].second->lhs_ = -TD::Convert("X");
+ vector<WordID>& src = res[i].second->f_;
+ vector<WordID>& trg = res[i].second->e_;
+ src.resize(prev_prfx_size + 1);
+ for (unsigned j = 0; j < prev_prfx_size; ++j)
+ src[j] = container->src[state.src_prefix_[j]];
+ src[prev_prfx_size] = container->src[srcindex];
+ for (unsigned j = trg_from; j < trg_to; ++j)
+ trg.push_back(container->trg[j]);
+ res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second)));
+ }
+ }
+ return res;
+}
+
+const WFSTNode* MyFST::GetNode(const FSTState& q) {
+ boost::shared_ptr<WFSTNode>& res = m[q];
+ if (!res) {
+ res.reset(new MyNode(q, this));
+ }
+ return &*res;
+}
+
+const WFSTNode* MyFST::Final() const {
+ return final;
+}
+
+const WFSTNode* MyFST::Initial() const {
+ return init;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ shared_ptr<MT19937> prng;
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
+ vocabe.size(),
+ conf["model1"].as<string>());
+ MyModel m(lp0);
+
+ TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
+ m.IncrementRule(x);
+ TRule y("[X] ||| nY dyN ||| gave ||| 0");
+ m.IncrementRule(y);
+
+
+ MyFST fst(corpusf[0], corpuse[0], &m);
+ ifstream in("./kimura.g");
+ assert(in);
+ CFG_WFSTComposer comp(fst);
+ Hypergraph hg;
+ bool succeed = comp.Compose(&in, &hg);
+ hg.PrintGraphviz();
+ if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+
+#if 0
+ ifstream in2("./amnabooks.g");
+ assert(in2);
+ MyFST fst2(corpusf[1], corpuse[1], &m);
+ CFG_WFSTComposer comp2(fst2);
+ Hypergraph hg2;
+ bool succeed2 = comp2.Compose(&in2, &hg2);
+ if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
+#endif
+
+ SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0);
+ hg.Reweight(w);
+ cerr << ViterbiFTree(hg) << endl;
+ return 0;
+}
+
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
new file mode 100644
index 00000000..18dfd03b
--- /dev/null
+++ b/gi/pf/pfdist.cc
@@ -0,0 +1,621 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_measures.h"
+#include "reachability.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+size_t hash_value(const TRule& r) {
+ size_t h = boost::hash_value(r.e_);
+ boost::hash_combine(h, -r.lhs_);
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles")
+ ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
+#if 0
+struct MyConditionalModel {
+ MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
+
+ prob_t srcp0(const vector<WordID>& src) const {
+ prob_t p(1.0 / 3000.0);
+ p.poweq(src.size());
+ prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0));
+ p *= lenp;
+ return p;
+ }
+
+ void DecrementRule(const TRule& rule) {
+ const RuleCRPMap::iterator it = rules.find(rule.f_);
+ assert(it != rules.end());
+ if (it->second.decrement(rule)) {
+ base /= (*rp0)(rule);
+ if (it->second.num_customers() == 0)
+ rules.erase(it);
+ }
+ if (src_phrases.decrement(rule.f_))
+ base /= srcp0(rule.f_);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ RuleCRPMap::iterator it = rules.find(rule.f_);
+ if (it == rules.end())
+ it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first;
+ if (it->second.increment(rule)) {
+ base *= (*rp0)(rule);
+ }
+ if (src_phrases.increment(rule.f_))
+ base *= srcp0(rule.f_);
+ }
+
+ void IncrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ void IncrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].increment(dist))
+ base *= jp0(dist, src_len);
+ }
+
+ void DecrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].decrement(dist))
+ base /= jp0(dist, src_len);
+ }
+
+ void IncrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ IncrementJump(js[i], src_len);
+ }
+
+ void DecrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ DecrementJump(js[i], src_len);
+ }
+
+ // p(jump = dist | src_len , z)
+ prob_t JumpProbability(int dist, unsigned src_len) {
+ const prob_t p0 = jp0(dist, src_len);
+ const double lp = src_jumps[src_len].logprob(dist, log(p0));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+
+ // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
+ prob_t RuleProbability(const TRule& rule) const {
+ const prob_t p0 = (*rp0)(rule);
+ prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_))));
+ const RuleCRPMap::const_iterator it = rules.find(rule.f_);
+ if (it == rules.end()) return srcp * p0;
+ const double lp = it->second.logprob(rule, log(p0));
+ prob_t q; q.logeq(lp);
+ return q * srcp;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = base;
+ for (RuleCRPMap::const_iterator it = rules.begin();
+ it != rules.end(); ++it) {
+ prob_t cl; cl.logeq(it->second.log_crp_prob());
+ p *= cl;
+ }
+ for (unsigned l = 1; l < src_jumps.size(); ++l) {
+ if (src_jumps[l].num_customers() > 0) {
+ prob_t q;
+ q.logeq(src_jumps[l].log_crp_prob());
+ p *= q;
+ }
+ }
+ return p;
+ }
+
+ JumpBase jp0;
+ const PhraseConditionalBase* rp0;
+ prob_t base;
+ typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap;
+ RuleCRPMap rules;
+ CCRP_NoTable<vector<WordID> > src_phrases;
+ vector<CCRP_NoTable<int> > src_jumps;
+};
+
+#endif
+
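+// Joint phrase model: a single CRP over whole rules with PhraseJointBase as
+// the base distribution, plus one CRP per source-sentence length over jump
+// (distortion) distances with JumpBase as the base distribution.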
+struct MyJointModel {
+ MyJointModel(PhraseJointBase& rcp0) :
+ rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
+
+ void DecrementRule(const TRule& rule) {
+ if (rules.decrement(rule))
+ base /= rp0(rule);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ if (rules.increment(rule))
+ base *= rp0(rule);
+ }
+
+ void IncrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ void IncrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].increment(dist))
+ base *= jp0(dist, src_len);
+ }
+
+ void DecrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].decrement(dist))
+ base /= jp0(dist, src_len);
+ }
+
+ void IncrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ IncrementJump(js[i], src_len);
+ }
+
+ void DecrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ DecrementJump(js[i], src_len);
+ }
+
+ // p(jump = dist | src_len , z)
+ prob_t JumpProbability(int dist, unsigned src_len) {
+ const prob_t p0 = jp0(dist, src_len);
+ const double lp = src_jumps[src_len].logprob(dist, log(p0));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+
+ // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
+ return p;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = base;
+ prob_t q; q.logeq(rules.log_crp_prob());
+ p *= q;
+ for (unsigned l = 1; l < src_jumps.size(); ++l) {
+ if (src_jumps[l].num_customers() > 0) {
+ prob_t q;
+ q.logeq(src_jumps[l].log_crp_prob());
+ p *= q;
+ }
+ }
+ return p;
+ }
+
+ JumpBase jp0;
+ const PhraseJointBase& rp0;
+ prob_t base;
+ CCRP_NoTable<TRule> rules;
+ vector<CCRP_NoTable<int> > src_jumps;
+};
+
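+// Model 1 estimate of generating the uncovered suffix of the target from the
+// still-uncovered source words (plus NULL), cached per (source coverage,
+// target coverage); used as a look-ahead term when reweighting particles.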
+struct BackwardEstimate {
+ BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) :
+ model1_(m1), src_(src), trg_(trg) {
+ }
+ const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
+ assert(src_.size() == src_cov.size());
+ assert(trg_cov <= trg_.size());
+ prob_t& e = cache_[src_cov][trg_cov];
+ if (e.is_0()) {
+ if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
+ vector<WordID> r(src_.size() + 1); r.clear();
+ r.push_back(0); // NULL word
+ for (int i = 0; i < src_cov.size(); ++i)
+ if (!src_cov[i]) r.push_back(src_[i]);
+ const prob_t uniform_alignment(1.0 / r.size());
+ e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_cov; j < trg_.size(); ++j) {
+ prob_t p;
+ for (unsigned i = 0; i < r.size(); ++i)
+ p += model1_(r[i], trg_[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
+ abort();
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ }
+ return e;
+ }
+ const Model1& model1_;
+ const vector<WordID>& src_;
+ const vector<WordID>& trg_;
+ mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
+};
+
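+// Like BackwardEstimate, but symmetrized: the forward Model 1 estimate is
+// combined with an inverse Model 1 estimate of the uncovered source given the
+// uncovered target, and the geometric mean of the two is returned.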
+struct BackwardEstimateSym {
+ BackwardEstimateSym(const Model1& m1,
+ const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
+ model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) {
+ }
+ const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
+ assert(src_.size() == src_cov.size());
+ assert(trg_cov <= trg_.size());
+ prob_t& e = cache_[src_cov][trg_cov];
+ if (e.is_0()) {
+ if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
+ vector<WordID> r(src_.size() + 1); r.clear();
+ for (int i = 0; i < src_cov.size(); ++i)
+ if (!src_cov[i]) r.push_back(src_[i]);
+ r.push_back(0); // NULL word
+ const prob_t uniform_alignment(1.0 / r.size());
+ e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_cov; j < trg_.size(); ++j) {
+ prob_t p;
+ for (unsigned i = 0; i < r.size(); ++i)
+ p += model1_(r[i], trg_[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
+ abort();
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ r.pop_back();
+ const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
+ prob_t inv;
+ inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+ for (unsigned i = 0; i < r.size(); ++i) {
+ prob_t p;
+ for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
+ p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]);
+ if (p.is_0()) {
+ cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n";
+ abort();
+ }
+ p *= inv_uniform;
+ inv *= p;
+ }
+ prob_t x = pow(e * inv, 0.5);
+ e = x;
+ //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl;
+ }
+ return e;
+ }
+ const Model1& model1_;
+ const Model1& invmodel1_;
+ const vector<WordID>& src_;
+ const vector<WordID>& trg_;
+ mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
+};
+
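+// State of a single particle: the rules and jumps chosen so far, the source
+// coverage vector and coverage counts, the previous source position (used for
+// jump distances), the importance weight, and the probability of the most
+// recent extension (gamma_last).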
+struct Particle {
+ Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {}
+ prob_t weight;
+ prob_t gamma_last;
+ vector<int> src_jumps;
+ vector<TRulePtr> rules;
+ vector<bool> src_cv;
+ int src_cov;
+ int trg_cov;
+ int prev_pos;
+};
+
+ostream& operator<<(ostream& o, const vector<bool>& v) {
+ for (int i = 0; i < v.size(); ++i)
+ o << (v[i] ? '1' : '0');
+ return o;
+}
+ostream& operator<<(ostream& o, const Particle& p) {
+ o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']';
+ return o;
+}
+
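+// Multinomial resampling: draw a new population of particles with probability
+// proportional to their current weights, then reset every weight to uniform;
+// low-weight particles tend to be dropped and high-weight ones duplicated.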
+void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
+ vector<Particle>& ps = *pps;
+ SampleSet<prob_t> ss;
+ for (int i = 0; i < ps.size(); ++i)
+ ss.add(ps[i].weight);
+ vector<Particle> nps; nps.reserve(ps.size());
+ const prob_t uniform_weight(1.0 / ps.size());
+ for (int i = 0; i < ps.size(); ++i) {
+ nps.push_back(ps[prng->SelectSample(ss)]);
+ nps[i].weight = uniform_weight;
+ }
+ nps.swap(ps);
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+ const unsigned particles = conf["particles"].as<unsigned>();
+ const unsigned samples = conf["samples"].as<unsigned>();
+ const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<WordID> > corpuse, corpusf;
+ set<WordID> vocabe, vocabf;
+ cerr << "Reading corpus...\n";
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
+ cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+ assert(corpusf.size() == corpuse.size());
+
+ const int kLHS = -TD::Convert("X");
+ Model1 m1(conf["model1"].as<string>());
+ Model1 invm1(conf["inverse_model1"].as<string>());
+
+#if 0
+ PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
+ MyConditionalModel m(lp0);
+#else
+ PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+ MyJointModel m(lp0);
+#endif
+
+ cerr << "Initializing reachability limits...\n";
+ vector<Particle> ps(corpusf.size());
+ vector<Reachability> reaches; reaches.reserve(corpusf.size());
+ for (int ci = 0; ci < corpusf.size(); ++ci)
+ reaches.push_back(Reachability(corpusf[ci].size(),
+ corpuse[ci].size(),
+ kMAX_SRC_PHRASE,
+ kMAX_TRG_PHRASE));
+ cerr << "Sampling...\n";
+ vector<Particle> tmp_p(10000); // work space
+ SampleSet<prob_t> pfss;
+ for (int SS=0; SS < samples; ++SS) {
+ for (int ci = 0; ci < corpusf.size(); ++ci) {
+ vector<int>& src = corpusf[ci];
+ vector<int>& trg = corpuse[ci];
+ m.DecrementRules(ps[ci].rules);
+ m.DecrementJumps(ps[ci].src_jumps, src.size());
+
+ //BackwardEstimate be(m1, src, trg);
+ BackwardEstimateSym be(m1, invm1, src, trg);
+ const Reachability& r = reaches[ci];
+ vector<Particle> lps(particles);
+
+ for (int pi = 0; pi < particles; ++pi) {
+ Particle& p = lps[pi];
+ p.src_cv.resize(src.size(), false);
+ }
+
+ bool all_complete = false;
+ while(!all_complete) {
+ SampleSet<prob_t> ss;
+
+ // once the particles have been extended at least once, resample/reweight them before extending further
+ if (lps[0].trg_cov > 0)
+ FilterCrapParticlesAndReweight(&lps);
+
+ // loop over all particles and extend them
+ bool done_nothing = true;
+ for (int pi = 0; pi < particles; ++pi) {
+ Particle& p = lps[pi];
+ int tic = 0;
+ while(p.trg_cov < trg.size() && tic < rejuv_freq) {
+ ++tic;
+ done_nothing = false;
+ ss.clear();
+ TRule x; x.lhs_ = kLHS;
+ prob_t z;
+ int first_uncovered = src.size();
+ int last_uncovered = -1;
+ for (int i = 0; i < src.size(); ++i) {
+ const bool is_uncovered = !p.src_cv[i];
+ if (i < first_uncovered && is_uncovered) first_uncovered = i;
+ if (is_uncovered && i > last_uncovered) last_uncovered = i;
+ }
+ assert(last_uncovered > -1);
+ assert(first_uncovered < src.size());
+
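+ // enumerate candidate extensions: the next trg_len target words paired with
+ // a source phrase of src_len uncovered words starting at some uncovered
+ // position; each candidate's proposal weight u combines the rule probability,
+ // the jump probabilities, and the backward estimate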
+ for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) {
+ x.e_.push_back(trg[trg_len - 1 + p.trg_cov]);
+ for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) {
+ if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue;
+
+ const int last_possible_start = last_uncovered - src_len + 1;
+ assert(last_possible_start >= 0);
+ //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl;
+ //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl;
+ for (int i = first_uncovered; i <= last_possible_start; ++i) {
+ if (p.src_cv[i]) continue;
+ assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size
+ Particle& np = tmp_p[ss.size()];
+ np = p;
+ x.f_.clear();
+ int gap_add = 0;
+ bool bad = false;
+ prob_t jp = prob_t::One();
+ int prev_pos = p.prev_pos;
+ for (int j = 0; j < src_len; ++j) {
+ if ((j + i + gap_add) == src.size()) { bad = true; break; }
+ while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; }
+ if ((j + i + gap_add) == src.size()) { bad = true; break; }
+ np.src_cv[i + j + gap_add] = true;
+ x.f_.push_back(src[i + j + gap_add]);
+ jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size());
+ int jump = i + j + gap_add - prev_pos;
+ assert(jump != 0);
+ np.src_jumps.push_back(jump);
+ prev_pos = i + j + gap_add;
+ }
+ if (bad) continue;
+ np.prev_pos = prev_pos;
+ np.src_cov += x.f_.size();
+ np.trg_cov += x.e_.size();
+ if (x.f_.size() != src_len) continue;
+ prob_t rp = m.RuleProbability(x);
+ np.gamma_last = rp * jp;
+ const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2);
+ //cerr << "**rule=" << x << endl;
+ //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl;
+ ss.add(u);
+ np.rules.push_back(TRulePtr(new TRule(x)));
+ z += u;
+
+ const bool completed = (p.trg_cov == trg.size());
+ if (completed) {
+ int last_jump = src.size() - p.prev_pos;
+ assert(last_jump > 0);
+ p.src_jumps.push_back(last_jump);
+ p.weight *= m.JumpProbability(last_jump, src.size());
+ }
+ }
+ }
+ }
+ cerr << "number of edges to consider: " << ss.size() << endl;
+ const int sampled = rng.SelectSample(ss);
+ prob_t q_n = ss[sampled] / z;
+ p = tmp_p[sampled];
+ //m.IncrementRule(*p.rules.back());
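+ // sequential importance sampling update: multiply the particle weight by the
+ // model increment gamma_last over the normalized proposal probability q_n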
+ p.weight *= p.gamma_last / q_n;
+ cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl;
+ cerr << p << endl;
+ }
+ } // loop over particles (pi = 0 .. particles)
+ if (done_nothing) all_complete = true;
+ }
+ pfss.clear();
+ for (int i = 0; i < lps.size(); ++i)
+ pfss.add(lps[i].weight);
+ const int sampled = rng.SelectSample(pfss);
+ ps[ci] = lps[sampled];
+ m.IncrementRules(lps[sampled].rules);
+ m.IncrementJumps(lps[sampled].src_jumps, src.size());
+ for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
+ cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
+ }
+ cerr << "LLH: " << log(m.Likelihood()) << endl;
+ for (int sni = 0; sni < 5; ++sni) {
+ for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; }
+ }
+ }
+ return 0;
+}
+
diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc
new file mode 100644
index 00000000..3169eb75
--- /dev/null
+++ b/gi/pf/pfdist.new.cc
@@ -0,0 +1,620 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_measures.h"
+#include "reachability.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+size_t hash_value(const TRule& r) {
+ size_t h = boost::hash_value(r.e_);
+ boost::hash_combine(h, -r.lhs_);
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
+#if 0
+struct MyConditionalModel {
+ MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
+
+ prob_t srcp0(const vector<WordID>& src) const {
+ prob_t p(1.0 / 3000.0);
+ p.poweq(src.size());
+ prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0));
+ p *= lenp;
+ return p;
+ }
+
+ void DecrementRule(const TRule& rule) {
+ const RuleCRPMap::iterator it = rules.find(rule.f_);
+ assert(it != rules.end());
+ if (it->second.decrement(rule)) {
+ base /= (*rp0)(rule);
+ if (it->second.num_customers() == 0)
+ rules.erase(it);
+ }
+ if (src_phrases.decrement(rule.f_))
+ base /= srcp0(rule.f_);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ RuleCRPMap::iterator it = rules.find(rule.f_);
+ if (it == rules.end())
+ it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first;
+ if (it->second.increment(rule)) {
+ base *= (*rp0)(rule);
+ }
+ if (src_phrases.increment(rule.f_))
+ base *= srcp0(rule.f_);
+ }
+
+ void IncrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ void IncrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].increment(dist))
+ base *= jp0(dist, src_len);
+ }
+
+ void DecrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].decrement(dist))
+ base /= jp0(dist, src_len);
+ }
+
+ void IncrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ IncrementJump(js[i], src_len);
+ }
+
+ void DecrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ DecrementJump(js[i], src_len);
+ }
+
+ // p(jump = dist | src_len , z)
+ prob_t JumpProbability(int dist, unsigned src_len) {
+ const prob_t p0 = jp0(dist, src_len);
+ const double lp = src_jumps[src_len].logprob(dist, log(p0));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+
+ // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
+ prob_t RuleProbability(const TRule& rule) const {
+ const prob_t p0 = (*rp0)(rule);
+ prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_))));
+ const RuleCRPMap::const_iterator it = rules.find(rule.f_);
+ if (it == rules.end()) return srcp * p0;
+ const double lp = it->second.logprob(rule, log(p0));
+ prob_t q; q.logeq(lp);
+ return q * srcp;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = base;
+ for (RuleCRPMap::const_iterator it = rules.begin();
+ it != rules.end(); ++it) {
+ prob_t cl; cl.logeq(it->second.log_crp_prob());
+ p *= cl;
+ }
+ for (unsigned l = 1; l < src_jumps.size(); ++l) {
+ if (src_jumps[l].num_customers() > 0) {
+ prob_t q;
+ q.logeq(src_jumps[l].log_crp_prob());
+ p *= q;
+ }
+ }
+ return p;
+ }
+
+ JumpBase jp0;
+ const PhraseConditionalBase* rp0;
+ prob_t base;
+ typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap;
+ RuleCRPMap rules;
+ CCRP_NoTable<vector<WordID> > src_phrases;
+ vector<CCRP_NoTable<int> > src_jumps;
+};
+
+#endif
+
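+// Joint model: a CRP over whole rules with base distribution rp0, plus one
+// CRP per source-sentence length over jump (reordering) distances.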
+struct MyJointModel {
+ MyJointModel(PhraseJointBase& rcp0) :
+ rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
+
+ void DecrementRule(const TRule& rule) {
+ if (rules.decrement(rule))
+ base /= rp0(rule);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ if (rules.increment(rule))
+ base *= rp0(rule);
+ }
+
+ void IncrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ void IncrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].increment(dist))
+ base *= jp0(dist, src_len);
+ }
+
+ void DecrementJump(int dist, unsigned src_len) {
+ assert(src_len > 0);
+ if (src_jumps[src_len].decrement(dist))
+ base /= jp0(dist, src_len);
+ }
+
+ void IncrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ IncrementJump(js[i], src_len);
+ }
+
+ void DecrementJumps(const vector<int>& js, unsigned src_len) {
+ for (unsigned i = 0; i < js.size(); ++i)
+ DecrementJump(js[i], src_len);
+ }
+
+ // p(jump = dist | src_len , z)
+ prob_t JumpProbability(int dist, unsigned src_len) {
+ const prob_t p0 = jp0(dist, src_len);
+ const double lp = src_jumps[src_len].logprob(dist, log(p0));
+ prob_t q; q.logeq(lp);
+ return q;
+ }
+
+ // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
+ return p;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = base;
+ prob_t q; q.logeq(rules.log_crp_prob());
+ p *= q;
+ for (unsigned l = 1; l < src_jumps.size(); ++l) {
+ if (src_jumps[l].num_customers() > 0) {
+ prob_t q;
+ q.logeq(src_jumps[l].log_crp_prob());
+ p *= q;
+ }
+ }
+ return p;
+ }
+
+ JumpBase jp0;
+ const PhraseJointBase& rp0;
+ prob_t base;
+ CCRP_NoTable<TRule> rules;
+ vector<CCRP_NoTable<int> > src_jumps;
+};
+
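+// BackwardEstimate: heuristic estimate of the probability of completing a
+// partial hypothesis.  It scores the remaining target words with Model 1
+// (uniform alignment over the uncovered source words plus NULL) and a
+// Poisson length term, caching results by (source coverage, target coverage).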
+struct BackwardEstimate {
+ BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) :
+ model1_(m1), src_(src), trg_(trg) {
+ }
+ const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
+ assert(src_.size() == src_cov.size());
+ assert(trg_cov <= trg_.size());
+ prob_t& e = cache_[src_cov][trg_cov];
+ if (e.is_0()) {
+ if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
+ vector<WordID> r; r.reserve(src_.size() + 1);
+ r.push_back(0); // NULL word
+ for (int i = 0; i < src_cov.size(); ++i)
+ if (!src_cov[i]) r.push_back(src_[i]);
+ const prob_t uniform_alignment(1.0 / r.size());
+ e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_cov; j < trg_.size(); ++j) {
+ prob_t p;
+ for (unsigned i = 0; i < r.size(); ++i)
+ p += model1_(r[i], trg_[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
+ abort();
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ }
+ return e;
+ }
+ const Model1& model1_;
+ const vector<WordID>& src_;
+ const vector<WordID>& trg_;
+ mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
+};
+
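+// Symmetrized variant: the geometric mean of the forward estimate above and
+// the analogous inverse Model 1 estimate (uncovered source words given the
+// remaining target words).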
+struct BackwardEstimateSym {
+ BackwardEstimateSym(const Model1& m1,
+ const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
+ model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) {
+ }
+ const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
+ assert(src_.size() == src_cov.size());
+ assert(trg_cov <= trg_.size());
+ prob_t& e = cache_[src_cov][trg_cov];
+ if (e.is_0()) {
+ if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
+ vector<WordID> r; r.reserve(src_.size() + 1);
+ for (int i = 0; i < src_cov.size(); ++i)
+ if (!src_cov[i]) r.push_back(src_[i]);
+ r.push_back(0); // NULL word
+ const prob_t uniform_alignment(1.0 / r.size());
+ e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_cov; j < trg_.size(); ++j) {
+ prob_t p;
+ for (unsigned i = 0; i < r.size(); ++i)
+ p += model1_(r[i], trg_[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
+ abort();
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ r.pop_back();
+ const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
+ prob_t inv;
+ inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+ for (unsigned i = 0; i < r.size(); ++i) {
+ prob_t p;
+ for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
+ p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]);
+ if (p.is_0()) {
+ cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n";
+ abort();
+ }
+ p *= inv_uniform;
+ inv *= p;
+ }
+ prob_t x = pow(e * inv, 0.5);
+ e = x;
+ //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl;
+ }
+ return e;
+ }
+ const Model1& model1_;
+ const Model1& invmodel1_;
+ const vector<WordID>& src_;
+ const vector<WordID>& trg_;
+ mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
+};
+
+struct Particle {
+ Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {}
+ prob_t weight;
+ prob_t gamma_last;
+ vector<int> src_jumps;
+ vector<TRulePtr> rules;
+ vector<bool> src_cv;
+ int src_cov;
+ int trg_cov;
+ int prev_pos;
+};
+
+ostream& operator<<(ostream& o, const vector<bool>& v) {
+ for (int i = 0; i < v.size(); ++i)
+ o << (v[i] ? '1' : '0');
+ return o;
+}
+ostream& operator<<(ostream& o, const Particle& p) {
+ o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']';
+ return o;
+}
+
+void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
+ vector<Particle>& ps = *pps;
+ SampleSet<prob_t> ss;
+ for (int i = 0; i < ps.size(); ++i)
+ ss.add(ps[i].weight);
+ vector<Particle> nps; nps.reserve(ps.size());
+ const prob_t uniform_weight(1.0 / ps.size());
+ for (int i = 0; i < ps.size(); ++i) {
+ nps.push_back(ps[prng->SelectSample(ss)]);
+ nps[i].weight = uniform_weight;
+ }
+ nps.swap(ps);
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+ const unsigned particles = conf["particles"].as<unsigned>();
+ const unsigned samples = conf["samples"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+ cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<WordID> > corpuse, corpusf;
+ set<WordID> vocabe, vocabf;
+ cerr << "Reading corpus...\n";
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
+ cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+ assert(corpusf.size() == corpuse.size());
+
+ const int kLHS = -TD::Convert("X");
+ Model1 m1(conf["model1"].as<string>());
+ Model1 invm1(conf["inverse_model1"].as<string>());
+
+#if 0
+ PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
+ MyConditionalModel m(lp0);
+#else
+ PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+ MyJointModel m(lp0);
+#endif
+
+ cerr << "Initializing reachability limits...\n";
+ vector<Particle> ps(corpusf.size());
+ vector<Reachability> reaches; reaches.reserve(corpusf.size());
+ for (int ci = 0; ci < corpusf.size(); ++ci)
+ reaches.push_back(Reachability(corpusf[ci].size(),
+ corpuse[ci].size(),
+ kMAX_SRC_PHRASE,
+ kMAX_TRG_PHRASE));
+ cerr << "Sampling...\n";
+ vector<Particle> tmp_p(10000); // work space
+ SampleSet<prob_t> pfss;
+ for (int SS=0; SS < samples; ++SS) {
+ for (int ci = 0; ci < corpusf.size(); ++ci) {
+ vector<int>& src = corpusf[ci];
+ vector<int>& trg = corpuse[ci];
+ m.DecrementRules(ps[ci].rules);
+ m.DecrementJumps(ps[ci].src_jumps, src.size());
+
+ //BackwardEstimate be(m1, src, trg);
+ BackwardEstimateSym be(m1, invm1, src, trg);
+ const Reachability& r = reaches[ci];
+ vector<Particle> lps(particles);
+
+ for (int pi = 0; pi < particles; ++pi) {
+ Particle& p = lps[pi];
+ p.src_cv.resize(src.size(), false);
+ }
+
+ bool all_complete = false;
+ while(!all_complete) {
+ SampleSet<prob_t> ss;
+
+ // once the particles have been extended at least once, resample/reweight them before extending further
+ if (lps[0].trg_cov > 0)
+ FilterCrapParticlesAndReweight(&lps);
+
+ // loop over all particles and extend them
+ bool done_nothing = true;
+ for (int pi = 0; pi < particles; ++pi) {
+ Particle& p = lps[pi];
+ int tic = 0;
+ const int rejuv_freq = 1;
+ while(p.trg_cov < trg.size() && tic < rejuv_freq) {
+ ++tic;
+ done_nothing = false;
+ ss.clear();
+ TRule x; x.lhs_ = kLHS;
+ prob_t z;
+ int first_uncovered = src.size();
+ int last_uncovered = -1;
+ for (int i = 0; i < src.size(); ++i) {
+ const bool is_uncovered = !p.src_cv[i];
+ if (i < first_uncovered && is_uncovered) first_uncovered = i;
+ if (is_uncovered && i > last_uncovered) last_uncovered = i;
+ }
+ assert(last_uncovered > -1);
+ assert(first_uncovered < src.size());
+
+ for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) {
+ x.e_.push_back(trg[trg_len - 1 + p.trg_cov]);
+ for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) {
+ if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue;
+
+ const int last_possible_start = last_uncovered - src_len + 1;
+ assert(last_possible_start >= 0);
+ //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl;
+ //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl;
+ for (int i = first_uncovered; i <= last_possible_start; ++i) {
+ if (p.src_cv[i]) continue;
+ assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size
+ Particle& np = tmp_p[ss.size()];
+ np = p;
+ x.f_.clear();
+ int gap_add = 0;
+ bool bad = false;
+ prob_t jp = prob_t::One();
+ int prev_pos = p.prev_pos;
+ for (int j = 0; j < src_len; ++j) {
+ if ((j + i + gap_add) == src.size()) { bad = true; break; }
+ while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; }
+ if ((j + i + gap_add) == src.size()) { bad = true; break; }
+ np.src_cv[i + j + gap_add] = true;
+ x.f_.push_back(src[i + j + gap_add]);
+ jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size());
+ int jump = i + j + gap_add - prev_pos;
+ assert(jump != 0);
+ np.src_jumps.push_back(jump);
+ prev_pos = i + j + gap_add;
+ }
+ if (bad) continue;
+ np.prev_pos = prev_pos;
+ np.src_cov += x.f_.size();
+ np.trg_cov += x.e_.size();
+ if (x.f_.size() != src_len) continue;
+ prob_t rp = m.RuleProbability(x);
+ np.gamma_last = rp * jp;
+ const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2);
+ //cerr << "**rule=" << x << endl;
+ //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl;
+ ss.add(u);
+ np.rules.push_back(TRulePtr(new TRule(x)));
+ z += u;
+
+ const bool completed = (p.trg_cov == trg.size());
+ if (completed) {
+ int last_jump = src.size() - p.prev_pos;
+ assert(last_jump > 0);
+ p.src_jumps.push_back(last_jump);
+ p.weight *= m.JumpProbability(last_jump, src.size());
+ }
+ }
+ }
+ }
+ cerr << "number of edges to consider: " << ss.size() << endl;
+ const int sampled = rng.SelectSample(ss);
+ prob_t q_n = ss[sampled] / z;
+ p = tmp_p[sampled];
+ //m.IncrementRule(*p.rules.back());
+ p.weight *= p.gamma_last / q_n;
+ cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl;
+ cerr << p << endl;
+ }
+ } // loop over particles (pi = 0 .. particles)
+ if (done_nothing) all_complete = true;
+ }
+ pfss.clear();
+ for (int i = 0; i < lps.size(); ++i)
+ pfss.add(lps[i].weight);
+ const int sampled = rng.SelectSample(pfss);
+ ps[ci] = lps[sampled];
+ m.IncrementRules(lps[sampled].rules);
+ m.IncrementJumps(lps[sampled].src_jumps, src.size());
+ for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
+ cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
+ }
+ cerr << "LLH: " << log(m.Likelihood()) << endl;
+ for (int sni = 0; sni < 5; ++sni) {
+ for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; }
+ }
+ }
+ return 0;
+}
+
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
new file mode 100644
index 00000000..43c604c3
--- /dev/null
+++ b/gi/pf/pfnaive.cc
@@ -0,0 +1,385 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_measures.h"
+#include "reachability.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+size_t hash_value(const TRule& r) {
+ size_t h = boost::hash_value(r.e_);
+ boost::hash_combine(h, -r.lhs_);
+ boost::hash_combine(h, boost::hash_value(r.f_));
+ return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+ return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles")
+ ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+ if (in != &cin) delete in;
+}
+
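+// Monotone ("naive") variant of the joint model: only the rule CRP is used;
+// the src_jumps member is never populated here, so it contributes nothing to
+// the likelihood.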
+struct MyJointModel {
+ MyJointModel(PhraseJointBase& rcp0) :
+ rp0(rcp0), base(prob_t::One()), rules(1,1) {}
+
+ void DecrementRule(const TRule& rule) {
+ if (rules.decrement(rule))
+ base /= rp0(rule);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ if (rules.increment(rule))
+ base *= rp0(rule);
+ }
+
+ void IncrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
+ return p;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = base;
+ prob_t q; q.logeq(rules.log_crp_prob());
+ p *= q;
+ for (unsigned l = 1; l < src_jumps.size(); ++l) {
+ if (src_jumps[l].num_customers() > 0) {
+ prob_t q;
+ q.logeq(src_jumps[l].log_crp_prob());
+ p *= q;
+ }
+ }
+ return p;
+ }
+
+ const PhraseJointBase& rp0;
+ prob_t base;
+ CCRP_NoTable<TRule> rules;
+ vector<CCRP_NoTable<int> > src_jumps;
+};
+
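+// Same symmetrized backward estimate as in pfdist, except that the search is
+// monotone here, so source coverage is always a prefix and the cache can be
+// keyed by the number of covered source words instead of a coverage vector.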
+struct BackwardEstimateSym {
+ BackwardEstimateSym(const Model1& m1,
+ const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
+ model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) {
+ }
+ const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const {
+ assert(src_cov <= src_.size());
+ assert(trg_cov <= trg_.size());
+ prob_t& e = cache_[src_cov][trg_cov];
+ if (e.is_0()) {
+ if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
+ vector<WordID> r; r.reserve(src_.size() + 1);
+ for (int i = src_cov; i < src_.size(); ++i)
+ r.push_back(src_[i]);
+ r.push_back(0); // NULL word
+ const prob_t uniform_alignment(1.0 / r.size());
+ e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_cov; j < trg_.size(); ++j) {
+ prob_t p;
+ for (unsigned i = 0; i < r.size(); ++i)
+ p += model1_(r[i], trg_[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
+ abort();
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ r.pop_back();
+ const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
+ prob_t inv;
+ inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+ for (unsigned i = 0; i < r.size(); ++i) {
+ prob_t p;
+ for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
+ p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]);
+ if (p.is_0()) {
+ cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n";
+ abort();
+ }
+ p *= inv_uniform;
+ inv *= p;
+ }
+ prob_t x = pow(e * inv, 0.5);
+ e = x;
+ //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl;
+ }
+ return e;
+ }
+ const Model1& model1_;
+ const Model1& invmodel1_;
+ const vector<WordID>& src_;
+ const vector<WordID>& trg_;
+ mutable unordered_map<unsigned, map<unsigned, prob_t> > cache_;
+};
+
+struct Particle {
+ Particle() : weight(prob_t::One()), src_cov(), trg_cov() {}
+ prob_t weight;
+ prob_t gamma_last;
+ vector<TRulePtr> rules;
+ int src_cov;
+ int trg_cov;
+};
+
+ostream& operator<<(ostream& o, const vector<bool>& v) {
+ for (int i = 0; i < v.size(); ++i)
+ o << (v[i] ? '1' : '0');
+ return o;
+}
+ostream& operator<<(ostream& o, const Particle& p) {
+ o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']';
+ return o;
+}
+
+void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
+ vector<Particle>& ps = *pps;
+ SampleSet<prob_t> ss;
+ for (int i = 0; i < ps.size(); ++i)
+ ss.add(ps[i].weight);
+ vector<Particle> nps; nps.reserve(ps.size());
+ const prob_t uniform_weight(1.0 / ps.size());
+ for (int i = 0; i < ps.size(); ++i) {
+ nps.push_back(ps[prng->SelectSample(ss)]);
+ nps[i].weight = uniform_weight;
+ }
+ nps.swap(ps);
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+ const unsigned particles = conf["particles"].as<unsigned>();
+ const unsigned samples = conf["samples"].as<unsigned>();
+ const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+ cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ vector<vector<WordID> > corpuse, corpusf;
+ set<WordID> vocabe, vocabf;
+ cerr << "Reading corpus...\n";
+ ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
+ cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+ assert(corpusf.size() == corpuse.size());
+
+ const int kLHS = -TD::Convert("X");
+ Model1 m1(conf["model1"].as<string>());
+ Model1 invm1(conf["inverse_model1"].as<string>());
+
+#if 0
+ PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
+ MyConditionalModel m(lp0);
+#else
+ PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+ MyJointModel m(lp0);
+#endif
+
+ cerr << "Initializing reachability limits...\n";
+ vector<Particle> ps(corpusf.size());
+ vector<Reachability> reaches; reaches.reserve(corpusf.size());
+ for (int ci = 0; ci < corpusf.size(); ++ci)
+ reaches.push_back(Reachability(corpusf[ci].size(),
+ corpuse[ci].size(),
+ kMAX_SRC_PHRASE,
+ kMAX_TRG_PHRASE));
+ cerr << "Sampling...\n";
+ vector<Particle> tmp_p(10000); // work space
+ SampleSet<prob_t> pfss;
+ for (int SS=0; SS < samples; ++SS) {
+ for (int ci = 0; ci < corpusf.size(); ++ci) {
+ vector<int>& src = corpusf[ci];
+ vector<int>& trg = corpuse[ci];
+ m.DecrementRules(ps[ci].rules);
+
+ BackwardEstimateSym be(m1, invm1, src, trg);
+ const Reachability& r = reaches[ci];
+ vector<Particle> lps(particles);
+
+ bool all_complete = false;
+ while(!all_complete) {
+ SampleSet<prob_t> ss;
+
+ // once the particles have been extended at least once, resample/reweight them before extending further
+ if (lps[0].trg_cov > 0)
+ FilterCrapParticlesAndReweight(&lps);
+
+ // loop over all particles and extend them
+ bool done_nothing = true;
+ for (int pi = 0; pi < particles; ++pi) {
+ Particle& p = lps[pi];
+ int tic = 0;
+ while(p.trg_cov < trg.size() && tic < rejuv_freq) {
+ ++tic;
+ done_nothing = false;
+ ss.clear();
+ TRule x; x.lhs_ = kLHS;
+ prob_t z;
+
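+ // monotone proposal: the next source phrase always starts at p.src_cov, so
+ // only the phrase-pair lengths permitted by the reachability table are
+ // enumerated; no reordering/jump term is included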
+ for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) {
+ x.e_.push_back(trg[trg_len - 1 + p.trg_cov]);
+ for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) {
+ if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue;
+
+ int i = p.src_cov;
+ assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size
+ Particle& np = tmp_p[ss.size()];
+ np = p;
+ x.f_.clear();
+ for (int j = 0; j < src_len; ++j)
+ x.f_.push_back(src[i + j]);
+ np.src_cov += x.f_.size();
+ np.trg_cov += x.e_.size();
+ prob_t rp = m.RuleProbability(x);
+ np.gamma_last = rp;
+ const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1);
+ //cerr << "**rule=" << x << endl;
+ //cerr << " u=" << log(u) << " rule=" << rp << endl;
+ ss.add(u);
+ np.rules.push_back(TRulePtr(new TRule(x)));
+ z += u;
+ }
+ }
+ //cerr << "number of edges to consider: " << ss.size() << endl;
+ const int sampled = rng.SelectSample(ss);
+ prob_t q_n = ss[sampled] / z;
+ p = tmp_p[sampled];
+ //m.IncrementRule(*p.rules.back());
+ p.weight *= p.gamma_last / q_n;
+ //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl;
+ //cerr << p << endl;
+ }
+ } // loop over particles (pi = 0 .. particles)
+ if (done_nothing) all_complete = true;
+ }
+ pfss.clear();
+ for (int i = 0; i < lps.size(); ++i)
+ pfss.add(lps[i].weight);
+ const int sampled = rng.SelectSample(pfss);
+ ps[ci] = lps[sampled];
+ m.IncrementRules(lps[sampled].rules);
+ for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
+ cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
+ }
+ cerr << "LLH: " << log(m.Likelihood()) << endl;
+ }
+ return 0;
+}
+
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
new file mode 100644
index 00000000..73dd8d39
--- /dev/null
+++ b/gi/pf/reachability.cc
@@ -0,0 +1,64 @@
+#include "reachability.h"
+
+#include <vector>
+#include <iostream>
+
+using namespace std;
+
+struct SState {
+ SState() : prev_src_covered(), prev_trg_covered() {}
+ SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
+ int prev_src_covered;
+ int prev_trg_covered;
+};
+
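+// Two-pass dynamic program.  The forward pass records, for every coverage
+// cell (i,j), the cells that can reach it with a single phrase pair no longer
+// than the configured maxima; the backward pass from (srclen,trglen) keeps
+// only edges that lie on some complete segmentation and fills in
+// edges[][][][] and max_src_delta[][].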
+void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
+ typedef boost::multi_array<vector<SState>, 2> array_type;
+ array_type a(boost::extents[srclen + 1][trglen + 1]);
+ a[0][0].push_back(SState());
+ for (int i = 0; i < srclen; ++i) {
+ for (int j = 0; j < trglen; ++j) {
+ if (a[i][j].size() == 0) continue;
+ const SState prev(i,j);
+ for (int k = 1; k <= src_max_phrase_len; ++k) {
+ if ((i + k) > srclen) continue;
+ for (int l = 1; l <= trg_max_phrase_len; ++l) {
+ if ((j + l) > trglen) continue;
+ a[i + k][j + l].push_back(prev);
+ }
+ }
+ }
+ }
+ a[0][0].clear();
+ //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+ if (a[srclen][trglen].size() == 0) {
+ cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+ return;
+ }
+
+ typedef boost::multi_array<bool, 2> rarray_type;
+ rarray_type r(boost::extents[srclen + 1][trglen + 1]);
+ r[srclen][trglen] = true;
+ for (int i = srclen; i >= 0; --i) {
+ for (int j = trglen; j >= 0; --j) {
+ vector<SState>& prevs = a[i][j];
+ if (!r[i][j]) { prevs.clear(); }
+ for (int k = 0; k < prevs.size(); ++k) {
+ r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
+ int src_delta = i - prevs[k].prev_src_covered;
+ edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+ short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
+ if (src_delta > msd) msd = src_delta;
+ }
+ }
+ }
+ assert(!edges[0][0][1][0]);
+ assert(!edges[0][0][0][1]);
+ assert(!edges[0][0][0][0]);
+ assert(max_src_delta[0][0] > 0);
+ //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
+ //for (int i = 0; i < b[0][0].size(); ++i) {
+ // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
+ //}
+ }
+
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
new file mode 100644
index 00000000..98450ec1
--- /dev/null
+++ b/gi/pf/reachability.h
@@ -0,0 +1,28 @@
+#ifndef _REACHABILITY_H_
+#define _REACHABILITY_H_
+
+#include "boost/multi_array.hpp"
+
+// determines minimum and maximum lengths of outgoing edges from all
+// coverage positions such that the alignment path respects src and
+// trg maximum phrase sizes
+//
+// runs in O(n^2 * src_max * trg_max) time but should be relatively fast
+//
+// currently forbids 0 -> n and n -> 0 alignments
+
+struct Reachability {
+ boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
+ boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+
+ Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+ edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
+ max_src_delta(boost::extents[srclen][trglen]) {
+ ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+ }
+
+ private:
+ void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
+};
+
+#endif
diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc
new file mode 100644
index 00000000..7348d21c
--- /dev/null
+++ b/gi/pf/tpf.cc
@@ -0,0 +1,99 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include "sampler.h"
+
+using namespace std;
+using namespace tr1;
+
+shared_ptr<MT19937> prng;
+
+struct Particle {
+ Particle() : weight(prob_t::One()) {}
+ vector<int> states;
+ prob_t weight;
+ prob_t gamma_last;
+};
+
+ostream& operator<<(ostream& os, const Particle& p) {
+ os << "[";
+ for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' ';
+ os << "| w=" << log(p.weight) << ']';
+ return os;
+}
+
+void Rejuvenate(vector<Particle>& pps) {
+ SampleSet<prob_t> ss;
+ vector<Particle> nps(pps.size());
+ for (int i = 0; i < pps.size(); ++i) {
+// cerr << pps[i] << endl;
+ ss.add(pps[i].weight);
+ }
+// cerr << "REJUVINATING...\n";
+ for (int i = 0; i < pps.size(); ++i) {
+ nps[i] = pps[prng->SelectSample(ss)];
+ nps[i].weight = prob_t(1.0 / pps.size());
+// cerr << nps[i] << endl;
+ }
+ nps.swap(pps);
+// exit(1);
+}
+
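+// Toy sequential importance sampler used to sanity-check the machinery:
+// each particle repeatedly draws a state from the proposal q (ssq), rejecting
+// and retrying zero-probability states, multiplies its weight by p/q, and the
+// population is resampled after every extension round; finally one particle
+// is drawn in proportion to its weight and printed.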
+int main(int argc, char** argv) {
+ const unsigned particles = 100;
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+
+ // q(a) = 0.4
+ // q(b) = 0.6
+ // q(c) = 0
+ SampleSet<double> ssq;
+ ssq.add(0.4);
+ ssq.add(0.6);
+ ssq.add(0);
+ double qz = 1;
+
+ // p(a) = 0.2
+ // p(b) = 0.8
+ vector<double> p(3);
+ p[0] = 0.2;
+ p[1] = 0.8;
+ p[2] = 0;
+
+ vector<int> counts(3);
+ int tot = 0;
+
+ vector<Particle> pps(particles);
+ SampleSet<prob_t> ppss;
+ int LEN = 12;
+ int PP = 1;
+ while (pps[0].states.size() < LEN) {
+ for (int pi = 0; pi < particles; ++pi) {
+ Particle& prt = pps[pi];
+
+ bool redo = true;
+ const Particle savedp = prt;
+ while (redo) {
+ redo = false;
+ for (int i = 0; i < PP; ++i) {
+ int s = rng.SelectSample(ssq);
+ double gamma_last = p[s];
+ if (!gamma_last) { redo = true; break; }
+ double q = ssq[s] / qz;
+ prt.states.push_back(s);
+ prt.weight *= prob_t(gamma_last / q);
+ }
+ if (redo) { prt = savedp; continue; }
+ }
+ }
+ Rejuvenate(pps);
+ }
+ ppss.clear();
+ for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); }
+ int sp = rng.SelectSample(ppss);
+ cerr << pps[sp] << endl;
+
+ return 0;
+}
+