From af159e4c7066ea9a96f077e7e9265c8571f02053 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 11 Oct 2011 12:06:32 +0100 Subject: check in some experimental particle filtering code, some gitignore fixes --- gi/markov_al/Makefile.am | 6 + gi/markov_al/README | 2 + gi/markov_al/ml.cc | 470 +++++++++++++++++++++++++++++ gi/pf/Makefile.am | 21 ++ gi/pf/README | 2 + gi/pf/base_measures.cc | 112 +++++++ gi/pf/base_measures.h | 116 +++++++ gi/pf/brat.cc | 554 ++++++++++++++++++++++++++++++++++ gi/pf/cbgi.cc | 340 +++++++++++++++++++++ gi/pf/cfg_wfst_composer.cc | 730 +++++++++++++++++++++++++++++++++++++++++++++ gi/pf/cfg_wfst_composer.h | 46 +++ gi/pf/dpnaive.cc | 349 ++++++++++++++++++++++ gi/pf/itg.cc | 224 ++++++++++++++ gi/pf/pfbrat.cc | 554 ++++++++++++++++++++++++++++++++++ gi/pf/pfdist.cc | 621 ++++++++++++++++++++++++++++++++++++++ gi/pf/pfdist.new.cc | 620 ++++++++++++++++++++++++++++++++++++++ gi/pf/pfnaive.cc | 385 ++++++++++++++++++++++++ gi/pf/reachability.cc | 64 ++++ gi/pf/reachability.h | 28 ++ gi/pf/tpf.cc | 99 ++++++ 20 files changed, 5343 insertions(+) create mode 100644 gi/markov_al/Makefile.am create mode 100644 gi/markov_al/README create mode 100644 gi/markov_al/ml.cc create mode 100644 gi/pf/Makefile.am create mode 100644 gi/pf/README create mode 100644 gi/pf/base_measures.cc create mode 100644 gi/pf/base_measures.h create mode 100644 gi/pf/brat.cc create mode 100644 gi/pf/cbgi.cc create mode 100644 gi/pf/cfg_wfst_composer.cc create mode 100644 gi/pf/cfg_wfst_composer.h create mode 100644 gi/pf/dpnaive.cc create mode 100644 gi/pf/itg.cc create mode 100644 gi/pf/pfbrat.cc create mode 100644 gi/pf/pfdist.cc create mode 100644 gi/pf/pfdist.new.cc create mode 100644 gi/pf/pfnaive.cc create mode 100644 gi/pf/reachability.cc create mode 100644 gi/pf/reachability.h create mode 100644 gi/pf/tpf.cc (limited to 'gi') diff --git a/gi/markov_al/Makefile.am b/gi/markov_al/Makefile.am new file mode 100644 index 00000000..fe3e3349 --- /dev/null +++ b/gi/markov_al/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = ml + +ml_SOURCES = ml.cc + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/markov_al/README b/gi/markov_al/README new file mode 100644 index 00000000..9c10f7cd --- /dev/null +++ b/gi/markov_al/README @@ -0,0 +1,2 @@ +Experimental translation models with Markovian dependencies. + diff --git a/gi/markov_al/ml.cc b/gi/markov_al/ml.cc new file mode 100644 index 00000000..1e71edd6 --- /dev/null +++ b/gi/markov_al/ml.cc @@ -0,0 +1,470 @@ +#include +#include + +#include +#include +#include +#include + +#include "tdict.h" +#include "filelib.h" +#include "sampler.h" +#include "ccrp_onetable.h" +#include "array2d.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +void PrintTopCustomers(const CCRP_OneTable& crp) { + for (CCRP_OneTable::const_iterator it = crp.begin(); it != crp.end(); ++it) { + cerr << " " << TD::Convert(it->first) << " = " << it->second << endl; + } +} + +void PrintAlignment(const vector& src, const vector& trg, const vector& a) { + cerr << TD::GetString(src) << endl << TD::GetString(trg) << endl; + Array2D al(src.size(), trg.size()); + for (int i = 0; i < a.size(); ++i) + if (a[i] != 255) al(a[i], i) = true; + cerr << al << endl; +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct Unigram; +struct Bigram { + Bigram() : trg(), cond() {} + Bigram(WordID prev, WordID cur, WordID t) : trg(t) { cond.first = prev; cond.second = cur; } + const pair& ConditioningPair() const { + return cond; + } + WordID& prev_src() { return cond.first; } + WordID& cur_src() { return cond.second; } + const WordID& prev_src() const { return cond.first; } + const WordID& cur_src() const { return cond.second; } + WordID trg; + private: + pair cond; +}; + +struct Unigram { + Unigram() : cur_src(), trg() {} + Unigram(WordID s, WordID t) : cur_src(s), trg(t) {} + WordID cur_src; + WordID trg; +}; + +ostream& operator<<(ostream& os, const Bigram& b) { + os << "( " << TD::Convert(b.trg) << " | " << TD::Convert(b.prev_src()) << " , " << TD::Convert(b.cur_src()) << " )"; + return os; +} + +ostream& operator<<(ostream& os, const Unigram& u) { + os << "( " << TD::Convert(u.trg) << " | " << TD::Convert(u.cur_src) << " )"; + return os; +} + +bool operator==(const Bigram& a, const Bigram& b) { + return a.trg == b.trg && a.cur_src() == b.cur_src() && a.prev_src() == b.prev_src(); +} + +bool operator==(const Unigram& a, const Unigram& b) { + return a.trg == b.trg && a.cur_src == b.cur_src; +} + +size_t hash_value(const Bigram& b) { + size_t h = boost::hash_value(b.prev_src()); + boost::hash_combine(h, boost::hash_value(b.cur_src())); + boost::hash_combine(h, boost::hash_value(b.trg)); + return h; +} + +size_t hash_value(const Unigram& u) { + size_t h = boost::hash_value(u.cur_src); + boost::hash_combine(h, boost::hash_value(u.trg)); + return h; +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct UnigramModel { + UnigramModel(size_t src_voc_size, size_t trg_voc_size) : + unigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), + p0(1.0 / trg_voc_size) {} + + void increment(const Bigram& b) { + unigrams[b.cur_src()].increment(b.trg); + } + + void decrement(const Bigram& b) { + unigrams[b.cur_src()].decrement(b.trg); + } + + double prob(const Bigram& b) const { + const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); + return q0; + } + + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < unigrams.size(); ++i) { + const CCRP_OneTable& crp = unigrams[i]; + if (crp.num_customers() > 0) { + llh += crp.log_crp_prob(); + llh += crp.num_tables() * log(p0); + } + } + return llh; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < unigrams.size(); ++i) + unigrams[i].resample_hyperparameters(rng); + } + + vector > unigrams; // unigrams[src].prob(trg, p0) = p(trg|src) + + const double p0; +}; + +struct BigramModel { + BigramModel(size_t src_voc_size, size_t trg_voc_size) : + unigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), + p0(1.0 / trg_voc_size) {} + + void increment(const Bigram& b) { + BigramMap::iterator it = bigrams.find(b.ConditioningPair()); + if (it == bigrams.end()) { + it = bigrams.insert(make_pair(b.ConditioningPair(), CCRP_OneTable(1,1,1,1))).first; + } + if (it->second.increment(b.trg)) + unigrams[b.cur_src()].increment(b.trg); + } + + void decrement(const Bigram& b) { + BigramMap::iterator it = bigrams.find(b.ConditioningPair()); + assert(it != bigrams.end()); + if (it->second.decrement(b.trg)) { + unigrams[b.cur_src()].decrement(b.trg); + if (it->second.num_customers() == 0) + bigrams.erase(it); + } + } + + double prob(const Bigram& b) const { + const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); + const BigramMap::const_iterator it = bigrams.find(b.ConditioningPair()); + if (it == bigrams.end()) return q0; + return it->second.prob(b.trg, q0); + } + + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < unigrams.size(); ++i) { + const CCRP_OneTable& crp = unigrams[i]; + if (crp.num_customers() > 0) { + llh += crp.log_crp_prob(); + llh += crp.num_tables() * log(p0); + } + } + for (BigramMap::const_iterator it = bigrams.begin(); it != bigrams.end(); ++it) { + const CCRP_OneTable& crp = it->second; + const WordID cur_src = it->first.second; + llh += crp.log_crp_prob(); + for (CCRP_OneTable::const_iterator bit = crp.begin(); bit != crp.end(); ++bit) { + llh += log(unigrams[cur_src].prob(bit->second, p0)); + } + } + return llh; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < unigrams.size(); ++i) + unigrams[i].resample_hyperparameters(rng); + for (BigramMap::iterator it = bigrams.begin(); it != bigrams.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + typedef unordered_map, CCRP_OneTable, boost::hash > > BigramMap; + BigramMap bigrams; // bigrams[(src-1,src)].prob(trg, q0) = p(trg|src,src-1) + vector > unigrams; // unigrams[src].prob(trg, p0) = p(trg|src) + + const double p0; +}; + +struct BigramAlignmentModel { + BigramAlignmentModel(size_t src_voc_size, size_t trg_voc_size) : bigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), p0(1.0 / src_voc_size) {} + void increment(WordID prev, WordID next) { + bigrams[prev].increment(next); // hierarchy? + } + void decrement(WordID prev, WordID next) { + bigrams[prev].decrement(next); // hierarchy? + } + double prob(WordID prev, WordID next) { + return bigrams[prev].prob(next, p0); + } + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < bigrams.size(); ++i) { + const CCRP_OneTable& crp = bigrams[i]; + if (crp.num_customers() > 0) { + llh += crp.log_crp_prob(); + llh += crp.num_tables() * log(p0); + } + } + return llh; + } + + vector > bigrams; // bigrams[prev].prob(next, p0) = p(next|prev) + const double p0; +}; + +struct Alignment { + vector a; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + + boost::shared_ptr prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + const size_t corpus_len = corpusf.size(); + const WordID kNULL = TD::Convert(""); + const WordID kBOS = TD::Convert(""); + const WordID kEOS = TD::Convert(""); + Bigram TT(kBOS, TD::Convert("我"), TD::Convert("i")); + Bigram TT2(kBOS, TD::Convert("要"), TD::Convert("i")); + + UnigramModel model(vocabf.size(), vocabe.size()); + vector alignments(corpus_len); + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + alg.resize(trg.size()); + int lenp1 = src.size() + 1; + WordID prev_src = kBOS; + for (int j = 0; j < trg.size(); ++j) { + int samp = lenp1 * rng.next(); + --samp; + if (samp < 0) samp = 255; + alg[j] = samp; + WordID cur_src = (samp == 255 ? kNULL : src[alg[j]]); + Bigram b(prev_src, cur_src, trg[j]); + model.increment(b); + prev_src = cur_src; + } + Bigram b(prev_src, kEOS, kEOS); + model.increment(b); + } + cerr << "Initial LLH: " << model.LogLikelihood() << endl; + + SampleSet ss; + for (unsigned si = 0; si < 50; ++si) { + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + WordID prev_src = kBOS; + for (unsigned j = 0; j < trg.size(); ++j) { + unsigned char& a_j = alg[j]; + WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + Bigram b(prev_src, cur_e_a_j, trg[j]); + //cerr << "DEC: " << b << "\t" << nextb << endl; + model.decrement(b); + ss.clear(); + for (unsigned i = 0; i <= src.size(); ++i) { + const WordID cur_src = (i ? src[i-1] : kNULL); + b.cur_src() = cur_src; + ss.add(model.prob(b)); + } + int sampled_a_j = rng.SelectSample(ss); + a_j = (sampled_a_j ? sampled_a_j - 1 : 255); + cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + b.cur_src() = cur_e_a_j; + //cerr << "INC: " << b << "\t" << nextb << endl; + model.increment(b); + prev_src = cur_e_a_j; + } + } + cerr << '.' << flush; + if (si % 10 == 9) { + cerr << "[LLH prev=" << model.LogLikelihood(); + //model.ResampleHyperparameters(&rng); + cerr << " new=" << model.LogLikelihood() << "]\n"; + //pair xx = make_pair(kBOS, TD::Convert("我")); + //PrintTopCustomers(model.bigrams.find(xx)->second); + cerr << "p(" << TT << ") = " << model.prob(TT) << endl; + cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; + PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); + } + } + { + // MODEL 2 + BigramModel model(vocabf.size(), vocabe.size()); + BigramAlignmentModel amodel(vocabf.size(), vocabe.size()); + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + WordID prev_src = kBOS; + for (int j = 0; j < trg.size(); ++j) { + WordID cur_src = (alg[j] == 255 ? kNULL : src[alg[j]]); + Bigram b(prev_src, cur_src, trg[j]); + model.increment(b); + amodel.increment(prev_src, cur_src); + prev_src = cur_src; + } + amodel.increment(prev_src, kEOS); + Bigram b(prev_src, kEOS, kEOS); + model.increment(b); + } + cerr << "Initial LLH: " << model.LogLikelihood() << " " << amodel.LogLikelihood() << endl; + + SampleSet ss; + for (unsigned si = 0; si < samples; ++si) { + for (unsigned ci = 0; ci < corpus_len; ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + vector& alg = alignments[ci].a; + WordID prev_src = kBOS; + for (unsigned j = 0; j < trg.size(); ++j) { + unsigned char& a_j = alg[j]; + WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + Bigram b(prev_src, cur_e_a_j, trg[j]); + WordID next_src = kEOS; + WordID next_trg = kEOS; + if (j < (trg.size() - 1)) { + next_src = (alg[j+1] == 255 ? kNULL : src[alg[j + 1]]); + next_trg = trg[j + 1]; + } + Bigram nextb(cur_e_a_j, next_src, next_trg); + //cerr << "DEC: " << b << "\t" << nextb << endl; + model.decrement(b); + model.decrement(nextb); + amodel.decrement(prev_src, cur_e_a_j); + amodel.decrement(cur_e_a_j, next_src); + ss.clear(); + for (unsigned i = 0; i <= src.size(); ++i) { + const WordID cur_src = (i ? src[i-1] : kNULL); + b.cur_src() = cur_src; + ss.add(model.prob(b) * model.prob(nextb) * amodel.prob(prev_src, cur_src) * amodel.prob(cur_src, next_src)); + //cerr << log(ss[ss.size() - 1]) << "\t" << b << endl; + } + int sampled_a_j = rng.SelectSample(ss); + a_j = (sampled_a_j ? sampled_a_j - 1 : 255); + cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); + b.cur_src() = cur_e_a_j; + nextb.prev_src() = cur_e_a_j; + //cerr << "INC: " << b << "\t" << nextb << endl; + //exit(1); + model.increment(b); + model.increment(nextb); + amodel.increment(prev_src, cur_e_a_j); + amodel.increment(cur_e_a_j, next_src); + prev_src = cur_e_a_j; + } + } + cerr << '.' << flush; + if (si % 10 == 9) { + cerr << "[LLH prev=" << (model.LogLikelihood() + amodel.LogLikelihood()); + //model.ResampleHyperparameters(&rng); + cerr << " new=" << model.LogLikelihood() << "]\n"; + pair xx = make_pair(kBOS, TD::Convert("我")); + cerr << "p(" << TT << ") = " << model.prob(TT) << endl; + cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; + pair xx2 = make_pair(kBOS, TD::Convert("要")); + PrintTopCustomers(model.bigrams.find(xx)->second); + //PrintTopCustomers(amodel.bigrams[TD::Convert("")]); + //PrintTopCustomers(model.unigrams[TD::Convert("")]); + PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); + } + } + } + return 0; +} + diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am new file mode 100644 index 00000000..c9764ad5 --- /dev/null +++ b/gi/pf/Makefile.am @@ -0,0 +1,21 @@ +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive + +noinst_LIBRARIES = libpf.a +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc + +itg_SOURCES = itg.cc + +dpnaive_SOURCES = dpnaive.cc + +pfdist_SOURCES = pfdist.cc + +pfnaive_SOURCES = pfnaive.cc + +cbgi_SOURCES = cbgi.cc + +brat_SOURCES = brat.cc + +pfbrat_SOURCES = pfbrat.cc + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/README b/gi/pf/README new file mode 100644 index 00000000..62e47541 --- /dev/null +++ b/gi/pf/README @@ -0,0 +1,2 @@ +Experimental Bayesian alignment tools. Nothing to see here. + diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc new file mode 100644 index 00000000..f8ddfd32 --- /dev/null +++ b/gi/pf/base_measures.cc @@ -0,0 +1,112 @@ +#include "base_measures.h" + +#include + +#include "filelib.h" + +using namespace std; + +void Model1::LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; +} + +prob_t PhraseConditionalBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p *= ptrglen; + p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +JumpBase::JumpBase() : p(200) { + for (unsigned src_len = 1; src_len < 200; ++src_len) { + map& cpd = p[src_len]; + int min_jump = 1 - src_len; + int max_jump = src_len; + prob_t z; + for (int j = min_jump; j <= max_jump; ++j) { + prob_t& cp = cpd[j]; + if (j < 0) + cp.logeq(log_poisson(1.5-j, 1)); + else if (j > 0) + cp.logeq(log_poisson(j, 1)); + cp.poweq(0.2); + z += cp; + } + for (int j = min_jump; j <= max_jump; ++j) { + cpd[j] /= z; + } + } +} + diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h new file mode 100644 index 00000000..df17aa62 --- /dev/null +++ b/gi/pf/base_measures.h @@ -0,0 +1,116 @@ +#ifndef _BASE_MEASURES_H_ +#define _BASE_MEASURES_H_ + +#include +#include +#include +#include +#include + +#include "trule.h" +#include "prob.h" +#include "tdict.h" + +inline double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +struct Model1 { + explicit Model1(const std::string& fname) : + kNULL(TD::Convert("")), + kZERO() { + LoadModel1(fname); + } + + void LoadModel1(const std::string& fname); + + // returns prob 0 if src or trg is not found + const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const std::map& cpd = ttable[src]; + const std::map::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + std::vector > ttable; +}; + +struct PhraseConditionalBase { + explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase { + explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +// base distribution for jump size multinomials +// basically p(0) = 0 and then, p(1) is max, and then +// you drop as you move to the max jump distance +struct JumpBase { + JumpBase(); + + const prob_t& operator()(int jump, unsigned src_len) const { + assert(jump != 0); + const std::map::const_iterator it = p[src_len].find(jump); + assert(it != p[src_len].end()); + return it->second; + } + std::vector > p; +}; + + +#endif diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc new file mode 100644 index 00000000..4c6ba3ef --- /dev/null +++ b/gi/pf/brat.cc @@ -0,0 +1,554 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "cfg_wfst_composer.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +size_t hash_value(const TRule& r) { + size_t h = 2 - r.lhs_; + boost::hash_combine(h, boost::hash_value(r.e_)); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct ConditionalBase { + explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size), + kNULL(TD::Convert("")) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + LoadModel1(model1fname); + } + + void LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; + } + + // return logp0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + const int flen = rule.f_.size(); + const int elen = rule.e_.size(); + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = rule.e_[i]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? kNULL : rule.f_[j]; + const map::const_iterator it = ttable[src].find(trg); + if (it != ttable[src].end()) { + tp += kM1MIXTURE * it->second; + } + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + return p; + } + + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; + const WordID kNULL; + vector > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(3),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(3),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct UniphraseLM { + UniphraseLM(const vector >& corpus, + const set& vocab, + const po::variables_map& conf) : + phrases_(1,1), + gen_(1,1), + corpus_(corpus), + uniform_word_(1.0 / vocab.size()), + gen_p0_(0.5), + p_end_(0.5), + use_poisson_(conf.count("poisson_length") > 0) {} + + void ResampleHyperparameters(MT19937* rng) { + phrases_.resample_hyperparameters(rng); + gen_.resample_hyperparameters(rng); + cerr << " " << phrases_.concentration(); + } + + CCRP_NoTable > phrases_; + CCRP_NoTable gen_; + vector > z_; // z_[i] is there a phrase boundary after the ith word + const vector >& corpus_; + const double uniform_word_; + const double gen_p0_; + const double p_end_; // in base length distribution, p of the end of a phrase + const bool use_poisson_; +}; + +struct Reachability { + boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), + max_src_delta(boost::extents[srclen][trglen]) { + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + } + + private: + struct SState { + SState() : prev_src_covered(), prev_trg_covered() {} + SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} + int prev_src_covered; + int prev_trg_covered; + }; + + struct NState { + NState() : next_src_covered(), next_trg_covered() {} + NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} + int next_src_covered; + int next_trg_covered; + }; + + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { + typedef boost::multi_array, 2> array_type; + array_type a(boost::extents[srclen + 1][trglen + 1]); + a[0][0].push_back(SState()); + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (a[i][j].size() == 0) continue; + const SState prev(i,j); + for (int k = 1; k <= src_max_phrase_len; ++k) { + if ((i + k) > srclen) continue; + for (int l = 1; l <= trg_max_phrase_len; ++l) { + if ((j + l) > trglen) continue; + a[i + k][j + l].push_back(prev); + } + } + } + } + a[0][0].clear(); + cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + assert(a[srclen][trglen].size() > 0); + + typedef boost::multi_array rarray_type; + rarray_type r(boost::extents[srclen + 1][trglen + 1]); +// typedef boost::multi_array, 2> narray_type; +// narray_type b(boost::extents[srclen + 1][trglen + 1]); + r[srclen][trglen] = true; + for (int i = srclen; i >= 0; --i) { + for (int j = trglen; j >= 0; --j) { + vector& prevs = a[i][j]; + if (!r[i][j]) { prevs.clear(); } +// const NState nstate(i,j); + for (int k = 0; k < prevs.size(); ++k) { + r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; + int src_delta = i - prevs[k].prev_src_covered; + edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; + if (src_delta > msd) msd = src_delta; +// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); + } + } + } + assert(!edges[0][0][1][0]); + assert(!edges[0][0][0][1]); + assert(!edges[0][0][0][0]); + cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; + assert(max_src_delta[0][0] > 0); + //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; + //for (int i = 0; i < b[0][0].size(); ++i) { + // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; + //} + } +}; + +ostream& operator<<(ostream& os, const FSTState& q); +struct FSTState { + explicit FSTState(int src_size) : + trg_covered_(), + src_covered_(), + src_coverage_(src_size) {} + + FSTState(short trg_covered, short src_covered, const vector& src_coverage, const vector& src_prefix) : + trg_covered_(trg_covered), + src_covered_(src_covered), + src_coverage_(src_coverage), + src_prefix_(src_prefix) { + if (src_coverage_.size() == src_covered) { + assert(src_prefix.size() == 0); + } + } + + // if we extend by the word at src_position, what are + // the next states that are reachable and lie on a valid + // path to the final state? + vector Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { + assert(src_position < src_coverage_.size()); + if (src_coverage_[src_position]) { + cerr << "Trying to extend " << *this << " with position " << src_position << endl; + abort(); + } + vector ncvg = src_coverage_; + ncvg[src_position] = true; + + vector res; + const int trg_remaining = trg_len - trg_covered_; + if (trg_remaining <= 0) { + cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; + abort(); + } + const int src_remaining = src_len - src_covered_; + if (src_remaining <= 0) { + cerr << "Source appears to have been covered: " << *this << endl; + abort(); + } + + for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { + if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { + int nc = src_prefix_.size() + 1 + src_covered_; + res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector())); + } + } + + if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { + vector nsp = src_prefix_; + nsp.push_back(src_position); + res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); + } + + if (res.size() == 0) { + cerr << *this << " can't be extended!\n"; + abort(); + } + return res; + } + + short trg_covered_, src_covered_; + vector src_coverage_; + vector src_prefix_; +}; +bool operator<(const FSTState& q, const FSTState& r) { + if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; + if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; + if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; + return q.src_prefix_ < r.src_prefix_; +} + +ostream& operator<<(ostream& os, const FSTState& q) { + os << "[" << q.trg_covered_ << " : "; + for (int i = 0; i < q.src_coverage_.size(); ++i) + os << q.src_coverage_[i]; + os << " : <"; + for (int i = 0; i < q.src_prefix_.size(); ++i) { + if (i != 0) os << ' '; + os << q.src_prefix_[i]; + } + return os << ">]"; +} + +struct MyModel { + MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} + typedef unordered_map, CCRP_NoTable, boost::hash > > SrcToRuleCRPMap; + + void DecrementRule(const TRule& rule) { + SrcToRuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + it->second.decrement(rule); + if (it->second.num_customers() == 0) rules.erase(it); + } + + void IncrementRule(const TRule& rule) { + SrcToRuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) { + CCRP_NoTable crp(1,1); + it = rules.insert(make_pair(rule.f_, crp)).first; + } + it->second.increment(rule); + } + + // conditioned on rule.f_ + prob_t RuleConditionalProbability(const TRule& rule) const { + const prob_t base = rp0(rule); + SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) { + return base; + } else { + const double lp = it->second.logprob(rule, log(base)); + prob_t q; q.logeq(lp); + return q; + } + } + + const ConditionalBase& rp0; + SrcToRuleCRPMap rules; +}; + +struct MyFST : public WFST { + MyFST(const vector& ssrc, const vector& strg, MyModel* m) : + src(ssrc), trg(strg), + r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), + model(m) { + FSTState in(src.size()); + cerr << " INIT: " << in << endl; + init = GetNode(in); + for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; + in.src_covered_ = src.size(); + in.trg_covered_ = trg.size(); + cerr << "FINAL: " << in << endl; + final = GetNode(in); + } + virtual const WFSTNode* Final() const; + virtual const WFSTNode* Initial() const; + + const WFSTNode* GetNode(const FSTState& q); + map > m; + const vector& src; + const vector& trg; + Reachability r; + const WFSTNode* init; + const WFSTNode* final; + MyModel* model; +}; + +struct MyNode : public WFSTNode { + MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} + virtual vector > ExtendInput(unsigned srcindex) const; + const FSTState state; + mutable MyFST* container; +}; + +vector > MyNode::ExtendInput(unsigned srcindex) const { + cerr << "EXTEND " << state << " with " << srcindex << endl; + vector ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); + vector > res(ext.size()); + for (unsigned i = 0; i < ext.size(); ++i) { + res[i].first = container->GetNode(ext[i]); + if (ext[i].src_prefix_.size() == 0) { + const unsigned trg_from = state.trg_covered_; + const unsigned trg_to = ext[i].trg_covered_; + const unsigned prev_prfx_size = state.src_prefix_.size(); + res[i].second.reset(new TRule); + res[i].second->lhs_ = -TD::Convert("X"); + vector& src = res[i].second->f_; + vector& trg = res[i].second->e_; + src.resize(prev_prfx_size + 1); + for (unsigned j = 0; j < prev_prfx_size; ++j) + src[j] = container->src[state.src_prefix_[j]]; + src[prev_prfx_size] = container->src[srcindex]; + for (unsigned j = trg_from; j < trg_to; ++j) + trg.push_back(container->trg[j]); + res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); + } + } + return res; +} + +const WFSTNode* MyFST::GetNode(const FSTState& q) { + boost::shared_ptr& res = m[q]; + if (!res) { + res.reset(new MyNode(q, this)); + } + return &*res; +} + +const WFSTNode* MyFST::Final() const { + return final; +} + +const WFSTNode* MyFST::Initial() const { + return init; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + shared_ptr prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + ConditionalBase lp0(conf["model1_interpolation_weight"].as(), + vocabe.size(), + conf["model1"].as()); + MyModel m(lp0); + + TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); + m.IncrementRule(x); + TRule y("[X] ||| nY dyN ||| gave ||| 0"); + m.IncrementRule(y); + + + MyFST fst(corpusf[0], corpuse[0], &m); + ifstream in("./kimura.g"); + assert(in); + CFG_WFSTComposer comp(fst); + Hypergraph hg; + bool succeed = comp.Compose(&in, &hg); + hg.PrintGraphviz(); + if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } + +#if 0 + ifstream in2("./amnabooks.g"); + assert(in2); + MyFST fst2(corpusf[1], corpuse[1], &m); + CFG_WFSTComposer comp2(fst2); + Hypergraph hg2; + bool succeed2 = comp2.Compose(&in2, &hg2); + if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } +#endif + + SparseVector w; w.set_value(FD::Convert("Proposal"), 1.0); + hg.Reweight(w); + cerr << ViterbiFTree(hg) << endl; + return 0; +} + diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc new file mode 100644 index 00000000..20204e8a --- /dev/null +++ b/gi/pf/cbgi.cc @@ -0,0 +1,340 @@ +#include +#include +#include + +#include +#include + +#include "sampler.h" +#include "filelib.h" +#include "hg_io.h" +#include "hg.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "inside_outside.h" + +using namespace std; +using namespace std::tr1; + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +double log_decay(unsigned x, const double& b) { + assert(b > 1.0); + assert(x > 0); + return log(b - 1) - x * log(b); +} + +size_t hash_value(const TRule& r) { + // TODO fix hash function + size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_; + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +struct SimpleBase { + SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : + uniform_e(-log(esize)), + uniform_f(-log(fsize)), + uniform_nt(-log(ntsize)) { + } + + // binomial coefficient + static double choose(unsigned n, unsigned k) { + return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)); + } + + // count the number of patterns of terminals and NTs in the rule, given elen and flen + static double log_number_of_patterns(const unsigned flen, const unsigned elen) { + static vector > counts; + if (elen >= counts.size()) counts.resize(elen + 1); + if (flen >= counts[elen].size()) counts[elen].resize(flen + 1); + double& count = counts[elen][flen]; + if (count) return log(count); + const unsigned max_arity = min(elen, flen); + for (unsigned a = 0; a <= max_arity; ++a) + count += choose(elen, a) * choose(flen, a); + return log(count); + } + + // return logp0 of rule | LHS + double operator()(const TRule& rule) const { + const unsigned flen = rule.f_.size(); + const unsigned elen = rule.e_.size(); +#if 0 + double p = 0; + p += log_poisson(flen, 0.5); // flen ~Pois(0.5) + p += log_poisson(elen, flen); // elen | flen ~Pois(flen) + p -= log_number_of_patterns(flen, elen); // pattern | flen,elen ~Uniform + for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS + if (rule.f_[i] <= 0) // according to pattern + p += uniform_nt; // draw NT ~Uniform + else + p += uniform_f; // draw f terminal ~Uniform + } + p -= lgamma(rule.Arity() + 1); // draw permutation ~Uniform + for (unsigned i = 0; i < elen; ++i) { // for each position in e-RHS + if (rule.e_[i] > 0) // according to pattern + p += uniform_e; // draw e|f term ~Uniform + // TODO this should prob be model 1 + } +#else + double p = 0; + bool is_abstract = rule.f_[0] <= 0; + p += log(0.5); + if (is_abstract) { + if (flen == 2) p += log(0.99); else p += log(0.01); + } else { + p += log_decay(flen, 3); + } + + for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS + if (rule.f_[i] <= 0) // according to pattern + p += uniform_nt; // draw NT ~Uniform + else + p += uniform_f; // draw f terminal ~Uniform + } +#endif + return p; + } + const double uniform_e; + const double uniform_f; + const double uniform_nt; + vector arities; +}; + +MT19937* rng = NULL; + +template +struct MHSamplerEdgeProb { + MHSamplerEdgeProb(const Hypergraph& hg, + const map >& rdp, + const Base& logp0, + const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) { + for (int i = 0; i < edge_probs.size(); ++i) { + const TRule& rule = *hg.edges_[i].rule_; + const map >::const_iterator it = rdp.find(rule.lhs_); + assert(it != rdp.end()); + const CCRP_NoTable& crp = it->second; + edge_probs[i].logeq(crp.logprob(rule, logp0(rule))); + if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1) + edge_probs[i] = prob_t::Zero(); + } + } + inline prob_t operator()(const Hypergraph::Edge& e) const { + return edge_probs[e.id_]; + } + prob_t DerivationProb(const vector& d) const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < d.size(); ++i) + p *= edge_probs[d[i]]; + return p; + } + vector edge_probs; +}; + +template +struct ModelAndData { + ModelAndData() : + base_lh(prob_t::One()), + logp0(10000, 10000), + mh_samples(), + mh_rejects() {} + + void SampleCorpus(const string& hgpath, int i); + void ResampleHyperparameters() { + for (map >::iterator it = rules.begin(); it != rules.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + CCRP_NoTable& RuleCRP(int lhs) { + map >::iterator it = rules.find(lhs); + if (it == rules.end()) { + rules.insert(make_pair(lhs, CCRP_NoTable(1,1))); + it = rules.find(lhs); + } + return it->second; + } + + void IncrementRule(const TRule& rule) { + CCRP_NoTable& crp = RuleCRP(rule.lhs_); + if (crp.increment(rule)) { + prob_t p; p.logeq(logp0(rule)); + base_lh *= p; + } + } + + void DecrementRule(const TRule& rule) { + CCRP_NoTable& crp = RuleCRP(rule.lhs_); + if (crp.decrement(rule)) { + prob_t p; p.logeq(logp0(rule)); + base_lh /= p; + } + } + + void DecrementDerivation(const Hypergraph& hg, const vector& d) { + for (unsigned i = 0; i < d.size(); ++i) { + const TRule& rule = *hg.edges_[d[i]].rule_; + DecrementRule(rule); + } + } + + void IncrementDerivation(const Hypergraph& hg, const vector& d) { + for (unsigned i = 0; i < d.size(); ++i) { + const TRule& rule = *hg.edges_[d[i]].rule_; + IncrementRule(rule); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (map >::const_iterator it = rules.begin(); it != rules.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + p *= base_lh; + return p; + } + + void ResampleDerivation(const Hypergraph& hg, vector* sampled_derivation); + + map > rules; // [lhs] -> distribution over RHSs + prob_t base_lh; + SimpleBase logp0; + vector > samples; // sampled derivations + unsigned int mh_samples; + unsigned int mh_rejects; +}; + +template +void ModelAndData::SampleCorpus(const string& hgpath, int n) { + vector hgs(n); hgs.clear(); + boost::unordered_map acc; + map tot; + for (int i = 0; i < n; ++i) { + ostringstream os; + os << hgpath << '/' << i << ".json.gz"; + if (!FileExists(os.str())) continue; + hgs.push_back(Hypergraph()); + ReadFile rf(os.str()); + HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back()); + } + cerr << "Read " << hgs.size() << " alignment hypergraphs.\n"; + samples.resize(hgs.size()); + const unsigned SAMPLES = 2000; + const unsigned burnin = 3 * SAMPLES / 4; + const unsigned every = 20; + for (unsigned s = 0; s < SAMPLES; ++s) { + if (s % 10 == 0) { + if (s > 0) { cerr << endl; ResampleHyperparameters(); } + cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] "; + } + cerr << '.'; + for (unsigned i = 0; i < hgs.size(); ++i) { + ResampleDerivation(hgs[i], &samples[i]); + if (s > burnin && s % every == 0) { + for (unsigned j = 0; j < samples[i].size(); ++j) { + const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_; + ++acc[rule]; + ++tot[rule.lhs_]; + } + } + } + } + cerr << endl; + for (boost::unordered_map::iterator it = acc.begin(); it != acc.end(); ++it) { + cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl; + } +} + +template +void ModelAndData::ResampleDerivation(const Hypergraph& hg, vector* sampled_deriv) { + vector cur; + cur.swap(*sampled_deriv); + + const prob_t p_cur = Likelihood(); + DecrementDerivation(hg, cur); + if (cur.empty()) { + // first iteration, create restaurants + for (int i = 0; i < hg.edges_.size(); ++i) + RuleCRP(hg.edges_[i].rule_->lhs_); + } + MHSamplerEdgeProb wf(hg, rules, logp0, cur.empty()); +// MHSamplerEdgeProb wf(hg, rules, logp0, false); + const prob_t q_cur = wf.DerivationProb(cur); + vector node_probs; + Inside >(hg, &node_probs, wf); + queue q; + q.push(hg.nodes_.size() - 3); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = wf.edge_probs[edge.id_]; // edge proposal prob + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + IncrementDerivation(hg, *sampled_deriv); + +// cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n"; +// cerr << "DERIV:\n"; +// for (int i = 0; i < sampled_deriv->size(); ++i) { +// cerr << " " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl; +// } + + if (cur.empty()) return; // accept first sample + + ++mh_samples; + // only need to do MH if proposal is different to current state + if (cur != *sampled_deriv) { + const prob_t q_prop = wf.DerivationProb(*sampled_deriv); + const prob_t p_prop = Likelihood(); + if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) { + ++mh_rejects; + DecrementDerivation(hg, *sampled_deriv); + IncrementDerivation(hg, cur); + swap(cur, *sampled_deriv); + } + } +} + +int main(int argc, char** argv) { + rng = new MT19937; + ModelAndData m; + m.SampleCorpus("./hgs", 50); + // m.SampleCorpus("./btec/hgs", 5000); + return 0; +} + diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc new file mode 100644 index 00000000..a31b5be8 --- /dev/null +++ b/gi/pf/cfg_wfst_composer.cc @@ -0,0 +1,730 @@ +#include "cfg_wfst_composer.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "fast_lexical_cast.hpp" + +#include "phrasetable_fst.h" +#include "sparse_vector.h" +#include "tdict.h" +#include "hg.h" + +using boost::shared_ptr; +namespace po = boost::program_options; +using namespace std; +using namespace std::tr1; + +WFSTNode::~WFSTNode() {} +WFST::~WFST() {} + +// Define the following macro if you want to see lots of debugging output +// when you run the chart parser +#undef DEBUG_CHART_PARSER + +// A few constants used by the chart parser /////////////// +static const int kMAX_NODES = 2000000; +static const string kPHRASE_STRING = "X"; +static bool constants_need_init = true; +static WordID kUNIQUE_START; +static WordID kPHRASE; +static TRulePtr kX1X2; +static TRulePtr kX1; +static WordID kEPS; +static TRulePtr kEPSRule; + +static void InitializeConstants() { + if (constants_need_init) { + kPHRASE = TD::Convert(kPHRASE_STRING) * -1; + kUNIQUE_START = TD::Convert("S") * -1; + kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); + kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); + kEPSRule.reset(new TRule("[X] ||| ||| ")); + kEPS = TD::Convert(""); + constants_need_init = false; + } +} +//////////////////////////////////////////////////////////// + +class EGrammarNode { + friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); + friend void AddGrammarRule(const string& r, map* g); + public: +#ifdef DEBUG_CHART_PARSER + string hint; +#endif + EGrammarNode() : is_some_rule_complete(false), is_root(false) {} + const map& GetTerminals() const { return tptr; } + const map& GetNonTerminals() const { return ntptr; } + bool HasNonTerminals() const { return (!ntptr.empty()); } + bool HasTerminals() const { return (!tptr.empty()); } + bool RuleCompletes() const { + return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); + } + bool GrammarContinues() const { + return !(ntptr.empty() && tptr.empty()); + } + bool IsRoot() const { + return is_root; + } + // these are the features associated with the rule from the start + // node up to this point. If you use these features, you must + // not Extend() this rule. + const SparseVector& GetCFGProductionFeatures() const { + return input_features; + } + + const EGrammarNode* Extend(const WordID& t) const { + if (t < 0) { + map::const_iterator it = ntptr.find(t); + if (it == ntptr.end()) return NULL; + return &it->second; + } else { + map::const_iterator it = tptr.find(t); + if (it == tptr.end()) return NULL; + return &it->second; + } + } + + private: + map tptr; + map ntptr; + SparseVector input_features; + bool is_some_rule_complete; + bool is_root; +}; +typedef map EGrammar; // indexed by the rule LHS + +// edges are immutable once created +struct Edge { +#ifdef DEBUG_CHART_PARSER + static int id_count; + const int id; +#endif + const WordID cat; // lhs side of rule proved/being proved + const EGrammarNode* const dot; // dot position + const WFSTNode* const q; // start of span + const WFSTNode* const r; // end of span + const Edge* const active_parent; // back pointer, NULL for PREDICT items + const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items + TRulePtr tps; // translations + shared_ptr > features; // features from CFG rule + + bool IsPassive() const { + // when a rule is completed, this value will be set + return static_cast(features); + } + bool IsActive() const { return !IsPassive(); } + bool IsInitial() const { + return !(active_parent || passive_parent); + } + bool IsCreatedByScan() const { + return active_parent && !passive_parent && !dot->IsRoot(); + } + bool IsCreatedByPredict() const { + return dot->IsRoot(); + } + bool IsCreatedByComplete() const { + return active_parent && passive_parent; + } + + // constructor for PREDICT + Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {} + Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {} + + // constructors for SCAN + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const TRulePtr& translations) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} + + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const TRulePtr& translations, + const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), + features(new SparseVector(feats)) {} + + // constructors for COMPLETE + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const Edge *pas_par) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() { + assert(pas_par->IsPassive()); + assert(act_par->IsActive()); + } + + Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, + const Edge* act_par, const Edge *pas_par, const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(), + features(new SparseVector(feats)) { + assert(pas_par->IsPassive()); + assert(act_par->IsActive()); + } + + // constructor for COMPLETE query + Edge(const WFSTNode* _r) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(NULL), + r(_r), active_parent(NULL), passive_parent(NULL), tps() {} + // constructor for MERGE quere + Edge(const WFSTNode* _q, int) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(_q), + r(NULL), active_parent(NULL), passive_parent(NULL), tps() {} +}; +#ifdef DEBUG_CHART_PARSER +int Edge::id_count = 0; +#endif + +ostream& operator<<(ostream& os, const Edge& e) { + string type = "PREDICT"; + if (e.IsCreatedByScan()) + type = "SCAN"; + else if (e.IsCreatedByComplete()) + type = "COMPLETE"; + os << "[" +#ifdef DEBUG_CHART_PARSER + << '(' << e.id << ") " +#else + << '(' << &e << ") " +#endif + << "q=" << e.q << ", r=" << e.r + << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" + << e.dot +#ifdef DEBUG_CHART_PARSER + << e.dot->hint +#endif + << (e.IsActive() ? ", Active" : ", Passive") + << ", " << type; +#ifdef DEBUG_CHART_PARSER + if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } + if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } +#endif + if (e.tps) { os << ", tps=" << e.tps->AsString(); } + return os << ']'; +} + +struct Traversal { + const Edge* const edge; // result from the active / passive combination + const Edge* const active; + const Edge* const passive; + Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} +}; + +struct UniqueTraversalHash { + size_t operator()(const Traversal* t) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(t->active); + x = ((x << 5) + x) ^ reinterpret_cast(t->passive); + x = ((x << 5) + x) ^ t->edge->IsActive(); + return x; + } +}; + +struct UniqueTraversalEquals { + size_t operator()(const Traversal* a, const Traversal* b) const { + return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); + } +}; + +struct UniqueEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + if (e->IsActive()) { + x = ((x << 5) + x) ^ reinterpret_cast(e->dot); + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + x += 13; + } else { // with passive edges, we don't care about the dot + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + } + return x; + } +}; + +struct UniqueEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + if (a->IsActive() != b->IsActive()) return false; + if (a->IsActive()) { + return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); + } else { + return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); + } + } +}; + +struct REdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + return x; + } +}; + +struct REdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->r == b->r); + } +}; + +struct QEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + return x; + } +}; + +struct QEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->q == b->q); + } +}; + +struct EdgeQueue { + queue q; + EdgeQueue() {} + void clear() { while(!q.empty()) q.pop(); } + bool HasWork() const { return !q.empty(); } + const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } + void AddEdge(const Edge* s) { q.push(s); } +}; + +class CFG_WFSTComposerImpl { + public: + CFG_WFSTComposerImpl(WordID start_cat, + const WFSTNode* q_0, + const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {} + + // returns false if the intersection is empty + bool Compose(const EGrammar& g, Hypergraph* forest) { + goal_node = NULL; + EGrammar::const_iterator sit = g.find(start_cat_); + forest->ReserveNodes(kMAX_NODES); + assert(sit != g.end()); + Edge* init = new Edge(start_cat_, &sit->second, q_0_); + assert(IncorporateNewEdge(init)); + while (exp_agenda.HasWork() || agenda.HasWork()) { + while(exp_agenda.HasWork()) { + const Edge* edge = exp_agenda.Next(); + FinishEdge(edge, forest); + } + if (agenda.HasWork()) { + const Edge* edge = agenda.Next(); +#ifdef DEBUG_CHART_PARSER + cerr << "processing (" << edge->id << ')' << endl; +#endif + if (edge->IsActive()) { + if (edge->dot->HasTerminals()) + DoScan(edge); + if (edge->dot->HasNonTerminals()) { + DoMergeWithPassives(edge); + DoPredict(edge, g); + } + } else { + DoComplete(edge); + } + } + } + if (goal_node) { + forest->PruneUnreachable(goal_node->id_); + forest->EpsilonRemove(kEPS); + } + FreeAll(); + return goal_node; + } + + void FreeAll() { + for (int i = 0; i < free_list_.size(); ++i) + delete free_list_[i]; + free_list_.clear(); + for (int i = 0; i < traversal_free_list_.size(); ++i) + delete traversal_free_list_[i]; + traversal_free_list_.clear(); + all_traversals.clear(); + exp_agenda.clear(); + agenda.clear(); + tps2node.clear(); + edge2node.clear(); + all_edges.clear(); + passive_edges.clear(); + active_edges.clear(); + } + + ~CFG_WFSTComposerImpl() { + FreeAll(); + } + + // returns the total number of edges created during composition + int EdgesCreated() const { + return free_list_.size(); + } + + private: + void DoScan(const Edge* edge) { + // here, we assume that the FST will potentially have many more outgoing + // edges than the grammar, which will be just a couple. If you want to + // efficiently handle the case where both are relatively large, this code + // will need to change how the intersection is done. The best general + // solution would probably be the Baeza-Yates double binary search. + + const EGrammarNode* dot = edge->dot; + const WFSTNode* r = edge->r; + const map& terms = dot->GetTerminals(); + for (map::const_iterator git = terms.begin(); + git != terms.end(); ++git) { + + if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) { + std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl; + abort(); + } + std::vector > extensions = r->ExtendInput(atoi(TD::Convert(git->first))); + for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) { + const WFSTNode* next_r = extensions[nsi].first; + const EGrammarNode* next_dot = &git->second; + const bool grammar_continues = next_dot->GrammarContinues(); + const bool rule_completes = next_dot->RuleCompletes(); + if (extensions[nsi].second) + cerr << "!!! " << extensions[nsi].second->AsString() << endl; + // cerr << " rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl; + assert(grammar_continues || rule_completes); + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + if (rule_completes) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features)); + if (grammar_continues) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second)); + } + } + } + + void DoPredict(const Edge* edge, const EGrammar& g) { + const EGrammarNode* dot = edge->dot; + const map& non_terms = dot->GetNonTerminals(); + for (map::const_iterator git = non_terms.begin(); + git != non_terms.end(); ++git) { + const WordID nt_to_predict = git->first; + //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; + EGrammar::const_iterator egi = g.find(nt_to_predict); + if (egi == g.end()) { + cerr << "[ERROR] Can't find any grammar rules with a LHS of type " + << TD::Convert(-1*nt_to_predict) << '!' << endl; + continue; + } + assert(edge->IsActive()); + const EGrammarNode* new_dot = &egi->second; + Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); + IncorporateNewEdge(new_edge); + } + } + + void DoComplete(const Edge* passive) { +#ifdef DEBUG_CHART_PARSER + cerr << " complete: " << *passive << endl; +#endif + const WordID completed_nt = passive->cat; + const WFSTNode* q = passive->q; + const WFSTNode* next_r = passive->r; + const Edge query(q); + const pair::iterator, + unordered_multiset::iterator > p = + active_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* active = *it; +#ifdef DEBUG_CHART_PARSER + cerr << " pos: " << *active << endl; +#endif + const EGrammarNode* next_dot = active->dot->Extend(completed_nt); + if (!next_dot) continue; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + // add up to 2 rules + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + void DoMergeWithPassives(const Edge* active) { + // edge is active, has non-terminals, we need to find the passives that can extend it + assert(active->IsActive()); + assert(active->dot->HasNonTerminals()); +#ifdef DEBUG_CHART_PARSER + cerr << " merge active with passives: ACT=" << *active << endl; +#endif + const Edge query(active->r, 1); + const pair::iterator, + unordered_multiset::iterator > p = + passive_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* passive = *it; + const EGrammarNode* next_dot = active->dot->Extend(passive->cat); + if (!next_dot) continue; + const WFSTNode* next_r = passive->r; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + // take ownership of edge memory, add to various indexes, etc + // returns true if this edge is new + bool IncorporateNewEdge(Edge* edge) { + free_list_.push_back(edge); + if (edge->passive_parent && edge->active_parent) { + Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); + traversal_free_list_.push_back(t); + if (all_traversals.find(t) != all_traversals.end()) { + return false; + } else { + all_traversals.insert(t); + } + } + exp_agenda.AddEdge(edge); + return true; + } + + bool FinishEdge(const Edge* edge, Hypergraph* hg) { + bool is_new = false; + if (all_edges.find(edge) == all_edges.end()) { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NEW\n"; +#endif + all_edges.insert(edge); + is_new = true; + if (edge->IsPassive()) passive_edges.insert(edge); + if (edge->IsActive()) active_edges.insert(edge); + agenda.AddEdge(edge); + } else { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NOT NEW.\n"; +#endif + } + AddEdgeToTranslationForest(edge, hg); + return is_new; + } + + // build the translation forest + void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { + assert(hg->nodes_.size() < kMAX_NODES); + Hypergraph::Node* tps = NULL; + // first add any target language rules + if (edge->tps) { + Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()]; + if (!node) { + // cerr << "Creating phrases for " << edge->tps << endl; + const TRulePtr& rule = edge->tps; + node = hg->AddNode(kPHRASE); + Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector()); + hg_edge->feature_values_ += rule->GetFeatureValues(); + hg->ConnectEdgeToHeadNode(hg_edge, node); + } + tps = node; + } + Hypergraph::Node*& head_node = edge2node[edge]; + if (!head_node) + head_node = hg->AddNode(kPHRASE); + if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) { + assert(goal_node == NULL || goal_node == head_node); + goal_node = head_node; + } + Hypergraph::TailNodeVector tail; + SparseVector extra; + if (edge->IsCreatedByPredict()) { + // extra.set_value(FD::Convert("predict"), 1); + } else if (edge->IsCreatedByScan()) { + tail.push_back(edge2node[edge->active_parent]->id_); + if (tps) { + tail.push_back(tps->id_); + } + //extra.set_value(FD::Convert("scan"), 1); + } else if (edge->IsCreatedByComplete()) { + tail.push_back(edge2node[edge->active_parent]->id_); + tail.push_back(edge2node[edge->passive_parent]->id_); + //extra.set_value(FD::Convert("complete"), 1); + } else { + assert(!"unexpected edge type!"); + } + //cerr << head_node->id_ << "<--" << *edge << endl; + +#ifdef DEBUG_CHART_PARSER + for (int i = 0; i < tail.size(); ++i) + if (tail[i] == head_node->id_) { + cerr << "ERROR: " << *edge << "\n i=" << i << endl; + if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } + if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } + assert(!"self-loop found!"); + } +#endif + Hypergraph::Edge* hg_edge = NULL; + if (tail.size() == 0) { + hg_edge = hg->AddEdge(kEPSRule, tail); + } else if (tail.size() == 1) { + hg_edge = hg->AddEdge(kX1, tail); + } else if (tail.size() == 2) { + hg_edge = hg->AddEdge(kX1X2, tail); + } + if (edge->features) + hg_edge->feature_values_ += *edge->features; + hg_edge->feature_values_ += extra; + hg->ConnectEdgeToHeadNode(hg_edge, head_node); + } + + Hypergraph::Node* goal_node; + EdgeQueue exp_agenda; + EdgeQueue agenda; + unordered_map tps2node; + unordered_map edge2node; + unordered_set all_traversals; + unordered_set all_edges; + unordered_multiset passive_edges; + unordered_multiset active_edges; + vector free_list_; + vector traversal_free_list_; + const WordID start_cat_; + const WFSTNode* const q_0_; + const WFSTNode* const q_final_; +}; + +#ifdef DEBUG_CHART_PARSER +static string TrimRule(const string& r) { + size_t start = r.find(" |||") + 5; + size_t end = r.rfind(" |||"); + return r.substr(start, end - start); +} +#endif + +void AddGrammarRule(const string& r, EGrammar* g) { + const size_t pos = r.find(" ||| "); + if (pos == string::npos || r[0] != '[') { + cerr << "Bad rule: " << r << endl; + return; + } + const size_t rpos = r.rfind(" ||| "); + string feats; + string rs = r; + if (rpos != pos) { + feats = r.substr(rpos + 5); + rs = r.substr(0, rpos); + } + string rhs = rs.substr(pos + 5); + string trule = rs + " ||| " + rhs + " ||| " + feats; + TRule tr(trule); + cerr << "X: " << tr.e_[0] << endl; +#ifdef DEBUG_CHART_PARSER + string hint_last_rule; +#endif + EGrammarNode* cur = &(*g)[tr.GetLHS()]; + cur->is_root = true; + for (int i = 0; i < tr.FLength(); ++i) { + WordID sym = tr.f()[i]; +#ifdef DEBUG_CHART_PARSER + hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); + cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); +#endif + if (sym < 0) + cur = &cur->ntptr[sym]; + else + cur = &cur->tptr[sym]; + } +#ifdef DEBUG_CHART_PARSER + cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); +#endif + cur->is_some_rule_complete = true; + cur->input_features = tr.GetFeatureValues(); +} + +CFG_WFSTComposer::~CFG_WFSTComposer() { + delete pimpl_; +} + +CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) { + InitializeConstants(); + pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final()); +} + +bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { + // first, convert the src forest into an EGrammar + EGrammar g; + const int nedges = src_forest.edges_.size(); + const int nnodes = src_forest.nodes_.size(); + vector cats(nnodes); + bool assign_cats = false; + for (int i = 0; i < nnodes; ++i) + if (assign_cats) { + cats[i] = TD::Convert("CAT_" + boost::lexical_cast(i)) * -1; + } else { + cats[i] = src_forest.nodes_[i].cat_; + } + // construct the grammar + for (int i = 0; i < nedges; ++i) { + const Hypergraph::Edge& edge = src_forest.edges_[i]; + const vector& src = edge.rule_->f(); + EGrammarNode* cur = &g[cats[edge.head_node_]]; + cur->is_root = true; + int ntc = 0; + for (int j = 0; j < src.size(); ++j) { + WordID sym = src[j]; + if (sym <= 0) { + sym = cats[edge.tail_nodes_[ntc]]; + ++ntc; + cur = &cur->ntptr[sym]; + } else { + cur = &cur->tptr[sym]; + } + } + cur->is_some_rule_complete = true; + cur->input_features = edge.feature_values_; + } + EGrammarNode& goal_rule = g[kUNIQUE_START]; + assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || + (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); + + return pimpl_->Compose(g, trg_forest); +} + +bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) { + EGrammar g; + while(*in) { + string line; + getline(*in, line); + if (line.empty()) continue; + AddGrammarRule(line, &g); + } + + return pimpl_->Compose(g, trg_forest); +} diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h new file mode 100644 index 00000000..cf47f459 --- /dev/null +++ b/gi/pf/cfg_wfst_composer.h @@ -0,0 +1,46 @@ +#ifndef _CFG_WFST_COMPOSER_H_ +#define _CFG_WFST_COMPOSER_H_ + +#include +#include +#include + +#include "trule.h" +#include "wordid.h" + +class CFG_WFSTComposerImpl; +class Hypergraph; + +struct WFSTNode { + virtual ~WFSTNode(); + // returns the next states reachable by consuming srcindex (which identifies a word) + // paired with the output string generated by taking that transition. + virtual std::vector > ExtendInput(unsigned srcindex) const = 0; +}; + +struct WFST { + virtual ~WFST(); + virtual const WFSTNode* Final() const = 0; + virtual const WFSTNode* Initial() const = 0; +}; + +class CFG_WFSTComposer { + public: + ~CFG_WFSTComposer(); + explicit CFG_WFSTComposer(const WFST& wfst); + bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest); + + // reads the grammar from a file. There must be a single top-level + // S -> X rule. Anything else is possible. Format is: + // [S] ||| [SS,1] + // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 + // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 + // [NP] ||| [DET,1] [N,2] ||| Feature3=2 + // ... + bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); + + private: + CFG_WFSTComposerImpl* pimpl_; +}; + +#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc new file mode 100644 index 00000000..582d1be7 --- /dev/null +++ b/gi/pf/dpnaive.cc @@ -0,0 +1,349 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +size_t hash_value(const TRule& r) { + size_t h = 2 - r.lhs_; + boost::hash_combine(h, boost::hash_value(r.e_)); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(4),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(4),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_e, + set* vocab_f) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +shared_ptr prng; + +template +struct ModelAndData { + explicit ModelAndData(const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + rng(&*prng), + p0(b), + baseprob(prob_t::One()), + corpuse(ce), + corpusf(cf), + vocabe(ve), + vocabf(vf), + rules(1,1), + mh_samples(), + mh_rejects(), + kX(-TD::Convert("X")), + derivations(corpuse.size()) {} + + void ResampleHyperparameters() { + rules.resample_hyperparameters(&*prng); + } + + void InstantiateRule(const pair& from, + const pair& to, + const vector& sentf, + const vector& sente, + TRule* rule) const { + rule->f_.clear(); + rule->e_.clear(); + rule->lhs_ = kX; + for (short i = from.first; i < to.first; ++i) + rule->f_.push_back(sentf[i]); + for (short i = from.second; i < to.second; ++i) + rule->e_.push_back(sente[i]); + } + + void DecrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + //cerr << "REMOVE: " << x.AsString() << endl; + if (rules.decrement(x)) { + baseprob /= p0(x); + //cerr << " (REMOVED ONLY INSTANCE)\n"; + } + } + } + + void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + cerr << i << '/' << (d.size() - 1) << ": " << x << endl; + } + } + + void IncrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + if (rules.increment(x)) { + baseprob *= p0(x); + } + } + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(rules.log_crp_prob()); + return p * baseprob; + } + + prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { + prob_t p = prob_t::One(); + if (d.size() < 2) return p; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + prob_t rp; rp.logeq(rules.logprob(x, log(p0(x)))); + p *= rp; + } + return p; + } + + void Sample(); + + MT19937* rng; + const Base& p0; + prob_t baseprob; // cached value of generating the table table labels from p0 + // this can't be used if we go to a hierarchical prior! + const vector >& corpuse, corpusf; + const set& vocabe, vocabf; + CCRP_NoTable rules; + unsigned mh_samples, mh_rejects; + const int kX; + vector > > derivations; +}; + +template +void ModelAndData::Sample() { + unsigned MAXK = 4; + unsigned MAXL = 4; + TRule x; + x.lhs_ = -TD::Convert("X"); + for (int samples = 0; samples < 1000; ++samples) { + if (samples % 1 == 0 && samples > 0) { + //ResampleHyperparameters(); + cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; + for (int i = 0; i < 10; ++i) { + cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; + PrintDerivation(derivations[i], corpusf[i], corpuse[i]); + } + } + cerr << '.' << flush; + for (int s = 0; s < corpuse.size(); ++s) { + const vector& sentf = corpusf[s]; + const vector& sente = corpuse[s]; +// cerr << " CUSTOMERS: " << rules.num_customers() << endl; +// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + + vector >& deriv = derivations[s]; + const prob_t p_cur = Likelihood(); + DecrementDerivation(deriv, sentf, sente); + + boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); + boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); + a[0][0] = prob_t::One(); + for (int i = 0; i < sentf.size(); ++i) { + for (int j = 0; j < sente.size(); ++j) { + const prob_t src_a = a[i][j]; + x.f_.clear(); + for (int k = 1; k <= MAXK; ++k) { + if (i + k > sentf.size()) break; + x.f_.push_back(sentf[i + k - 1]); + x.e_.clear(); + for (int l = 1; l <= MAXL; ++l) { + if (j + l > sente.size()) break; + x.e_.push_back(sente[j + l - 1]); + trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x)))); + a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; + } + } + } + } +// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; + const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + + vector > newderiv; + int cur_i = sentf.size(); + int cur_j = sente.size(); + while(cur_i > 0 && cur_j > 0) { + newderiv.push_back(pair(cur_i, cur_j)); +// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; + SampleSet ss; + vector > nexts; + for (int k = 1; k <= MAXK; ++k) { + const int hyp_i = cur_i - k; + if (hyp_i < 0) break; + for (int l = 1; l <= MAXL; ++l) { + const int hyp_j = cur_j - l; + if (hyp_j < 0) break; + const prob_t& inside = a[hyp_i][hyp_j]; + if (inside == prob_t::Zero()) continue; + const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; + if (transp == prob_t::Zero()) continue; + const prob_t p = inside * transp; + ss.add(p); + nexts.push_back(pair(hyp_i, hyp_j)); +// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl; + } + } +// cerr << " sample set has " << nexts.size() << " elements.\n"; + const int selected = rng->SelectSample(ss); + cur_i = nexts[selected].first; + cur_j = nexts[selected].second; + } + newderiv.push_back(pair(0,0)); + const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); + IncrementDerivation(newderiv, sentf, sente); +// cerr << "SANITY: " << q_new << " " <(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + Model1 m1(conf["model1"].as()); + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + + ModelAndData posterior(lp0, corpuse, corpusf, vocabe, vocabf); + posterior.Sample(); + + return 0; +} + diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc new file mode 100644 index 00000000..2c2a86f9 --- /dev/null +++ b/gi/pf/itg.cc @@ -0,0 +1,224 @@ +#include +#include +#include + +#include +#include +#include + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +ostream& operator<<(ostream& os, const vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct Model1 { + explicit Model1(const string& fname) : + kNULL(TD::Convert("")), + kZERO() { + LoadModel1(fname); + } + + void LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; + } + + // returns prob 0 if src or trg is not found! + const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const map& cpd = ttable[src]; + const map::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + vector > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(25),"Number of particles") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(7),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(7),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + shared_ptr prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + for (int si = 0; si < conf["samples"].as(); ++si) { + cerr << '.' << flush; + for (int ci = 0; ci < corpusf.size(); ++ci) { + const vector& src = corpusf[ci]; + const vector& trg = corpuse[ci]; + for (int i = 0; i < src.size(); ++i) { + for (int j = 0; j < trg.size(); ++j) { + const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); + for (int k = 0; k < eff_max_src; ++k) { + const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); + for (int l = 0; l < eff_max_trg; ++l) { + } + } + } + } + } + } +} + diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc new file mode 100644 index 00000000..4c6ba3ef --- /dev/null +++ b/gi/pf/pfbrat.cc @@ -0,0 +1,554 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "cfg_wfst_composer.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +size_t hash_value(const TRule& r) { + size_t h = 2 - r.lhs_; + boost::hash_combine(h, boost::hash_value(r.e_)); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct ConditionalBase { + explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size), + kNULL(TD::Convert("")) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + LoadModel1(model1fname); + } + + void LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; + } + + // return logp0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + const int flen = rule.f_.size(); + const int elen = rule.e_.size(); + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = rule.e_[i]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? kNULL : rule.f_[j]; + const map::const_iterator it = ttable[src].find(trg); + if (it != ttable[src].end()) { + tp += kM1MIXTURE * it->second; + } + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + return p; + } + + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; + const WordID kNULL; + vector > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(3),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(3),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct UniphraseLM { + UniphraseLM(const vector >& corpus, + const set& vocab, + const po::variables_map& conf) : + phrases_(1,1), + gen_(1,1), + corpus_(corpus), + uniform_word_(1.0 / vocab.size()), + gen_p0_(0.5), + p_end_(0.5), + use_poisson_(conf.count("poisson_length") > 0) {} + + void ResampleHyperparameters(MT19937* rng) { + phrases_.resample_hyperparameters(rng); + gen_.resample_hyperparameters(rng); + cerr << " " << phrases_.concentration(); + } + + CCRP_NoTable > phrases_; + CCRP_NoTable gen_; + vector > z_; // z_[i] is there a phrase boundary after the ith word + const vector >& corpus_; + const double uniform_word_; + const double gen_p0_; + const double p_end_; // in base length distribution, p of the end of a phrase + const bool use_poisson_; +}; + +struct Reachability { + boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), + max_src_delta(boost::extents[srclen][trglen]) { + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + } + + private: + struct SState { + SState() : prev_src_covered(), prev_trg_covered() {} + SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} + int prev_src_covered; + int prev_trg_covered; + }; + + struct NState { + NState() : next_src_covered(), next_trg_covered() {} + NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} + int next_src_covered; + int next_trg_covered; + }; + + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { + typedef boost::multi_array, 2> array_type; + array_type a(boost::extents[srclen + 1][trglen + 1]); + a[0][0].push_back(SState()); + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (a[i][j].size() == 0) continue; + const SState prev(i,j); + for (int k = 1; k <= src_max_phrase_len; ++k) { + if ((i + k) > srclen) continue; + for (int l = 1; l <= trg_max_phrase_len; ++l) { + if ((j + l) > trglen) continue; + a[i + k][j + l].push_back(prev); + } + } + } + } + a[0][0].clear(); + cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + assert(a[srclen][trglen].size() > 0); + + typedef boost::multi_array rarray_type; + rarray_type r(boost::extents[srclen + 1][trglen + 1]); +// typedef boost::multi_array, 2> narray_type; +// narray_type b(boost::extents[srclen + 1][trglen + 1]); + r[srclen][trglen] = true; + for (int i = srclen; i >= 0; --i) { + for (int j = trglen; j >= 0; --j) { + vector& prevs = a[i][j]; + if (!r[i][j]) { prevs.clear(); } +// const NState nstate(i,j); + for (int k = 0; k < prevs.size(); ++k) { + r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; + int src_delta = i - prevs[k].prev_src_covered; + edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; + if (src_delta > msd) msd = src_delta; +// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); + } + } + } + assert(!edges[0][0][1][0]); + assert(!edges[0][0][0][1]); + assert(!edges[0][0][0][0]); + cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; + assert(max_src_delta[0][0] > 0); + //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; + //for (int i = 0; i < b[0][0].size(); ++i) { + // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; + //} + } +}; + +ostream& operator<<(ostream& os, const FSTState& q); +struct FSTState { + explicit FSTState(int src_size) : + trg_covered_(), + src_covered_(), + src_coverage_(src_size) {} + + FSTState(short trg_covered, short src_covered, const vector& src_coverage, const vector& src_prefix) : + trg_covered_(trg_covered), + src_covered_(src_covered), + src_coverage_(src_coverage), + src_prefix_(src_prefix) { + if (src_coverage_.size() == src_covered) { + assert(src_prefix.size() == 0); + } + } + + // if we extend by the word at src_position, what are + // the next states that are reachable and lie on a valid + // path to the final state? + vector Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { + assert(src_position < src_coverage_.size()); + if (src_coverage_[src_position]) { + cerr << "Trying to extend " << *this << " with position " << src_position << endl; + abort(); + } + vector ncvg = src_coverage_; + ncvg[src_position] = true; + + vector res; + const int trg_remaining = trg_len - trg_covered_; + if (trg_remaining <= 0) { + cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; + abort(); + } + const int src_remaining = src_len - src_covered_; + if (src_remaining <= 0) { + cerr << "Source appears to have been covered: " << *this << endl; + abort(); + } + + for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { + if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { + int nc = src_prefix_.size() + 1 + src_covered_; + res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector())); + } + } + + if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { + vector nsp = src_prefix_; + nsp.push_back(src_position); + res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); + } + + if (res.size() == 0) { + cerr << *this << " can't be extended!\n"; + abort(); + } + return res; + } + + short trg_covered_, src_covered_; + vector src_coverage_; + vector src_prefix_; +}; +bool operator<(const FSTState& q, const FSTState& r) { + if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; + if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; + if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; + return q.src_prefix_ < r.src_prefix_; +} + +ostream& operator<<(ostream& os, const FSTState& q) { + os << "[" << q.trg_covered_ << " : "; + for (int i = 0; i < q.src_coverage_.size(); ++i) + os << q.src_coverage_[i]; + os << " : <"; + for (int i = 0; i < q.src_prefix_.size(); ++i) { + if (i != 0) os << ' '; + os << q.src_prefix_[i]; + } + return os << ">]"; +} + +struct MyModel { + MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} + typedef unordered_map, CCRP_NoTable, boost::hash > > SrcToRuleCRPMap; + + void DecrementRule(const TRule& rule) { + SrcToRuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + it->second.decrement(rule); + if (it->second.num_customers() == 0) rules.erase(it); + } + + void IncrementRule(const TRule& rule) { + SrcToRuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) { + CCRP_NoTable crp(1,1); + it = rules.insert(make_pair(rule.f_, crp)).first; + } + it->second.increment(rule); + } + + // conditioned on rule.f_ + prob_t RuleConditionalProbability(const TRule& rule) const { + const prob_t base = rp0(rule); + SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) { + return base; + } else { + const double lp = it->second.logprob(rule, log(base)); + prob_t q; q.logeq(lp); + return q; + } + } + + const ConditionalBase& rp0; + SrcToRuleCRPMap rules; +}; + +struct MyFST : public WFST { + MyFST(const vector& ssrc, const vector& strg, MyModel* m) : + src(ssrc), trg(strg), + r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), + model(m) { + FSTState in(src.size()); + cerr << " INIT: " << in << endl; + init = GetNode(in); + for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; + in.src_covered_ = src.size(); + in.trg_covered_ = trg.size(); + cerr << "FINAL: " << in << endl; + final = GetNode(in); + } + virtual const WFSTNode* Final() const; + virtual const WFSTNode* Initial() const; + + const WFSTNode* GetNode(const FSTState& q); + map > m; + const vector& src; + const vector& trg; + Reachability r; + const WFSTNode* init; + const WFSTNode* final; + MyModel* model; +}; + +struct MyNode : public WFSTNode { + MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} + virtual vector > ExtendInput(unsigned srcindex) const; + const FSTState state; + mutable MyFST* container; +}; + +vector > MyNode::ExtendInput(unsigned srcindex) const { + cerr << "EXTEND " << state << " with " << srcindex << endl; + vector ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); + vector > res(ext.size()); + for (unsigned i = 0; i < ext.size(); ++i) { + res[i].first = container->GetNode(ext[i]); + if (ext[i].src_prefix_.size() == 0) { + const unsigned trg_from = state.trg_covered_; + const unsigned trg_to = ext[i].trg_covered_; + const unsigned prev_prfx_size = state.src_prefix_.size(); + res[i].second.reset(new TRule); + res[i].second->lhs_ = -TD::Convert("X"); + vector& src = res[i].second->f_; + vector& trg = res[i].second->e_; + src.resize(prev_prfx_size + 1); + for (unsigned j = 0; j < prev_prfx_size; ++j) + src[j] = container->src[state.src_prefix_[j]]; + src[prev_prfx_size] = container->src[srcindex]; + for (unsigned j = trg_from; j < trg_to; ++j) + trg.push_back(container->trg[j]); + res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); + } + } + return res; +} + +const WFSTNode* MyFST::GetNode(const FSTState& q) { + boost::shared_ptr& res = m[q]; + if (!res) { + res.reset(new MyNode(q, this)); + } + return &*res; +} + +const WFSTNode* MyFST::Final() const { + return final; +} + +const WFSTNode* MyFST::Initial() const { + return init; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + shared_ptr prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + ConditionalBase lp0(conf["model1_interpolation_weight"].as(), + vocabe.size(), + conf["model1"].as()); + MyModel m(lp0); + + TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); + m.IncrementRule(x); + TRule y("[X] ||| nY dyN ||| gave ||| 0"); + m.IncrementRule(y); + + + MyFST fst(corpusf[0], corpuse[0], &m); + ifstream in("./kimura.g"); + assert(in); + CFG_WFSTComposer comp(fst); + Hypergraph hg; + bool succeed = comp.Compose(&in, &hg); + hg.PrintGraphviz(); + if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } + +#if 0 + ifstream in2("./amnabooks.g"); + assert(in2); + MyFST fst2(corpusf[1], corpuse[1], &m); + CFG_WFSTComposer comp2(fst2); + Hypergraph hg2; + bool succeed2 = comp2.Compose(&in2, &hg2); + if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } +#endif + + SparseVector w; w.set_value(FD::Convert("Proposal"), 1.0); + hg.Reweight(w); + cerr << ViterbiFTree(hg) << endl; + return 0; +} + diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc new file mode 100644 index 00000000..18dfd03b --- /dev/null +++ b/gi/pf/pfdist.cc @@ -0,0 +1,621 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(30),"Number of particles") + ("filter_frequency,f",po::value()->default_value(5),"Number of time steps between filterings") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +#if 0 +struct MyConditionalModel { + MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + prob_t srcp0(const vector& src) const { + prob_t p(1.0 / 3000.0); + p.poweq(src.size()); + prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); + p *= lenp; + return p; + } + + void DecrementRule(const TRule& rule) { + const RuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + if (it->second.decrement(rule)) { + base /= (*rp0)(rule); + if (it->second.num_customers() == 0) + rules.erase(it); + } + if (src_phrases.decrement(rule.f_)) + base /= srcp0(rule.f_); + } + + void IncrementRule(const TRule& rule) { + RuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) + it = rules.insert(make_pair(rule.f_, CCRP_NoTable(1,1))).first; + if (it->second.increment(rule)) { + base *= (*rp0)(rule); + } + if (src_phrases.increment(rule.f_)) + base *= srcp0(rule.f_); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + const prob_t p0 = (*rp0)(rule); + prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); + const RuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) return srcp * p0; + const double lp = it->second.logprob(rule, log(p0)); + prob_t q; q.logeq(lp); + return q * srcp; + } + + prob_t Likelihood() const { + prob_t p = base; + for (RuleCRPMap::const_iterator it = rules.begin(); + it != rules.end(); ++it) { + prob_t cl; cl.logeq(it->second.log_crp_prob()); + p *= cl; + } + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseConditionalBase* rp0; + prob_t base; + typedef unordered_map, CCRP_NoTable, boost::hash > > RuleCRPMap; + RuleCRPMap rules; + CCRP_NoTable > src_phrases; + vector > src_jumps; +}; + +#endif + +struct MyJointModel { + MyJointModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + vector > src_jumps; +}; + +struct BackwardEstimate { + BackwardEstimate(const Model1& m1, const vector& src, const vector& trg) : + model1_(m1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + r.push_back(0); // NULL word + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + } + return e; + } + const Model1& model1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct BackwardEstimateSym { + BackwardEstimateSym(const Model1& m1, + const Model1& invm1, const vector& src, const vector& trg) : + model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + r.push_back(0); // NULL word + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + r.pop_back(); + const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); + prob_t inv; + inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + for (unsigned i = 0; i < r.size(); ++i) { + prob_t p; + for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) + p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; + abort(); + } + p *= inv_uniform; + inv *= p; + } + prob_t x = pow(e * inv, 0.5); + e = x; + //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; + } + return e; + } + const Model1& model1_; + const Model1& invmodel1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct Particle { + Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} + prob_t weight; + prob_t gamma_last; + vector src_jumps; + vector rules; + vector src_cv; + int src_cov; + int trg_cov; + int prev_pos; +}; + +ostream& operator<<(ostream& o, const vector& v) { + for (int i = 0; i < v.size(); ++i) + o << (v[i] ? '1' : '0'); + return o; +} +ostream& operator<<(ostream& o, const Particle& p) { + o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; + return o; +} + +void FilterCrapParticlesAndReweight(vector* pps) { + vector& ps = *pps; + SampleSet ss; + for (int i = 0; i < ps.size(); ++i) + ss.add(ps[i].weight); + vector nps; nps.reserve(ps.size()); + const prob_t uniform_weight(1.0 / ps.size()); + for (int i = 0; i < ps.size(); ++i) { + nps.push_back(ps[prng->SelectSample(ss)]); + nps[i].weight = uniform_weight; + } + nps.swap(ps); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + const unsigned rejuv_freq = conf["filter_frequency"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + +#if 0 + PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + MyConditionalModel m(lp0); +#else + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MyJointModel m(lp0); +#endif + + cerr << "Initializing reachability limits...\n"; + vector ps(corpusf.size()); + vector reaches; reaches.reserve(corpusf.size()); + for (int ci = 0; ci < corpusf.size(); ++ci) + reaches.push_back(Reachability(corpusf[ci].size(), + corpuse[ci].size(), + kMAX_SRC_PHRASE, + kMAX_TRG_PHRASE)); + cerr << "Sampling...\n"; + vector tmp_p(10000); // work space + SampleSet pfss; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpusf.size(); ++ci) { + vector& src = corpusf[ci]; + vector& trg = corpuse[ci]; + m.DecrementRules(ps[ci].rules); + m.DecrementJumps(ps[ci].src_jumps, src.size()); + + //BackwardEstimate be(m1, src, trg); + BackwardEstimateSym be(m1, invm1, src, trg); + const Reachability& r = reaches[ci]; + vector lps(particles); + + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + p.src_cv.resize(src.size(), false); + } + + bool all_complete = false; + while(!all_complete) { + SampleSet ss; + + // all particles have now been extended a bit, we will reweight them now + if (lps[0].trg_cov > 0) + FilterCrapParticlesAndReweight(&lps); + + // loop over all particles and extend them + bool done_nothing = true; + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + int tic = 0; + while(p.trg_cov < trg.size() && tic < rejuv_freq) { + ++tic; + done_nothing = false; + ss.clear(); + TRule x; x.lhs_ = kLHS; + prob_t z; + int first_uncovered = src.size(); + int last_uncovered = -1; + for (int i = 0; i < src.size(); ++i) { + const bool is_uncovered = !p.src_cv[i]; + if (i < first_uncovered && is_uncovered) first_uncovered = i; + if (is_uncovered && i > last_uncovered) last_uncovered = i; + } + assert(last_uncovered > -1); + assert(first_uncovered < src.size()); + + for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { + x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); + for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { + if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + + const int last_possible_start = last_uncovered - src_len + 1; + assert(last_possible_start >= 0); + //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl; + //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl; + for (int i = first_uncovered; i <= last_possible_start; ++i) { + if (p.src_cv[i]) continue; + assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size + Particle& np = tmp_p[ss.size()]; + np = p; + x.f_.clear(); + int gap_add = 0; + bool bad = false; + prob_t jp = prob_t::One(); + int prev_pos = p.prev_pos; + for (int j = 0; j < src_len; ++j) { + if ((j + i + gap_add) == src.size()) { bad = true; break; } + while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } + if ((j + i + gap_add) == src.size()) { bad = true; break; } + np.src_cv[i + j + gap_add] = true; + x.f_.push_back(src[i + j + gap_add]); + jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); + int jump = i + j + gap_add - prev_pos; + assert(jump != 0); + np.src_jumps.push_back(jump); + prev_pos = i + j + gap_add; + } + if (bad) continue; + np.prev_pos = prev_pos; + np.src_cov += x.f_.size(); + np.trg_cov += x.e_.size(); + if (x.f_.size() != src_len) continue; + prob_t rp = m.RuleProbability(x); + np.gamma_last = rp * jp; + const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); + //cerr << "**rule=" << x << endl; + //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl; + ss.add(u); + np.rules.push_back(TRulePtr(new TRule(x))); + z += u; + + const bool completed = (p.trg_cov == trg.size()); + if (completed) { + int last_jump = src.size() - p.prev_pos; + assert(last_jump > 0); + p.src_jumps.push_back(last_jump); + p.weight *= m.JumpProbability(last_jump, src.size()); + } + } + } + } + cerr << "number of edges to consider: " << ss.size() << endl; + const int sampled = rng.SelectSample(ss); + prob_t q_n = ss[sampled] / z; + p = tmp_p[sampled]; + //m.IncrementRule(*p.rules.back()); + p.weight *= p.gamma_last / q_n; + cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; + cerr << p << endl; + } + } // loop over particles (pi = 0 .. particles) + if (done_nothing) all_complete = true; + } + pfss.clear(); + for (int i = 0; i < lps.size(); ++i) + pfss.add(lps[i].weight); + const int sampled = rng.SelectSample(pfss); + ps[ci] = lps[sampled]; + m.IncrementRules(lps[sampled].rules); + m.IncrementJumps(lps[sampled].src_jumps, src.size()); + for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } + cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; + } + cerr << "LLH: " << log(m.Likelihood()) << endl; + for (int sni = 0; sni < 5; ++sni) { + for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } + } + } + return 0; +} + diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc new file mode 100644 index 00000000..3169eb75 --- /dev/null +++ b/gi/pf/pfdist.new.cc @@ -0,0 +1,620 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(25),"Number of particles") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +#if 0 +struct MyConditionalModel { + MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + prob_t srcp0(const vector& src) const { + prob_t p(1.0 / 3000.0); + p.poweq(src.size()); + prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); + p *= lenp; + return p; + } + + void DecrementRule(const TRule& rule) { + const RuleCRPMap::iterator it = rules.find(rule.f_); + assert(it != rules.end()); + if (it->second.decrement(rule)) { + base /= (*rp0)(rule); + if (it->second.num_customers() == 0) + rules.erase(it); + } + if (src_phrases.decrement(rule.f_)) + base /= srcp0(rule.f_); + } + + void IncrementRule(const TRule& rule) { + RuleCRPMap::iterator it = rules.find(rule.f_); + if (it == rules.end()) + it = rules.insert(make_pair(rule.f_, CCRP_NoTable(1,1))).first; + if (it->second.increment(rule)) { + base *= (*rp0)(rule); + } + if (src_phrases.increment(rule.f_)) + base *= srcp0(rule.f_); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + const prob_t p0 = (*rp0)(rule); + prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); + const RuleCRPMap::const_iterator it = rules.find(rule.f_); + if (it == rules.end()) return srcp * p0; + const double lp = it->second.logprob(rule, log(p0)); + prob_t q; q.logeq(lp); + return q * srcp; + } + + prob_t Likelihood() const { + prob_t p = base; + for (RuleCRPMap::const_iterator it = rules.begin(); + it != rules.end(); ++it) { + prob_t cl; cl.logeq(it->second.log_crp_prob()); + p *= cl; + } + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseConditionalBase* rp0; + prob_t base; + typedef unordered_map, CCRP_NoTable, boost::hash > > RuleCRPMap; + RuleCRPMap rules; + CCRP_NoTable > src_phrases; + vector > src_jumps; +}; + +#endif + +struct MyJointModel { + MyJointModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + void IncrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].increment(dist)) + base *= jp0(dist, src_len); + } + + void DecrementJump(int dist, unsigned src_len) { + assert(src_len > 0); + if (src_jumps[src_len].decrement(dist)) + base /= jp0(dist, src_len); + } + + void IncrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + IncrementJump(js[i], src_len); + } + + void DecrementJumps(const vector& js, unsigned src_len) { + for (unsigned i = 0; i < js.size(); ++i) + DecrementJump(js[i], src_len); + } + + // p(jump = dist | src_len , z) + prob_t JumpProbability(int dist, unsigned src_len) { + const prob_t p0 = jp0(dist, src_len); + const double lp = src_jumps[src_len].logprob(dist, log(p0)); + prob_t q; q.logeq(lp); + return q; + } + + // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + JumpBase jp0; + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + vector > src_jumps; +}; + +struct BackwardEstimate { + BackwardEstimate(const Model1& m1, const vector& src, const vector& trg) : + model1_(m1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + r.push_back(0); // NULL word + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + } + return e; + } + const Model1& model1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct BackwardEstimateSym { + BackwardEstimateSym(const Model1& m1, + const Model1& invm1, const vector& src, const vector& trg) : + model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { + } + const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { + assert(src_.size() == src_cov.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + for (int i = 0; i < src_cov.size(); ++i) + if (!src_cov[i]) r.push_back(src_[i]); + r.push_back(0); // NULL word + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + r.pop_back(); + const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); + prob_t inv; + inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + for (unsigned i = 0; i < r.size(); ++i) { + prob_t p; + for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) + p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; + abort(); + } + p *= inv_uniform; + inv *= p; + } + prob_t x = pow(e * inv, 0.5); + e = x; + //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; + } + return e; + } + const Model1& model1_; + const Model1& invmodel1_; + const vector& src_; + const vector& trg_; + mutable unordered_map, map, boost::hash > > cache_; +}; + +struct Particle { + Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} + prob_t weight; + prob_t gamma_last; + vector src_jumps; + vector rules; + vector src_cv; + int src_cov; + int trg_cov; + int prev_pos; +}; + +ostream& operator<<(ostream& o, const vector& v) { + for (int i = 0; i < v.size(); ++i) + o << (v[i] ? '1' : '0'); + return o; +} +ostream& operator<<(ostream& o, const Particle& p) { + o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; + return o; +} + +void FilterCrapParticlesAndReweight(vector* pps) { + vector& ps = *pps; + SampleSet ss; + for (int i = 0; i < ps.size(); ++i) + ss.add(ps[i].weight); + vector nps; nps.reserve(ps.size()); + const prob_t uniform_weight(1.0 / ps.size()); + for (int i = 0; i < ps.size(); ++i) { + nps.push_back(ps[prng->SelectSample(ss)]); + nps[i].weight = uniform_weight; + } + nps.swap(ps); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + +#if 0 + PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + MyConditionalModel m(lp0); +#else + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MyJointModel m(lp0); +#endif + + cerr << "Initializing reachability limits...\n"; + vector ps(corpusf.size()); + vector reaches; reaches.reserve(corpusf.size()); + for (int ci = 0; ci < corpusf.size(); ++ci) + reaches.push_back(Reachability(corpusf[ci].size(), + corpuse[ci].size(), + kMAX_SRC_PHRASE, + kMAX_TRG_PHRASE)); + cerr << "Sampling...\n"; + vector tmp_p(10000); // work space + SampleSet pfss; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpusf.size(); ++ci) { + vector& src = corpusf[ci]; + vector& trg = corpuse[ci]; + m.DecrementRules(ps[ci].rules); + m.DecrementJumps(ps[ci].src_jumps, src.size()); + + //BackwardEstimate be(m1, src, trg); + BackwardEstimateSym be(m1, invm1, src, trg); + const Reachability& r = reaches[ci]; + vector lps(particles); + + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + p.src_cv.resize(src.size(), false); + } + + bool all_complete = false; + while(!all_complete) { + SampleSet ss; + + // all particles have now been extended a bit, we will reweight them now + if (lps[0].trg_cov > 0) + FilterCrapParticlesAndReweight(&lps); + + // loop over all particles and extend them + bool done_nothing = true; + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + int tic = 0; + const int rejuv_freq = 1; + while(p.trg_cov < trg.size() && tic < rejuv_freq) { + ++tic; + done_nothing = false; + ss.clear(); + TRule x; x.lhs_ = kLHS; + prob_t z; + int first_uncovered = src.size(); + int last_uncovered = -1; + for (int i = 0; i < src.size(); ++i) { + const bool is_uncovered = !p.src_cv[i]; + if (i < first_uncovered && is_uncovered) first_uncovered = i; + if (is_uncovered && i > last_uncovered) last_uncovered = i; + } + assert(last_uncovered > -1); + assert(first_uncovered < src.size()); + + for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { + x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); + for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { + if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + + const int last_possible_start = last_uncovered - src_len + 1; + assert(last_possible_start >= 0); + //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl; + //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl; + for (int i = first_uncovered; i <= last_possible_start; ++i) { + if (p.src_cv[i]) continue; + assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size + Particle& np = tmp_p[ss.size()]; + np = p; + x.f_.clear(); + int gap_add = 0; + bool bad = false; + prob_t jp = prob_t::One(); + int prev_pos = p.prev_pos; + for (int j = 0; j < src_len; ++j) { + if ((j + i + gap_add) == src.size()) { bad = true; break; } + while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } + if ((j + i + gap_add) == src.size()) { bad = true; break; } + np.src_cv[i + j + gap_add] = true; + x.f_.push_back(src[i + j + gap_add]); + jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); + int jump = i + j + gap_add - prev_pos; + assert(jump != 0); + np.src_jumps.push_back(jump); + prev_pos = i + j + gap_add; + } + if (bad) continue; + np.prev_pos = prev_pos; + np.src_cov += x.f_.size(); + np.trg_cov += x.e_.size(); + if (x.f_.size() != src_len) continue; + prob_t rp = m.RuleProbability(x); + np.gamma_last = rp * jp; + const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); + //cerr << "**rule=" << x << endl; + //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl; + ss.add(u); + np.rules.push_back(TRulePtr(new TRule(x))); + z += u; + + const bool completed = (p.trg_cov == trg.size()); + if (completed) { + int last_jump = src.size() - p.prev_pos; + assert(last_jump > 0); + p.src_jumps.push_back(last_jump); + p.weight *= m.JumpProbability(last_jump, src.size()); + } + } + } + } + cerr << "number of edges to consider: " << ss.size() << endl; + const int sampled = rng.SelectSample(ss); + prob_t q_n = ss[sampled] / z; + p = tmp_p[sampled]; + //m.IncrementRule(*p.rules.back()); + p.weight *= p.gamma_last / q_n; + cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; + cerr << p << endl; + } + } // loop over particles (pi = 0 .. particles) + if (done_nothing) all_complete = true; + } + pfss.clear(); + for (int i = 0; i < lps.size(); ++i) + pfss.add(lps[i].weight); + const int sampled = rng.SelectSample(pfss); + ps[ci] = lps[sampled]; + m.IncrementRules(lps[sampled].rules); + m.IncrementJumps(lps[sampled].src_jumps, src.size()); + for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } + cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; + } + cerr << "LLH: " << log(m.Likelihood()) << endl; + for (int sni = 0; sni < 5; ++sni) { + for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } + } + } + return 0; +} + diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc new file mode 100644 index 00000000..43c604c3 --- /dev/null +++ b/gi/pf/pfnaive.cc @@ -0,0 +1,385 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("particles,p",po::value()->default_value(30),"Number of particles") + ("filter_frequency,f",po::value()->default_value(5),"Number of time steps between filterings") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +struct MyJointModel { + MyJointModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + for (unsigned l = 1; l < src_jumps.size(); ++l) { + if (src_jumps[l].num_customers() > 0) { + prob_t q; + q.logeq(src_jumps[l].log_crp_prob()); + p *= q; + } + } + return p; + } + + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable rules; + vector > src_jumps; +}; + +struct BackwardEstimateSym { + BackwardEstimateSym(const Model1& m1, + const Model1& invm1, const vector& src, const vector& trg) : + model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { + } + const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const { + assert(src_cov <= src_.size()); + assert(trg_cov <= trg_.size()); + prob_t& e = cache_[src_cov][trg_cov]; + if (e.is_0()) { + if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } + vector r(src_.size() + 1); r.clear(); + for (int i = src_cov; i < src_.size(); ++i) + r.push_back(src_[i]); + r.push_back(0); // NULL word + const prob_t uniform_alignment(1.0 / r.size()); + e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_cov; j < trg_.size(); ++j) { + prob_t p; + for (unsigned i = 0; i < r.size(); ++i) + p += model1_(r[i], trg_[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; + abort(); + } + p *= uniform_alignment; + e *= p; + } + r.pop_back(); + const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); + prob_t inv; + inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + for (unsigned i = 0; i < r.size(); ++i) { + prob_t p; + for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) + p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; + abort(); + } + p *= inv_uniform; + inv *= p; + } + prob_t x = pow(e * inv, 0.5); + e = x; + //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; + } + return e; + } + const Model1& model1_; + const Model1& invmodel1_; + const vector& src_; + const vector& trg_; + mutable unordered_map > cache_; +}; + +struct Particle { + Particle() : weight(prob_t::One()), src_cov(), trg_cov() {} + prob_t weight; + prob_t gamma_last; + vector rules; + int src_cov; + int trg_cov; +}; + +ostream& operator<<(ostream& o, const vector& v) { + for (int i = 0; i < v.size(); ++i) + o << (v[i] ? '1' : '0'); + return o; +} +ostream& operator<<(ostream& o, const Particle& p) { + o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; + return o; +} + +void FilterCrapParticlesAndReweight(vector* pps) { + vector& ps = *pps; + SampleSet ss; + for (int i = 0; i < ps.size(); ++i) + ss.add(ps[i].weight); + vector nps; nps.reserve(ps.size()); + const prob_t uniform_weight(1.0 / ps.size()); + for (int i = 0; i < ps.size(); ++i) { + nps.push_back(ps[prng->SelectSample(ss)]); + nps[i].weight = uniform_weight; + } + nps.swap(ps); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); + const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + const unsigned particles = conf["particles"].as(); + const unsigned samples = conf["samples"].as(); + const unsigned rejuv_freq = conf["filter_frequency"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as()); + Model1 invm1(conf["inverse_model1"].as()); + +#if 0 + PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + MyConditionalModel m(lp0); +#else + PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MyJointModel m(lp0); +#endif + + cerr << "Initializing reachability limits...\n"; + vector ps(corpusf.size()); + vector reaches; reaches.reserve(corpusf.size()); + for (int ci = 0; ci < corpusf.size(); ++ci) + reaches.push_back(Reachability(corpusf[ci].size(), + corpuse[ci].size(), + kMAX_SRC_PHRASE, + kMAX_TRG_PHRASE)); + cerr << "Sampling...\n"; + vector tmp_p(10000); // work space + SampleSet pfss; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpusf.size(); ++ci) { + vector& src = corpusf[ci]; + vector& trg = corpuse[ci]; + m.DecrementRules(ps[ci].rules); + + BackwardEstimateSym be(m1, invm1, src, trg); + const Reachability& r = reaches[ci]; + vector lps(particles); + + bool all_complete = false; + while(!all_complete) { + SampleSet ss; + + // all particles have now been extended a bit, we will reweight them now + if (lps[0].trg_cov > 0) + FilterCrapParticlesAndReweight(&lps); + + // loop over all particles and extend them + bool done_nothing = true; + for (int pi = 0; pi < particles; ++pi) { + Particle& p = lps[pi]; + int tic = 0; + while(p.trg_cov < trg.size() && tic < rejuv_freq) { + ++tic; + done_nothing = false; + ss.clear(); + TRule x; x.lhs_ = kLHS; + prob_t z; + + for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { + x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); + for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { + if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + + int i = p.src_cov; + assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size + Particle& np = tmp_p[ss.size()]; + np = p; + x.f_.clear(); + for (int j = 0; j < src_len; ++j) + x.f_.push_back(src[i + j]); + np.src_cov += x.f_.size(); + np.trg_cov += x.e_.size(); + prob_t rp = m.RuleProbability(x); + np.gamma_last = rp; + const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1); + //cerr << "**rule=" << x << endl; + //cerr << " u=" << log(u) << " rule=" << rp << endl; + ss.add(u); + np.rules.push_back(TRulePtr(new TRule(x))); + z += u; + } + } + //cerr << "number of edges to consider: " << ss.size() << endl; + const int sampled = rng.SelectSample(ss); + prob_t q_n = ss[sampled] / z; + p = tmp_p[sampled]; + //m.IncrementRule(*p.rules.back()); + p.weight *= p.gamma_last / q_n; + //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; + //cerr << p << endl; + } + } // loop over particles (pi = 0 .. particles) + if (done_nothing) all_complete = true; + } + pfss.clear(); + for (int i = 0; i < lps.size(); ++i) + pfss.add(lps[i].weight); + const int sampled = rng.SelectSample(pfss); + ps[ci] = lps[sampled]; + m.IncrementRules(lps[sampled].rules); + for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } + cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; + } + cerr << "LLH: " << log(m.Likelihood()) << endl; + } + return 0; +} + diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc new file mode 100644 index 00000000..73dd8d39 --- /dev/null +++ b/gi/pf/reachability.cc @@ -0,0 +1,64 @@ +#include "reachability.h" + +#include +#include + +using namespace std; + +struct SState { + SState() : prev_src_covered(), prev_trg_covered() {} + SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} + int prev_src_covered; + int prev_trg_covered; +}; + +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { + typedef boost::multi_array, 2> array_type; + array_type a(boost::extents[srclen + 1][trglen + 1]); + a[0][0].push_back(SState()); + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (a[i][j].size() == 0) continue; + const SState prev(i,j); + for (int k = 1; k <= src_max_phrase_len; ++k) { + if ((i + k) > srclen) continue; + for (int l = 1; l <= trg_max_phrase_len; ++l) { + if ((j + l) > trglen) continue; + a[i + k][j + l].push_back(prev); + } + } + } + } + a[0][0].clear(); + //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + if (a[srclen][trglen].size() == 0) { + cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + return; + } + + typedef boost::multi_array rarray_type; + rarray_type r(boost::extents[srclen + 1][trglen + 1]); + r[srclen][trglen] = true; + for (int i = srclen; i >= 0; --i) { + for (int j = trglen; j >= 0; --j) { + vector& prevs = a[i][j]; + if (!r[i][j]) { prevs.clear(); } + for (int k = 0; k < prevs.size(); ++k) { + r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; + int src_delta = i - prevs[k].prev_src_covered; + edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; + if (src_delta > msd) msd = src_delta; + } + } + } + assert(!edges[0][0][1][0]); + assert(!edges[0][0][0][1]); + assert(!edges[0][0][0][0]); + assert(max_src_delta[0][0] > 0); + //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; + //for (int i = 0; i < b[0][0].size(); ++i) { + // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; + //} + } + diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h new file mode 100644 index 00000000..98450ec1 --- /dev/null +++ b/gi/pf/reachability.h @@ -0,0 +1,28 @@ +#ifndef _REACHABILITY_H_ +#define _REACHABILITY_H_ + +#include "boost/multi_array.hpp" + +// determines minimum and maximum lengths of outgoing edges from all +// coverage positions such that the alignment path respects src and +// trg maximum phrase sizes +// +// runs in O(n^2 * src_max * trg_max) time but should be relatively fast +// +// currently forbids 0 -> n and n -> 0 alignments + +struct Reachability { + boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), + max_src_delta(boost::extents[srclen][trglen]) { + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + } + + private: + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); +}; + +#endif diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc new file mode 100644 index 00000000..7348d21c --- /dev/null +++ b/gi/pf/tpf.cc @@ -0,0 +1,99 @@ +#include +#include +#include + +#include "sampler.h" + +using namespace std; +using namespace tr1; + +shared_ptr prng; + +struct Particle { + Particle() : weight(prob_t::One()) {} + vector states; + prob_t weight; + prob_t gamma_last; +}; + +ostream& operator<<(ostream& os, const Particle& p) { + os << "["; + for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' '; + os << "| w=" << log(p.weight) << ']'; + return os; +} + +void Rejuvenate(vector& pps) { + SampleSet ss; + vector nps(pps.size()); + for (int i = 0; i < pps.size(); ++i) { +// cerr << pps[i] << endl; + ss.add(pps[i].weight); + } +// cerr << "REJUVINATING...\n"; + for (int i = 0; i < pps.size(); ++i) { + nps[i] = pps[prng->SelectSample(ss)]; + nps[i].weight = prob_t(1.0 / pps.size()); +// cerr << nps[i] << endl; + } + nps.swap(pps); +// exit(1); +} + +int main(int argc, char** argv) { + const unsigned particles = 100; + prng.reset(new MT19937); + MT19937& rng = *prng; + + // q(a) = 0.8 + // q(b) = 0.8 + // q(c) = 0.4 + SampleSet ssq; + ssq.add(0.4); + ssq.add(0.6); + ssq.add(0); + double qz = 1; + + // p(a) = 0.2 + // p(b) = 0.8 + vector p(3); + p[0] = 0.2; + p[1] = 0.8; + p[2] = 0; + + vector counts(3); + int tot = 0; + + vector pps(particles); + SampleSet ppss; + int LEN = 12; + int PP = 1; + while (pps[0].states.size() < LEN) { + for (int pi = 0; pi < particles; ++pi) { + Particle& prt = pps[pi]; + + bool redo = true; + const Particle savedp = prt; + while (redo) { + redo = false; + for (int i = 0; i < PP; ++i) { + int s = rng.SelectSample(ssq); + double gamma_last = p[s]; + if (!gamma_last) { redo = true; break; } + double q = ssq[s] / qz; + prt.states.push_back(s); + prt.weight *= prob_t(gamma_last / q); + } + if (redo) { prt = savedp; continue; } + } + } + Rejuvenate(pps); + } + ppss.clear(); + for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); } + int sp = rng.SelectSample(ppss); + cerr << pps[sp] << endl; + + return 0; +} + -- cgit v1.2.3