author	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
committer	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
commit	e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree	d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pf
parent	0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pf')
-rw-r--r--	gi/pf/Makefile.am	44
-rw-r--r--	gi/pf/README	2
-rw-r--r--	gi/pf/align-lexonly-pyp.cc	243
-rw-r--r--	gi/pf/align-tl.cc	339
-rw-r--r--	gi/pf/backward.cc	89
-rw-r--r--	gi/pf/backward.h	33
-rw-r--r--	gi/pf/base_distributions.cc	241
-rw-r--r--	gi/pf/base_distributions.h	238
-rw-r--r--	gi/pf/bayes_lattice_score.cc	309
-rw-r--r--	gi/pf/brat.cc	543
-rw-r--r--	gi/pf/cbgi.cc	330
-rw-r--r--	gi/pf/cfg_wfst_composer.cc	731
-rw-r--r--	gi/pf/cfg_wfst_composer.h	46
-rw-r--r--	gi/pf/conditional_pseg.h	275
-rw-r--r--	gi/pf/condnaive.cc	298
-rw-r--r--	gi/pf/corpus.cc	62
-rw-r--r--	gi/pf/corpus.h	19
-rw-r--r--	gi/pf/dpnaive.cc	301
-rwxr-xr-x	gi/pf/guess-translits.pl	72
-rw-r--r--	gi/pf/hpyp_tm.cc	133
-rw-r--r--	gi/pf/hpyp_tm.h	38
-rw-r--r--	gi/pf/itg.cc	275
-rw-r--r--	gi/pf/learn_cfg.cc	428
-rwxr-xr-x	gi/pf/make-freq-bins.pl	26
-rw-r--r--	gi/pf/mh_test.cc	148
-rw-r--r--	gi/pf/monotonic_pseg.h	89
-rw-r--r--	gi/pf/ngram_base.cc	69
-rw-r--r--	gi/pf/ngram_base.h	25
-rw-r--r--	gi/pf/nuisance_test.cc	161
-rw-r--r--	gi/pf/os_phrase.h	15
-rw-r--r--	gi/pf/pf.h	84
-rw-r--r--	gi/pf/pf_test.cc	148
-rw-r--r--	gi/pf/pfbrat.cc	543
-rw-r--r--	gi/pf/pfdist.cc	598
-rw-r--r--	gi/pf/pfdist.new.cc	620
-rw-r--r--	gi/pf/pfnaive.cc	284
-rw-r--r--	gi/pf/poisson_uniform_word_model.h	50
-rw-r--r--	gi/pf/pyp_lm.cc	273
-rw-r--r--	gi/pf/pyp_tm.cc	128
-rw-r--r--	gi/pf/pyp_tm.h	36
-rw-r--r--	gi/pf/pyp_word_model.h	61
-rw-r--r--	gi/pf/quasi_model2.h	177
-rw-r--r--	gi/pf/reachability.cc	74
-rw-r--r--	gi/pf/reachability.h	34
-rw-r--r--	gi/pf/tied_resampler.h	122
-rw-r--r--	gi/pf/tpf.cc	99
-rw-r--r--	gi/pf/transliterations.cc	334
-rw-r--r--	gi/pf/transliterations.h	24
-rw-r--r--	gi/pf/unigrams.cc	80
-rw-r--r--	gi/pf/unigrams.h	69
50 files changed, 0 insertions, 9460 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
deleted file mode 100644
index 86f8e07b..00000000
--- a/gi/pf/Makefile.am
+++ /dev/null
@@ -1,44 +0,0 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test bayes_lattice_score
-
-noinst_LIBRARIES = libpf.a
-
-libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc hpyp_tm.cc pyp_tm.cc
-
-bayes_lattice_score_SOURCES = bayes_lattice_score.cc
-bayes_lattice_score_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
-
-pf_test_SOURCES = pf_test.cc
-pf_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
-
-nuisance_test_SOURCES = nuisance_test.cc
-nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
-
-align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
-align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
-
-align_tl_SOURCES = align-tl.cc
-align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
-
-itg_SOURCES = itg.cc
-
-pyp_lm_SOURCES = pyp_lm.cc
-
-learn_cfg_SOURCES = learn_cfg.cc
-
-condnaive_SOURCES = condnaive.cc
-
-dpnaive_SOURCES = dpnaive.cc
-
-pfdist_SOURCES = pfdist.cc
-
-pfnaive_SOURCES = pfnaive.cc
-
-cbgi_SOURCES = cbgi.cc
-
-brat_SOURCES = brat.cc
-
-pfbrat_SOURCES = pfbrat.cc
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm
-
-AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/README b/gi/pf/README
deleted file mode 100644
index 62e47541..00000000
--- a/gi/pf/README
+++ /dev/null
@@ -1,2 +0,0 @@
-Experimental Bayesian alignment tools. Nothing to see here.
-
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
deleted file mode 100644
index e7509f57..00000000
--- a/gi/pf/align-lexonly-pyp.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-#include <iostream>
-#include <queue>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "tdict.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "array2d.h"
-#include "sampler.h"
-#include "corpus.h"
-#include "pyp_tm.h"
-#include "hpyp_tm.h"
-#include "quasi_model2.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
- ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
- ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-MT19937* prng;
-
-struct LexicalAlignment {
- unsigned char src_index;
- bool is_transliteration;
- vector<pair<short, short> > derivation;
-};
-
-struct AlignedSentencePair {
- vector<WordID> src;
- vector<WordID> trg;
- vector<LexicalAlignment> a;
- Array2D<short> posterior;
-};
-
-template <class LexicalTranslationModel>
-struct Aligner {
- Aligner(const vector<vector<WordID> >& lets,
- int vocab_size,
- int num_letters,
- const po::variables_map& conf,
- vector<AlignedSentencePair>* c) :
- corpus(*c),
- paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
- infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
- model(lets, vocab_size, num_letters),
- kNULL(TD::Convert("NULL")) {
- assert(lets[kNULL].size() == 0);
- }
-
- vector<AlignedSentencePair>& corpus;
- QuasiModel2 paj_model;
- const bool infer_paj;
- LexicalTranslationModel model;
- const WordID kNULL;
-
- void ResampleHyperparameters() {
- model.ResampleHyperparameters(prng);
- if (infer_paj) paj_model.ResampleHyperparameters(prng);
- }
-
- void InitializeRandom() {
- cerr << "Initializing with random alignments ...\n";
- for (unsigned i = 0; i < corpus.size(); ++i) {
- AlignedSentencePair& asp = corpus[i];
- asp.a.resize(asp.trg.size());
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- unsigned char& a_j = asp.a[j].src_index;
- a_j = prng->next() * (1 + asp.src.size());
- const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- model.Increment(f_a_j, asp.trg[j], &*prng);
- paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
- }
- }
- cerr << "Corpus intialized randomly." << endl;
- cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
- << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
- }
-
- void ResampleCorpus() {
- for (unsigned i = 0; i < corpus.size(); ++i) {
- AlignedSentencePair& asp = corpus[i];
- SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- unsigned char& a_j = asp.a[j].src_index;
- const WordID e_j = asp.trg[j];
- WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- model.Decrement(f_a_j, e_j, prng);
- paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size());
-
- for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
- const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
- ss[prop_a_j] = model.Prob(prop_f, e_j);
- ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size());
- }
- a_j = prng->SelectSample(ss);
- f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- model.Increment(f_a_j, e_j, prng);
- paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
- }
- }
- }
-
- prob_t Likelihood() const {
- return model.Likelihood() * paj_model.Likelihood();
- }
-};
-
-void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
- for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
- vector<WordID>& letters = (*l)[*it];
- if (letters.size()) continue; // if e and f have the same word
-
- const string& w = TD::Convert(*it);
-
- size_t cur = 0;
- while (cur < w.size()) {
- const size_t len = UTF8Len(w[cur]);
- letters.push_back(TD::Convert(w.substr(cur, len)));
- if (letset) letset->insert(letters.back());
- cur += len;
- }
- }
-}
-
-void Debug(const AlignedSentencePair& asp) {
- cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
- Array2D<bool> a(asp.src.size(), asp.trg.size());
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- assert(asp.a[j].src_index <= asp.src.size());
- if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
- }
- cerr << a << endl;
-}
-
-void AddSample(AlignedSentencePair* asp) {
- for (unsigned j = 0; j < asp->trg.size(); ++j)
- asp->posterior(asp->a[j].src_index, j)++;
-}
-
-void WriteAlignments(const AlignedSentencePair& asp) {
- bool first = true;
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- int src_index = -1;
- int mc = -1;
- for (unsigned i = 0; i <= asp.src.size(); ++i) {
- if (asp.posterior(i, j) > mc) {
- mc = asp.posterior(i, j);
- src_index = i;
- }
- }
-
- if (src_index) {
- if (first) first = false; else cout << ' ';
- cout << (src_index - 1) << '-' << j;
- }
- }
- cout << endl;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- if (conf.count("random_seed"))
- prng = new MT19937(conf["random_seed"].as<uint32_t>());
- else
- prng = new MT19937;
-
- vector<vector<int> > corpuse, corpusf;
- set<int> vocabe, vocabf;
- corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
- cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
- assert(corpusf.size() == corpuse.size());
-
- vector<AlignedSentencePair> corpus(corpuse.size());
- for (unsigned i = 0; i < corpuse.size(); ++i) {
- corpus[i].src.swap(corpusf[i]);
- corpus[i].trg.swap(corpuse[i]);
- corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
- }
- corpusf.clear(); corpuse.clear();
-
- vocabf.insert(TD::Convert("NULL"));
- vector<vector<WordID> > letters(TD::NumWords());
- set<WordID> letset;
- ExtractLetters(vocabe, &letters, &letset);
- ExtractLetters(vocabf, &letters, NULL);
- letters[TD::Convert("NULL")].clear();
-
- //Aligner<PYPLexicalTranslation> aligner(letters, vocabe.size(), letset.size(), conf, &corpus);
- Aligner<HPYPLexicalTranslation> aligner(letters, vocabe.size(), letset.size(), conf, &corpus);
- aligner.InitializeRandom();
-
- const unsigned samples = conf["samples"].as<unsigned>();
- for (int i = 0; i < samples; ++i) {
- for (int j = 65; j < 67; ++j) Debug(corpus[j]);
- if (i % 10 == 9) {
- aligner.ResampleHyperparameters();
- cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood()
- << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
- }
- aligner.ResampleCorpus();
- if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
- }
- for (unsigned i = 0; i < corpus.size(); ++i)
- WriteAlignments(corpus[i]);
- aligner.model.Summary();
-
- return 0;
-}
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
deleted file mode 100644
index f6608f1d..00000000
--- a/gi/pf/align-tl.cc
+++ /dev/null
@@ -1,339 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "backward.h"
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "mfcr.h"
-#include "corpus.h"
-#include "ngram_base.h"
-#include "transliterations.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities")
- ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities")
- ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source")
- ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target")
- ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-boost::shared_ptr<MT19937> prng;
-
-struct LexicalAlignment {
- unsigned char src_index;
- bool is_transliteration;
- vector<pair<short, short> > derivation;
-};
-
-struct AlignedSentencePair {
- vector<WordID> src;
- vector<WordID> trg;
- vector<LexicalAlignment> a;
- Array2D<short> posterior;
-};
-
-struct HierarchicalWordBase {
- explicit HierarchicalWordBase(const unsigned vocab_e_size) :
- base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
-
- void ResampleHyperparameters(MT19937* rng) {
- r.resample_hyperparameters(rng);
- }
-
- inline double logp0(const vector<WordID>& s) const {
- return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
- }
-
- // return p0 of rule.e_
- prob_t operator()(const TRule& rule) const {
- v[0].logeq(logp0(rule.e_));
- return r.prob(rule.e_, v.begin(), l.begin());
- }
-
- void Increment(const TRule& rule) {
- v[0].logeq(logp0(rule.e_));
- if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
- base *= v[0] * l[0];
- }
- }
-
- void Decrement(const TRule& rule) {
- if (r.decrement(rule.e_, &*prng).count) {
- base /= prob_t(exp(logp0(rule.e_)));
- }
- }
-
- prob_t Likelihood() const {
- prob_t p; p.logeq(r.log_crp_prob());
- p *= base;
- return p;
- }
-
- void Summary() const {
- cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
- for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
- cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
- }
-
- prob_t base;
- MFCR<1,vector<WordID> > r;
- const double u0;
- const vector<prob_t> l;
- mutable vector<prob_t> v;
-};
-
-struct BasicLexicalAlignment {
- explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
- const unsigned words_e,
- const unsigned letters_e,
- vector<AlignedSentencePair>* corp) :
- letters(lets),
- corpus(*corp),
- //up0(words_e),
- //up0("en.chars.1gram", letters_e),
- //up0("en.words.1gram"),
- up0(letters_e),
- //up0("en.chars.2gram"),
- tmodel(up0) {
- }
-
- void InstantiateRule(const WordID src,
- const WordID trg,
- TRule* rule) const {
- static const WordID kX = TD::Convert("X") * -1;
- rule->lhs_ = kX;
- rule->e_ = letters[trg];
- rule->f_ = letters[src];
- }
-
- void InitializeRandom() {
- const WordID kNULL = TD::Convert("NULL");
- cerr << "Initializing with random alignments ...\n";
- for (unsigned i = 0; i < corpus.size(); ++i) {
- AlignedSentencePair& asp = corpus[i];
- asp.a.resize(asp.trg.size());
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- const unsigned char a_j = prng->next() * (1 + asp.src.size());
- const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- TRule r;
- InstantiateRule(f_a_j, asp.trg[j], &r);
- asp.a[j].is_transliteration = false;
- asp.a[j].src_index = a_j;
- if (tmodel.IncrementRule(r, &*prng))
- up0.Increment(r);
- }
- }
- cerr << " LLH = " << Likelihood() << endl;
- }
-
- prob_t Likelihood() const {
- prob_t p = tmodel.Likelihood();
- p *= up0.Likelihood();
- return p;
- }
-
- void ResampleHyperparemeters() {
- tmodel.ResampleHyperparameters(&*prng);
- up0.ResampleHyperparameters(&*prng);
- cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
- }
-
- void ResampleCorpus();
-
- const vector<vector<WordID> >& letters; // spelling dictionary
- vector<AlignedSentencePair>& corpus;
- //PhraseConditionalUninformativeBase up0;
- //PhraseConditionalUninformativeUnigramBase up0;
- //UnigramWordBase up0;
- //HierarchicalUnigramBase up0;
- HierarchicalWordBase up0;
- //CompletelyUniformBase up0;
- //FixedNgramBase up0;
- //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
- //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
- //ConditionalTranslationModel<UnigramWordBase> tmodel;
- //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
- MConditionalTranslationModel<HierarchicalWordBase> tmodel;
- //ConditionalTranslationModel<FixedNgramBase> tmodel;
- //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
- static const WordID kNULL = TD::Convert("NULL");
- for (unsigned i = 0; i < corpus.size(); ++i) {
- AlignedSentencePair& asp = corpus[i];
- SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- TRule r;
- unsigned char& a_j = asp.a[j].src_index;
- WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- InstantiateRule(f_a_j, asp.trg[j], &r);
- if (tmodel.DecrementRule(r, &*prng))
- up0.Decrement(r);
-
- for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
- const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
- InstantiateRule(prop_f, asp.trg[j], &r);
- ss[prop_a_j] = tmodel.RuleProbability(r);
- }
- a_j = prng->SelectSample(ss);
- f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- InstantiateRule(f_a_j, asp.trg[j], &r);
- if (tmodel.IncrementRule(r, &*prng))
- up0.Increment(r);
- }
- }
- cerr << " LLH = " << Likelihood() << endl;
-}
-
-void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
- for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
- vector<WordID>& letters = (*l)[*it];
- if (letters.size()) continue; // if e and f have the same word
-
- const string& w = TD::Convert(*it);
-
- size_t cur = 0;
- while (cur < w.size()) {
- const size_t len = UTF8Len(w[cur]);
- letters.push_back(TD::Convert(w.substr(cur, len)));
- if (letset) letset->insert(letters.back());
- cur += len;
- }
- }
-}
-
-void Debug(const AlignedSentencePair& asp) {
- cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
- Array2D<bool> a(asp.src.size(), asp.trg.size());
- for (unsigned j = 0; j < asp.trg.size(); ++j)
- if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
- cerr << a << endl;
-}
-
-void AddSample(AlignedSentencePair* asp) {
- for (unsigned j = 0; j < asp->trg.size(); ++j)
- asp->posterior(asp->a[j].src_index, j)++;
-}
-
-void WriteAlignments(const AlignedSentencePair& asp) {
- bool first = true;
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- int src_index = -1;
- int mc = -1;
- for (unsigned i = 0; i <= asp.src.size(); ++i) {
- if (asp.posterior(i, j) > mc) {
- mc = asp.posterior(i, j);
- src_index = i;
- }
- }
-
- if (src_index) {
- if (first) first = false; else cout << ' ';
- cout << (src_index - 1) << '-' << j;
- }
- }
- cout << endl;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
-// MT19937& rng = *prng;
-
- vector<vector<int> > corpuse, corpusf;
- set<int> vocabe, vocabf;
- corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
- cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
- assert(corpusf.size() == corpuse.size());
-
- vector<AlignedSentencePair> corpus(corpuse.size());
- for (unsigned i = 0; i < corpuse.size(); ++i) {
- corpus[i].src.swap(corpusf[i]);
- corpus[i].trg.swap(corpuse[i]);
- corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
- }
- corpusf.clear(); corpuse.clear();
-
- vocabf.insert(TD::Convert("NULL"));
- vector<vector<WordID> > letters(TD::NumWords() + 1);
- set<WordID> letset;
- ExtractLetters(vocabe, &letters, &letset);
- ExtractLetters(vocabf, &letters, NULL);
- letters[TD::Convert("NULL")].clear();
-
- // TODO configure this
- const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
- const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
- const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>();
- const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>());
- Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be);
-
- cerr << "Initializing transliteration graph structures ...\n";
- for (int i = 0; i < corpus.size(); ++i) {
- const vector<int>& src = corpus[i].src;
- const vector<int>& trg = corpus[i].trg;
- for (int j = 0; j < src.size(); ++j) {
- const vector<int>& src_let = letters[src[j]];
- for (int k = 0; k < trg.size(); ++k) {
- const vector<int>& trg_let = letters[trg[k]];
- tl.Initialize(src[j], src_let, trg[k], trg_let);
- //if (src_let.size() < min_trans_src)
- // tl.Forbid(src[j], src_let, trg[k], trg_let);
- }
- }
- }
- cerr << endl;
- tl.GraphSummary();
-
- return 0;
-}
diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc
deleted file mode 100644
index b92629fd..00000000
--- a/gi/pf/backward.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "backward.h"
-
-#include <queue>
-#include <utility>
-
-#include "array2d.h"
-#include "reachability.h"
-#include "base_distributions.h"
-
-using namespace std;
-
-BackwardEstimator::BackwardEstimator(const string& s2t,
- const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {}
-
-BackwardEstimator::~BackwardEstimator() {
- delete m1; m1 = NULL;
- delete m1inv; m1inv = NULL;
-}
-
-float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src,
- const std::vector<WordID>& trg,
- unsigned src_covered,
- unsigned trg_covered,
- double s2t_ratio) const {
- if (src_covered == src.size() || trg_covered == trg.size()) {
- assert(src_covered == src.size());
- assert(trg_covered == trg.size());
- return 0;
- }
- static const WordID kNULL = TD::Convert("<eps>");
- const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1));
- // TODO factor in expected length ratio
- prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining)
- for (unsigned j = trg_covered; j < trg.size(); ++j) {
- prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12);
- for (unsigned i = src_covered; i < src.size(); ++i)
- p += (*m1)(src[i], trg[j]);
- if (p.is_0()) {
- cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n";
- assert(!"failed");
- }
- p *= uniform_alignment;
- e *= p;
- }
- // TODO factor in expected length ratio
- const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0));
- prob_t inv;
- inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio));
- for (unsigned i = src_covered; i < src.size(); ++i) {
- prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12);
- for (unsigned j = trg_covered; j < trg.size(); ++j)
- p += (*m1inv)(trg[j], src[i]);
- if (p.is_0()) {
- cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n";
- assert(!"failed");
- }
- p *= inv_uniform;
- inv *= p;
- }
- return (log(e) + log(inv)) / 2;
-}
-
-void BackwardEstimator::InitializeGrid(const vector<WordID>& src,
- const vector<WordID>& trg,
- const Reachability& r,
- double s2t_ratio,
- float* grid) const {
- queue<pair<int,int> > q;
- q.push(make_pair(0,0));
- Array2D<bool> done(src.size()+1, trg.size()+1, false);
- //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl;
- while(!q.empty()) {
- const pair<int,int> n = q.front();
- q.pop();
- if (done(n.first,n.second)) continue;
- done(n.first,n.second) = true;
-
- float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio);
- if (n.first == 0 && n.second == 0) grid[0] = lp;
- //cerr << " " << n.first << "," << n.second << "\t" << lp << endl;
-
- if (n.first == src.size() || n.second == trg.size()) continue;
- const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second];
- for (int i = 0; i < edges.size(); ++i)
- q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second));
- }
- //static int cc = 0; ++cc; if (cc == 80) exit(1);
-}
-
diff --git a/gi/pf/backward.h b/gi/pf/backward.h
deleted file mode 100644
index e67eff0c..00000000
--- a/gi/pf/backward.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _BACKWARD_H_
-#define _BACKWARD_H_
-
-#include <vector>
-#include <string>
-#include "wordid.h"
-
-struct Reachability;
-struct Model1;
-
-struct BackwardEstimator {
- BackwardEstimator(const std::string& s2t,
- const std::string& t2s);
- ~BackwardEstimator();
-
- void InitializeGrid(const std::vector<WordID>& src,
- const std::vector<WordID>& trg,
- const Reachability& r,
- double src2trg_ratio,
- float* grid) const;
-
- private:
- float ComputeBackwardProb(const std::vector<WordID>& src,
- const std::vector<WordID>& trg,
- unsigned src_covered,
- unsigned trg_covered,
- double src2trg_ratio) const;
-
- Model1* m1;
- Model1* m1inv;
-};
-
-#endif
diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc
deleted file mode 100644
index 57e0bbe1..00000000
--- a/gi/pf/base_distributions.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-#include "base_distributions.h"
-
-#include <iostream>
-
-#include "filelib.h"
-
-using namespace std;
-
-TableLookupBase::TableLookupBase(const string& fname) {
- cerr << "TableLookupBase reading from " << fname << " ..." << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- vector<int> le, lf;
- TRule x;
- x.lhs_ = -TD::Convert("X");
- bool flag = false;
- while(getline(in, line)) {
- ++lc;
- if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
- else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- x.f_.clear();
- x.e_.clear();
- size_t pos = 0;
- int cc = 0;
- while(pos < tmp.size()) {
- const WordID cur = tmp[pos++];
- if (cur == kDIV) {
- ++cc;
- } else if (cc == 0) {
- x.f_.push_back(cur);
- } else if (cc == 1) {
- x.e_.push_back(cur);
- } else if (cc == 2) {
- table[x].logeq(atof(TD::Convert(cur).c_str()));
- ++cc;
- } else {
- if (flag) cerr << endl;
- cerr << "Bad format in " << lc << ": " << line << endl; abort();
- }
- }
- if (cc != 3) {
- if (flag) cerr << endl;
- cerr << "Bad format in " << lc << ": " << line << endl; abort();
- }
- }
- if (flag) cerr << endl;
- cerr << " read " << lc << " entries\n";
-}
-
-prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
- const vector<WordID>& vtrg,
- int start_src, int start_trg) const {
- const int flen = vsrc.size() - start_src;
- const int elen = vtrg.size() - start_trg;
- prob_t p;
- p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
- //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01)
- for (int i = 0; i < elen; ++i)
- p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform
- return p;
-}
-
-prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
- const vector<WordID>& vtrg,
- int start_src, int start_trg) const {
- const int flen = vsrc.size() - start_src;
- const int elen = vtrg.size() - start_trg;
- prob_t p;
- //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
- p.logeq(Md::log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01)
- for (int i = 0; i < elen; ++i)
- p *= kUNIFORM_TARGET; // draw e_i ~Uniform
- return p;
-}
-
-void Model1::LoadModel1(const string& fname) {
- cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- while(getline(in, line)) {
- ++lc;
- int cur = 0;
- int start = 0;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- const WordID src = TD::Convert(&line[0]);
- ++cur;
- start = cur;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- WordID trg = TD::Convert(&line[start]);
- const double logprob = strtod(&line[cur + 1], NULL);
- if (src >= ttable.size()) ttable.resize(src + 1);
- ttable[src][trg].logeq(logprob);
- }
- cerr << " read " << lc << " parameters.\n";
-}
-
-prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
- const vector<WordID>& vtrg,
- int start_src, int start_trg) const {
- const int flen = vsrc.size() - start_src;
- const int elen = vtrg.size() - start_trg;
- prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
- prob_t p;
- p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
- for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
- const WordID trg = vtrg[i + start_trg];
- prob_t tp = prob_t::Zero();
- for (int j = -1; j < flen; ++j) {
- const WordID src = j < 0 ? 0 : vsrc[j + start_src];
- tp += kM1MIXTURE * model1(src, trg);
- tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
- }
- tp *= uniform_src_alignment; // draw a_i ~uniform
- p *= tp; // draw e_i ~Model1(f_a_i) / uniform
- }
- if (p.is_0()) {
- cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
- abort();
- }
- return p;
-}
-
-prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
- const vector<WordID>& vtrg,
- int start_src, int start_trg) const {
- const int flen = vsrc.size() - start_src;
- const int elen = vtrg.size() - start_trg;
- prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
- prob_t p;
- p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1)
- // elen | flen ~Pois(flen + 0.01)
- prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));
- p *= ptrglen;
- p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform
- for (int i = 0; i < elen; ++i) { // for each position i in E
- const WordID trg = vtrg[i + start_trg];
- prob_t tp = prob_t::Zero();
- for (int j = -1; j < flen; ++j) {
- const WordID src = j < 0 ? 0 : vsrc[j + start_src];
- tp += kM1MIXTURE * model1(src, trg);
- tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
- }
- tp *= uniform_src_alignment; // draw a_i ~uniform
- p *= tp; // draw e_i ~Model1(f_a_i) / uniform
- }
- if (p.is_0()) {
- cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
- abort();
- }
- return p;
-}
-
-prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
- const vector<WordID>& vtrg,
- int start_src, int start_trg) const {
- const int flen = vsrc.size() - start_src;
- const int elen = vtrg.size() - start_trg;
- prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
- prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));
-
- prob_t p1;
- p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1)
- // elen | flen ~Pois(flen + 0.01)
- prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));
- p1 *= ptrglen;
- p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform
- for (int i = 0; i < elen; ++i) { // for each position i in E
- const WordID trg = vtrg[i + start_trg];
- prob_t tp = prob_t::Zero();
- for (int j = -1; j < flen; ++j) {
- const WordID src = j < 0 ? 0 : vsrc[j + start_src];
- tp += kM1MIXTURE * model1(src, trg);
- tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
- }
- tp *= uniform_src_alignment; // draw a_i ~uniform
- p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform
- }
- if (p1.is_0()) {
- cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
- abort();
- }
-
- prob_t p2;
- p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1)
- // flen | elen ~Pois(flen + 0.01)
- prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01));
- p2 *= psrclen;
- p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform
- for (int i = 0; i < flen; ++i) { // for each position i in E
- const WordID src = vsrc[i + start_src];
- prob_t tp = prob_t::Zero();
- for (int j = -1; j < elen; ++j) {
- const WordID trg = j < 0 ? 0 : vtrg[j + start_trg];
- tp += kM1MIXTURE * invmodel1(trg, src);
- tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE;
- }
- tp *= uniform_trg_alignment; // draw a_i ~uniform
- p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform
- }
- if (p2.is_0()) {
- cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
- abort();
- }
-
- static const prob_t kHALF(0.5);
- return (p1 + p2) * kHALF;
-}
-
-JumpBase::JumpBase() : p(200) {
- for (unsigned src_len = 1; src_len < 200; ++src_len) {
- map<int, prob_t>& cpd = p[src_len];
- int min_jump = 1 - src_len;
- int max_jump = src_len;
- prob_t z;
- for (int j = min_jump; j <= max_jump; ++j) {
- prob_t& cp = cpd[j];
- if (j < 0)
- cp.logeq(Md::log_poisson(1.5-j, 1));
- else if (j > 0)
- cp.logeq(Md::log_poisson(j, 1));
- cp.poweq(0.2);
- z += cp;
- }
- for (int j = min_jump; j <= max_jump; ++j) {
- cpd[j] /= z;
- }
- }
-}
-
diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h
deleted file mode 100644
index 41b513f8..00000000
--- a/gi/pf/base_distributions.h
+++ /dev/null
@@ -1,238 +0,0 @@
-#ifndef _BASE_MEASURES_H_
-#define _BASE_MEASURES_H_
-
-#include <vector>
-#include <map>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <cassert>
-
-#include "unigrams.h"
-#include "trule.h"
-#include "prob.h"
-#include "tdict.h"
-#include "sampler.h"
-#include "m.h"
-#include "os_phrase.h"
-
-struct Model1 {
- explicit Model1(const std::string& fname) :
- kNULL(TD::Convert("<eps>")),
- kZERO() {
- LoadModel1(fname);
- }
-
- void LoadModel1(const std::string& fname);
-
- // returns prob 0 if src or trg is not found
- const prob_t& operator()(WordID src, WordID trg) const {
- if (src == 0) src = kNULL;
- if (src < ttable.size()) {
- const std::map<WordID, prob_t>& cpd = ttable[src];
- const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg);
- if (it != cpd.end())
- return it->second;
- }
- return kZERO;
- }
-
- const WordID kNULL;
- const prob_t kZERO;
- std::vector<std::map<WordID, prob_t> > ttable;
-};
-
-struct PoissonUniformUninformativeBase {
- explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
- prob_t operator()(const TRule& r) const {
- prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0));
- prob_t q = kUNIFORM; q.poweq(r.e_.size());
- p *= q;
- return p;
- }
- void Summary() const {}
- void ResampleHyperparameters(MT19937*) {}
- void Increment(const TRule&) {}
- void Decrement(const TRule&) {}
- prob_t Likelihood() const { return prob_t::One(); }
- const prob_t kUNIFORM;
-};
-
-struct CompletelyUniformBase {
- explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
- prob_t operator()(const TRule&) const {
- return kUNIFORM;
- }
- void Summary() const {}
- void ResampleHyperparameters(MT19937*) {}
- void Increment(const TRule&) {}
- void Decrement(const TRule&) {}
- prob_t Likelihood() const { return prob_t::One(); }
- const prob_t kUNIFORM;
-};
-
-struct UnigramWordBase {
- explicit UnigramWordBase(const std::string& fname) : un(fname) {}
- prob_t operator()(const TRule& r) const {
- return un(r.e_);
- }
- const UnigramWordModel un;
-};
-
-struct RuleHasher {
- size_t operator()(const TRule& r) const {
- return hash_value(r);
- }
-};
-
-struct TableLookupBase {
- TableLookupBase(const std::string& fname);
-
- prob_t operator()(const TRule& rule) const {
- const std::tr1::unordered_map<TRule,prob_t,RuleHasher>::const_iterator it = table.find(rule);
- if (it == table.end()) {
- std::cerr << rule << " not found\n";
- abort();
- }
- return it->second;
- }
-
- void ResampleHyperparameters(MT19937*) {}
- void Increment(const TRule&) {}
- void Decrement(const TRule&) {}
- prob_t Likelihood() const { return prob_t::One(); }
- void Summary() const {}
-
- std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
-};
-
-struct PhraseConditionalUninformativeBase {
- explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
- kUNIFORM_TARGET(1.0 / vocab_e_size) {
- assert(vocab_e_size > 0);
- }
-
- // return p0 of rule.e_ | rule.f_
- prob_t operator()(const TRule& rule) const {
- return p0(rule.f_, rule.e_, 0, 0);
- }
-
- prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
- void Summary() const {}
- void ResampleHyperparameters(MT19937*) {}
- void Increment(const TRule&) {}
- void Decrement(const TRule&) {}
- prob_t Likelihood() const { return prob_t::One(); }
- const prob_t kUNIFORM_TARGET;
-};
-
-struct PhraseConditionalUninformativeUnigramBase {
- explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}
-
- // return p0 of rule.e_ | rule.f_
- prob_t operator()(const TRule& rule) const {
- return p0(rule.f_, rule.e_, 0, 0);
- }
-
- prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
- const UnigramModel u;
-};
-
-struct PhraseConditionalBase {
- explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
- model1(m1),
- kM1MIXTURE(m1mixture),
- kUNIFORM_MIXTURE(1.0 - m1mixture),
- kUNIFORM_TARGET(1.0 / vocab_e_size) {
- assert(m1mixture >= 0.0 && m1mixture <= 1.0);
- assert(vocab_e_size > 0);
- }
-
- // return p0 of rule.e_ | rule.f_
- prob_t operator()(const TRule& rule) const {
- return p0(rule.f_, rule.e_, 0, 0);
- }
-
- prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
- const Model1& model1;
- const prob_t kM1MIXTURE; // Model 1 mixture component
- const prob_t kUNIFORM_MIXTURE; // uniform mixture component
- const prob_t kUNIFORM_TARGET;
-};
-
-struct PhraseJointBase {
- explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) :
- model1(m1),
- kM1MIXTURE(m1mixture),
- kUNIFORM_MIXTURE(1.0 - m1mixture),
- kUNIFORM_SOURCE(1.0 / vocab_f_size),
- kUNIFORM_TARGET(1.0 / vocab_e_size) {
- assert(m1mixture >= 0.0 && m1mixture <= 1.0);
- assert(vocab_e_size > 0);
- }
-
- // return p0 of rule.e_ , rule.f_
- prob_t operator()(const TRule& rule) const {
- return p0(rule.f_, rule.e_, 0, 0);
- }
-
- prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
- const Model1& model1;
- const prob_t kM1MIXTURE; // Model 1 mixture component
- const prob_t kUNIFORM_MIXTURE; // uniform mixture component
- const prob_t kUNIFORM_SOURCE;
- const prob_t kUNIFORM_TARGET;
-};
-
-struct PhraseJointBase_BiDir {
- explicit PhraseJointBase_BiDir(const Model1& m1,
- const Model1& im1,
- const double m1mixture,
- const unsigned vocab_e_size,
- const unsigned vocab_f_size) :
- model1(m1),
- invmodel1(im1),
- kM1MIXTURE(m1mixture),
- kUNIFORM_MIXTURE(1.0 - m1mixture),
- kUNIFORM_SOURCE(1.0 / vocab_f_size),
- kUNIFORM_TARGET(1.0 / vocab_e_size) {
- assert(m1mixture >= 0.0 && m1mixture <= 1.0);
- assert(vocab_e_size > 0);
- }
-
- // return p0 of rule.e_ , rule.f_
- prob_t operator()(const TRule& rule) const {
- return p0(rule.f_, rule.e_, 0, 0);
- }
-
- prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
- const Model1& model1;
- const Model1& invmodel1;
- const prob_t kM1MIXTURE; // Model 1 mixture component
- const prob_t kUNIFORM_MIXTURE; // uniform mixture component
- const prob_t kUNIFORM_SOURCE;
- const prob_t kUNIFORM_TARGET;
-};
-
-// base distribution for jump size multinomials
-// basically p(0) = 0 and then, p(1) is max, and then
-// you drop as you move to the max jump distance
-struct JumpBase {
- JumpBase();
-
- const prob_t& operator()(int jump, unsigned src_len) const {
- assert(jump != 0);
- const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump);
- assert(it != p[src_len].end());
- return it->second;
- }
- std::vector<std::map<int, prob_t> > p;
-};
-
-
-#endif
diff --git a/gi/pf/bayes_lattice_score.cc b/gi/pf/bayes_lattice_score.cc
deleted file mode 100644
index 70cb8dc2..00000000
--- a/gi/pf/bayes_lattice_score.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include <iostream>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "inside_outside.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "bottom_up_parser.h"
-#include "fdict.h"
-#include "grammar.h"
-#include "m.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-unsigned ReadCorpus(const string& filename,
- vector<Lattice>* e,
- set<WordID>* vocab_e) {
- e->clear();
- vocab_e->clear();
- ReadFile rf(filename);
- istream* in = rf.stream();
- assert(*in);
- string line;
- unsigned toks = 0;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(Lattice());
- Lattice& le = e->back();
- LatticeTools::ConvertTextOrPLF(line, & le);
- for (unsigned i = 0; i < le.size(); ++i)
- for (unsigned j = 0; j < le[i].size(); ++j)
- vocab_e->insert(le[i][j].label);
- toks += le.size();
- }
- return toks;
-}
-
-struct BaseModel {
- explicit BaseModel(unsigned tc) :
- unif(1.0 / tc), p(prob_t::One()) {}
- prob_t prob(const TRule& r) const {
- return unif;
- }
- void increment(const TRule& r, MT19937* rng) {
- p *= prob(r);
- }
- void decrement(const TRule& r, MT19937* rng) {
- p /= prob(r);
- }
- prob_t Likelihood() const {
- return p;
- }
- const prob_t unif;
- prob_t p;
-};
-
-struct UnigramModel {
- explicit UnigramModel(unsigned tc) : base(tc), crp(1,1,1,1), glue(1,1,1,1) {}
- BaseModel base;
- CCRP<TRule> crp;
- CCRP<TRule> glue;
-
- prob_t Prob(const TRule& r) const {
- if (r.Arity() != 0) {
- return glue.prob(r, prob_t(0.5));
- }
- return crp.prob(r, base.prob(r));
- }
-
- int Increment(const TRule& r, MT19937* rng) {
- if (r.Arity() != 0) {
- glue.increment(r, 0.5, rng);
- return 0;
- } else {
- if (crp.increment(r, base.prob(r), rng)) {
- base.increment(r, rng);
- return 1;
- }
- return 0;
- }
- }
-
- int Decrement(const TRule& r, MT19937* rng) {
- if (r.Arity() != 0) {
- glue.decrement(r, rng);
- return 0;
- } else {
- if (crp.decrement(r, rng)) {
- base.decrement(r, rng);
- return -1;
- }
- return 0;
- }
- }
-
- prob_t Likelihood() const {
- prob_t p;
- p.logeq(crp.log_crp_prob() + glue.log_crp_prob());
- p *= base.Likelihood();
- return p;
- }
-
- void ResampleHyperparameters(MT19937* rng) {
- crp.resample_hyperparameters(rng);
- glue.resample_hyperparameters(rng);
- cerr << " d=" << crp.discount() << ", s=" << crp.strength() << "\t STOP d=" << glue.discount() << ", s=" << glue.strength() << endl;
- }
-};
-
-UnigramModel* plm;
-
-void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) {
- vector<prob_t> node_probs;
- Inside<prob_t, EdgeProb>(hg, &node_probs);
- queue<unsigned> q;
- q.push(hg.nodes_.size() - 2);
- while(!q.empty()) {
- unsigned cur_node_id = q.front();
-// cerr << "NODE=" << cur_node_id << endl;
- q.pop();
- const Hypergraph::Node& node = hg.nodes_[cur_node_id];
- const unsigned num_in_edges = node.in_edges_.size();
- unsigned sampled_edge = 0;
- if (num_in_edges == 1) {
- sampled_edge = node.in_edges_[0];
- } else {
- //prob_t z;
- assert(num_in_edges > 1);
- SampleSet<prob_t> ss;
- for (unsigned j = 0; j < num_in_edges; ++j) {
- const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
- prob_t p = edge.edge_prob_;
- for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
- p *= node_probs[edge.tail_nodes_[k]];
- ss.add(p);
-// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
- //z += p;
- }
-// for (unsigned j = 0; j < num_in_edges; ++j) {
-// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
-// }
-// cerr << " --- \n";
- sampled_edge = node.in_edges_[rng->SelectSample(ss)];
- }
- sampled_deriv->push_back(sampled_edge);
- const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
- for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
- q.push(edge.tail_nodes_[j]);
- }
- }
-// for (unsigned i = 0; i < sampled_deriv->size(); ++i) {
-// cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
-// }
-}
-
-void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, UnigramModel* plm, MT19937* rng) {
- for (unsigned i = 0; i < d.size(); ++i)
- plm->Increment(*hg.edges_[d[i]].rule_, rng);
-}
-
-void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, UnigramModel* plm, MT19937* rng) {
- for (unsigned i = 0; i < d.size(); ++i)
- plm->Decrement(*hg.edges_[d[i]].rule_, rng);
-}
-
-prob_t TotalProb(const Hypergraph& hg) {
- return Inside<prob_t, EdgeProb>(hg);
-}
-
-void IncrementLatticePath(const Hypergraph& hg, const vector<unsigned>& d, Lattice* pl) {
- Lattice& lat = *pl;
- for (int i = 0; i < d.size(); ++i) {
- const Hypergraph::Edge& edge = hg.edges_[d[i]];
- if (edge.rule_->Arity() != 0) continue;
- WordID sym = edge.rule_->e_[0];
- vector<LatticeArc>& las = lat[edge.i_];
- int dist = edge.j_ - edge.i_;
- assert(dist > 0);
- for (int j = 0; j < las.size(); ++j) {
- if (las[j].dist2next == dist &&
- las[j].label == sym) {
- las[j].cost += 1;
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
-
- InitCommandLine(argc, argv, &conf);
- vector<GrammarPtr> grammars(2);
- grammars[0].reset(new GlueGrammar("S","X"));
- const unsigned samples = conf["samples"].as<unsigned>();
-
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
- vector<Lattice> corpuse;
- set<WordID> vocabe;
- cerr << "Reading corpus...\n";
- const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
- cerr << "E-corpus size: " << corpuse.size() << " lattices\t (" << vocabe.size() << " word types)\n";
- UnigramModel lm(vocabe.size());
- vector<Hypergraph> hgs(corpuse.size());
- vector<vector<unsigned> > derivs(corpuse.size());
- for (int i = 0; i < corpuse.size(); ++i) {
- grammars[1].reset(new PassThroughGrammar(corpuse[i], "X"));
- ExhaustiveBottomUpParser parser("S", grammars);
- bool res = parser.Parse(corpuse[i], &hgs[i]); // exhaustive parse
- assert(res);
- }
-
- double csamples = 0;
- for (int SS=0; SS < samples; ++SS) {
- const bool is_last = ((samples - 1) == SS);
- prob_t dlh = prob_t::One();
- bool record_sample = (SS > (samples * 1 / 3) && (SS % 5 == 3));
- if (record_sample) csamples++;
- for (int ci = 0; ci < corpuse.size(); ++ci) {
- Lattice& lat = corpuse[ci];
- Hypergraph& hg = hgs[ci];
- vector<unsigned>& d = derivs[ci];
- if (!is_last) DecrementDerivation(hg, d, &lm, &rng);
- for (unsigned i = 0; i < hg.edges_.size(); ++i) {
- TRule& r = *hg.edges_[i].rule_;
- if (r.Arity() != 0)
- hg.edges_[i].edge_prob_ = prob_t::One();
- else
- hg.edges_[i].edge_prob_ = lm.Prob(r);
- }
- if (!is_last) {
- d.clear();
- SampleDerivation(hg, &rng, &d);
- IncrementDerivation(hg, derivs[ci], &lm, &rng);
- } else {
- prob_t p = TotalProb(hg);
- dlh *= p;
- cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl;
- }
- if (record_sample) IncrementLatticePath(hg, derivs[ci], &lat);
- }
- double llh = log(lm.Likelihood());
- cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl;
- if (SS % 10 == 9) lm.ResampleHyperparameters(&rng);
- if (is_last) {
- double z = log(dlh);
- cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl;
- }
- }
- cerr << lm.crp << endl;
- cerr << lm.glue << endl;
- for (int i = 0; i < corpuse.size(); ++i) {
- for (int j = 0; j < corpuse[i].size(); ++j)
- for (int k = 0; k < corpuse[i][j].size(); ++k) {
- corpuse[i][j][k].cost /= csamples;
- corpuse[i][j][k].cost += 1e-3;
- corpuse[i][j][k].cost = log(corpuse[i][j][k].cost);
- }
- cout << HypergraphIO::AsPLF(corpuse[i]) << endl;
- }
- return 0;
-}
-
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
deleted file mode 100644
index 832f22cf..00000000
--- a/gi/pf/brat.cc
+++ /dev/null
@@ -1,543 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "viterbi.h"
-#include "hg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "cfg_wfst_composer.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-static unsigned kMAX_SRC_PHRASE;
-static unsigned kMAX_TRG_PHRASE;
-struct FSTState;
-
-double log_poisson(unsigned x, const double& lambda) {
- assert(lambda > 0.0);
- return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-struct ConditionalBase {
- explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) :
- kM1MIXTURE(m1mixture),
- kUNIFORM_MIXTURE(1.0 - m1mixture),
- kUNIFORM_TARGET(1.0 / vocab_e_size),
- kNULL(TD::Convert("<eps>")) {
- assert(m1mixture >= 0.0 && m1mixture <= 1.0);
- assert(vocab_e_size > 0);
- LoadModel1(model1fname);
- }
-
- void LoadModel1(const string& fname) {
- cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- while(getline(in, line)) {
- ++lc;
- int cur = 0;
- int start = 0;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- const WordID src = TD::Convert(&line[0]);
- ++cur;
- start = cur;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- WordID trg = TD::Convert(&line[start]);
- const double logprob = strtod(&line[cur + 1], NULL);
- if (src >= ttable.size()) ttable.resize(src + 1);
- ttable[src][trg].logeq(logprob);
- }
- cerr << " read " << lc << " parameters.\n";
- }
-
- // return logp0 of rule.e_ | rule.f_
- prob_t operator()(const TRule& rule) const {
- const int flen = rule.f_.size();
- const int elen = rule.e_.size();
- prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
- prob_t p;
- p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
- for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
- const WordID trg = rule.e_[i];
- prob_t tp = prob_t::Zero();
- for (int j = -1; j < flen; ++j) {
- const WordID src = j < 0 ? kNULL : rule.f_[j];
- const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg);
- if (it != ttable[src].end()) {
- tp += kM1MIXTURE * it->second;
- }
- tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
- }
- tp *= uniform_src_alignment; // draw a_i ~uniform
- p *= tp; // draw e_i ~Model1(f_a_i) / uniform
- }
- return p;
- }
-
- const prob_t kM1MIXTURE; // Model 1 mixture component
- const prob_t kUNIFORM_MIXTURE; // uniform mixture component
- const prob_t kUNIFORM_TARGET;
- const WordID kNULL;
- vector<map<WordID, prob_t> > ttable;
-};
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<int> >* e,
- set<int>* vocab_f,
- set<int>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- istream* in;
- if (filename == "-")
- in = &cin;
- else
- in = new ifstream(filename.c_str());
- assert(*in);
- string line;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) { isf = false; } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- assert(cur != kDIV);
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
- if (in != &cin) delete in;
-}
-
-struct UniphraseLM {
- UniphraseLM(const vector<vector<int> >& corpus,
- const set<int>& vocab,
- const po::variables_map& conf) :
- phrases_(1,1),
- gen_(1,1),
- corpus_(corpus),
- uniform_word_(1.0 / vocab.size()),
- gen_p0_(0.5),
- p_end_(0.5),
- use_poisson_(conf.count("poisson_length") > 0) {}
-
- void ResampleHyperparameters(MT19937* rng) {
- phrases_.resample_hyperparameters(rng);
- gen_.resample_hyperparameters(rng);
- cerr << " " << phrases_.alpha();
- }
-
- CCRP_NoTable<vector<int> > phrases_;
- CCRP_NoTable<bool> gen_;
-  vector<vector<bool> > z_; // z_[i][j] is true iff there is a phrase boundary after the jth word of sentence i
- const vector<vector<int> >& corpus_;
- const double uniform_word_;
- const double gen_p0_;
- const double p_end_; // in base length distribution, p of the end of a phrase
- const bool use_poisson_;
-};
-
-struct Reachability {
-  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
- boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
-
- Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
- edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
- max_src_delta(boost::extents[srclen][trglen]) {
- ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
- }
-
- private:
- struct SState {
- SState() : prev_src_covered(), prev_trg_covered() {}
- SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
- int prev_src_covered;
- int prev_trg_covered;
- };
-
- struct NState {
- NState() : next_src_covered(), next_trg_covered() {}
- NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {}
- int next_src_covered;
- int next_trg_covered;
- };
-
- void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
- typedef boost::multi_array<vector<SState>, 2> array_type;
- array_type a(boost::extents[srclen + 1][trglen + 1]);
- a[0][0].push_back(SState());
- for (int i = 0; i < srclen; ++i) {
- for (int j = 0; j < trglen; ++j) {
- if (a[i][j].size() == 0) continue;
- const SState prev(i,j);
- for (int k = 1; k <= src_max_phrase_len; ++k) {
- if ((i + k) > srclen) continue;
- for (int l = 1; l <= trg_max_phrase_len; ++l) {
- if ((j + l) > trglen) continue;
- a[i + k][j + l].push_back(prev);
- }
- }
- }
- }
- a[0][0].clear();
- cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
- assert(a[srclen][trglen].size() > 0);
-
- typedef boost::multi_array<bool, 2> rarray_type;
- rarray_type r(boost::extents[srclen + 1][trglen + 1]);
-// typedef boost::multi_array<vector<NState>, 2> narray_type;
-// narray_type b(boost::extents[srclen + 1][trglen + 1]);
- r[srclen][trglen] = true;
- for (int i = srclen; i >= 0; --i) {
- for (int j = trglen; j >= 0; --j) {
- vector<SState>& prevs = a[i][j];
- if (!r[i][j]) { prevs.clear(); }
-// const NState nstate(i,j);
- for (int k = 0; k < prevs.size(); ++k) {
- r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
- int src_delta = i - prevs[k].prev_src_covered;
- edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
- short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
- if (src_delta > msd) msd = src_delta;
-// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate);
- }
- }
- }
- assert(!edges[0][0][1][0]);
- assert(!edges[0][0][0][1]);
- assert(!edges[0][0][0][0]);
- cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl;
- assert(max_src_delta[0][0] > 0);
- //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
- //for (int i = 0; i < b[0][0].size(); ++i) {
- // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
- //}
- }
-};
-
-ostream& operator<<(ostream& os, const FSTState& q);
-struct FSTState {
- explicit FSTState(int src_size) :
- trg_covered_(),
- src_covered_(),
- src_coverage_(src_size) {}
-
- FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) :
- trg_covered_(trg_covered),
- src_covered_(src_covered),
- src_coverage_(src_coverage),
- src_prefix_(src_prefix) {
- if (src_coverage_.size() == src_covered) {
- assert(src_prefix.size() == 0);
- }
- }
-
- // if we extend by the word at src_position, what are
- // the next states that are reachable and lie on a valid
- // path to the final state?
- vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const {
- assert(src_position < src_coverage_.size());
- if (src_coverage_[src_position]) {
- cerr << "Trying to extend " << *this << " with position " << src_position << endl;
- abort();
- }
- vector<bool> ncvg = src_coverage_;
- ncvg[src_position] = true;
-
- vector<FSTState> res;
- const int trg_remaining = trg_len - trg_covered_;
- if (trg_remaining <= 0) {
- cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl;
- abort();
- }
- const int src_remaining = src_len - src_covered_;
- if (src_remaining <= 0) {
- cerr << "Source appears to have been covered: " << *this << endl;
- abort();
- }
-
- for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) {
- if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) {
- int nc = src_prefix_.size() + 1 + src_covered_;
- res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>()));
- }
- }
-
- if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) {
- vector<short> nsp = src_prefix_;
- nsp.push_back(src_position);
- res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp));
- }
-
- if (res.size() == 0) {
- cerr << *this << " can't be extended!\n";
- abort();
- }
- return res;
- }
-
- short trg_covered_, src_covered_;
- vector<bool> src_coverage_;
- vector<short> src_prefix_;
-};
-bool operator<(const FSTState& q, const FSTState& r) {
- if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_;
- if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_;
- if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_;
- return q.src_prefix_ < r.src_prefix_;
-}
-
-ostream& operator<<(ostream& os, const FSTState& q) {
- os << "[" << q.trg_covered_ << " : ";
- for (int i = 0; i < q.src_coverage_.size(); ++i)
- os << q.src_coverage_[i];
- os << " : <";
- for (int i = 0; i < q.src_prefix_.size(); ++i) {
- if (i != 0) os << ' ';
- os << q.src_prefix_[i];
- }
- return os << ">]";
-}
-
-struct MyModel {
- MyModel(ConditionalBase& rcp0) : rp0(rcp0) {}
- typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap;
-
- void DecrementRule(const TRule& rule) {
- SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
- assert(it != rules.end());
- it->second.decrement(rule);
- if (it->second.num_customers() == 0) rules.erase(it);
- }
-
- void IncrementRule(const TRule& rule) {
- SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
- if (it == rules.end()) {
- CCRP_NoTable<TRule> crp(1,1);
- it = rules.insert(make_pair(rule.f_, crp)).first;
- }
- it->second.increment(rule);
- }
-
- // conditioned on rule.f_
- prob_t RuleConditionalProbability(const TRule& rule) const {
- const prob_t base = rp0(rule);
- SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_);
- if (it == rules.end()) {
- return base;
- } else {
- const double lp = it->second.logprob(rule, log(base));
- prob_t q; q.logeq(lp);
- return q;
- }
- }
-
- const ConditionalBase& rp0;
- SrcToRuleCRPMap rules;
-};
-
-struct MyFST : public WFST {
- MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) :
- src(ssrc), trg(strg),
- r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE),
- model(m) {
- FSTState in(src.size());
- cerr << " INIT: " << in << endl;
- init = GetNode(in);
- for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true;
- in.src_covered_ = src.size();
- in.trg_covered_ = trg.size();
- cerr << "FINAL: " << in << endl;
- final = GetNode(in);
- }
- virtual const WFSTNode* Final() const;
- virtual const WFSTNode* Initial() const;
-
- const WFSTNode* GetNode(const FSTState& q);
- map<FSTState, boost::shared_ptr<WFSTNode> > m;
- const vector<WordID>& src;
- const vector<WordID>& trg;
- Reachability r;
- const WFSTNode* init;
- const WFSTNode* final;
- MyModel* model;
-};
-
-struct MyNode : public WFSTNode {
- MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {}
- virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const;
- const FSTState state;
- mutable MyFST* container;
-};
-
-vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const {
- cerr << "EXTEND " << state << " with " << srcindex << endl;
- vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r);
- vector<pair<const WFSTNode*,TRulePtr> > res(ext.size());
- for (unsigned i = 0; i < ext.size(); ++i) {
- res[i].first = container->GetNode(ext[i]);
- if (ext[i].src_prefix_.size() == 0) {
- const unsigned trg_from = state.trg_covered_;
- const unsigned trg_to = ext[i].trg_covered_;
- const unsigned prev_prfx_size = state.src_prefix_.size();
- res[i].second.reset(new TRule);
- res[i].second->lhs_ = -TD::Convert("X");
- vector<WordID>& src = res[i].second->f_;
- vector<WordID>& trg = res[i].second->e_;
- src.resize(prev_prfx_size + 1);
- for (unsigned j = 0; j < prev_prfx_size; ++j)
- src[j] = container->src[state.src_prefix_[j]];
- src[prev_prfx_size] = container->src[srcindex];
- for (unsigned j = trg_from; j < trg_to; ++j)
- trg.push_back(container->trg[j]);
- res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second)));
- }
- }
- return res;
-}
-
-const WFSTNode* MyFST::GetNode(const FSTState& q) {
- boost::shared_ptr<WFSTNode>& res = m[q];
- if (!res) {
- res.reset(new MyNode(q, this));
- }
- return &*res;
-}
-
-const WFSTNode* MyFST::Final() const {
- return final;
-}
-
-const WFSTNode* MyFST::Initial() const {
- return init;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
-
- if (!conf.count("model1")) {
-    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- boost::shared_ptr<MT19937> prng;
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- vector<vector<int> > corpuse, corpusf;
- set<int> vocabe, vocabf;
- ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
- cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
- assert(corpusf.size() == corpuse.size());
-
- ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
- vocabe.size(),
- conf["model1"].as<string>());
- MyModel m(lp0);
-
- TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
- m.IncrementRule(x);
- TRule y("[X] ||| nY dyN ||| gave ||| 0");
- m.IncrementRule(y);
-
-
- MyFST fst(corpusf[0], corpuse[0], &m);
- ifstream in("./kimura.g");
- assert(in);
- CFG_WFSTComposer comp(fst);
- Hypergraph hg;
- bool succeed = comp.Compose(&in, &hg);
- hg.PrintGraphviz();
- if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
-
-#if 0
- ifstream in2("./amnabooks.g");
- assert(in2);
- MyFST fst2(corpusf[1], corpuse[1], &m);
- CFG_WFSTComposer comp2(fst2);
- Hypergraph hg2;
- bool succeed2 = comp2.Compose(&in2, &hg2);
- if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
-#endif
-
- SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0);
- hg.Reweight(w);
- cerr << ViterbiFTree(hg) << endl;
- return 0;
-}
-
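The base distribution deleted above scores a rule's target side with a Poisson length prior and, per target word, a mixture of Model 1 and a uniform fallback, averaged over a uniform alignment that includes a NULL source word. The following is a minimal standalone sketch of that computation, not part of the original sources; the Model1Lookup function type is a hypothetical stand-in for the ttable lookup (returning 0.0 for unseen pairs).

    #include <cmath>
    #include <vector>

    // Hypothetical stand-in for the Model 1 ttable: returns p(trg | src),
    // or 0.0 if the pair never appeared in the parameter file.
    typedef double (*Model1Lookup)(int src, int trg);

    // log p0(e | f) = log Pois(|e|; |f| + 0.01)
    //   + sum_i log( (1/(|f|+1)) * sum_j [ lambda * M1(e_i | f_j) + (1 - lambda) / |V_e| ] )
    // where j ranges over the source words plus a NULL token (j = -1 below).
    double LogBaseRuleProb(const std::vector<int>& f, const std::vector<int>& e,
                           Model1Lookup m1, int null_word,
                           double lambda, double vocab_e_size) {
      const int flen = f.size(), elen = e.size();
      const double pois_rate = flen + 0.01;
      double lp = std::log(pois_rate) * elen - std::lgamma(elen + 1) - pois_rate;
      for (int i = 0; i < elen; ++i) {
        double tp = 0.0;
        for (int j = -1; j < flen; ++j) {
          const int src = (j < 0 ? null_word : f[j]);
          tp += lambda * m1(src, e[i]) + (1.0 - lambda) / vocab_e_size;
        }
        lp += std::log(tp / (flen + 1));  // alignment a_i ~ Uniform(0..flen)
      }
      return lp;
    }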
diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc
deleted file mode 100644
index 97f1ba34..00000000
--- a/gi/pf/cbgi.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-#include <queue>
-#include <sstream>
-#include <iostream>
-
-#include <boost/unordered_map.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "sampler.h"
-#include "filelib.h"
-#include "hg_io.h"
-#include "hg.h"
-#include "ccrp_nt.h"
-#include "trule.h"
-#include "inside_outside.h"
-
-using namespace std;
-using namespace std::tr1;
-
-double log_poisson(unsigned x, const double& lambda) {
- assert(lambda > 0.0);
- return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-double log_decay(unsigned x, const double& b) {
- assert(b > 1.0);
- assert(x > 0);
- return log(b - 1) - x * log(b);
-}
-
-struct SimpleBase {
- SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) :
- uniform_e(-log(esize)),
- uniform_f(-log(fsize)),
- uniform_nt(-log(ntsize)) {
- }
-
- // binomial coefficient
- static double choose(unsigned n, unsigned k) {
- return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1));
- }
-
- // count the number of patterns of terminals and NTs in the rule, given elen and flen
- static double log_number_of_patterns(const unsigned flen, const unsigned elen) {
- static vector<vector<double> > counts;
- if (elen >= counts.size()) counts.resize(elen + 1);
- if (flen >= counts[elen].size()) counts[elen].resize(flen + 1);
- double& count = counts[elen][flen];
- if (count) return log(count);
- const unsigned max_arity = min(elen, flen);
- for (unsigned a = 0; a <= max_arity; ++a)
- count += choose(elen, a) * choose(flen, a);
- return log(count);
- }
-
- // return logp0 of rule | LHS
- double operator()(const TRule& rule) const {
- const unsigned flen = rule.f_.size();
- const unsigned elen = rule.e_.size();
-#if 0
- double p = 0;
- p += log_poisson(flen, 0.5); // flen ~Pois(0.5)
- p += log_poisson(elen, flen); // elen | flen ~Pois(flen)
- p -= log_number_of_patterns(flen, elen); // pattern | flen,elen ~Uniform
- for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS
- if (rule.f_[i] <= 0) // according to pattern
- p += uniform_nt; // draw NT ~Uniform
- else
- p += uniform_f; // draw f terminal ~Uniform
- }
- p -= lgamma(rule.Arity() + 1); // draw permutation ~Uniform
- for (unsigned i = 0; i < elen; ++i) { // for each position in e-RHS
- if (rule.e_[i] > 0) // according to pattern
- p += uniform_e; // draw e|f term ~Uniform
-      // TODO this should probably be Model 1
- }
-#else
- double p = 0;
- bool is_abstract = rule.f_[0] <= 0;
- p += log(0.5);
- if (is_abstract) {
- if (flen == 2) p += log(0.99); else p += log(0.01);
- } else {
- p += log_decay(flen, 3);
- }
-
- for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS
- if (rule.f_[i] <= 0) // according to pattern
- p += uniform_nt; // draw NT ~Uniform
- else
- p += uniform_f; // draw f terminal ~Uniform
- }
-#endif
- return p;
- }
- const double uniform_e;
- const double uniform_f;
- const double uniform_nt;
- vector<double> arities;
-};
-
-MT19937* rng = NULL;
-
-template <typename Base>
-struct MHSamplerEdgeProb {
- MHSamplerEdgeProb(const Hypergraph& hg,
- const map<int, CCRP_NoTable<TRule> >& rdp,
- const Base& logp0,
- const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) {
- for (int i = 0; i < edge_probs.size(); ++i) {
- const TRule& rule = *hg.edges_[i].rule_;
- const map<int, CCRP_NoTable<TRule> >::const_iterator it = rdp.find(rule.lhs_);
- assert(it != rdp.end());
- const CCRP_NoTable<TRule>& crp = it->second;
- edge_probs[i].logeq(crp.logprob(rule, logp0(rule)));
- if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1)
- edge_probs[i] = prob_t::Zero();
- }
- }
- inline prob_t operator()(const Hypergraph::Edge& e) const {
- return edge_probs[e.id_];
- }
- prob_t DerivationProb(const vector<int>& d) const {
- prob_t p = prob_t::One();
- for (unsigned i = 0; i < d.size(); ++i)
- p *= edge_probs[d[i]];
- return p;
- }
- vector<prob_t> edge_probs;
-};
-
-template <typename Base>
-struct ModelAndData {
- ModelAndData() :
- base_lh(prob_t::One()),
- logp0(10000, 10000),
- mh_samples(),
- mh_rejects() {}
-
- void SampleCorpus(const string& hgpath, int i);
- void ResampleHyperparameters() {
- for (map<int, CCRP_NoTable<TRule> >::iterator it = rules.begin(); it != rules.end(); ++it)
- it->second.resample_hyperparameters(rng);
- }
-
- CCRP_NoTable<TRule>& RuleCRP(int lhs) {
- map<int, CCRP_NoTable<TRule> >::iterator it = rules.find(lhs);
- if (it == rules.end()) {
- rules.insert(make_pair(lhs, CCRP_NoTable<TRule>(1,1)));
- it = rules.find(lhs);
- }
- return it->second;
- }
-
- void IncrementRule(const TRule& rule) {
- CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_);
- if (crp.increment(rule)) {
- prob_t p; p.logeq(logp0(rule));
- base_lh *= p;
- }
- }
-
- void DecrementRule(const TRule& rule) {
- CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_);
- if (crp.decrement(rule)) {
- prob_t p; p.logeq(logp0(rule));
- base_lh /= p;
- }
- }
-
- void DecrementDerivation(const Hypergraph& hg, const vector<int>& d) {
- for (unsigned i = 0; i < d.size(); ++i) {
- const TRule& rule = *hg.edges_[d[i]].rule_;
- DecrementRule(rule);
- }
- }
-
- void IncrementDerivation(const Hypergraph& hg, const vector<int>& d) {
- for (unsigned i = 0; i < d.size(); ++i) {
- const TRule& rule = *hg.edges_[d[i]].rule_;
- IncrementRule(rule);
- }
- }
-
- prob_t Likelihood() const {
- prob_t p = prob_t::One();
- for (map<int, CCRP_NoTable<TRule> >::const_iterator it = rules.begin(); it != rules.end(); ++it) {
- prob_t q; q.logeq(it->second.log_crp_prob());
- p *= q;
- }
- p *= base_lh;
- return p;
- }
-
- void ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_derivation);
-
- map<int, CCRP_NoTable<TRule> > rules; // [lhs] -> distribution over RHSs
- prob_t base_lh;
- SimpleBase logp0;
- vector<vector<int> > samples; // sampled derivations
- unsigned int mh_samples;
- unsigned int mh_rejects;
-};
-
-template <typename Base>
-void ModelAndData<Base>::SampleCorpus(const string& hgpath, int n) {
- vector<Hypergraph> hgs(n); hgs.clear();
- boost::unordered_map<TRule, unsigned> acc;
- map<int, unsigned> tot;
- for (int i = 0; i < n; ++i) {
- ostringstream os;
- os << hgpath << '/' << i << ".json.gz";
- if (!FileExists(os.str())) continue;
- hgs.push_back(Hypergraph());
- ReadFile rf(os.str());
- HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back());
- }
- cerr << "Read " << hgs.size() << " alignment hypergraphs.\n";
- samples.resize(hgs.size());
- const unsigned SAMPLES = 2000;
- const unsigned burnin = 3 * SAMPLES / 4;
- const unsigned every = 20;
- for (unsigned s = 0; s < SAMPLES; ++s) {
- if (s % 10 == 0) {
- if (s > 0) { cerr << endl; ResampleHyperparameters(); }
- cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] ";
- }
- cerr << '.';
- for (unsigned i = 0; i < hgs.size(); ++i) {
- ResampleDerivation(hgs[i], &samples[i]);
- if (s > burnin && s % every == 0) {
- for (unsigned j = 0; j < samples[i].size(); ++j) {
- const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_;
- ++acc[rule];
- ++tot[rule.lhs_];
- }
- }
- }
- }
- cerr << endl;
- for (boost::unordered_map<TRule,unsigned>::iterator it = acc.begin(); it != acc.end(); ++it) {
- cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl;
- }
-}
-
-template <typename Base>
-void ModelAndData<Base>::ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_deriv) {
- vector<int> cur;
- cur.swap(*sampled_deriv);
-
- const prob_t p_cur = Likelihood();
- DecrementDerivation(hg, cur);
- if (cur.empty()) {
- // first iteration, create restaurants
- for (int i = 0; i < hg.edges_.size(); ++i)
- RuleCRP(hg.edges_[i].rule_->lhs_);
- }
- MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, cur.empty());
-// MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, false);
- const prob_t q_cur = wf.DerivationProb(cur);
- vector<prob_t> node_probs;
- Inside<prob_t, MHSamplerEdgeProb<SimpleBase> >(hg, &node_probs, wf);
- queue<unsigned> q;
- q.push(hg.nodes_.size() - 3);
- while(!q.empty()) {
- unsigned cur_node_id = q.front();
-// cerr << "NODE=" << cur_node_id << endl;
- q.pop();
- const Hypergraph::Node& node = hg.nodes_[cur_node_id];
- const unsigned num_in_edges = node.in_edges_.size();
- unsigned sampled_edge = 0;
- if (num_in_edges == 1) {
- sampled_edge = node.in_edges_[0];
- } else {
- prob_t z;
- assert(num_in_edges > 1);
- SampleSet<prob_t> ss;
- for (unsigned j = 0; j < num_in_edges; ++j) {
- const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
- prob_t p = wf.edge_probs[edge.id_]; // edge proposal prob
- for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
- p *= node_probs[edge.tail_nodes_[k]];
- ss.add(p);
-// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
- z += p;
- }
-// for (unsigned j = 0; j < num_in_edges; ++j) {
-// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
-// }
-// cerr << " --- \n";
- sampled_edge = node.in_edges_[rng->SelectSample(ss)];
- }
- sampled_deriv->push_back(sampled_edge);
- const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
- for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
- q.push(edge.tail_nodes_[j]);
- }
- }
- IncrementDerivation(hg, *sampled_deriv);
-
-// cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n";
-// cerr << "DERIV:\n";
-// for (int i = 0; i < sampled_deriv->size(); ++i) {
-// cerr << " " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl;
-// }
-
- if (cur.empty()) return; // accept first sample
-
- ++mh_samples;
- // only need to do MH if proposal is different to current state
- if (cur != *sampled_deriv) {
- const prob_t q_prop = wf.DerivationProb(*sampled_deriv);
- const prob_t p_prop = Likelihood();
- if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) {
- ++mh_rejects;
- DecrementDerivation(hg, *sampled_deriv);
- IncrementDerivation(hg, cur);
- swap(cur, *sampled_deriv);
- }
- }
-}
-
-int main(int argc, char** argv) {
- rng = new MT19937;
- ModelAndData<SimpleBase> m;
- m.SampleCorpus("./hgs", 50);
- // m.SampleCorpus("./btec/hgs", 5000);
- return 0;
-}
-
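ResampleDerivation above draws a proposal derivation from the inside distribution under per-edge CRP probabilities and then accepts or rejects it with a Metropolis-Hastings test via AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur). The sketch below shows the standard acceptance rule such a call is presumed to implement; the real logic lives in sampler.h and may differ in details, and the use of log-space doubles here is an assumption for illustration.

    #include <cmath>
    #include <random>

    // Standard MH acceptance for a proposal drawn from q: accept with probability
    //   min(1, (p_prop * q_cur) / (p_cur * q_prop))
    // All arguments are log-probabilities to avoid underflow.
    bool AcceptMH(double log_p_prop, double log_p_cur,
                  double log_q_prop, double log_q_cur, std::mt19937* rng) {
      const double log_ratio = (log_p_prop + log_q_cur) - (log_p_cur + log_q_prop);
      if (log_ratio >= 0.0) return true;  // proposal at least as good: always accept
      std::uniform_real_distribution<double> u(0.0, 1.0);
      return std::log(u(*rng)) < log_ratio;
    }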
diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc
deleted file mode 100644
index 21d5ec5b..00000000
--- a/gi/pf/cfg_wfst_composer.cc
+++ /dev/null
@@ -1,731 +0,0 @@
-#include "cfg_wfst_composer.h"
-
-#include <iostream>
-#include <fstream>
-#include <map>
-#include <queue>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include "fast_lexical_cast.hpp"
-
-#include "phrasetable_fst.h"
-#include "sparse_vector.h"
-#include "tdict.h"
-#include "hg.h"
-#include "hg_remove_eps.h"
-
-namespace po = boost::program_options;
-using namespace std;
-using namespace std::tr1;
-
-WFSTNode::~WFSTNode() {}
-WFST::~WFST() {}
-
-// Define the following macro if you want to see lots of debugging output
-// when you run the chart parser
-#undef DEBUG_CHART_PARSER
-
-// A few constants used by the chart parser ///////////////
-static const int kMAX_NODES = 2000000;
-static const string kPHRASE_STRING = "X";
-static bool constants_need_init = true;
-static WordID kUNIQUE_START;
-static WordID kPHRASE;
-static TRulePtr kX1X2;
-static TRulePtr kX1;
-static WordID kEPS;
-static TRulePtr kEPSRule;
-
-static void InitializeConstants() {
- if (constants_need_init) {
- kPHRASE = TD::Convert(kPHRASE_STRING) * -1;
- kUNIQUE_START = TD::Convert("S") * -1;
- kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]"));
- kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]"));
- kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>"));
- kEPS = TD::Convert("<eps>");
- constants_need_init = false;
- }
-}
-////////////////////////////////////////////////////////////
-
-class EGrammarNode {
- friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest);
- friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g);
- public:
-#ifdef DEBUG_CHART_PARSER
- string hint;
-#endif
- EGrammarNode() : is_some_rule_complete(false), is_root(false) {}
- const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; }
- const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; }
- bool HasNonTerminals() const { return (!ntptr.empty()); }
- bool HasTerminals() const { return (!tptr.empty()); }
- bool RuleCompletes() const {
- return (is_some_rule_complete || (ntptr.empty() && tptr.empty()));
- }
- bool GrammarContinues() const {
- return !(ntptr.empty() && tptr.empty());
- }
- bool IsRoot() const {
- return is_root;
- }
- // these are the features associated with the rule from the start
- // node up to this point. If you use these features, you must
- // not Extend() this rule.
- const SparseVector<double>& GetCFGProductionFeatures() const {
- return input_features;
- }
-
- const EGrammarNode* Extend(const WordID& t) const {
- if (t < 0) {
- map<WordID, EGrammarNode>::const_iterator it = ntptr.find(t);
- if (it == ntptr.end()) return NULL;
- return &it->second;
- } else {
- map<WordID, EGrammarNode>::const_iterator it = tptr.find(t);
- if (it == tptr.end()) return NULL;
- return &it->second;
- }
- }
-
- private:
- map<WordID, EGrammarNode> tptr;
- map<WordID, EGrammarNode> ntptr;
- SparseVector<double> input_features;
- bool is_some_rule_complete;
- bool is_root;
-};
-typedef map<WordID, EGrammarNode> EGrammar; // indexed by the rule LHS
-
-// edges are immutable once created
-struct Edge {
-#ifdef DEBUG_CHART_PARSER
- static int id_count;
- const int id;
-#endif
- const WordID cat; // lhs side of rule proved/being proved
- const EGrammarNode* const dot; // dot position
- const WFSTNode* const q; // start of span
- const WFSTNode* const r; // end of span
- const Edge* const active_parent; // back pointer, NULL for PREDICT items
- const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items
- TRulePtr tps; // translations
- boost::shared_ptr<SparseVector<double> > features; // features from CFG rule
-
- bool IsPassive() const {
- // when a rule is completed, this value will be set
- return static_cast<bool>(features);
- }
- bool IsActive() const { return !IsPassive(); }
- bool IsInitial() const {
- return !(active_parent || passive_parent);
- }
- bool IsCreatedByScan() const {
- return active_parent && !passive_parent && !dot->IsRoot();
- }
- bool IsCreatedByPredict() const {
- return dot->IsRoot();
- }
- bool IsCreatedByComplete() const {
- return active_parent && passive_parent;
- }
-
-  // constructors for PREDICT
- Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) :
-#ifdef DEBUG_CHART_PARSER
- id(++id_count),
-#endif
- cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {}
- Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) :
-#ifdef DEBUG_CHART_PARSER
- id(++id_count),
-#endif
- cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {}
-
- // constructors for SCAN
- Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
- const Edge* act_par, const TRulePtr& translations) :
-#ifdef DEBUG_CHART_PARSER
- id(++id_count),
-#endif
- cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {}
-
- Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
- const Edge* act_par, const TRulePtr& translations,
- const SparseVector<double>& feats) :
-#ifdef DEBUG_CHART_PARSER
- id(++id_count),
-#endif
- cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations),
- features(new SparseVector<double>(feats)) {}
-
- // constructors for COMPLETE
- Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
- const Edge* act_par, const Edge *pas_par) :
-#ifdef DEBUG_CHART_PARSER
- id(++id_count),
-#endif
- cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() {
- assert(pas_par->IsPassive());
- assert(act_par->IsActive());
- }
-
- Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
- const Edge* act_par, const Edge *pas_par, const SparseVector<double>& feats) :
-#ifdef DEBUG_CHART_PARSER
- id(++id_count),
-#endif
- cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(),
- features(new SparseVector<double>(feats)) {
- assert(pas_par->IsPassive());
- assert(act_par->IsActive());
- }
-
- // constructor for COMPLETE query
- Edge(const WFSTNode* _r) :
-#ifdef DEBUG_CHART_PARSER
- id(0),
-#endif
- cat(0), dot(NULL), q(NULL),
- r(_r), active_parent(NULL), passive_parent(NULL), tps() {}
-  // constructor for MERGE query
- Edge(const WFSTNode* _q, int) :
-#ifdef DEBUG_CHART_PARSER
- id(0),
-#endif
- cat(0), dot(NULL), q(_q),
- r(NULL), active_parent(NULL), passive_parent(NULL), tps() {}
-};
-#ifdef DEBUG_CHART_PARSER
-int Edge::id_count = 0;
-#endif
-
-ostream& operator<<(ostream& os, const Edge& e) {
- string type = "PREDICT";
- if (e.IsCreatedByScan())
- type = "SCAN";
- else if (e.IsCreatedByComplete())
- type = "COMPLETE";
- os << "["
-#ifdef DEBUG_CHART_PARSER
- << '(' << e.id << ") "
-#else
- << '(' << &e << ") "
-#endif
- << "q=" << e.q << ", r=" << e.r
- << ", cat="<< TD::Convert(e.cat*-1) << ", dot="
- << e.dot
-#ifdef DEBUG_CHART_PARSER
- << e.dot->hint
-#endif
- << (e.IsActive() ? ", Active" : ", Passive")
- << ", " << type;
-#ifdef DEBUG_CHART_PARSER
- if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; }
- if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; }
-#endif
- if (e.tps) { os << ", tps=" << e.tps->AsString(); }
- return os << ']';
-}
-
-struct Traversal {
- const Edge* const edge; // result from the active / passive combination
- const Edge* const active;
- const Edge* const passive;
- Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {}
-};
-
-struct UniqueTraversalHash {
- size_t operator()(const Traversal* t) const {
- size_t x = 5381;
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->active);
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->passive);
- x = ((x << 5) + x) ^ t->edge->IsActive();
- return x;
- }
-};
-
-struct UniqueTraversalEquals {
- size_t operator()(const Traversal* a, const Traversal* b) const {
- return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive());
- }
-};
-
-struct UniqueEdgeHash {
- size_t operator()(const Edge* e) const {
- size_t x = 5381;
- if (e->IsActive()) {
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->dot);
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
- x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
- x += 13;
- } else { // with passive edges, we don't care about the dot
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
- x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
- }
- return x;
- }
-};
-
-struct UniqueEdgeEquals {
- bool operator()(const Edge* a, const Edge* b) const {
- if (a->IsActive() != b->IsActive()) return false;
- if (a->IsActive()) {
- return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r);
- } else {
- return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r);
- }
- }
-};
-
-struct REdgeHash {
- size_t operator()(const Edge* e) const {
- size_t x = 5381;
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
- return x;
- }
-};
-
-struct REdgeEquals {
- bool operator()(const Edge* a, const Edge* b) const {
- return (a->r == b->r);
- }
-};
-
-struct QEdgeHash {
- size_t operator()(const Edge* e) const {
- size_t x = 5381;
- x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
- return x;
- }
-};
-
-struct QEdgeEquals {
- bool operator()(const Edge* a, const Edge* b) const {
- return (a->q == b->q);
- }
-};
-
-struct EdgeQueue {
- queue<const Edge*> q;
- EdgeQueue() {}
- void clear() { while(!q.empty()) q.pop(); }
- bool HasWork() const { return !q.empty(); }
- const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; }
- void AddEdge(const Edge* s) { q.push(s); }
-};
-
-class CFG_WFSTComposerImpl {
- public:
- CFG_WFSTComposerImpl(WordID start_cat,
- const WFSTNode* q_0,
- const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {}
-
- // returns false if the intersection is empty
- bool Compose(const EGrammar& g, Hypergraph* forest) {
- goal_node = NULL;
- EGrammar::const_iterator sit = g.find(start_cat_);
- forest->ReserveNodes(kMAX_NODES);
- assert(sit != g.end());
- Edge* init = new Edge(start_cat_, &sit->second, q_0_);
- assert(IncorporateNewEdge(init));
- while (exp_agenda.HasWork() || agenda.HasWork()) {
- while(exp_agenda.HasWork()) {
- const Edge* edge = exp_agenda.Next();
- FinishEdge(edge, forest);
- }
- if (agenda.HasWork()) {
- const Edge* edge = agenda.Next();
-#ifdef DEBUG_CHART_PARSER
- cerr << "processing (" << edge->id << ')' << endl;
-#endif
- if (edge->IsActive()) {
- if (edge->dot->HasTerminals())
- DoScan(edge);
- if (edge->dot->HasNonTerminals()) {
- DoMergeWithPassives(edge);
- DoPredict(edge, g);
- }
- } else {
- DoComplete(edge);
- }
- }
- }
- if (goal_node) {
- forest->PruneUnreachable(goal_node->id_);
- RemoveEpsilons(forest, kEPS);
- }
- FreeAll();
- return goal_node;
- }
-
- void FreeAll() {
- for (int i = 0; i < free_list_.size(); ++i)
- delete free_list_[i];
- free_list_.clear();
- for (int i = 0; i < traversal_free_list_.size(); ++i)
- delete traversal_free_list_[i];
- traversal_free_list_.clear();
- all_traversals.clear();
- exp_agenda.clear();
- agenda.clear();
- tps2node.clear();
- edge2node.clear();
- all_edges.clear();
- passive_edges.clear();
- active_edges.clear();
- }
-
- ~CFG_WFSTComposerImpl() {
- FreeAll();
- }
-
- // returns the total number of edges created during composition
- int EdgesCreated() const {
- return free_list_.size();
- }
-
- private:
- void DoScan(const Edge* edge) {
- // here, we assume that the FST will potentially have many more outgoing
- // edges than the grammar, which will be just a couple. If you want to
- // efficiently handle the case where both are relatively large, this code
- // will need to change how the intersection is done. The best general
- // solution would probably be the Baeza-Yates double binary search.
-
- const EGrammarNode* dot = edge->dot;
- const WFSTNode* r = edge->r;
- const map<WordID, EGrammarNode>& terms = dot->GetTerminals();
- for (map<WordID, EGrammarNode>::const_iterator git = terms.begin();
- git != terms.end(); ++git) {
-
- if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) {
- std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl;
- abort();
- }
- std::vector<std::pair<const WFSTNode*, TRulePtr> > extensions = r->ExtendInput(atoi(TD::Convert(git->first).c_str()));
- for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) {
- const WFSTNode* next_r = extensions[nsi].first;
- const EGrammarNode* next_dot = &git->second;
- const bool grammar_continues = next_dot->GrammarContinues();
- const bool rule_completes = next_dot->RuleCompletes();
- if (extensions[nsi].second)
- cerr << "!!! " << extensions[nsi].second->AsString() << endl;
- // cerr << " rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl;
- assert(grammar_continues || rule_completes);
- const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
- if (rule_completes)
- IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features));
- if (grammar_continues)
- IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second));
- }
- }
- }
-
- void DoPredict(const Edge* edge, const EGrammar& g) {
- const EGrammarNode* dot = edge->dot;
- const map<WordID, EGrammarNode>& non_terms = dot->GetNonTerminals();
- for (map<WordID, EGrammarNode>::const_iterator git = non_terms.begin();
- git != non_terms.end(); ++git) {
- const WordID nt_to_predict = git->first;
- //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl;
- EGrammar::const_iterator egi = g.find(nt_to_predict);
- if (egi == g.end()) {
- cerr << "[ERROR] Can't find any grammar rules with a LHS of type "
- << TD::Convert(-1*nt_to_predict) << '!' << endl;
- continue;
- }
- assert(edge->IsActive());
- const EGrammarNode* new_dot = &egi->second;
- Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge);
- IncorporateNewEdge(new_edge);
- }
- }
-
- void DoComplete(const Edge* passive) {
-#ifdef DEBUG_CHART_PARSER
- cerr << " complete: " << *passive << endl;
-#endif
- const WordID completed_nt = passive->cat;
- const WFSTNode* q = passive->q;
- const WFSTNode* next_r = passive->r;
- const Edge query(q);
- const pair<unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator,
- unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator > p =
- active_edges.equal_range(&query);
- for (unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator it = p.first;
- it != p.second; ++it) {
- const Edge* active = *it;
-#ifdef DEBUG_CHART_PARSER
- cerr << " pos: " << *active << endl;
-#endif
- const EGrammarNode* next_dot = active->dot->Extend(completed_nt);
- if (!next_dot) continue;
- const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
- // add up to 2 rules
- if (next_dot->RuleCompletes())
- IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features));
- if (next_dot->GrammarContinues())
- IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive));
- }
- }
-
- void DoMergeWithPassives(const Edge* active) {
- // edge is active, has non-terminals, we need to find the passives that can extend it
- assert(active->IsActive());
- assert(active->dot->HasNonTerminals());
-#ifdef DEBUG_CHART_PARSER
- cerr << " merge active with passives: ACT=" << *active << endl;
-#endif
- const Edge query(active->r, 1);
- const pair<unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator,
- unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator > p =
- passive_edges.equal_range(&query);
- for (unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator it = p.first;
- it != p.second; ++it) {
- const Edge* passive = *it;
- const EGrammarNode* next_dot = active->dot->Extend(passive->cat);
- if (!next_dot) continue;
- const WFSTNode* next_r = passive->r;
- const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
- if (next_dot->RuleCompletes())
- IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features));
- if (next_dot->GrammarContinues())
- IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive));
- }
- }
-
- // take ownership of edge memory, add to various indexes, etc
- // returns true if this edge is new
- bool IncorporateNewEdge(Edge* edge) {
- free_list_.push_back(edge);
- if (edge->passive_parent && edge->active_parent) {
- Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent);
- traversal_free_list_.push_back(t);
- if (all_traversals.find(t) != all_traversals.end()) {
- return false;
- } else {
- all_traversals.insert(t);
- }
- }
- exp_agenda.AddEdge(edge);
- return true;
- }
-
- bool FinishEdge(const Edge* edge, Hypergraph* hg) {
- bool is_new = false;
- if (all_edges.find(edge) == all_edges.end()) {
-#ifdef DEBUG_CHART_PARSER
- cerr << *edge << " is NEW\n";
-#endif
- all_edges.insert(edge);
- is_new = true;
- if (edge->IsPassive()) passive_edges.insert(edge);
- if (edge->IsActive()) active_edges.insert(edge);
- agenda.AddEdge(edge);
- } else {
-#ifdef DEBUG_CHART_PARSER
- cerr << *edge << " is NOT NEW.\n";
-#endif
- }
- AddEdgeToTranslationForest(edge, hg);
- return is_new;
- }
-
- // build the translation forest
- void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) {
- assert(hg->nodes_.size() < kMAX_NODES);
- Hypergraph::Node* tps = NULL;
- // first add any target language rules
- if (edge->tps) {
- Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()];
- if (!node) {
- // cerr << "Creating phrases for " << edge->tps << endl;
- const TRulePtr& rule = edge->tps;
- node = hg->AddNode(kPHRASE);
- Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector());
- hg_edge->feature_values_ += rule->GetFeatureValues();
- hg->ConnectEdgeToHeadNode(hg_edge, node);
- }
- tps = node;
- }
- Hypergraph::Node*& head_node = edge2node[edge];
- if (!head_node)
- head_node = hg->AddNode(kPHRASE);
- if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) {
- assert(goal_node == NULL || goal_node == head_node);
- goal_node = head_node;
- }
- Hypergraph::TailNodeVector tail;
- SparseVector<double> extra;
- if (edge->IsCreatedByPredict()) {
- // extra.set_value(FD::Convert("predict"), 1);
- } else if (edge->IsCreatedByScan()) {
- tail.push_back(edge2node[edge->active_parent]->id_);
- if (tps) {
- tail.push_back(tps->id_);
- }
- //extra.set_value(FD::Convert("scan"), 1);
- } else if (edge->IsCreatedByComplete()) {
- tail.push_back(edge2node[edge->active_parent]->id_);
- tail.push_back(edge2node[edge->passive_parent]->id_);
- //extra.set_value(FD::Convert("complete"), 1);
- } else {
- assert(!"unexpected edge type!");
- }
- //cerr << head_node->id_ << "<--" << *edge << endl;
-
-#ifdef DEBUG_CHART_PARSER
- for (int i = 0; i < tail.size(); ++i)
- if (tail[i] == head_node->id_) {
- cerr << "ERROR: " << *edge << "\n i=" << i << endl;
- if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; }
- if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; }
- assert(!"self-loop found!");
- }
-#endif
- Hypergraph::Edge* hg_edge = NULL;
- if (tail.size() == 0) {
- hg_edge = hg->AddEdge(kEPSRule, tail);
- } else if (tail.size() == 1) {
- hg_edge = hg->AddEdge(kX1, tail);
- } else if (tail.size() == 2) {
- hg_edge = hg->AddEdge(kX1X2, tail);
- }
- if (edge->features)
- hg_edge->feature_values_ += *edge->features;
- hg_edge->feature_values_ += extra;
- hg->ConnectEdgeToHeadNode(hg_edge, head_node);
- }
-
- Hypergraph::Node* goal_node;
- EdgeQueue exp_agenda;
- EdgeQueue agenda;
- unordered_map<size_t, Hypergraph::Node*> tps2node;
- unordered_map<const Edge*, Hypergraph::Node*, UniqueEdgeHash, UniqueEdgeEquals> edge2node;
- unordered_set<const Traversal*, UniqueTraversalHash, UniqueTraversalEquals> all_traversals;
- unordered_set<const Edge*, UniqueEdgeHash, UniqueEdgeEquals> all_edges;
- unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals> passive_edges;
- unordered_multiset<const Edge*, REdgeHash, REdgeEquals> active_edges;
- vector<Edge*> free_list_;
- vector<Traversal*> traversal_free_list_;
- const WordID start_cat_;
- const WFSTNode* const q_0_;
- const WFSTNode* const q_final_;
-};
-
-#ifdef DEBUG_CHART_PARSER
-static string TrimRule(const string& r) {
- size_t start = r.find(" |||") + 5;
- size_t end = r.rfind(" |||");
- return r.substr(start, end - start);
-}
-#endif
-
-void AddGrammarRule(const string& r, EGrammar* g) {
- const size_t pos = r.find(" ||| ");
- if (pos == string::npos || r[0] != '[') {
- cerr << "Bad rule: " << r << endl;
- return;
- }
- const size_t rpos = r.rfind(" ||| ");
- string feats;
- string rs = r;
- if (rpos != pos) {
- feats = r.substr(rpos + 5);
- rs = r.substr(0, rpos);
- }
- string rhs = rs.substr(pos + 5);
- string trule = rs + " ||| " + rhs + " ||| " + feats;
- TRule tr(trule);
- cerr << "X: " << tr.e_[0] << endl;
-#ifdef DEBUG_CHART_PARSER
- string hint_last_rule;
-#endif
- EGrammarNode* cur = &(*g)[tr.GetLHS()];
- cur->is_root = true;
- for (int i = 0; i < tr.FLength(); ++i) {
- WordID sym = tr.f()[i];
-#ifdef DEBUG_CHART_PARSER
- hint_last_rule = TD::Convert(sym < 0 ? -sym : sym);
- cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString());
-#endif
- if (sym < 0)
- cur = &cur->ntptr[sym];
- else
- cur = &cur->tptr[sym];
- }
-#ifdef DEBUG_CHART_PARSER
- cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString());
-#endif
- cur->is_some_rule_complete = true;
- cur->input_features = tr.GetFeatureValues();
-}
-
-CFG_WFSTComposer::~CFG_WFSTComposer() {
- delete pimpl_;
-}
-
-CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) {
- InitializeConstants();
- pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final());
-}
-
-bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) {
- // first, convert the src forest into an EGrammar
- EGrammar g;
- const int nedges = src_forest.edges_.size();
- const int nnodes = src_forest.nodes_.size();
- vector<int> cats(nnodes);
- bool assign_cats = false;
- for (int i = 0; i < nnodes; ++i)
- if (assign_cats) {
- cats[i] = TD::Convert("CAT_" + boost::lexical_cast<string>(i)) * -1;
- } else {
- cats[i] = src_forest.nodes_[i].cat_;
- }
- // construct the grammar
- for (int i = 0; i < nedges; ++i) {
- const Hypergraph::Edge& edge = src_forest.edges_[i];
- const vector<WordID>& src = edge.rule_->f();
- EGrammarNode* cur = &g[cats[edge.head_node_]];
- cur->is_root = true;
- int ntc = 0;
- for (int j = 0; j < src.size(); ++j) {
- WordID sym = src[j];
- if (sym <= 0) {
- sym = cats[edge.tail_nodes_[ntc]];
- ++ntc;
- cur = &cur->ntptr[sym];
- } else {
- cur = &cur->tptr[sym];
- }
- }
- cur->is_some_rule_complete = true;
- cur->input_features = edge.feature_values_;
- }
- EGrammarNode& goal_rule = g[kUNIQUE_START];
- assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) ||
- (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1));
-
- return pimpl_->Compose(g, trg_forest);
-}
-
-bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) {
- EGrammar g;
- while(*in) {
- string line;
- getline(*in, line);
- if (line.empty()) continue;
- AddGrammarRule(line, &g);
- }
-
- return pimpl_->Compose(g, trg_forest);
-}
diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h
deleted file mode 100644
index cf47f459..00000000
--- a/gi/pf/cfg_wfst_composer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef _CFG_WFST_COMPOSER_H_
-#define _CFG_WFST_COMPOSER_H_
-
-#include <iostream>
-#include <vector>
-#include <utility>
-
-#include "trule.h"
-#include "wordid.h"
-
-class CFG_WFSTComposerImpl;
-class Hypergraph;
-
-struct WFSTNode {
- virtual ~WFSTNode();
- // returns the next states reachable by consuming srcindex (which identifies a word)
- // paired with the output string generated by taking that transition.
- virtual std::vector<std::pair<const WFSTNode*,TRulePtr> > ExtendInput(unsigned srcindex) const = 0;
-};
-
-struct WFST {
- virtual ~WFST();
- virtual const WFSTNode* Final() const = 0;
- virtual const WFSTNode* Initial() const = 0;
-};
-
-class CFG_WFSTComposer {
- public:
- ~CFG_WFSTComposer();
- explicit CFG_WFSTComposer(const WFST& wfst);
- bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest);
-
- // reads the grammar from a file. There must be a single top-level
- // S -> X rule. Anything else is possible. Format is:
- // [S] ||| [SS,1]
- // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3
- // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8
- // [NP] ||| [DET,1] [N,2] ||| Feature3=2
- // ...
- bool Compose(std::istream* grammar_file, Hypergraph* trg_forest);
-
- private:
- CFG_WFSTComposerImpl* pimpl_;
-};
-
-#endif
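The two Compose overloads declared above are driven the same way as in the main() shown earlier: build a WFST over the sentence pair, then intersect it with either a source forest or a text grammar in the format documented in the header. Below is a minimal sketch of the grammar-file path; the helper name and wiring are illustrative only, not part of the original code.

    #include <fstream>
    #include <string>

    #include "cfg_wfst_composer.h"
    #include "hg.h"

    // Compose a text CFG (single top-level S -> X rule, ||| -delimited lines)
    // with an application-specific WFST, producing a translation forest.
    bool ComposeWithGrammarFile(const std::string& grammar_path,
                                const WFST& fst, Hypergraph* hg) {
      std::ifstream in(grammar_path.c_str());
      if (!in) return false;
      CFG_WFSTComposer composer(fst);   // captures fst.Initial() / fst.Final()
      return composer.Compose(&in, hg); // false if the intersection is empty
    }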
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
deleted file mode 100644
index 81ddb206..00000000
--- a/gi/pf/conditional_pseg.h
+++ /dev/null
@@ -1,275 +0,0 @@
-#ifndef _CONDITIONAL_PSEG_H_
-#define _CONDITIONAL_PSEG_H_
-
-#include <vector>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include <iostream>
-
-#include "m.h"
-#include "prob.h"
-#include "ccrp_nt.h"
-#include "mfcr.h"
-#include "trule.h"
-#include "base_distributions.h"
-#include "tdict.h"
-
-template <typename ConditionalBaseMeasure>
-struct MConditionalTranslationModel {
- explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
- rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {}
-
- void Summary() const {
- std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
- for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- std::cerr << " " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl;
- }
- }
-
- double log_likelihood(const double& dd, const double& aa) const {
- if (aa <= -dd) return -std::numeric_limits<double>::infinity();
- //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
- double llh = Md::log_beta_density(dd, 1, 1) +
- Md::log_gamma_density(dd + aa, 1, 1);
- typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
- for (it = r.begin(); it != r.end(); ++it)
- llh += it->second.log_crp_prob(dd, aa);
- return llh;
- }
-
- struct DiscountResampler {
- DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {}
- const MConditionalTranslationModel& m_;
- double operator()(const double& proposed_discount) const {
- return m_.log_likelihood(proposed_discount, m_.strength);
- }
- };
-
- struct AlphaResampler {
- AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {}
- const MConditionalTranslationModel& m_;
- double operator()(const double& proposed_strength) const {
- return m_.log_likelihood(m_.d, proposed_strength);
- }
- };
-
- void ResampleHyperparameters(MT19937* rng) {
- typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
-#if 1
- for (it = r.begin(); it != r.end(); ++it) {
- it->second.resample_hyperparameters(rng);
- }
-#else
- const unsigned nloop = 5;
- const unsigned niterations = 10;
- DiscountResampler dr(*this);
- AlphaResampler ar(*this);
- for (int iter = 0; iter < nloop; ++iter) {
- strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- double min_discount = std::numeric_limits<double>::min();
- if (strength < 0.0) min_discount -= strength;
- d = slice_sampler1d(dr, d, *rng, min_discount,
- 1.0, 0.0, niterations, 100*niterations);
- }
- strength = slice_sampler1d(ar, strength, *rng, -d,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl;
- for (it = r.begin(); it != r.end(); ++it) {
- it->second.set_discount(d);
- it->second.set_strength(strength);
- }
-#endif
- }
-
- int DecrementRule(const TRule& rule, MT19937* rng) {
- RuleModelHash::iterator it = r.find(rule.f_);
- assert(it != r.end());
- const TableCount delta = it->second.decrement(rule, rng);
- if (delta.count) {
- if (it->second.num_customers() == 0) r.erase(it);
- }
- return delta.count;
- }
-
- int IncrementRule(const TRule& rule, MT19937* rng) {
- RuleModelHash::iterator it = r.find(rule.f_);
- if (it == r.end()) {
- //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
- it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first;
- }
- p0s[0] = rp0(rule);
- TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
- return delta.count;
- }
-
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p;
- RuleModelHash::const_iterator it = r.find(rule.f_);
- if (it == r.end()) {
- p = rp0(rule);
- } else {
- p0s[0] = rp0(rule);
- p = it->second.prob(rule, p0s.begin(), lambdas.begin());
- }
- return p;
- }
-
- prob_t Likelihood() const {
- prob_t p; p.logeq(log_likelihood(d, strength));
- return p;
- }
-
- const ConditionalBaseMeasure& rp0;
- typedef std::tr1::unordered_map<std::vector<WordID>,
- MFCR<1, TRule>,
- boost::hash<std::vector<WordID> > > RuleModelHash;
- RuleModelHash r;
- double d, strength;
- std::vector<prob_t> lambdas;
- mutable std::vector<prob_t> p0s;
-};
-
-template <typename ConditionalBaseMeasure>
-struct ConditionalTranslationModel {
- explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
- rp0(rcp0) {}
-
- void Summary() const {
- std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
- for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
- }
- }
-
- void ResampleHyperparameters(MT19937* rng) {
- for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
- it->second.resample_hyperparameters(rng);
- }
-
- int DecrementRule(const TRule& rule) {
- RuleModelHash::iterator it = r.find(rule.f_);
- assert(it != r.end());
- int count = it->second.decrement(rule);
- if (count) {
- if (it->second.num_customers() == 0) r.erase(it);
- }
- return count;
- }
-
- int IncrementRule(const TRule& rule) {
- RuleModelHash::iterator it = r.find(rule.f_);
- if (it == r.end()) {
- it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first;
- }
- int count = it->second.increment(rule);
- return count;
- }
-
- void IncrementRules(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p;
- RuleModelHash::const_iterator it = r.find(rule.f_);
- if (it == r.end()) {
- p.logeq(log(rp0(rule)));
- } else {
- p.logeq(it->second.logprob(rule, log(rp0(rule))));
- }
- return p;
- }
-
- prob_t Likelihood() const {
- prob_t p = prob_t::One();
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- prob_t q; q.logeq(it->second.log_crp_prob());
- p *= q;
- for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- p *= rp0(i2->first);
- }
- return p;
- }
-
- const ConditionalBaseMeasure& rp0;
- typedef std::tr1::unordered_map<std::vector<WordID>,
- CCRP_NoTable<TRule>,
- boost::hash<std::vector<WordID> > > RuleModelHash;
- RuleModelHash r;
-};
-
-template <typename ConditionalBaseMeasure>
-struct ConditionalParallelSegementationModel {
- explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) :
- tmodel(rcp0), base(prob_t::One()), aligns(1,1) {}
-
- ConditionalTranslationModel<ConditionalBaseMeasure> tmodel;
-
- void DecrementRule(const TRule& rule) {
- tmodel.DecrementRule(rule);
- }
-
- void IncrementRule(const TRule& rule) {
- tmodel.IncrementRule(rule);
- }
-
- void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
- tmodel.IncrementRules(rules);
- for (int i = 0; i < rules.size(); ++i) {
- IncrementAlign(rules[i]->f_.size());
- }
- }
-
- void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
- tmodel.DecrementRules(rules);
- for (int i = 0; i < rules.size(); ++i) {
- DecrementAlign(rules[i]->f_.size());
- }
- }
-
- prob_t RuleProbability(const TRule& rule) const {
- return tmodel.RuleProbability(rule);
- }
-
- void IncrementAlign(unsigned span) {
- if (aligns.increment(span)) {
- // TODO
- }
- }
-
- void DecrementAlign(unsigned span) {
- if (aligns.decrement(span)) {
- // TODO
- }
- }
-
- prob_t AlignProbability(unsigned span) const {
- prob_t p;
- p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0)));
- return p;
- }
-
- prob_t Likelihood() const {
- prob_t p; p.logeq(aligns.log_crp_prob());
- p *= base;
- p *= tmodel.Likelihood();
- return p;
- }
-
- prob_t base;
- CCRP_NoTable<unsigned> aligns;
-};
-
-#endif
-
diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc
deleted file mode 100644
index 419731ac..00000000
--- a/gi/pf/condnaive.cc
+++ /dev/null
@@ -1,298 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "corpus.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static unsigned kMAX_SRC_PHRASE;
-static unsigned kMAX_TRG_PHRASE;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-boost::shared_ptr<MT19937> prng;
-
-struct ModelAndData {
- explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
- model(m),
- rng(&*prng),
- corpuse(ce),
- corpusf(cf),
- vocabe(ve),
- vocabf(vf),
- mh_samples(),
- mh_rejects(),
- kX(-TD::Convert("X")),
- derivations(corpuse.size()) {}
-
- void ResampleHyperparameters() {
- }
-
- void InstantiateRule(const pair<short,short>& from,
- const pair<short,short>& to,
- const vector<int>& sentf,
- const vector<int>& sente,
- TRule* rule) const {
- rule->f_.clear();
- rule->e_.clear();
- rule->lhs_ = kX;
- for (short i = from.first; i < to.first; ++i)
- rule->f_.push_back(sentf[i]);
- for (short i = from.second; i < to.second; ++i)
- rule->e_.push_back(sente[i]);
- }
-
- void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
- if (d.size() < 2) return;
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- model.DecrementRule(x);
- model.DecrementAlign(x.f_.size());
- }
- }
-
- void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
- if (d.size() < 2) return;
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- cerr << i << '/' << (d.size() - 1) << ": " << x << endl;
- }
- }
-
- void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
- if (d.size() < 2) return;
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- model.IncrementRule(x);
- model.IncrementAlign(x.f_.size());
- }
- }
-
- prob_t Likelihood() const {
- return model.Likelihood();
- }
-
- prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
- prob_t p = prob_t::One();
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- p *= model.RuleProbability(x);
- p *= model.AlignProbability(x.f_.size());
- }
- return p;
- }
-
- void Sample();
-
- ConditionalParallelSegementationModel<PhraseConditionalBase>& model;
- MT19937* rng;
- const vector<vector<int> >& corpuse, corpusf;
- const set<int>& vocabe, vocabf;
- unsigned mh_samples, mh_rejects;
- const int kX;
- vector<vector<pair<short, short> > > derivations;
-};
-
-void ModelAndData::Sample() {
- unsigned MAXK = kMAX_SRC_PHRASE;
- unsigned MAXL = kMAX_TRG_PHRASE;
- TRule x;
- x.lhs_ = -TD::Convert("X");
-
- for (int samples = 0; samples < 1000; ++samples) {
- if (samples % 1 == 0 && samples > 0) {
- //ResampleHyperparameters();
- cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n";
- for (int i = 0; i < 10; ++i) {
- cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl;
- PrintDerivation(derivations[i], corpusf[i], corpuse[i]);
- }
- static TRule xx("[X] ||| w n ||| s h ||| X=0");
- const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second;
- for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) {
- cerr << "\t" << it->second << "\t" << it->first << endl;
- }
- }
- cerr << '.' << flush;
- for (int s = 0; s < corpuse.size(); ++s) {
- const vector<int>& sentf = corpusf[s];
- const vector<int>& sente = corpuse[s];
-// cerr << " CUSTOMERS: " << rules.num_customers() << endl;
-// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl;
-
- vector<pair<short, short> >& deriv = derivations[s];
- const prob_t p_cur = Likelihood();
- DecrementDerivation(deriv, sentf, sente);
-
- boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
- boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
- a[0][0] = prob_t::One();
- for (int i = 0; i < sentf.size(); ++i) {
- for (int j = 0; j < sente.size(); ++j) {
- const prob_t src_a = a[i][j];
- x.f_.clear();
- for (int k = 1; k <= MAXK; ++k) {
- if (i + k > sentf.size()) break;
- x.f_.push_back(sentf[i + k - 1]);
- x.e_.clear();
- const prob_t p_span = model.AlignProbability(k); // prob of consuming this much source
- for (int l = 1; l <= MAXL; ++l) {
- if (j + l > sente.size()) break;
- x.e_.push_back(sente[j + l - 1]);
- trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span;
- a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
- }
- }
- }
- }
-// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl;
- const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
-
- vector<pair<short,short> > newderiv;
- int cur_i = sentf.size();
- int cur_j = sente.size();
- while(cur_i > 0 && cur_j > 0) {
- newderiv.push_back(pair<short,short>(cur_i, cur_j));
-// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
- SampleSet<prob_t> ss;
- vector<pair<short,short> > nexts;
- for (int k = 1; k <= MAXK; ++k) {
- const int hyp_i = cur_i - k;
- if (hyp_i < 0) break;
- for (int l = 1; l <= MAXL; ++l) {
- const int hyp_j = cur_j - l;
- if (hyp_j < 0) break;
- const prob_t& inside = a[hyp_i][hyp_j];
- if (inside == prob_t::Zero()) continue;
- const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
- if (transp == prob_t::Zero()) continue;
- const prob_t p = inside * transp;
- ss.add(p);
- nexts.push_back(pair<short,short>(hyp_i, hyp_j));
-// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl;
- }
- }
-// cerr << " sample set has " << nexts.size() << " elements.\n";
- const int selected = rng->SelectSample(ss);
- cur_i = nexts[selected].first;
- cur_j = nexts[selected].second;
- }
- newderiv.push_back(pair<short,short>(0,0));
- const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
- IncrementDerivation(newderiv, sentf, sente);
-// cerr << "SANITY: " << q_new << " " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl;
- if (deriv.empty()) { deriv = newderiv; continue; }
- ++mh_samples;
-
- if (deriv != newderiv) {
- const prob_t p_new = Likelihood();
-// cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl;
-// cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl;
- if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
- ++mh_rejects;
- DecrementDerivation(newderiv, sentf, sente);
- IncrementDerivation(deriv, sentf, sente);
- } else {
-// cerr << " ACCEPT\n";
- deriv = newderiv;
- }
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
-
- if (!conf.count("model1")) {
- cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
-// MT19937& rng = *prng;
-
- vector<vector<int> > corpuse, corpusf;
- set<int> vocabe, vocabf;
- corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
- cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
- assert(corpusf.size() == corpuse.size());
-
- Model1 m1(conf["model1"].as<string>());
-
- PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
- ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0);
-
- ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf);
- posterior.Sample();
-
- TRule r1("[X] ||| x ||| l e ||| X=0");
- TRule r2("[X] ||| A ||| a d ||| X=0");
- TRule r3("[X] ||| n ||| e r ||| X=0");
- TRule r4("[X] ||| x A n ||| b l a g ||| X=0");
-
- PhraseConditionalUninformativeBase u0(vocabe.size());
-
- cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl;
- cerr << (u0(r4)) << endl;
-
- return 0;
-}
-
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
deleted file mode 100644
index cb6e4ed7..00000000
--- a/gi/pf/corpus.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "corpus.h"
-
-#include <set>
-#include <vector>
-#include <string>
-
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-namespace corpus {
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_f,
- set<WordID>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- ReadFile rf(filename);
- istream* in = rf.stream();
- assert(*in);
- string line;
- unsigned lc = 0;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(getline(*in, line)) {
- ++lc;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) {
- isf = false;
- } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- if (cur == kDIV) {
- cerr << "ERROR in " << lc << ": " << line << endl << endl;
- abort();
- }
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
-}
-
-}
-
diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h
deleted file mode 100644
index e7febdb7..00000000
--- a/gi/pf/corpus.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _CORPUS_H_
-#define _CORPUS_H_
-
-#include <string>
-#include <vector>
-#include <set>
-#include "wordid.h"
-
-namespace corpus {
-
-void ReadParallelCorpus(const std::string& filename,
- std::vector<std::vector<WordID> >* f,
- std::vector<std::vector<WordID> >* e,
- std::set<WordID>* vocab_f,
- std::set<WordID>* vocab_e);
-
-}
-
-#endif
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
deleted file mode 100644
index 75ccad72..00000000
--- a/gi/pf/dpnaive.cc
+++ /dev/null
@@ -1,301 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "corpus.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static unsigned kMAX_SRC_PHRASE;
-static unsigned kMAX_TRG_PHRASE;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in base distribution)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-boost::shared_ptr<MT19937> prng;
-
-template <typename Base>
-struct ModelAndData {
- explicit ModelAndData(MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
- model(m),
- rng(&*prng),
- p0(b),
- baseprob(prob_t::One()),
- corpuse(ce),
- corpusf(cf),
- vocabe(ve),
- vocabf(vf),
- mh_samples(),
- mh_rejects(),
- kX(-TD::Convert("X")),
- derivations(corpuse.size()) {}
-
- void ResampleHyperparameters() {
- }
-
- void InstantiateRule(const pair<short,short>& from,
- const pair<short,short>& to,
- const vector<int>& sentf,
- const vector<int>& sente,
- TRule* rule) const {
- rule->f_.clear();
- rule->e_.clear();
- rule->lhs_ = kX;
- for (short i = from.first; i < to.first; ++i)
- rule->f_.push_back(sentf[i]);
- for (short i = from.second; i < to.second; ++i)
- rule->e_.push_back(sente[i]);
- }
-
- void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
- if (d.size() < 2) return;
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- model.DecrementRule(x);
- model.DecrementContinue();
- }
- model.DecrementStop();
- }
-
- void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
- if (d.size() < 2) return;
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- cerr << i << '/' << (d.size() - 1) << ": " << x << endl;
- }
- }
-
- void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
- if (d.size() < 2) return;
- TRule x;
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- model.IncrementRule(x);
- model.IncrementContinue();
- }
- model.IncrementStop();
- }
-
- prob_t Likelihood() const {
- return model.Likelihood();
- }
-
- prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
- prob_t p = model.StopProbability();
- if (d.size() < 2) return p;
- TRule x;
- const prob_t p_cont = model.ContinueProbability();
- for (int i = 1; i < d.size(); ++i) {
- InstantiateRule(d[i], d[i-1], sentf, sente, &x);
- p *= p_cont;
- p *= model.RuleProbability(x);
- }
- return p;
- }
-
- void Sample();
-
- MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& model;
- MT19937* rng;
- const Base& p0;
- prob_t baseprob; // cached value of generating the table labels from p0
- // this can't be used if we go to a hierarchical prior!
- const vector<vector<int> >& corpuse, corpusf;
- const set<int>& vocabe, vocabf;
- unsigned mh_samples, mh_rejects;
- const int kX;
- vector<vector<pair<short, short> > > derivations;
-};
-
-template <typename Base>
-void ModelAndData<Base>::Sample() {
- unsigned MAXK = kMAX_SRC_PHRASE;
- unsigned MAXL = kMAX_TRG_PHRASE;
- TRule x;
- x.lhs_ = -TD::Convert("X");
- for (int samples = 0; samples < 1000; ++samples) {
- if (samples % 1 == 0 && samples > 0) {
- //ResampleHyperparameters();
- cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n";
- for (int i = 0; i < 10; ++i) {
- cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl;
- PrintDerivation(derivations[i], corpusf[i], corpuse[i]);
- }
- }
- cerr << '.' << flush;
- for (int s = 0; s < corpuse.size(); ++s) {
- const vector<int>& sentf = corpusf[s];
- const vector<int>& sente = corpuse[s];
-// cerr << " CUSTOMERS: " << rules.num_customers() << endl;
-// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl;
-
- vector<pair<short, short> >& deriv = derivations[s];
- const prob_t p_cur = Likelihood();
- DecrementDerivation(deriv, sentf, sente);
-
- boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
- boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
- a[0][0] = prob_t::One();
- const prob_t q_stop = model.StopProbability();
- const prob_t q_cont = model.ContinueProbability();
- for (int i = 0; i < sentf.size(); ++i) {
- for (int j = 0; j < sente.size(); ++j) {
- const prob_t src_a = a[i][j];
- x.f_.clear();
- for (int k = 1; k <= MAXK; ++k) {
- if (i + k > sentf.size()) break;
- x.f_.push_back(sentf[i + k - 1]);
- x.e_.clear();
- for (int l = 1; l <= MAXL; ++l) {
- if (j + l > sente.size()) break;
- x.e_.push_back(sente[j + l - 1]);
- const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size());
- const prob_t& cp = stop_now ? q_stop : q_cont;
- trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp;
- a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
- }
- }
- }
- }
-// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl;
- const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
-
- vector<pair<short,short> > newderiv;
- int cur_i = sentf.size();
- int cur_j = sente.size();
- while(cur_i > 0 && cur_j > 0) {
- newderiv.push_back(pair<short,short>(cur_i, cur_j));
-// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
- SampleSet<prob_t> ss;
- vector<pair<short,short> > nexts;
- for (int k = 1; k <= MAXK; ++k) {
- const int hyp_i = cur_i - k;
- if (hyp_i < 0) break;
- for (int l = 1; l <= MAXL; ++l) {
- const int hyp_j = cur_j - l;
- if (hyp_j < 0) break;
- const prob_t& inside = a[hyp_i][hyp_j];
- if (inside == prob_t::Zero()) continue;
- const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
- if (transp == prob_t::Zero()) continue;
- const prob_t p = inside * transp;
- ss.add(p);
- nexts.push_back(pair<short,short>(hyp_i, hyp_j));
-// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl;
- }
- }
-// cerr << " sample set has " << nexts.size() << " elements.\n";
- const int selected = rng->SelectSample(ss);
- cur_i = nexts[selected].first;
- cur_j = nexts[selected].second;
- }
- newderiv.push_back(pair<short,short>(0,0));
- const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
- IncrementDerivation(newderiv, sentf, sente);
-// cerr << "SANITY: " << q_new << " " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl;
- if (deriv.empty()) { deriv = newderiv; continue; }
- ++mh_samples;
-
- if (deriv != newderiv) {
- const prob_t p_new = Likelihood();
-// cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl;
-// cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl;
- if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
- ++mh_rejects;
- DecrementDerivation(newderiv, sentf, sente);
- IncrementDerivation(deriv, sentf, sente);
- } else {
-// cerr << " ACCEPT\n";
- deriv = newderiv;
- }
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
-
- if (!conf.count("model1")) {
- cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- if (!conf.count("inverse_model1")) {
- cerr << argv[0] << ": Please use --inverse_model1 to specify inverse model 1 parameters\n";
- return 1;
- }
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
-// MT19937& rng = *prng;
-
- vector<vector<int> > corpuse, corpusf;
- set<int> vocabe, vocabf;
- corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
- cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
- assert(corpusf.size() == corpuse.size());
-
- Model1 m1(conf["model1"].as<string>());
- Model1 invm1(conf["inverse_model1"].as<string>());
-// PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
-
- ModelAndData<PhraseJointBase_BiDir> posterior(m, alp0, corpuse, corpusf, vocabe, vocabf);
- posterior.Sample();
-
- return 0;
-}
-
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
deleted file mode 100755
index d00c2168..00000000
--- a/gi/pf/guess-translits.pl
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-use utf8;
-
-my $MIN_PMI = -3;
-
-my %fs;
-my %es;
-my %ef;
-
-die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0;
-
-binmode(STDIN,":utf8");
-binmode(STDOUT,":utf8");
-binmode(STDERR,":utf8");
-
-my $tot = 0;
-print STDERR "Reading alignments from STDIN ...\n";
-while(<STDIN>) {
- chomp;
- my ($fsent, $esent, $alsent) = split / \|\|\| /;
- die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent;
-
- my @fws = split /\s+/, $fsent;
- my @ews = split /\s+/, $esent;
- my @as = split /\s+/, $alsent;
- my %a2b;
- my %b2a;
- for my $ap (@as) {
- my ($a,$b) = split /-/, $ap;
- die "BAD INPUT: $_\n" unless defined $a && defined $b;
- $a2b{$a}->{$b} = 1;
- $b2a{$b}->{$a} = 1;
- }
- for my $a (keys %a2b) {
- my $bref = $a2b{$a};
- next unless scalar keys %$bref < 2;
- my $b = (keys %$bref)[0];
- next unless scalar keys %{$b2a{$b}} < 2;
- my $f = $fws[$a];
- next unless defined $f;
- next unless length($f) > 3;
- my $e = $ews[$b];
- next unless defined $e;
- next unless length($e) > 3;
-
- $ef{$f}->{$e}++;
- $es{$e}++;
- $fs{$f}++;
- $tot++;
- }
-}
-my $ltot = log($tot);
-my $num = 0;
-print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n";
-for my $f (keys %fs) {
- my $logf = log($fs{$f});
- my $esref = $ef{$f};
- for my $e (keys %$esref) {
- my $loge = log($es{$e});
- my $ef = $esref->{$e};
- my $logef = log($ef);
- my $pmi = $logef - ($loge + $logf);
- next if $pmi < $MIN_PMI;
- my @flets = split //, $f;
- my @elets = split //, $e;
- print "@flets ||| @elets\n";
- $num++;
- }
-}
-print STDERR "Extracted $num pairs.\n";
-print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n";
diff --git a/gi/pf/hpyp_tm.cc b/gi/pf/hpyp_tm.cc
deleted file mode 100644
index f362d3f8..00000000
--- a/gi/pf/hpyp_tm.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-#include "hpyp_tm.h"
-
-#include <tr1/unordered_map>
-#include <iostream>
-#include <queue>
-
-#include "tdict.h"
-#include "ccrp.h"
-#include "pyp_word_model.h"
-#include "tied_resampler.h"
-
-using namespace std;
-using namespace std::tr1;
-
-struct FreqBinner {
- FreqBinner(const std::string& fname) { fd_.Load(fname); }
- unsigned NumberOfBins() const { return fd_.Max() + 1; }
- unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
- FreqDict<unsigned> fd_;
-};
-
-template <typename Base, class Binner = FreqBinner>
-struct ConditionalPYPWordModel {
- ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
- base(*b),
- binner(bnr),
- btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
-
- void Summary() const {
- cerr << "Number of conditioning contexts: " << r.size() << endl;
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
- for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- cerr << " " << i2->second << endl;
- }
- }
-
- void ResampleHyperparameters(MT19937* rng) {
- btr.ResampleHyperparameters(rng);
- }
-
- prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
- RuleModelHash::const_iterator it = r.find(src);
- if (it == r.end()) {
- return base(trglets);
- } else {
- return it->second.prob(trglets, base(trglets));
- }
- }
-
- void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
- RuleModelHash::iterator it = r.find(src);
- if (it == r.end()) {
- it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
- static const WordID kNULL = TD::Convert("NULL");
- unsigned bin = (src == kNULL ? 0 : 1);
- if (binner && bin) { bin = binner->Bin(src) + 1; }
- btr.Add(bin, &it->second);
- }
- if (it->second.increment(trglets, base(trglets), rng))
- base.Increment(trglets, rng);
- }
-
- void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
- RuleModelHash::iterator it = r.find(src);
- assert(it != r.end());
- if (it->second.decrement(trglets, rng)) {
- base.Decrement(trglets, rng);
- }
- }
-
- prob_t Likelihood() const {
- prob_t p = prob_t::One();
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- prob_t q; q.logeq(it->second.log_crp_prob());
- p *= q;
- }
- return p;
- }
-
- unsigned UniqueConditioningContexts() const {
- return r.size();
- }
-
- // TODO tie PYP hyperparameters based on source word frequency bins
- Base& base;
- const Binner* binner;
- BinTiedResampler<CCRP<vector<WordID> > > btr;
- typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
- RuleModelHash r;
-};
-
-HPYPLexicalTranslation::HPYPLexicalTranslation(const vector<vector<WordID> >& lets,
- const unsigned vocab_size,
- const unsigned num_letters) :
- letters(lets),
- base(vocab_size, num_letters, 5),
- up0(new PYPWordModel<PoissonUniformWordModel>(&base)),
- tmodel(new ConditionalPYPWordModel<PYPWordModel<PoissonUniformWordModel> >(up0, new FreqBinner("10k.freq"))),
- kX(-TD::Convert("X")) {}
-
-void HPYPLexicalTranslation::Summary() const {
- tmodel->Summary();
- up0->Summary();
-}
-
-prob_t HPYPLexicalTranslation::Likelihood() const {
- prob_t p = up0->Likelihood();
- p *= tmodel->Likelihood();
- return p;
-}
-
-void HPYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
- tmodel->ResampleHyperparameters(rng);
- up0->ResampleHyperparameters(rng);
-}
-
-unsigned HPYPLexicalTranslation::UniqueConditioningContexts() const {
- return tmodel->UniqueConditioningContexts();
-}
-
-prob_t HPYPLexicalTranslation::Prob(WordID src, WordID trg) const {
- return tmodel->Prob(src, letters[trg]);
-}
-
-void HPYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
- tmodel->Increment(src, letters[trg], rng);
-}
-
-void HPYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
- tmodel->Decrement(src, letters[trg], rng);
-}
-
diff --git a/gi/pf/hpyp_tm.h b/gi/pf/hpyp_tm.h
deleted file mode 100644
index af3215ba..00000000
--- a/gi/pf/hpyp_tm.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef HPYP_LEX_TRANS
-#define HPYP_LEX_TRANS
-
-#include <vector>
-#include "wordid.h"
-#include "prob.h"
-#include "sampler.h"
-#include "freqdict.h"
-#include "poisson_uniform_word_model.h"
-
-struct FreqBinner;
-template <class B> struct PYPWordModel;
-template <typename T, class B> struct ConditionalPYPWordModel;
-
-struct HPYPLexicalTranslation {
- explicit HPYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
- const unsigned vocab_size,
- const unsigned num_letters);
-
- prob_t Likelihood() const;
-
- void ResampleHyperparameters(MT19937* rng);
- prob_t Prob(WordID src, WordID trg) const; // return p(trg | src)
- void Summary() const;
- void Increment(WordID src, WordID trg, MT19937* rng);
- void Decrement(WordID src, WordID trg, MT19937* rng);
- unsigned UniqueConditioningContexts() const;
-
- private:
- const std::vector<std::vector<WordID> >& letters; // spelling dictionary
- PoissonUniformWordModel base; // "generator" of English types
- PYPWordModel<PoissonUniformWordModel>* up0; // model English lexicon
- ConditionalPYPWordModel<PYPWordModel<PoissonUniformWordModel>, FreqBinner>* tmodel; // translation distributions
- // (model English word | French word)
- const WordID kX;
-};
-
-#endif
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
deleted file mode 100644
index 29ec3860..00000000
--- a/gi/pf/itg.cc
+++ /dev/null
@@ -1,275 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "viterbi.h"
-#include "hg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-ostream& operator<<(ostream& os, const vector<WordID>& p) {
- os << '[';
- for (int i = 0; i < p.size(); ++i)
- os << (i==0 ? "" : " ") << TD::Convert(p[i]);
- return os << ']';
-}
-
-struct UnigramModel {
- explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) :
- use_uniform_(fname.size() == 0),
- p0null_(p0null),
- uniform_((1.0 - p0null) / vocab_size),
- probs_(TD::NumWords() + 1) {
- if (fname.size() > 0) LoadUnigrams(fname);
- probs_[0] = p0null_;
- }
-
-//
-// \data\
-// ngram 1=9295
-//
-// \1-grams:
-// -3.191193 "
-
- void LoadUnigrams(const string& fname) {
- cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
- ReadFile rf(fname);
- string line;
- istream& in = *rf.stream();
- assert(in);
- getline(in, line);
- assert(line.empty());
- getline(in, line);
- assert(line == "\\data\\");
- getline(in, line);
- size_t pos = line.find("ngram 1=");
- assert(pos == 0);
- assert(line.size() > 8);
- const size_t num_unigrams = atoi(&line[8]);
- getline(in, line);
- assert(line.empty());
- getline(in, line);
- assert(line == "\\1-grams:");
- for (size_t i = 0; i < num_unigrams; ++i) {
- getline(in, line);
- assert(line.size() > 0);
- pos = line.find('\t');
- assert(pos > 0);
- assert(pos + 1 < line.size());
- const WordID w = TD::Convert(line.substr(pos + 1));
- line[pos] = 0;
- float p = atof(&line[0]);
- const prob_t pnon_null(1.0 - p0null_.as_float());
- if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort();
- }
- }
-
- const prob_t& operator()(const WordID& w) const {
- if (!w) return p0null_;
- if (use_uniform_) return uniform_;
- return probs_[w];
- }
-
- const bool use_uniform_;
- const prob_t p0null_;
- const prob_t uniform_;
- vector<prob_t> probs_;
-};
-
-struct Model1 {
- explicit Model1(const string& fname) :
- kNULL(TD::Convert("<eps>")),
- kZERO() {
- LoadModel1(fname);
- }
-
- void LoadModel1(const string& fname) {
- cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- while(getline(in, line)) {
- ++lc;
- int cur = 0;
- int start = 0;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- const WordID src = TD::Convert(&line[0]);
- ++cur;
- start = cur;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- WordID trg = TD::Convert(&line[start]);
- const double logprob = strtod(&line[cur + 1], NULL);
- if (src >= ttable.size()) ttable.resize(src + 1);
- ttable[src][trg].logeq(logprob);
- }
- cerr << " read " << lc << " parameters.\n";
- }
-
- // returns prob 0 if src or trg is not found!
- const prob_t& operator()(WordID src, WordID trg) const {
- if (src == 0) src = kNULL;
- if (src < ttable.size()) {
- const map<WordID, prob_t>& cpd = ttable[src];
- const map<WordID, prob_t>::const_iterator it = cpd.find(trg);
- if (it != cpd.end())
- return it->second;
- }
- return kZERO;
- }
-
- const WordID kNULL;
- const prob_t kZERO;
- vector<map<WordID, prob_t> > ttable;
-};
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform")
- ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_f,
- set<WordID>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- istream* in;
- if (filename == "-")
- in = &cin;
- else
- in = new ifstream(filename.c_str());
- assert(*in);
- string line;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) { isf = false; } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- assert(cur != kDIV);
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
- if (in != &cin) delete in;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const unsigned particles = conf["particles"].as<unsigned>();
- const unsigned samples = conf["samples"].as<unsigned>();
- TD::Convert("<s>");
- TD::Convert("</s>");
- TD::Convert("<unk>");
- if (!conf.count("model1")) {
- cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- boost::shared_ptr<MT19937> prng;
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- vector<vector<WordID> > corpuse, corpusf;
- set<WordID> vocabe, vocabf;
- cerr << "Reading corpus...\n";
- ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- assert(corpusf.size() == corpuse.size());
- UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size());
- UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size());
- const prob_t kHALF(0.5);
-
- const string kEMPTY = "NULL";
- const int kLHS = -TD::Convert("X");
- Model1 m1(conf["model1"].as<string>());
- Model1 invm1(conf["inverse_model1"].as<string>());
- for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {
- cerr << '.' << flush;
- for (int ci = 0; ci < corpusf.size(); ++ci) {
- const vector<WordID>& trg = corpuse[ci];
- const vector<WordID>& src = corpusf[ci];
- for (int i = 0; i <= trg.size(); ++i) {
- const WordID e_i = i > 0 ? trg[i-1] : 0;
- for (int j = 0; j <= src.size(); ++j) {
- const WordID f_j = j > 0 ? src[j-1] : 0;
- if (e_i == 0 && f_j == 0) continue;
- prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j);
- cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl;
- if (e_i && f_j)
- cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl;
- }
- }
- }
- }
-}
-
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
deleted file mode 100644
index 1d5126e4..00000000
--- a/gi/pf/learn_cfg.cc
+++ /dev/null
@@ -1,428 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "inside_outside.h"
-#include "hg.h"
-#include "bottom_up_parser.h"
-#include "fdict.h"
-#include "grammar.h"
-#include "m.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> prng;
-vector<int> nt_vocab;
-vector<int> nt_id_to_index;
-static unsigned kMAX_RULE_SIZE = 0;
-static unsigned kMAX_ARITY = 0;
-static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs
-static bool kHIERARCHICAL_PRIOR = false;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)")
- ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)")
- ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS")
- ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary")
- ("hierarchical_prior,h", "Use hierarchical prior")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-unsigned ReadCorpus(const string& filename,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_e) {
- e->clear();
- vocab_e->clear();
- istream* in;
- if (filename == "-")
- in = &cin;
- else
- in = new ifstream(filename.c_str());
- assert(*in);
- string line;
- unsigned toks = 0;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(vector<int>());
- vector<int>& le = e->back();
- TD::ConvertSentence(line, &le);
- for (unsigned i = 0; i < le.size(); ++i)
- vocab_e->insert(le[i]);
- toks += le.size();
- }
- if (in != &cin) delete in;
- return toks;
-}
-
-struct Grid {
- // a b c d e
- // 0 - 0 - -
- vector<int> grid;
-};
-
-struct BaseRuleModel {
- explicit BaseRuleModel(unsigned term_size,
- unsigned nonterm_size = 1) :
- unif_term(1.0 / term_size),
- unif_nonterm(1.0 / nonterm_size) {}
- prob_t operator()(const TRule& r) const {
- prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size()));
- const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));
- const prob_t nonterm_prob(1.0 - term_prob.as_float());
- for (unsigned i = 0; i < r.f_.size(); ++i) {
- if (r.f_[i] <= 0) { // nonterminal
- if (kALLOW_MIXED) p *= nonterm_prob;
- p *= unif_nonterm;
- } else { // terminal
- if (kALLOW_MIXED) p *= term_prob;
- p *= unif_term;
- }
- }
- return p;
- }
- const prob_t unif_term, unif_nonterm;
-};
-
-struct HieroLMModel {
- explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) :
- base(vocab_size, num_nts),
- q0(1,1,1,1),
- nts(num_nts, CCRP<TRule>(1,1,1,1)) {}
-
- prob_t Prob(const TRule& r) const {
- return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r));
- }
-
- inline prob_t p0(const TRule& r) const {
- if (kHIERARCHICAL_PRIOR)
- return q0.prob(r, base(r));
- else
- return base(r);
- }
-
- int Increment(const TRule& r, MT19937* rng) {
- const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng);
- if (kHIERARCHICAL_PRIOR && delta)
- q0.increment(r, base(r), rng);
- return delta;
- // return x.increment(r);
- }
-
- int Decrement(const TRule& r, MT19937* rng) {
- const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng);
- if (kHIERARCHICAL_PRIOR && delta)
- q0.decrement(r, rng);
- return delta;
- //return x.decrement(r);
- }
-
- prob_t Likelihood() const {
- prob_t p = prob_t::One();
- for (unsigned i = 0; i < nts.size(); ++i) {
- prob_t q; q.logeq(nts[i].log_crp_prob());
- p *= q;
- for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) {
- prob_t tp = p0(it->first);
- tp.poweq(it->second.num_tables());
- p *= tp;
- }
- }
- if (kHIERARCHICAL_PRIOR) {
- prob_t q; q.logeq(q0.log_crp_prob());
- p *= q;
- for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) {
- prob_t tp = base(it->first);
- tp.poweq(it->second.num_tables());
- p *= tp;
- }
- }
- //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
- // p *= base(it->first);
- return p;
- }
-
- void ResampleHyperparameters(MT19937* rng) {
- for (unsigned i = 0; i < nts.size(); ++i)
- nts[i].resample_hyperparameters(rng);
- if (kHIERARCHICAL_PRIOR) {
- q0.resample_hyperparameters(rng);
- cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]";
- }
- cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl;
- }
-
- const BaseRuleModel base;
- CCRP<TRule> q0;
- vector<CCRP<TRule> > nts;
- //CCRP_OneTable<TRule> x;
-};
-
-vector<GrammarIter* > tofreelist;
-
-HieroLMModel* plm;
-
-struct NPGrammarIter : public GrammarIter, public RuleBin {
- NPGrammarIter() : arity() { tofreelist.push_back(this); }
- NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) {
- if (inr) {
- r.reset(new TRule(*inr));
- } else {
- r.reset(new TRule);
- }
- TRule& rr = *r;
- rr.lhs_ = nt_vocab[0];
- rr.f_.push_back(symbol);
- rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);
- tofreelist.push_back(this);
- }
- inline static unsigned NextArity(int cur_a, int symbol) {
- return cur_a + (symbol <= 0 ? 1 : 0);
- }
- virtual int GetNumRules() const {
- if (r) return nt_vocab.size(); else return 0;
- }
- virtual TRulePtr GetIthRule(int i) const {
- if (i == 0) return r;
- TRulePtr nr(new TRule(*r));
- nr->lhs_ = nt_vocab[i];
- return nr;
- }
- virtual int Arity() const {
- return arity;
- }
- virtual const RuleBin* GetRules() const {
- if (!r) return NULL; else return this;
- }
- virtual const GrammarIter* Extend(int symbol) const {
- const int next_arity = NextArity(arity, symbol);
- if (kMAX_ARITY && next_arity > kMAX_ARITY)
- return NULL;
- if (!kALLOW_MIXED && r) {
- bool t1 = r->f_.front() <= 0;
- bool t2 = symbol <= 0;
- if (t1 != t2) return NULL;
- }
- if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE))
- return new NPGrammarIter(r, next_arity, symbol);
- else
- return NULL;
- }
- const unsigned char arity;
- TRulePtr r;
-};
-
-struct NPGrammar : public Grammar {
- virtual const GrammarIter* GetRoot() const {
- return new NPGrammarIter;
- }
-};
-
-prob_t TotalProb(const Hypergraph& hg) {
- return Inside<prob_t, EdgeProb>(hg);
-}
-
-void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) {
- vector<prob_t> node_probs;
- Inside<prob_t, EdgeProb>(hg, &node_probs);
- queue<unsigned> q;
- q.push(hg.nodes_.size() - 2);
- while(!q.empty()) {
- unsigned cur_node_id = q.front();
-// cerr << "NODE=" << cur_node_id << endl;
- q.pop();
- const Hypergraph::Node& node = hg.nodes_[cur_node_id];
- const unsigned num_in_edges = node.in_edges_.size();
- unsigned sampled_edge = 0;
- if (num_in_edges == 1) {
- sampled_edge = node.in_edges_[0];
- } else {
- //prob_t z;
- assert(num_in_edges > 1);
- SampleSet<prob_t> ss;
- for (unsigned j = 0; j < num_in_edges; ++j) {
- const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
- prob_t p = edge.edge_prob_;
- for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
- p *= node_probs[edge.tail_nodes_[k]];
- ss.add(p);
-// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
- //z += p;
- }
-// for (unsigned j = 0; j < num_in_edges; ++j) {
-// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
-// }
-// cerr << " --- \n";
- sampled_edge = node.in_edges_[rng->SelectSample(ss)];
- }
- sampled_deriv->push_back(sampled_edge);
- const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
- for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
- q.push(edge.tail_nodes_[j]);
- }
- }
- for (unsigned i = 0; i < sampled_deriv->size(); ++i) {
- cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
- }
-}
-
-void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
- for (unsigned i = 0; i < d.size(); ++i)
- plm->Increment(*hg.edges_[d[i]].rule_, rng);
-}
-
-void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
- for (unsigned i = 0; i < d.size(); ++i)
- plm->Decrement(*hg.edges_[d[i]].rule_, rng);
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
-
- InitCommandLine(argc, argv, &conf);
- nt_vocab.resize(conf["nonterminals"].as<unsigned>());
- assert(nt_vocab.size() > 0);
- assert(nt_vocab.size() < 26);
- {
- string nt = "X";
- for (unsigned i = 0; i < nt_vocab.size(); ++i) {
- if (nt_vocab.size() > 1) nt[0] = ('A' + i);
- int pid = TD::Convert(nt);
- nt_vocab[i] = -pid;
- if (pid >= nt_id_to_index.size()) {
- nt_id_to_index.resize(pid + 1, -1);
- }
- nt_id_to_index[pid] = i;
- }
- }
- vector<GrammarPtr> grammars;
- grammars.push_back(GrammarPtr(new NPGrammar));
-
- const unsigned samples = conf["samples"].as<unsigned>();
- kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>();
- if (kMAX_RULE_SIZE == 1) {
- cerr << "Invalid maximum rule size: must be 0 or >1\n";
- return 1;
- }
- kMAX_ARITY = conf["max_arity"].as<unsigned>();
- if (kMAX_ARITY == 1) {
- cerr << "Invalid maximum arity: must be 0 or >1\n";
- return 1;
- }
- kALLOW_MIXED = !conf.count("no_mixed_rules");
-
- kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior");
-
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
- vector<vector<WordID> > corpuse;
- set<WordID> vocabe;
- cerr << "Reading corpus...\n";
- const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- HieroLMModel lm(vocabe.size(), nt_vocab.size());
-
- plm = &lm;
- ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars);
-
- Hypergraph hg;
- const int kGoal = -TD::Convert("Goal");
- const int kLP = FD::Convert("LogProb");
- SparseVector<double> v; v.set_value(kLP, 1.0);
- vector<vector<unsigned> > derivs(corpuse.size());
- vector<Lattice> cl(corpuse.size());
- for (int ci = 0; ci < corpuse.size(); ++ci) {
- vector<int>& src = corpuse[ci];
- Lattice& lat = cl[ci];
- lat.resize(src.size());
- for (unsigned i = 0; i < src.size(); ++i)
- lat[i].push_back(LatticeArc(src[i], 0.0, 1));
- }
- for (int SS=0; SS < samples; ++SS) {
- const bool is_last = ((samples - 1) == SS);
- prob_t dlh = prob_t::One();
- for (int ci = 0; ci < corpuse.size(); ++ci) {
- const vector<int>& src = corpuse[ci];
- const Lattice& lat = cl[ci];
- cerr << TD::GetString(src) << endl;
- hg.clear();
- parser.Parse(lat, &hg); // exhaustive parse
- vector<unsigned>& d = derivs[ci];
- if (!is_last) DecrementDerivation(hg, d, &lm, &rng);
- for (unsigned i = 0; i < hg.edges_.size(); ++i) {
- TRule& r = *hg.edges_[i].rule_;
- if (r.lhs_ == kGoal)
- hg.edges_[i].edge_prob_ = prob_t::One();
- else
- hg.edges_[i].edge_prob_ = lm.Prob(r);
- }
- if (!is_last) {
- d.clear();
- SampleDerivation(hg, &rng, &d);
- IncrementDerivation(hg, derivs[ci], &lm, &rng);
- } else {
- prob_t p = TotalProb(hg);
- dlh *= p;
- cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl;
- }
- if (tofreelist.size() > 200000) {
- cerr << "Freeing ... ";
- for (unsigned i = 0; i < tofreelist.size(); ++i)
- delete tofreelist[i];
- tofreelist.clear();
- cerr << "Freed.\n";
- }
- }
- double llh = log(lm.Likelihood());
- cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl;
- if (SS % 10 == 9) lm.ResampleHyperparameters(&rng);
- if (is_last) {
- double z = log(dlh);
- cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl;
- }
- }
- for (unsigned i = 0; i < nt_vocab.size(); ++i)
- cerr << lm.nts[i] << endl;
- return 0;
-}
-
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
deleted file mode 100755
index fdcd3555..00000000
--- a/gi/pf/make-freq-bins.pl
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $BASE = 6;
-my $CUTOFF = 3;
-
-my %d;
-my $num = 0;
-while(<>){
- chomp;
- my @words = split /\s+/;
- for my $w (@words) {$d{$w}++; $num++;}
-}
-
-my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
-
-for (my $i=0; $i<scalar @vocab; $i++) {
- my $most = $d{$vocab[$i]};
- my $least = 1;
-
- my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF);
- if ($nl < 0) { $nl = 0; }
- print "$vocab[$i] $nl\n"
-}
-
-
diff --git a/gi/pf/mh_test.cc b/gi/pf/mh_test.cc
deleted file mode 100644
index 296e7285..00000000
--- a/gi/pf/mh_test.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "ccrp.h"
-
-#include <vector>
-#include <iostream>
-
-#include "tdict.h"
-#include "transliterations.h"
-
-using namespace std;
-
-MT19937 rng;
-
-static bool verbose = false;
-
-struct Model {
-
- Model() : bp(), base(0.2, 0.6) , ccrps(5, CCRP<int>(0.8, 0.5)) {}
-
- double p0(int x) const {
- assert(x > 0);
- assert(x < 5);
- return 1.0/4.0;
- }
-
- double llh() const {
- double lh = bp + base.log_crp_prob();
- for (int ctx = 1; ctx < 5; ++ctx)
- lh += ccrps[ctx].log_crp_prob();
- return lh;
- }
-
- double prob(int ctx, int x) const {
- assert(ctx > 0 && ctx < 5);
- return ccrps[ctx].prob(x, base.prob(x, p0(x)));
- }
-
- void increment(int ctx, int x) {
- assert(ctx > 0 && ctx < 5);
- if (ccrps[ctx].increment(x, base.prob(x, p0(x)), &rng)) {
- if (base.increment(x, p0(x), &rng)) {
- bp += log(1.0 / 4.0);
- }
- }
- }
-
- // this is just a biased estimate
- double est_base_prob(int x) {
- return (x + 1) * x / 40.0;
- }
-
- void increment_is(int ctx, int x) {
- assert(ctx > 0 && ctx < 5);
- SampleSet<double> ss;
- const int PARTICLES = 25;
- vector<CCRP<int> > s1s(PARTICLES, CCRP<int>(0.5,0.5));
- vector<CCRP<int> > sbs(PARTICLES, CCRP<int>(0.5,0.5));
- vector<double> sp0s(PARTICLES);
-
- CCRP<int> s1 = ccrps[ctx];
- CCRP<int> sb = base;
- double sp0 = bp;
- for (int pp = 0; pp < PARTICLES; ++pp) {
- if (pp > 0) {
- ccrps[ctx] = s1;
- base = sb;
- bp = sp0;
- }
-
- double q = 1;
- double gamma = 1;
- double est_p = est_base_prob(x);
- //base.prob(x, p0(x)) + rng.next() * 0.1;
- if (ccrps[ctx].increment(x, est_p, &rng, &q)) {
- gamma = q * base.prob(x, p0(x));
- q *= est_p;
- if (verbose) cerr << "(DP-base draw) ";
- double qq = -1;
- if (base.increment(x, p0(x), &rng, &qq)) {
- if (verbose) cerr << "(G0 draw) ";
- bp += log(p0(x));
- qq *= p0(x);
- }
- } else { gamma = q; }
- double w = gamma / q;
- if (verbose)
- cerr << "gamma=" << gamma << " q=" << q << "\tw=" << w << endl;
- ss.add(w);
- s1s[pp] = ccrps[ctx];
- sbs[pp] = base;
- sp0s[pp] = bp;
- }
- int ps = rng.SelectSample(ss);
- ccrps[ctx] = s1s[ps];
- base = sbs[ps];
- bp = sp0s[ps];
- if (verbose) {
- cerr << "SELECTED: " << ps << endl;
- static int cc = 0; cc++; if (cc ==10) exit(1);
- }
- }
-
- void decrement(int ctx, int x) {
- assert(ctx > 0 && ctx < 5);
- if (ccrps[ctx].decrement(x, &rng)) {
- if (base.decrement(x, &rng)) {
- bp -= log(p0(x));
- }
- }
- }
-
- double bp;
- CCRP<int> base;
- vector<CCRP<int> > ccrps;
-
-};
-
-int main(int argc, char** argv) {
- if (argc > 1) { verbose = true; }
- vector<int> counts(15, 0);
- vector<int> tcounts(15, 0);
- int points[] = {1,2, 2,2, 3,2, 4,1, 3, 4, 3, 3, 2, 3, 4, 1, 4, 1, 3, 2, 1, 3, 1, 4, 0, 0};
- double tlh = 0;
- double tt = 0;
- for (int n = 0; n < 1000; ++n) {
- if (n % 10 == 0) cerr << '.';
- if ((n+1) % 400 == 0) cerr << " [" << (n+1) << "]\n";
- Model m;
- for (int *x = points; *x; x += 2)
- m.increment(x[0], x[1]);
-
- for (int j = 0; j < 24; ++j) {
- for (int *x = points; *x; x += 2) {
- if (rng.next() < 0.8) {
- m.decrement(x[0], x[1]);
- m.increment_is(x[0], x[1]);
- }
- }
- }
- counts[m.base.num_customers()]++;
- tcounts[m.base.num_tables()]++;
- tlh += m.llh();
- tt += 1.0;
- }
- cerr << "mean LLH = " << (tlh / tt) << endl;
- for (int i = 0; i < 15; ++i)
- cerr << i << ": " << (counts[i] / tt) << "\t" << (tcounts[i] / tt) << endl;
-}
-
diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
deleted file mode 100644
index 10d171fe..00000000
--- a/gi/pf/monotonic_pseg.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef _MONOTONIC_PSEG_H_
-#define _MONOTONIC_PSEG_H_
-
-#include <vector>
-
-#include "prob.h"
-#include "ccrp_nt.h"
-#include "trule.h"
-#include "base_distributions.h"
-
-template <typename BaseMeasure>
-struct MonotonicParallelSegementationModel {
- explicit MonotonicParallelSegementationModel(BaseMeasure& rcp0) :
- rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {}
-
- void DecrementRule(const TRule& rule) {
- if (rules.decrement(rule))
- base /= rp0(rule);
- }
-
- void IncrementRule(const TRule& rule) {
- if (rules.increment(rule))
- base *= rp0(rule);
- }
-
- void IncrementRulesAndStops(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- if (rules.size()) IncrementContinue(rules.size() - 1);
- IncrementStop();
- }
-
- void DecrementRulesAndStops(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- if (rules.size()) {
- DecrementContinue(rules.size() - 1);
- DecrementStop();
- }
- }
-
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
- return p;
- }
-
- prob_t Likelihood() const {
- prob_t p = base;
- prob_t q; q.logeq(rules.log_crp_prob());
- p *= q;
- q.logeq(stop.log_crp_prob());
- p *= q;
- return p;
- }
-
- void IncrementStop() {
- stop.increment(true);
- }
-
- void IncrementContinue(int n = 1) {
- for (int i = 0; i < n; ++i)
- stop.increment(false);
- }
-
- void DecrementStop() {
- stop.decrement(true);
- }
-
- void DecrementContinue(int n = 1) {
- for (int i = 0; i < n; ++i)
- stop.decrement(false);
- }
-
- prob_t StopProbability() const {
- return prob_t(stop.prob(true, 0.5));
- }
-
- prob_t ContinueProbability() const {
- return prob_t(stop.prob(false, 0.5));
- }
-
- const BaseMeasure& rp0;
- prob_t base;
- CCRP_NoTable<TRule> rules;
- CCRP_NoTable<bool> stop;
-};
-
-#endif
-
diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc
deleted file mode 100644
index 1299f06f..00000000
--- a/gi/pf/ngram_base.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "ngram_base.h"
-
-#include "lm/model.hh"
-#include "tdict.h"
-
-using namespace std;
-
-namespace {
-struct GICSVMapper : public lm::EnumerateVocab {
- GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); }
- void Add(lm::WordIndex index, const StringPiece &str) {
- const WordID cdec_id = TD::Convert(str.as_string());
- if (cdec_id >= out_->size())
- out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN);
- (*out_)[cdec_id] = index;
- }
- vector<lm::WordIndex>* out_;
- const lm::WordIndex kLM_UNKNOWN_TOKEN;
-};
-}
-
-struct FixedNgramBaseImpl {
- FixedNgramBaseImpl(const string& param) {
- GICSVMapper vm(&cdec2klm_map_);
- lm::ngram::Config conf;
- conf.enumerate_vocab = &vm;
- cerr << "Reading character LM from " << param << endl;
- model = new lm::ngram::ProbingModel(param.c_str(), conf);
- order = model->Order();
- kEOS = MapWord(TD::Convert("</s>"));
- assert(kEOS > 0);
- }
-
- lm::WordIndex MapWord(const WordID w) const {
- if (w < cdec2klm_map_.size()) return cdec2klm_map_[w];
- return 0;
- }
-
- ~FixedNgramBaseImpl() { delete model; }
-
- prob_t StringProbability(const vector<WordID>& s) const {
- lm::ngram::State state = model->BeginSentenceState();
- double prob = 0;
- for (unsigned i = 0; i < s.size(); ++i) {
- const lm::ngram::State scopy(state);
- prob += model->Score(scopy, MapWord(s[i]), state);
- }
- const lm::ngram::State scopy(state);
- prob += model->Score(scopy, kEOS, state);
- prob_t p; p.logeq(prob * log(10));
- return p;
- }
-
- lm::ngram::ProbingModel* model;
- unsigned order;
- vector<lm::WordIndex> cdec2klm_map_;
- lm::WordIndex kEOS;
-};
-
-FixedNgramBase::~FixedNgramBase() { delete impl; }
-
-FixedNgramBase::FixedNgramBase(const string& lmfname) {
- impl = new FixedNgramBaseImpl(lmfname);
-}
-
-prob_t FixedNgramBase::StringProbability(const vector<WordID>& s) const {
- return impl->StringProbability(s);
-}
-
diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h
deleted file mode 100644
index 4ea999f3..00000000
--- a/gi/pf/ngram_base.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _NGRAM_BASE_H_
-#define _NGRAM_BASE_H_
-
-#include <string>
-#include <vector>
-#include "trule.h"
-#include "wordid.h"
-#include "prob.h"
-
-struct FixedNgramBaseImpl;
-struct FixedNgramBase {
- FixedNgramBase(const std::string& lmfname);
- ~FixedNgramBase();
- prob_t StringProbability(const std::vector<WordID>& s) const;
-
- prob_t operator()(const TRule& rule) const {
- return StringProbability(rule.e_);
- }
-
- private:
- FixedNgramBaseImpl* impl;
-
-};
-
-#endif
diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc
deleted file mode 100644
index fc0af9cb..00000000
--- a/gi/pf/nuisance_test.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-#include "ccrp.h"
-
-#include <vector>
-#include <iostream>
-
-#include "tdict.h"
-#include "transliterations.h"
-
-using namespace std;
-
-MT19937 rng;
-
-ostream& operator<<(ostream&os, const vector<int>& v) {
- os << '[' << v[0];
- if (v.size() == 2) os << ' ' << v[1];
- return os << ']';
-}
-
-struct Base {
- Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {}
- inline double p0(const vector<int>& x) const {
- double p = 0.75;
- if (x.size() == 2) p = 0.25;
- p *= 1.0 / 3.0;
- if (x.size() == 2) p *= 1.0 / 3.0;
- return p;
- }
- double est_deriv_prob(int a, int b, int seg) const {
- assert(a > 0 && a < 4); // a \in {1,2,3}
- assert(b > 0 && b < 4); // b \in {1,2,3}
- assert(seg == 0 || seg == 1); // seg \in {0,1}
- if (seg == 0) {
- v[0] = a;
- v[1] = b;
- return crp.prob(v, p0(v));
- } else {
- v1[0] = a;
- v2[0] = b;
- return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2));
- }
- }
- double est_marginal_prob(int a, int b) const {
- return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1);
- }
- int increment(int a, int b, double* pw = NULL) {
- double p1 = est_deriv_prob(a, b, 0);
- double p2 = est_deriv_prob(a, b, 1);
- //p1 = 0.5; p2 = 0.5;
- int seg = rng.SelectSample(p1,p2);
- double tmp = 0;
- if (!pw) pw = &tmp;
- double& w = *pw;
- if (seg == 0) {
- v[0] = a;
- v[1] = b;
- w = crp.prob(v, p0(v)) / p1;
- if (crp.increment(v, p0(v), &rng)) {
- llh += log(p0(v));
- }
- } else {
- v1[0] = a;
- w = crp.prob(v1, p0(v1)) / p2;
- if (crp.increment(v1, p0(v1), &rng)) {
- llh += log(p0(v1));
- }
- v2[0] = b;
- w *= crp.prob(v2, p0(v2));
- if (crp.increment(v2, p0(v2), &rng)) {
- llh += log(p0(v2));
- }
- }
- return seg;
- }
- void increment(int a, int b, int seg) {
- if (seg == 0) {
- v[0] = a;
- v[1] = b;
- if (crp.increment(v, p0(v), &rng)) {
- llh += log(p0(v));
- }
- } else {
- v1[0] = a;
- if (crp.increment(v1, p0(v1), &rng)) {
- llh += log(p0(v1));
- }
- v2[0] = b;
- if (crp.increment(v2, p0(v2), &rng)) {
- llh += log(p0(v2));
- }
- }
- }
- void decrement(int a, int b, int seg) {
- if (seg == 0) {
- v[0] = a;
- v[1] = b;
- if (crp.decrement(v, &rng)) {
- llh -= log(p0(v));
- }
- } else {
- v1[0] = a;
- if (crp.decrement(v1, &rng)) {
- llh -= log(p0(v1));
- }
- v2[0] = b;
- if (crp.decrement(v2, &rng)) {
- llh -= log(p0(v2));
- }
- }
- }
- double log_likelihood() const {
- return llh + crp.log_crp_prob();
- }
- double llh;
- mutable vector<int> v, v1, v2;
- CCRP<vector<int> > crp;
-};
-
-int main(int argc, char** argv) {
- double tl = 0;
- const int ITERS = 1000;
- const int PARTICLES = 20;
- const int DATAPOINTS = 50;
- WordID x = TD::Convert("souvenons");
- WordID y = TD::Convert("remember");
- vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src);
- vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg);
-// Transliterations xx;
-// xx.Initialize(x, src, y, trg);
-// return 1;
-
- for (int j = 0; j < ITERS; ++j) {
- Base b;
- vector<int> segs(DATAPOINTS);
- SampleSet<double> ss;
- vector<int> sss;
- for (int i = 0; i < DATAPOINTS; i++) {
- ss.clear();
- sss.clear();
- int x = ((i / 10) % 3) + 1;
- int y = (i % 3) + 1;
- //double ep = b.est_marginal_prob(x,y);
- //cerr << "est p(" << x << "," << y << ") = " << ep << endl;
- for (int n = 0; n < PARTICLES; ++n) {
- double w;
- int seg = b.increment(x,y,&w);
- //cerr << seg << " w=" << w << endl;
- ss.add(w);
- sss.push_back(seg);
- b.decrement(x,y,seg);
- }
- int seg = sss[rng.SelectSample(ss)];
- b.increment(x, y, seg);
- //cerr << "Selected: " << seg << endl;
- //return 1;
- segs[i] = seg;
- }
- tl += b.log_likelihood();
- }
- cerr << "LLH=" << tl / ITERS << endl;
-}
-
diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h
deleted file mode 100644
index dfe40cb1..00000000
--- a/gi/pf/os_phrase.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _OS_PHRASE_H_
-#define _OS_PHRASE_H_
-
-#include <iostream>
-#include <vector>
-#include "tdict.h"
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
- os << '[';
- for (int i = 0; i < p.size(); ++i)
- os << (i==0 ? "" : " ") << TD::Convert(p[i]);
- return os << ']';
-}
-
-#endif
diff --git a/gi/pf/pf.h b/gi/pf/pf.h
deleted file mode 100644
index ede7cda8..00000000
--- a/gi/pf/pf.h
+++ /dev/null
@@ -1,84 +0,0 @@
-#ifndef _PF_H_
-#define _PF_H_
-
-#include <cassert>
-#include <vector>
-#include "sampler.h"
-#include "prob.h"
-
-template <typename ParticleType>
-struct ParticleRenormalizer {
- void operator()(std::vector<ParticleType>* pv) const {
- if (pv->empty()) return;
- prob_t z = prob_t::Zero();
- for (unsigned i = 0; i < pv->size(); ++i)
- z += (*pv)[i].weight;
- assert(z > prob_t::Zero());
- for (unsigned i = 0; i < pv->size(); ++i)
- (*pv)[i].weight /= z;
- }
-};
-
-template <typename ParticleType>
-struct MultinomialResampleFilter {
- explicit MultinomialResampleFilter(MT19937* rng) : rng_(rng) {}
-
- void operator()(std::vector<ParticleType>* pv) {
- if (pv->empty()) return;
- std::vector<ParticleType>& ps = *pv;
- SampleSet<prob_t> ss;
- for (int i = 0; i < ps.size(); ++i)
- ss.add(ps[i].weight);
- std::vector<ParticleType> nps; nps.reserve(ps.size());
- const prob_t uniform_weight(1.0 / ps.size());
- for (int i = 0; i < ps.size(); ++i) {
- nps.push_back(ps[rng_->SelectSample(ss)]);
- nps[i].weight = uniform_weight;
- }
- nps.swap(ps);
- }
-
- private:
- MT19937* rng_;
-};
-
-template <typename ParticleType>
-struct SystematicResampleFilter {
- explicit SystematicResampleFilter(MT19937* rng) : rng_(rng), renorm_() {}
-
- void operator()(std::vector<ParticleType>* pv) {
- if (pv->empty()) return;
- renorm_(pv);
- std::vector<ParticleType>& ps = *pv;
- std::vector<ParticleType> nps; nps.reserve(ps.size());
- double lower = 0, upper = 0;
- const double skip = 1.0 / ps.size();
- double u_j = rng_->next() * skip;
- //std::cerr << "u_0: " << u_j << std::endl;
- int j = 0;
- for (unsigned i = 0; i < ps.size(); ++i) {
- upper += ps[i].weight.as_float();
- //std::cerr << "lower: " << lower << " upper: " << upper << std::endl;
- // how many children does ps[i] have?
- while (u_j < lower) { u_j += skip; ++j; }
- while (u_j >= lower && u_j <= upper) {
- assert(j < ps.size());
- nps.push_back(ps[i]);
- u_j += skip;
- //std::cerr << " add u_j=" << u_j << std::endl;
- ++j;
- }
- lower = upper;
- }
- //std::cerr << ps.size() << " " << nps.size() << "\n";
- assert(ps.size() == nps.size());
- //exit(1);
- ps.swap(nps);
- }
-
- private:
- MT19937* rng_;
- ParticleRenormalizer<ParticleType> renorm_;
-};
-
-#endif
diff --git a/gi/pf/pf_test.cc b/gi/pf/pf_test.cc
deleted file mode 100644
index 296e7285..00000000
--- a/gi/pf/pf_test.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "ccrp.h"
-
-#include <vector>
-#include <iostream>
-
-#include "tdict.h"
-#include "transliterations.h"
-
-using namespace std;
-
-MT19937 rng;
-
-static bool verbose = false;
-
-struct Model {
-
- Model() : bp(), base(0.2, 0.6) , ccrps(5, CCRP<int>(0.8, 0.5)) {}
-
- double p0(int x) const {
- assert(x > 0);
- assert(x < 5);
- return 1.0/4.0;
- }
-
- double llh() const {
- double lh = bp + base.log_crp_prob();
- for (int ctx = 1; ctx < 5; ++ctx)
- lh += ccrps[ctx].log_crp_prob();
- return lh;
- }
-
- double prob(int ctx, int x) const {
- assert(ctx > 0 && ctx < 5);
- return ccrps[ctx].prob(x, base.prob(x, p0(x)));
- }
-
- void increment(int ctx, int x) {
- assert(ctx > 0 && ctx < 5);
- if (ccrps[ctx].increment(x, base.prob(x, p0(x)), &rng)) {
- if (base.increment(x, p0(x), &rng)) {
- bp += log(1.0 / 4.0);
- }
- }
- }
-
- // this is just a biased estimate
- double est_base_prob(int x) {
- return (x + 1) * x / 40.0;
- }
-
- void increment_is(int ctx, int x) {
- assert(ctx > 0 && ctx < 5);
- SampleSet<double> ss;
- const int PARTICLES = 25;
- vector<CCRP<int> > s1s(PARTICLES, CCRP<int>(0.5,0.5));
- vector<CCRP<int> > sbs(PARTICLES, CCRP<int>(0.5,0.5));
- vector<double> sp0s(PARTICLES);
-
- CCRP<int> s1 = ccrps[ctx];
- CCRP<int> sb = base;
- double sp0 = bp;
- for (int pp = 0; pp < PARTICLES; ++pp) {
- if (pp > 0) {
- ccrps[ctx] = s1;
- base = sb;
- bp = sp0;
- }
-
- double q = 1;
- double gamma = 1;
- double est_p = est_base_prob(x);
- //base.prob(x, p0(x)) + rng.next() * 0.1;
- if (ccrps[ctx].increment(x, est_p, &rng, &q)) {
- gamma = q * base.prob(x, p0(x));
- q *= est_p;
- if (verbose) cerr << "(DP-base draw) ";
- double qq = -1;
- if (base.increment(x, p0(x), &rng, &qq)) {
- if (verbose) cerr << "(G0 draw) ";
- bp += log(p0(x));
- qq *= p0(x);
- }
- } else { gamma = q; }
- double w = gamma / q;
- if (verbose)
- cerr << "gamma=" << gamma << " q=" << q << "\tw=" << w << endl;
- ss.add(w);
- s1s[pp] = ccrps[ctx];
- sbs[pp] = base;
- sp0s[pp] = bp;
- }
- int ps = rng.SelectSample(ss);
- ccrps[ctx] = s1s[ps];
- base = sbs[ps];
- bp = sp0s[ps];
- if (verbose) {
- cerr << "SELECTED: " << ps << endl;
- static int cc = 0; cc++; if (cc ==10) exit(1);
- }
- }
-
- void decrement(int ctx, int x) {
- assert(ctx > 0 && ctx < 5);
- if (ccrps[ctx].decrement(x, &rng)) {
- if (base.decrement(x, &rng)) {
- bp -= log(p0(x));
- }
- }
- }
-
- double bp;
- CCRP<int> base;
- vector<CCRP<int> > ccrps;
-
-};
-
-int main(int argc, char** argv) {
- if (argc > 1) { verbose = true; }
- vector<int> counts(15, 0);
- vector<int> tcounts(15, 0);
- int points[] = {1,2, 2,2, 3,2, 4,1, 3, 4, 3, 3, 2, 3, 4, 1, 4, 1, 3, 2, 1, 3, 1, 4, 0, 0};
- double tlh = 0;
- double tt = 0;
- for (int n = 0; n < 1000; ++n) {
- if (n % 10 == 0) cerr << '.';
- if ((n+1) % 400 == 0) cerr << " [" << (n+1) << "]\n";
- Model m;
- for (int *x = points; *x; x += 2)
- m.increment(x[0], x[1]);
-
- for (int j = 0; j < 24; ++j) {
- for (int *x = points; *x; x += 2) {
- if (rng.next() < 0.8) {
- m.decrement(x[0], x[1]);
- m.increment_is(x[0], x[1]);
- }
- }
- }
- counts[m.base.num_customers()]++;
- tcounts[m.base.num_tables()]++;
- tlh += m.llh();
- tt += 1.0;
- }
- cerr << "mean LLH = " << (tlh / tt) << endl;
- for (int i = 0; i < 15; ++i)
- cerr << i << ": " << (counts[i] / tt) << "\t" << (tcounts[i] / tt) << endl;
-}
-
diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc
deleted file mode 100644
index 832f22cf..00000000
--- a/gi/pf/pfbrat.cc
+++ /dev/null
@@ -1,543 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "viterbi.h"
-#include "hg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "cfg_wfst_composer.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-static unsigned kMAX_SRC_PHRASE;
-static unsigned kMAX_TRG_PHRASE;
-struct FSTState;
-
-double log_poisson(unsigned x, const double& lambda) {
- assert(lambda > 0.0);
- return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-struct ConditionalBase {
- explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) :
- kM1MIXTURE(m1mixture),
- kUNIFORM_MIXTURE(1.0 - m1mixture),
- kUNIFORM_TARGET(1.0 / vocab_e_size),
- kNULL(TD::Convert("<eps>")) {
- assert(m1mixture >= 0.0 && m1mixture <= 1.0);
- assert(vocab_e_size > 0);
- LoadModel1(model1fname);
- }
-
- void LoadModel1(const string& fname) {
- cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- while(getline(in, line)) {
- ++lc;
- int cur = 0;
- int start = 0;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- const WordID src = TD::Convert(&line[0]);
- ++cur;
- start = cur;
- while(cur < line.size() && line[cur] != ' ') { ++cur; }
- assert(cur != line.size());
- line[cur] = 0;
- WordID trg = TD::Convert(&line[start]);
- const double logprob = strtod(&line[cur + 1], NULL);
- if (src >= ttable.size()) ttable.resize(src + 1);
- ttable[src][trg].logeq(logprob);
- }
- cerr << " read " << lc << " parameters.\n";
- }
-
- // return logp0 of rule.e_ | rule.f_
- prob_t operator()(const TRule& rule) const {
- const int flen = rule.f_.size();
- const int elen = rule.e_.size();
- prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
- prob_t p;
- p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
- for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
- const WordID trg = rule.e_[i];
- prob_t tp = prob_t::Zero();
- for (int j = -1; j < flen; ++j) {
- const WordID src = j < 0 ? kNULL : rule.f_[j];
- const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg);
- if (it != ttable[src].end()) {
- tp += kM1MIXTURE * it->second;
- }
- tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
- }
- tp *= uniform_src_alignment; // draw a_i ~uniform
- p *= tp; // draw e_i ~Model1(f_a_i) / uniform
- }
- return p;
- }
-
- const prob_t kM1MIXTURE; // Model 1 mixture component
- const prob_t kUNIFORM_MIXTURE; // uniform mixture component
- const prob_t kUNIFORM_TARGET;
- const WordID kNULL;
- vector<map<WordID, prob_t> > ttable;
-};
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<int> >* e,
- set<int>* vocab_f,
- set<int>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- istream* in;
- if (filename == "-")
- in = &cin;
- else
- in = new ifstream(filename.c_str());
- assert(*in);
- string line;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) { isf = false; } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- assert(cur != kDIV);
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
- if (in != &cin) delete in;
-}
-
-struct UniphraseLM {
- UniphraseLM(const vector<vector<int> >& corpus,
- const set<int>& vocab,
- const po::variables_map& conf) :
- phrases_(1,1),
- gen_(1,1),
- corpus_(corpus),
- uniform_word_(1.0 / vocab.size()),
- gen_p0_(0.5),
- p_end_(0.5),
- use_poisson_(conf.count("poisson_length") > 0) {}
-
- void ResampleHyperparameters(MT19937* rng) {
- phrases_.resample_hyperparameters(rng);
- gen_.resample_hyperparameters(rng);
- cerr << " " << phrases_.alpha();
- }
-
- CCRP_NoTable<vector<int> > phrases_;
- CCRP_NoTable<bool> gen_;
- vector<vector<bool> > z_; // z_[i] is there a phrase boundary after the ith word
- const vector<vector<int> >& corpus_;
- const double uniform_word_;
- const double gen_p0_;
- const double p_end_; // in base length distribution, p of the end of a phrase
- const bool use_poisson_;
-};
-
-struct Reachability {
- boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring?
- boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
-
- Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
- edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
- max_src_delta(boost::extents[srclen][trglen]) {
- ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
- }
-
- private:
- struct SState {
- SState() : prev_src_covered(), prev_trg_covered() {}
- SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
- int prev_src_covered;
- int prev_trg_covered;
- };
-
- struct NState {
- NState() : next_src_covered(), next_trg_covered() {}
- NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {}
- int next_src_covered;
- int next_trg_covered;
- };
-
- void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
- typedef boost::multi_array<vector<SState>, 2> array_type;
- array_type a(boost::extents[srclen + 1][trglen + 1]);
- a[0][0].push_back(SState());
- for (int i = 0; i < srclen; ++i) {
- for (int j = 0; j < trglen; ++j) {
- if (a[i][j].size() == 0) continue;
- const SState prev(i,j);
- for (int k = 1; k <= src_max_phrase_len; ++k) {
- if ((i + k) > srclen) continue;
- for (int l = 1; l <= trg_max_phrase_len; ++l) {
- if ((j + l) > trglen) continue;
- a[i + k][j + l].push_back(prev);
- }
- }
- }
- }
- a[0][0].clear();
- cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
- assert(a[srclen][trglen].size() > 0);
-
- typedef boost::multi_array<bool, 2> rarray_type;
- rarray_type r(boost::extents[srclen + 1][trglen + 1]);
-// typedef boost::multi_array<vector<NState>, 2> narray_type;
-// narray_type b(boost::extents[srclen + 1][trglen + 1]);
- r[srclen][trglen] = true;
- for (int i = srclen; i >= 0; --i) {
- for (int j = trglen; j >= 0; --j) {
- vector<SState>& prevs = a[i][j];
- if (!r[i][j]) { prevs.clear(); }
-// const NState nstate(i,j);
- for (int k = 0; k < prevs.size(); ++k) {
- r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
- int src_delta = i - prevs[k].prev_src_covered;
- edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
- short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
- if (src_delta > msd) msd = src_delta;
-// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate);
- }
- }
- }
- assert(!edges[0][0][1][0]);
- assert(!edges[0][0][0][1]);
- assert(!edges[0][0][0][0]);
- cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl;
- assert(max_src_delta[0][0] > 0);
- //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
- //for (int i = 0; i < b[0][0].size(); ++i) {
- // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
- //}
- }
-};
-
-ostream& operator<<(ostream& os, const FSTState& q);
-struct FSTState {
- explicit FSTState(int src_size) :
- trg_covered_(),
- src_covered_(),
- src_coverage_(src_size) {}
-
- FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) :
- trg_covered_(trg_covered),
- src_covered_(src_covered),
- src_coverage_(src_coverage),
- src_prefix_(src_prefix) {
- if (src_coverage_.size() == src_covered) {
- assert(src_prefix.size() == 0);
- }
- }
-
- // if we extend by the word at src_position, what are
- // the next states that are reachable and lie on a valid
- // path to the final state?
- vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const {
- assert(src_position < src_coverage_.size());
- if (src_coverage_[src_position]) {
- cerr << "Trying to extend " << *this << " with position " << src_position << endl;
- abort();
- }
- vector<bool> ncvg = src_coverage_;
- ncvg[src_position] = true;
-
- vector<FSTState> res;
- const int trg_remaining = trg_len - trg_covered_;
- if (trg_remaining <= 0) {
- cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl;
- abort();
- }
- const int src_remaining = src_len - src_covered_;
- if (src_remaining <= 0) {
- cerr << "Source appears to have been covered: " << *this << endl;
- abort();
- }
-
- for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) {
- if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) {
- int nc = src_prefix_.size() + 1 + src_covered_;
- res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>()));
- }
- }
-
- if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) {
- vector<short> nsp = src_prefix_;
- nsp.push_back(src_position);
- res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp));
- }
-
- if (res.size() == 0) {
- cerr << *this << " can't be extended!\n";
- abort();
- }
- return res;
- }
-
- short trg_covered_, src_covered_;
- vector<bool> src_coverage_;
- vector<short> src_prefix_;
-};
-bool operator<(const FSTState& q, const FSTState& r) {
- if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_;
- if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_;
- if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_;
- return q.src_prefix_ < r.src_prefix_;
-}
-
-ostream& operator<<(ostream& os, const FSTState& q) {
- os << "[" << q.trg_covered_ << " : ";
- for (int i = 0; i < q.src_coverage_.size(); ++i)
- os << q.src_coverage_[i];
- os << " : <";
- for (int i = 0; i < q.src_prefix_.size(); ++i) {
- if (i != 0) os << ' ';
- os << q.src_prefix_[i];
- }
- return os << ">]";
-}
-
-struct MyModel {
- MyModel(ConditionalBase& rcp0) : rp0(rcp0) {}
- typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap;
-
- void DecrementRule(const TRule& rule) {
- SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
- assert(it != rules.end());
- it->second.decrement(rule);
- if (it->second.num_customers() == 0) rules.erase(it);
- }
-
- void IncrementRule(const TRule& rule) {
- SrcToRuleCRPMap::iterator it = rules.find(rule.f_);
- if (it == rules.end()) {
- CCRP_NoTable<TRule> crp(1,1);
- it = rules.insert(make_pair(rule.f_, crp)).first;
- }
- it->second.increment(rule);
- }
-
- // conditioned on rule.f_
- prob_t RuleConditionalProbability(const TRule& rule) const {
- const prob_t base = rp0(rule);
- SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_);
- if (it == rules.end()) {
- return base;
- } else {
- const double lp = it->second.logprob(rule, log(base));
- prob_t q; q.logeq(lp);
- return q;
- }
- }
-
- const ConditionalBase& rp0;
- SrcToRuleCRPMap rules;
-};
-
-struct MyFST : public WFST {
- MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) :
- src(ssrc), trg(strg),
- r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE),
- model(m) {
- FSTState in(src.size());
- cerr << " INIT: " << in << endl;
- init = GetNode(in);
- for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true;
- in.src_covered_ = src.size();
- in.trg_covered_ = trg.size();
- cerr << "FINAL: " << in << endl;
- final = GetNode(in);
- }
- virtual const WFSTNode* Final() const;
- virtual const WFSTNode* Initial() const;
-
- const WFSTNode* GetNode(const FSTState& q);
- map<FSTState, boost::shared_ptr<WFSTNode> > m;
- const vector<WordID>& src;
- const vector<WordID>& trg;
- Reachability r;
- const WFSTNode* init;
- const WFSTNode* final;
- MyModel* model;
-};
-
-struct MyNode : public WFSTNode {
- MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {}
- virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const;
- const FSTState state;
- mutable MyFST* container;
-};
-
-vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const {
- cerr << "EXTEND " << state << " with " << srcindex << endl;
- vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r);
- vector<pair<const WFSTNode*,TRulePtr> > res(ext.size());
- for (unsigned i = 0; i < ext.size(); ++i) {
- res[i].first = container->GetNode(ext[i]);
- if (ext[i].src_prefix_.size() == 0) {
- const unsigned trg_from = state.trg_covered_;
- const unsigned trg_to = ext[i].trg_covered_;
- const unsigned prev_prfx_size = state.src_prefix_.size();
- res[i].second.reset(new TRule);
- res[i].second->lhs_ = -TD::Convert("X");
- vector<WordID>& src = res[i].second->f_;
- vector<WordID>& trg = res[i].second->e_;
- src.resize(prev_prfx_size + 1);
- for (unsigned j = 0; j < prev_prfx_size; ++j)
- src[j] = container->src[state.src_prefix_[j]];
- src[prev_prfx_size] = container->src[srcindex];
- for (unsigned j = trg_from; j < trg_to; ++j)
- trg.push_back(container->trg[j]);
- res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second)));
- }
- }
- return res;
-}
-
-const WFSTNode* MyFST::GetNode(const FSTState& q) {
- boost::shared_ptr<WFSTNode>& res = m[q];
- if (!res) {
- res.reset(new MyNode(q, this));
- }
- return &*res;
-}
-
-const WFSTNode* MyFST::Final() const {
- return final;
-}
-
-const WFSTNode* MyFST::Initial() const {
- return init;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
-
- if (!conf.count("model1")) {
-    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- boost::shared_ptr<MT19937> prng;
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- vector<vector<int> > corpuse, corpusf;
- set<int> vocabe, vocabf;
- ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
- cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
-  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
-  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
- assert(corpusf.size() == corpuse.size());
-
- ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
- vocabe.size(),
- conf["model1"].as<string>());
- MyModel m(lp0);
-
- TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
- m.IncrementRule(x);
- TRule y("[X] ||| nY dyN ||| gave ||| 0");
- m.IncrementRule(y);
-
-
- MyFST fst(corpusf[0], corpuse[0], &m);
- ifstream in("./kimura.g");
- assert(in);
- CFG_WFSTComposer comp(fst);
- Hypergraph hg;
- bool succeed = comp.Compose(&in, &hg);
- hg.PrintGraphviz();
- if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
-
-#if 0
- ifstream in2("./amnabooks.g");
- assert(in2);
- MyFST fst2(corpusf[1], corpuse[1], &m);
- CFG_WFSTComposer comp2(fst2);
- Hypergraph hg2;
- bool succeed2 = comp2.Compose(&in2, &hg2);
- if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
-#endif
-
- SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0);
- hg.Reweight(w);
- cerr << ViterbiFTree(hg) << endl;
- return 0;
-}
-
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
deleted file mode 100644
index a3e46064..00000000
--- a/gi/pf/pfdist.cc
+++ /dev/null
@@ -1,598 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "pf.h"
-#include "base_distributions.h"
-#include "reachability.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles")
- ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_f,
- set<WordID>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- istream* in;
- if (filename == "-")
- in = &cin;
- else
- in = new ifstream(filename.c_str());
- assert(*in);
- string line;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) { isf = false; } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- assert(cur != kDIV);
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
- if (in != &cin) delete in;
-}
-
-#if 0
-struct MyConditionalModel {
- MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
-
- prob_t srcp0(const vector<WordID>& src) const {
- prob_t p(1.0 / 3000.0);
- p.poweq(src.size());
- prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0));
- p *= lenp;
- return p;
- }
-
- void DecrementRule(const TRule& rule) {
- const RuleCRPMap::iterator it = rules.find(rule.f_);
- assert(it != rules.end());
- if (it->second.decrement(rule)) {
- base /= (*rp0)(rule);
- if (it->second.num_customers() == 0)
- rules.erase(it);
- }
- if (src_phrases.decrement(rule.f_))
- base /= srcp0(rule.f_);
- }
-
- void IncrementRule(const TRule& rule) {
- RuleCRPMap::iterator it = rules.find(rule.f_);
- if (it == rules.end())
- it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first;
- if (it->second.increment(rule)) {
- base *= (*rp0)(rule);
- }
- if (src_phrases.increment(rule.f_))
- base *= srcp0(rule.f_);
- }
-
- void IncrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- void IncrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].increment(dist))
- base *= jp0(dist, src_len);
- }
-
- void DecrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].decrement(dist))
- base /= jp0(dist, src_len);
- }
-
- void IncrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- IncrementJump(js[i], src_len);
- }
-
- void DecrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- DecrementJump(js[i], src_len);
- }
-
- // p(jump = dist | src_len , z)
- prob_t JumpProbability(int dist, unsigned src_len) {
- const prob_t p0 = jp0(dist, src_len);
- const double lp = src_jumps[src_len].logprob(dist, log(p0));
- prob_t q; q.logeq(lp);
- return q;
- }
-
- // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
- prob_t RuleProbability(const TRule& rule) const {
- const prob_t p0 = (*rp0)(rule);
- prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_))));
- const RuleCRPMap::const_iterator it = rules.find(rule.f_);
- if (it == rules.end()) return srcp * p0;
- const double lp = it->second.logprob(rule, log(p0));
- prob_t q; q.logeq(lp);
- return q * srcp;
- }
-
- prob_t Likelihood() const {
- prob_t p = base;
- for (RuleCRPMap::const_iterator it = rules.begin();
- it != rules.end(); ++it) {
- prob_t cl; cl.logeq(it->second.log_crp_prob());
- p *= cl;
- }
- for (unsigned l = 1; l < src_jumps.size(); ++l) {
- if (src_jumps[l].num_customers() > 0) {
- prob_t q;
- q.logeq(src_jumps[l].log_crp_prob());
- p *= q;
- }
- }
- return p;
- }
-
- JumpBase jp0;
- const PhraseConditionalBase* rp0;
- prob_t base;
- typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap;
- RuleCRPMap rules;
- CCRP_NoTable<vector<WordID> > src_phrases;
- vector<CCRP_NoTable<int> > src_jumps;
-};
-
-#endif
-
-struct MyJointModel {
- MyJointModel(PhraseJointBase& rcp0) :
- rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
-
- void DecrementRule(const TRule& rule) {
- if (rules.decrement(rule))
- base /= rp0(rule);
- }
-
- void IncrementRule(const TRule& rule) {
- if (rules.increment(rule))
- base *= rp0(rule);
- }
-
- void IncrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- void IncrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].increment(dist))
- base *= jp0(dist, src_len);
- }
-
- void DecrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].decrement(dist))
- base /= jp0(dist, src_len);
- }
-
- void IncrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- IncrementJump(js[i], src_len);
- }
-
- void DecrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- DecrementJump(js[i], src_len);
- }
-
- // p(jump = dist | src_len , z)
- prob_t JumpProbability(int dist, unsigned src_len) {
- const prob_t p0 = jp0(dist, src_len);
- const double lp = src_jumps[src_len].logprob(dist, log(p0));
- prob_t q; q.logeq(lp);
- return q;
- }
-
- // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
- return p;
- }
-
- prob_t Likelihood() const {
- prob_t p = base;
- prob_t q; q.logeq(rules.log_crp_prob());
- p *= q;
- for (unsigned l = 1; l < src_jumps.size(); ++l) {
- if (src_jumps[l].num_customers() > 0) {
- prob_t q;
- q.logeq(src_jumps[l].log_crp_prob());
- p *= q;
- }
- }
- return p;
- }
-
- JumpBase jp0;
- const PhraseJointBase& rp0;
- prob_t base;
- CCRP_NoTable<TRule> rules;
- vector<CCRP_NoTable<int> > src_jumps;
-};
-
-struct BackwardEstimate {
- BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) :
- model1_(m1), src_(src), trg_(trg) {
- }
- const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
- assert(src_.size() == src_cov.size());
- assert(trg_cov <= trg_.size());
- prob_t& e = cache_[src_cov][trg_cov];
- if (e.is_0()) {
- if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
- vector<WordID> r(src_.size() + 1); r.clear();
- r.push_back(0); // NULL word
- for (int i = 0; i < src_cov.size(); ++i)
- if (!src_cov[i]) r.push_back(src_[i]);
- const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
- for (unsigned j = trg_cov; j < trg_.size(); ++j) {
- prob_t p;
- for (unsigned i = 0; i < r.size(); ++i)
- p += model1_(r[i], trg_[j]);
- if (p.is_0()) {
- cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
- abort();
- }
- p *= uniform_alignment;
- e *= p;
- }
- }
- return e;
- }
- const Model1& model1_;
- const vector<WordID>& src_;
- const vector<WordID>& trg_;
- mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
-};
-
-struct BackwardEstimateSym {
- BackwardEstimateSym(const Model1& m1,
- const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
- model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) {
- }
- const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
- assert(src_.size() == src_cov.size());
- assert(trg_cov <= trg_.size());
- prob_t& e = cache_[src_cov][trg_cov];
- if (e.is_0()) {
- if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
- vector<WordID> r(src_.size() + 1); r.clear();
- for (int i = 0; i < src_cov.size(); ++i)
- if (!src_cov[i]) r.push_back(src_[i]);
- r.push_back(0); // NULL word
- const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
- for (unsigned j = trg_cov; j < trg_.size(); ++j) {
- prob_t p;
- for (unsigned i = 0; i < r.size(); ++i)
- p += model1_(r[i], trg_[j]);
- if (p.is_0()) {
- cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
- abort();
- }
- p *= uniform_alignment;
- e *= p;
- }
- r.pop_back();
- const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
- prob_t inv;
- inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));
- for (unsigned i = 0; i < r.size(); ++i) {
- prob_t p;
- for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
- p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]);
- if (p.is_0()) {
- cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n";
- abort();
- }
- p *= inv_uniform;
- inv *= p;
- }
- prob_t x = pow(e * inv, 0.5);
- e = x;
- //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl;
- }
- return e;
- }
- const Model1& model1_;
- const Model1& invmodel1_;
- const vector<WordID>& src_;
- const vector<WordID>& trg_;
- mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
-};
-
-struct Particle {
- Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {}
- prob_t weight;
- prob_t gamma_last;
- vector<int> src_jumps;
- vector<TRulePtr> rules;
- vector<bool> src_cv;
- int src_cov;
- int trg_cov;
- int prev_pos;
-};
-
-ostream& operator<<(ostream& o, const vector<bool>& v) {
- for (int i = 0; i < v.size(); ++i)
- o << (v[i] ? '1' : '0');
- return o;
-}
-ostream& operator<<(ostream& o, const Particle& p) {
- o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']';
- return o;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
- const unsigned particles = conf["particles"].as<unsigned>();
- const unsigned samples = conf["samples"].as<unsigned>();
- const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>();
-
- if (!conf.count("model1")) {
-    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- vector<vector<WordID> > corpuse, corpusf;
- set<WordID> vocabe, vocabf;
- cerr << "Reading corpus...\n";
- ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- assert(corpusf.size() == corpuse.size());
-
- const int kLHS = -TD::Convert("X");
- Model1 m1(conf["model1"].as<string>());
- Model1 invm1(conf["inverse_model1"].as<string>());
-
-#if 0
- PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
- MyConditionalModel m(lp0);
-#else
- PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- MyJointModel m(lp0);
-#endif
-
- MultinomialResampleFilter<Particle> filter(&rng);
- cerr << "Initializing reachability limits...\n";
- vector<Particle> ps(corpusf.size());
- vector<Reachability> reaches; reaches.reserve(corpusf.size());
- for (int ci = 0; ci < corpusf.size(); ++ci)
- reaches.push_back(Reachability(corpusf[ci].size(),
- corpuse[ci].size(),
- kMAX_SRC_PHRASE,
- kMAX_TRG_PHRASE));
- cerr << "Sampling...\n";
- vector<Particle> tmp_p(10000); // work space
- SampleSet<prob_t> pfss;
- for (int SS=0; SS < samples; ++SS) {
- for (int ci = 0; ci < corpusf.size(); ++ci) {
- vector<int>& src = corpusf[ci];
- vector<int>& trg = corpuse[ci];
- m.DecrementRules(ps[ci].rules);
- m.DecrementJumps(ps[ci].src_jumps, src.size());
-
- //BackwardEstimate be(m1, src, trg);
- BackwardEstimateSym be(m1, invm1, src, trg);
- const Reachability& r = reaches[ci];
- vector<Particle> lps(particles);
-
- for (int pi = 0; pi < particles; ++pi) {
- Particle& p = lps[pi];
- p.src_cv.resize(src.size(), false);
- }
-
- bool all_complete = false;
- while(!all_complete) {
- SampleSet<prob_t> ss;
-
- // all particles have now been extended a bit, we will reweight them now
- if (lps[0].trg_cov > 0)
- filter(&lps);
-
- // loop over all particles and extend them
- bool done_nothing = true;
- for (int pi = 0; pi < particles; ++pi) {
- Particle& p = lps[pi];
- int tic = 0;
- while(p.trg_cov < trg.size() && tic < rejuv_freq) {
- ++tic;
- done_nothing = false;
- ss.clear();
- TRule x; x.lhs_ = kLHS;
- prob_t z;
- int first_uncovered = src.size();
- int last_uncovered = -1;
- for (int i = 0; i < src.size(); ++i) {
- const bool is_uncovered = !p.src_cv[i];
- if (i < first_uncovered && is_uncovered) first_uncovered = i;
- if (is_uncovered && i > last_uncovered) last_uncovered = i;
- }
- assert(last_uncovered > -1);
- assert(first_uncovered < src.size());
-
- for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) {
- x.e_.push_back(trg[trg_len - 1 + p.trg_cov]);
- for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) {
- if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue;
-
- const int last_possible_start = last_uncovered - src_len + 1;
- assert(last_possible_start >= 0);
- //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl;
- //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl;
- for (int i = first_uncovered; i <= last_possible_start; ++i) {
- if (p.src_cv[i]) continue;
- assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size
- Particle& np = tmp_p[ss.size()];
- np = p;
- x.f_.clear();
- int gap_add = 0;
- bool bad = false;
- prob_t jp = prob_t::One();
- int prev_pos = p.prev_pos;
- for (int j = 0; j < src_len; ++j) {
- if ((j + i + gap_add) == src.size()) { bad = true; break; }
- while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; }
- if ((j + i + gap_add) == src.size()) { bad = true; break; }
- np.src_cv[i + j + gap_add] = true;
- x.f_.push_back(src[i + j + gap_add]);
- jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size());
- int jump = i + j + gap_add - prev_pos;
- assert(jump != 0);
- np.src_jumps.push_back(jump);
- prev_pos = i + j + gap_add;
- }
- if (bad) continue;
- np.prev_pos = prev_pos;
- np.src_cov += x.f_.size();
- np.trg_cov += x.e_.size();
- if (x.f_.size() != src_len) continue;
- prob_t rp = m.RuleProbability(x);
- np.gamma_last = rp * jp;
- const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2);
- //cerr << "**rule=" << x << endl;
- //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl;
- ss.add(u);
- np.rules.push_back(TRulePtr(new TRule(x)));
- z += u;
-
- const bool completed = (p.trg_cov == trg.size());
- if (completed) {
- int last_jump = src.size() - p.prev_pos;
- assert(last_jump > 0);
- p.src_jumps.push_back(last_jump);
- p.weight *= m.JumpProbability(last_jump, src.size());
- }
- }
- }
- }
- cerr << "number of edges to consider: " << ss.size() << endl;
- const int sampled = rng.SelectSample(ss);
- prob_t q_n = ss[sampled] / z;
- p = tmp_p[sampled];
- //m.IncrementRule(*p.rules.back());
- p.weight *= p.gamma_last / q_n;
- cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl;
- cerr << p << endl;
- }
- } // loop over particles (pi = 0 .. particles)
- if (done_nothing) all_complete = true;
- }
- pfss.clear();
- for (int i = 0; i < lps.size(); ++i)
- pfss.add(lps[i].weight);
- const int sampled = rng.SelectSample(pfss);
- ps[ci] = lps[sampled];
- m.IncrementRules(lps[sampled].rules);
- m.IncrementJumps(lps[sampled].src_jumps, src.size());
- for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
- cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
- }
- cerr << "LLH: " << log(m.Likelihood()) << endl;
- for (int sni = 0; sni < 5; ++sni) {
- for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; }
- }
- }
- return 0;
-}
-
diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc
deleted file mode 100644
index 3169eb75..00000000
--- a/gi/pf/pfdist.new.cc
+++ /dev/null
@@ -1,620 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "base_measures.h"
-#include "reachability.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-shared_ptr<MT19937> prng;
-
-size_t hash_value(const TRule& r) {
- size_t h = boost::hash_value(r.e_);
- boost::hash_combine(h, -r.lhs_);
- boost::hash_combine(h, boost::hash_value(r.f_));
- return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
- return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_f,
- set<WordID>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- istream* in;
- if (filename == "-")
- in = &cin;
- else
- in = new ifstream(filename.c_str());
- assert(*in);
- string line;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) { isf = false; } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- assert(cur != kDIV);
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
- if (in != &cin) delete in;
-}
-
-#if 0
-struct MyConditionalModel {
- MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
-
- prob_t srcp0(const vector<WordID>& src) const {
- prob_t p(1.0 / 3000.0);
- p.poweq(src.size());
- prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0));
- p *= lenp;
- return p;
- }
-
- void DecrementRule(const TRule& rule) {
- const RuleCRPMap::iterator it = rules.find(rule.f_);
- assert(it != rules.end());
- if (it->second.decrement(rule)) {
- base /= (*rp0)(rule);
- if (it->second.num_customers() == 0)
- rules.erase(it);
- }
- if (src_phrases.decrement(rule.f_))
- base /= srcp0(rule.f_);
- }
-
- void IncrementRule(const TRule& rule) {
- RuleCRPMap::iterator it = rules.find(rule.f_);
- if (it == rules.end())
- it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first;
- if (it->second.increment(rule)) {
- base *= (*rp0)(rule);
- }
- if (src_phrases.increment(rule.f_))
- base *= srcp0(rule.f_);
- }
-
- void IncrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- void IncrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].increment(dist))
- base *= jp0(dist, src_len);
- }
-
- void DecrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].decrement(dist))
- base /= jp0(dist, src_len);
- }
-
- void IncrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- IncrementJump(js[i], src_len);
- }
-
- void DecrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- DecrementJump(js[i], src_len);
- }
-
- // p(jump = dist | src_len , z)
- prob_t JumpProbability(int dist, unsigned src_len) {
- const prob_t p0 = jp0(dist, src_len);
- const double lp = src_jumps[src_len].logprob(dist, log(p0));
- prob_t q; q.logeq(lp);
- return q;
- }
-
- // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
- prob_t RuleProbability(const TRule& rule) const {
- const prob_t p0 = (*rp0)(rule);
- prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_))));
- const RuleCRPMap::const_iterator it = rules.find(rule.f_);
- if (it == rules.end()) return srcp * p0;
- const double lp = it->second.logprob(rule, log(p0));
- prob_t q; q.logeq(lp);
- return q * srcp;
- }
-
- prob_t Likelihood() const {
- prob_t p = base;
- for (RuleCRPMap::const_iterator it = rules.begin();
- it != rules.end(); ++it) {
- prob_t cl; cl.logeq(it->second.log_crp_prob());
- p *= cl;
- }
- for (unsigned l = 1; l < src_jumps.size(); ++l) {
- if (src_jumps[l].num_customers() > 0) {
- prob_t q;
- q.logeq(src_jumps[l].log_crp_prob());
- p *= q;
- }
- }
- return p;
- }
-
- JumpBase jp0;
- const PhraseConditionalBase* rp0;
- prob_t base;
- typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap;
- RuleCRPMap rules;
- CCRP_NoTable<vector<WordID> > src_phrases;
- vector<CCRP_NoTable<int> > src_jumps;
-};
-
-#endif
-
-struct MyJointModel {
- MyJointModel(PhraseJointBase& rcp0) :
- rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {}
-
- void DecrementRule(const TRule& rule) {
- if (rules.decrement(rule))
- base /= rp0(rule);
- }
-
- void IncrementRule(const TRule& rule) {
- if (rules.increment(rule))
- base *= rp0(rule);
- }
-
- void IncrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- void IncrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].increment(dist))
- base *= jp0(dist, src_len);
- }
-
- void DecrementJump(int dist, unsigned src_len) {
- assert(src_len > 0);
- if (src_jumps[src_len].decrement(dist))
- base /= jp0(dist, src_len);
- }
-
- void IncrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- IncrementJump(js[i], src_len);
- }
-
- void DecrementJumps(const vector<int>& js, unsigned src_len) {
- for (unsigned i = 0; i < js.size(); ++i)
- DecrementJump(js[i], src_len);
- }
-
- // p(jump = dist | src_len , z)
- prob_t JumpProbability(int dist, unsigned src_len) {
- const prob_t p0 = jp0(dist, src_len);
- const double lp = src_jumps[src_len].logprob(dist, log(p0));
- prob_t q; q.logeq(lp);
- return q;
- }
-
- // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z)
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
- return p;
- }
-
- prob_t Likelihood() const {
- prob_t p = base;
- prob_t q; q.logeq(rules.log_crp_prob());
- p *= q;
- for (unsigned l = 1; l < src_jumps.size(); ++l) {
- if (src_jumps[l].num_customers() > 0) {
- prob_t q;
- q.logeq(src_jumps[l].log_crp_prob());
- p *= q;
- }
- }
- return p;
- }
-
- JumpBase jp0;
- const PhraseJointBase& rp0;
- prob_t base;
- CCRP_NoTable<TRule> rules;
- vector<CCRP_NoTable<int> > src_jumps;
-};
-
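-// Look-ahead ("backward") estimate used to guide the particle filter: the probability of
-// generating the remaining target words from the still-uncovered source words (plus NULL)
-// under Model 1, with a Poisson model for the remaining target length. Results are cached
-// per (source coverage vector, target coverage) state.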
-struct BackwardEstimate {
- BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) :
- model1_(m1), src_(src), trg_(trg) {
- }
- const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
- assert(src_.size() == src_cov.size());
- assert(trg_cov <= trg_.size());
- prob_t& e = cache_[src_cov][trg_cov];
- if (e.is_0()) {
- if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
-      vector<WordID> r; r.reserve(src_.size() + 1);
- r.push_back(0); // NULL word
- for (int i = 0; i < src_cov.size(); ++i)
- if (!src_cov[i]) r.push_back(src_[i]);
- const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
- for (unsigned j = trg_cov; j < trg_.size(); ++j) {
- prob_t p;
- for (unsigned i = 0; i < r.size(); ++i)
- p += model1_(r[i], trg_[j]);
- if (p.is_0()) {
- cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
- abort();
- }
- p *= uniform_alignment;
- e *= p;
- }
- }
- return e;
- }
- const Model1& model1_;
- const vector<WordID>& src_;
- const vector<WordID>& trg_;
- mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
-};
-
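-// Symmetrized look-ahead estimate: combines the forward Model 1 estimate above with an
-// inverse-direction (target-to-source) Model 1 estimate, taking their geometric mean.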
-struct BackwardEstimateSym {
- BackwardEstimateSym(const Model1& m1,
- const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
- model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) {
- }
- const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const {
- assert(src_.size() == src_cov.size());
- assert(trg_cov <= trg_.size());
- prob_t& e = cache_[src_cov][trg_cov];
- if (e.is_0()) {
- if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
-      vector<WordID> r; r.reserve(src_.size() + 1);
- for (int i = 0; i < src_cov.size(); ++i)
- if (!src_cov[i]) r.push_back(src_[i]);
- r.push_back(0); // NULL word
- const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
- for (unsigned j = trg_cov; j < trg_.size(); ++j) {
- prob_t p;
- for (unsigned i = 0; i < r.size(); ++i)
- p += model1_(r[i], trg_[j]);
- if (p.is_0()) {
- cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
- abort();
- }
- p *= uniform_alignment;
- e *= p;
- }
- r.pop_back();
- const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
- prob_t inv;
- inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
- for (unsigned i = 0; i < r.size(); ++i) {
- prob_t p;
- for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
- p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]);
- if (p.is_0()) {
- cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n";
- abort();
- }
- p *= inv_uniform;
- inv *= p;
- }
- prob_t x = pow(e * inv, 0.5);
- e = x;
- //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl;
- }
- return e;
- }
- const Model1& model1_;
- const Model1& invmodel1_;
- const vector<WordID>& src_;
- const vector<WordID>& trg_;
- mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_;
-};
-
-struct Particle {
- Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {}
- prob_t weight;
- prob_t gamma_last;
- vector<int> src_jumps;
- vector<TRulePtr> rules;
- vector<bool> src_cv;
- int src_cov;
- int trg_cov;
- int prev_pos;
-};
-
-ostream& operator<<(ostream& o, const vector<bool>& v) {
- for (int i = 0; i < v.size(); ++i)
- o << (v[i] ? '1' : '0');
- return o;
-}
-ostream& operator<<(ostream& o, const Particle& p) {
- o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']';
- return o;
-}
-
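-// Multinomial resampling step: draw a new population of particles with probability
-// proportional to their current weights, then reset every weight to the uniform 1/N.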
-void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
- vector<Particle>& ps = *pps;
- SampleSet<prob_t> ss;
- for (int i = 0; i < ps.size(); ++i)
- ss.add(ps[i].weight);
- vector<Particle> nps; nps.reserve(ps.size());
- const prob_t uniform_weight(1.0 / ps.size());
- for (int i = 0; i < ps.size(); ++i) {
- nps.push_back(ps[prng->SelectSample(ss)]);
- nps[i].weight = uniform_weight;
- }
- nps.swap(ps);
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
- const unsigned particles = conf["particles"].as<unsigned>();
- const unsigned samples = conf["samples"].as<unsigned>();
-
- if (!conf.count("model1")) {
-    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- vector<vector<WordID> > corpuse, corpusf;
- set<WordID> vocabe, vocabf;
- cerr << "Reading corpus...\n";
- ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- assert(corpusf.size() == corpuse.size());
-
- const int kLHS = -TD::Convert("X");
- Model1 m1(conf["model1"].as<string>());
- Model1 invm1(conf["inverse_model1"].as<string>());
-
-#if 0
- PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
- MyConditionalModel m(lp0);
-#else
- PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- MyJointModel m(lp0);
-#endif
-
- cerr << "Initializing reachability limits...\n";
- vector<Particle> ps(corpusf.size());
- vector<Reachability> reaches; reaches.reserve(corpusf.size());
- for (int ci = 0; ci < corpusf.size(); ++ci)
- reaches.push_back(Reachability(corpusf[ci].size(),
- corpuse[ci].size(),
- kMAX_SRC_PHRASE,
- kMAX_TRG_PHRASE));
- cerr << "Sampling...\n";
- vector<Particle> tmp_p(10000); // work space
- SampleSet<prob_t> pfss;
- for (int SS=0; SS < samples; ++SS) {
- for (int ci = 0; ci < corpusf.size(); ++ci) {
- vector<int>& src = corpusf[ci];
- vector<int>& trg = corpuse[ci];
- m.DecrementRules(ps[ci].rules);
- m.DecrementJumps(ps[ci].src_jumps, src.size());
-
- //BackwardEstimate be(m1, src, trg);
- BackwardEstimateSym be(m1, invm1, src, trg);
- const Reachability& r = reaches[ci];
- vector<Particle> lps(particles);
-
- for (int pi = 0; pi < particles; ++pi) {
- Particle& p = lps[pi];
- p.src_cv.resize(src.size(), false);
- }
-
- bool all_complete = false;
- while(!all_complete) {
- SampleSet<prob_t> ss;
-
- // all particles have now been extended a bit, we will reweight them now
- if (lps[0].trg_cov > 0)
- FilterCrapParticlesAndReweight(&lps);
-
- // loop over all particles and extend them
- bool done_nothing = true;
- for (int pi = 0; pi < particles; ++pi) {
- Particle& p = lps[pi];
- int tic = 0;
- const int rejuv_freq = 1;
- while(p.trg_cov < trg.size() && tic < rejuv_freq) {
- ++tic;
- done_nothing = false;
- ss.clear();
- TRule x; x.lhs_ = kLHS;
- prob_t z;
- int first_uncovered = src.size();
- int last_uncovered = -1;
- for (int i = 0; i < src.size(); ++i) {
- const bool is_uncovered = !p.src_cv[i];
- if (i < first_uncovered && is_uncovered) first_uncovered = i;
- if (is_uncovered && i > last_uncovered) last_uncovered = i;
- }
- assert(last_uncovered > -1);
- assert(first_uncovered < src.size());
-
- for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) {
- x.e_.push_back(trg[trg_len - 1 + p.trg_cov]);
- for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) {
- if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue;
-
- const int last_possible_start = last_uncovered - src_len + 1;
- assert(last_possible_start >= 0);
- //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl;
- //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl;
- for (int i = first_uncovered; i <= last_possible_start; ++i) {
- if (p.src_cv[i]) continue;
- assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size
- Particle& np = tmp_p[ss.size()];
- np = p;
- x.f_.clear();
- int gap_add = 0;
- bool bad = false;
- prob_t jp = prob_t::One();
- int prev_pos = p.prev_pos;
- for (int j = 0; j < src_len; ++j) {
- if ((j + i + gap_add) == src.size()) { bad = true; break; }
- while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; }
- if ((j + i + gap_add) == src.size()) { bad = true; break; }
- np.src_cv[i + j + gap_add] = true;
- x.f_.push_back(src[i + j + gap_add]);
- jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size());
- int jump = i + j + gap_add - prev_pos;
- assert(jump != 0);
- np.src_jumps.push_back(jump);
- prev_pos = i + j + gap_add;
- }
- if (bad) continue;
- np.prev_pos = prev_pos;
- np.src_cov += x.f_.size();
- np.trg_cov += x.e_.size();
- if (x.f_.size() != src_len) continue;
- prob_t rp = m.RuleProbability(x);
- np.gamma_last = rp * jp;
- const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2);
- //cerr << "**rule=" << x << endl;
- //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl;
- ss.add(u);
- np.rules.push_back(TRulePtr(new TRule(x)));
- z += u;
-
-            const bool completed = (np.trg_cov == trg.size());
-            if (completed) {
-              int last_jump = src.size() - np.prev_pos;
-              assert(last_jump > 0);
-              np.src_jumps.push_back(last_jump);
-              np.weight *= m.JumpProbability(last_jump, src.size());
-            }
- }
- }
- }
- cerr << "number of edges to consider: " << ss.size() << endl;
- const int sampled = rng.SelectSample(ss);
- prob_t q_n = ss[sampled] / z;
- p = tmp_p[sampled];
- //m.IncrementRule(*p.rules.back());
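-        // Sequential importance sampling weight update: multiply by the unnormalized
-        // target increment (gamma) over the proposal probability of the chosen extension.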
- p.weight *= p.gamma_last / q_n;
- cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl;
- cerr << p << endl;
- }
- } // loop over particles (pi = 0 .. particles)
- if (done_nothing) all_complete = true;
- }
- pfss.clear();
- for (int i = 0; i < lps.size(); ++i)
- pfss.add(lps[i].weight);
- const int sampled = rng.SelectSample(pfss);
- ps[ci] = lps[sampled];
- m.IncrementRules(lps[sampled].rules);
- m.IncrementJumps(lps[sampled].src_jumps, src.size());
- for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
- cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
- }
- cerr << "LLH: " << log(m.Likelihood()) << endl;
- for (int sni = 0; sni < 5; ++sni) {
- for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; }
- }
- }
- return 0;
-}
-
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
deleted file mode 100644
index 958ec4e2..00000000
--- a/gi/pf/pfnaive.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "pf.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "reachability.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "ccrp_onetable.h"
-#include "corpus.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles")
- ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings")
- ("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases")
- ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
- ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
- ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct BackwardEstimateSym {
- BackwardEstimateSym(const Model1& m1,
- const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
- model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) {
- }
- const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const {
- assert(src_cov <= src_.size());
- assert(trg_cov <= trg_.size());
- prob_t& e = cache_[src_cov][trg_cov];
- if (e.is_0()) {
- if (trg_cov == trg_.size()) { e = prob_t::One(); return e; }
-      vector<WordID> r; r.reserve(src_.size() + 1);
- for (int i = src_cov; i < src_.size(); ++i)
- r.push_back(src_[i]);
- r.push_back(0); // NULL word
- const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
- for (unsigned j = trg_cov; j < trg_.size(); ++j) {
- prob_t p;
- for (unsigned i = 0; i < r.size(); ++i)
- p += model1_(r[i], trg_[j]);
- if (p.is_0()) {
- cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n";
- abort();
- }
- p *= uniform_alignment;
- e *= p;
- }
- r.pop_back();
- const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
- prob_t inv;
- inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));
- for (unsigned i = 0; i < r.size(); ++i) {
- prob_t p;
- for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
- p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]);
- if (p.is_0()) {
- cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n";
- abort();
- }
- p *= inv_uniform;
- inv *= p;
- }
- prob_t x = pow(e * inv, 0.5);
- e = x;
- //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl;
- }
- return e;
- }
- const Model1& model1_;
- const Model1& invmodel1_;
- const vector<WordID>& src_;
- const vector<WordID>& trg_;
- mutable unordered_map<unsigned, map<unsigned, prob_t> > cache_;
-};
-
-struct Particle {
- Particle() : weight(prob_t::One()), src_cov(), trg_cov() {}
- prob_t weight;
- prob_t gamma_last;
- vector<TRulePtr> rules;
- int src_cov;
- int trg_cov;
-};
-
-ostream& operator<<(ostream& o, const vector<bool>& v) {
- for (int i = 0; i < v.size(); ++i)
- o << (v[i] ? '1' : '0');
- return o;
-}
-ostream& operator<<(ostream& o, const Particle& p) {
- o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']';
- return o;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
- const unsigned particles = conf["particles"].as<unsigned>();
- const unsigned samples = conf["samples"].as<unsigned>();
- const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>();
-
- if (!conf.count("model1")) {
-    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
- return 1;
- }
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- vector<vector<WordID> > corpuse, corpusf;
- set<WordID> vocabe, vocabf;
- cerr << "Reading corpus...\n";
- corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
- cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- assert(corpusf.size() == corpuse.size());
-
- const int kLHS = -TD::Convert("X");
- Model1 m1(conf["model1"].as<string>());
- Model1 invm1(conf["inverse_model1"].as<string>());
-
- PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
- TRule xx("[X] ||| ms. kimura ||| MS. KIMURA ||| X=0");
- cerr << xx << endl << lp0(xx) << " " << alp0(xx) << endl;
- TRule xx12("[X] ||| . ||| PHARMACY . ||| X=0");
- TRule xx21("[X] ||| pharmacy . ||| . ||| X=0");
-// TRule xx22("[X] ||| . ||| . ||| X=0");
- TRule xx22("[X] ||| . ||| THE . ||| X=0");
- cerr << xx12 << "\t" << lp0(xx12) << " " << alp0(xx12) << endl;
- cerr << xx21 << "\t" << lp0(xx21) << " " << alp0(xx21) << endl;
- cerr << xx22 << "\t" << lp0(xx22) << " " << alp0(xx22) << endl;
-
- cerr << "Initializing reachability limits...\n";
- vector<Particle> ps(corpusf.size());
- vector<Reachability> reaches; reaches.reserve(corpusf.size());
- for (int ci = 0; ci < corpusf.size(); ++ci)
- reaches.push_back(Reachability(corpusf[ci].size(),
- corpuse[ci].size(),
- kMAX_SRC_PHRASE,
- kMAX_TRG_PHRASE));
- cerr << "Sampling...\n";
- vector<Particle> tmp_p(10000); // work space
- SampleSet<prob_t> pfss;
- SystematicResampleFilter<Particle> filter(&rng);
- // MultinomialResampleFilter<Particle> filter(&rng);
- for (int SS=0; SS < samples; ++SS) {
- for (int ci = 0; ci < corpusf.size(); ++ci) {
- vector<int>& src = corpusf[ci];
- vector<int>& trg = corpuse[ci];
- m.DecrementRulesAndStops(ps[ci].rules);
- const prob_t q_stop = m.StopProbability();
- const prob_t q_cont = m.ContinueProbability();
- cerr << "P(stop)=" << q_stop << "\tP(continue)=" <<q_cont << endl;
-
- BackwardEstimateSym be(m1, invm1, src, trg);
- const Reachability& r = reaches[ci];
- vector<Particle> lps(particles);
-
- bool all_complete = false;
- while(!all_complete) {
- SampleSet<prob_t> ss;
-
- // all particles have now been extended a bit, we will reweight them now
- if (lps[0].trg_cov > 0)
- filter(&lps);
-
- // loop over all particles and extend them
- bool done_nothing = true;
- for (int pi = 0; pi < particles; ++pi) {
- Particle& p = lps[pi];
- int tic = 0;
- while(p.trg_cov < trg.size() && tic < rejuv_freq) {
- ++tic;
- done_nothing = false;
- ss.clear();
- TRule x; x.lhs_ = kLHS;
- prob_t z;
-
- for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) {
- x.e_.push_back(trg[trg_len - 1 + p.trg_cov]);
- for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) {
- if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue;
-
- int i = p.src_cov;
- assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size
- Particle& np = tmp_p[ss.size()];
- np = p;
- x.f_.clear();
- for (int j = 0; j < src_len; ++j)
- x.f_.push_back(src[i + j]);
- np.src_cov += x.f_.size();
- np.trg_cov += x.e_.size();
-                const bool stop_now = (np.src_cov == src.size() && np.trg_cov == trg.size());
- prob_t rp = m.RuleProbability(x) * (stop_now ? q_stop : q_cont);
- np.gamma_last = rp;
- const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1);
- //cerr << "**rule=" << x << endl;
- //cerr << " u=" << log(u) << " rule=" << rp << endl;
- ss.add(u);
- np.rules.push_back(TRulePtr(new TRule(x)));
- z += u;
- }
- }
- //cerr << "number of edges to consider: " << ss.size() << endl;
- const int sampled = rng.SelectSample(ss);
- prob_t q_n = ss[sampled] / z;
- p = tmp_p[sampled];
- //m.IncrementRule(*p.rules.back());
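-        // Sequential importance sampling weight update: gamma of the sampled extension
-        // divided by its normalized proposal probability.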
- p.weight *= p.gamma_last / q_n;
- //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl;
- //cerr << p << endl;
- }
- } // loop over particles (pi = 0 .. particles)
- if (done_nothing) all_complete = true;
- prob_t wv = prob_t::Zero();
- for (int pp = 0; pp < lps.size(); ++pp)
- wv += lps[pp].weight;
- for (int pp = 0; pp < lps.size(); ++pp)
- lps[pp].weight /= wv;
- }
- pfss.clear();
- for (int i = 0; i < lps.size(); ++i)
- pfss.add(lps[i].weight);
- const int sampled = rng.SelectSample(pfss);
- ps[ci] = lps[sampled];
- m.IncrementRulesAndStops(lps[sampled].rules);
- for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
- cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
- }
- cerr << "LLH: " << log(m.Likelihood()) << endl;
- }
- return 0;
-}
-
diff --git a/gi/pf/poisson_uniform_word_model.h b/gi/pf/poisson_uniform_word_model.h
deleted file mode 100644
index 76204a0e..00000000
--- a/gi/pf/poisson_uniform_word_model.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef _POISSON_UNIFORM_WORD_MODEL_H_
-#define _POISSON_UNIFORM_WORD_MODEL_H_
-
-#include <cmath>
-#include <vector>
-#include "prob.h"
-#include "m.h"
-
-// len ~ Poisson(lambda)
-// for (1..len)
-// e_i ~ Uniform({Vocabulary})
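-// As implemented, p(e) = Poisson(|e|; mean_len) * (1/alphabet_size)^|e|, i.e. a
-// character-level uniform spelling model; the vocabulary-uniform variant (v0) is
-// left commented out in operator() below.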
-struct PoissonUniformWordModel {
- explicit PoissonUniformWordModel(const unsigned vocab_size,
- const unsigned alphabet_size,
- const double mean_len = 5) :
- lh(prob_t::One()),
- v0(-std::log(vocab_size)),
- u0(-std::log(alphabet_size)),
- mean_length(mean_len) {}
-
- void ResampleHyperparameters(MT19937*) {}
-
- inline prob_t operator()(const std::vector<WordID>& s) const {
- prob_t p;
- p.logeq(Md::log_poisson(s.size(), mean_length) + s.size() * u0);
- //p.logeq(v0);
- return p;
- }
-
- inline void Increment(const std::vector<WordID>& w, MT19937*) {
- lh *= (*this)(w);
- }
-
- inline void Decrement(const std::vector<WordID>& w, MT19937 *) {
- lh /= (*this)(w);
- }
-
- inline prob_t Likelihood() const { return lh; }
-
- void Summary() const {}
-
- private:
-
- prob_t lh; // keeps track of the draws from the base distribution
- const double v0; // uniform log prob of generating a word
- const double u0; // uniform log prob of generating a letter
- const double mean_length; // mean length of a word in the base distribution
-};
-
-#endif
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
deleted file mode 100644
index 605d8206..00000000
--- a/gi/pf/pyp_lm.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "gamma_poisson.h"
-#include "corpus_tools.h"
-#include "m.h"
-#include "tdict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "tied_resampler.h"
-
-// A not very memory-efficient implementation of an N-gram LM based on PYPs
-// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
-// based on Pitman-Yor Processes. In Proc. ACL.
-
-// I use templates to handle the recursive formulation of the prior, so
-// the order of the model has to be specified here, at compile time:
-#define kORDER 3
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples")
- ("train,i",po::value<string>(),"Training data file")
- ("test,T",po::value<string>(),"Test data file")
- ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
- ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this")
- ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this")
- ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("train") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-// uniform distribution over a fixed vocabulary
-struct UniformVocabulary {
- UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
- void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
- void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
- double prob(WordID, const vector<WordID>&) const { return p0; }
- void resample_hyperparameters(MT19937*) {}
- double log_likelihood() const { return draws * log(p0); }
- const double p0;
- int draws;
-};
-
-// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS.
-// Journal of Statistical Planning and Inference 14 (1986) 311-322
-struct PoissonLengthUniformCharWordModel {
- explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {}
- void increment(WordID w, const vector<WordID>& v, MT19937*) {
- llh += log(prob(w, v)); // this isn't quite right
- plen.increment(TD::Convert(w).size() - 1);
- }
- void decrement(WordID w, const vector<WordID>& v, MT19937*) {
- plen.decrement(TD::Convert(w).size() - 1);
- llh -= log(prob(w, v)); // this isn't quite right
- }
- double prob(WordID w, const vector<WordID>&) const {
- const unsigned len = TD::Convert(w).size();
- return plen.prob(len - 1) * exp(uc * len);
- }
- double log_likelihood() const { return llh; }
- void resample_hyperparameters(MT19937*) {}
- GammaPoisson plen;
- const double uc;
- double llh;
-};
-
-struct PYPAdaptedPoissonLengthUniformCharWordModel {
- explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) :
- base(vocab_size,1,1,1,1),
- crp(1,1,1,1) {}
- void increment(WordID w, const vector<WordID>& v, MT19937* rng) {
- double p0 = base.prob(w, v);
- if (crp.increment(w, p0, rng))
- base.increment(w, v, rng);
- }
- void decrement(WordID w, const vector<WordID>& v, MT19937* rng) {
- if (crp.decrement(w, rng))
- base.decrement(w, v, rng);
- }
- double prob(WordID w, const vector<WordID>& v) const {
- double p0 = base.prob(w, v);
- return crp.prob(w, p0);
- }
- double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); }
- void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); }
- PoissonLengthUniformCharWordModel base;
- CCRP<WordID> crp;
-};
-
-template <unsigned N> struct PYPLM;
-
-#if 1
-template<> struct PYPLM<0> : public UniformVocabulary {
- PYPLM(unsigned vs, double a, double b, double c, double d) :
- UniformVocabulary(vs, a, b, c, d) {}
-};
-#else
-#if 0
-template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel {
- PYPLM(unsigned vs, double a, double b, double c, double d) :
- PoissonLengthUniformCharWordModel(vs, a, b, c, d) {}
-};
-#else
-template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel {
- PYPLM(unsigned vs, double a, double b, double c, double d) :
- PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {}
-};
-#endif
-#endif
-
-// represents an N-gram LM
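-// Each distinct (N-1)-word context has its own CCRP whose base distribution is the
-// (N-1)-gram model's prediction for w, giving the recursive hierarchical PYP prior;
-// discount/strength are shared across contexts of this order via the TiedResampler.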
-template <unsigned N> struct PYPLM {
- PYPLM(unsigned vs, double da, double db, double ss, double sr) :
- backoff(vs, da, db, ss, sr),
- tr(da, db, ss, sr, 0.8, 1.0),
- lookup(N-1) {}
- void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
- const double bo = backoff.prob(w, context);
- for (unsigned i = 0; i < N-1; ++i)
- lookup[i] = context[context.size() - 1 - i];
- typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
- if (it == p.end()) {
- it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first;
- tr.Add(&it->second); // add to resampler
- }
- if (it->second.increment(w, bo, rng))
- backoff.increment(w, context, rng);
- }
- void decrement(WordID w, const vector<WordID>& context, MT19937* rng) {
- for (unsigned i = 0; i < N-1; ++i)
- lookup[i] = context[context.size() - 1 - i];
- typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
- assert(it != p.end());
- if (it->second.decrement(w, rng))
- backoff.decrement(w, context, rng);
- }
- double prob(WordID w, const vector<WordID>& context) const {
- const double bo = backoff.prob(w, context);
- for (unsigned i = 0; i < N-1; ++i)
- lookup[i] = context[context.size() - 1 - i];
- typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup);
- if (it == p.end()) return bo;
- return it->second.prob(w, bo);
- }
-
- double log_likelihood() const {
- double llh = backoff.log_likelihood();
- typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
- for (it = p.begin(); it != p.end(); ++it)
- llh += it->second.log_crp_prob();
- llh += tr.LogLikelihood();
- return llh;
- }
-
- void resample_hyperparameters(MT19937* rng) {
- tr.ResampleHyperparameters(rng);
- backoff.resample_hyperparameters(rng);
- }
-
- PYPLM<N-1> backoff;
- TiedResampler<CCRP<WordID> > tr;
- double discount_a, discount_b, strength_s, strength_r;
- double d, strength;
- mutable vector<WordID> lookup; // thread-local
- unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
-};
-
-int main(int argc, char** argv) {
- po::variables_map conf;
-
- InitCommandLine(argc, argv, &conf);
- const unsigned samples = conf["samples"].as<unsigned>();
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
- vector<vector<WordID> > corpuse;
- set<WordID> vocabe;
- const WordID kEOS = TD::Convert("</s>");
- cerr << "Reading corpus...\n";
- CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe);
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- vector<vector<WordID> > test;
- if (conf.count("test"))
- CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
- else
- test = corpuse;
- PYPLM<kORDER> lm(vocabe.size(),
- conf["discount_prior_a"].as<double>(),
- conf["discount_prior_b"].as<double>(),
- conf["strength_prior_s"].as<double>(),
- conf["strength_prior_r"].as<double>());
- vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));
- for (int SS=0; SS < samples; ++SS) {
- for (int ci = 0; ci < corpuse.size(); ++ci) {
- ctx.resize(kORDER - 1);
- const vector<WordID>& s = corpuse[ci];
- for (int i = 0; i <= s.size(); ++i) {
- WordID w = (i < s.size() ? s[i] : kEOS);
- if (SS > 0) lm.decrement(w, ctx, &rng);
- lm.increment(w, ctx, &rng);
- ctx.push_back(w);
- }
- }
- if (SS % 10 == 9) {
- cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
- if (SS % 30 == 29) lm.resample_hyperparameters(&rng);
- } else { cerr << '.' << flush; }
- }
- double llh = 0;
- unsigned cnt = 0;
- unsigned oovs = 0;
- for (int ci = 0; ci < test.size(); ++ci) {
- ctx.resize(kORDER - 1);
- const vector<WordID>& s = test[ci];
- for (int i = 0; i <= s.size(); ++i) {
- WordID w = (i < s.size() ? s[i] : kEOS);
- double lp = log(lm.prob(w, ctx)) / log(2);
- if (i < s.size() && vocabe.count(w) == 0) {
- cerr << "**OOV ";
- ++oovs;
- lp = 0;
- }
- cerr << "p(" << TD::Convert(w) << " |";
- for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j)
- cerr << ' ' << TD::Convert(ctx[j]);
- cerr << ") = " << lp << endl;
- ctx.push_back(w);
- llh -= lp;
- cnt++;
- }
- }
- cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl;
- cerr << " Count: " << cnt << endl;
- cerr << " OOVs: " << oovs << endl;
- cerr << "Cross-entropy: " << (llh / cnt) << endl;
- cerr << " Perplexity: " << pow(2, llh / cnt) << endl;
- return 0;
-}
-
-
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
deleted file mode 100644
index 37b9a604..00000000
--- a/gi/pf/pyp_tm.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-#include "pyp_tm.h"
-
-#include <tr1/unordered_map>
-#include <iostream>
-#include <queue>
-
-#include "tdict.h"
-#include "ccrp.h"
-#include "pyp_word_model.h"
-#include "tied_resampler.h"
-
-using namespace std;
-using namespace std::tr1;
-
-struct FreqBinner {
- FreqBinner(const std::string& fname) { fd_.Load(fname); }
- unsigned NumberOfBins() const { return fd_.Max() + 1; }
- unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
- FreqDict<unsigned> fd_;
-};
-
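-// Per-source-word CCRP over target letter sequences, with a shared word model as the
-// base distribution. The CRPs are grouped by source-word frequency bin (NULL gets its
-// own bin) and each bin's hyperparameters are resampled jointly by a BinTiedResampler.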
-template <typename Base, class Binner = FreqBinner>
-struct ConditionalPYPWordModel {
- ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
- base(*b),
- binner(bnr),
- btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
-
- void Summary() const {
- cerr << "Number of conditioning contexts: " << r.size() << endl;
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
- for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- cerr << " " << i2->second << '\t' << TD::GetString(i2->first) << endl;
- }
- }
-
- void ResampleHyperparameters(MT19937* rng) {
- btr.ResampleHyperparameters(rng);
- }
-
- prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
- RuleModelHash::const_iterator it = r.find(src);
- if (it == r.end()) {
- return base(trglets);
- } else {
- return it->second.prob(trglets, base(trglets));
- }
- }
-
- void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
- RuleModelHash::iterator it = r.find(src);
- if (it == r.end()) {
- it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
- static const WordID kNULL = TD::Convert("NULL");
- unsigned bin = (src == kNULL ? 0 : 1);
- if (binner && bin) { bin = binner->Bin(src) + 1; }
- btr.Add(bin, &it->second);
- }
- if (it->second.increment(trglets, base(trglets), rng))
- base.Increment(trglets, rng);
- }
-
- void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
- RuleModelHash::iterator it = r.find(src);
- assert(it != r.end());
- if (it->second.decrement(trglets, rng)) {
- base.Decrement(trglets, rng);
- }
- }
-
- prob_t Likelihood() const {
- prob_t p = prob_t::One();
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- prob_t q; q.logeq(it->second.log_crp_prob());
- p *= q;
- }
- return p;
- }
-
- unsigned UniqueConditioningContexts() const {
- return r.size();
- }
-
- // TODO tie PYP hyperparameters based on source word frequency bins
- Base& base;
- const Binner* binner;
- BinTiedResampler<CCRP<vector<WordID> > > btr;
- typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
- RuleModelHash r;
-};
-
-PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
- const unsigned vocab_size,
- const unsigned num_letters) :
- letters(lets),
- base(vocab_size, num_letters, 5),
- tmodel(new ConditionalPYPWordModel<PoissonUniformWordModel>(&base, new FreqBinner("10k.freq"))),
- kX(-TD::Convert("X")) {}
-
-void PYPLexicalTranslation::Summary() const {
- tmodel->Summary();
-}
-
-prob_t PYPLexicalTranslation::Likelihood() const {
- return tmodel->Likelihood() * base.Likelihood();
-}
-
-void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
- tmodel->ResampleHyperparameters(rng);
-}
-
-unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
- return tmodel->UniqueConditioningContexts();
-}
-
-prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
- return tmodel->Prob(src, letters[trg]);
-}
-
-void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
- tmodel->Increment(src, letters[trg], rng);
-}
-
-void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
- tmodel->Decrement(src, letters[trg], rng);
-}
-
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
deleted file mode 100644
index 2b076a25..00000000
--- a/gi/pf/pyp_tm.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef PYP_LEX_TRANS
-#define PYP_LEX_TRANS
-
-#include <vector>
-#include "wordid.h"
-#include "prob.h"
-#include "sampler.h"
-#include "freqdict.h"
-#include "poisson_uniform_word_model.h"
-
-struct FreqBinner;
-template <typename T, class B> struct ConditionalPYPWordModel;
-
-struct PYPLexicalTranslation {
- explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
- const unsigned vocab_size,
- const unsigned num_letters);
-
- prob_t Likelihood() const;
-
- void ResampleHyperparameters(MT19937* rng);
- prob_t Prob(WordID src, WordID trg) const; // return p(trg | src)
- void Summary() const;
- void Increment(WordID src, WordID trg, MT19937* rng);
- void Decrement(WordID src, WordID trg, MT19937* rng);
- unsigned UniqueConditioningContexts() const;
-
- private:
- const std::vector<std::vector<WordID> >& letters; // spelling dictionary
- PoissonUniformWordModel base; // "generator" of English types
- ConditionalPYPWordModel<PoissonUniformWordModel, FreqBinner>* tmodel; // translation distributions
- // (model English word | French word)
- const WordID kX;
-};
-
-#endif
diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
deleted file mode 100644
index 0bebb751..00000000
--- a/gi/pf/pyp_word_model.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef _PYP_WORD_MODEL_H_
-#define _PYP_WORD_MODEL_H_
-
-#include <iostream>
-#include <cmath>
-#include <vector>
-#include "prob.h"
-#include "ccrp.h"
-#include "m.h"
-#include "tdict.h"
-#include "os_phrase.h"
-
-// PYP(d,s,poisson-uniform) represented as a CRP
-template <class Base>
-struct PYPWordModel {
- explicit PYPWordModel(Base* b) :
- base(*b),
- r(1,1,1,1,0.66,50.0)
- {}
-
- void ResampleHyperparameters(MT19937* rng) {
- r.resample_hyperparameters(rng);
- std::cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n";
- }
-
- inline prob_t operator()(const std::vector<WordID>& s) const {
- return r.prob(s, base(s));
- }
-
- inline void Increment(const std::vector<WordID>& s, MT19937* rng) {
- if (r.increment(s, base(s), rng))
- base.Increment(s, rng);
- }
-
- inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) {
- if (r.decrement(s, rng))
- base.Decrement(s, rng);
- }
-
- inline prob_t Likelihood() const {
- prob_t p; p.logeq(r.log_crp_prob());
- p *= base.Likelihood();
- return p;
- }
-
- void Summary() const {
- std::cerr << "PYPWordModel: generations=" << r.num_customers()
- << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << std::endl;
- for (typename CCRP<std::vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) {
- std::cerr << " " << it->second
- << TD::GetString(it->first) << std::endl;
- }
- }
-
- private:
-
- Base& base; // keeps track of the draws from the base distribution
- CCRP<std::vector<WordID> > r;
-};
-
-#endif
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
deleted file mode 100644
index 4075affe..00000000
--- a/gi/pf/quasi_model2.h
+++ /dev/null
@@ -1,177 +0,0 @@
-#ifndef _QUASI_MODEL2_H_
-#define _QUASI_MODEL2_H_
-
-#include <vector>
-#include <cmath>
-#include <tr1/unordered_map>
-#include "boost/functional.hpp"
-#include "prob.h"
-#include "array2d.h"
-#include "slice_sampler.h"
-#include "m.h"
-#include "have_64_bits.h"
-
-struct AlignmentObservation {
- AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
- AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) :
- src_len(sl), trg_len(tl), j(tw), a_j(sw) {}
- unsigned short src_len;
- unsigned short trg_len;
- unsigned short j;
- unsigned short a_j;
-};
-
-#ifdef HAVE_64_BITS
-inline size_t hash_value(const AlignmentObservation& o) {
- return reinterpret_cast<const size_t&>(o);
-}
-inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) {
- return hash_value(a) == hash_value(b);
-}
-#else
-inline size_t hash_value(const AlignmentObservation& o) {
- size_t h = 1;
- boost::hash_combine(h, o.src_len);
- boost::hash_combine(h, o.trg_len);
- boost::hash_combine(h, o.j);
- boost::hash_combine(h, o.a_j);
- return h;
-}
-#endif
-
-struct QuasiModel2 {
- explicit QuasiModel2(double alpha, double pnull = 0.1) :
- alpha_(alpha),
- pnull_(pnull),
- pnotnull_(1 - pnull) {}
-
- // a_j = 0 => NULL; src_len does *not* include null
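-  // For a_j > 0: p(a_j | j, src_len, trg_len) is proportional to
-  //   exp(-alpha_ * |(a_j - 1)/src_len - j/trg_len|),
-  // normalized over a_j = 1..src_len and scaled by (1 - p_null).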
- prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
- if (!a_j) return pnull_;
- return pnotnull_ *
- prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len));
- }
-
- void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
- assert(a_j <= src_len);
- assert(j < trg_len);
- ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)];
- }
-
- void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
- const AlignmentObservation ao(src_len, trg_len, j, a_j);
- int &cc = obs_[ao];
- assert(cc > 0);
- --cc;
- if (!cc) obs_.erase(ao);
- }
-
- struct PNullResampler {
- PNullResampler(const QuasiModel2& m) : m_(m) {}
- const QuasiModel2& m_;
- double operator()(const double& proposed_pnull) const {
- return log(m_.Likelihood(m_.alpha_, proposed_pnull));
- }
- };
-
- struct AlphaResampler {
- AlphaResampler(const QuasiModel2& m) : m_(m) {}
- const QuasiModel2& m_;
- double operator()(const double& proposed_alpha) const {
- return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float()));
- }
- };
-
- void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
- const PNullResampler dr(*this);
- const AlphaResampler ar(*this);
- for (unsigned i = 0; i < nloop; ++i) {
- double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001,
- 1.0, 0.0, niterations, 100*niterations);
- pnull_ = prob_t(pnull);
- alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
- std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null="
- << pnull_.as_float() << ") = " << Likelihood() << std::endl;
- zcache_.clear();
- }
-
- prob_t Likelihood() const {
- return Likelihood(alpha_, pnull_.as_float());
- }
-
- prob_t Likelihood(double alpha, double ppnull) const {
- const prob_t pnull(ppnull);
- const prob_t pnotnull(1 - ppnull);
-
- prob_t p;
- p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure
- assert(!p.is_0());
- prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10));
- assert(!prob_of_ppnull.is_0());
- p *= prob_of_ppnull;
- for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
- const AlignmentObservation& ao = it->first;
- if (ao.a_j) {
- prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
- prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
- prob_t pa(u / z);
- pa *= pnotnull;
- pa.poweq(it->second);
- p *= pa;
- } else {
- p *= pnull.pow(it->second);
- }
- }
- return p;
- }
-
- private:
- static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
- prob_t p;
- p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
- return p;
- }
-
- static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
- prob_t z = prob_t::Zero();
- for (int a_j = 1; a_j <= src_len; ++a_j)
- z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha);
- return z;
- }
-
- static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
- return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
- }
-
- static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
- double z = 0;
- for (int a_j = 1; a_j <= src_len; ++a_j)
- z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha);
- return z;
- }
-
- const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
- if (src_len >= zcache_.size())
- zcache_.resize(src_len + 1);
- if (trg_len >= zcache_[src_len].size())
- zcache_[src_len].resize(trg_len + 1);
- std::vector<double>& zv = zcache_[src_len][trg_len];
- if (zv.size() == 0)
- zv.resize(trg_len);
- double& z = zv[j];
- if (!z)
- z = ComputeZ(j, src_len, trg_len, alpha_);
- return z;
- }
-
- double alpha_;
- prob_t pnull_;
- prob_t pnotnull_;
- mutable std::vector<std::vector<std::vector<double> > > zcache_;
- typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount;
- ObsCount obs_;
-};
-
-#endif
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
deleted file mode 100644
index 7d0d04ac..00000000
--- a/gi/pf/reachability.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "reachability.h"
-
-#include <vector>
-#include <iostream>
-
-using namespace std;
-
-struct SState {
- SState() : prev_src_covered(), prev_trg_covered() {}
- SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
- int prev_src_covered;
- int prev_trg_covered;
-};
-
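-// Two passes: a forward DP enumerates every phrase-pair extension up to the maximum
-// phrase lengths and records back-pointers; a backward sweep from (srclen, trglen) then
-// keeps only cells that lie on a complete path, filling edges, valid_deltas and
-// max_src_delta and assigning dense node addresses.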
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
- typedef boost::multi_array<vector<SState>, 2> array_type;
- array_type a(boost::extents[srclen + 1][trglen + 1]);
- a[0][0].push_back(SState());
- for (int i = 0; i < srclen; ++i) {
- for (int j = 0; j < trglen; ++j) {
- if (a[i][j].size() == 0) continue;
- const SState prev(i,j);
- for (int k = 1; k <= src_max_phrase_len; ++k) {
- if ((i + k) > srclen) continue;
- for (int l = 1; l <= trg_max_phrase_len; ++l) {
- if ((j + l) > trglen) continue;
- a[i + k][j + l].push_back(prev);
- }
- }
- }
- }
- a[0][0].clear();
- //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
- if (a[srclen][trglen].empty()) {
- cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n";
- nodes = 0;
- return;
- }
-
- typedef boost::multi_array<bool, 2> rarray_type;
- rarray_type r(boost::extents[srclen + 1][trglen + 1]);
- r[srclen][trglen] = true;
- nodes = 0;
- for (int i = srclen; i >= 0; --i) {
- for (int j = trglen; j >= 0; --j) {
- vector<SState>& prevs = a[i][j];
- if (!r[i][j]) { prevs.clear(); }
- for (int k = 0; k < prevs.size(); ++k) {
- r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
- int src_delta = i - prevs[k].prev_src_covered;
- edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
- valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered));
- short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
- if (src_delta > msd) msd = src_delta;
- }
- }
- }
- assert(!edges[0][0][1][0]);
- assert(!edges[0][0][0][1]);
- assert(!edges[0][0][0][0]);
- assert(max_src_delta[0][0] > 0);
- nodes = 0;
- for (int i = 0; i < srclen; ++i) {
- for (int j = 0; j < trglen; ++j) {
- if (valid_deltas[i][j].size() > 0) {
- node_addresses[i][j] = nodes++;
- } else {
- node_addresses[i][j] = -1;
- }
- }
- }
- cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n";
- }
-
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
deleted file mode 100644
index 1e22c76a..00000000
--- a/gi/pf/reachability.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef _REACHABILITY_H_
-#define _REACHABILITY_H_
-
-#include "boost/multi_array.hpp"
-
-// determines minimum and maximum lengths of outgoing edges from all
-// coverage positions such that the alignment path respects src and
-// trg maximum phrase sizes
-//
-// runs in O(n^2 * src_max * trg_max) time but should be relatively fast
-//
-// currently forbids 0 -> n and n -> 0 alignments
-
-struct Reachability {
- unsigned nodes;
- boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
- boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
- boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
- boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
-
- Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
- nodes(),
- edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
- max_src_delta(boost::extents[srclen][trglen]),
- node_addresses(boost::extents[srclen][trglen]),
- valid_deltas(boost::extents[srclen][trglen]) {
- ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
- }
-
- private:
- void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
-};
-
-#endif
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
deleted file mode 100644
index a4f4af36..00000000
--- a/gi/pf/tied_resampler.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef _TIED_RESAMPLER_H_
-#define _TIED_RESAMPLER_H_
-
-#include <set>
-#include <vector>
-#include "sampler.h"
-#include "slice_sampler.h"
-#include "m.h"
-
-template <class CRP>
-struct TiedResampler {
- explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) :
- d_alpha(da),
- d_beta(db),
- s_shape(ss),
- s_rate(sr),
- discount(d),
- strength(s) {}
-
- void Add(CRP* crp) {
- crps.insert(crp);
- crp->set_discount(discount);
- crp->set_strength(strength);
- assert(!crp->has_discount_prior());
- assert(!crp->has_strength_prior());
- }
-
- void Remove(CRP* crp) {
- crps.erase(crp);
- }
-
- size_t size() const {
- return crps.size();
- }
-
- double LogLikelihood(double d, double s) const {
- if (s <= -d) return -std::numeric_limits<double>::infinity();
- double llh = Md::log_beta_density(d, d_alpha, d_beta) +
- Md::log_gamma_density(d + s, s_shape, s_rate);
- for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it)
- llh += (*it)->log_crp_prob(d, s);
- return llh;
- }
-
- double LogLikelihood() const {
- return LogLikelihood(discount, strength);
- }
-
- struct DiscountResampler {
- DiscountResampler(const TiedResampler& m) : m_(m) {}
- const TiedResampler& m_;
- double operator()(const double& proposed_discount) const {
- return m_.LogLikelihood(proposed_discount, m_.strength);
- }
- };
-
- struct AlphaResampler {
- AlphaResampler(const TiedResampler& m) : m_(m) {}
- const TiedResampler& m_;
- double operator()(const double& proposed_strength) const {
- return m_.LogLikelihood(m_.discount, proposed_strength);
- }
- };
-
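-  // Alternately slice-sample the shared strength and discount (Beta prior on the
-  // discount, Gamma prior on discount + strength), then push the sampled values to
-  // every member CRP.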
- void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
- if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; }
- const DiscountResampler dr(*this);
- const AlphaResampler ar(*this);
- for (int iter = 0; iter < nloop; ++iter) {
- strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- double min_discount = std::numeric_limits<double>::min();
- if (strength < 0.0) min_discount -= strength;
- discount = slice_sampler1d(dr, discount, *rng, min_discount,
- 1.0, 0.0, niterations, 100*niterations);
- }
- strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- std::cerr << "TiedCRPs(d=" << discount << ",s="
- << strength << ") = " << LogLikelihood(discount, strength) << std::endl;
- for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it)
- (*it)->set_hyperparameters(discount, strength);
- }
- private:
- std::set<CRP*> crps;
- const double d_alpha, d_beta, s_shape, s_rate;
- double discount, strength;
-};
-
-// split according to some criterion
-template <class CRP>
-struct BinTiedResampler {
- explicit BinTiedResampler(unsigned nbins) :
- resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {}
-
- void Add(unsigned bin, CRP* crp) {
- resamplers[bin].Add(crp);
- }
-
- void Remove(unsigned bin, CRP* crp) {
- resamplers[bin].Remove(crp);
- }
-
- void ResampleHyperparameters(MT19937* rng) {
- for (unsigned i = 0; i < resamplers.size(); ++i) {
- std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush;
- resamplers[i].ResampleHyperparameters(rng);
- }
- }
-
- double LogLikelihood() const {
- double llh = 0;
- for (unsigned i = 0; i < resamplers.size(); ++i)
- llh += resamplers[i].LogLikelihood();
- return llh;
- }
-
- private:
- std::vector<TiedResampler<CRP> > resamplers;
-};
-
-#endif
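
A minimal usage sketch for the TiedResampler removed above. ToyCRP is a hypothetical stand-in exposing only the interface TiedResampler actually calls; cdec's real CCRP class is what this template was written for, so treat this purely as an illustration of the call sequence.

    #include "tied_resampler.h"
    #include "sampler.h"   // MT19937

    struct ToyCRP {   // hypothetical: just enough surface area for TiedResampler<ToyCRP>
      double d_, s_;
      ToyCRP() : d_(0.5), s_(1.0) {}
      void set_discount(double d) { d_ = d; }
      void set_strength(double s) { s_ = s; }
      bool has_discount_prior() const { return false; }
      bool has_strength_prior() const { return false; }
      // A real CRP scores its current seating arrangement here; a constant keeps the sketch runnable.
      double log_crp_prob(double, double) const { return 0.0; }
      void set_hyperparameters(double d, double s) { d_ = d; s_ = s; }
    };

    void resample_shared_hyperparameters(MT19937* rng, ToyCRP* a, ToyCRP* b) {
      // Beta(1,1) prior on the discount, Gamma(1,1) prior on discount+strength,
      // matching the (da, db, ss, sr) constructor arguments above.
      TiedResampler<ToyCRP> tied(1.0, 1.0, 1.0, 1.0);
      tied.Add(a);
      tied.Add(b);
      tied.ResampleHyperparameters(rng);   // slice-samples one (d, s) pair shared by both CRPs
    }
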
diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc
deleted file mode 100644
index 7348d21c..00000000
--- a/gi/pf/tpf.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include "sampler.h"
-
-using namespace std;
-using namespace tr1;
-
-shared_ptr<MT19937> prng;
-
-struct Particle {
- Particle() : weight(prob_t::One()) {}
- vector<int> states;
- prob_t weight;
- prob_t gamma_last;
-};
-
-ostream& operator<<(ostream& os, const Particle& p) {
- os << "[";
- for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' ';
- os << "| w=" << log(p.weight) << ']';
- return os;
-}
-
-void Rejuvenate(vector<Particle>& pps) {
- SampleSet<prob_t> ss;
- vector<Particle> nps(pps.size());
- for (int i = 0; i < pps.size(); ++i) {
-// cerr << pps[i] << endl;
- ss.add(pps[i].weight);
- }
-// cerr << "REJUVINATING...\n";
- for (int i = 0; i < pps.size(); ++i) {
- nps[i] = pps[prng->SelectSample(ss)];
- nps[i].weight = prob_t(1.0 / pps.size());
-// cerr << nps[i] << endl;
- }
- nps.swap(pps);
-// exit(1);
-}
-
-int main(int argc, char** argv) {
- const unsigned particles = 100;
- prng.reset(new MT19937);
- MT19937& rng = *prng;
-
- // q(a) = 0.4
- // q(b) = 0.6
- // q(c) = 0 (never proposed)
- SampleSet<double> ssq;
- ssq.add(0.4);
- ssq.add(0.6);
- ssq.add(0);
- double qz = 1;
-
- // p(a) = 0.2
- // p(b) = 0.8
- vector<double> p(3);
- p[0] = 0.2;
- p[1] = 0.8;
- p[2] = 0;
-
- vector<int> counts(3);
- int tot = 0;
-
- vector<Particle> pps(particles);
- SampleSet<prob_t> ppss;
- int LEN = 12;
- int PP = 1;
- while (pps[0].states.size() < LEN) {
- for (int pi = 0; pi < particles; ++pi) {
- Particle& prt = pps[pi];
-
- bool redo = true;
- const Particle savedp = prt;
- while (redo) {
- redo = false;
- for (int i = 0; i < PP; ++i) {
- int s = rng.SelectSample(ssq);
- double gamma_last = p[s];
- if (!gamma_last) { redo = true; break; }
- double q = ssq[s] / qz;
- prt.states.push_back(s);
- prt.weight *= prob_t(gamma_last / q);
- }
- if (redo) { prt = savedp; continue; }
- }
- }
- Rejuvenate(pps);
- }
- ppss.clear();
- for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); }
- int sp = rng.SelectSample(ppss);
- cerr << pps[sp] << endl;
-
- return 0;
-}
-
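
To make the toy importance weights above concrete, using the proposal and target the code defines: the proposal draws a with q = 0.4 and b with q = 0.6 (c has zero proposal mass and is never drawn; the redo loop guards against drawing a state whose target probability is 0), while the target has p(a) = 0.2 and p(b) = 0.8, so each sampled state multiplies the particle weight by p(s)/q(s):

    draw a:  weight *= 0.2 / 0.4 = 0.5
    draw b:  weight *= 0.8 / 0.6 ≈ 1.33

After each extension step, Rejuvenate() resamples the 100 particles in proportion to these weights and resets every weight to 1/100, the standard resampling step that keeps a few high-weight particles from dominating the estimate.
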
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
deleted file mode 100644
index b2996f65..00000000
--- a/gi/pf/transliterations.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-#include "transliterations.h"
-
-#include <iostream>
-#include <vector>
-
-#include "boost/shared_ptr.hpp"
-
-#include "backward.h"
-#include "filelib.h"
-#include "tdict.h"
-#include "trule.h"
-#include "filelib.h"
-#include "ccrp_nt.h"
-#include "m.h"
-#include "reachability.h"
-
-using namespace std;
-using namespace std::tr1;
-
-struct TruncatedConditionalLengthModel {
- TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
- plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
- for (unsigned i = 1; i <= max_src_size; ++i) {
- prob_t z = prob_t::Zero();
- for (unsigned j = 1; j <= max_trg_size; ++j)
- z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio))));
- for (unsigned j = 1; j <= max_trg_size; ++j)
- plens[i][j] /= z;
- //for (unsigned j = 1; j <= max_trg_size; ++j)
- // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl;
- }
- }
-
-  // returns p(tlen | slen) for *chunks*, not full words
- inline const prob_t& operator()(int slen, int tlen) const {
- return plens[slen][tlen];
- }
-
- vector<vector<prob_t> > plens;
-};
-
-struct CondBaseDist {
- CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
- tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
-
- prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
- const vector<WordID>& trg, unsigned tf, unsigned tt) const {
- prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len)
- assert(!"not impl");
- return p;
- }
- inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
- return (*this)(src, 0, src.size(), trg, 0, trg.size());
- }
- TruncatedConditionalLengthModel tclm;
-};
-
-// represents transliteration phrase probabilities, e.g.
-// p( a l - | A l ) , p( o | A w ) , ...
-struct TransliterationChunkConditionalModel {
- explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
- d(0.0),
- strength(1.0),
- rp0(pp0) {
- }
-
- void Summary() const {
- std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
- for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
- }
- }
-
- int DecrementRule(const TRule& rule) {
- RuleModelHash::iterator it = r.find(rule.f_);
- assert(it != r.end());
- int count = it->second.decrement(rule);
- if (count) {
- if (it->second.num_customers() == 0) r.erase(it);
- }
- return count;
- }
-
- int IncrementRule(const TRule& rule) {
- RuleModelHash::iterator it = r.find(rule.f_);
- if (it == r.end()) {
- it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
- }
- int count = it->second.increment(rule);
- return count;
- }
-
- void IncrementRules(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p;
- RuleModelHash::const_iterator it = r.find(rule.f_);
- if (it == r.end()) {
- p = rp0(rule.f_, rule.e_);
- } else {
- p = it->second.prob(rule, rp0(rule.f_, rule.e_));
- }
- return p;
- }
-
- double LogLikelihood(const double& dd, const double& aa) const {
- if (aa <= -dd) return -std::numeric_limits<double>::infinity();
- //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
- double llh = //Md::log_beta_density(dd, 1, 1) +
- Md::log_gamma_density(dd + aa, 1, 1);
- std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
- for (it = r.begin(); it != r.end(); ++it)
- llh += it->second.log_crp_prob(aa);
- return llh;
- }
-
- struct AlphaResampler {
- AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
- const TransliterationChunkConditionalModel& m_;
- double operator()(const double& proposed_strength) const {
- return m_.LogLikelihood(m_.d, proposed_strength);
- }
- };
-
- void ResampleHyperparameters(MT19937* rng) {
- std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it;
- //const unsigned nloop = 5;
- const unsigned niterations = 10;
- //DiscountResampler dr(*this);
- AlphaResampler ar(*this);
-#if 0
- for (int iter = 0; iter < nloop; ++iter) {
- strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- double min_discount = std::numeric_limits<double>::min();
- if (strength < 0.0) min_discount -= strength;
- d = slice_sampler1d(dr, d, *rng, min_discount,
- 1.0, 0.0, niterations, 100*niterations);
- }
-#endif
- strength = slice_sampler1d(ar, strength, *rng, -d,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
- for (it = r.begin(); it != r.end(); ++it) {
-#if 0
- it->second.set_discount(d);
-#endif
- it->second.set_alpha(strength);
- }
- }
-
- prob_t Likelihood() const {
- prob_t p; p.logeq(LogLikelihood(d, strength));
- return p;
- }
-
- const CondBaseDist& rp0;
- typedef std::tr1::unordered_map<std::vector<WordID>,
- CCRP_NoTable<TRule>,
- boost::hash<std::vector<WordID> > > RuleModelHash;
- RuleModelHash r;
- double d, strength;
-};
-
-struct GraphStructure {
- GraphStructure() : r() {}
- // leak memory - these are basically static
- const Reachability* r;
- bool IsReachable() const { return r->nodes > 0; }
-};
-
-struct ProbabilityEstimates {
- ProbabilityEstimates() : gs(), backward() {}
- explicit ProbabilityEstimates(const GraphStructure& g) :
- gs(&g), backward() {
- if (g.r->nodes > 0)
- backward = new float[g.r->nodes];
- }
- // leak memory, these are static
-
- // returns an estimate of the marginal probability
- double MarginalEstimate() const {
- if (!backward) return 0;
- return backward[0];
- }
-
-  // returns a backward estimate
- double Backward(int src_covered, int trg_covered) const {
- if (!backward) return 0;
- int ind = gs->r->node_addresses[src_covered][trg_covered];
- if (ind < 0) return 0;
- return backward[ind];
- }
-
- prob_t estp;
- float* backward;
- private:
- const GraphStructure* gs;
-};
-
-struct TransliterationsImpl {
- TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
- cp0(max_src, max_trg, sr),
- tccm(cp0),
- be(b),
- kMAX_SRC_CHUNK(max_src),
- kMAX_TRG_CHUNK(max_trg),
- kS2T_RATIO(sr),
- tot_pairs(), tot_mem() {
- }
- const CondBaseDist cp0;
- TransliterationChunkConditionalModel tccm;
- const BackwardEstimator& be;
-
- void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- const size_t src_len = src_lets.size();
- const size_t trg_len = trg_lets.size();
-
- // init graph structure
- if (src_len >= graphs.size()) graphs.resize(src_len + 1);
- if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
- GraphStructure& gs = graphs[src_len][trg_len];
- if (!gs.r) {
- double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
- if (rat > 1.5 || (rat > 2.4 && src_len < 6)) {
- cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
- gs.r = new Reachability(src_len, trg_len, 0, 0);
- } else {
- gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
- }
- }
-
- const Reachability& r = *gs.r;
-
- // init backward estimates
- if (src >= ests.size()) ests.resize(src + 1);
- unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
- if (it != ests[src].end()) return; // already initialized
-
- it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
- ProbabilityEstimates& est = it->second;
- if (!gs.r->nodes) return; // not derivable subject to length constraints
-
- be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
- cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
- tot_pairs++;
- tot_mem += sizeof(float) * gs.r->nodes;
- }
-
- void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- const size_t src_len = src_lets.size();
- const size_t trg_len = trg_lets.size();
- // TODO
- }
-
- prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
- assert(src.size() < graphs.size());
- const vector<GraphStructure>& tv = graphs[src.size()];
- assert(trg.size() < tv.size());
- const GraphStructure& gs = tv[trg.size()];
- if (gs.r->nodes == 0)
- return prob_t::Zero();
- const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
- assert(it != ests[s].end());
- return it->second.estp;
- }
-
- void GraphSummary() const {
- double to = 0;
- double tn = 0;
- double tt = 0;
- for (int i = 0; i < graphs.size(); ++i) {
- const vector<GraphStructure>& vt = graphs[i];
- for (int j = 0; j < vt.size(); ++j) {
- const GraphStructure& gs = vt[j];
- if (!gs.r) continue;
- tt++;
- for (int k = 0; k < i; ++k) {
- for (int l = 0; l < j; ++l) {
- size_t c = gs.r->valid_deltas[k][l].size();
- if (c) {
- tn += 1;
- to += c;
- }
- }
- }
- }
- }
- cerr << " Average nodes = " << (tn / tt) << endl;
- cerr << "Average out-degree = " << (to / tn) << endl;
- cerr << " Unique structures = " << tt << endl;
- cerr << " Unique pairs = " << tot_pairs << endl;
- cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;
- }
-
- const int kMAX_SRC_CHUNK;
- const int kMAX_TRG_CHUNK;
- const double kS2T_RATIO;
- unsigned tot_pairs;
- size_t tot_mem;
- vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
- vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
-};
-
-Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
- pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
-Transliterations::~Transliterations() { delete pimpl_; }
-
-void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- pimpl_->Initialize(src, src_lets, trg, trg_lets);
-}
-
-prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
- return pimpl_->EstimateProbability(s, src,t, trg);
-}
-
-void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- pimpl_->Forbid(src, src_lets, trg, trg_lets);
-}
-
-void Transliterations::GraphSummary() const {
- pimpl_->GraphSummary();
-}
-
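
A concrete instance of the length filter in TransliterationsImpl::Initialize above, taking kS2T_RATIO = 1.0 for illustration; rat = exp(|log(trg_len / (src_len * kS2T_RATIO))|) is simply the larger of the two length ratios:

    src_len = 4, trg_len = 10:  rat = exp(|log(10/4)|) = 2.50 > 1.5  ->  Reachability(4, 10, 0, 0): zero nodes, pair forbidden
    src_len = 4, trg_len = 5:   rat = exp(|log(5/4)|)  = 1.25        ->  full Reachability built, backward estimates filled in

(As written, the second clause "rat > 2.4 && src_len < 6" is subsumed by "rat > 1.5", so the effective threshold is 1.5 regardless of source length.)
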
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
deleted file mode 100644
index 49d14684..00000000
--- a/gi/pf/transliterations.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _TRANSLITERATIONS_H_
-#define _TRANSLITERATIONS_H_
-
-#include <vector>
-#include "wordid.h"
-#include "prob.h"
-
-struct BackwardEstimator;
-struct TransliterationsImpl;
-struct Transliterations {
- // max_src and max_trg indicate how big the transliteration phrases can be
-  // see reachability.h for how the length limits are used; s2t_rat is the expected src:trg length ratio
- explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be);
- ~Transliterations();
- void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
- void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
- void GraphSummary() const;
- prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const;
- private:
- TransliterationsImpl* pimpl_;
-};
-
-#endif
-
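
A minimal driver sketch for the interface declared above, assuming a BackwardEstimator built elsewhere (its constructor lives in backward.h, also removed in this commit) and letter sequences prepared by the caller; the chunk sizes and length ratio are illustrative values, not taken from any existing caller.

    #include <vector>
    #include "transliterations.h"

    prob_t score_transliteration(const BackwardEstimator& be,
                                 WordID src, const std::vector<WordID>& src_lets,
                                 WordID trg, const std::vector<WordID>& trg_lets) {
      Transliterations tl(4, 4, 1.0, be);            // chunks up to 4x4, expected src:trg length ratio 1.0
      tl.Initialize(src, src_lets, trg, trg_lets);   // builds the reachability graph and backward estimates
      tl.GraphSummary();                             // prints graph statistics to stderr
      return tl.EstimateProbability(src, src_lets, trg, trg_lets);
    }
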
diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc
deleted file mode 100644
index 40829775..00000000
--- a/gi/pf/unigrams.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "unigrams.h"
-
-#include <string>
-#include <cmath>
-
-#include "stringlib.h"
-#include "filelib.h"
-
-using namespace std;
-
-void UnigramModel::LoadUnigrams(const string& fname) {
- cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
- ReadFile rf(fname);
- string line;
- istream& in = *rf.stream();
- assert(in);
- getline(in, line);
- assert(line.empty());
- getline(in, line);
- assert(line == "\\data\\");
- getline(in, line);
- size_t pos = line.find("ngram 1=");
- assert(pos == 0);
- assert(line.size() > 8);
- const size_t num_unigrams = atoi(&line[8]);
- getline(in, line);
- assert(line.empty());
- getline(in, line);
- assert(line == "\\1-grams:");
- for (size_t i = 0; i < num_unigrams; ++i) {
- getline(in, line);
- assert(line.size() > 0);
- pos = line.find('\t');
- assert(pos > 0);
- assert(pos + 1 < line.size());
- const WordID w = TD::Convert(line.substr(pos + 1));
- line[pos] = 0;
- float p = atof(&line[0]);
- if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n";
- }
-}
-
-void UnigramWordModel::LoadUnigrams(const string& fname) {
- cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
- ReadFile rf(fname);
- string line;
- istream& in = *rf.stream();
- assert(in);
- getline(in, line);
- assert(line.empty());
- getline(in, line);
- assert(line == "\\data\\");
- getline(in, line);
- size_t pos = line.find("ngram 1=");
- assert(pos == 0);
- assert(line.size() > 8);
- const size_t num_unigrams = atoi(&line[8]);
- getline(in, line);
- assert(line.empty());
- getline(in, line);
- assert(line == "\\1-grams:");
- for (size_t i = 0; i < num_unigrams; ++i) {
- getline(in, line);
- assert(line.size() > 0);
- pos = line.find('\t');
- assert(pos > 0);
- assert(pos + 1 < line.size());
- size_t cur = pos + 1;
- vector<WordID> w;
- while (cur < line.size()) {
- const size_t len = UTF8Len(line[cur]);
- w.push_back(TD::Convert(line.substr(cur, len)));
- cur += len;
- }
- line[pos] = 0;
- float p = atof(&line[0]);
- probs_[w].logeq(p * log(10.0));
- }
-}
-
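
For reference, the minimal ARPA-style layout both loaders above assert on: a leading blank line, the \data\ header, a single "ngram 1=N" count line, another blank line, the \1-grams: section, and then exactly N tab-separated "log10-prob<TAB>word" entries (nothing after them is read). The log10 values are converted to natural-log space via logeq(p * log(10)). A toy three-entry file, including the two required blank lines, would look like:

    
    \data\
    ngram 1=3
    
    \1-grams:
    -0.69897	the
    -1.04576	cat
    -1.39794	sat
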
diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h
deleted file mode 100644
index 1660d1ed..00000000
--- a/gi/pf/unigrams.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef _UNIGRAMS_H_
-#define _UNIGRAMS_H_
-
-#include <vector>
-#include <string>
-#include <tr1/unordered_map>
-#include <boost/functional.hpp>
-
-#include "wordid.h"
-#include "prob.h"
-#include "tdict.h"
-
-struct UnigramModel {
- explicit UnigramModel(const std::string& fname, unsigned vocab_size) :
- use_uniform_(fname.size() == 0),
- uniform_(1.0 / vocab_size),
- probs_() {
- if (fname.size() > 0) {
- probs_.resize(TD::NumWords() + 1);
- LoadUnigrams(fname);
- }
- }
-
- const prob_t& operator()(const WordID& w) const {
- assert(w);
- if (use_uniform_) return uniform_;
- return probs_[w];
- }
-
- private:
- void LoadUnigrams(const std::string& fname);
-
- const bool use_uniform_;
- const prob_t uniform_;
- std::vector<prob_t> probs_;
-};
-
-
-// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t'
-struct UnigramWordModel {
- explicit UnigramWordModel(const std::string& fname) :
- use_uniform_(false),
- uniform_(1.0),
- probs_() {
- LoadUnigrams(fname);
- }
-
- explicit UnigramWordModel(const unsigned vocab_size) :
- use_uniform_(true),
- uniform_(1.0 / vocab_size),
- probs_() {}
-
- const prob_t& operator()(const std::vector<WordID>& s) const {
- if (use_uniform_) return uniform_;
- const VectorProbHash::const_iterator it = probs_.find(s);
- assert(it != probs_.end());
- return it->second;
- }
-
- private:
- void LoadUnigrams(const std::string& fname);
-
- const bool use_uniform_;
- const prob_t uniform_;
- typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash;
- VectorProbHash probs_;
-};
-
-#endif
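
A minimal sketch of how the letter-level model above is queried, assuming an ARPA file (hypothetical path) whose entries include the word "cat"; LoadUnigrams keys each entry by its character sequence, so the lookup below uses the letters c, a, t.

    #include <vector>
    #include "unigrams.h"

    prob_t score_cat() {
      UnigramWordModel letters("char_unigrams.arpa");   // hypothetical file name
      std::vector<WordID> w;
      w.push_back(TD::Convert("c"));
      w.push_back(TD::Convert("a"));
      w.push_back(TD::Convert("t"));
      return letters(w);   // asserts if "cat" was not an entry in the loaded file
    }
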