summaryrefslogtreecommitdiff
path: root/gi/pf
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pf')
-rw-r--r--gi/pf/Makefile.am25
-rw-r--r--gi/pf/align-lexonly-pyp.cc239
-rw-r--r--gi/pf/align-tl.cc339
-rw-r--r--gi/pf/backward.cc89
-rw-r--r--gi/pf/backward.h33
-rw-r--r--gi/pf/base_distributions.cc (renamed from gi/pf/base_measures.cc)93
-rw-r--r--gi/pf/base_distributions.h (renamed from gi/pf/base_measures.h)119
-rw-r--r--gi/pf/brat.cc2
-rw-r--r--gi/pf/conditional_pseg.h275
-rw-r--r--gi/pf/condnaive.cc298
-rw-r--r--gi/pf/corpus.cc15
-rw-r--r--gi/pf/dpnaive.cc2
-rwxr-xr-xgi/pf/guess-translits.pl72
-rw-r--r--gi/pf/itg.cc98
-rw-r--r--gi/pf/learn_cfg.cc428
-rwxr-xr-xgi/pf/make-freq-bins.pl26
-rw-r--r--gi/pf/monotonic_pseg.h2
-rw-r--r--gi/pf/ngram_base.cc69
-rw-r--r--gi/pf/ngram_base.h25
-rw-r--r--gi/pf/nuisance_test.cc161
-rw-r--r--gi/pf/os_phrase.h15
-rw-r--r--gi/pf/pfbrat.cc2
-rw-r--r--gi/pf/pfdist.cc8
-rw-r--r--gi/pf/pfnaive.cc6
-rw-r--r--gi/pf/pyp_lm.cc209
-rw-r--r--gi/pf/pyp_tm.cc131
-rw-r--r--gi/pf/pyp_tm.h35
-rw-r--r--gi/pf/pyp_word_model.cc20
-rw-r--r--gi/pf/pyp_word_model.h58
-rw-r--r--gi/pf/quasi_model2.h166
-rw-r--r--gi/pf/reachability.cc24
-rw-r--r--gi/pf/reachability.h10
-rw-r--r--gi/pf/tied_resampler.h124
-rw-r--r--gi/pf/transliterations.cc334
-rw-r--r--gi/pf/transliterations.h24
-rw-r--r--gi/pf/unigrams.cc80
-rw-r--r--gi/pf/unigrams.h69
37 files changed, 3654 insertions, 71 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 42758939..f9c979d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,26 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc
+
+libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc
+
+nuisance_test_SOURCES = nuisance_test.cc
+nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+align_tl_SOURCES = align-tl.cc
+align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
itg_SOURCES = itg.cc
+pyp_lm_SOURCES = pyp_lm.cc
+
+learn_cfg_SOURCES = learn_cfg.cc
+
+condnaive_SOURCES = condnaive.cc
+
dpnaive_SOURCES = dpnaive.cc
pfdist_SOURCES = pfdist.cc
@@ -17,5 +33,6 @@ brat_SOURCES = brat.cc
pfbrat_SOURCES = pfbrat.cc
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
-AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm
+
+AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
new file mode 100644
index 00000000..942dcf51
--- /dev/null
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -0,0 +1,239 @@
+#include <iostream>
+#include <queue>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "array2d.h"
+#include "sampler.h"
+#include "corpus.h"
+#include "pyp_tm.h"
+#include "quasi_model2.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+ ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+ ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+MT19937* prng;
+
+struct LexicalAlignment {
+ unsigned char src_index;
+ bool is_transliteration;
+ vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+ vector<WordID> src;
+ vector<WordID> trg;
+ vector<LexicalAlignment> a;
+ Array2D<short> posterior;
+};
+
+struct Aligner {
+ Aligner(const vector<vector<WordID> >& lets,
+ int num_letters,
+ const po::variables_map& conf,
+ vector<AlignedSentencePair>* c) :
+ corpus(*c),
+ paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+ infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
+ model(lets, num_letters),
+ kNULL(TD::Convert("NULL")) {
+ assert(lets[kNULL].size() == 0);
+ }
+
+ vector<AlignedSentencePair>& corpus;
+ QuasiModel2 paj_model;
+ const bool infer_paj;
+ PYPLexicalTranslation model;
+ const WordID kNULL;
+
+ void ResampleHyperparameters() {
+ model.ResampleHyperparameters(prng);
+ if (infer_paj) paj_model.ResampleHyperparameters(prng);
+ }
+
+ void InitializeRandom() {
+ cerr << "Initializing with random alignments ...\n";
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ asp.a.resize(asp.trg.size());
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ unsigned char& a_j = asp.a[j].src_index;
+ a_j = prng->next() * (1 + asp.src.size());
+ const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ model.Increment(f_a_j, asp.trg[j], &*prng);
+ paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
+ }
+ }
+ cerr << "Corpus intialized randomly." << endl;
+ cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
+ << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
+ }
+
+ void ResampleCorpus() {
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ unsigned char& a_j = asp.a[j].src_index;
+ const WordID e_j = asp.trg[j];
+ WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ model.Decrement(f_a_j, e_j, prng);
+ paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size());
+
+ for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+ const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+ ss[prop_a_j] = model.Prob(prop_f, e_j);
+ ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size());
+ }
+ a_j = prng->SelectSample(ss);
+ f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ model.Increment(f_a_j, e_j, prng);
+ paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
+ }
+ }
+ }
+
+ prob_t Likelihood() const {
+ return model.Likelihood() * paj_model.Likelihood();
+ }
+};
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+ for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+ vector<WordID>& letters = (*l)[*it];
+ if (letters.size()) continue; // if e and f have the same word
+
+ const string& w = TD::Convert(*it);
+
+ size_t cur = 0;
+ while (cur < w.size()) {
+ const size_t len = UTF8Len(w[cur]);
+ letters.push_back(TD::Convert(w.substr(cur, len)));
+ if (letset) letset->insert(letters.back());
+ cur += len;
+ }
+ }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+ cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+ Array2D<bool> a(asp.src.size(), asp.trg.size());
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ assert(asp.a[j].src_index <= asp.src.size());
+ if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+ }
+ cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+ for (unsigned j = 0; j < asp->trg.size(); ++j)
+ asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+ bool first = true;
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ int src_index = -1;
+ int mc = -1;
+ for (unsigned i = 0; i <= asp.src.size(); ++i) {
+ if (asp.posterior(i, j) > mc) {
+ mc = asp.posterior(i, j);
+ src_index = i;
+ }
+ }
+
+ if (src_index) {
+ if (first) first = false; else cout << ' ';
+ cout << (src_index - 1) << '-' << j;
+ }
+ }
+ cout << endl;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+
+ if (conf.count("random_seed"))
+ prng = new MT19937(conf["random_seed"].as<uint32_t>());
+ else
+ prng = new MT19937;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ vector<AlignedSentencePair> corpus(corpuse.size());
+ for (unsigned i = 0; i < corpuse.size(); ++i) {
+ corpus[i].src.swap(corpusf[i]);
+ corpus[i].trg.swap(corpuse[i]);
+ corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+ }
+ corpusf.clear(); corpuse.clear();
+
+ vocabf.insert(TD::Convert("NULL"));
+ vector<vector<WordID> > letters(TD::NumWords());
+ set<WordID> letset;
+ ExtractLetters(vocabe, &letters, &letset);
+ ExtractLetters(vocabf, &letters, NULL);
+ letters[TD::Convert("NULL")].clear();
+
+ Aligner aligner(letters, letset.size(), conf, &corpus);
+ aligner.InitializeRandom();
+
+ const unsigned samples = conf["samples"].as<unsigned>();
+ for (int i = 0; i < samples; ++i) {
+ for (int j = 65; j < 67; ++j) Debug(corpus[j]);
+ if (i % 10 == 9) {
+ aligner.ResampleHyperparameters();
+ cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood()
+ << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+ }
+ aligner.ResampleCorpus();
+ if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+ }
+ for (unsigned i = 0; i < corpus.size(); ++i)
+ WriteAlignments(corpus[i]);
+ aligner.model.Summary();
+
+ return 0;
+}
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
new file mode 100644
index 00000000..cbe8c6c8
--- /dev/null
+++ b/gi/pf/align-tl.cc
@@ -0,0 +1,339 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "backward.h"
+#include "array2d.h"
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "mfcr.h"
+#include "corpus.h"
+#include "ngram_base.h"
+#include "transliterations.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities")
+ ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities")
+ ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source")
+ ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target")
+ ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {
+ unsigned char src_index;
+ bool is_transliteration;
+ vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+ vector<WordID> src;
+ vector<WordID> trg;
+ vector<LexicalAlignment> a;
+ Array2D<short> posterior;
+};
+
+struct HierarchicalWordBase {
+ explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+ base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
+
+ void ResampleHyperparameters(MT19937* rng) {
+ r.resample_hyperparameters(rng);
+ }
+
+ inline double logp0(const vector<WordID>& s) const {
+ return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
+ }
+
+ // return p0 of rule.e_
+ prob_t operator()(const TRule& rule) const {
+ v[0].logeq(logp0(rule.e_));
+ return r.prob(rule.e_, v.begin(), l.begin());
+ }
+
+ void Increment(const TRule& rule) {
+ v[0].logeq(logp0(rule.e_));
+ if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
+ base *= v[0] * l[0];
+ }
+ }
+
+ void Decrement(const TRule& rule) {
+ if (r.decrement(rule.e_, &*prng).count) {
+ base /= prob_t(exp(logp0(rule.e_)));
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(r.log_crp_prob());
+ p *= base;
+ return p;
+ }
+
+ void Summary() const {
+ cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
+ for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+ cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
+ }
+
+ prob_t base;
+ MFCR<1,vector<WordID> > r;
+ const double u0;
+ const vector<prob_t> l;
+ mutable vector<prob_t> v;
+};
+
+struct BasicLexicalAlignment {
+ explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+ const unsigned words_e,
+ const unsigned letters_e,
+ vector<AlignedSentencePair>* corp) :
+ letters(lets),
+ corpus(*corp),
+ //up0(words_e),
+ //up0("en.chars.1gram", letters_e),
+ //up0("en.words.1gram"),
+ up0(letters_e),
+ //up0("en.chars.2gram"),
+ tmodel(up0) {
+ }
+
+ void InstantiateRule(const WordID src,
+ const WordID trg,
+ TRule* rule) const {
+ static const WordID kX = TD::Convert("X") * -1;
+ rule->lhs_ = kX;
+ rule->e_ = letters[trg];
+ rule->f_ = letters[src];
+ }
+
+ void InitializeRandom() {
+ const WordID kNULL = TD::Convert("NULL");
+ cerr << "Initializing with random alignments ...\n";
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ asp.a.resize(asp.trg.size());
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ const unsigned char a_j = prng->next() * (1 + asp.src.size());
+ const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ TRule r;
+ InstantiateRule(f_a_j, asp.trg[j], &r);
+ asp.a[j].is_transliteration = false;
+ asp.a[j].src_index = a_j;
+ if (tmodel.IncrementRule(r, &*prng))
+ up0.Increment(r);
+ }
+ }
+ cerr << " LLH = " << Likelihood() << endl;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = tmodel.Likelihood();
+ p *= up0.Likelihood();
+ return p;
+ }
+
+ void ResampleHyperparemeters() {
+ tmodel.ResampleHyperparameters(&*prng);
+ up0.ResampleHyperparameters(&*prng);
+ cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
+ }
+
+ void ResampleCorpus();
+
+ const vector<vector<WordID> >& letters; // spelling dictionary
+ vector<AlignedSentencePair>& corpus;
+ //PhraseConditionalUninformativeBase up0;
+ //PhraseConditionalUninformativeUnigramBase up0;
+ //UnigramWordBase up0;
+ //HierarchicalUnigramBase up0;
+ HierarchicalWordBase up0;
+ //CompletelyUniformBase up0;
+ //FixedNgramBase up0;
+ //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+ //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+ //ConditionalTranslationModel<UnigramWordBase> tmodel;
+ //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+ MConditionalTranslationModel<HierarchicalWordBase> tmodel;
+ //ConditionalTranslationModel<FixedNgramBase> tmodel;
+ //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+ static const WordID kNULL = TD::Convert("NULL");
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ TRule r;
+ unsigned char& a_j = asp.a[j].src_index;
+ WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ InstantiateRule(f_a_j, asp.trg[j], &r);
+ if (tmodel.DecrementRule(r, &*prng))
+ up0.Decrement(r);
+
+ for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+ const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+ InstantiateRule(prop_f, asp.trg[j], &r);
+ ss[prop_a_j] = tmodel.RuleProbability(r);
+ }
+ a_j = prng->SelectSample(ss);
+ f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ InstantiateRule(f_a_j, asp.trg[j], &r);
+ if (tmodel.IncrementRule(r, &*prng))
+ up0.Increment(r);
+ }
+ }
+ cerr << " LLH = " << Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+ for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+ vector<WordID>& letters = (*l)[*it];
+ if (letters.size()) continue; // if e and f have the same word
+
+ const string& w = TD::Convert(*it);
+
+ size_t cur = 0;
+ while (cur < w.size()) {
+ const size_t len = UTF8Len(w[cur]);
+ letters.push_back(TD::Convert(w.substr(cur, len)));
+ if (letset) letset->insert(letters.back());
+ cur += len;
+ }
+ }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+ cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+ Array2D<bool> a(asp.src.size(), asp.trg.size());
+ for (unsigned j = 0; j < asp.trg.size(); ++j)
+ if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+ cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+ for (unsigned j = 0; j < asp->trg.size(); ++j)
+ asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+ bool first = true;
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ int src_index = -1;
+ int mc = -1;
+ for (unsigned i = 0; i <= asp.src.size(); ++i) {
+ if (asp.posterior(i, j) > mc) {
+ mc = asp.posterior(i, j);
+ src_index = i;
+ }
+ }
+
+ if (src_index) {
+ if (first) first = false; else cout << ' ';
+ cout << (src_index - 1) << '-' << j;
+ }
+ }
+ cout << endl;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+// MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ vector<AlignedSentencePair> corpus(corpuse.size());
+ for (unsigned i = 0; i < corpuse.size(); ++i) {
+ corpus[i].src.swap(corpusf[i]);
+ corpus[i].trg.swap(corpuse[i]);
+ corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+ }
+ corpusf.clear(); corpuse.clear();
+
+ vocabf.insert(TD::Convert("NULL"));
+ vector<vector<WordID> > letters(TD::NumWords() + 1);
+ set<WordID> letset;
+ ExtractLetters(vocabe, &letters, &letset);
+ ExtractLetters(vocabf, &letters, NULL);
+ letters[TD::Convert("NULL")].clear();
+
+ // TODO configure this
+ const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
+ const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
+ const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>();
+ const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>());
+ Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be);
+
+ cerr << "Initializing transliteration graph structures ...\n";
+ for (int i = 0; i < corpus.size(); ++i) {
+ const vector<int>& src = corpus[i].src;
+ const vector<int>& trg = corpus[i].trg;
+ for (int j = 0; j < src.size(); ++j) {
+ const vector<int>& src_let = letters[src[j]];
+ for (int k = 0; k < trg.size(); ++k) {
+ const vector<int>& trg_let = letters[trg[k]];
+ tl.Initialize(src[j], src_let, trg[k], trg_let);
+ //if (src_let.size() < min_trans_src)
+ // tl.Forbid(src[j], src_let, trg[k], trg_let);
+ }
+ }
+ }
+ cerr << endl;
+ tl.GraphSummary();
+
+ return 0;
+}
diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc
new file mode 100644
index 00000000..b92629fd
--- /dev/null
+++ b/gi/pf/backward.cc
@@ -0,0 +1,89 @@
+#include "backward.h"
+
+#include <queue>
+#include <utility>
+
+#include "array2d.h"
+#include "reachability.h"
+#include "base_distributions.h"
+
+using namespace std;
+
+BackwardEstimator::BackwardEstimator(const string& s2t,
+ const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {}
+
+BackwardEstimator::~BackwardEstimator() {
+ delete m1; m1 = NULL;
+ delete m1inv; m1inv = NULL;
+}
+
+float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src,
+ const std::vector<WordID>& trg,
+ unsigned src_covered,
+ unsigned trg_covered,
+ double s2t_ratio) const {
+ if (src_covered == src.size() || trg_covered == trg.size()) {
+ assert(src_covered == src.size());
+ assert(trg_covered == trg.size());
+ return 0;
+ }
+ static const WordID kNULL = TD::Convert("<eps>");
+ const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1));
+ // TODO factor in expected length ratio
+ prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_covered; j < trg.size(); ++j) {
+ prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12);
+ for (unsigned i = src_covered; i < src.size(); ++i)
+ p += (*m1)(src[i], trg[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n";
+ assert(!"failed");
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ // TODO factor in expected length ratio
+ const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0));
+ prob_t inv;
+ inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio));
+ for (unsigned i = src_covered; i < src.size(); ++i) {
+ prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12);
+ for (unsigned j = trg_covered; j < trg.size(); ++j)
+ p += (*m1inv)(trg[j], src[i]);
+ if (p.is_0()) {
+ cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n";
+ assert(!"failed");
+ }
+ p *= inv_uniform;
+ inv *= p;
+ }
+ return (log(e) + log(inv)) / 2;
+}
+
+void BackwardEstimator::InitializeGrid(const vector<WordID>& src,
+ const vector<WordID>& trg,
+ const Reachability& r,
+ double s2t_ratio,
+ float* grid) const {
+ queue<pair<int,int> > q;
+ q.push(make_pair(0,0));
+ Array2D<bool> done(src.size()+1, trg.size()+1, false);
+ //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl;
+ while(!q.empty()) {
+ const pair<int,int> n = q.front();
+ q.pop();
+ if (done(n.first,n.second)) continue;
+ done(n.first,n.second) = true;
+
+ float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio);
+ if (n.first == 0 && n.second == 0) grid[0] = lp;
+ //cerr << " " << n.first << "," << n.second << "\t" << lp << endl;
+
+ if (n.first == src.size() || n.second == trg.size()) continue;
+ const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second];
+ for (int i = 0; i < edges.size(); ++i)
+ q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second));
+ }
+ //static int cc = 0; ++cc; if (cc == 80) exit(1);
+}
+
diff --git a/gi/pf/backward.h b/gi/pf/backward.h
new file mode 100644
index 00000000..e67eff0c
--- /dev/null
+++ b/gi/pf/backward.h
@@ -0,0 +1,33 @@
+#ifndef _BACKWARD_H_
+#define _BACKWARD_H_
+
+#include <vector>
+#include <string>
+#include "wordid.h"
+
+struct Reachability;
+struct Model1;
+
+struct BackwardEstimator {
+ BackwardEstimator(const std::string& s2t,
+ const std::string& t2s);
+ ~BackwardEstimator();
+
+ void InitializeGrid(const std::vector<WordID>& src,
+ const std::vector<WordID>& trg,
+ const Reachability& r,
+ double src2trg_ratio,
+ float* grid) const;
+
+ private:
+ float ComputeBackwardProb(const std::vector<WordID>& src,
+ const std::vector<WordID>& trg,
+ unsigned src_covered,
+ unsigned trg_covered,
+ double src2trg_ratio) const;
+
+ Model1* m1;
+ Model1* m1inv;
+};
+
+#endif
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_distributions.cc
index 8adb37d7..d9761005 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_distributions.cc
@@ -1,4 +1,4 @@
-#include "base_measures.h"
+#include "base_distributions.h"
#include <iostream>
@@ -6,6 +6,79 @@
using namespace std;
+TableLookupBase::TableLookupBase(const string& fname) {
+ cerr << "TableLookupBase reading from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ unsigned lc = 0;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ vector<int> le, lf;
+ TRule x;
+ x.lhs_ = -TD::Convert("X");
+ bool flag = false;
+ while(getline(in, line)) {
+ ++lc;
+ if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
+ else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ x.f_.clear();
+ x.e_.clear();
+ size_t pos = 0;
+ int cc = 0;
+ while(pos < tmp.size()) {
+ const WordID cur = tmp[pos++];
+ if (cur == kDIV) {
+ ++cc;
+ } else if (cc == 0) {
+ x.f_.push_back(cur);
+ } else if (cc == 1) {
+ x.e_.push_back(cur);
+ } else if (cc == 2) {
+ table[x].logeq(atof(TD::Convert(cur)));
+ ++cc;
+ } else {
+ if (flag) cerr << endl;
+ cerr << "Bad format in " << lc << ": " << line << endl; abort();
+ }
+ }
+ if (cc != 3) {
+ if (flag) cerr << endl;
+ cerr << "Bad format in " << lc << ": " << line << endl; abort();
+ }
+ }
+ if (flag) cerr << endl;
+ cerr << " read " << lc << " entries\n";
+}
+
+prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t p;
+ p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i)
+ p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform
+ return p;
+}
+
+prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t p;
+ //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ p.logeq(Md::log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i)
+ p *= kUNIFORM_TARGET; // draw e_i ~Uniform
+ return p;
+}
+
void Model1::LoadModel1(const string& fname) {
cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
ReadFile rf(fname);
@@ -40,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
const int elen = vtrg.size() - start_trg;
prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
prob_t p;
- p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
for (int i = 0; i < elen; ++i) { // for each position i in e-RHS
const WordID trg = vtrg[i + start_trg];
prob_t tp = prob_t::Zero();
@@ -66,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
const int elen = vtrg.size() - start_trg;
prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
prob_t p;
- p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1)
+ p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1)
// elen | flen ~Pois(flen + 0.01)
- prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+ prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));
p *= ptrglen;
p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform
for (int i = 0; i < elen; ++i) { // for each position i in E
@@ -98,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));
prob_t p1;
- p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1)
+ p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1)
// elen | flen ~Pois(flen + 0.01)
- prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+ prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));
p1 *= ptrglen;
p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform
for (int i = 0; i < elen; ++i) { // for each position i in E
@@ -120,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
}
prob_t p2;
- p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1)
+ p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1)
// flen | elen ~Pois(flen + 0.01)
- prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01));
+ prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01));
p2 *= psrclen;
p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform
for (int i = 0; i < flen; ++i) { // for each position i in E
@@ -154,9 +227,9 @@ JumpBase::JumpBase() : p(200) {
for (int j = min_jump; j <= max_jump; ++j) {
prob_t& cp = cpd[j];
if (j < 0)
- cp.logeq(log_poisson(1.5-j, 1));
+ cp.logeq(Md::log_poisson(1.5-j, 1));
else if (j > 0)
- cp.logeq(log_poisson(j, 1));
+ cp.logeq(Md::log_poisson(j, 1));
cp.poweq(0.2);
z += cp;
}
diff --git a/gi/pf/base_measures.h b/gi/pf/base_distributions.h
index 7ce7e2e6..84dacdf2 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_distributions.h
@@ -6,22 +6,15 @@
#include <string>
#include <cmath>
#include <iostream>
+#include <cassert>
+#include "unigrams.h"
#include "trule.h"
#include "prob.h"
#include "tdict.h"
-
-inline double log_poisson(unsigned x, const double& lambda) {
- assert(lambda > 0.0);
- return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
- os << '[';
- for (int i = 0; i < p.size(); ++i)
- os << (i==0 ? "" : " ") << TD::Convert(p[i]);
- return os << ']';
-}
+#include "sampler.h"
+#include "m.h"
+#include "os_phrase.h"
struct Model1 {
explicit Model1(const std::string& fname) :
@@ -49,6 +42,104 @@ struct Model1 {
std::vector<std::map<WordID, prob_t> > ttable;
};
+struct PoissonUniformUninformativeBase {
+ explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
+ prob_t operator()(const TRule& r) const {
+ prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0));
+ prob_t q = kUNIFORM; q.poweq(r.e_.size());
+ p *= q;
+ return p;
+ }
+ void Summary() const {}
+ void ResampleHyperparameters(MT19937*) {}
+ void Increment(const TRule&) {}
+ void Decrement(const TRule&) {}
+ prob_t Likelihood() const { return prob_t::One(); }
+ const prob_t kUNIFORM;
+};
+
+struct CompletelyUniformBase {
+ explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
+ prob_t operator()(const TRule&) const {
+ return kUNIFORM;
+ }
+ void Summary() const {}
+ void ResampleHyperparameters(MT19937*) {}
+ void Increment(const TRule&) {}
+ void Decrement(const TRule&) {}
+ prob_t Likelihood() const { return prob_t::One(); }
+ const prob_t kUNIFORM;
+};
+
+struct UnigramWordBase {
+ explicit UnigramWordBase(const std::string& fname) : un(fname) {}
+ prob_t operator()(const TRule& r) const {
+ return un(r.e_);
+ }
+ const UnigramWordModel un;
+};
+
+struct RuleHasher {
+ size_t operator()(const TRule& r) const {
+ return hash_value(r);
+ }
+};
+
+struct TableLookupBase {
+ TableLookupBase(const std::string& fname);
+
+ prob_t operator()(const TRule& rule) const {
+ const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule);
+ if (it == table.end()) {
+ std::cerr << rule << " not found\n";
+ abort();
+ }
+ return it->second;
+ }
+
+ void ResampleHyperparameters(MT19937*) {}
+ void Increment(const TRule&) {}
+ void Decrement(const TRule&) {}
+ prob_t Likelihood() const { return prob_t::One(); }
+ void Summary() const {}
+
+ std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
+};
+
+struct PhraseConditionalUninformativeBase {
+ explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
+ kUNIFORM_TARGET(1.0 / vocab_e_size) {
+ assert(vocab_e_size > 0);
+ }
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ void Summary() const {}
+ void ResampleHyperparameters(MT19937*) {}
+ void Increment(const TRule&) {}
+ void Decrement(const TRule&) {}
+ prob_t Likelihood() const { return prob_t::One(); }
+ const prob_t kUNIFORM_TARGET;
+};
+
+struct PhraseConditionalUninformativeUnigramBase {
+ explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const UnigramModel u;
+};
+
struct PhraseConditionalBase {
explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
model1(m1),
@@ -83,7 +174,7 @@ struct PhraseJointBase {
assert(vocab_e_size > 0);
}
- // return p0 of rule.e_ | rule.f_
+ // return p0 of rule.e_ , rule.f_
prob_t operator()(const TRule& rule) const {
return p0(rule.f_, rule.e_, 0, 0);
}
@@ -113,7 +204,7 @@ struct PhraseJointBase_BiDir {
assert(vocab_e_size > 0);
}
- // return p0 of rule.e_ | rule.f_
+ // return p0 of rule.e_ , rule.f_
prob_t operator()(const TRule& rule) const {
return p0(rule.f_, rule.e_, 0, 0);
}
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
index 7b60ef23..c2c52760 100644
--- a/gi/pf/brat.cc
+++ b/gi/pf/brat.cc
@@ -191,7 +191,7 @@ struct UniphraseLM {
void ResampleHyperparameters(MT19937* rng) {
phrases_.resample_hyperparameters(rng);
gen_.resample_hyperparameters(rng);
- cerr << " " << phrases_.concentration();
+ cerr << " " << phrases_.alpha();
}
CCRP_NoTable<vector<int> > phrases_;
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
new file mode 100644
index 00000000..81ddb206
--- /dev/null
+++ b/gi/pf/conditional_pseg.h
@@ -0,0 +1,275 @@
+#ifndef _CONDITIONAL_PSEG_H_
+#define _CONDITIONAL_PSEG_H_
+
+#include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include <iostream>
+
+#include "m.h"
+#include "prob.h"
+#include "ccrp_nt.h"
+#include "mfcr.h"
+#include "trule.h"
+#include "base_distributions.h"
+#include "tdict.h"
+
+template <typename ConditionalBaseMeasure>
+struct MConditionalTranslationModel {
+ explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
+ rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {}
+
+ void Summary() const {
+ std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
+ for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ std::cerr << " " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl;
+ }
+ }
+
+ double log_likelihood(const double& dd, const double& aa) const {
+ if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+ //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+ double llh = Md::log_beta_density(dd, 1, 1) +
+ Md::log_gamma_density(dd + aa, 1, 1);
+ typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
+ for (it = r.begin(); it != r.end(); ++it)
+ llh += it->second.log_crp_prob(dd, aa);
+ return llh;
+ }
+
+ struct DiscountResampler {
+ DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {}
+ const MConditionalTranslationModel& m_;
+ double operator()(const double& proposed_discount) const {
+ return m_.log_likelihood(proposed_discount, m_.strength);
+ }
+ };
+
+ struct AlphaResampler {
+ AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {}
+ const MConditionalTranslationModel& m_;
+ double operator()(const double& proposed_strength) const {
+ return m_.log_likelihood(m_.d, proposed_strength);
+ }
+ };
+
+ void ResampleHyperparameters(MT19937* rng) {
+ typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+#if 1
+ for (it = r.begin(); it != r.end(); ++it) {
+ it->second.resample_hyperparameters(rng);
+ }
+#else
+ const unsigned nloop = 5;
+ const unsigned niterations = 10;
+ DiscountResampler dr(*this);
+ AlphaResampler ar(*this);
+ for (int iter = 0; iter < nloop; ++iter) {
+ strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ double min_discount = std::numeric_limits<double>::min();
+ if (strength < 0.0) min_discount -= strength;
+ d = slice_sampler1d(dr, d, *rng, min_discount,
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+ strength = slice_sampler1d(ar, strength, *rng, -d,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl;
+ for (it = r.begin(); it != r.end(); ++it) {
+ it->second.set_discount(d);
+ it->second.set_strength(strength);
+ }
+#endif
+ }
+
+ int DecrementRule(const TRule& rule, MT19937* rng) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ assert(it != r.end());
+ const TableCount delta = it->second.decrement(rule, rng);
+ if (delta.count) {
+ if (it->second.num_customers() == 0) r.erase(it);
+ }
+ return delta.count;
+ }
+
+ int IncrementRule(const TRule& rule, MT19937* rng) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
+ it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first;
+ }
+ p0s[0] = rp0(rule);
+ TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
+ return delta.count;
+ }
+
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p;
+ RuleModelHash::const_iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ p = rp0(rule);
+ } else {
+ p0s[0] = rp0(rule);
+ p = it->second.prob(rule, p0s.begin(), lambdas.begin());
+ }
+ return p;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(log_likelihood(d, strength));
+ return p;
+ }
+
+ const ConditionalBaseMeasure& rp0;
+ typedef std::tr1::unordered_map<std::vector<WordID>,
+ MFCR<1, TRule>,
+ boost::hash<std::vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+ double d, strength;
+ std::vector<prob_t> lambdas;
+ mutable std::vector<prob_t> p0s;
+};
+
+template <typename ConditionalBaseMeasure>
+struct ConditionalTranslationModel {
+ explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
+ rp0(rcp0) {}
+
+ void Summary() const {
+ std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+ for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
+ }
+ }
+
+ void ResampleHyperparameters(MT19937* rng) {
+ for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+ it->second.resample_hyperparameters(rng);
+ }
+
+ int DecrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ assert(it != r.end());
+ int count = it->second.decrement(rule);
+ if (count) {
+ if (it->second.num_customers() == 0) r.erase(it);
+ }
+ return count;
+ }
+
+ int IncrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first;
+ }
+ int count = it->second.increment(rule);
+ return count;
+ }
+
+ void IncrementRules(const std::vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const std::vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p;
+ RuleModelHash::const_iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ p.logeq(log(rp0(rule)));
+ } else {
+ p.logeq(it->second.logprob(rule, log(rp0(rule))));
+ }
+ return p;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = prob_t::One();
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ prob_t q; q.logeq(it->second.log_crp_prob());
+ p *= q;
+ for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ p *= rp0(i2->first);
+ }
+ return p;
+ }
+
+ const ConditionalBaseMeasure& rp0;
+ typedef std::tr1::unordered_map<std::vector<WordID>,
+ CCRP_NoTable<TRule>,
+ boost::hash<std::vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+};
+
+template <typename ConditionalBaseMeasure>
+struct ConditionalParallelSegementationModel {
+ explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) :
+ tmodel(rcp0), base(prob_t::One()), aligns(1,1) {}
+
+ ConditionalTranslationModel<ConditionalBaseMeasure> tmodel;
+
+ void DecrementRule(const TRule& rule) {
+ tmodel.DecrementRule(rule);
+ }
+
+ void IncrementRule(const TRule& rule) {
+ tmodel.IncrementRule(rule);
+ }
+
+ void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
+ tmodel.IncrementRules(rules);
+ for (int i = 0; i < rules.size(); ++i) {
+ IncrementAlign(rules[i]->f_.size());
+ }
+ }
+
+ void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
+ tmodel.DecrementRules(rules);
+ for (int i = 0; i < rules.size(); ++i) {
+ DecrementAlign(rules[i]->f_.size());
+ }
+ }
+
+ prob_t RuleProbability(const TRule& rule) const {
+ return tmodel.RuleProbability(rule);
+ }
+
+ void IncrementAlign(unsigned span) {
+ if (aligns.increment(span)) {
+ // TODO
+ }
+ }
+
+ void DecrementAlign(unsigned span) {
+ if (aligns.decrement(span)) {
+ // TODO
+ }
+ }
+
+ prob_t AlignProbability(unsigned span) const {
+ prob_t p;
+ p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0)));
+ return p;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(aligns.log_crp_prob());
+ p *= base;
+ p *= tmodel.Likelihood();
+ return p;
+ }
+
+ prob_t base;
+ CCRP_NoTable<unsigned> aligns;
+};
+
+#endif
+
diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc
new file mode 100644
index 00000000..3ea88016
--- /dev/null
+++ b/gi/pf/condnaive.cc
@@ -0,0 +1,298 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "corpus.h"
+
+using namespace std;
+using namespace std::tr1;
+namespace po = boost::program_options;
+
+static unsigned kMAX_SRC_PHRASE;
+static unsigned kMAX_TRG_PHRASE;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
+ ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
+ ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+shared_ptr<MT19937> prng;
+
+struct ModelAndData {
+ explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+ model(m),
+ rng(&*prng),
+ corpuse(ce),
+ corpusf(cf),
+ vocabe(ve),
+ vocabf(vf),
+ mh_samples(),
+ mh_rejects(),
+ kX(-TD::Convert("X")),
+ derivations(corpuse.size()) {}
+
+ void ResampleHyperparameters() {
+ }
+
+ void InstantiateRule(const pair<short,short>& from,
+ const pair<short,short>& to,
+ const vector<int>& sentf,
+ const vector<int>& sente,
+ TRule* rule) const {
+ rule->f_.clear();
+ rule->e_.clear();
+ rule->lhs_ = kX;
+ for (short i = from.first; i < to.first; ++i)
+ rule->f_.push_back(sentf[i]);
+ for (short i = from.second; i < to.second; ++i)
+ rule->e_.push_back(sente[i]);
+ }
+
+ void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+ if (d.size() < 2) return;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ model.DecrementRule(x);
+ model.DecrementAlign(x.f_.size());
+ }
+ }
+
+ void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+ if (d.size() < 2) return;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ cerr << i << '/' << (d.size() - 1) << ": " << x << endl;
+ }
+ }
+
+ void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+ if (d.size() < 2) return;
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ model.IncrementRule(x);
+ model.IncrementAlign(x.f_.size());
+ }
+ }
+
+ prob_t Likelihood() const {
+ return model.Likelihood();
+ }
+
+ prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
+ prob_t p = prob_t::One();
+ TRule x;
+ for (int i = 1; i < d.size(); ++i) {
+ InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+ p *= model.RuleProbability(x);
+ p *= model.AlignProbability(x.f_.size());
+ }
+ return p;
+ }
+
+ void Sample();
+
+ ConditionalParallelSegementationModel<PhraseConditionalBase>& model;
+ MT19937* rng;
+ const vector<vector<int> >& corpuse, corpusf;
+ const set<int>& vocabe, vocabf;
+ unsigned mh_samples, mh_rejects;
+ const int kX;
+ vector<vector<pair<short, short> > > derivations;
+};
+
+void ModelAndData::Sample() {
+ unsigned MAXK = kMAX_SRC_PHRASE;
+ unsigned MAXL = kMAX_TRG_PHRASE;
+ TRule x;
+ x.lhs_ = -TD::Convert("X");
+
+ for (int samples = 0; samples < 1000; ++samples) {
+ if (samples % 1 == 0 && samples > 0) {
+ //ResampleHyperparameters();
+ cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n";
+ for (int i = 0; i < 10; ++i) {
+ cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl;
+ PrintDerivation(derivations[i], corpusf[i], corpuse[i]);
+ }
+ static TRule xx("[X] ||| w n ||| s h ||| X=0");
+ const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second;
+ for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) {
+ cerr << "\t" << it->second << "\t" << it->first << endl;
+ }
+ }
+ cerr << '.' << flush;
+ for (int s = 0; s < corpuse.size(); ++s) {
+ const vector<int>& sentf = corpusf[s];
+ const vector<int>& sente = corpuse[s];
+// cerr << " CUSTOMERS: " << rules.num_customers() << endl;
+// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl;
+
+ vector<pair<short, short> >& deriv = derivations[s];
+ const prob_t p_cur = Likelihood();
+ DecrementDerivation(deriv, sentf, sente);
+
+ boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
+ boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
+ a[0][0] = prob_t::One();
+ for (int i = 0; i < sentf.size(); ++i) {
+ for (int j = 0; j < sente.size(); ++j) {
+ const prob_t src_a = a[i][j];
+ x.f_.clear();
+ for (int k = 1; k <= MAXK; ++k) {
+ if (i + k > sentf.size()) break;
+ x.f_.push_back(sentf[i + k - 1]);
+ x.e_.clear();
+ const prob_t p_span = model.AlignProbability(k); // prob of consuming this much source
+ for (int l = 1; l <= MAXL; ++l) {
+ if (j + l > sente.size()) break;
+ x.e_.push_back(sente[j + l - 1]);
+ trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span;
+ a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
+ }
+ }
+ }
+ }
+// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl;
+ const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
+
+ vector<pair<short,short> > newderiv;
+ int cur_i = sentf.size();
+ int cur_j = sente.size();
+ while(cur_i > 0 && cur_j > 0) {
+ newderiv.push_back(pair<short,short>(cur_i, cur_j));
+// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
+ SampleSet<prob_t> ss;
+ vector<pair<short,short> > nexts;
+ for (int k = 1; k <= MAXK; ++k) {
+ const int hyp_i = cur_i - k;
+ if (hyp_i < 0) break;
+ for (int l = 1; l <= MAXL; ++l) {
+ const int hyp_j = cur_j - l;
+ if (hyp_j < 0) break;
+ const prob_t& inside = a[hyp_i][hyp_j];
+ if (inside == prob_t::Zero()) continue;
+ const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
+ if (transp == prob_t::Zero()) continue;
+ const prob_t p = inside * transp;
+ ss.add(p);
+ nexts.push_back(pair<short,short>(hyp_i, hyp_j));
+// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl;
+ }
+ }
+// cerr << " sample set has " << nexts.size() << " elements.\n";
+ const int selected = rng->SelectSample(ss);
+ cur_i = nexts[selected].first;
+ cur_j = nexts[selected].second;
+ }
+ newderiv.push_back(pair<short,short>(0,0));
+ const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
+ IncrementDerivation(newderiv, sentf, sente);
+// cerr << "SANITY: " << q_new << " " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl;
+ if (deriv.empty()) { deriv = newderiv; continue; }
+ ++mh_samples;
+
+ if (deriv != newderiv) {
+ const prob_t p_new = Likelihood();
+// cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl;
+// cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl;
+ if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
+ ++mh_rejects;
+ DecrementDerivation(newderiv, sentf, sente);
+ IncrementDerivation(deriv, sentf, sente);
+ } else {
+// cerr << " ACCEPT\n";
+ deriv = newderiv;
+ }
+ }
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+ kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+
+ if (!conf.count("model1")) {
+ cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
+ return 1;
+ }
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+// MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ Model1 m1(conf["model1"].as<string>());
+
+ PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
+ ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0);
+
+ ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf);
+ posterior.Sample();
+
+ TRule r1("[X] ||| x ||| l e ||| X=0");
+ TRule r2("[X] ||| A ||| a d ||| X=0");
+ TRule r3("[X] ||| n ||| e r ||| X=0");
+ TRule r4("[X] ||| x A n ||| b l a g ||| X=0");
+
+ PhraseConditionalUninformativeBase u0(vocabe.size());
+
+ cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl;
+ cerr << (u0(r4)) << endl;
+
+ return 0;
+}
+
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
istream* in = rf.stream();
assert(*in);
string line;
+ unsigned lc = 0;
const WordID kDIV = TD::Convert("|||");
vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
+ while(getline(*in, line)) {
+ ++lc;
e->push_back(vector<int>());
f->push_back(vector<int>());
vector<int>& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
for (unsigned i = 0; i < tmp.size(); ++i) {
const int cur = tmp[i];
if (isf) {
- if (kDIV == cur) { isf = false; } else {
+ if (kDIV == cur) {
+ isf = false;
+ } else {
lf.push_back(cur);
vocab_f->insert(cur);
}
} else {
- assert(cur != kDIV);
+ if (cur == kDIV) {
+ cerr << "ERROR in " << lc << ": " << line << endl << endl;
+ abort();
+ }
le.push_back(cur);
vocab_e->insert(cur);
}
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index db1c43c7..469dff5c 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -6,7 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
-#include "base_measures.h"
+#include "base_distributions.h"
#include "monotonic_pseg.h"
#include "trule.h"
#include "tdict.h"
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
new file mode 100755
index 00000000..d00c2168
--- /dev/null
+++ b/gi/pf/guess-translits.pl
@@ -0,0 +1,72 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+my $MIN_PMI = -3;
+
+my %fs;
+my %es;
+my %ef;
+
+die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $tot = 0;
+print STDERR "Reading alignments from STDIN ...\n";
+while(<STDIN>) {
+ chomp;
+ my ($fsent, $esent, $alsent) = split / \|\|\| /;
+ die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent;
+
+ my @fws = split /\s+/, $fsent;
+ my @ews = split /\s+/, $esent;
+ my @as = split /\s+/, $alsent;
+ my %a2b;
+ my %b2a;
+ for my $ap (@as) {
+ my ($a,$b) = split /-/, $ap;
+ die "BAD INPUT: $_\n" unless defined $a && defined $b;
+ $a2b{$a}->{$b} = 1;
+ $b2a{$b}->{$a} = 1;
+ }
+ for my $a (keys %a2b) {
+ my $bref = $a2b{$a};
+ next unless scalar keys %$bref < 2;
+ my $b = (keys %$bref)[0];
+ next unless scalar keys %{$b2a{$b}} < 2;
+ my $f = $fws[$a];
+ next unless defined $f;
+ next unless length($f) > 3;
+ my $e = $ews[$b];
+ next unless defined $e;
+ next unless length($e) > 3;
+
+ $ef{$f}->{$e}++;
+ $es{$e}++;
+ $fs{$f}++;
+ $tot++;
+ }
+}
+my $ltot = log($tot);
+my $num = 0;
+print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n";
+for my $f (keys %fs) {
+ my $logf = log($fs{$f});
+ my $esref = $ef{$f};
+ for my $e (keys %$esref) {
+ my $loge = log($es{$e});
+ my $ef = $esref->{$e};
+ my $logef = log($ef);
+ my $pmi = $logef - ($loge + $logf);
+ next if $pmi < $MIN_PMI;
+ my @flets = split //, $f;
+ my @elets = split //, $e;
+ print "@flets ||| @elets\n";
+ $num++;
+ }
+}
+print STDERR "Extracted $num pairs.\n";
+print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n";
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
index ac3c16a3..a38fe672 100644
--- a/gi/pf/itg.cc
+++ b/gi/pf/itg.cc
@@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) {
return os << ']';
}
-double log_poisson(unsigned x, const double& lambda) {
- assert(lambda > 0.0);
- return log(lambda) * x - lgamma(x + 1) - lambda;
-}
+struct UnigramModel {
+ explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) :
+ use_uniform_(fname.size() == 0),
+ p0null_(p0null),
+ uniform_((1.0 - p0null) / vocab_size),
+ probs_(TD::NumWords() + 1) {
+ if (fname.size() > 0) LoadUnigrams(fname);
+ probs_[0] = p0null_;
+ }
+
+//
+// \data\
+// ngram 1=9295
+//
+// \1-grams:
+// -3.191193 "
+
+ void LoadUnigrams(const string& fname) {
+ cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ string line;
+ istream& in = *rf.stream();
+ assert(in);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\data\\");
+ getline(in, line);
+ size_t pos = line.find("ngram 1=");
+ assert(pos == 0);
+ assert(line.size() > 8);
+ const size_t num_unigrams = atoi(&line[8]);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\1-grams:");
+ for (size_t i = 0; i < num_unigrams; ++i) {
+ getline(in, line);
+ assert(line.size() > 0);
+ pos = line.find('\t');
+ assert(pos > 0);
+ assert(pos + 1 < line.size());
+ const WordID w = TD::Convert(line.substr(pos + 1));
+ line[pos] = 0;
+ float p = atof(&line[0]);
+ const prob_t pnon_null(1.0 - p0null_.as_float());
+ if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort();
+ }
+ }
+
+ const prob_t& operator()(const WordID& w) const {
+ if (!w) return p0null_;
+ if (use_uniform_) return uniform_;
+ return probs_[w];
+ }
+
+ const bool use_uniform_;
+ const prob_t p0null_;
+ const prob_t uniform_;
+ vector<prob_t> probs_;
+};
struct Model1 {
explicit Model1(const string& fname) :
@@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")
("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform")
+ ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
clo.add_options()
@@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename,
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
- const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
const unsigned particles = conf["particles"].as<unsigned>();
const unsigned samples = conf["samples"].as<unsigned>();
-
+ TD::Convert("<s>");
+ TD::Convert("</s>");
+ TD::Convert("<unk>");
if (!conf.count("model1")) {
cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
return 1;
@@ -188,23 +245,28 @@ int main(int argc, char** argv) {
cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
assert(corpusf.size() == corpuse.size());
+ UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size());
+ UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size());
+ const prob_t kHALF(0.5);
+ const string kEMPTY = "NULL";
const int kLHS = -TD::Convert("X");
Model1 m1(conf["model1"].as<string>());
Model1 invm1(conf["inverse_model1"].as<string>());
for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {
cerr << '.' << flush;
for (int ci = 0; ci < corpusf.size(); ++ci) {
- const vector<WordID>& src = corpusf[ci];
const vector<WordID>& trg = corpuse[ci];
- for (int i = 0; i < src.size(); ++i) {
- for (int j = 0; j < trg.size(); ++j) {
- const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE);
- for (int k = 0; k < eff_max_src; ++k) {
- const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE));
- for (int l = 0; l < eff_max_trg; ++l) {
- }
- }
+ const vector<WordID>& src = corpusf[ci];
+ for (int i = 0; i <= trg.size(); ++i) {
+ const WordID e_i = i > 0 ? trg[i-1] : 0;
+ for (int j = 0; j <= src.size(); ++j) {
+ const WordID f_j = j > 0 ? src[j-1] : 0;
+ if (e_i == 0 && f_j == 0) continue;
+ prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j);
+ cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl;
+ if (e_i && f_j)
+ cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl;
}
}
}
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
new file mode 100644
index 00000000..ed1772bf
--- /dev/null
+++ b/gi/pf/learn_cfg.cc
@@ -0,0 +1,428 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "inside_outside.h"
+#include "hg.h"
+#include "bottom_up_parser.h"
+#include "fdict.h"
+#include "grammar.h"
+#include "m.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+vector<int> nt_vocab;
+vector<int> nt_id_to_index;
+static unsigned kMAX_RULE_SIZE = 0;
+static unsigned kMAX_ARITY = 0;
+static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs
+static bool kHIERARCHICAL_PRIOR = false;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)")
+ ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)")
+ ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS")
+ ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary")
+ ("hierarchical_prior,h", "Use hierarchical prior")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+unsigned ReadCorpus(const string& filename,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_e) {
+ e->clear();
+ vocab_e->clear();
+ istream* in;
+ if (filename == "-")
+ in = &cin;
+ else
+ in = new ifstream(filename.c_str());
+ assert(*in);
+ string line;
+ unsigned toks = 0;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ vector<int>& le = e->back();
+ TD::ConvertSentence(line, &le);
+ for (unsigned i = 0; i < le.size(); ++i)
+ vocab_e->insert(le[i]);
+ toks += le.size();
+ }
+ if (in != &cin) delete in;
+ return toks;
+}
+
+struct Grid {
+ // a b c d e
+ // 0 - 0 - -
+ vector<int> grid;
+};
+
+struct BaseRuleModel {
+ explicit BaseRuleModel(unsigned term_size,
+ unsigned nonterm_size = 1) :
+ unif_term(1.0 / term_size),
+ unif_nonterm(1.0 / nonterm_size) {}
+ prob_t operator()(const TRule& r) const {
+ prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size()));
+ const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));
+ const prob_t nonterm_prob(1.0 - term_prob.as_float());
+ for (unsigned i = 0; i < r.f_.size(); ++i) {
+ if (r.f_[i] <= 0) { // nonterminal
+ if (kALLOW_MIXED) p *= nonterm_prob;
+ p *= unif_nonterm;
+ } else { // terminal
+ if (kALLOW_MIXED) p *= term_prob;
+ p *= unif_term;
+ }
+ }
+ return p;
+ }
+ const prob_t unif_term, unif_nonterm;
+};
+
+struct HieroLMModel {
+ explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) :
+ base(vocab_size, num_nts),
+ q0(1,1,1,1),
+ nts(num_nts, CCRP<TRule>(1,1,1,1)) {}
+
+ prob_t Prob(const TRule& r) const {
+ return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r));
+ }
+
+ inline prob_t p0(const TRule& r) const {
+ if (kHIERARCHICAL_PRIOR)
+ return q0.prob(r, base(r));
+ else
+ return base(r);
+ }
+
+ int Increment(const TRule& r, MT19937* rng) {
+ const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng);
+ if (kHIERARCHICAL_PRIOR && delta)
+ q0.increment(r, base(r), rng);
+ return delta;
+ // return x.increment(r);
+ }
+
+ int Decrement(const TRule& r, MT19937* rng) {
+ const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng);
+ if (kHIERARCHICAL_PRIOR && delta)
+ q0.decrement(r, rng);
+ return delta;
+ //return x.decrement(r);
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = prob_t::One();
+ for (unsigned i = 0; i < nts.size(); ++i) {
+ prob_t q; q.logeq(nts[i].log_crp_prob());
+ p *= q;
+ for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) {
+ prob_t tp = p0(it->first);
+ tp.poweq(it->second.table_counts_.size());
+ p *= tp;
+ }
+ }
+ if (kHIERARCHICAL_PRIOR) {
+ prob_t q; q.logeq(q0.log_crp_prob());
+ p *= q;
+ for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) {
+ prob_t tp = base(it->first);
+ tp.poweq(it->second.table_counts_.size());
+ p *= tp;
+ }
+ }
+ //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
+ // p *= base(it->first);
+ return p;
+ }
+
+ void ResampleHyperparameters(MT19937* rng) {
+ for (unsigned i = 0; i < nts.size(); ++i)
+ nts[i].resample_hyperparameters(rng);
+ if (kHIERARCHICAL_PRIOR) {
+ q0.resample_hyperparameters(rng);
+ cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]";
+ }
+ cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl;
+ }
+
+ const BaseRuleModel base;
+ CCRP<TRule> q0;
+ vector<CCRP<TRule> > nts;
+ //CCRP_OneTable<TRule> x;
+};
+
+vector<GrammarIter* > tofreelist;
+
+HieroLMModel* plm;
+
+struct NPGrammarIter : public GrammarIter, public RuleBin {
+ NPGrammarIter() : arity() { tofreelist.push_back(this); }
+ NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) {
+ if (inr) {
+ r.reset(new TRule(*inr));
+ } else {
+ r.reset(new TRule);
+ }
+ TRule& rr = *r;
+ rr.lhs_ = nt_vocab[0];
+ rr.f_.push_back(symbol);
+ rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);
+ tofreelist.push_back(this);
+ }
+ inline static unsigned NextArity(int cur_a, int symbol) {
+ return cur_a + (symbol <= 0 ? 1 : 0);
+ }
+ virtual int GetNumRules() const {
+ if (r) return nt_vocab.size(); else return 0;
+ }
+ virtual TRulePtr GetIthRule(int i) const {
+ if (i == 0) return r;
+ TRulePtr nr(new TRule(*r));
+ nr->lhs_ = nt_vocab[i];
+ return nr;
+ }
+ virtual int Arity() const {
+ return arity;
+ }
+ virtual const RuleBin* GetRules() const {
+ if (!r) return NULL; else return this;
+ }
+ virtual const GrammarIter* Extend(int symbol) const {
+ const int next_arity = NextArity(arity, symbol);
+ if (kMAX_ARITY && next_arity > kMAX_ARITY)
+ return NULL;
+ if (!kALLOW_MIXED && r) {
+ bool t1 = r->f_.front() <= 0;
+ bool t2 = symbol <= 0;
+ if (t1 != t2) return NULL;
+ }
+ if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE))
+ return new NPGrammarIter(r, next_arity, symbol);
+ else
+ return NULL;
+ }
+ const unsigned char arity;
+ TRulePtr r;
+};
+
+struct NPGrammar : public Grammar {
+ virtual const GrammarIter* GetRoot() const {
+ return new NPGrammarIter;
+ }
+};
+
+prob_t TotalProb(const Hypergraph& hg) {
+ return Inside<prob_t, EdgeProb>(hg);
+}
+
+void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) {
+ vector<prob_t> node_probs;
+ Inside<prob_t, EdgeProb>(hg, &node_probs);
+ queue<unsigned> q;
+ q.push(hg.nodes_.size() - 2);
+ while(!q.empty()) {
+ unsigned cur_node_id = q.front();
+// cerr << "NODE=" << cur_node_id << endl;
+ q.pop();
+ const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+ const unsigned num_in_edges = node.in_edges_.size();
+ unsigned sampled_edge = 0;
+ if (num_in_edges == 1) {
+ sampled_edge = node.in_edges_[0];
+ } else {
+ //prob_t z;
+ assert(num_in_edges > 1);
+ SampleSet<prob_t> ss;
+ for (unsigned j = 0; j < num_in_edges; ++j) {
+ const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+ prob_t p = edge.edge_prob_;
+ for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+ p *= node_probs[edge.tail_nodes_[k]];
+ ss.add(p);
+// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
+ //z += p;
+ }
+// for (unsigned j = 0; j < num_in_edges; ++j) {
+// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
+// }
+// cerr << " --- \n";
+ sampled_edge = node.in_edges_[rng->SelectSample(ss)];
+ }
+ sampled_deriv->push_back(sampled_edge);
+ const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
+ for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
+ q.push(edge.tail_nodes_[j]);
+ }
+ }
+ for (unsigned i = 0; i < sampled_deriv->size(); ++i) {
+ cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
+ }
+}
+
+void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
+ for (unsigned i = 0; i < d.size(); ++i)
+ plm->Increment(*hg.edges_[d[i]].rule_, rng);
+}
+
+void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
+ for (unsigned i = 0; i < d.size(); ++i)
+ plm->Decrement(*hg.edges_[d[i]].rule_, rng);
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+
+ InitCommandLine(argc, argv, &conf);
+ nt_vocab.resize(conf["nonterminals"].as<unsigned>());
+ assert(nt_vocab.size() > 0);
+ assert(nt_vocab.size() < 26);
+ {
+ string nt = "X";
+ for (unsigned i = 0; i < nt_vocab.size(); ++i) {
+ if (nt_vocab.size() > 1) nt[0] = ('A' + i);
+ int pid = TD::Convert(nt);
+ nt_vocab[i] = -pid;
+ if (pid >= nt_id_to_index.size()) {
+ nt_id_to_index.resize(pid + 1, -1);
+ }
+ nt_id_to_index[pid] = i;
+ }
+ }
+ vector<GrammarPtr> grammars;
+ grammars.push_back(GrammarPtr(new NPGrammar));
+
+ const unsigned samples = conf["samples"].as<unsigned>();
+ kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>();
+ if (kMAX_RULE_SIZE == 1) {
+ cerr << "Invalid maximum rule size: must be 0 or >1\n";
+ return 1;
+ }
+ kMAX_ARITY = conf["max_arity"].as<unsigned>();
+ if (kMAX_ARITY == 1) {
+ cerr << "Invalid maximum arity: must be 0 or >1\n";
+ return 1;
+ }
+ kALLOW_MIXED = !conf.count("no_mixed_rules");
+
+ kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior");
+
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+ vector<vector<WordID> > corpuse;
+ set<WordID> vocabe;
+ cerr << "Reading corpus...\n";
+ const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
+ cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+ HieroLMModel lm(vocabe.size(), nt_vocab.size());
+
+ plm = &lm;
+ ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars);
+
+ Hypergraph hg;
+ const int kGoal = -TD::Convert("Goal");
+ const int kLP = FD::Convert("LogProb");
+ SparseVector<double> v; v.set_value(kLP, 1.0);
+ vector<vector<unsigned> > derivs(corpuse.size());
+ vector<Lattice> cl(corpuse.size());
+ for (int ci = 0; ci < corpuse.size(); ++ci) {
+ vector<int>& src = corpuse[ci];
+ Lattice& lat = cl[ci];
+ lat.resize(src.size());
+ for (unsigned i = 0; i < src.size(); ++i)
+ lat[i].push_back(LatticeArc(src[i], 0.0, 1));
+ }
+ for (int SS=0; SS < samples; ++SS) {
+ const bool is_last = ((samples - 1) == SS);
+ prob_t dlh = prob_t::One();
+ for (int ci = 0; ci < corpuse.size(); ++ci) {
+ const vector<int>& src = corpuse[ci];
+ const Lattice& lat = cl[ci];
+ cerr << TD::GetString(src) << endl;
+ hg.clear();
+ parser.Parse(lat, &hg); // exhaustive parse
+ vector<unsigned>& d = derivs[ci];
+ if (!is_last) DecrementDerivation(hg, d, &lm, &rng);
+ for (unsigned i = 0; i < hg.edges_.size(); ++i) {
+ TRule& r = *hg.edges_[i].rule_;
+ if (r.lhs_ == kGoal)
+ hg.edges_[i].edge_prob_ = prob_t::One();
+ else
+ hg.edges_[i].edge_prob_ = lm.Prob(r);
+ }
+ if (!is_last) {
+ d.clear();
+ SampleDerivation(hg, &rng, &d);
+ IncrementDerivation(hg, derivs[ci], &lm, &rng);
+ } else {
+ prob_t p = TotalProb(hg);
+ dlh *= p;
+ cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl;
+ }
+ if (tofreelist.size() > 200000) {
+ cerr << "Freeing ... ";
+ for (unsigned i = 0; i < tofreelist.size(); ++i)
+ delete tofreelist[i];
+ tofreelist.clear();
+ cerr << "Freed.\n";
+ }
+ }
+ double llh = log(lm.Likelihood());
+ cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl;
+ if (SS % 10 == 9) lm.ResampleHyperparameters(&rng);
+ if (is_last) {
+ double z = log(dlh);
+ cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl;
+ }
+ }
+ for (unsigned i = 0; i < nt_vocab.size(); ++i)
+ cerr << lm.nts[i] << endl;
+ return 0;
+}
+
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $BASE = 6;
+my $CUTOFF = 3;
+
+my %d;
+my $num = 0;
+while(<>){
+ chomp;
+ my @words = split /\s+/;
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+
+my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
+
+for (my $i=0; $i<scalar @vocab; $i++) {
+ my $most = $d{$vocab[$i]};
+ my $least = 1;
+
+ my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF);
+ if ($nl < 0) { $nl = 0; }
+ print "$vocab[$i] $nl\n"
+}
+
+
diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
index 301aa6d8..10d171fe 100644
--- a/gi/pf/monotonic_pseg.h
+++ b/gi/pf/monotonic_pseg.h
@@ -6,7 +6,7 @@
#include "prob.h"
#include "ccrp_nt.h"
#include "trule.h"
-#include "base_measures.h"
+#include "base_distributions.h"
template <typename BaseMeasure>
struct MonotonicParallelSegementationModel {
diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc
new file mode 100644
index 00000000..1299f06f
--- /dev/null
+++ b/gi/pf/ngram_base.cc
@@ -0,0 +1,69 @@
+#include "ngram_base.h"
+
+#include "lm/model.hh"
+#include "tdict.h"
+
+using namespace std;
+
+namespace {
+struct GICSVMapper : public lm::EnumerateVocab {
+ GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); }
+ void Add(lm::WordIndex index, const StringPiece &str) {
+ const WordID cdec_id = TD::Convert(str.as_string());
+ if (cdec_id >= out_->size())
+ out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN);
+ (*out_)[cdec_id] = index;
+ }
+ vector<lm::WordIndex>* out_;
+ const lm::WordIndex kLM_UNKNOWN_TOKEN;
+};
+}
+
+struct FixedNgramBaseImpl {
+ FixedNgramBaseImpl(const string& param) {
+ GICSVMapper vm(&cdec2klm_map_);
+ lm::ngram::Config conf;
+ conf.enumerate_vocab = &vm;
+ cerr << "Reading character LM from " << param << endl;
+ model = new lm::ngram::ProbingModel(param.c_str(), conf);
+ order = model->Order();
+ kEOS = MapWord(TD::Convert("</s>"));
+ assert(kEOS > 0);
+ }
+
+ lm::WordIndex MapWord(const WordID w) const {
+ if (w < cdec2klm_map_.size()) return cdec2klm_map_[w];
+ return 0;
+ }
+
+ ~FixedNgramBaseImpl() { delete model; }
+
+ prob_t StringProbability(const vector<WordID>& s) const {
+ lm::ngram::State state = model->BeginSentenceState();
+ double prob = 0;
+ for (unsigned i = 0; i < s.size(); ++i) {
+ const lm::ngram::State scopy(state);
+ prob += model->Score(scopy, MapWord(s[i]), state);
+ }
+ const lm::ngram::State scopy(state);
+ prob += model->Score(scopy, kEOS, state);
+ prob_t p; p.logeq(prob * log(10));
+ return p;
+ }
+
+ lm::ngram::ProbingModel* model;
+ unsigned order;
+ vector<lm::WordIndex> cdec2klm_map_;
+ lm::WordIndex kEOS;
+};
+
+FixedNgramBase::~FixedNgramBase() { delete impl; }
+
+FixedNgramBase::FixedNgramBase(const string& lmfname) {
+ impl = new FixedNgramBaseImpl(lmfname);
+}
+
+prob_t FixedNgramBase::StringProbability(const vector<WordID>& s) const {
+ return impl->StringProbability(s);
+}
+
diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h
new file mode 100644
index 00000000..4ea999f3
--- /dev/null
+++ b/gi/pf/ngram_base.h
@@ -0,0 +1,25 @@
+#ifndef _NGRAM_BASE_H_
+#define _NGRAM_BASE_H_
+
+#include <string>
+#include <vector>
+#include "trule.h"
+#include "wordid.h"
+#include "prob.h"
+
+struct FixedNgramBaseImpl;
+struct FixedNgramBase {
+ FixedNgramBase(const std::string& lmfname);
+ ~FixedNgramBase();
+ prob_t StringProbability(const std::vector<WordID>& s) const;
+
+ prob_t operator()(const TRule& rule) const {
+ return StringProbability(rule.e_);
+ }
+
+ private:
+ FixedNgramBaseImpl* impl;
+
+};
+
+#endif
diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc
new file mode 100644
index 00000000..fc0af9cb
--- /dev/null
+++ b/gi/pf/nuisance_test.cc
@@ -0,0 +1,161 @@
+#include "ccrp.h"
+
+#include <vector>
+#include <iostream>
+
+#include "tdict.h"
+#include "transliterations.h"
+
+using namespace std;
+
+MT19937 rng;
+
+ostream& operator<<(ostream&os, const vector<int>& v) {
+ os << '[' << v[0];
+ if (v.size() == 2) os << ' ' << v[1];
+ return os << ']';
+}
+
+struct Base {
+ Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {}
+ inline double p0(const vector<int>& x) const {
+ double p = 0.75;
+ if (x.size() == 2) p = 0.25;
+ p *= 1.0 / 3.0;
+ if (x.size() == 2) p *= 1.0 / 3.0;
+ return p;
+ }
+ double est_deriv_prob(int a, int b, int seg) const {
+ assert(a > 0 && a < 4); // a \in {1,2,3}
+ assert(b > 0 && b < 4); // b \in {1,2,3}
+ assert(seg == 0 || seg == 1); // seg \in {0,1}
+ if (seg == 0) {
+ v[0] = a;
+ v[1] = b;
+ return crp.prob(v, p0(v));
+ } else {
+ v1[0] = a;
+ v2[0] = b;
+ return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2));
+ }
+ }
+ double est_marginal_prob(int a, int b) const {
+ return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1);
+ }
+ int increment(int a, int b, double* pw = NULL) {
+ double p1 = est_deriv_prob(a, b, 0);
+ double p2 = est_deriv_prob(a, b, 1);
+ //p1 = 0.5; p2 = 0.5;
+ int seg = rng.SelectSample(p1,p2);
+ double tmp = 0;
+ if (!pw) pw = &tmp;
+ double& w = *pw;
+ if (seg == 0) {
+ v[0] = a;
+ v[1] = b;
+ w = crp.prob(v, p0(v)) / p1;
+ if (crp.increment(v, p0(v), &rng)) {
+ llh += log(p0(v));
+ }
+ } else {
+ v1[0] = a;
+ w = crp.prob(v1, p0(v1)) / p2;
+ if (crp.increment(v1, p0(v1), &rng)) {
+ llh += log(p0(v1));
+ }
+ v2[0] = b;
+ w *= crp.prob(v2, p0(v2));
+ if (crp.increment(v2, p0(v2), &rng)) {
+ llh += log(p0(v2));
+ }
+ }
+ return seg;
+ }
+ void increment(int a, int b, int seg) {
+ if (seg == 0) {
+ v[0] = a;
+ v[1] = b;
+ if (crp.increment(v, p0(v), &rng)) {
+ llh += log(p0(v));
+ }
+ } else {
+ v1[0] = a;
+ if (crp.increment(v1, p0(v1), &rng)) {
+ llh += log(p0(v1));
+ }
+ v2[0] = b;
+ if (crp.increment(v2, p0(v2), &rng)) {
+ llh += log(p0(v2));
+ }
+ }
+ }
+ void decrement(int a, int b, int seg) {
+ if (seg == 0) {
+ v[0] = a;
+ v[1] = b;
+ if (crp.decrement(v, &rng)) {
+ llh -= log(p0(v));
+ }
+ } else {
+ v1[0] = a;
+ if (crp.decrement(v1, &rng)) {
+ llh -= log(p0(v1));
+ }
+ v2[0] = b;
+ if (crp.decrement(v2, &rng)) {
+ llh -= log(p0(v2));
+ }
+ }
+ }
+ double log_likelihood() const {
+ return llh + crp.log_crp_prob();
+ }
+ double llh;
+ mutable vector<int> v, v1, v2;
+ CCRP<vector<int> > crp;
+};
+
+int main(int argc, char** argv) {
+ double tl = 0;
+ const int ITERS = 1000;
+ const int PARTICLES = 20;
+ const int DATAPOINTS = 50;
+ WordID x = TD::Convert("souvenons");
+ WordID y = TD::Convert("remember");
+ vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src);
+ vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg);
+// Transliterations xx;
+// xx.Initialize(x, src, y, trg);
+// return 1;
+
+ for (int j = 0; j < ITERS; ++j) {
+ Base b;
+ vector<int> segs(DATAPOINTS);
+ SampleSet<double> ss;
+ vector<int> sss;
+ for (int i = 0; i < DATAPOINTS; i++) {
+ ss.clear();
+ sss.clear();
+ int x = ((i / 10) % 3) + 1;
+ int y = (i % 3) + 1;
+ //double ep = b.est_marginal_prob(x,y);
+ //cerr << "est p(" << x << "," << y << ") = " << ep << endl;
+ for (int n = 0; n < PARTICLES; ++n) {
+ double w;
+ int seg = b.increment(x,y,&w);
+ //cerr << seg << " w=" << w << endl;
+ ss.add(w);
+ sss.push_back(seg);
+ b.decrement(x,y,seg);
+ }
+ int seg = sss[rng.SelectSample(ss)];
+ b.increment(x, y, seg);
+ //cerr << "Selected: " << seg << endl;
+ //return 1;
+ segs[i] = seg;
+ }
+ tl += b.log_likelihood();
+ }
+ cerr << "LLH=" << tl / ITERS << endl;
+}
+
diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h
new file mode 100644
index 00000000..dfe40cb1
--- /dev/null
+++ b/gi/pf/os_phrase.h
@@ -0,0 +1,15 @@
+#ifndef _OS_PHRASE_H_
+#define _OS_PHRASE_H_
+
+#include <iostream>
+#include <vector>
+#include "tdict.h"
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
+ os << '[';
+ for (int i = 0; i < p.size(); ++i)
+ os << (i==0 ? "" : " ") << TD::Convert(p[i]);
+ return os << ']';
+}
+
+#endif
diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc
index 7b60ef23..c2c52760 100644
--- a/gi/pf/pfbrat.cc
+++ b/gi/pf/pfbrat.cc
@@ -191,7 +191,7 @@ struct UniphraseLM {
void ResampleHyperparameters(MT19937* rng) {
phrases_.resample_hyperparameters(rng);
gen_.resample_hyperparameters(rng);
- cerr << " " << phrases_.concentration();
+ cerr << " " << phrases_.alpha();
}
CCRP_NoTable<vector<int> > phrases_;
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
index aae5f798..3d578db2 100644
--- a/gi/pf/pfdist.cc
+++ b/gi/pf/pfdist.cc
@@ -7,7 +7,7 @@
#include <boost/program_options/variables_map.hpp>
#include "pf.h"
-#include "base_measures.h"
+#include "base_distributions.h"
#include "reachability.h"
#include "viterbi.h"
#include "hg.h"
@@ -315,7 +315,7 @@ struct BackwardEstimate {
for (int i = 0; i < src_cov.size(); ++i)
if (!src_cov[i]) r.push_back(src_[i]);
const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
for (unsigned j = trg_cov; j < trg_.size(); ++j) {
prob_t p;
for (unsigned i = 0; i < r.size(); ++i)
@@ -352,7 +352,7 @@ struct BackwardEstimateSym {
if (!src_cov[i]) r.push_back(src_[i]);
r.push_back(0); // NULL word
const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
for (unsigned j = trg_cov; j < trg_.size(); ++j) {
prob_t p;
for (unsigned i = 0; i < r.size(); ++i)
@@ -367,7 +367,7 @@ struct BackwardEstimateSym {
r.pop_back();
const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
prob_t inv;
- inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+ inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));
for (unsigned i = 0; i < r.size(); ++i) {
prob_t p;
for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index 728ec00d..e1a53f5c 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -7,7 +7,7 @@
#include <boost/program_options/variables_map.hpp>
#include "pf.h"
-#include "base_measures.h"
+#include "base_distributions.h"
#include "monotonic_pseg.h"
#include "reachability.h"
#include "viterbi.h"
@@ -77,7 +77,7 @@ struct BackwardEstimateSym {
r.push_back(src_[i]);
r.push_back(0); // NULL word
const prob_t uniform_alignment(1.0 / r.size());
- e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+ e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
for (unsigned j = trg_cov; j < trg_.size(); ++j) {
prob_t p;
for (unsigned i = 0; i < r.size(); ++i)
@@ -92,7 +92,7 @@ struct BackwardEstimateSym {
r.pop_back();
const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
prob_t inv;
- inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+ inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));
for (unsigned i = 0; i < r.size(); ++i) {
prob_t p;
for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
new file mode 100644
index 00000000..91029688
--- /dev/null
+++ b/gi/pf/pyp_lm.cc
@@ -0,0 +1,209 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "corpus_tools.h"
+#include "m.h"
+#include "tdict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "tied_resampler.h"
+
+// A not very memory-efficient implementation of an N-gram LM based on PYPs
+// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
+// based on Pitman-Yor Processes. In Proc. ACL.
+
+// I use templates to handle the recursive formalation of the prior, so
+// the order of the model has to be specified here, at compile time:
+#define kORDER 3
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples")
+ ("train,i",po::value<string>(),"Training data file")
+ ("test,T",po::value<string>(),"Test data file")
+ ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
+ ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this")
+ ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this")
+ ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("train") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+template <unsigned N> struct PYPLM;
+
+// uniform base distribution (0-gram model)
+template<> struct PYPLM<0> {
+ PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
+ void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
+ void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
+ double prob(WordID, const vector<WordID>&) const { return p0; }
+ void resample_hyperparameters(MT19937*) {}
+ double log_likelihood() const { return draws * log(p0); }
+ const double p0;
+ int draws;
+};
+
+// represents an N-gram LM
+template <unsigned N> struct PYPLM {
+ PYPLM(unsigned vs, double da, double db, double ss, double sr) :
+ backoff(vs, da, db, ss, sr),
+ tr(da, db, ss, sr, 0.8, 1.0),
+ lookup(N-1) {}
+ void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
+ const double bo = backoff.prob(w, context);
+ for (unsigned i = 0; i < N-1; ++i)
+ lookup[i] = context[context.size() - 1 - i];
+ typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
+ if (it == p.end()) {
+ it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first;
+ tr.Add(&it->second); // add to resampler
+ }
+ if (it->second.increment(w, bo, rng))
+ backoff.increment(w, context, rng);
+ }
+ void decrement(WordID w, const vector<WordID>& context, MT19937* rng) {
+ for (unsigned i = 0; i < N-1; ++i)
+ lookup[i] = context[context.size() - 1 - i];
+ typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
+ assert(it != p.end());
+ if (it->second.decrement(w, rng))
+ backoff.decrement(w, context, rng);
+ }
+ double prob(WordID w, const vector<WordID>& context) const {
+ const double bo = backoff.prob(w, context);
+ for (unsigned i = 0; i < N-1; ++i)
+ lookup[i] = context[context.size() - 1 - i];
+ typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup);
+ if (it == p.end()) return bo;
+ return it->second.prob(w, bo);
+ }
+
+ double log_likelihood() const {
+ double llh = backoff.log_likelihood();
+ typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
+ for (it = p.begin(); it != p.end(); ++it)
+ llh += it->second.log_crp_prob();
+ llh += tr.LogLikelihood();
+ return llh;
+ }
+
+ void resample_hyperparameters(MT19937* rng) {
+ tr.ResampleHyperparameters(rng);
+ backoff.resample_hyperparameters(rng);
+ }
+
+ PYPLM<N-1> backoff;
+ TiedResampler<CCRP<WordID> > tr;
+ double discount_a, discount_b, strength_s, strength_r;
+ double d, strength;
+ mutable vector<WordID> lookup; // thread-local
+ unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
+};
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+
+ InitCommandLine(argc, argv, &conf);
+ const unsigned samples = conf["samples"].as<unsigned>();
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+ MT19937& rng = *prng;
+ vector<vector<WordID> > corpuse;
+ set<WordID> vocabe;
+ const WordID kEOS = TD::Convert("</s>");
+ cerr << "Reading corpus...\n";
+ CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe);
+ cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+ vector<vector<WordID> > test;
+ if (conf.count("test"))
+ CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
+ else
+ test = corpuse;
+ PYPLM<kORDER> lm(vocabe.size(),
+ conf["discount_prior_a"].as<double>(),
+ conf["discount_prior_b"].as<double>(),
+ conf["strength_prior_s"].as<double>(),
+ conf["strength_prior_r"].as<double>());
+ vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));
+ for (int SS=0; SS < samples; ++SS) {
+ for (int ci = 0; ci < corpuse.size(); ++ci) {
+ ctx.resize(kORDER - 1);
+ const vector<WordID>& s = corpuse[ci];
+ for (int i = 0; i <= s.size(); ++i) {
+ WordID w = (i < s.size() ? s[i] : kEOS);
+ if (SS > 0) lm.decrement(w, ctx, &rng);
+ lm.increment(w, ctx, &rng);
+ ctx.push_back(w);
+ }
+ if (SS > 0) lm.decrement(kEOS, ctx, &rng);
+ lm.increment(kEOS, ctx, &rng);
+ }
+ if (SS % 10 == 9) {
+ cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
+ if (SS % 20 == 19) lm.resample_hyperparameters(&rng);
+ } else { cerr << '.' << flush; }
+ }
+ double llh = 0;
+ unsigned cnt = 0;
+ unsigned oovs = 0;
+ for (int ci = 0; ci < test.size(); ++ci) {
+ ctx.resize(kORDER - 1);
+ const vector<WordID>& s = test[ci];
+ for (int i = 0; i <= s.size(); ++i) {
+ WordID w = (i < s.size() ? s[i] : kEOS);
+ double lp = log(lm.prob(w, ctx)) / log(2);
+ if (i < s.size() && vocabe.count(w) == 0) {
+ cerr << "**OOV ";
+ ++oovs;
+ lp = 0;
+ }
+ cerr << "p(" << TD::Convert(w) << " |";
+ for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j)
+ cerr << ' ' << TD::Convert(ctx[j]);
+ cerr << ") = " << lp << endl;
+ ctx.push_back(w);
+ llh -= lp;
+ cnt++;
+ }
+ }
+ cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl;
+ cerr << " Count: " << cnt << endl;
+ cerr << " OOVs: " << oovs << endl;
+ cerr << "Cross-entropy: " << (llh / cnt) << endl;
+ cerr << " Perplexity: " << pow(2, llh / cnt) << endl;
+ return 0;
+}
+
+
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
new file mode 100644
index 00000000..e21f0267
--- /dev/null
+++ b/gi/pf/pyp_tm.cc
@@ -0,0 +1,131 @@
+#include "pyp_tm.h"
+
+#include <tr1/unordered_map>
+#include <iostream>
+#include <queue>
+
+#include "tdict.h"
+#include "ccrp.h"
+#include "pyp_word_model.h"
+#include "tied_resampler.h"
+
+using namespace std;
+using namespace std::tr1;
+
+struct FreqBinner {
+ FreqBinner(const std::string& fname) { fd_.Load(fname); }
+ unsigned NumberOfBins() const { return fd_.Max() + 1; }
+ unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
+ FreqDict<unsigned> fd_;
+};
+
+template <typename Base, class Binner = FreqBinner>
+struct ConditionalPYPWordModel {
+ ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
+ base(*b),
+ binner(bnr),
+ btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
+
+ void Summary() const {
+ cerr << "Number of conditioning contexts: " << r.size() << endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
+ for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl;
+ }
+ }
+
+ void ResampleHyperparameters(MT19937* rng) {
+ btr.ResampleHyperparameters(rng);
+ }
+
+ prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
+ RuleModelHash::const_iterator it = r.find(src);
+ if (it == r.end()) {
+ return base(trglets);
+ } else {
+ return it->second.prob(trglets, base(trglets));
+ }
+ }
+
+ void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+ RuleModelHash::iterator it = r.find(src);
+ if (it == r.end()) {
+ it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
+ static const WordID kNULL = TD::Convert("NULL");
+ unsigned bin = (src == kNULL ? 0 : 1);
+ if (binner && bin) { bin = binner->Bin(src) + 1; }
+ btr.Add(bin, &it->second);
+ }
+ if (it->second.increment(trglets, base(trglets), rng))
+ base.Increment(trglets, rng);
+ }
+
+ void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+ RuleModelHash::iterator it = r.find(src);
+ assert(it != r.end());
+ if (it->second.decrement(trglets, rng)) {
+ base.Decrement(trglets, rng);
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = prob_t::One();
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ prob_t q; q.logeq(it->second.log_crp_prob());
+ p *= q;
+ }
+ return p;
+ }
+
+ unsigned UniqueConditioningContexts() const {
+ return r.size();
+ }
+
+ // TODO tie PYP hyperparameters based on source word frequency bins
+ Base& base;
+ const Binner* binner;
+ BinTiedResampler<CCRP<vector<WordID> > > btr;
+ typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+};
+
+PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
+ const unsigned num_letters) :
+ letters(lets),
+ up0(new PYPWordModel(num_letters)),
+ tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
+ kX(-TD::Convert("X")) {}
+
+void PYPLexicalTranslation::Summary() const {
+ tmodel->Summary();
+ up0->Summary();
+}
+
+prob_t PYPLexicalTranslation::Likelihood() const {
+ prob_t p = up0->Likelihood();
+ p *= tmodel->Likelihood();
+ return p;
+}
+
+void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
+ tmodel->ResampleHyperparameters(rng);
+ up0->ResampleHyperparameters(rng);
+}
+
+unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
+ return tmodel->UniqueConditioningContexts();
+}
+
+prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
+ return tmodel->Prob(src, letters[trg]);
+}
+
+void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
+ tmodel->Increment(src, letters[trg], rng);
+}
+
+void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
+ tmodel->Decrement(src, letters[trg], rng);
+}
+
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
new file mode 100644
index 00000000..63e7c96d
--- /dev/null
+++ b/gi/pf/pyp_tm.h
@@ -0,0 +1,35 @@
+#ifndef PYP_LEX_TRANS
+#define PYP_LEX_TRANS
+
+#include <vector>
+#include "wordid.h"
+#include "prob.h"
+#include "sampler.h"
+#include "freqdict.h"
+
+struct FreqBinner;
+struct PYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
+
+struct PYPLexicalTranslation {
+ explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
+ const unsigned num_letters);
+
+ prob_t Likelihood() const;
+
+ void ResampleHyperparameters(MT19937* rng);
+ prob_t Prob(WordID src, WordID trg) const; // return p(trg | src)
+ void Summary() const;
+ void Increment(WordID src, WordID trg, MT19937* rng);
+ void Decrement(WordID src, WordID trg, MT19937* rng);
+ unsigned UniqueConditioningContexts() const;
+
+ private:
+ const std::vector<std::vector<WordID> >& letters; // spelling dictionary
+ PYPWordModel* up0; // base distribuction (model English word)
+ ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel; // translation distributions
+ // (model English word | French word)
+ const WordID kX;
+};
+
+#endif
diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc
new file mode 100644
index 00000000..12df4abf
--- /dev/null
+++ b/gi/pf/pyp_word_model.cc
@@ -0,0 +1,20 @@
+#include "pyp_word_model.h"
+
+#include <iostream>
+
+using namespace std;
+
+void PYPWordModel::ResampleHyperparameters(MT19937* rng) {
+ r.resample_hyperparameters(rng);
+ cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n";
+}
+
+void PYPWordModel::Summary() const {
+ cerr << "PYPWordModel: generations=" << r.num_customers()
+ << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
+ for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+ cerr << " " << it->second.total_dish_count_
+ << " (on " << it->second.table_counts_.size() << " tables) "
+ << TD::GetString(it->first) << endl;
+}
+
diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
new file mode 100644
index 00000000..ff366865
--- /dev/null
+++ b/gi/pf/pyp_word_model.h
@@ -0,0 +1,58 @@
+#ifndef _PYP_WORD_MODEL_H_
+#define _PYP_WORD_MODEL_H_
+
+#include <iostream>
+#include <cmath>
+#include <vector>
+#include "prob.h"
+#include "ccrp.h"
+#include "m.h"
+#include "tdict.h"
+#include "os_phrase.h"
+
+// PYP(d,s,poisson-uniform) represented as a CRP
+struct PYPWordModel {
+ explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) :
+ base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}
+
+ void ResampleHyperparameters(MT19937* rng);
+
+ inline prob_t operator()(const std::vector<WordID>& s) const {
+ return r.prob(s, p0(s));
+ }
+
+ inline void Increment(const std::vector<WordID>& s, MT19937* rng) {
+ if (r.increment(s, p0(s), rng))
+ base *= p0(s);
+ }
+
+ inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) {
+ if (r.decrement(s, rng))
+ base /= p0(s);
+ }
+
+ inline prob_t Likelihood() const {
+ prob_t p; p.logeq(r.log_crp_prob());
+ p *= base;
+ return p;
+ }
+
+ void Summary() const;
+
+ private:
+ inline double logp0(const std::vector<WordID>& s) const {
+ return Md::log_poisson(s.size(), mean_length) + s.size() * u0;
+ }
+
+ inline prob_t p0(const std::vector<WordID>& s) const {
+ prob_t p; p.logeq(logp0(s));
+ return p;
+ }
+
+ prob_t base; // keeps track of the draws from the base distribution
+ CCRP<std::vector<WordID> > r;
+ const double u0; // uniform log prob of generating a letter
+ const double mean_length; // mean length of a word in the base distribution
+};
+
+#endif
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
new file mode 100644
index 00000000..588c8f84
--- /dev/null
+++ b/gi/pf/quasi_model2.h
@@ -0,0 +1,166 @@
+#ifndef _QUASI_MODEL2_H_
+#define _QUASI_MODEL2_H_
+
+#include <vector>
+#include <cmath>
+#include <tr1/unordered_map>
+#include "boost/functional.hpp"
+#include "prob.h"
+#include "array2d.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+struct AlignmentObservation {
+ AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
+ AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) :
+ src_len(sl), trg_len(tl), j(tw), a_j(sw) {}
+ unsigned short src_len;
+ unsigned short trg_len;
+ unsigned short j;
+ unsigned short a_j;
+};
+
+inline size_t hash_value(const AlignmentObservation& o) {
+ return reinterpret_cast<const size_t&>(o);
+}
+
+inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) {
+ return hash_value(a) == hash_value(b);
+}
+
+struct QuasiModel2 {
+ explicit QuasiModel2(double alpha, double pnull = 0.1) :
+ alpha_(alpha),
+ pnull_(pnull),
+ pnotnull_(1 - pnull) {}
+
+ // a_j = 0 => NULL; src_len does *not* include null
+ prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
+ if (!a_j) return pnull_;
+ return pnotnull_ *
+ prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len));
+ }
+
+ void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
+ assert(a_j <= src_len);
+ assert(j < trg_len);
+ ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)];
+ }
+
+ void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
+ const AlignmentObservation ao(src_len, trg_len, j, a_j);
+ int &cc = obs_[ao];
+ assert(cc > 0);
+ --cc;
+ if (!cc) obs_.erase(ao);
+ }
+
+ struct PNullResampler {
+ PNullResampler(const QuasiModel2& m) : m_(m) {}
+ const QuasiModel2& m_;
+ double operator()(const double& proposed_pnull) const {
+ return log(m_.Likelihood(m_.alpha_, proposed_pnull));
+ }
+ };
+
+ struct AlphaResampler {
+ AlphaResampler(const QuasiModel2& m) : m_(m) {}
+ const QuasiModel2& m_;
+ double operator()(const double& proposed_alpha) const {
+ return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float()));
+ }
+ };
+
+ void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+ const PNullResampler dr(*this);
+ const AlphaResampler ar(*this);
+ for (unsigned i = 0; i < nloop; ++i) {
+ double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001,
+ 1.0, 0.0, niterations, 100*niterations);
+ pnull_ = prob_t(pnull);
+ alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ }
+ std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null="
+ << pnull_.as_float() << ") = " << Likelihood() << std::endl;
+ zcache_.clear();
+ }
+
+ prob_t Likelihood() const {
+ return Likelihood(alpha_, pnull_.as_float());
+ }
+
+ prob_t Likelihood(double alpha, double ppnull) const {
+ const prob_t pnull(ppnull);
+ const prob_t pnotnull(1 - ppnull);
+
+ prob_t p;
+ p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure
+ assert(!p.is_0());
+ prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10));
+ assert(!prob_of_ppnull.is_0());
+ p *= prob_of_ppnull;
+ for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
+ const AlignmentObservation& ao = it->first;
+ if (ao.a_j) {
+ prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+ prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+ prob_t pa(u / z);
+ pa *= pnotnull;
+ pa.poweq(it->second);
+ p *= pa;
+ } else {
+ p *= pnull.pow(it->second);
+ }
+ }
+ return p;
+ }
+
+ private:
+ static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+ prob_t p;
+ p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+ return p;
+ }
+
+ static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+ prob_t z = prob_t::Zero();
+ for (int a_j = 1; a_j <= src_len; ++a_j)
+ z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha);
+ return z;
+ }
+
+ static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+ return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+ }
+
+ static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+ double z = 0;
+ for (int a_j = 1; a_j <= src_len; ++a_j)
+ z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha);
+ return z;
+ }
+
+ const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
+ if (src_len >= zcache_.size())
+ zcache_.resize(src_len + 1);
+ if (trg_len >= zcache_[src_len].size())
+ zcache_[src_len].resize(trg_len + 1);
+ std::vector<double>& zv = zcache_[src_len][trg_len];
+ if (zv.size() == 0)
+ zv.resize(trg_len);
+ double& z = zv[j];
+ if (!z)
+ z = ComputeZ(j, src_len, trg_len, alpha_);
+ return z;
+ }
+
+ double alpha_;
+ prob_t pnull_;
+ prob_t pnotnull_;
+ mutable std::vector<std::vector<std::vector<double> > > zcache_;
+ typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount;
+ ObsCount obs_;
+};
+
+#endif
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index 73dd8d39..7d0d04ac 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -30,15 +30,17 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
}
}
a[0][0].clear();
- //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
- if (a[srclen][trglen].size() == 0) {
- cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+ //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+ if (a[srclen][trglen].empty()) {
+ cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+ nodes = 0;
return;
}
typedef boost::multi_array<bool, 2> rarray_type;
rarray_type r(boost::extents[srclen + 1][trglen + 1]);
r[srclen][trglen] = true;
+ nodes = 0;
for (int i = srclen; i >= 0; --i) {
for (int j = trglen; j >= 0; --j) {
vector<SState>& prevs = a[i][j];
@@ -47,6 +49,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
int src_delta = i - prevs[k].prev_src_covered;
edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+ valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered));
short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
if (src_delta > msd) msd = src_delta;
}
@@ -56,9 +59,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
assert(!edges[0][0][0][1]);
assert(!edges[0][0][0][0]);
assert(max_src_delta[0][0] > 0);
- //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
- //for (int i = 0; i < b[0][0].size(); ++i) {
- // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
- //}
+ nodes = 0;
+ for (int i = 0; i < srclen; ++i) {
+ for (int j = 0; j < trglen; ++j) {
+ if (valid_deltas[i][j].size() > 0) {
+ node_addresses[i][j] = nodes++;
+ } else {
+ node_addresses[i][j] = -1;
+ }
+ }
+ }
+ cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n";
}
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 98450ec1..1e22c76a 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -12,12 +12,18 @@
// currently forbids 0 -> n and n -> 0 alignments
struct Reachability {
- boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring?
+ unsigned nodes;
+ boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+ boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
+ boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+ nodes(),
edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
- max_src_delta(boost::extents[srclen][trglen]) {
+ max_src_delta(boost::extents[srclen][trglen]),
+ node_addresses(boost::extents[srclen][trglen]),
+ valid_deltas(boost::extents[srclen][trglen]) {
ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
}
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
new file mode 100644
index 00000000..6f45fbce
--- /dev/null
+++ b/gi/pf/tied_resampler.h
@@ -0,0 +1,124 @@
+#ifndef _TIED_RESAMPLER_H_
+#define _TIED_RESAMPLER_H_
+
+#include <set>
+#include <vector>
+#include "sampler.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+template <class CRP>
+struct TiedResampler {
+ explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) :
+ d_alpha(da),
+ d_beta(db),
+ s_shape(ss),
+ s_rate(sr),
+ discount(d),
+ strength(s) {}
+
+ void Add(CRP* crp) {
+ crps.insert(crp);
+ crp->set_discount(discount);
+ crp->set_strength(strength);
+ assert(!crp->has_discount_prior());
+ assert(!crp->has_strength_prior());
+ }
+
+ void Remove(CRP* crp) {
+ crps.erase(crp);
+ }
+
+ size_t size() const {
+ return crps.size();
+ }
+
+ double LogLikelihood(double d, double s) const {
+ if (s <= -d) return -std::numeric_limits<double>::infinity();
+ double llh = Md::log_beta_density(d, d_alpha, d_beta) +
+ Md::log_gamma_density(d + s, s_shape, s_rate);
+ for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it)
+ llh += (*it)->log_crp_prob(d, s);
+ return llh;
+ }
+
+ double LogLikelihood() const {
+ return LogLikelihood(discount, strength);
+ }
+
+ struct DiscountResampler {
+ DiscountResampler(const TiedResampler& m) : m_(m) {}
+ const TiedResampler& m_;
+ double operator()(const double& proposed_discount) const {
+ return m_.LogLikelihood(proposed_discount, m_.strength);
+ }
+ };
+
+ struct AlphaResampler {
+ AlphaResampler(const TiedResampler& m) : m_(m) {}
+ const TiedResampler& m_;
+ double operator()(const double& proposed_strength) const {
+ return m_.LogLikelihood(m_.discount, proposed_strength);
+ }
+ };
+
+ void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+ if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; }
+ const DiscountResampler dr(*this);
+ const AlphaResampler ar(*this);
+ for (int iter = 0; iter < nloop; ++iter) {
+ strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ double min_discount = std::numeric_limits<double>::min();
+ if (strength < 0.0) min_discount -= strength;
+ discount = slice_sampler1d(dr, discount, *rng, min_discount,
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+ strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ std::cerr << "TiedCRPs(d=" << discount << ",s="
+ << strength << ") = " << LogLikelihood(discount, strength) << std::endl;
+ for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) {
+ (*it)->set_discount(discount);
+ (*it)->set_strength(strength);
+ }
+ }
+ private:
+ std::set<CRP*> crps;
+ const double d_alpha, d_beta, s_shape, s_rate;
+ double discount, strength;
+};
+
+// split according to some criterion
+template <class CRP>
+struct BinTiedResampler {
+ explicit BinTiedResampler(unsigned nbins) :
+ resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {}
+
+ void Add(unsigned bin, CRP* crp) {
+ resamplers[bin].Add(crp);
+ }
+
+ void Remove(unsigned bin, CRP* crp) {
+ resamplers[bin].Remove(crp);
+ }
+
+ void ResampleHyperparameters(MT19937* rng) {
+ for (unsigned i = 0; i < resamplers.size(); ++i) {
+ std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush;
+ resamplers[i].ResampleHyperparameters(rng);
+ }
+ }
+
+ double LogLikelihood() const {
+ double llh = 0;
+ for (unsigned i = 0; i < resamplers.size(); ++i)
+ llh += resamplers[i].LogLikelihood();
+ return llh;
+ }
+
+ private:
+ std::vector<TiedResampler<CRP> > resamplers;
+};
+
+#endif
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
new file mode 100644
index 00000000..2200715e
--- /dev/null
+++ b/gi/pf/transliterations.cc
@@ -0,0 +1,334 @@
+#include "transliterations.h"
+
+#include <iostream>
+#include <vector>
+
+#include "boost/shared_ptr.hpp"
+
+#include "backward.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "trule.h"
+#include "filelib.h"
+#include "ccrp_nt.h"
+#include "m.h"
+#include "reachability.h"
+
+using namespace std;
+using namespace std::tr1;
+
+struct TruncatedConditionalLengthModel {
+ TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+ plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
+ for (unsigned i = 1; i <= max_src_size; ++i) {
+ prob_t z = prob_t::Zero();
+ for (unsigned j = 1; j <= max_trg_size; ++j)
+ z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio))));
+ for (unsigned j = 1; j <= max_trg_size; ++j)
+ plens[i][j] /= z;
+ //for (unsigned j = 1; j <= max_trg_size; ++j)
+ // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl;
+ }
+ }
+
+ // return p(tlen | slen) for *chunks* not full words
+ inline const prob_t& operator()(int slen, int tlen) const {
+ return plens[slen][tlen];
+ }
+
+ vector<vector<prob_t> > plens;
+};
+
+struct CondBaseDist {
+ CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+ tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
+
+ prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
+ const vector<WordID>& trg, unsigned tf, unsigned tt) const {
+ prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len)
+ assert(!"not impl");
+ return p;
+ }
+ inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
+ return (*this)(src, 0, src.size(), trg, 0, trg.size());
+ }
+ TruncatedConditionalLengthModel tclm;
+};
+
+// represents transliteration phrase probabilities, e.g.
+// p( a l - | A l ) , p( o | A w ) , ...
+struct TransliterationChunkConditionalModel {
+ explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
+ d(0.0),
+ strength(1.0),
+ rp0(pp0) {
+ }
+
+ void Summary() const {
+ std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+ for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
+ }
+ }
+
+ int DecrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ assert(it != r.end());
+ int count = it->second.decrement(rule);
+ if (count) {
+ if (it->second.num_customers() == 0) r.erase(it);
+ }
+ return count;
+ }
+
+ int IncrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
+ }
+ int count = it->second.increment(rule);
+ return count;
+ }
+
+ void IncrementRules(const std::vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const std::vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p;
+ RuleModelHash::const_iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ p = rp0(rule.f_, rule.e_);
+ } else {
+ p = it->second.prob(rule, rp0(rule.f_, rule.e_));
+ }
+ return p;
+ }
+
+ double LogLikelihood(const double& dd, const double& aa) const {
+ if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+ //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+ double llh = //Md::log_beta_density(dd, 1, 1) +
+ Md::log_gamma_density(dd + aa, 1, 1);
+ typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
+ for (it = r.begin(); it != r.end(); ++it)
+ llh += it->second.log_crp_prob(aa);
+ return llh;
+ }
+
+ struct AlphaResampler {
+ AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
+ const TransliterationChunkConditionalModel& m_;
+ double operator()(const double& proposed_strength) const {
+ return m_.LogLikelihood(m_.d, proposed_strength);
+ }
+ };
+
+ void ResampleHyperparameters(MT19937* rng) {
+ typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+ //const unsigned nloop = 5;
+ const unsigned niterations = 10;
+ //DiscountResampler dr(*this);
+ AlphaResampler ar(*this);
+#if 0
+ for (int iter = 0; iter < nloop; ++iter) {
+ strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ double min_discount = std::numeric_limits<double>::min();
+ if (strength < 0.0) min_discount -= strength;
+ d = slice_sampler1d(dr, d, *rng, min_discount,
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+#endif
+ strength = slice_sampler1d(ar, strength, *rng, -d,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
+ for (it = r.begin(); it != r.end(); ++it) {
+#if 0
+ it->second.set_discount(d);
+#endif
+ it->second.set_alpha(strength);
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(LogLikelihood(d, strength));
+ return p;
+ }
+
+ const CondBaseDist& rp0;
+ typedef std::tr1::unordered_map<std::vector<WordID>,
+ CCRP_NoTable<TRule>,
+ boost::hash<std::vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+ double d, strength;
+};
+
+struct GraphStructure {
+ GraphStructure() : r() {}
+ // leak memory - these are basically static
+ const Reachability* r;
+ bool IsReachable() const { return r->nodes > 0; }
+};
+
+struct ProbabilityEstimates {
+ ProbabilityEstimates() : gs(), backward() {}
+ explicit ProbabilityEstimates(const GraphStructure& g) :
+ gs(&g), backward() {
+ if (g.r->nodes > 0)
+ backward = new float[g.r->nodes];
+ }
+ // leak memory, these are static
+
+ // returns an estimate of the marginal probability
+ double MarginalEstimate() const {
+ if (!backward) return 0;
+ return backward[0];
+ }
+
+ // returns an backward estimate
+ double Backward(int src_covered, int trg_covered) const {
+ if (!backward) return 0;
+ int ind = gs->r->node_addresses[src_covered][trg_covered];
+ if (ind < 0) return 0;
+ return backward[ind];
+ }
+
+ prob_t estp;
+ float* backward;
+ private:
+ const GraphStructure* gs;
+};
+
+struct TransliterationsImpl {
+ TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
+ cp0(max_src, max_trg, sr),
+ tccm(cp0),
+ be(b),
+ kMAX_SRC_CHUNK(max_src),
+ kMAX_TRG_CHUNK(max_trg),
+ kS2T_RATIO(sr),
+ tot_pairs(), tot_mem() {
+ }
+ const CondBaseDist cp0;
+ TransliterationChunkConditionalModel tccm;
+ const BackwardEstimator& be;
+
+ void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+ const size_t src_len = src_lets.size();
+ const size_t trg_len = trg_lets.size();
+
+ // init graph structure
+ if (src_len >= graphs.size()) graphs.resize(src_len + 1);
+ if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
+ GraphStructure& gs = graphs[src_len][trg_len];
+ if (!gs.r) {
+ double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
+ if (rat > 1.5 || (rat > 2.4 && src_len < 6)) {
+ cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
+ gs.r = new Reachability(src_len, trg_len, 0, 0);
+ } else {
+ gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+ }
+ }
+
+ const Reachability& r = *gs.r;
+
+ // init backward estimates
+ if (src >= ests.size()) ests.resize(src + 1);
+ unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
+ if (it != ests[src].end()) return; // already initialized
+
+ it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
+ ProbabilityEstimates& est = it->second;
+ if (!gs.r->nodes) return; // not derivable subject to length constraints
+
+ be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
+ cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
+ tot_pairs++;
+ tot_mem += sizeof(float) * gs.r->nodes;
+ }
+
+ void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+ const size_t src_len = src_lets.size();
+ const size_t trg_len = trg_lets.size();
+ // TODO
+ }
+
+ prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
+ assert(src.size() < graphs.size());
+ const vector<GraphStructure>& tv = graphs[src.size()];
+ assert(trg.size() < tv.size());
+ const GraphStructure& gs = tv[trg.size()];
+ if (gs.r->nodes == 0)
+ return prob_t::Zero();
+ const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
+ assert(it != ests[s].end());
+ return it->second.estp;
+ }
+
+ void GraphSummary() const {
+ double to = 0;
+ double tn = 0;
+ double tt = 0;
+ for (int i = 0; i < graphs.size(); ++i) {
+ const vector<GraphStructure>& vt = graphs[i];
+ for (int j = 0; j < vt.size(); ++j) {
+ const GraphStructure& gs = vt[j];
+ if (!gs.r) continue;
+ tt++;
+ for (int k = 0; k < i; ++k) {
+ for (int l = 0; l < j; ++l) {
+ size_t c = gs.r->valid_deltas[k][l].size();
+ if (c) {
+ tn += 1;
+ to += c;
+ }
+ }
+ }
+ }
+ }
+ cerr << " Average nodes = " << (tn / tt) << endl;
+ cerr << "Average out-degree = " << (to / tn) << endl;
+ cerr << " Unique structures = " << tt << endl;
+ cerr << " Unique pairs = " << tot_pairs << endl;
+ cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;
+ }
+
+ const int kMAX_SRC_CHUNK;
+ const int kMAX_TRG_CHUNK;
+ const double kS2T_RATIO;
+ unsigned tot_pairs;
+ size_t tot_mem;
+ vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
+ vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
+};
+
+Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
+ pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
+Transliterations::~Transliterations() { delete pimpl_; }
+
+void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+ pimpl_->Initialize(src, src_lets, trg, trg_lets);
+}
+
+prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
+ return pimpl_->EstimateProbability(s, src,t, trg);
+}
+
+void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+ pimpl_->Forbid(src, src_lets, trg, trg_lets);
+}
+
+void Transliterations::GraphSummary() const {
+ pimpl_->GraphSummary();
+}
+
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
new file mode 100644
index 00000000..49d14684
--- /dev/null
+++ b/gi/pf/transliterations.h
@@ -0,0 +1,24 @@
+#ifndef _TRANSLITERATIONS_H_
+#define _TRANSLITERATIONS_H_
+
+#include <vector>
+#include "wordid.h"
+#include "prob.h"
+
+struct BackwardEstimator;
+struct TransliterationsImpl;
+struct Transliterations {
+ // max_src and max_trg indicate how big the transliteration phrases can be
+ // see reachability.h for information about filter_ratio
+ explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be);
+ ~Transliterations();
+ void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
+ void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
+ void GraphSummary() const;
+ prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const;
+ private:
+ TransliterationsImpl* pimpl_;
+};
+
+#endif
+
diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc
new file mode 100644
index 00000000..40829775
--- /dev/null
+++ b/gi/pf/unigrams.cc
@@ -0,0 +1,80 @@
+#include "unigrams.h"
+
+#include <string>
+#include <cmath>
+
+#include "stringlib.h"
+#include "filelib.h"
+
+using namespace std;
+
+void UnigramModel::LoadUnigrams(const string& fname) {
+ cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ string line;
+ istream& in = *rf.stream();
+ assert(in);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\data\\");
+ getline(in, line);
+ size_t pos = line.find("ngram 1=");
+ assert(pos == 0);
+ assert(line.size() > 8);
+ const size_t num_unigrams = atoi(&line[8]);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\1-grams:");
+ for (size_t i = 0; i < num_unigrams; ++i) {
+ getline(in, line);
+ assert(line.size() > 0);
+ pos = line.find('\t');
+ assert(pos > 0);
+ assert(pos + 1 < line.size());
+ const WordID w = TD::Convert(line.substr(pos + 1));
+ line[pos] = 0;
+ float p = atof(&line[0]);
+ if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n";
+ }
+}
+
+void UnigramWordModel::LoadUnigrams(const string& fname) {
+ cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ string line;
+ istream& in = *rf.stream();
+ assert(in);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\data\\");
+ getline(in, line);
+ size_t pos = line.find("ngram 1=");
+ assert(pos == 0);
+ assert(line.size() > 8);
+ const size_t num_unigrams = atoi(&line[8]);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\1-grams:");
+ for (size_t i = 0; i < num_unigrams; ++i) {
+ getline(in, line);
+ assert(line.size() > 0);
+ pos = line.find('\t');
+ assert(pos > 0);
+ assert(pos + 1 < line.size());
+ size_t cur = pos + 1;
+ vector<WordID> w;
+ while (cur < line.size()) {
+ const size_t len = UTF8Len(line[cur]);
+ w.push_back(TD::Convert(line.substr(cur, len)));
+ cur += len;
+ }
+ line[pos] = 0;
+ float p = atof(&line[0]);
+ probs_[w].logeq(p * log(10.0));
+ }
+}
+
diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h
new file mode 100644
index 00000000..1660d1ed
--- /dev/null
+++ b/gi/pf/unigrams.h
@@ -0,0 +1,69 @@
+#ifndef _UNIGRAMS_H_
+#define _UNIGRAMS_H_
+
+#include <vector>
+#include <string>
+#include <tr1/unordered_map>
+#include <boost/functional.hpp>
+
+#include "wordid.h"
+#include "prob.h"
+#include "tdict.h"
+
+struct UnigramModel {
+ explicit UnigramModel(const std::string& fname, unsigned vocab_size) :
+ use_uniform_(fname.size() == 0),
+ uniform_(1.0 / vocab_size),
+ probs_() {
+ if (fname.size() > 0) {
+ probs_.resize(TD::NumWords() + 1);
+ LoadUnigrams(fname);
+ }
+ }
+
+ const prob_t& operator()(const WordID& w) const {
+ assert(w);
+ if (use_uniform_) return uniform_;
+ return probs_[w];
+ }
+
+ private:
+ void LoadUnigrams(const std::string& fname);
+
+ const bool use_uniform_;
+ const prob_t uniform_;
+ std::vector<prob_t> probs_;
+};
+
+
+// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t'
+struct UnigramWordModel {
+ explicit UnigramWordModel(const std::string& fname) :
+ use_uniform_(false),
+ uniform_(1.0),
+ probs_() {
+ LoadUnigrams(fname);
+ }
+
+ explicit UnigramWordModel(const unsigned vocab_size) :
+ use_uniform_(true),
+ uniform_(1.0 / vocab_size),
+ probs_() {}
+
+ const prob_t& operator()(const std::vector<WordID>& s) const {
+ if (use_uniform_) return uniform_;
+ const VectorProbHash::const_iterator it = probs_.find(s);
+ assert(it != probs_.end());
+ return it->second;
+ }
+
+ private:
+ void LoadUnigrams(const std::string& fname);
+
+ const bool use_uniform_;
+ const prob_t uniform_;
+ typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash;
+ VectorProbHash probs_;
+};
+
+#endif