author	Chris Dyer <cdyer@cs.cmu.edu>	2012-03-09 22:23:50 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-03-09 22:23:50 -0500
commit	113317266853abff2e1c0c3e889017d0eee55c93 (patch)
tree	3fb77e29acaf45e1a9a006f8f11fb2b021b5987b
parent	78bf1457f606dd3880c2bc912201c4945d3f85b4 (diff)
moar
-rw-r--r--  gi/pf/Makefile.am            3
-rw-r--r--  gi/pf/align-lexonly-pyp.cc   207
-rw-r--r--  gi/pf/align-tl.cc            18
-rw-r--r--  gi/pf/backward.cc            89
-rw-r--r--  gi/pf/backward.h             33
-rw-r--r--  gi/pf/base_distributions.h   8
-rwxr-xr-x  gi/pf/guess-translits.pl     2
-rw-r--r--  gi/pf/nuisance_test.cc       6
-rw-r--r--  gi/pf/pyp_lm.cc              2
-rw-r--r--  gi/pf/pyp_tm.cc              113
-rw-r--r--  gi/pf/pyp_tm.h               34
-rw-r--r--  gi/pf/pyp_word_model.cc      20
-rw-r--r--  gi/pf/pyp_word_model.h       58
-rw-r--r--  gi/pf/reachability.cc        8
-rw-r--r--  gi/pf/reachability.h         8
-rw-r--r--  gi/pf/transliterations.cc    223
-rw-r--r--  gi/pf/transliterations.h     3
-rw-r--r--  utils/ccrp_nt.h              17
18 files changed, 628 insertions, 224 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 94364c3d..4ce72ba1 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -2,7 +2,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon
noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc
+libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc
nuisance_test_SOURCES = nuisance_test.cc
nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
@@ -10,6 +10,7 @@ nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev
align_lexonly_SOURCES = align-lexonly.cc
align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
align_tl_SOURCES = align-tl.cc
align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 13a3a487..d68a4b8f 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -1,27 +1,18 @@
#include <iostream>
-#include <tr1/memory>
#include <queue>
-#include <boost/multi_array.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
#include "tdict.h"
#include "stringlib.h"
#include "filelib.h"
-#include "dict.h"
+#include "array2d.h"
#include "sampler.h"
-#include "mfcr.h"
#include "corpus.h"
-#include "ngram_base.h"
+#include "pyp_tm.h"
using namespace std;
-using namespace tr1;
namespace po = boost::program_options;
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
}
-shared_ptr<MT19937> prng;
+MT19937* prng;
struct LexicalAlignment {
unsigned char src_index;
@@ -66,159 +57,59 @@ struct AlignedSentencePair {
Array2D<short> posterior;
};
-struct HierarchicalWordBase {
- explicit HierarchicalWordBase(const unsigned vocab_e_size) :
- base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
-
- void ResampleHyperparameters(MT19937* rng) {
- r.resample_hyperparameters(rng);
- }
-
- inline double logp0(const vector<WordID>& s) const {
- return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
- }
-
- // return p0 of rule.e_
- prob_t operator()(const TRule& rule) const {
- v[0].logeq(logp0(rule.e_));
- return r.prob(rule.e_, v.begin(), l.begin());
- }
-
- void Increment(const TRule& rule) {
- v[0].logeq(logp0(rule.e_));
- if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
- base *= v[0] * l[0];
- }
- }
-
- void Decrement(const TRule& rule) {
- if (r.decrement(rule.e_, &*prng).count) {
- base /= prob_t(exp(logp0(rule.e_)));
- }
- }
-
- prob_t Likelihood() const {
- prob_t p; p.logeq(r.log_crp_prob());
- p *= base;
- return p;
+struct Aligner {
+ Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+ corpus(*c),
+ model(lets, num_letters),
+ kNULL(TD::Convert("NULL")) {
+ assert(lets[kNULL].size() == 0);
}
- void Summary() const {
- cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
- for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
- cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
- }
-
- prob_t base;
- MFCR<1,vector<WordID> > r;
- const double u0;
- const vector<prob_t> l;
- mutable vector<prob_t> v;
-};
-
-struct BasicLexicalAlignment {
- explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
- const unsigned words_e,
- const unsigned letters_e,
- vector<AlignedSentencePair>* corp) :
- letters(lets),
- corpus(*corp),
- //up0(words_e),
- //up0("en.chars.1gram", letters_e),
- //up0("en.words.1gram"),
- up0(letters_e),
- //up0("en.chars.2gram"),
- tmodel(up0) {
- }
+ vector<AlignedSentencePair>& corpus;
+ PYPLexicalTranslation model;
+ const WordID kNULL;
- void InstantiateRule(const WordID src,
- const WordID trg,
- TRule* rule) const {
- static const WordID kX = TD::Convert("X") * -1;
- rule->lhs_ = kX;
- rule->e_ = letters[trg];
- rule->f_ = letters[src];
+ void ResampleHyperparameters() {
+ model.ResampleHyperparameters(prng);
}
void InitializeRandom() {
- const WordID kNULL = TD::Convert("NULL");
cerr << "Initializing with random alignments ...\n";
for (unsigned i = 0; i < corpus.size(); ++i) {
AlignedSentencePair& asp = corpus[i];
asp.a.resize(asp.trg.size());
for (unsigned j = 0; j < asp.trg.size(); ++j) {
- const unsigned char a_j = prng->next() * (1 + asp.src.size());
+ unsigned char& a_j = asp.a[j].src_index;
+ a_j = prng->next() * (1 + asp.src.size());
const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- TRule r;
- InstantiateRule(f_a_j, asp.trg[j], &r);
- asp.a[j].is_transliteration = false;
- asp.a[j].src_index = a_j;
- if (tmodel.IncrementRule(r, &*prng))
- up0.Increment(r);
+ model.Increment(f_a_j, asp.trg[j], &*prng);
}
}
- cerr << " LLH = " << Likelihood() << endl;
- }
-
- prob_t Likelihood() const {
- prob_t p = tmodel.Likelihood();
- p *= up0.Likelihood();
- return p;
- }
-
- void ResampleHyperparemeters() {
- tmodel.ResampleHyperparameters(&*prng);
- up0.ResampleHyperparameters(&*prng);
- cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
+ cerr << "Corpus initialized randomly. LLH = " << model.Likelihood() << endl;
}
- void ResampleCorpus();
-
- const vector<vector<WordID> >& letters; // spelling dictionary
- vector<AlignedSentencePair>& corpus;
- //PhraseConditionalUninformativeBase up0;
- //PhraseConditionalUninformativeUnigramBase up0;
- //UnigramWordBase up0;
- //HierarchicalUnigramBase up0;
- HierarchicalWordBase up0;
- //CompletelyUniformBase up0;
- //FixedNgramBase up0;
- //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
- //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
- //ConditionalTranslationModel<UnigramWordBase> tmodel;
- //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
- MConditionalTranslationModel<HierarchicalWordBase> tmodel;
- //ConditionalTranslationModel<FixedNgramBase> tmodel;
- //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
- static const WordID kNULL = TD::Convert("NULL");
- for (unsigned i = 0; i < corpus.size(); ++i) {
- AlignedSentencePair& asp = corpus[i];
- SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
- for (unsigned j = 0; j < asp.trg.size(); ++j) {
- TRule r;
- unsigned char& a_j = asp.a[j].src_index;
- WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- InstantiateRule(f_a_j, asp.trg[j], &r);
- if (tmodel.DecrementRule(r, &*prng))
- up0.Decrement(r);
-
- for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
- const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
- InstantiateRule(prop_f, asp.trg[j], &r);
- ss[prop_a_j] = tmodel.RuleProbability(r);
+ void ResampleCorpus() {
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ unsigned char& a_j = asp.a[j].src_index;
+ const WordID e_j = asp.trg[j];
+ WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ model.Decrement(f_a_j, e_j, prng);
+
+ for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+ const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+ ss[prop_a_j] = model.Prob(prop_f, e_j);
+ }
+ a_j = prng->SelectSample(ss);
+ f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ model.Increment(f_a_j, e_j, prng);
}
- a_j = prng->SelectSample(ss);
- f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
- InstantiateRule(f_a_j, asp.trg[j], &r);
- if (tmodel.IncrementRule(r, &*prng))
- up0.Increment(r);
}
+ cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl;
}
- cerr << " LLH = " << Likelihood() << endl;
-}
+};
void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
@@ -240,8 +131,10 @@ void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID
void Debug(const AlignedSentencePair& asp) {
cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
Array2D<bool> a(asp.src.size(), asp.trg.size());
- for (unsigned j = 0; j < asp.trg.size(); ++j)
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ assert(asp.a[j].src_index <= asp.src.size());
if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+ }
cerr << a << endl;
}
@@ -275,10 +168,9 @@ int main(int argc, char** argv) {
InitCommandLine(argc, argv, &conf);
if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ prng = new MT19937(conf["random_seed"].as<uint32_t>());
else
- prng.reset(new MT19937);
-// MT19937& rng = *prng;
+ prng = new MT19937;
vector<vector<int> > corpuse, corpusf;
set<int> vocabe, vocabf;
@@ -304,23 +196,18 @@ int main(int argc, char** argv) {
ExtractLetters(vocabf, &letters, NULL);
letters[TD::Convert("NULL")].clear();
- BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
- x.InitializeRandom();
+ Aligner aligner(letters, letset.size(), &corpus);
+ aligner.InitializeRandom();
+
const unsigned samples = conf["samples"].as<unsigned>();
for (int i = 0; i < samples; ++i) {
for (int j = 65; j < 67; ++j) Debug(corpus[j]);
- cerr << i << "\t" << x.tmodel.r.size() << "\t";
- if (i % 7 == 6) x.ResampleHyperparemeters();
- x.ResampleCorpus();
+ if (i % 7 == 6) aligner.ResampleHyperparameters();
+ aligner.ResampleCorpus();
if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
}
for (unsigned i = 0; i < corpus.size(); ++i)
WriteAlignments(corpus[i]);
- //ModelAndData posterior(x, &corpus, vocabe, vocabf);
- x.tmodel.Summary();
- x.up0.Summary();
-
- //posterior.Sample();
return 0;
}
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index fc9b7ca5..cbe8c6c8 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -6,6 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "backward.h"
#include "array2d.h"
#include "base_distributions.h"
#include "monotonic_pseg.h"
@@ -30,10 +31,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
("input,i",po::value<string>(),"Read parallel data from")
+ ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities")
+ ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities")
("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of transliterated chunk in source")
("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of transliterated chunk in target")
- ("min_transliterated_src_length", po::value<unsigned>()->default_value(3), "Minimum length of source words considered for transliteration")
- ("filter_ratio", po::value<double>()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable")
+ ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
clo.add_options()
@@ -303,7 +305,7 @@ int main(int argc, char** argv) {
corpusf.clear(); corpuse.clear();
vocabf.insert(TD::Convert("NULL"));
- vector<vector<WordID> > letters(TD::NumWords());
+ vector<vector<WordID> > letters(TD::NumWords() + 1);
set<WordID> letset;
ExtractLetters(vocabe, &letters, &letset);
ExtractLetters(vocabf, &letters, NULL);
@@ -312,9 +314,9 @@ int main(int argc, char** argv) {
// TODO configure this
const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
- const double filter_rat = conf["filter_ratio"].as<double>();
- const int min_trans_src = conf["min_transliterated_src_length"].as<unsigned>();
- Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat);
+ const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>();
+ const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>());
+ Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be);
cerr << "Initializing transliteration graph structures ...\n";
for (int i = 0; i < corpus.size(); ++i) {
@@ -325,8 +327,8 @@ int main(int argc, char** argv) {
for (int k = 0; k < trg.size(); ++k) {
const vector<int>& trg_let = letters[trg[k]];
tl.Initialize(src[j], src_let, trg[k], trg_let);
- if (src_let.size() < min_trans_src)
- tl.Forbid(src[j], src_let, trg[k], trg_let);
+ //if (src_let.size() < min_trans_src)
+ // tl.Forbid(src[j], src_let, trg[k], trg_let);
}
}
}
diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc
new file mode 100644
index 00000000..b92629fd
--- /dev/null
+++ b/gi/pf/backward.cc
@@ -0,0 +1,89 @@
+#include "backward.h"
+
+#include <queue>
+#include <utility>
+
+#include "array2d.h"
+#include "reachability.h"
+#include "base_distributions.h"
+
+using namespace std;
+
+BackwardEstimator::BackwardEstimator(const string& s2t,
+ const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {}
+
+BackwardEstimator::~BackwardEstimator() {
+ delete m1; m1 = NULL;
+ delete m1inv; m1inv = NULL;
+}
+
+float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src,
+ const std::vector<WordID>& trg,
+ unsigned src_covered,
+ unsigned trg_covered,
+ double s2t_ratio) const {
+ if (src_covered == src.size() || trg_covered == trg.size()) {
+ assert(src_covered == src.size());
+ assert(trg_covered == trg.size());
+ return 0;
+ }
+ static const WordID kNULL = TD::Convert("<eps>");
+ const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1));
+ // TODO factor in expected length ratio
+ prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining)
+ for (unsigned j = trg_covered; j < trg.size(); ++j) {
+ prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12);
+ for (unsigned i = src_covered; i < src.size(); ++i)
+ p += (*m1)(src[i], trg[j]);
+ if (p.is_0()) {
+ cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n";
+ assert(!"failed");
+ }
+ p *= uniform_alignment;
+ e *= p;
+ }
+ // TODO factor in expected length ratio
+ const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0));
+ prob_t inv;
+ inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio));
+ for (unsigned i = src_covered; i < src.size(); ++i) {
+ prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12);
+ for (unsigned j = trg_covered; j < trg.size(); ++j)
+ p += (*m1inv)(trg[j], src[i]);
+ if (p.is_0()) {
+ cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n";
+ assert(!"failed");
+ }
+ p *= inv_uniform;
+ inv *= p;
+ }
+ return (log(e) + log(inv)) / 2;
+}
+
+void BackwardEstimator::InitializeGrid(const vector<WordID>& src,
+ const vector<WordID>& trg,
+ const Reachability& r,
+ double s2t_ratio,
+ float* grid) const {
+ queue<pair<int,int> > q;
+ q.push(make_pair(0,0));
+ Array2D<bool> done(src.size()+1, trg.size()+1, false);
+ //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl;
+ while(!q.empty()) {
+ const pair<int,int> n = q.front();
+ q.pop();
+ if (done(n.first,n.second)) continue;
+ done(n.first,n.second) = true;
+
+ float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio);
+ if (n.first == 0 && n.second == 0) grid[0] = lp;
+ //cerr << " " << n.first << "," << n.second << "\t" << lp << endl;
+
+ if (n.first == src.size() || n.second == trg.size()) continue;
+ const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second];
+ for (int i = 0; i < edges.size(); ++i)
+ q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second));
+ }
+ //static int cc = 0; ++cc; if (cc == 80) exit(1);
+}
+
diff --git a/gi/pf/backward.h b/gi/pf/backward.h
new file mode 100644
index 00000000..e67eff0c
--- /dev/null
+++ b/gi/pf/backward.h
@@ -0,0 +1,33 @@
+#ifndef _BACKWARD_H_
+#define _BACKWARD_H_
+
+#include <vector>
+#include <string>
+#include "wordid.h"
+
+struct Reachability;
+struct Model1;
+
+struct BackwardEstimator {
+ BackwardEstimator(const std::string& s2t,
+ const std::string& t2s);
+ ~BackwardEstimator();
+
+ void InitializeGrid(const std::vector<WordID>& src,
+ const std::vector<WordID>& trg,
+ const Reachability& r,
+ double src2trg_ratio,
+ float* grid) const;
+
+ private:
+ float ComputeBackwardProb(const std::vector<WordID>& src,
+ const std::vector<WordID>& trg,
+ unsigned src_covered,
+ unsigned trg_covered,
+ double src2trg_ratio) const;
+
+ Model1* m1;
+ Model1* m1inv;
+};
+
+#endif
diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h
index 0d597c5c..84dacdf2 100644
--- a/gi/pf/base_distributions.h
+++ b/gi/pf/base_distributions.h
@@ -14,13 +14,7 @@
#include "tdict.h"
#include "sampler.h"
#include "m.h"
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
- os << '[';
- for (int i = 0; i < p.size(); ++i)
- os << (i==0 ? "" : " ") << TD::Convert(p[i]);
- return os << ']';
-}
+#include "os_phrase.h"
struct Model1 {
explicit Model1(const std::string& fname) :
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
index aafec13a..d00c2168 100755
--- a/gi/pf/guess-translits.pl
+++ b/gi/pf/guess-translits.pl
@@ -69,4 +69,4 @@ for my $f (keys %fs) {
}
}
print STDERR "Extracted $num pairs.\n";
-print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n";
+print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n";
diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc
index 0f44fe95..fc0af9cb 100644
--- a/gi/pf/nuisance_test.cc
+++ b/gi/pf/nuisance_test.cc
@@ -124,9 +124,9 @@ int main(int argc, char** argv) {
WordID y = TD::Convert("remember");
vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src);
vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg);
- Transliterations xx;
- xx.Initialize(x, src, y, trg);
- return 1;
+// Transliterations xx;
+// xx.Initialize(x, src, y, trg);
+// return 1;
for (int j = 0; j < ITERS; ++j) {
Base b;
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 104f356b..52e6be2c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -18,7 +18,7 @@
// I use templates to handle the recursive formulation of the prior, so
// the order of the model has to be specified here, at compile time:
-#define kORDER 4
+#define kORDER 3
using namespace std;
using namespace tr1;
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
new file mode 100644
index 00000000..94cbe7c3
--- /dev/null
+++ b/gi/pf/pyp_tm.cc
@@ -0,0 +1,113 @@
+#include "pyp_tm.h"
+
+#include <tr1/unordered_map>
+#include <iostream>
+#include <queue>
+
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "tdict.h"
+#include "ccrp.h"
+#include "pyp_word_model.h"
+
+using namespace std;
+using namespace std::tr1;
+
+template <typename Base>
+struct ConditionalPYPWordModel {
+ ConditionalPYPWordModel(Base* b) : base(*b) {}
+
+ void Summary() const {
+ cerr << "Number of conditioning contexts: " << r.size() << endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
+ for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl;
+ }
+ }
+
+ void ResampleHyperparameters(MT19937* rng) {
+ for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+ it->second.resample_hyperparameters(rng);
+ }
+
+ prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
+ RuleModelHash::const_iterator it = r.find(src);
+ if (it == r.end()) {
+ return base(trglets);
+ } else {
+ return it->second.prob(trglets, base(trglets));
+ }
+ }
+
+ void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+ RuleModelHash::iterator it = r.find(src);
+ if (it == r.end())
+ it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+ if (it->second.increment(trglets, base(trglets), rng))
+ base.Increment(trglets, rng);
+ }
+
+ void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+ RuleModelHash::iterator it = r.find(src);
+ assert(it != r.end());
+ if (it->second.decrement(trglets, rng)) {
+ base.Decrement(trglets, rng);
+ if (it->second.num_customers() == 0)
+ r.erase(it);
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = prob_t::One();
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ prob_t q; q.logeq(it->second.log_crp_prob());
+ p *= q;
+ }
+ return p;
+ }
+
+ unsigned UniqueConditioningContexts() const {
+ return r.size();
+ }
+
+ Base& base;
+ typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+};
+
+PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
+ const unsigned num_letters) :
+ letters(lets),
+ up0(new PYPWordModel(num_letters)),
+ tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+ kX(-TD::Convert("X")) {}
+
+prob_t PYPLexicalTranslation::Likelihood() const {
+ prob_t p = up0->Likelihood();
+ p *= tmodel->Likelihood();
+ return p;
+}
+
+void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
+ tmodel->ResampleHyperparameters(rng);
+ up0->ResampleHyperparameters(rng);
+}
+
+unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
+ return tmodel->UniqueConditioningContexts();
+}
+
+prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
+ return tmodel->Prob(src, letters[trg]);
+}
+
+void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
+ tmodel->Increment(src, letters[trg], rng);
+}
+
+void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
+ tmodel->Decrement(src, letters[trg], rng);
+}
+
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
new file mode 100644
index 00000000..fa0fb28f
--- /dev/null
+++ b/gi/pf/pyp_tm.h
@@ -0,0 +1,34 @@
+#ifndef PYP_LEX_TRANS
+#define PYP_LEX_TRANS
+
+#include <vector>
+#include "wordid.h"
+#include "prob.h"
+#include "sampler.h"
+
+struct TRule;
+struct PYPWordModel;
+template <typename T> struct ConditionalPYPWordModel;
+
+struct PYPLexicalTranslation {
+ explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
+ const unsigned num_letters);
+
+ prob_t Likelihood() const;
+
+ void ResampleHyperparameters(MT19937* rng);
+ prob_t Prob(WordID src, WordID trg) const; // return p(trg | src)
+ void Summary() const;
+ void Increment(WordID src, WordID trg, MT19937* rng);
+ void Decrement(WordID src, WordID trg, MT19937* rng);
+ unsigned UniqueConditioningContexts() const;
+
+ private:
+ const std::vector<std::vector<WordID> >& letters; // spelling dictionary
+ PYPWordModel* up0; // base distribution (model English word)
+ ConditionalPYPWordModel<PYPWordModel>* tmodel; // translation distributions
+ // (model English word | French word)
+ const WordID kX;
+};
+
+#endif
diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc
new file mode 100644
index 00000000..12df4abf
--- /dev/null
+++ b/gi/pf/pyp_word_model.cc
@@ -0,0 +1,20 @@
+#include "pyp_word_model.h"
+
+#include <iostream>
+
+using namespace std;
+
+void PYPWordModel::ResampleHyperparameters(MT19937* rng) {
+ r.resample_hyperparameters(rng);
+ cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n";
+}
+
+void PYPWordModel::Summary() const {
+ cerr << "PYPWordModel: generations=" << r.num_customers()
+ << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
+ for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+ cerr << " " << it->second.total_dish_count_
+ << " (on " << it->second.table_counts_.size() << " tables) "
+ << TD::GetString(it->first) << endl;
+}
+
diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
new file mode 100644
index 00000000..800a4fd7
--- /dev/null
+++ b/gi/pf/pyp_word_model.h
@@ -0,0 +1,58 @@
+#ifndef _PYP_WORD_MODEL_H_
+#define _PYP_WORD_MODEL_H_
+
+#include <iostream>
+#include <cmath>
+#include <vector>
+#include "prob.h"
+#include "ccrp.h"
+#include "m.h"
+#include "tdict.h"
+#include "os_phrase.h"
+
+// PYP(d,s,poisson-uniform) represented as a CRP
+struct PYPWordModel {
+ explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) :
+ base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}
+
+ void ResampleHyperparameters(MT19937* rng);
+
+ inline prob_t operator()(const std::vector<WordID>& s) const {
+ return r.prob(s, p0(s));
+ }
+
+ inline void Increment(const std::vector<WordID>& s, MT19937* rng) {
+ if (r.increment(s, p0(s), rng))
+ base *= p0(s);
+ }
+
+ inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) {
+ if (r.decrement(s, rng))
+ base /= p0(s);
+ }
+
+ inline prob_t Likelihood() const {
+ prob_t p; p.logeq(r.log_crp_prob());
+ p *= base;
+ return p;
+ }
+
+ void Summary() const;
+
+ private:
+ inline double logp0(const std::vector<WordID>& s) const {
+ return Md::log_poisson(s.size(), mean_length) + s.size() * u0;
+ }
+
+ inline prob_t p0(const std::vector<WordID>& s) const {
+ prob_t p; p.logeq(logp0(s));
+ return p;
+ }
+
+ prob_t base; // keeps track of the draws from the base distribution
+ CCRP<std::vector<WordID> > r;
+ const double u0; // uniform log prob of generating a letter
+ const double mean_length; // mean length of a word in the base distribution
+};
+
+#endif
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index c10000f2..7d0d04ac 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -12,7 +12,7 @@ struct SState {
int prev_trg_covered;
};
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) {
+void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
typedef boost::multi_array<vector<SState>, 2> array_type;
array_type a(boost::extents[srclen + 1][trglen + 1]);
a[0][0].push_back(SState());
@@ -31,9 +31,9 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
}
a[0][0].clear();
//cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
- size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio);
- if (a[srclen][trglen].size() < min_allowed) {
- cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n";
+ if (a[srclen][trglen].empty()) {
+ cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+ nodes = 0;
return;
}
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 03967d44..1e22c76a 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -18,19 +18,17 @@ struct Reachability {
boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
- // filter_ratio says if the number of outgoing edges from the first cell is less than
- // src_max * trg_max * filter_rat^2 then mark as non reachable
- Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) :
+ Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
nodes(),
edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
max_src_delta(boost::extents[srclen][trglen]),
node_addresses(boost::extents[srclen][trglen]),
valid_deltas(boost::extents[srclen][trglen]) {
- ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio);
+ ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
}
private:
- void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio);
+ void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
};
#endif
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 8ea4ebd2..2200715e 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -5,14 +5,173 @@
#include "boost/shared_ptr.hpp"
+#include "backward.h"
#include "filelib.h"
-#include "ccrp.h"
+#include "tdict.h"
+#include "trule.h"
+#include "filelib.h"
+#include "ccrp_nt.h"
#include "m.h"
#include "reachability.h"
using namespace std;
using namespace std::tr1;
+struct TruncatedConditionalLengthModel {
+ TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+ plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
+ for (unsigned i = 1; i <= max_src_size; ++i) {
+ prob_t z = prob_t::Zero();
+ for (unsigned j = 1; j <= max_trg_size; ++j)
+ z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio))));
+ for (unsigned j = 1; j <= max_trg_size; ++j)
+ plens[i][j] /= z;
+ //for (unsigned j = 1; j <= max_trg_size; ++j)
+ // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl;
+ }
+ }
+
+ // return p(tlen | slen) for *chunks* not full words
+ inline const prob_t& operator()(int slen, int tlen) const {
+ return plens[slen][tlen];
+ }
+
+ vector<vector<prob_t> > plens;
+};
+
+struct CondBaseDist {
+ CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+ tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
+
+ prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
+ const vector<WordID>& trg, unsigned tf, unsigned tt) const {
+ prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len)
+ assert(!"not impl");
+ return p;
+ }
+ inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
+ return (*this)(src, 0, src.size(), trg, 0, trg.size());
+ }
+ TruncatedConditionalLengthModel tclm;
+};
+
+// represents transliteration phrase probabilities, e.g.
+// p( a l - | A l ) , p( o | A w ) , ...
+struct TransliterationChunkConditionalModel {
+ explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
+ d(0.0),
+ strength(1.0),
+ rp0(pp0) {
+ }
+
+ void Summary() const {
+ std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+ for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
+ }
+ }
+
+ int DecrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ assert(it != r.end());
+ int count = it->second.decrement(rule);
+ if (count) {
+ if (it->second.num_customers() == 0) r.erase(it);
+ }
+ return count;
+ }
+
+ int IncrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
+ }
+ int count = it->second.increment(rule);
+ return count;
+ }
+
+ void IncrementRules(const std::vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const std::vector<TRulePtr>& rules) {
+ for (int i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p;
+ RuleModelHash::const_iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ p = rp0(rule.f_, rule.e_);
+ } else {
+ p = it->second.prob(rule, rp0(rule.f_, rule.e_));
+ }
+ return p;
+ }
+
+ double LogLikelihood(const double& dd, const double& aa) const {
+ if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+ //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+ double llh = //Md::log_beta_density(dd, 1, 1) +
+ Md::log_gamma_density(dd + aa, 1, 1);
+ typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
+ for (it = r.begin(); it != r.end(); ++it)
+ llh += it->second.log_crp_prob(aa);
+ return llh;
+ }
+
+ struct AlphaResampler {
+ AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
+ const TransliterationChunkConditionalModel& m_;
+ double operator()(const double& proposed_strength) const {
+ return m_.LogLikelihood(m_.d, proposed_strength);
+ }
+ };
+
+ void ResampleHyperparameters(MT19937* rng) {
+ typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+ //const unsigned nloop = 5;
+ const unsigned niterations = 10;
+ //DiscountResampler dr(*this);
+ AlphaResampler ar(*this);
+#if 0
+ for (int iter = 0; iter < nloop; ++iter) {
+ strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ double min_discount = std::numeric_limits<double>::min();
+ if (strength < 0.0) min_discount -= strength;
+ d = slice_sampler1d(dr, d, *rng, min_discount,
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+#endif
+ strength = slice_sampler1d(ar, strength, *rng, -d,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
+ for (it = r.begin(); it != r.end(); ++it) {
+#if 0
+ it->second.set_discount(d);
+#endif
+ it->second.set_alpha(strength);
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(LogLikelihood(d, strength));
+ return p;
+ }
+
+ const CondBaseDist& rp0;
+ typedef std::tr1::unordered_map<std::vector<WordID>,
+ CCRP_NoTable<TRule>,
+ boost::hash<std::vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+ double d, strength;
+};
+
struct GraphStructure {
GraphStructure() : r() {}
// leak memory - these are basically static
@@ -20,9 +179,9 @@ struct GraphStructure {
bool IsReachable() const { return r->nodes > 0; }
};
-struct BackwardEstimates {
- BackwardEstimates() : gs(), backward() {}
- explicit BackwardEstimates(const GraphStructure& g) :
+struct ProbabilityEstimates {
+ ProbabilityEstimates() : gs(), backward() {}
+ explicit ProbabilityEstimates(const GraphStructure& g) :
gs(&g), backward() {
if (g.r->nodes > 0)
backward = new float[g.r->nodes];
@@ -36,24 +195,32 @@ struct BackwardEstimates {
}
// returns a backward estimate
- double operator()(int src_covered, int trg_covered) const {
+ double Backward(int src_covered, int trg_covered) const {
if (!backward) return 0;
int ind = gs->r->node_addresses[src_covered][trg_covered];
if (ind < 0) return 0;
return backward[ind];
}
+
+ prob_t estp;
+ float* backward;
private:
const GraphStructure* gs;
- float* backward;
};
struct TransliterationsImpl {
- TransliterationsImpl(int max_src, int max_trg, double fr) :
+ TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
+ cp0(max_src, max_trg, sr),
+ tccm(cp0),
+ be(b),
kMAX_SRC_CHUNK(max_src),
kMAX_TRG_CHUNK(max_trg),
- kFILTER_RATIO(fr),
+ kS2T_RATIO(sr),
tot_pairs(), tot_mem() {
}
+ const CondBaseDist cp0;
+ TransliterationChunkConditionalModel tccm;
+ const BackwardEstimator& be;
void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
const size_t src_len = src_lets.size();
@@ -63,20 +230,29 @@ struct TransliterationsImpl {
if (src_len >= graphs.size()) graphs.resize(src_len + 1);
if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
GraphStructure& gs = graphs[src_len][trg_len];
- if (!gs.r)
- gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
+ if (!gs.r) {
+ double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
+ if (rat > 1.5 || (rat > 2.4 && src_len < 6)) {
+ cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
+ gs.r = new Reachability(src_len, trg_len, 0, 0);
+ } else {
+ gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+ }
+ }
+
const Reachability& r = *gs.r;
// init backward estimates
- if (src >= bes.size()) bes.resize(src + 1);
- unordered_map<WordID, BackwardEstimates>::iterator it = bes[src].find(trg);
- if (it != bes[src].end()) return; // already initialized
+ if (src >= ests.size()) ests.resize(src + 1);
+ unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
+ if (it != ests[src].end()) return; // already initialized
- it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first;
- BackwardEstimates& b = it->second;
+ it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
+ ProbabilityEstimates& est = it->second;
if (!gs.r->nodes) return; // not derivable subject to length constraints
- // TODO
+ be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
+ cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
tot_pairs++;
tot_mem += sizeof(float) * gs.r->nodes;
}
@@ -92,8 +268,11 @@ struct TransliterationsImpl {
const vector<GraphStructure>& tv = graphs[src.size()];
assert(trg.size() < tv.size());
const GraphStructure& gs = tv[trg.size()];
- // TODO: do prob
- return prob_t::Zero();
+ if (gs.r->nodes == 0)
+ return prob_t::Zero();
+ const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
+ assert(it != ests[s].end());
+ return it->second.estp;
}
void GraphSummary() const {
@@ -126,15 +305,15 @@ struct TransliterationsImpl {
const int kMAX_SRC_CHUNK;
const int kMAX_TRG_CHUNK;
- const double kFILTER_RATIO;
+ const double kS2T_RATIO;
unsigned tot_pairs;
size_t tot_mem;
vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
- vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
+ vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
};
-Transliterations::Transliterations(int max_src, int max_trg, double fr) :
- pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
+Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
+ pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
Transliterations::~Transliterations() { delete pimpl_; }
void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index ea9f9d3f..49d14684 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -5,11 +5,12 @@
#include "wordid.h"
#include "prob.h"
+struct BackwardEstimator;
struct TransliterationsImpl;
struct Transliterations {
// max_src and max_trg indicate how big the transliteration phrases can be
// see reachability.h for information about filter_ratio
- explicit Transliterations(int max_src, int max_trg, double filter_ratio);
+ explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be);
~Transliterations();
void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h
index 79321493..6efbfc78 100644
--- a/utils/ccrp_nt.h
+++ b/utils/ccrp_nt.h
@@ -11,6 +11,7 @@
#include <boost/functional/hash.hpp>
#include "sampler.h"
#include "slice_sampler.h"
+#include "m.h"
// Chinese restaurant process (1 parameter)
template <typename Dish, typename DishHash = boost::hash<Dish> >
@@ -29,6 +30,7 @@ class CCRP_NoTable {
alpha_prior_rate_(c_rate) {}
double alpha() const { return alpha_; }
+ void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); }
bool has_alpha_prior() const {
return !std::isnan(alpha_prior_shape_);
@@ -71,9 +73,10 @@ class CCRP_NoTable {
return table_diff;
}
- double prob(const Dish& dish, const double& p0) const {
+ template <typename F>
+ F prob(const Dish& dish, const F& p0) const {
const unsigned at_table = num_customers(dish);
- return (at_table + p0 * alpha_) / (num_customers_ + alpha_);
+ return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_);
}
double logprob(const Dish& dish, const double& logp0) const {
@@ -85,20 +88,12 @@ class CCRP_NoTable {
return log_crp_prob(alpha_);
}
- static double log_gamma_density(const double& x, const double& shape, const double& rate) {
- assert(x >= 0.0);
- assert(shape > 0.0);
- assert(rate > 0.0);
- const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
- return lp;
- }
-
// taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
// does not include P_0's
double log_crp_prob(const double& alpha) const {
double lp = 0.0;
if (has_alpha_prior())
- lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
+ lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
assert(lp <= 0.0);
if (num_customers_) {
lp += lgamma(alpha) - lgamma(alpha + num_customers_) +