diff options
Diffstat (limited to 'gi/pf/align-lexonly-pyp.cc')
-rw-r--r-- | gi/pf/align-lexonly-pyp.cc | 207 |
1 files changed, 47 insertions, 160 deletions
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 13a3a487..d68a4b8f 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -1,27 +1,18 @@ #include <iostream> -#include <tr1/memory> #include <queue> -#include <boost/multi_array.hpp> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" #include "tdict.h" #include "stringlib.h" #include "filelib.h" -#include "dict.h" +#include "array2d.h" #include "sampler.h" -#include "mfcr.h" #include "corpus.h" -#include "ngram_base.h" +#include "pyp_tm.h" using namespace std; -using namespace tr1; namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -shared_ptr<MT19937> prng; +MT19937* prng; struct LexicalAlignment { unsigned char src_index; @@ -66,159 +57,59 @@ struct AlignedSentencePair { Array2D<short> posterior; }; -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector<WordID>& s) const { - return Md::log_poisson(s.size(), 7.5) + s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - v[0].logeq(logp0(rule.e_)); - return r.prob(rule.e_, v.begin(), l.begin()); - } - - void Increment(const TRule& rule) { - v[0].logeq(logp0(rule.e_)); - if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { - base *= v[0] * l[0]; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_, &*prng).count) { - base /= prob_t(exp(logp0(rule.e_))); - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; +struct Aligner { + Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) : + corpus(*c), + model(lets, num_letters), + kNULL(TD::Convert("NULL")) { + assert(lets[kNULL].size() == 0); } - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; - } - - prob_t base; - MFCR<1,vector<WordID> > r; - const double u0; - const vector<prob_t> l; - mutable vector<prob_t> v; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets, - const unsigned words_e, - const unsigned letters_e, - vector<AlignedSentencePair>* corp) : - letters(lets), - corpus(*corp), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } + vector<AlignedSentencePair>& corpus; + PYPLexicalTranslation model; + const WordID kNULL; - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; + void ResampleHyperparameters() { + model.ResampleHyperparameters(prng); } void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); cerr << "Initializing with random alignments ...\n"; for (unsigned i = 0; i < corpus.size(); ++i) { AlignedSentencePair& asp = corpus[i]; asp.a.resize(asp.trg.size()); for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); + unsigned char& a_j = asp.a[j].src_index; + a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); + model.Increment(f_a_j, asp.trg[j], &*prng); } } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; } - void ResampleCorpus(); - - const vector<vector<WordID> >& letters; // spelling dictionary - vector<AlignedSentencePair>& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel; - //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel; - //ConditionalTranslationModel<UnigramWordBase> tmodel; - //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel; - MConditionalTranslationModel<HierarchicalWordBase> tmodel; - //ConditionalTranslationModel<FixedNgramBase> tmodel; - //ConditionalTranslationModel<CompletelyUniformBase> tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r, &*prng)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); + void ResampleCorpus() { + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + const WordID e_j = asp.trg[j]; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Decrement(f_a_j, e_j, prng); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + ss[prop_a_j] = model.Prob(prop_f, e_j); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, e_j, prng); } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); } + cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; } - cerr << " LLH = " << Likelihood() << endl; -} +}; void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { @@ -240,8 +131,10 @@ void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID void Debug(const AlignedSentencePair& asp) { cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; Array2D<bool> a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) + for (unsigned j = 0; j < asp.trg.size(); ++j) { + assert(asp.a[j].src_index <= asp.src.size()); if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + } cerr << a << endl; } @@ -275,10 +168,9 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + prng = new MT19937(conf["random_seed"].as<uint32_t>()); else - prng.reset(new MT19937); -// MT19937& rng = *prng; + prng = new MT19937; vector<vector<int> > corpuse, corpusf; set<int> vocabe, vocabf; @@ -304,23 +196,18 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); + Aligner aligner(letters, letset.size(), &corpus); + aligner.InitializeRandom(); + const unsigned samples = conf["samples"].as<unsigned>(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 7 == 6) x.ResampleHyperparemeters(); - x.ResampleCorpus(); + if (i % 7 == 6) aligner.ResampleHyperparameters(); + aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); return 0; } |