From 38f28be7cd2bada87ebad78994e3c938e10c2cce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 12:56:15 -0500 Subject: ready to infer alignment parameters --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 22 ++- gi/pf/align-lexonly.cc | 332 --------------------------------------------- gi/pf/pyp_tm.cc | 6 +- gi/pf/quasi_model2.h | 115 ++++++++++++---- gi/pf/tied_resampler.h | 31 +++++ 6 files changed, 143 insertions(+), 367 deletions(-) delete mode 100644 gi/pf/align-lexonly.cc diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 4ce72ba1..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a @@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz -align_lexonly_SOURCES = align-lexonly.cc - align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 0c90b6ce..68cb9192 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -61,15 +61,15 @@ struct AlignedSentencePair { struct Aligner { Aligner(const vector >& lets, int num_letters, vector* c) : corpus(*c), + paj_model(4, 0.08), model(lets, num_letters), - paj(4, 0.08), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); } vector& corpus; + QuasiModel2 paj_model; PYPLexicalTranslation model; - const QuasiModel2 paj; const WordID kNULL; void ResampleHyperparameters() { @@ -86,10 +86,12 @@ struct Aligner { a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, asp.trg[j], &*prng); - // TODO factor in alignment prob + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; + cerr << "Corpus intialized randomly." << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } void ResampleCorpus() { @@ -101,19 +103,25 @@ struct Aligner { const WordID e_j = asp.trg[j]; WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Decrement(f_a_j, e_j, prng); + paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); ss[prop_a_j] = model.Prob(prop_f, e_j); - // TODO configurable - ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size()); + ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); } a_j = prng->SelectSample(ss); f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, e_j, prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; + } + + prob_t Likelihood() const { + return model.Likelihood() * paj_model.Likelihood(); } }; diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc deleted file mode 100644 index dbc9dc07..00000000 --- a/gi/pf/align-lexonly.cc +++ /dev/null @@ -1,332 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" -#include "ngram_base.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -shared_ptr prng; - -struct LexicalAlignment { - unsigned char src_index; - bool is_transliteration; - vector > derivation; -}; - -struct AlignedSentencePair { - vector src; - vector trg; - vector a; - Array2D posterior; -}; - -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_))); - return p; - } - - void Increment(const TRule& rule) { - if (r.increment(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base *= p; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base /= p; - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; - } - - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl; - for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; - } - - prob_t base; - CCRP_NoTable > r; - const double u0; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - up0("fr-en.10k.translit-base.txt.gz"), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - //up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } - - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; - } - - void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); - cerr << "Initializing with random alignments ...\n"; - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - asp.a.resize(asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); - const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - cerr << " LLH_prev = " << Likelihood() << flush; - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << "\tLLH_post = " << Likelihood() << endl; - } - - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - TableLookupBase up0; - //HierarchicalWordBase up0; - //PoissonUniformUninformativeBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); - } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << tmodel.Likelihood() << endl; -} - -void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { - for (set::const_iterator it = v.begin(); it != v.end(); ++it) { - if (*it >= l->size()) { l->resize(*it + 1); } - vector& letters = (*l)[*it]; - if (letters.size()) continue; // if e and f have the same word - - const string& w = TD::Convert(*it); - - size_t cur = 0; - while (cur < w.size()) { - const size_t len = UTF8Len(w[cur]); - letters.push_back(TD::Convert(w.substr(cur, len))); - if (letset) letset->insert(letters.back()); - cur += len; - } - } -} - -void Debug(const AlignedSentencePair& asp) { - cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; - Array2D a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) - if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; - cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { - for (unsigned j = 0; j < asp->trg.size(); ++j) - asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { - bool first = true; - for (unsigned j = 0; j < asp.trg.size(); ++j) { - int src_index = -1; - int mc = -1; - for (unsigned i = 0; i <= asp.src.size(); ++i) { - if (asp.posterior(i, j) > mc) { - mc = asp.posterior(i, j); - src_index = i; - } - } - - if (src_index) { - if (first) first = false; else cout << ' '; - cout << (src_index - 1) << '-' << j; - } - } - cout << endl; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); -// MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - vector corpus(corpuse.size()); - for (unsigned i = 0; i < corpuse.size(); ++i) { - corpus[i].src.swap(corpusf[i]); - corpus[i].trg.swap(corpuse[i]); - corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); - } - corpusf.clear(); corpuse.clear(); - - vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); - set letset; - ExtractLetters(vocabe, &letters, &letset); - ExtractLetters(vocabf, &letters, NULL); - letters[TD::Convert("NULL")].clear(); - - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); - const unsigned samples = conf["samples"].as(); - for (int i = 0; i < samples; ++i) { - for (int j = 395; j < 397; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 10 == 0) x.ResampleHyperparemeters(); - x.ResampleCorpus(); - if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); - } - for (unsigned i = 0; i < corpus.size(); ++i) - WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); - - return 0; -} diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 73104fe9..bf5a6497 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -10,7 +10,6 @@ #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" - #include "tied_resampler.h" using namespace std; @@ -18,7 +17,7 @@ using namespace std::tr1; template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b) {} + ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -32,6 +31,7 @@ struct ConditionalPYPWordModel { void ResampleHyperparameters(MT19937* rng) { for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) it->second.resample_hyperparameters(rng); + btr.ResampleHyperparameters(rng); } prob_t Prob(const WordID src, const vector& trglets) const { @@ -72,7 +72,9 @@ struct ConditionalPYPWordModel { return r.size(); } + // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; }; diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h index 0095289f..8ec0a400 100644 --- a/gi/pf/quasi_model2.h +++ b/gi/pf/quasi_model2.h @@ -3,44 +3,113 @@ #include #include +#include +#include "boost/functional.hpp" #include "prob.h" #include "array2d.h" +struct AlignmentObservation { + AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} + AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : + src_len(sl), trg_len(tl), j(tw), a_j(sw) {} + unsigned short src_len; + unsigned short trg_len; + unsigned short j; + unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { + return reinterpret_cast(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { + return hash_value(a) == hash_value(b); +} + struct QuasiModel2 { explicit QuasiModel2(double alpha, double pnull = 0.1) : alpha_(alpha), pnull_(pnull), - pnotnull_(1 - pnull), - z_(1000,1000) {} + pnotnull_(1 - pnull) {} + // a_j = 0 => NULL; src_len does *not* include null - prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { if (!a_j) return pnull_; - std::vector& zv = z_(src_len, trg_len); - if (zv.size() == 0) - zv.resize(trg_len); - - prob_t& z = zv[j]; - if (z.is_0()) z = ComputeZ(j, src_len, trg_len); - - prob_t p; - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - p *= pnotnull_; - p /= z; + return pnotnull_ * + prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); + } + + void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + assert(a_j <= src_len); + assert(j < trg_len); + ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; + } + + void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + const AlignmentObservation ao(src_len, trg_len, j, a_j); + int &cc = obs_[ao]; + assert(cc > 0); + --cc; + if (!cc) obs_.erase(ao); + } + + prob_t Likelihood() const { + return Likelihood(alpha_, pnull_.as_float()); + } + + prob_t Likelihood(double alpha, double ppnull) const { + const prob_t pnull(ppnull); + const prob_t pnotnull(1 - ppnull); + + prob_t p = prob_t::One(); + for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { + const AlignmentObservation& ao = it->first; + if (ao.a_j) { + double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t pa(u / z); + pa *= pnotnull; + pa.poweq(it->second); + p *= pa; + } else { + p *= pnull.pow(it->second); + } + } return p; } + private: - prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { - prob_t p, z = prob_t::Zero(); - for (int a_j = 1; a_j <= src_len; ++a_j) { - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - z += p; - } + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + } + + static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + double z = 0; + for (int a_j = 1; a_j <= src_len; ++a_j) + z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); return z; } + + const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + if (src_len >= zcache_.size()) + zcache_.resize(src_len + 1); + if (trg_len >= zcache_[src_len].size()) + zcache_[src_len].resize(trg_len + 1); + std::vector& zv = zcache_[src_len][trg_len]; + if (zv.size() == 0) + zv.resize(trg_len); + double& z = zv[j]; + if (!z) + z = ComputeZ(j, src_len, trg_len, alpha_); + return z; + } + double alpha_; - const prob_t pnull_; - const prob_t pnotnull_; - mutable Array2D > z_; + prob_t pnull_; + prob_t pnotnull_; + mutable std::vector > > zcache_; + typedef std::tr1::unordered_map > ObsCount; + ObsCount obs_; }; #endif diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h index 208fb9c7..5a262f9d 100644 --- a/gi/pf/tied_resampler.h +++ b/gi/pf/tied_resampler.h @@ -2,6 +2,7 @@ #define _TIED_RESAMPLER_H_ #include +#include #include "sampler.h" #include "slice_sampler.h" #include "m.h" @@ -28,6 +29,10 @@ struct TiedResampler { crps.erase(crp); } + size_t size() const { + return crps.size(); + } + double LogLikelihood(double d, double s) const { if (s <= -d) return -std::numeric_limits::infinity(); double llh = Md::log_beta_density(d, d_alpha, d_beta) + @@ -54,6 +59,7 @@ struct TiedResampler { }; void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } const DiscountResampler dr(*this); const AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { @@ -79,4 +85,29 @@ struct TiedResampler { double discount, strength; }; +// split according to some criterion +template +struct BinTiedResampler { + explicit BinTiedResampler(unsigned nbins) : + resamplers(nbins, TiedResampler(1,1,1,1)) {} + + void Add(unsigned bin, CRP* crp) { + resamplers[bin].Add(crp); + } + + void Remove(unsigned bin, CRP* crp) { + resamplers[bin].Remove(crp); + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < resamplers.size(); ++i) { + std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; + resamplers[i].ResampleHyperparameters(rng); + } + } + + private: + std::vector > resamplers; +}; + #endif -- cgit v1.2.3