From 2a9c9a414abc074ec4ea8a5494e8dd50e1f94d70 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 9 Aug 2012 01:18:32 -0400 Subject: gamma-poisson word length model --- utils/unigram_pyp_lm.cc | 90 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 22 deletions(-) (limited to 'utils/unigram_pyp_lm.cc') diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc index 510e8839..30b9fde1 100644 --- a/utils/unigram_pyp_lm.cc +++ b/utils/unigram_pyp_lm.cc @@ -11,6 +11,7 @@ #include "tdict.h" #include "sampler.h" #include "ccrp.h" +#include "gamma_poisson.h" // A not very memory-efficient implementation of an 1-gram LM based on PYPs // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model @@ -54,49 +55,90 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } +struct Histogram { + void increment(unsigned bin, unsigned delta = 1u) { + data[bin] += delta; + } + void decrement(unsigned bin, unsigned delta = 1u) { + data[bin] -= delta; + } + void move(unsigned from_bin, unsigned to_bin, unsigned delta = 1u) { + decrement(from_bin, delta); + increment(to_bin, delta); + } + map data; + // SparseVector data; +}; + +// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS. +// Journal of Statistical Planning and Inference 14 (1986) 311-322 +struct PoissonLengthUniformCharWordModel { + explicit PoissonLengthUniformCharWordModel(unsigned vocab_size) : plen(5,5), uc(-log(50)), llh() {} + void increment(WordID w, MT19937*) { + llh += log(prob(w)); // this isn't quite right + plen.increment(TD::Convert(w).size() - 1); + } + void decrement(WordID w, MT19937*) { + plen.decrement(TD::Convert(w).size() - 1); + llh -= log(prob(w)); // this isn't quite right + } + double prob(WordID w) const { + size_t len = TD::Convert(w).size(); + return plen.prob(len - 1) * exp(uc * len); + } + double log_likelihood() const { return llh; } + void resample_hyperparameters(MT19937*) {} + GammaPoisson plen; + const double uc; + double llh; +}; + // uniform base distribution (0-gram model) struct UniformWordModel { explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {} - void increment() { ++draws; } - void decrement() { --draws; assert(draws >= 0); } + void increment(WordID, MT19937*) { ++draws; } + void decrement(WordID, MT19937*) { --draws; assert(draws >= 0); } double prob(WordID) const { return p0; } // all words have equal prob double log_likelihood() const { return draws * log(p0); } + void resample_hyperparameters(MT19937*) {} const double p0; int draws; }; // represents an Unigram LM +template struct UnigramLM { UnigramLM(unsigned vs, double da, double db, double ss, double sr) : - uniform_vocab(vs), + base(vs), crp(da, db, ss, sr, 0.8, 1.0) {} void increment(WordID w, MT19937* rng) { - const double backoff = uniform_vocab.prob(w); + const double backoff = base.prob(w); if (crp.increment(w, backoff, rng)) - uniform_vocab.increment(); + base.increment(w, rng); } void decrement(WordID w, MT19937* rng) { if (crp.decrement(w, rng)) - uniform_vocab.decrement(); + base.decrement(w, rng); } double prob(WordID w) const { - const double backoff = uniform_vocab.prob(w); + const double backoff = base.prob(w); return crp.prob(w, backoff); } double log_likelihood() const { - double llh = uniform_vocab.log_likelihood(); + double llh = base.log_likelihood(); llh += crp.log_crp_prob(); return llh; } void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); + base.resample_hyperparameters(rng); } double discount_a, discount_b, strength_s, strength_r; double d, strength; - UniformWordModel uniform_vocab; + BaseGenerator base; CCRP crp; }; @@ -121,15 +163,19 @@ int main(int argc, char** argv) { CorpusTools::ReadFromFile(conf["test"].as(), &test); else test = corpuse; - UnigramLM lm(vocabe.size(), - conf["discount_prior_a"].as(), - conf["discount_prior_b"].as(), - conf["strength_prior_s"].as(), - conf["strength_prior_r"].as()); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { +#if 1 + UnigramLM lm(vocabe.size(), +#else + UnigramLM lm(vocabe.size(), +#endif + conf["discount_prior_a"].as(), + conf["discount_prior_b"].as(), + conf["strength_prior_s"].as(), + conf["strength_prior_r"].as()); + for (unsigned SS=0; SS < samples; ++SS) { + for (unsigned ci = 0; ci < corpuse.size(); ++ci) { const vector& s = corpuse[ci]; - for (int i = 0; i <= s.size(); ++i) { + for (unsigned i = 0; i <= s.size(); ++i) { WordID w = (i < s.size() ? s[i] : kEOS); if (SS > 0) lm.decrement(w, &rng); lm.increment(w, &rng); @@ -137,21 +183,21 @@ int main(int argc, char** argv) { if (SS > 0) lm.decrement(kEOS, &rng); lm.increment(kEOS, &rng); } - cerr << "LLH=" << lm.log_likelihood() << endl; - //if (SS % 10 == 9) lm.resample_hyperparameters(&rng); + cerr << "LLH=" << lm.log_likelihood() << "\t tables=" << lm.crp.num_tables() << " " << endl; + if (SS % 10 == 9) lm.resample_hyperparameters(&rng); } double llh = 0; unsigned cnt = 0; unsigned oovs = 0; - for (int ci = 0; ci < test.size(); ++ci) { + for (unsigned ci = 0; ci < test.size(); ++ci) { const vector& s = test[ci]; - for (int i = 0; i <= s.size(); ++i) { + for (unsigned i = 0; i <= s.size(); ++i) { WordID w = (i < s.size() ? s[i] : kEOS); double lp = log(lm.prob(w)) / log(2); if (i < s.size() && vocabe.count(w) == 0) { cerr << "**OOV "; ++oovs; - lp = 0; + //lp = 0; } cerr << "p(" << TD::Convert(w) << ") = " << lp << endl; llh -= lp; -- cgit v1.2.3