From 2a9c9a414abc074ec4ea8a5494e8dd50e1f94d70 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Thu, 9 Aug 2012 01:18:32 -0400
Subject: gamma-poisson word length model

---
 utils/Makefile.am       |  5 ++-
 utils/gamma_poisson.h   | 33 ++++++++++++++++++
 utils/tdict.cc          | 19 ++---------
 utils/tdict.h           | 21 ++++++++----
 utils/unigram_pyp_lm.cc | 90 +++++++++++++++++++++++++++++++++++++------------
 5 files changed, 121 insertions(+), 47 deletions(-)
 create mode 100644 utils/gamma_poisson.h

diff --git a/utils/Makefile.am b/utils/Makefile.am
index 386344dd..799ec879 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -9,7 +9,8 @@ noinst_PROGRAMS = \
   m_test \
   weights_test \
   logval_test \
-  small_vector_test
+  small_vector_test \
+  unigram_pyp_lm
 
 TESTS = ts mfcr_test crp_test small_vector_test logval_test weights_test dict_test m_test
 
@@ -57,6 +58,8 @@ logval_test_SOURCES = logval_test.cc
 logval_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
 small_vector_test_SOURCES = small_vector_test.cc
 small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+unigram_pyp_lm_SOURCES = unigram_pyp_lm.cc
+unigram_pyp_lm_LDADD = libutils.a -lz
 
 ################################################################
 # do NOT NOT NOT add any other -I includes NO NO NO NO NO ######

diff --git a/utils/gamma_poisson.h b/utils/gamma_poisson.h
new file mode 100644
index 00000000..fec763f6
--- /dev/null
+++ b/utils/gamma_poisson.h
@@ -0,0 +1,33 @@
+#ifndef _GAMMA_POISSON_H_
+#define _GAMMA_POISSON_H_
+
+#include <m.h>
+
+// http://en.wikipedia.org/wiki/Conjugate_prior
+struct GammaPoisson {
+  GammaPoisson(double shape, double rate) :
+    a(shape), b(rate), n(), marginal() {}
+
+  double prob(unsigned x) const {
+    return exp(Md::log_negative_binom(x, a + marginal, 1.0 - (b + n) / (1 + b + n)));
+  }
+
+  void increment(unsigned x) {
+    ++n;
+    marginal += x;
+  }
+
+  void decrement(unsigned x) {
+    --n;
+    marginal -= x;
+  }
+
+  double log_likelihood() const {
+    return 0;
+  }
+
+  double a, b;
+  int n, marginal;
+};
+
+#endif

diff --git a/utils/tdict.cc b/utils/tdict.cc
index f33bd576..fd2b76cb 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -13,22 +13,6 @@ using namespace std;
 
 Dict TD::dict_;
 
-unsigned int TD::NumWords() {
-  return dict_.max();
-}
-
-WordID TD::Convert(const std::string& s) {
-  return dict_.Convert(s);
-}
-
-WordID TD::Convert(char const* s) {
-  return dict_.Convert(string(s));
-}
-
-const char* TD::Convert(WordID w) {
-  return dict_.Convert(w).c_str();
-}
-
 void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
   ids->clear();
   for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i)
@@ -57,7 +41,8 @@ std::string TD::GetString(WordID const* i,WordID const* e) {
 
 int TD::AppendString(const WordID& w, int pos, int bufsize, char* buffer)
 {
-  const char* word = TD::Convert(w);
+  const string& s = TD::Convert(w);
+  const char* word = s.c_str();
   const char* const end_buf = buffer + bufsize;
   char* dest = buffer + pos;
   while(dest < end_buf && *word) {

diff --git a/utils/tdict.h b/utils/tdict.h
index 393146fa..03afc2e6 100644
--- a/utils/tdict.h
+++ b/utils/tdict.h
@@ -3,10 +3,9 @@
 
 #include <string>
 #include <vector>
+#include <cassert>
 #include "wordid.h"
-#include <assert.h>
-
-class Dict;
+#include "dict.h"
 
 struct TD {
   static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far
@@ -15,10 +14,18 @@ struct TD {
   static std::string GetString(const std::vector<WordID>& str);
   static std::string GetString(WordID const* i,WordID const* e);
   static int AppendString(const WordID& w, int pos, int bufsize, char* buffer);
-  static unsigned int NumWords();
-  static WordID Convert(const std::string& s);
-  static WordID Convert(char const* s);
-  static const char* Convert(WordID w);
+  static unsigned int NumWords() {
+    return dict_.max();
+  }
+  static WordID Convert(const std::string& s) {
+    return dict_.Convert(s);
+  }
+  static WordID Convert(char const* s) {
+    return dict_.Convert(std::string(s));
+  }
+  static const std::string& Convert(WordID w) {
+    return dict_.Convert(w);
+  }
  private:
   static Dict dict_;
 };
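Note on gamma_poisson.h above: with a Gamma(a, b) prior on a Poisson rate, observing n counts with sum `marginal` yields a Gamma(a + marginal, b + n) posterior, and the posterior predictive for the next count is negative binomial; `prob()` evaluates exactly that predictive via the repo's Md::log_negative_binom helper, with success parameter 1/(1 + b + n). A minimal standalone sketch of the same computation, assuming only <cmath> (the name predictive_prob is hypothetical, not part of the patch):

#include <cmath>

// Posterior predictive of a Gamma(a,b)-Poisson model after n observations
// summing to s: NegBin(r, p) with r = a + s and p = 1 / (b + n + 1).
// Illustrative re-derivation of what GammaPoisson::prob computes;
// Md::log_negative_binom is the repo's helper, replaced here by lgamma.
double predictive_prob(unsigned x, double a, double b, int n, int s) {
  const double r = a + s;                    // posterior shape
  const double p = 1.0 / (b + n + 1.0);      // per-trial success probability
  const double log_coef =
      std::lgamma(x + r) - std::lgamma(r) - std::lgamma(x + 1.0);
  return std::exp(log_coef + r * std::log(1.0 - p) + x * std::log(p));
}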
diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc
index 510e8839..30b9fde1 100644
--- a/utils/unigram_pyp_lm.cc
+++ b/utils/unigram_pyp_lm.cc
@@ -11,6 +11,7 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "ccrp.h"
+#include "gamma_poisson.h"
 
 // A not very memory-efficient implementation of an 1-gram LM based on PYPs
 // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
@@ -54,49 +55,90 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
+struct Histogram {
+  void increment(unsigned bin, unsigned delta = 1u) {
+    data[bin] += delta;
+  }
+  void decrement(unsigned bin, unsigned delta = 1u) {
+    data[bin] -= delta;
+  }
+  void move(unsigned from_bin, unsigned to_bin, unsigned delta = 1u) {
+    decrement(from_bin, delta);
+    increment(to_bin, delta);
+  }
+  map<unsigned, unsigned> data;
+  // SparseVector<unsigned> data;
+};
+
+// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS.
+// Journal of Statistical Planning and Inference 14 (1986) 311-322
+struct PoissonLengthUniformCharWordModel {
+  explicit PoissonLengthUniformCharWordModel(unsigned vocab_size) : plen(5,5), uc(-log(50)), llh() {}
+  void increment(WordID w, MT19937*) {
+    llh += log(prob(w)); // this isn't quite right
+    plen.increment(TD::Convert(w).size() - 1);
+  }
+  void decrement(WordID w, MT19937*) {
+    plen.decrement(TD::Convert(w).size() - 1);
+    llh -= log(prob(w)); // this isn't quite right
+  }
+  double prob(WordID w) const {
+    size_t len = TD::Convert(w).size();
+    return plen.prob(len - 1) * exp(uc * len);
+  }
+  double log_likelihood() const { return llh; }
+  void resample_hyperparameters(MT19937*) {}
+  GammaPoisson plen;
+  const double uc;
+  double llh;
+};
+
 // uniform base distribution (0-gram model)
 struct UniformWordModel {
   explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {}
-  void increment() { ++draws; }
-  void decrement() { --draws; assert(draws >= 0); }
+  void increment(WordID, MT19937*) { ++draws; }
+  void decrement(WordID, MT19937*) { --draws; assert(draws >= 0); }
   double prob(WordID) const { return p0; } // all words have equal prob
   double log_likelihood() const { return draws * log(p0); }
+  void resample_hyperparameters(MT19937*) {}
   const double p0;
   int draws;
 };
 
 // represents an Unigram LM
+template <class BaseGenerator>
 struct UnigramLM {
   UnigramLM(unsigned vs, double da, double db, double ss, double sr) :
-      uniform_vocab(vs),
+      base(vs),
       crp(da, db, ss, sr, 0.8, 1.0) {}
   void increment(WordID w, MT19937* rng) {
-    const double backoff = uniform_vocab.prob(w);
+    const double backoff = base.prob(w);
     if (crp.increment(w, backoff, rng))
-      uniform_vocab.increment();
+      base.increment(w, rng);
   }
   void decrement(WordID w, MT19937* rng) {
     if (crp.decrement(w, rng))
-      uniform_vocab.decrement();
+      base.decrement(w, rng);
   }
   double prob(WordID w) const {
-    const double backoff = uniform_vocab.prob(w);
+    const double backoff = base.prob(w);
     return crp.prob(w, backoff);
   }
 
   double log_likelihood() const {
-    double llh = uniform_vocab.log_likelihood();
+    double llh = base.log_likelihood();
     llh += crp.log_crp_prob();
     return llh;
   }
 
   void resample_hyperparameters(MT19937* rng) {
     crp.resample_hyperparameters(rng);
+    base.resample_hyperparameters(rng);
   }
 
   double discount_a, discount_b, strength_s, strength_r;
   double d, strength;
-  UniformWordModel uniform_vocab;
+  BaseGenerator base;
   CCRP<WordID> crp;
 };
 
@@ -121,15 +163,19 @@ int main(int argc, char** argv) {
     CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
   else
     test = corpuse;
-  UnigramLM lm(vocabe.size(),
-               conf["discount_prior_a"].as<double>(),
-               conf["discount_prior_b"].as<double>(),
-               conf["strength_prior_s"].as<double>(),
-               conf["strength_prior_r"].as<double>());
-  for (int SS=0; SS < samples; ++SS) {
-    for (int ci = 0; ci < corpuse.size(); ++ci) {
+#if 1
+  UnigramLM<PoissonLengthUniformCharWordModel> lm(vocabe.size(),
+#else
+  UnigramLM<UniformWordModel> lm(vocabe.size(),
+#endif
+               conf["discount_prior_a"].as<double>(),
+               conf["discount_prior_b"].as<double>(),
+               conf["strength_prior_s"].as<double>(),
+               conf["strength_prior_r"].as<double>());
+  for (unsigned SS=0; SS < samples; ++SS) {
+    for (unsigned ci = 0; ci < corpuse.size(); ++ci) {
       const vector<WordID>& s = corpuse[ci];
-      for (int i = 0; i <= s.size(); ++i) {
+      for (unsigned i = 0; i <= s.size(); ++i) {
         WordID w = (i < s.size() ? s[i] : kEOS);
         if (SS > 0) lm.decrement(w, &rng);
         lm.increment(w, &rng);
@@ -137,21 +183,21 @@ int main(int argc, char** argv) {
       }
       if (SS > 0) lm.decrement(kEOS, &rng);
       lm.increment(kEOS, &rng);
     }
-    cerr << "LLH=" << lm.log_likelihood() << endl;
-    //if (SS % 10 == 9) lm.resample_hyperparameters(&rng);
+    cerr << "LLH=" << lm.log_likelihood() << "\t tables=" << lm.crp.num_tables() << " " << endl;
+    if (SS % 10 == 9) lm.resample_hyperparameters(&rng);
   }
   double llh = 0;
   unsigned cnt = 0;
   unsigned oovs = 0;
-  for (int ci = 0; ci < test.size(); ++ci) {
+  for (unsigned ci = 0; ci < test.size(); ++ci) {
     const vector<WordID>& s = test[ci];
-    for (int i = 0; i <= s.size(); ++i) {
+    for (unsigned i = 0; i <= s.size(); ++i) {
       WordID w = (i < s.size() ? s[i] : kEOS);
       double lp = log(lm.prob(w)) / log(2);
       if (i < s.size() && vocabe.count(w) == 0) {
         cerr << "**OOV ";
         ++oovs;
-        lp = 0;
+        //lp = 0;
       }
       cerr << "p(" << TD::Convert(w) << ") = " << lp << endl;
      llh -= lp;
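Note on PoissonLengthUniformCharWordModel: a word of length L is scored as p(L) * p(chars | L), where L - 1 follows the Gamma(5, 5)-Poisson length model (so zero-length words get no mass) and each character contributes a uniform factor of 1/50, since uc = -log(50) implies an assumed alphabet of roughly 50 symbols. A sketch of the same factorization under those assumptions, reusing the hypothetical predictive_prob helper from the earlier note:

#include <cmath>

double predictive_prob(unsigned x, double a, double b, int n, int s); // from the earlier sketch

// Word probability = length probability * per-character probability^length.
// Mirrors PoissonLengthUniformCharWordModel::prob for a freshly constructed
// model (no increments yet, so n = 0 and s = 0); requires len >= 1.
double word_prob(unsigned len) {
  const double uc = -std::log(50.0);                           // log(1/50) per character
  const double p_len = predictive_prob(len - 1, 5.0, 5.0, 0, 0); // length - 1 ~ NegBin
  return p_len * std::exp(uc * len);
}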
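With UnigramLM now templated on its base distribution, the `#if 1` block in main() selects the Poisson-length base; flipping it to `#if 0` restores the original uniform base, which makes it easy to compare the two models' held-out log loss from the same binary.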