From 1cdb086a30abc0ea2f616ddbb008cda215f6f256 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 12 Aug 2012 21:56:58 -0400 Subject: possible errors with google hashes --- gi/pf/pyp_lm.cc | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 6 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 7cec437a..605d8206 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -6,6 +6,7 @@ #include #include +#include "gamma_poisson.h" #include "corpus_tools.h" #include "m.h" #include "tdict.h" @@ -59,11 +60,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -template struct PYPLM; - -// uniform base distribution (0-gram model) -template<> struct PYPLM<0> { - PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} +// uniform distribution over a fixed vocabulary +struct UniformVocabulary { + UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} void increment(WordID, const vector&, MT19937*) { ++draws; } void decrement(WordID, const vector&, MT19937*) { --draws; assert(draws >= 0); } double prob(WordID, const vector&) const { return p0; } @@ -73,6 +72,73 @@ template<> struct PYPLM<0> { int draws; }; +// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS. +// Journal of Statistical Planning and Inference 14 (1986) 311-322 +struct PoissonLengthUniformCharWordModel { + explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {} + void increment(WordID w, const vector& v, MT19937*) { + llh += log(prob(w, v)); // this isn't quite right + plen.increment(TD::Convert(w).size() - 1); + } + void decrement(WordID w, const vector& v, MT19937*) { + plen.decrement(TD::Convert(w).size() - 1); + llh -= log(prob(w, v)); // this isn't quite right + } + double prob(WordID w, const vector&) const { + const unsigned len = TD::Convert(w).size(); + return plen.prob(len - 1) * exp(uc * len); + } + double log_likelihood() const { return llh; } + void resample_hyperparameters(MT19937*) {} + GammaPoisson plen; + const double uc; + double llh; +}; + +struct PYPAdaptedPoissonLengthUniformCharWordModel { + explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : + base(vocab_size,1,1,1,1), + crp(1,1,1,1) {} + void increment(WordID w, const vector& v, MT19937* rng) { + double p0 = base.prob(w, v); + if (crp.increment(w, p0, rng)) + base.increment(w, v, rng); + } + void decrement(WordID w, const vector& v, MT19937* rng) { + if (crp.decrement(w, rng)) + base.decrement(w, v, rng); + } + double prob(WordID w, const vector& v) const { + double p0 = base.prob(w, v); + return crp.prob(w, p0); + } + double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); } + void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); } + PoissonLengthUniformCharWordModel base; + CCRP crp; +}; + +template struct PYPLM; + +#if 1 +template<> struct PYPLM<0> : public UniformVocabulary { + PYPLM(unsigned vs, double a, double b, double c, double d) : + UniformVocabulary(vs, a, b, c, d) {} +}; +#else +#if 0 +template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel { + PYPLM(unsigned vs, double a, double b, double c, double d) : + PoissonLengthUniformCharWordModel(vs, a, b, c, d) {} +}; +#else +template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel { + PYPLM(unsigned vs, double a, double b, double c, double d) : + PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {} +}; +#endif +#endif + // represents an N-gram LM template struct PYPLM { PYPLM(unsigned vs, double da, double db, double ss, double sr) : @@ -170,7 +236,7 @@ int main(int argc, char** argv) { } if (SS % 10 == 9) { cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; - if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + if (SS % 30 == 29) lm.resample_hyperparameters(&rng); } else { cerr << '.' << flush; } } double llh = 0; -- cgit v1.2.3