diff options
Diffstat (limited to 'gi')
-rw-r--r-- | gi/pf/pyp_lm.cc | 78 |
1 files changed, 72 insertions, 6 deletions
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 7cec437a..605d8206 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -6,6 +6,7 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include "gamma_poisson.h" #include "corpus_tools.h" #include "m.h" #include "tdict.h" @@ -59,11 +60,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -template <unsigned N> struct PYPLM; - -// uniform base distribution (0-gram model) -template<> struct PYPLM<0> { - PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} +// uniform distribution over a fixed vocabulary +struct UniformVocabulary { + UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; } void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); } double prob(WordID, const vector<WordID>&) const { return p0; } @@ -73,6 +72,73 @@ template<> struct PYPLM<0> { int draws; }; +// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS. +// Journal of Statistical Planning and Inference 14 (1986) 311-322 +struct PoissonLengthUniformCharWordModel { + explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {} + void increment(WordID w, const vector<WordID>& v, MT19937*) { + llh += log(prob(w, v)); // this isn't quite right + plen.increment(TD::Convert(w).size() - 1); + } + void decrement(WordID w, const vector<WordID>& v, MT19937*) { + plen.decrement(TD::Convert(w).size() - 1); + llh -= log(prob(w, v)); // this isn't quite right + } + double prob(WordID w, const vector<WordID>&) const { + const unsigned len = TD::Convert(w).size(); + return plen.prob(len - 1) * exp(uc * len); + } + double log_likelihood() const { return llh; } + void resample_hyperparameters(MT19937*) {} + GammaPoisson plen; + const double uc; + double llh; +}; + +struct PYPAdaptedPoissonLengthUniformCharWordModel { + explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : + base(vocab_size,1,1,1,1), + crp(1,1,1,1) {} + void increment(WordID w, const vector<WordID>& v, MT19937* rng) { + double p0 = base.prob(w, v); + if (crp.increment(w, p0, rng)) + base.increment(w, v, rng); + } + void decrement(WordID w, const vector<WordID>& v, MT19937* rng) { + if (crp.decrement(w, rng)) + base.decrement(w, v, rng); + } + double prob(WordID w, const vector<WordID>& v) const { + double p0 = base.prob(w, v); + return crp.prob(w, p0); + } + double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); } + void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); } + PoissonLengthUniformCharWordModel base; + CCRP<WordID> crp; +}; + +template <unsigned N> struct PYPLM; + +#if 1 +template<> struct PYPLM<0> : public UniformVocabulary { + PYPLM(unsigned vs, double a, double b, double c, double d) : + UniformVocabulary(vs, a, b, c, d) {} +}; +#else +#if 0 +template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel { + PYPLM(unsigned vs, double a, double b, double c, double d) : + PoissonLengthUniformCharWordModel(vs, a, b, c, d) {} +}; +#else +template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel { + PYPLM(unsigned vs, double a, double b, double c, double d) : + PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {} +}; +#endif +#endif + // represents an N-gram LM template <unsigned N> struct PYPLM { PYPLM(unsigned vs, double da, double db, double ss, double sr) : @@ -170,7 +236,7 @@ int main(int argc, char** argv) { } if (SS % 10 == 9) { cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; - if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + if (SS % 30 == 29) lm.resample_hyperparameters(&rng); } else { cerr << '.' << flush; } } double llh = 0; |