diff options
Diffstat (limited to 'gi/pf/pyp_lm.cc')
-rw-r--r-- | gi/pf/pyp_lm.cc | 70 |
1 files changed, 64 insertions, 6 deletions
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 2837e33c..0d85536c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -50,16 +50,19 @@ template <unsigned N> struct PYPLM; // uniform base distribution template<> struct PYPLM<0> { - PYPLM(unsigned vs) : p0(1.0 / vs) {} - void increment(WordID w, const vector<WordID>& context, MT19937* rng) const {} - void decrement(WordID w, const vector<WordID>& context, MT19937* rng) const {} + PYPLM(unsigned vs) : p0(1.0 / vs), draws() {} + void increment(WordID w, const vector<WordID>& context, MT19937* rng) { ++draws; } + void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { --draws; assert(draws >= 0); } double prob(WordID w, const vector<WordID>& context) const { return p0; } + void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {} + double log_likelihood() const { return draws * log(p0); } const double p0; + int draws; }; // represents an N-gram LM template <unsigned N> struct PYPLM { - PYPLM(unsigned vs) : backoff(vs) {} + PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {} void increment(WordID w, const vector<WordID>& context, MT19937* rng) { const double bo = backoff.prob(w, context); static vector<WordID> lookup(N-1); @@ -67,7 +70,7 @@ template <unsigned N> struct PYPLM { lookup[i] = context[context.size() - 1 - i]; typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP<WordID>(1,1,1,1))).first; + it = p.insert(make_pair(lookup, CCRP<WordID>(d,alpha))).first; if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -89,7 +92,58 @@ template <unsigned N> struct PYPLM { if (it == p.end()) return bo; return it->second.prob(w, bo); } + + double log_likelihood() const { + return log_likelihood(d, alpha) + backoff.log_likelihood(); + } + + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits<double>::infinity(); + double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1); + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it; + for (it = p.begin(); it != p.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const PYPLM& m) : m_(m) {} + const PYPLM& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.alpha); + } + }; + + struct AlphaResampler { + AlphaResampler(const PYPLM& m) : m_(m) {} + const PYPLM& m_; + double operator()(const double& proposed_alpha) const { + return m_.log_likelihood(m_.d, proposed_alpha); + } + }; + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(), + 1.0, 0.0, niterations, 100*niterations); + } + alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it; + cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl; + for (it = p.begin(); it != p.end(); ++it) { + it->second.set_discount(d); + it->second.set_alpha(alpha); + } + backoff.resample_hyperparameters(rng, nloop, niterations); + } + PYPLM<N-1> backoff; + double d, alpha; unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p; }; @@ -109,7 +163,7 @@ int main(int argc, char** argv) { cerr << "Reading corpus...\n"; CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe); cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -#define kORDER 5 +#define kORDER 3 PYPLM<kORDER> lm(vocabe.size()); vector<WordID> ctx(kORDER - 1, TD::Convert("<s>")); int mci = corpuse.size() * 99 / 100; @@ -126,6 +180,10 @@ int main(int argc, char** argv) { if (SS > 0) lm.decrement(kEOS, ctx, &rng); lm.increment(kEOS, ctx, &rng); } + if (SS % 10 == 9) { + cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; + if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + } else { cerr << '.' << flush; } } double llh = 0; unsigned cnt = 0; |