summary | refs | log | tree | commit | diff
path: root/gi/pf/pyp_lm.cc
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pf/pyp_lm.cc')
-rw-r--r-- gi/pf/pyp_lm.cc | 70
1 file changed, 64 insertions, 6 deletions
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 2837e33c..0d85536c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -50,16 +50,19 @@ template <unsigned N> struct PYPLM;
// uniform base distribution
template<> struct PYPLM<0> {
- PYPLM(unsigned vs) : p0(1.0 / vs) {}
- void increment(WordID w, const vector<WordID>& context, MT19937* rng) const {}
- void decrement(WordID w, const vector<WordID>& context, MT19937* rng) const {}
+ PYPLM(unsigned vs) : p0(1.0 / vs), draws() {}
+ void increment(WordID w, const vector<WordID>& context, MT19937* rng) { ++draws; }
+ void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { --draws; assert(draws >= 0); }
double prob(WordID w, const vector<WordID>& context) const { return p0; }
+ void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {}
+ double log_likelihood() const { return draws * log(p0); }
const double p0;
+ int draws;
};
// represents an N-gram LM
template <unsigned N> struct PYPLM {
- PYPLM(unsigned vs) : backoff(vs) {}
+ PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {}
void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
const double bo = backoff.prob(w, context);
static vector<WordID> lookup(N-1);
@@ -67,7 +70,7 @@ template <unsigned N> struct PYPLM {
lookup[i] = context[context.size() - 1 - i];
typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
if (it == p.end())
- it = p.insert(make_pair(lookup, CCRP<WordID>(1,1,1,1))).first;
+ it = p.insert(make_pair(lookup, CCRP<WordID>(d,alpha))).first;
if (it->second.increment(w, bo, rng))
backoff.increment(w, context, rng);
}
@@ -89,7 +92,58 @@ template <unsigned N> struct PYPLM {
if (it == p.end()) return bo;
return it->second.prob(w, bo);
}
+
+ double log_likelihood() const {
+ return log_likelihood(d, alpha) + backoff.log_likelihood();
+ }
+
+ double log_likelihood(const double& dd, const double& aa) const {
+ if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+ double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1);
+ typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
+ for (it = p.begin(); it != p.end(); ++it)
+ llh += it->second.log_crp_prob(dd, aa);
+ return llh;
+ }
+
+ struct DiscountResampler {
+ DiscountResampler(const PYPLM& m) : m_(m) {}
+ const PYPLM& m_;
+ double operator()(const double& proposed_discount) const {
+ return m_.log_likelihood(proposed_discount, m_.alpha);
+ }
+ };
+
+ struct AlphaResampler {
+ AlphaResampler(const PYPLM& m) : m_(m) {}
+ const PYPLM& m_;
+ double operator()(const double& proposed_alpha) const {
+ return m_.log_likelihood(m_.d, proposed_alpha);
+ }
+ };
+
+ void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+ DiscountResampler dr(*this);
+ AlphaResampler ar(*this);
+ for (int iter = 0; iter < nloop; ++iter) {
+ alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(),
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+ alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
+ cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl;
+ for (it = p.begin(); it != p.end(); ++it) {
+ it->second.set_discount(d);
+ it->second.set_alpha(alpha);
+ }
+ backoff.resample_hyperparameters(rng, nloop, niterations);
+ }
+
PYPLM<N-1> backoff;
+ double d, alpha;
unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
};
@@ -109,7 +163,7 @@ int main(int argc, char** argv) {
cerr << "Reading corpus...\n";
CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe);
cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
-#define kORDER 5
+#define kORDER 3
PYPLM<kORDER> lm(vocabe.size());
vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));
int mci = corpuse.size() * 99 / 100;
@@ -126,6 +180,10 @@ int main(int argc, char** argv) {
if (SS > 0) lm.decrement(kEOS, ctx, &rng);
lm.increment(kEOS, ctx, &rng);
}
+ if (SS % 10 == 9) {
+ cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
+ if (SS % 20 == 19) lm.resample_hyperparameters(&rng);
+ } else { cerr << '.' << flush; }
}
double llh = 0;
unsigned cnt = 0;