pyp lm, fixed hyperparameters inference

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-03-03 17:16:58 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-03-03 17:16:58 -0500
commit: 2579dd24d3833823527e688196276c2fab381b37 (patch)
tree: df25825f29db546549fc469f912cef5a7e32c08f /gi
parent: e0507d1aa96c6b1348e6a202beb95f63d8662258 (diff)
7 files changed, 72 insertions, 14 deletions
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index e24cb457..4ce7cf62 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -104,7 +104,7 @@ struct HierarchicalWordBase {
   }
 
   void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl;
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl;
     for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
   }
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
index 8c1d689f..dbc9dc07 100644
--- a/gi/pf/align-lexonly.cc
+++ b/gi/pf/align-lexonly.cc
@@ -105,7 +105,7 @@ struct HierarchicalWordBase {
   }
 
   void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (\\alpha=" << r.concentration() << ')' << endl;
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (\\alpha=" << r.alpha() << ')' << endl;
     for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
   }
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
index 7b60ef23..c2c52760 100644
--- a/gi/pf/brat.cc
+++ b/gi/pf/brat.cc
@@ -191,7 +191,7 @@ struct UniphraseLM {
   void ResampleHyperparameters(MT19937* rng) {
     phrases_.resample_hyperparameters(rng);
     gen_.resample_hyperparameters(rng);
-    cerr << " " << phrases_.concentration();
+    cerr << " " << phrases_.alpha();
   }
 
   CCRP_NoTable<vector<int> > phrases_;
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index 2e9e38fc..f9841cbf 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -22,7 +22,7 @@ struct MConditionalTranslationModel {
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
       for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
         std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
     }
@@ -95,7 +95,7 @@ struct ConditionalTranslationModel {
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.concentration() << ") --------------------------" << std::endl;
+      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
       for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
         std::cerr << "   " << i2->second << '\t' << i2->first << std::endl;
     }
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
index b2ca029a..5b748311 100644
--- a/gi/pf/learn_cfg.cc
+++ b/gi/pf/learn_cfg.cc
@@ -183,9 +183,9 @@ struct HieroLMModel {
       nts[i].resample_hyperparameters(rng);
     if (kHIERARCHICAL_PRIOR) {
       q0.resample_hyperparameters(rng);
-      cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]";
+      cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]";
     }
-    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl;
+    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl;
   }
 
   const BaseRuleModel base;
diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc
index 7b60ef23..c2c52760 100644
--- a/gi/pf/pfbrat.cc
+++ b/gi/pf/pfbrat.cc
@@ -191,7 +191,7 @@ struct UniphraseLM {
   void ResampleHyperparameters(MT19937* rng) {
     phrases_.resample_hyperparameters(rng);
     gen_.resample_hyperparameters(rng);
-    cerr << " " << phrases_.concentration();
+    cerr << " " << phrases_.alpha();
   }
 
   CCRP_NoTable<vector<int> > phrases_;
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 2837e33c..0d85536c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -50,16 +50,19 @@ template <unsigned N> struct PYPLM;
 
 // uniform base distribution
 template<> struct PYPLM<0> {
-  PYPLM(unsigned vs) : p0(1.0 / vs) {}
-  void increment(WordID w, const vector<WordID>& context, MT19937* rng) const {}
-  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) const {}
+  PYPLM(unsigned vs) : p0(1.0 / vs), draws() {}
+  void increment(WordID w, const vector<WordID>& context, MT19937* rng) { ++draws; }
+  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { --draws; assert(draws >= 0); }
   double prob(WordID w, const vector<WordID>& context) const { return p0; }
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {}
+  double log_likelihood() const { return draws * log(p0); }
   const double p0;
+  int draws;
 };
 
 // represents an N-gram LM
 template <unsigned N> struct PYPLM {
-  PYPLM(unsigned vs) : backoff(vs) {}
+  PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
     static vector<WordID> lookup(N-1);
@@ -67,7 +70,7 @@ template <unsigned N> struct PYPLM {
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
     if (it == p.end())
-      it = p.insert(make_pair(lookup, CCRP<WordID>(1,1,1,1))).first;
+      it = p.insert(make_pair(lookup, CCRP<WordID>(d,alpha))).first;
     if (it->second.increment(w, bo, rng))
       backoff.increment(w, context, rng);
   }
@@ -89,7 +92,58 @@ template <unsigned N> struct PYPLM {
     if (it == p.end()) return bo;
     return it->second.prob(w, bo);
   }
+
+  double log_likelihood() const {
+    return log_likelihood(d, alpha) + backoff.log_likelihood();
+  }
+
+  double log_likelihood(const double& dd, const double& aa) const {
+    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+    double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1);
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
+    for (it = p.begin(); it != p.end(); ++it)
+      llh += it->second.log_crp_prob(dd, aa);
+    return llh;
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const PYPLM& m) : m_(m) {}
+    const PYPLM& m_;
+    double operator()(const double& proposed_discount) const {
+      return m_.log_likelihood(proposed_discount, m_.alpha);
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const PYPLM& m) : m_(m) {}
+    const PYPLM& m_;
+    double operator()(const double& proposed_alpha) const {
+      return m_.log_likelihood(m_.d, proposed_alpha);
+    }
+  };
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    DiscountResampler dr(*this);
+    AlphaResampler ar(*this);
+    for (int iter = 0; iter < nloop; ++iter) {
+      alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(),
+                          1.0, 0.0, niterations, 100*niterations);
+    }
+    alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
+    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl;
+    for (it = p.begin(); it != p.end(); ++it) {
+      it->second.set_discount(d);
+      it->second.set_alpha(alpha);
+    }
+    backoff.resample_hyperparameters(rng, nloop, niterations);
+  }
+
   PYPLM<N-1> backoff;
+  double d, alpha;
   unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
 };
 
@@ -109,7 +163,7 @@ int main(int argc, char** argv) {
   cerr << "Reading corpus...\n";
   CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe);
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
-#define kORDER 5
+#define kORDER 3
   PYPLM<kORDER> lm(vocabe.size());
   vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));
   int mci = corpuse.size() * 99 / 100;
@@ -126,6 +180,10 @@ int main(int argc, char** argv) {
       if (SS > 0) lm.decrement(kEOS, ctx, &rng);
       lm.increment(kEOS, ctx, &rng);
     }
+    if (SS % 10 == 9) {
+      cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
+      if (SS % 20 == 19) lm.resample_hyperparameters(&rng);
+    } else { cerr << '.' << flush; }
   }
   double llh = 0;
   unsigned cnt = 0;
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-03-03 17:16:58 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-03-03 17:16:58 -0500
commit	2579dd24d3833823527e688196276c2fab381b37 (patch)
tree	df25825f29db546549fc469f912cef5a7e32c08f /gi
parent	e0507d1aa96c6b1348e6a202beb95f63d8662258 (diff)