tie hyperparameters for translation distributions; support theta < 0 for PYPLM

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-03-05 21:36:07 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-03-05 21:36:07 -0500
commit: de34b1493df93169c991a1828f951ca5abc00cae (patch)
tree: 81f691d66cf5e3c3775634a266482ea9b7163081
parent: 2048ac9943e2695a75b5f0303ca869e66ee32202 (diff)
5 files changed, 84 insertions, 32 deletions
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index ac0590e0..13a3a487 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -68,14 +68,14 @@ struct AlignedSentencePair {
 
 struct HierarchicalWordBase {
   explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
+      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
 
   void ResampleHyperparameters(MT19937* rng) {
     r.resample_hyperparameters(rng);
   }
 
   inline double logp0(const vector<WordID>& s) const {
-    return s.size() * u0;
+    return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
   }
 
   // return p0 of rule.e_
@@ -106,7 +106,7 @@ struct HierarchicalWordBase {
   void Summary() const {
     cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
     for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
+      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
   }
 
   prob_t base;
@@ -167,10 +167,9 @@ struct BasicLexicalAlignment {
   }
 
   void ResampleHyperparemeters() {
-    cerr << "  LLH_prev = " << Likelihood() << flush;
     tmodel.ResampleHyperparameters(&*prng);
     up0.ResampleHyperparameters(&*prng);
-    cerr << "\tLLH_post = " << Likelihood() << endl;
+    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
   }
 
   void ResampleCorpus();
@@ -218,7 +217,7 @@ void BasicLexicalAlignment::ResampleCorpus() {
         up0.Increment(r);
     }
   }
-  cerr << "  LLH = " << tmodel.Likelihood() << endl;
+  cerr << "  LLH = " << Likelihood() << endl;
 }
 
 void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
@@ -311,7 +310,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
     cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 10 == 0) x.ResampleHyperparemeters();
+    if (i % 7 == 6) x.ResampleHyperparemeters();
     x.ResampleCorpus();
     if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index ef73e332..8202778b 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -17,21 +17,66 @@
 template <typename ConditionalBaseMeasure>
 struct MConditionalTranslationModel {
   explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
-    rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {}
+    rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {}
 
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
       std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
       for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
+        std::cerr << "   " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl;
     }
   }
 
+  double log_likelihood(const double& dd, const double& aa) const {  // scores a (discount dd, strength aa) pair: density terms plus log_crp_prob of every entry in r
+    if (aa <= -dd) return -std::numeric_limits<double>::infinity();  // pairs with aa <= -dd are rejected outright with -infinity
+    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+    double llh = Md::log_beta_density(dd, 1, 1) +  // presumably the prior term on the discount -- TODO confirm against Md
+                 Md::log_gamma_density(dd + aa, 1, 1);  // evaluated at dd + aa (not aa alone), so the argument is positive whenever aa > -dd
+    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
+    for (it = r.begin(); it != r.end(); ++it)  // the same (dd, aa) pair is applied to every entry of r
+      llh += it->second.log_crp_prob(dd, aa);
+    return llh;
+  }
+
+  struct DiscountResampler {  // functor passed to slice_sampler1d by ResampleHyperparameters to score a candidate discount
+    DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {}
+    const MConditionalTranslationModel& m_;  // held by reference, so m_.strength is read at call time, not at construction
+    double operator()(const double& proposed_discount) const {
+      return m_.log_likelihood(proposed_discount, m_.strength);  // strength stays at the model's current value
+    }
+  };
+
+  struct AlphaResampler {  // functor passed to slice_sampler1d by ResampleHyperparameters to score a candidate strength
+    AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {}
+    const MConditionalTranslationModel& m_;  // held by reference, so m_.d is read at call time, not at construction
+    double operator()(const double& proposed_strength) const {
+      return m_.log_likelihood(m_.d, proposed_strength);  // discount stays at the model's current value
+    }
+  };
+
   void ResampleHyperparameters(MT19937* rng) {
-    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
-      it->second.resample_hyperparameters(rng);
-  } 
+    // Redraws the shared strength and discount in alternation via slice_sampler1d (scored by
+    // log_likelihood), logs the result, then writes the pair into every entry of r. rng drives the draws.
+    const unsigned nloop = 5, niterations = 10;
+    DiscountResampler dr(*this);
+    AlphaResampler ar(*this);
+    for (unsigned iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;  // a negative strength raises the discount's lower bound to -strength
+      d = slice_sampler1d(dr, d, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations);
+    }
+    strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+    std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl;
+    for (it = r.begin(); it != r.end(); ++it) {
+      // Each setter aborts if the half-updated pair is invalid, so apply the strength first when it is valid with the old discount.
+      if (strength > -it->second.discount()) it->second.set_strength(strength);
+      it->second.set_discount(d);
+      it->second.set_strength(strength);  // repeats the same value if the strength was already applied above
+    }
+  }
 
   int DecrementRule(const TRule& rule, MT19937* rng) {
     RuleModelHash::iterator it = r.find(rule.f_);
@@ -46,7 +91,7 @@ struct MConditionalTranslationModel {
   int IncrementRule(const TRule& rule, MT19937* rng) {
     RuleModelHash::iterator it = r.find(rule.f_);
     if (it == r.end()) {
-      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
+      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
     }
     p0s[0] = rp0(rule); 
     TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
@@ -66,15 +111,7 @@ struct MConditionalTranslationModel {
   }
 
   prob_t Likelihood() const {
-    prob_t p = prob_t::One();
-#if 0
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      prob_t q; q.logeq(it->second.log_crp_prob());
-      p *= q;
-      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        p *= rp0(i2->first);
-    }
-#endif
+    prob_t p; p.logeq(log_likelihood(d, strength));
     return p;
   }
 
@@ -83,6 +120,7 @@ struct MConditionalTranslationModel {
                                   MFCR<1, TRule>,
                                   boost::hash<std::vector<WordID> > > RuleModelHash;
   RuleModelHash r;
+  double d, strength;
   std::vector<prob_t> lambdas;
   mutable std::vector<prob_t> p0s;
 };
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 7ebada13..104f356b 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -18,7 +18,7 @@
 
 // I use templates to handle the recursive formalation of the prior, so
 // the order of the model has to be specified here, at compile time:
-#define kORDER 3
+#define kORDER 4
 
 using namespace std;
 using namespace tr1;
@@ -114,7 +114,7 @@ template <unsigned N> struct PYPLM {
     if (aa <= -dd) return -std::numeric_limits<double>::infinity();
     //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
     double llh = Md::log_beta_density(dd, discount_a, discount_b) +
-                 Md::log_gamma_density(aa, strength_s, strength_r);
+                 Md::log_gamma_density(aa + dd, strength_s, strength_r);
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
       llh += it->second.log_crp_prob(dd, aa);
@@ -141,12 +141,14 @@ template <unsigned N> struct PYPLM {
     DiscountResampler dr(*this);
     AlphaResampler ar(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      strength = slice_sampler1d(ar, strength, *rng, 0.0,
+      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(),
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      d = slice_sampler1d(dr, d, *rng, min_discount,
                           1.0, 0.0, niterations, 100*niterations);
     }
-    strength = slice_sampler1d(ar, strength, *rng, 0.0,
+    strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
     cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl;
diff --git a/utils/ccrp.h b/utils/ccrp.h
index e24130ac..439d7e1e 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -225,12 +225,12 @@ class CCRP {
     StrengthResampler sr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
       if (has_strength_prior()) {
-        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
+        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(),
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
         double min_discount = std::numeric_limits<double>::min();
-        if (strength_ < 0.0) min_discount = -strength_;
+        if (strength_ < 0.0) min_discount -= strength_;
         discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
                                1.0, 0.0, niterations, 100*niterations);
       }
diff --git a/utils/mfcr.h b/utils/mfcr.h
index 6cc0ebf1..886f01ef 100644
--- a/utils/mfcr.h
+++ b/utils/mfcr.h
@@ -48,7 +48,7 @@ class MFCR {
     discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
     discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
     strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { check_hyperparameters(); }
 
   MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
     num_tables_(),
@@ -58,10 +58,23 @@ class MFCR {
     discount_prior_strength_(discount_strength),
     discount_prior_beta_(discount_beta),
     strength_prior_shape_(strength_shape),
-    strength_prior_rate_(strength_rate) {}
+    strength_prior_rate_(strength_rate) { check_hyperparameters(); }
+
+  void check_hyperparameters() {  // aborts the process unless 0 <= discount_ < 1 and strength_ > -discount_
+    if (discount_ < 0.0 || discount_ >= 1.0) {  // NOTE(review): a NaN discount_ fails both comparisons and therefore passes this check
+      std::cerr << "Bad discount: " << discount_ << std::endl;
+      abort();
+    }
+    if (strength_ <= -discount_) {  // strength may be negative, but only down to (not including) -discount_
+      std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
+      abort();
+    }
+  }
 
   double discount() const { return discount_; }
   double strength() const { return strength_; }
+  void set_discount(double d) { discount_ = d; check_hyperparameters(); }  // validated immediately against the current strength_; aborts if the pair is invalid
+  void set_strength(double a) { strength_ = a; check_hyperparameters(); }  // validated immediately against the current discount_; aborts if the pair is invalid
 
   bool has_discount_prior() const {
     return !std::isnan(discount_prior_strength_);
@@ -275,7 +288,7 @@ class MFCR {
       }
       if (has_discount_prior()) {
         double min_discount = std::numeric_limits<double>::min();
-        if (strength_ < 0.0) min_discount = -strength_;
+        if (strength_ < 0.0) min_discount -= strength_;
         discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
                                1.0, 0.0, niterations, 100*niterations);
       }
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-03-05 21:36:07 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-03-05 21:36:07 -0500
commit	de34b1493df93169c991a1828f951ca5abc00cae (patch)
tree	81f691d66cf5e3c3775634a266482ea9b7163081
parent	2048ac9943e2695a75b5f0303ca869e66ee32202 (diff)