From 113317266853abff2e1c0c3e889017d0eee55c93 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 9 Mar 2012 22:23:50 -0500
Subject: moar

---
 gi/pf/pyp_tm.cc | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 gi/pf/pyp_tm.cc

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
new file mode 100644
index 00000000..94cbe7c3
--- /dev/null
+++ b/gi/pf/pyp_tm.cc
@@ -0,0 +1,113 @@
+#include "pyp_tm.h"
+
+#include <tr1/unordered_map>
+#include <iostream>
+#include <cassert>
+
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "tdict.h"
+#include "ccrp.h"
+#include "pyp_word_model.h"
+
+using namespace std;
+using namespace std::tr1;
+
+template <typename Base>
+struct ConditionalPYPWordModel {
+  ConditionalPYPWordModel(Base* b) : base(*b) {}
+
+  void Summary() const {
+    cerr << "Number of conditioning contexts: " << r.size() << endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
+      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        cerr << "   " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  }
+
+  prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
+    RuleModelHash::const_iterator it = r.find(src);
+    if (it == r.end()) {
+      return base(trglets);
+    } else {
+      return it->second.prob(trglets, base(trglets));
+    }
+  }
+
+  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    if (it == r.end())
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+    if (it->second.increment(trglets, base(trglets), rng))
+      base.Increment(trglets, rng);
+  }
+
+  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    assert(it != r.end());
+    if (it->second.decrement(trglets, rng)) {
+      base.Decrement(trglets, rng);
+      if (it->second.num_customers() == 0)
+        r.erase(it);
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+    }
+    return p;
+  }
+
+  unsigned UniqueConditioningContexts() const {
+    return r.size();
+  }
+
+  Base& base;
+  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+};
+
+PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
+                                             const unsigned num_letters) :
+    letters(lets),
+    up0(new PYPWordModel(num_letters)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    kX(-TD::Convert("X")) {}
+
+prob_t PYPLexicalTranslation::Likelihood() const {
+  prob_t p = up0->Likelihood();
+  p *= tmodel->Likelihood();
+  return p;
+}
+
+void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
+  tmodel->ResampleHyperparameters(rng);
+  up0->ResampleHyperparameters(rng);
+}
+
+unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
+  return tmodel->UniqueConditioningContexts();
+}
+
+prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
+  return tmodel->Prob(src, letters[trg]);
+}
+
+void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Increment(src, letters[trg], rng);
+}
+
+void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Decrement(src, letters[trg], rng);
+}
-- 
cgit v1.2.3
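The Increment/Decrement pair above is the standard hierarchical Pitman-Yor bookkeeping: CCRP::increment() reports whether the new customer opened a new table, and only in that case is the base distribution charged for the dish; decrement() mirrors this when the last customer at a table leaves. A toy, self-contained sketch of that mechanism follows; the class name and the uniform p0 are invented for illustration, and this is not the CCRP from utils/ccrp.h:

    // Illustrative only: a minimal CRP with Pitman-Yor discount d and
    // strength s, showing why the caller updates the base distribution
    // only when increment() reports a table change.
    #include <cstdlib>
    #include <map>
    #include <string>
    #include <vector>
    #include <iostream>

    struct MiniCRP {
      MiniCRP(double d, double s) : d_(d), s_(s), tables_(0) {}

      // Returns true iff a new table was opened (the base distribution must
      // then also be incremented, exactly as in ConditionalPYPWordModel).
      bool increment(const std::string& dish, double p0, double rnd) {
        std::vector<unsigned>& t = tables_for_[dish];
        double p_new = (s_ + d_ * tables_) * p0;     // mass for a new table
        double p_old = 0;                            // mass for existing tables
        for (size_t i = 0; i < t.size(); ++i) p_old += t[i] - d_;
        double r = rnd * (p_new + p_old);
        for (size_t i = 0; i < t.size(); ++i) {
          r -= t[i] - d_;
          if (r <= 0) { ++t[i]; return false; }      // joined an existing table
        }
        t.push_back(1); ++tables_;                   // opened a new table
        return true;
      }

      double d_, s_;
      unsigned tables_;
      std::map<std::string, std::vector<unsigned> > tables_for_;
    };

    int main() {
      MiniCRP crp(0.5, 1.0);
      double p0 = 1.0 / 1000;  // uniform base probability, for illustration
      unsigned new_tables = 0;
      for (int i = 0; i < 100; ++i)
        if (crp.increment("dog", p0, rand() / (RAND_MAX + 1.0))) ++new_tables;
      std::cout << new_tables << " of 100 increments reached the base\n";
      return 0;
    }

The same pattern explains Decrement() above: the base only forgets a spelling when CCRP::decrement() reports that its last table closed.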
From ef614a1d968aebbf463ed57876fee395b4c24635 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 9 Mar 2012 23:13:09 -0500
Subject: logging after alignment

---
 gi/pf/align-lexonly-pyp.cc | 1 +
 gi/pf/pyp_tm.cc            | 7 +++++--
 gi/pf/pyp_word_model.h     | 2 +-
 utils/ccrp.h               | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index d68a4b8f..4a1d1db6 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -208,6 +208,7 @@ int main(int argc, char** argv) {
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);
+  aligner.model.Summary();
 
   return 0;
 }

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 94cbe7c3..b5262f47 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -54,8 +54,6 @@ struct ConditionalPYPWordModel {
     assert(it != r.end());
     if (it->second.decrement(trglets, rng)) {
       base.Decrement(trglets, rng);
-      if (it->second.num_customers() == 0)
-        r.erase(it);
     }
   }
 
@@ -84,6 +82,11 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
     tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
     kX(-TD::Convert("X")) {}
 
+void PYPLexicalTranslation::Summary() const {
+  tmodel->Summary();
+  up0->Summary();
+}
+
 prob_t PYPLexicalTranslation::Likelihood() const {
   prob_t p = up0->Likelihood();
   p *= tmodel->Likelihood();

diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
index 800a4fd7..ff366865 100644
--- a/gi/pf/pyp_word_model.h
+++ b/gi/pf/pyp_word_model.h
@@ -12,7 +12,7 @@
 
 // PYP(d,s,poisson-uniform) represented as a CRP
 struct PYPWordModel {
-  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) :
+  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) :
      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}
 
   void ResampleHyperparameters(MT19937* rng);

diff --git a/utils/ccrp.h b/utils/ccrp.h
index 439d7e1e..4a8b80e7 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -221,6 +221,7 @@ class CCRP {
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
     assert(has_discount_prior() || has_strength_prior());
+    if (num_customers() == 0) return;
    DiscountResampler dr(*this);
    StrengthResampler sr(*this);
    for (int iter = 0; iter < nloop; ++iter) {
-- 
cgit v1.2.3
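Two details of this commit are worth unpacking. The ccrp.h guard is needed because the previous commit stopped erasing empty CRPs, and slice-sampling the hyperparameters of a restaurant with no customers is undefined. Separately, pyp_word_model.h's comment describes the base as PYP(d,s,poisson-uniform): a word's base probability is a Poisson length term times a uniform charge per letter (u0 = -log of the letter vocabulary size). A hedged sketch of that computation, with invented function names, showing how dropping the mean length from 7.5 to 5 shifts mass toward shorter strings:

    // Sketch of the "poisson-uniform" base distribution PYPWordModel draws
    // from; the real model lives in pyp_word_model.{h,cc}.
    #include <cmath>
    #include <iostream>

    // log Poisson(n; lambda) = n*log(lambda) - lambda - log(n!)
    double log_poisson(unsigned n, double lambda) {
      return n * log(lambda) - lambda - lgamma(n + 1);
    }

    double log_p0(unsigned num_letters, unsigned vocab_e_size, double mean_len) {
      double u0 = -log(vocab_e_size);  // log probability of each letter
      return log_poisson(num_letters, mean_len) + num_letters * u0;
    }

    int main() {
      // Compare the old (7.5) and new (5) mean lengths from the patch above:
      for (unsigned n = 1; n <= 10; ++n)
        std::cout << n << '\t' << log_p0(n, 26, 7.5)
                  << '\t' << log_p0(n, 26, 5.0) << '\n';
      return 0;
    }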
From de136247bdedb960dc0f317cd65b28c02a441532 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 01:08:23 -0500
Subject: tie params

---
 gi/pf/pyp_lm.cc        | 66 +++++++------------------------------
 gi/pf/pyp_tm.cc        |  2 ++
 gi/pf/tied_resampler.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 51 deletions(-)
 create mode 100644 gi/pf/tied_resampler.h

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 52e6be2c..85635b8f 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -11,6 +11,7 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "ccrp.h"
+#include "tied_resampler.h"
 
 // A not very memory-efficient implementation of an N-gram LM based on PYPs
 // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
@@ -66,7 +67,7 @@ template<> struct PYPLM<0> {
   void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
   void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
   double prob(WordID, const vector<WordID>&) const { return p0; }
-  void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {}
+  void resample_hyperparameters(MT19937*) {}
   double log_likelihood() const { return draws * log(p0); }
   const double p0;
   int draws;
@@ -76,16 +77,17 @@ template <unsigned N> struct PYPLM {
   PYPLM(unsigned vs, double da, double db, double ss, double sr) :
       backoff(vs, da, db, ss, sr),
-      discount_a(da), discount_b(db),
-      strength_s(ss), strength_r(sr),
-      d(0.8), strength(1.0), lookup(N-1) {}
+      tr(da, db, ss, sr, 0.8, 1.0),
+      lookup(N-1) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
-    if (it == p.end())
-      it = p.insert(make_pair(lookup, CCRP<WordID>(d,strength))).first;
+    if (it == p.end()) {
+      it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first;
+      tr.Add(&it->second);  // add to resampler
+    }
     if (it->second.increment(w, bo, rng))
       backoff.increment(w, context, rng);
   }
@@ -107,59 +109,21 @@ template <unsigned N> struct PYPLM {
   }
 
   double log_likelihood() const {
-    return log_likelihood(d, strength) + backoff.log_likelihood();
-  }
-
-  double log_likelihood(const double& dd, const double& aa) const {
-    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
-    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
-    double llh = Md::log_beta_density(dd, discount_a, discount_b) +
-                 Md::log_gamma_density(aa + dd, strength_s, strength_r);
+    double llh = backoff.log_likelihood();
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
-      llh += it->second.log_crp_prob(dd, aa);
+      llh += it->second.log_crp_prob();
+    // TODO parametric likelihood from TiedResampler
     return llh;
   }
 
-  struct DiscountResampler {
-    DiscountResampler(const PYPLM& m) : m_(m) {}
-    const PYPLM& m_;
-    double operator()(const double& proposed_discount) const {
-      return m_.log_likelihood(proposed_discount, m_.strength);
-    }
-  };
-
-  struct AlphaResampler {
-    AlphaResampler(const PYPLM& m) : m_(m) {}
-    const PYPLM& m_;
-    double operator()(const double& proposed_strength) const {
-      return m_.log_likelihood(m_.d, proposed_strength);
-    }
-  };
-
-  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    DiscountResampler dr(*this);
-    AlphaResampler ar(*this);
-    for (int iter = 0; iter < nloop; ++iter) {
-      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                                 std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      double min_discount = std::numeric_limits<double>::min();
-      if (strength < 0.0) min_discount -= strength;
-      d = slice_sampler1d(dr, d, *rng, min_discount,
-                          1.0, 0.0, niterations, 100*niterations);
-    }
-    strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
-    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl;
-    for (it = p.begin(); it != p.end(); ++it) {
-      it->second.set_discount(d);
-      it->second.set_strength(strength);
-    }
-    backoff.resample_hyperparameters(rng, nloop, niterations);
+  void resample_hyperparameters(MT19937* rng) {
+    tr.ResampleHyperparameters(rng);
+    backoff.resample_hyperparameters(rng);
   }
 
   PYPLM<N-1> backoff;
+  TiedResampler<CCRP<WordID> > tr;
   double discount_a, discount_b, strength_s, strength_r;
   double d, strength;
   mutable vector<WordID> lookup;  // thread-local

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index b5262f47..73104fe9 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -11,6 +11,8 @@
 #include "ccrp.h"
 #include "pyp_word_model.h"
 
+#include "tied_resampler.h"
+
 using namespace std;
 using namespace std::tr1;

diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
new file mode 100644
index 00000000..208fb9c7
--- /dev/null
+++ b/gi/pf/tied_resampler.h
@@ -0,0 +1,82 @@
+#ifndef _TIED_RESAMPLER_H_
+#define _TIED_RESAMPLER_H_
+
+#include <set>
+#include "sampler.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+template <class CRP>
+struct TiedResampler {
+  explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) :
+      d_alpha(da),
+      d_beta(db),
+      s_shape(ss),
+      s_rate(sr),
+      discount(d),
+      strength(s) {}
+
+  void Add(CRP* crp) {
+    crps.insert(crp);
+    crp->set_discount(discount);
+    crp->set_strength(strength);
+    assert(!crp->has_discount_prior());
+    assert(!crp->has_strength_prior());
+  }
+
+  void Remove(CRP* crp) {
+    crps.erase(crp);
+  }
+
+  double LogLikelihood(double d, double s) const {
+    if (s <= -d) return -std::numeric_limits<double>::infinity();
+    double llh = Md::log_beta_density(d, d_alpha, d_beta) +
+                 Md::log_gamma_density(d + s, s_shape, s_rate);
+    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it)
+      llh += (*it)->log_crp_prob(d, s);
+    return llh;
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const TiedResampler& m) : m_(m) {}
+    const TiedResampler& m_;
+    double operator()(const double& proposed_discount) const {
+      return m_.LogLikelihood(proposed_discount, m_.strength);
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const TiedResampler& m) : m_(m) {}
+    const TiedResampler& m_;
+    double operator()(const double& proposed_strength) const {
+      return m_.LogLikelihood(m_.discount, proposed_strength);
+    }
+  };
+
+  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    const DiscountResampler dr(*this);
+    const AlphaResampler ar(*this);
+    for (int iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+                                 std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      discount = slice_sampler1d(dr, discount, *rng, min_discount,
+                                 1.0, 0.0, niterations, 100*niterations);
+    }
+    strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    std::cerr << "TiedCRPs(d=" << discount << ",s="
+              << strength << ") = " << LogLikelihood(discount, strength) << std::endl;
+    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) {
+      (*it)->set_discount(discount);
+      (*it)->set_strength(strength);
+    }
+  }
+ private:
+  std::set<CRP*> crps;
+  const double d_alpha, d_beta, s_shape, s_rate;
+  double discount, strength;
+};
+
+#endif
-- 
cgit v1.2.3
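TiedResampler generalizes the per-model DiscountResampler/AlphaResampler logic deleted from pyp_lm.cc: every CRP handed to Add() is snapped to one shared (discount, strength) pair, and ResampleHyperparameters() slice-samples that pair under Beta and Gamma priors times the product of all member CRPs' seating probabilities. A hypothetical usage sketch against the headers in this commit; it assumes the two-argument CCRP constructor seen above and is not a program from the repository:

    // Hypothetical caller: two CRPs forced to share hyperparameters.
    #include "ccrp.h"
    #include "sampler.h"
    #include "tied_resampler.h"

    void example(MT19937* rng) {
      // Beta(1,1) prior on discount, Gamma(1,1) on discount+strength,
      // initialized at d=0.5, s=1.0.
      TiedResampler<CCRP<int> > tr(1, 1, 1, 1, 0.5, 1.0);
      CCRP<int> a(0.5, 1.0), b(0.5, 1.0);
      tr.Add(&a);                       // a and b now share (d, s)
      tr.Add(&b);
      // ... seat customers in a and b during sampling ...
      tr.ResampleHyperparameters(rng);  // one slice-sampling pass moves both
    }

Tying is what makes hyperparameter inference feasible here: most conditioning contexts host only a handful of customers, far too few to estimate a discount and strength per CRP.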
From 38f28be7cd2bada87ebad78994e3c938e10c2cce Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 12:56:15 -0500
Subject: ready to infer alignment parameters

---
 gi/pf/Makefile.am          |   4 +-
 gi/pf/align-lexonly-pyp.cc |  22 ++-
 gi/pf/align-lexonly.cc     | 332 ---------------------------------------------
 gi/pf/pyp_tm.cc            |   6 +-
 gi/pf/quasi_model2.h       | 115 ++++++++++++----
 gi/pf/tied_resampler.h     |  31 +++++
 6 files changed, 143 insertions(+), 367 deletions(-)
 delete mode 100644 gi/pf/align-lexonly.cc

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 4ce72ba1..f9c979d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
 
 noinst_LIBRARIES = libpf.a
 
@@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor
 nuisance_test_SOURCES = nuisance_test.cc
 nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
-align_lexonly_SOURCES = align-lexonly.cc
-
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 0c90b6ce..68cb9192 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -61,15 +61,15 @@ struct AlignedSentencePair {
 
 struct Aligner {
   Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
       corpus(*c),
+      paj_model(4, 0.08),
       model(lets, num_letters),
-      paj(4, 0.08),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
   }
 
   vector<AlignedSentencePair>& corpus;
+  QuasiModel2 paj_model;
   PYPLexicalTranslation model;
-  const QuasiModel2 paj;
   const WordID kNULL;
 
@@ -86,10 +86,12 @@ struct Aligner {
         a_j = prng->next() * (1 + asp.src.size());
         const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, asp.trg[j], &*prng);
-        // TODO factor in alignment prob
+        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl;
+    cerr << "Corpus intialized randomly." << endl;
+    cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
+         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   void ResampleCorpus() {
@@ -101,19 +103,25 @@ struct Aligner {
         const WordID e_j = asp.trg[j];
         WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Decrement(f_a_j, e_j, prng);
+        paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size());
 
         for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
           const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
           ss[prop_a_j] = model.Prob(prop_f, e_j);
-          // TODO configurable
-          ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size());
+          ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size());
         }
         a_j = prng->SelectSample(ss);
         f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, e_j, prng);
+        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl;
+    cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
+         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
+  }
+
+  prob_t Likelihood() const {
+    return model.Likelihood() * paj_model.Likelihood();
   }
 };
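In the resampling loop above, each alignment link a_j is Gibbs-sampled from a categorical distribution whose unnormalized weight for candidate position prop_a_j is the translation probability times the alignment prior, with index 0 reserved for NULL; prng->SelectSample(ss) performs the normalized draw. A hedged, self-contained version of that draw primitive (the repository's SampleSet/MT19937 machinery does the same job):

    #include <cstdlib>
    #include <vector>

    // Draws index i with probability weights[i] / sum(weights); index 0 can
    // stand for the NULL alignment, as in the sampler above.
    unsigned select_sample(const std::vector<double>& weights) {
      double total = 0;
      for (size_t i = 0; i < weights.size(); ++i) total += weights[i];
      double r = total * (rand() / (RAND_MAX + 1.0));
      for (size_t i = 0; i < weights.size(); ++i) {
        r -= weights[i];
        if (r <= 0) return i;
      }
      return weights.size() - 1;  // guard against floating-point round-off
    }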
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
deleted file mode 100644
index dbc9dc07..00000000
--- a/gi/pf/align-lexonly.cc
+++ /dev/null
@@ -1,332 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "corpus.h"
-#include "ngram_base.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read parallel data from")
-        ("random_seed,S",po::value<uint32_t>(), "Random seed");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || (conf->count("input") == 0)) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-shared_ptr<MT19937> prng;
-
-struct LexicalAlignment {
-  unsigned char src_index;
-  bool is_transliteration;
-  vector<pair<short, short> > derivation;
-};
-
-struct AlignedSentencePair {
-  vector<WordID> src;
-  vector<WordID> trg;
-  vector<LexicalAlignment> a;
-  Array2D<short> posterior;
-};
-
-struct HierarchicalWordBase {
-  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {}
-
-  void ResampleHyperparameters(MT19937* rng) {
-    r.resample_hyperparameters(rng);
-  }
-
-  inline double logp0(const vector<WordID>& s) const {
-    return s.size() * u0;
-  }
-
-  // return p0 of rule.e_
-  prob_t operator()(const TRule& rule) const {
-    prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_)));
-    return p;
-  }
-
-  void Increment(const TRule& rule) {
-    if (r.increment(rule.e_)) {
-      prob_t p; p.logeq(logp0(rule.e_));
-      base *= p;
-    }
-  }
-
-  void Decrement(const TRule& rule) {
-    if (r.decrement(rule.e_)) {
-      prob_t p; p.logeq(logp0(rule.e_));
-      base /= p;
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(r.log_crp_prob());
-    p *= base;
-    return p;
-  }
-
-  void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl;
-    for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
-  }
-
-  prob_t base;
-  CCRP_NoTable<vector<WordID> > r;
-  const double u0;
-};
-
-struct BasicLexicalAlignment {
-  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
-                                 const unsigned words_e,
-                                 const unsigned letters_e,
-                                 vector<AlignedSentencePair>* corp) :
-      letters(lets),
-      corpus(*corp),
-      up0("fr-en.10k.translit-base.txt.gz"),
-      //up0(words_e),
-      //up0("en.chars.1gram", letters_e),
-      //up0("en.words.1gram"),
-      //up0(letters_e),
-      //up0("en.chars.2gram"),
-      tmodel(up0) {
-  }
-
-  void InstantiateRule(const WordID src,
-                       const WordID trg,
-                       TRule* rule) const {
-    static const WordID kX = TD::Convert("X") * -1;
-    rule->lhs_ = kX;
-    rule->e_ = letters[trg];
-    rule->f_ = letters[src];
-  }
-
-  void InitializeRandom() {
-    const WordID kNULL = TD::Convert("NULL");
-    cerr << "Initializing with random alignments ...\n";
-    for (unsigned i = 0; i < corpus.size(); ++i) {
-      AlignedSentencePair& asp = corpus[i];
-      asp.a.resize(asp.trg.size());
-      for (unsigned j = 0; j < asp.trg.size(); ++j) {
-        const unsigned char a_j = prng->next() * (1 + asp.src.size());
-        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-        TRule r;
-        InstantiateRule(f_a_j, asp.trg[j], &r);
-        asp.a[j].is_transliteration = false;
-        asp.a[j].src_index = a_j;
-        if (tmodel.IncrementRule(r))
-          up0.Increment(r);
-      }
-    }
-    cerr << "  LLH = " << Likelihood() << endl;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = tmodel.Likelihood();
-    p *= up0.Likelihood();
-    return p;
-  }
-
-  void ResampleHyperparemeters() {
-    cerr << "  LLH_prev = " << Likelihood() << flush;
-    tmodel.ResampleHyperparameters(&*prng);
-    up0.ResampleHyperparameters(&*prng);
-    cerr << "\tLLH_post = " << Likelihood() << endl;
-  }
-
-  void ResampleCorpus();
-
-  const vector<vector<WordID> >& letters;  // spelling dictionary
-  vector<AlignedSentencePair>& corpus;
-  //PhraseConditionalUninformativeBase up0;
-  //PhraseConditionalUninformativeUnigramBase up0;
-  //UnigramWordBase up0;
-  //HierarchicalUnigramBase up0;
-  TableLookupBase up0;
-  //HierarchicalWordBase up0;
-  //PoissonUniformUninformativeBase up0;
-  //CompletelyUniformBase up0;
-  //FixedNgramBase up0;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  ConditionalTranslationModel<TableLookupBase> tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
-  static const WordID kNULL = TD::Convert("NULL");
-  for (unsigned i = 0; i < corpus.size(); ++i) {
-    AlignedSentencePair& asp = corpus[i];
-    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
-    for (unsigned j = 0; j < asp.trg.size(); ++j) {
-      TRule r;
-      unsigned char& a_j = asp.a[j].src_index;
-      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.DecrementRule(r))
-        up0.Decrement(r);
-
-      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
-        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
-        InstantiateRule(prop_f, asp.trg[j], &r);
-        ss[prop_a_j] = tmodel.RuleProbability(r);
-      }
-      a_j = prng->SelectSample(ss);
-      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.IncrementRule(r))
-        up0.Increment(r);
-    }
-  }
-  cerr << "  LLH = " << tmodel.Likelihood() << endl;
-}
-
-void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
-  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
-    if (*it >= l->size()) { l->resize(*it + 1); }
-    vector<WordID>& letters = (*l)[*it];
-    if (letters.size()) continue;   // if e and f have the same word
-
-    const string& w = TD::Convert(*it);
-
-    size_t cur = 0;
-    while (cur < w.size()) {
-      const size_t len = UTF8Len(w[cur]);
-      letters.push_back(TD::Convert(w.substr(cur, len)));
-      if (letset) letset->insert(letters.back());
-      cur += len;
-    }
-  }
-}
-
-void Debug(const AlignedSentencePair& asp) {
-  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
-  Array2D<bool> a(asp.src.size(), asp.trg.size());
-  for (unsigned j = 0; j < asp.trg.size(); ++j)
-    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
-  cerr << a << endl;
-}
-
-void AddSample(AlignedSentencePair* asp) {
-  for (unsigned j = 0; j < asp->trg.size(); ++j)
-    asp->posterior(asp->a[j].src_index, j)++;
-}
-
-void WriteAlignments(const AlignedSentencePair& asp) {
-  bool first = true;
-  for (unsigned j = 0; j < asp.trg.size(); ++j) {
-    int src_index = -1;
-    int mc = -1;
-    for (unsigned i = 0; i <= asp.src.size(); ++i) {
-      if (asp.posterior(i, j) > mc) {
-        mc = asp.posterior(i, j);
-        src_index = i;
-      }
-    }
-
-    if (src_index) {
-      if (first) first = false; else cout << ' ';
-      cout << (src_index - 1) << '-' << j;
-    }
-  }
-  cout << endl;
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-//  MT19937& rng = *prng;
-
-  vector<vector<WordID> > corpuse, corpusf;
-  set<WordID> vocabe, vocabf;
-  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
-  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
-  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
-  assert(corpusf.size() == corpuse.size());
-
-  vector<AlignedSentencePair> corpus(corpuse.size());
-  for (unsigned i = 0; i < corpuse.size(); ++i) {
-    corpus[i].src.swap(corpusf[i]);
-    corpus[i].trg.swap(corpuse[i]);
-    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
-  }
-  corpusf.clear(); corpuse.clear();
-
-  vocabf.insert(TD::Convert("NULL"));
-  vector<vector<WordID> > letters(TD::NumWords());
-  set<WordID> letset;
-  ExtractLetters(vocabe, &letters, &letset);
-  ExtractLetters(vocabf, &letters, NULL);
-  letters[TD::Convert("NULL")].clear();
-
-  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
-  x.InitializeRandom();
-  const unsigned samples = conf["samples"].as<unsigned>();
-  for (int i = 0; i < samples; ++i) {
-    for (int j = 395; j < 397; ++j) Debug(corpus[j]);
-    cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 10 == 0) x.ResampleHyperparemeters();
-    x.ResampleCorpus();
-    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
-  }
-  for (unsigned i = 0; i < corpus.size(); ++i)
-    WriteAlignments(corpus[i]);
-  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
-  x.tmodel.Summary();
-  x.up0.Summary();
-
-  //posterior.Sample();
-
-  return 0;
-}
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 73104fe9..bf5a6497 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -10,7 +10,6 @@
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
-
 #include "tied_resampler.h"
 
 using namespace std;
@@ -18,7 +17,7 @@ using namespace std::tr1;
 
 template <typename Base>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b) {}
+  ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -32,6 +31,7 @@ struct ConditionalPYPWordModel {
   void ResampleHyperparameters(MT19937* rng) {
     for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
       it->second.resample_hyperparameters(rng);
+    btr.ResampleHyperparameters(rng);
   }
 
   prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
@@ -72,7 +72,9 @@ struct ConditionalPYPWordModel {
     return r.size();
   }
 
+  // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
 };

diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
index 0095289f..8ec0a400 100644
--- a/gi/pf/quasi_model2.h
+++ b/gi/pf/quasi_model2.h
@@ -3,44 +3,113 @@
 
 #include <vector>
 #include <cmath>
+#include <tr1/unordered_map>
+#include "boost/functional.hpp"
 #include "prob.h"
 #include "array2d.h"
 
+struct AlignmentObservation {
+  AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
+  AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) :
+      src_len(sl), trg_len(tl), j(tw), a_j(sw) {}
+  unsigned short src_len;
+  unsigned short trg_len;
+  unsigned short j;
+  unsigned short a_j;
+};
+
+inline size_t hash_value(const AlignmentObservation& o) {
+  return reinterpret_cast<const size_t&>(o);
+}
+
+inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) {
+  return hash_value(a) == hash_value(b);
+}
+
 struct QuasiModel2 {
   explicit QuasiModel2(double alpha, double pnull = 0.1) :
       alpha_(alpha),
       pnull_(pnull),
-      pnotnull_(1 - pnull),
-      z_(1000,1000) {}
+      pnotnull_(1 - pnull) {}
+
   // a_j = 0 => NULL; src_len does *not* include null
-  prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
+  prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
     if (!a_j) return pnull_;
-    std::vector<prob_t>& zv = z_(src_len, trg_len);
-    if (zv.size() == 0)
-      zv.resize(trg_len);
-
-    prob_t& z = zv[j];
-    if (z.is_0()) z = ComputeZ(j, src_len, trg_len);
-
-    prob_t p;
-    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
-    p *= pnotnull_;
-    p /= z;
+    return pnotnull_ *
+        prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len));
+  }
+
+  void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
+    assert(a_j <= src_len);
+    assert(j < trg_len);
+    ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)];
+  }
+
+  void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
+    const AlignmentObservation ao(src_len, trg_len, j, a_j);
+    int& cc = obs_[ao];
+    assert(cc > 0);
+    --cc;
+    if (!cc) obs_.erase(ao);
+  }
+
+  prob_t Likelihood() const {
+    return Likelihood(alpha_, pnull_.as_float());
+  }
+
+  prob_t Likelihood(double alpha, double ppnull) const {
+    const prob_t pnull(ppnull);
+    const prob_t pnotnull(1 - ppnull);
+
+    prob_t p = prob_t::One();
+    for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
+      const AlignmentObservation& ao = it->first;
+      if (ao.a_j) {
+        double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+        double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t pa(u / z);
+        pa *= pnotnull;
+        pa.poweq(it->second);
+        p *= pa;
+      } else {
+        p *= pnull.pow(it->second);
+      }
+    }
     return p;
   }
+
  private:
-  prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
-    prob_t p, z = prob_t::Zero();
-    for (int a_j = 1; a_j <= src_len; ++a_j) {
-      p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
-      z += p;
-    }
+  static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+  }
+
+  static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    double z = 0;
+    for (int a_j = 1; a_j <= src_len; ++a_j)
+      z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha);
     return z;
   }
+
+  const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
+    if (src_len >= zcache_.size())
+      zcache_.resize(src_len + 1);
+    if (trg_len >= zcache_[src_len].size())
+      zcache_[src_len].resize(trg_len + 1);
+    std::vector<double>& zv = zcache_[src_len][trg_len];
+    if (zv.size() == 0)
+      zv.resize(trg_len);
+    double& z = zv[j];
+    if (!z)
+      z = ComputeZ(j, src_len, trg_len, alpha_);
+    return z;
+  }
+
   double alpha_;
-  const prob_t pnull_;
-  const prob_t pnotnull_;
-  mutable Array2D<std::vector<prob_t> > z_;
+  prob_t pnull_;
+  prob_t pnotnull_;
+  mutable std::vector<std::vector<std::vector<double> > > zcache_;
+  typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount;
+  ObsCount obs_;
 };
 
 #endif
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
index 208fb9c7..5a262f9d 100644
--- a/gi/pf/tied_resampler.h
+++ b/gi/pf/tied_resampler.h
@@ -2,6 +2,7 @@
 #define _TIED_RESAMPLER_H_
 
 #include <set>
+#include <vector>
 #include "sampler.h"
 #include "slice_sampler.h"
 #include "m.h"
@@ -28,6 +29,10 @@ struct TiedResampler {
     crps.erase(crp);
   }
 
+  size_t size() const {
+    return crps.size();
+  }
+
   double LogLikelihood(double d, double s) const {
     if (s <= -d) return -std::numeric_limits<double>::infinity();
     double llh = Md::log_beta_density(d, d_alpha, d_beta) +
@@ -54,6 +59,7 @@ struct TiedResampler {
   };
 
   void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; }
     const DiscountResampler dr(*this);
     const AlphaResampler ar(*this);
     for (int iter = 0; iter < nloop; ++iter) {
@@ -79,4 +85,29 @@ struct TiedResampler {
   double discount, strength;
 };
 
+// split according to some criterion
+template <class CRP>
+struct BinTiedResampler {
+  explicit BinTiedResampler(unsigned nbins) :
+      resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {}
+
+  void Add(unsigned bin, CRP* crp) {
+    resamplers[bin].Add(crp);
+  }
+
+  void Remove(unsigned bin, CRP* crp) {
+    resamplers[bin].Remove(crp);
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (unsigned i = 0; i < resamplers.size(); ++i) {
+      std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush;
+      resamplers[i].ResampleHyperparameters(rng);
+    }
+  }
+
+ private:
+  std::vector<TiedResampler<CRP> > resamplers;
+};
+
 #endif
-- 
cgit v1.2.3
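The refactored QuasiModel2 also records sufficient statistics (counts of AlignmentObservation tuples) rather than only answering queries, which is what makes Likelihood(alpha, p_null) computable and sets up the hyperparameter inference in the next commit. The distribution itself is unchanged; a standalone sketch of what Prob() returns for a toy sentence pair, using the alpha = 4, p_null = 0.08 values wired into the aligner above:

    // p(a_j = 0) = p_null; for a_j in 1..src_len,
    // p(a_j) ∝ (1 - p_null) * exp(-alpha * |(a_j-1)/src_len - j/trg_len|),
    // mirroring UnnormalizedProb/ComputeZ above, without the Z cache.
    #include <cmath>
    #include <cstdio>

    double unnormalized(unsigned a_j, unsigned j, unsigned src_len,
                        unsigned trg_len, double alpha) {
      return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
    }

    int main() {
      const double alpha = 4.0, p_null = 0.08;
      const unsigned src_len = 5, trg_len = 6, j = 2;  // toy sentence pair
      double z = 0;
      for (unsigned a = 1; a <= src_len; ++a)
        z += unnormalized(a, j, src_len, trg_len, alpha);
      printf("p(a_j=0) = %g\n", p_null);
      for (unsigned a = 1; a <= src_len; ++a)
        printf("p(a_j=%u) = %g\n", a,
               (1 - p_null) * unnormalized(a, j, src_len, trg_len, alpha) / z);
      return 0;
    }

Positions near the diagonal (a_j - 1)/src_len ≈ j/trg_len get the most mass, and larger alpha sharpens the peak.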
From 289f96779e665ba24adca3461a624c68aa37bd99 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 14:10:04 -0500
Subject: do Bayesian inference on quasimodel2 hyperparameters

---
 gi/pf/align-lexonly-pyp.cc |  5 ++--
 gi/pf/pyp_lm.cc            |  2 +-
 gi/pf/pyp_tm.cc            | 11 +++++----
 gi/pf/quasi_model2.h       | 57 +++++++++++++++++++++++++++++++++++++++++++---
 gi/pf/tied_resampler.h     | 11 +++++++++
 5 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 68cb9192..6c054753 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -74,6 +74,7 @@ struct Aligner {
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
+    paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -216,9 +217,9 @@ int main(int argc, char** argv) {
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 7 == 6) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) aligner.ResampleHyperparameters();
     aligner.ResampleCorpus();
-    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+    if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 85635b8f..91029688 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -113,7 +113,7 @@ template <unsigned N> struct PYPLM {
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
       llh += it->second.log_crp_prob();
-    // TODO parametric likelihood from TiedResampler
+    llh += tr.LogLikelihood();
     return llh;
   }

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index bf5a6497..34ef0ba2 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -17,7 +17,7 @@ using namespace std::tr1;
 
 template <typename Base>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {}
+  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -29,8 +29,6 @@ struct ConditionalPYPWordModel {
   }
 
   void ResampleHyperparameters(MT19937* rng) {
-    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
-      it->second.resample_hyperparameters(rng);
     btr.ResampleHyperparameters(rng);
   }
 
@@ -45,8 +43,11 @@ struct ConditionalPYPWordModel {
 
   void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
     RuleModelHash::iterator it = r.find(src);
-    if (it == r.end())
-      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+    if (it == r.end()) {
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
+      static const WordID kNULL = TD::Convert("NULL");
+      btr.Add(src == kNULL ? 0 : 1, &it->second);
+    }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
   }
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
index 8ec0a400..588c8f84 100644
--- a/gi/pf/quasi_model2.h
+++ b/gi/pf/quasi_model2.h
@@ -7,6 +7,8 @@
 #include "boost/functional.hpp"
 #include "prob.h"
 #include "array2d.h"
+#include "slice_sampler.h"
+#include "m.h"
 
 struct AlignmentObservation {
   AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
@@ -53,6 +55,37 @@ struct QuasiModel2 {
     if (!cc) obs_.erase(ao);
   }
 
+  struct PNullResampler {
+    PNullResampler(const QuasiModel2& m) : m_(m) {}
+    const QuasiModel2& m_;
+    double operator()(const double& proposed_pnull) const {
+      return log(m_.Likelihood(m_.alpha_, proposed_pnull));
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const QuasiModel2& m) : m_(m) {}
+    const QuasiModel2& m_;
+    double operator()(const double& proposed_alpha) const {
+      return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float()));
+    }
+  };
+
+  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    const PNullResampler dr(*this);
+    const AlphaResampler ar(*this);
+    for (unsigned i = 0; i < nloop; ++i) {
+      double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001,
+                                     1.0, 0.0, niterations, 100*niterations);
+      pnull_ = prob_t(pnull);
+      alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001,
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    }
+    std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null="
+              << pnull_.as_float() << ") = " << Likelihood() << std::endl;
+    zcache_.clear();
+  }
+
   prob_t Likelihood() const {
     return Likelihood(alpha_, pnull_.as_float());
   }
@@ -61,12 +94,17 @@ struct QuasiModel2 {
     const prob_t pnull(ppnull);
     const prob_t pnotnull(1 - ppnull);
 
-    prob_t p = prob_t::One();
+    prob_t p;
+    p.logeq(Md::log_gamma_density(alpha, 0.1, 25));  // TODO configure
+    assert(!p.is_0());
+    prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10));
+    assert(!prob_of_ppnull.is_0());
+    p *= prob_of_ppnull;
     for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
       const AlignmentObservation& ao = it->first;
       if (ao.a_j) {
-        double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
-        double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
         prob_t pa(u / z);
         pa *= pnotnull;
         pa.poweq(it->second);
@@ -79,6 +117,19 @@ struct QuasiModel2 {
   }
 
  private:
+  static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    prob_t p;
+    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+    return p;
+  }
+
+  static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    prob_t z = prob_t::Zero();
+    for (int a_j = 1; a_j <= src_len; ++a_j)
+      z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha);
+    return z;
+  }
+
   static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
     return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
   }

diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
index 5a262f9d..6f45fbce 100644
--- a/gi/pf/tied_resampler.h
+++ b/gi/pf/tied_resampler.h
@@ -42,6 +42,10 @@ struct TiedResampler {
     return llh;
   }
 
+  double LogLikelihood() const {
+    return LogLikelihood(discount, strength);
+  }
+
   struct DiscountResampler {
     DiscountResampler(const TiedResampler& m) : m_(m) {}
     const TiedResampler& m_;
@@ -106,6 +110,13 @@ struct BinTiedResampler {
     }
   }
 
+  double LogLikelihood() const {
+    double llh = 0;
+    for (unsigned i = 0; i < resamplers.size(); ++i)
+      llh += resamplers[i].LogLikelihood();
+    return llh;
+  }
+
  private:
   std::vector<TiedResampler<CRP> > resamplers;
 };
-- 
cgit v1.2.3
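Every hyperparameter move in this commit, alpha and p_null here and the tied (discount, strength) pairs elsewhere, goes through slice_sampler1d. For reference, a compact sketch of one-dimensional slice sampling with stepping-out and shrinkage in the style of Neal (2003); this illustrates the algorithm and is not the repository's slice_sampler.h:

    #include <cmath>
    #include <cstdlib>

    double uniform01() { return (rand() + 0.5) / (RAND_MAX + 1.0); }

    // log_f: log density up to a constant; x0: current value; w: step size;
    // lower/upper: hard support bounds (e.g. discount in (0,1)).
    template <class LogF>
    double slice_sample_1d(const LogF& log_f, double x0, double w,
                           double lower, double upper) {
      const double log_y = log_f(x0) + log(uniform01());  // slice height
      double left = x0 - w * uniform01();                 // random window
      double right = left + w;
      while (left > lower && log_f(left) > log_y) left -= w;    // step out
      while (right < upper && log_f(right) > log_y) right += w;
      if (left < lower) left = lower;
      if (right > upper) right = upper;
      for (;;) {                                          // shrink to accept
        double x1 = left + uniform01() * (right - left);
        if (log_f(x1) > log_y) return x1;
        if (x1 < x0) left = x1; else right = x1;
      }
    }

Because the move only needs the log density up to a constant, the Likelihood(alpha, ppnull) function above (prior densities times the observation terms) is exactly the right callable to hand it.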
From dfbc278c1057555fda9312291c8024049e00b7d8 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 16:42:12 -0500
Subject: frequency-based binning

---
 decoder/Makefile.am        |  1 -
 decoder/ff_csplit.cc       |  2 +-
 decoder/freqdict.cc        | 29 -----------------------------
 decoder/freqdict.h         | 37 ++++++++++++++++++++++++++++++-----
 gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++-------
 gi/pf/make-freq-bins.pl    | 26 ++++++++++++++++++++++++++
 gi/pf/pyp_tm.cc            | 24 +++++++++++++++++-------
 gi/pf/pyp_tm.h             |  7 ++++---
 8 files changed, 97 insertions(+), 53 deletions(-)
 delete mode 100644 decoder/freqdict.cc
 create mode 100755 gi/pf/make-freq-bins.pl

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
   ff_source_syntax.cc \
   ff_bleu.cc \
   ff_factory.cc \
-  freqdict.cc \
   lexalign.cc \
   lextrans.cc \
   tagger.cc \

diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
   const int fl1_;
   const int fl2_;
   const int bad_;
-  FreqDict freq_dict_;
+  FreqDict<float> freq_dict_;
   set<WordID> bad_words_;
 };

diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <map>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
-  cerr << "Reading word frequencies: " << fname << endl;
-  ReadFile rf(fname);
-  istream& ifs = *rf.stream();
-  int cc=0;
-  while (ifs) {
-    std::string word;
-    ifs >> word;
-    if (word.size() == 0) continue;
-    if (word[0] == '#') continue;
-    double count = 0;
-    ifs >> count;
-    assert(count > 0.0);  // use -log(f)
-    counts_[TD::Convert(word)]=count;
-    ++cc;
-    if (cc % 10000 == 0) { std::cerr << "."; }
-  }
-  std::cerr << "\n";
-  std::cerr << "Loaded " << cc << " words\n";
-}

diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+template <typename T>
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID, float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}
+  T Max() const { return max_; }
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    while (ifs) {
+      ifs >> word;
+      if (word.size() == 0) continue;
+      if (word[0] == '#') continue;
+      T count = 0;
+      ifs >> count;
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID, T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;
+  std::map<WordID, T> counts_;
 };
 
 #endif
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
         ("input,i",po::value<string>(),"Read parallel data from")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
 };
 
 struct Aligner {
-  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+  Aligner(const vector<vector<WordID> >& lets,
+          int num_letters,
+          const po::variables_map& conf,
+          vector<AlignedSentencePair>* c) :
       corpus(*c),
-      paj_model(4, 0.08),
+      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+      infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
       model(lets, num_letters),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
 
   vector<AlignedSentencePair>& corpus;
   QuasiModel2 paj_model;
+  const bool infer_paj;
   PYPLexicalTranslation model;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
-    paj_model.ResampleHyperparameters(prng);
+    if (infer_paj) paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
         paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
-         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Aligner aligner(letters, letset.size(), &corpus);
+  Aligner aligner(letters, letset.size(), conf, &corpus);
   aligner.InitializeRandom();
 
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 10 == 9) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) {
+      aligner.ResampleHyperparameters();
+      cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood()
+           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+    }
     aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }

diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $BASE = 6;
+my $CUTOFF = 3;
+
+my %d;
+my $num = 0;
+while(<>){
+ chomp;
+ my @words = split /\s+/;
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+
+my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
+
+for (my $i=0; $i
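make-freq-bins.pl is truncated above after the frequency sort; of the binning logic, only $BASE = 6 and $CUTOFF = 3 survive. Given how the output is consumed on the C++ side (FreqBinner loads a "word bin" table through FreqDict<unsigned>, and BinTiedResampler is sized from Max() + 1), the lost loop evidently prints one logarithmic frequency bin per word. A hedged C++ rendering of such a rule; the exact Perl formula is an assumption:

    #include <algorithm>
    #include <cmath>

    // Frequent words get low bin ids; each successive bin covers a
    // BASE-times-smaller frequency range, capped so the bin count stays small.
    unsigned freq_bin(unsigned count, unsigned max_count,
                      double base = 6.0, double cutoff = 3.0) {
      double b = (log((double)max_count) - log((double)count)) / log(base);
      return (unsigned)std::min(b, cutoff);  // CUTOFF caps the number of bins
    }

The point of binning by frequency rather than tying everything: very frequent source words have enough customers to support their own PYP hyperparameters, while rare words must pool evidence, and a handful of log-scale bins interpolates between the two regimes.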
"conditional_pseg.h" #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict fd_; +}; + +template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel(up0)), + tmodel(new ConditionalPYPWordModel(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template struct ConditionalPYPWordModel; +template struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel* tmodel; // translation distributions + ConditionalPYPWordModel* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; -- cgit v1.2.3