From 03478c6e7307b66ebde1e76801edd06062d8039c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 29 Dec 2011 21:08:30 -0500
Subject: lexical alignment samplers

---
 gi/pf/Makefile.am      |  13 +-
 gi/pf/align-lexonly.cc | 356 +++++++++++++++++++++++++++++++++++++++++++++++++
 gi/pf/base_measures.cc |  26 ++++
 gi/pf/base_measures.h  |  50 ++++++-
 gi/pf/itg.cc           |  98 +++++++++++---
 gi/pf/unigrams.cc      |  80 +++++++++++
 gi/pf/unigrams.h       |  69 ++++++++++
 7 files changed, 668 insertions(+), 24 deletions(-)
 create mode 100644 gi/pf/align-lexonly.cc
 create mode 100644 gi/pf/unigrams.cc
 create mode 100644 gi/pf/unigrams.h

(limited to 'gi/pf')
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 42758939..7c8e89d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,14 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly
 
 noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
+
+align_lexonly_SOURCES = align-lexonly.cc
 
 itg_SOURCES = itg.cc
 
+condnaive_SOURCES = condnaive.cc
+
 dpnaive_SOURCES = dpnaive.cc
 
 pfdist_SOURCES = pfdist.cc
@@ -17,5 +21,6 @@ brat_SOURCES = brat.cc
 
 pfbrat_SOURCES = pfbrat.cc
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
-AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm
+
+AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
new file mode 100644
index 00000000..91a3cfcf
--- /dev/null
+++ b/gi/pf/align-lexonly.cc
@@ -0,0 +1,356 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "corpus.h"
+#include "ngram_base.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {
+  unsigned char src_index;  // 0 = aligned to NULL, otherwise 1-based position in the source sentence
+  bool is_transliteration;
+  vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+  vector<WordID> src;
+  vector<WordID> trg;
+  vector<LexicalAlignment> a;
+  Array2D<short> posterior;
+};
+
+struct HierarchicalUnigramBase {
+  explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {}
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    prob_t p = prob_t::One();
+    prob_t q;
+    for (unsigned i = 0; i < rule.e_.size(); ++i) {
+      q.logeq(r.logprob(rule.e_[i], log(u0)));
+      p *= q;
+    }
+    q.logeq(r.logprob(TD::Convert("</s>"), log(u0)));
+    p *= q;
+    return p;
+  }
+
+  void Increment(const TRule& rule) {
+    for (unsigned i = 0; i < rule.e_.size(); ++i)
+      r.increment(rule.e_[i]);
+    r.increment(TD::Convert("</s>"));
+  }
+
+  void Decrement(const TRule& rule) {
+    for (unsigned i = 0; i < rule.e_.size(); ++i)
+      r.decrement(rule.e_[i]);
+    r.decrement(TD::Convert("</s>"));
+  }
+
+  CCRP_NoTable<WordID> r;
+  prob_t u0;
+};
+
+struct HierarchicalWordBase {
+  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+      base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {}
+
+  void ResampleHyperparameters(MT19937* rng) {
+    r.resample_hyperparameters(rng);
+  }
+
+  inline double logp0(const vector<WordID>& s) const {
+    return s.size() * u0;
+  }
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_)));
+    return p;
+  }
+
+  void Increment(const TRule& rule) {
+    if (r.increment(rule.e_)) {
+      prob_t p; p.logeq(logp0(rule.e_));
+      base *= p;
+    }
+  }
+
+  void Decrement(const TRule& rule) {
+    if (r.decrement(rule.e_)) {
+      prob_t p; p.logeq(logp0(rule.e_));
+      base /= p;
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const {
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl;
+    for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+      cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
+  }
+
+  prob_t base;
+  CCRP_NoTable<vector<WordID> > r;
+  const double u0;
+};
+
+struct BasicLexicalAlignment {
+  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned letters_e,
+                                 vector<AlignedSentencePair>* corp) :
+      letters(lets),
+      corpus(*corp),
+      //up0("en.chars.1gram", letters_e),
+      //up0("en.words.1gram"),
+      up0(letters_e),
+      //up0("en.chars.2gram"),
+      tmodel(up0) {
+  }
+
+  void InstantiateRule(const WordID src,
+                       const WordID trg,
+                       TRule* rule) const {
+    static const WordID kX = TD::Convert("X") * -1;
+    rule->lhs_ = kX;
+    rule->e_ = letters[trg];
+    rule->f_ = letters[src];
+  }
+
+  void InitializeRandom() {
+    const WordID kNULL = TD::Convert("NULL");
+    cerr << "Initializing with random alignments ...\n";
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      asp.a.resize(asp.trg.size());
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        TRule r;
+        InstantiateRule(f_a_j, asp.trg[j], &r);
+        asp.a[j].is_transliteration = false;
+        asp.a[j].src_index = a_j;
+        if (tmodel.IncrementRule(r))
+          up0.Increment(r);
+      }
+    }
+    cerr << "  LLH = " << Likelihood() << endl;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = tmodel.Likelihood();
+    p *= up0.Likelihood();
+    return p;
+  }
+
+  void ResampleHyperparemeters() {
+    cerr << "  LLH_prev = " << Likelihood() << flush;
+    tmodel.ResampleHyperparameters(&*prng);
+    up0.ResampleHyperparameters(&*prng);
+    cerr << "\tLLH_post = " << Likelihood() << endl;
+  }
+
+  void ResampleCorpus();
+
+  const vector<vector<WordID> >& letters; // spelling dictionary
+  vector<AlignedSentencePair>& corpus;
+  //PhraseConditionalUninformativeBase up0;
+  //PhraseConditionalUninformativeUnigramBase up0;
+  //UnigramWordBase up0;
+  //HierarchicalUnigramBase up0;
+  HierarchicalWordBase up0;
+  //CompletelyUniformBase up0;
+  //FixedNgramBase up0;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+  //ConditionalTranslationModel<UnigramWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+  ConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<FixedNgramBase> tmodel;
+  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+// One sampling sweep over the corpus: re-draws the source position (0 = NULL) of each target word.
+void BasicLexicalAlignment::ResampleCorpus() {
+  static const WordID kNULL = TD::Convert("NULL");
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    AlignedSentencePair& asp = corpus[i];
+    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+    for (unsigned j = 0; j < asp.trg.size(); ++j) {
+      TRule r;
+      unsigned char& a_j = asp.a[j].src_index;
+      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.DecrementRule(r))   // take the current link out of the counts before re-scoring
+        up0.Decrement(r);
+      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+        InstantiateRule(prop_f, asp.trg[j], &r);
+        ss[prop_a_j] = tmodel.RuleProbability(r);
+      }
+      a_j = prng->SelectSample(ss);
+      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.IncrementRule(r))
+        up0.Increment(r);
+    }
+  }
+  cerr << "  LLH = " << Likelihood() << endl;  // tmodel and up0, like the other LLH reports
+}
+
+// Splits each word in v into UTF-8 characters and stores their WordIDs in (*l)[word].
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    vector<WordID>& letters = (*l)[*it];
+    if (letters.size()) continue;   // if e and f have the same word
+    // Copy, don't bind a reference: TD::Convert(string) below may add dictionary entries.
+    const string w = TD::Convert(*it);
+    size_t cur = 0;
+    while (cur < w.size()) {
+      size_t len = UTF8Len(w[cur]); if (len == 0) len = 1;  // never stall on an unclassifiable byte
+      letters.push_back(TD::Convert(w.substr(cur, len)));
+      if (letset) letset->insert(letters.back());
+      cur += len;
+    }
+  }
+}
+
+void Debug(const AlignedSentencePair& asp) {  // dumps the pair and its current alignment grid to stderr
+  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+  Array2D<bool> a(asp.src.size(), asp.trg.size());
+  for (unsigned j = 0; j < asp.trg.size(); ++j)
+    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;  // src_index 0 (NULL) is not drawn
+  cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {  // tallies the current alignment into asp->posterior
+  for (unsigned j = 0; j < asp->trg.size(); ++j)
+    asp->posterior(asp->a[j].src_index, j)++;  // row 0 collects the alignments to NULL
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {  // prints 0-based "src-trg" links on one stdout line
+  bool first = true;
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    int src_index = -1;
+    int mc = -1;  // highest posterior count seen so far for target word j
+    for (unsigned i = 0; i <= asp.src.size(); ++i) {
+      if (asp.posterior(i, j) > mc) {  // strict '>': ties keep the lowest index
+        mc = asp.posterior(i, j);
+        src_index = i;
+      }
+    }
+    // Row 0 is NULL: target words whose best row is 0 produce no link.
+    if (src_index) {
+      if (first) first = false; else cout << ' ';
+      cout << (src_index - 1) << '-' << j;
+    }
+  }
+  cout << endl;
+}
+
+// Reads the parallel corpus, runs the alignment sampler and prints one alignment line per pair.
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+
+  vector<vector<int> > corpuse, corpusf;
+  set<int> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  vector<AlignedSentencePair> corpus(corpuse.size());
+  for (unsigned i = 0; i < corpuse.size(); ++i) {
+    corpus[i].src.swap(corpusf[i]);
+    corpus[i].trg.swap(corpuse[i]);
+    // LexicalAlignment::src_index is an unsigned char (0 = NULL); positions above 255 would wrap.
+    if (corpus[i].src.size() > 255) { cerr << "Sentence " << i << ": more than 255 source words\n"; return 1; }
+    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+  }
+  corpusf.clear(); corpuse.clear();
+
+  vocabf.insert(TD::Convert("NULL"));
+  // +1: IDs may run up to TD::NumWords() inclusive (UnigramModel sizes probs_ the same way).
+  vector<vector<WordID> > letters(TD::NumWords() + 1);
+  set<WordID> letset;
+  ExtractLetters(vocabe, &letters, &letset);
+  ExtractLetters(vocabf, &letters, NULL);
+  letters[TD::Convert("NULL")].clear();
+
+  BasicLexicalAlignment x(letters, letset.size(), &corpus);
+  x.InitializeRandom();
+  const unsigned samples = conf["samples"].as<unsigned>();
+  for (unsigned i = 0; i < samples; ++i) {
+    for (unsigned j = 431; j < 433 && j < corpus.size(); ++j) Debug(corpus[j]);  // spot-check pairs, if present
+    cerr << i << "\t" << x.tmodel.r.size() << "\t";
+    if (i % 10 == 0) x.ResampleHyperparemeters();
+    x.ResampleCorpus();
+    if (i > (samples / 5) && (i % 10 == 9)) for (unsigned j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+  }
+  for (unsigned i = 0; i < corpus.size(); ++i)
+    WriteAlignments(corpus[i]);
+  x.tmodel.Summary();
+  x.up0.Summary();
+
+  return 0;
+}
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 8adb37d7..97b4e698 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -6,6 +6,32 @@
 
 using namespace std;
 
+prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
+                                                     const vector<WordID>& vtrg,
+                                                     int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t p;
+  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
+  //p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
+  for (int i = 0; i < elen; ++i)
+    p *= u(vtrg[i + start_trg]);                        // draw e_i             ~Uniform
+  return p;
+}
+
+prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
+                                              const vector<WordID>& vtrg,
+                                              int start_src, int start_trg) const {
+  // The source side is ignored here: the target length prior is a fixed Poisson(1).
+  (void) vsrc; (void) start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t p;
+  p.logeq(log_poisson(elen, 1));                 // elen                 ~Pois(1)
+  for (int i = 0; i < elen; ++i)
+    p *= kUNIFORM_TARGET;                        // draw e_i             ~Uniform
+  return p;
+}
+
 void Model1::LoadModel1(const string& fname) {
   cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
   ReadFile rf(fname);
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index 7ce7e2e6..fbd1c3ad 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <iostream>
 
+#include "unigrams.h"
 #include "trule.h"
 #include "prob.h"
 #include "tdict.h"
@@ -49,6 +50,51 @@ struct Model1 {
   std::vector<std::map<WordID, prob_t> > ttable;
 };
 
+struct CompletelyUniformBase {
+  explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
+  prob_t operator()(const TRule&) const {
+    return kUNIFORM;
+  }
+  const prob_t kUNIFORM;
+};
+
+struct UnigramWordBase {
+  explicit UnigramWordBase(const std::string& fname) : un(fname) {}
+  prob_t operator()(const TRule& r) const {
+    return un(r.e_);
+  }
+  const UnigramWordModel un;
+};
+
+struct PhraseConditionalUninformativeBase {
+  explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(vocab_e_size > 0);
+  }
+
+  // return p0 of rule.e_ | rule.f_
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+  const prob_t kUNIFORM_TARGET;
+};
+
+struct PhraseConditionalUninformativeUnigramBase {
+  explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}
+
+  // return p0 of rule.e_ | rule.f_
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+  const UnigramModel u;
+};
+
 struct PhraseConditionalBase {
   explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
       model1(m1),
@@ -83,7 +129,7 @@ struct PhraseJointBase {
     assert(vocab_e_size > 0);
   }
 
-  // return p0 of rule.e_ | rule.f_
+  // return p0 of rule.e_ , rule.f_
   prob_t operator()(const TRule& rule) const {
     return p0(rule.f_, rule.e_, 0, 0);
   }
@@ -113,7 +159,7 @@ struct PhraseJointBase_BiDir {
     assert(vocab_e_size > 0);
   }
 
-  // return p0 of rule.e_ | rule.f_
+  // return p0 of rule.e_ , rule.f_
   prob_t operator()(const TRule& rule) const {
     return p0(rule.f_, rule.e_, 0, 0);
   }
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
index ac3c16a3..a38fe672 100644
--- a/gi/pf/itg.cc
+++ b/gi/pf/itg.cc
@@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) {
   return os << ']';
 }
 
-double log_poisson(unsigned x, const double& lambda) {
-  assert(lambda > 0.0);
-  return log(lambda) * x - lgamma(x + 1) - lambda;
-}
+// Unigram probabilities with mass p0null reserved for WordID 0; uniform if fname is empty.
+struct UnigramModel {
+  explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) :
+      use_uniform_(fname.size() == 0),
+      p0null_(p0null),
+      uniform_((1.0 - p0null) / vocab_size),
+      probs_(TD::NumWords() + 1) {
+    if (fname.size() > 0) LoadUnigrams(fname);
+    probs_[0] = p0null_;
+  }
+
+  // Expected file layout (ARPA unigrams; the first line is empty):
+  //   "\data\"
+  //   "ngram 1=9295"
+  //   ""
+  //   "\1-grams:"
+  //   "-3.191193<TAB>word"
+  void LoadUnigrams(const string& fname) {
+    cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+    ReadFile rf(fname);
+    string line;
+    istream& in = *rf.stream();
+    assert(in);
+    getline(in, line);
+    assert(line.empty());
+    getline(in, line);
+    assert(line == "\\data\\");
+    getline(in, line);
+    size_t pos = line.find("ngram 1=");
+    assert(pos == 0);
+    assert(line.size() > 8);
+    const size_t num_unigrams = atoi(&line[8]);
+    getline(in, line);
+    assert(line.empty());
+    getline(in, line);
+    assert(line == "\\1-grams:");
+    const prob_t pnon_null(1.0 - p0null_.as_float());  // same for every entry, so computed once
+    for (size_t i = 0; i < num_unigrams; ++i) {
+      getline(in, line);
+      assert(line.size() > 0);
+      pos = line.find('\t');
+      assert(pos != string::npos && pos > 0);  // npos would pass a bare "pos > 0" check
+      assert(pos + 1 < line.size());
+      const WordID w = TD::Convert(line.substr(pos + 1));
+      line[pos] = 0;
+      float p = atof(&line[0]);
+      if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort();
+    }
+  }
+
+  const prob_t& operator()(const WordID& w) const {
+    if (!w) return p0null_;
+    if (use_uniform_) return uniform_;
+    return probs_[w];
+  }
+
+  const bool use_uniform_;
+  const prob_t p0null_;
+  const prob_t uniform_;
+  vector<prob_t> probs_;
+};
 
 struct Model1 {
   explicit Model1(const string& fname) :
@@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
         ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
         ("input,i",po::value<string>(),"Read parallel data from")
-        ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases")
-        ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")
         ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
         ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
         ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+        ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform")
+        ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename,
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
-  const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
   const unsigned particles = conf["particles"].as<unsigned>();
   const unsigned samples = conf["samples"].as<unsigned>();
-
+  TD::Convert("<s>");
+  TD::Convert("</s>");
+  TD::Convert("<unk>");
   if (!conf.count("model1")) {
     cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
     return 1;
@@ -188,23 +245,28 @@ int main(int argc, char** argv) {
   cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
   assert(corpusf.size() == corpuse.size());
+  UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size());
+  UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size());
+  const prob_t kHALF(0.5);
 
+  const string kEMPTY = "NULL";
   const int kLHS = -TD::Convert("X");
   Model1 m1(conf["model1"].as<string>());
   Model1 invm1(conf["inverse_model1"].as<string>());
   for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {
     cerr << '.' << flush;
     for (int ci = 0; ci < corpusf.size(); ++ci) {
-      const vector<WordID>& src = corpusf[ci];
       const vector<WordID>& trg = corpuse[ci];
-      for (int i = 0; i < src.size(); ++i) {
-        for (int j = 0; j < trg.size(); ++j) {
-          const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE);
-          for (int k = 0; k < eff_max_src; ++k) {
-            const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE));
-            for (int l = 0; l < eff_max_trg; ++l) {
-            }
-          }
+      const vector<WordID>& src = corpusf[ci];
+      for (int i = 0; i <= trg.size(); ++i) {
+        const WordID e_i = i > 0 ? trg[i-1] : 0;
+        for (int j = 0; j <= src.size(); ++j) {
+          const WordID f_j = j > 0 ? src[j-1] : 0;
+          if (e_i == 0 && f_j == 0) continue;
+          prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j);
+          cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl;
+          if (e_i && f_j)
+            cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl;
         }
       }
     }
diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc
new file mode 100644
index 00000000..40829775
--- /dev/null
+++ b/gi/pf/unigrams.cc
@@ -0,0 +1,80 @@
+#include "unigrams.h"
+
+#include <string>
+#include <cmath>
+
+#include "stringlib.h"
+#include "filelib.h"
+
+using namespace std;
+
+// Loads the ARPA-style unigram file fname into probs_, converting log10 scores to natural log.
+void UnigramModel::LoadUnigrams(const string& fname) {
+  cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  string line;
+  istream& in = *rf.stream();
+  assert(in);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\data\\");
+  getline(in, line);
+  size_t pos = line.find("ngram 1=");
+  assert(pos == 0 && line.size() > 8);
+  const size_t num_unigrams = atoi(&line[8]);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\1-grams:");
+  for (size_t i = 0; i < num_unigrams; ++i) {
+    getline(in, line);
+    assert(line.size() > 0);
+    pos = line.find('\t');
+    // npos is > 0 and npos + 1 wraps to 0, so a missing tab has to be rejected explicitly.
+    assert(pos != string::npos && pos > 0 && pos + 1 < line.size());
+    const WordID w = TD::Convert(line.substr(pos + 1));
+    line[pos] = 0;
+    float p = atof(&line[0]);
+    if (w < probs_.size()) probs_[w].logeq(p * log(10.0)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n";
+  }
+}
+
+// Loads an ARPA-style unigram file, keying each probability by the entry's UTF-8 character sequence.
+void UnigramWordModel::LoadUnigrams(const string& fname) {
+  cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  string line;
+  istream& in = *rf.stream();
+  assert(in);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\data\\");
+  getline(in, line);
+  size_t pos = line.find("ngram 1=");
+  assert(pos == 0 && line.size() > 8);
+  const size_t num_unigrams = atoi(&line[8]);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\1-grams:");
+  for (size_t i = 0; i < num_unigrams; ++i) {
+    getline(in, line);
+    assert(line.size() > 0);
+    pos = line.find('\t');
+    // npos is > 0 and npos + 1 wraps to 0, so a missing tab has to be rejected explicitly.
+    assert(pos != string::npos && pos > 0 && pos + 1 < line.size());
+    size_t cur = pos + 1;
+    vector<WordID> w;
+    while (cur < line.size()) {
+      size_t len = UTF8Len(line[cur]); if (len == 0) len = 1;  // never stall on an unclassifiable byte
+      w.push_back(TD::Convert(line.substr(cur, len)));
+      cur += len;
+    }
+    line[pos] = 0;
+    float p = atof(&line[0]);
+    probs_[w].logeq(p * log(10.0));
+  }
+}
+
diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h
new file mode 100644
index 00000000..1660d1ed
--- /dev/null
+++ b/gi/pf/unigrams.h
@@ -0,0 +1,69 @@
+#ifndef _UNIGRAMS_H_
+#define _UNIGRAMS_H_
+
+#include <vector>
+#include <string>
+#include <tr1/unordered_map>
+#include <boost/functional.hpp>
+
+#include "wordid.h"
+#include "prob.h"
+#include "tdict.h"
+
+struct UnigramModel {
+  explicit UnigramModel(const std::string& fname, unsigned vocab_size) :
+      use_uniform_(fname.size() == 0),
+      uniform_(1.0 / vocab_size),
+      probs_() {
+    if (fname.size() > 0) {
+      probs_.resize(TD::NumWords() + 1);
+      LoadUnigrams(fname);
+    }
+  }
+
+  const prob_t& operator()(const WordID& w) const {
+    assert(w);
+    if (use_uniform_) return uniform_;
+    return probs_[w];
+  }
+
+ private:
+  void LoadUnigrams(const std::string& fname);
+
+  const bool use_uniform_;
+  const prob_t uniform_;
+  std::vector<prob_t> probs_;
+};
+
+
+// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t'
+struct UnigramWordModel {
+  explicit UnigramWordModel(const std::string& fname) :
+      use_uniform_(false),
+      uniform_(1.0),
+      probs_() {
+    LoadUnigrams(fname);
+  }
+
+  explicit UnigramWordModel(const unsigned vocab_size) :
+      use_uniform_(true),
+      uniform_(1.0 / vocab_size),
+      probs_() {}
+
+  const prob_t& operator()(const std::vector<WordID>& s) const {
+    if (use_uniform_) return uniform_;
+    const VectorProbHash::const_iterator it = probs_.find(s);
+    assert(it != probs_.end());
+    return it->second;
+  }
+
+ private:
+  void LoadUnigrams(const std::string& fname);
+
+  const bool use_uniform_;
+  const prob_t uniform_;
+  typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash;
+  VectorProbHash probs_;
+};
+
+#endif
-- 
cgit v1.2.3


From 46d833dc92d99f8cbcf5c45e4624ffaca954570b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 29 Dec 2011 21:09:14 -0500
Subject: ngram base dist

---
 gi/pf/ngram_base.cc | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 gi/pf/ngram_base.h  | 25 +++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 gi/pf/ngram_base.cc
 create mode 100644 gi/pf/ngram_base.h

(limited to 'gi/pf')

diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc
new file mode 100644
index 00000000..1299f06f
--- /dev/null
+++ b/gi/pf/ngram_base.cc
@@ -0,0 +1,69 @@
+#include "ngram_base.h"
+
+#include "lm/model.hh"
+#include "tdict.h"
+
+using namespace std;
+
+namespace {
+struct GICSVMapper : public lm::EnumerateVocab {
+  GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); }
+  void Add(lm::WordIndex index, const StringPiece &str) {
+    const WordID cdec_id = TD::Convert(str.as_string());
+    if (cdec_id >= out_->size())
+      out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN);
+    (*out_)[cdec_id] = index;
+  }
+  vector<lm::WordIndex>* out_;
+  const lm::WordIndex kLM_UNKNOWN_TOKEN;
+};
+}
+
+struct FixedNgramBaseImpl {
+  FixedNgramBaseImpl(const string& param) {
+    GICSVMapper vm(&cdec2klm_map_);
+    lm::ngram::Config conf;
+    conf.enumerate_vocab = &vm;
+    cerr << "Reading character LM from " << param << endl;
+    model = new lm::ngram::ProbingModel(param.c_str(), conf);
+    order = model->Order();
+    kEOS = MapWord(TD::Convert("</s>"));
+    assert(kEOS > 0);
+  }
+
+  lm::WordIndex MapWord(const WordID w) const {
+    if (w < cdec2klm_map_.size()) return cdec2klm_map_[w];
+    return 0;
+  }
+
+  ~FixedNgramBaseImpl() { delete model; }
+
+  prob_t StringProbability(const vector<WordID>& s) const {
+    lm::ngram::State state = model->BeginSentenceState();
+    double prob = 0;
+    for (unsigned i = 0; i < s.size(); ++i) {
+      const lm::ngram::State scopy(state);
+      prob += model->Score(scopy, MapWord(s[i]), state);
+    }
+    const lm::ngram::State scopy(state);
+    prob += model->Score(scopy, kEOS, state);
+    prob_t p; p.logeq(prob * log(10));
+    return p;
+  }
+
+  lm::ngram::ProbingModel* model;
+  unsigned order;
+  vector<lm::WordIndex> cdec2klm_map_;
+  lm::WordIndex kEOS;
+};
+
+FixedNgramBase::~FixedNgramBase() { delete impl; }
+
+FixedNgramBase::FixedNgramBase(const string& lmfname) {
+  impl = new FixedNgramBaseImpl(lmfname);
+}
+
+prob_t FixedNgramBase::StringProbability(const vector<WordID>& s) const {
+  return impl->StringProbability(s);
+}
+
diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h
new file mode 100644
index 00000000..4ea999f3
--- /dev/null
+++ b/gi/pf/ngram_base.h
@@ -0,0 +1,25 @@
+#ifndef _NGRAM_BASE_H_
+#define _NGRAM_BASE_H_
+
+#include <string>
+#include <vector>
+#include "trule.h"
+#include "wordid.h"
+#include "prob.h"
+
+struct FixedNgramBaseImpl;
+// Base distribution that scores rule.e_ through an implementation object built from lmfname.
+struct FixedNgramBase {
+  FixedNgramBase(const std::string& lmfname);
+  ~FixedNgramBase();
+  prob_t StringProbability(const std::vector<WordID>& s) const;
+  prob_t operator()(const TRule& rule) const { return StringProbability(rule.e_); }
+
+ private:
+  // impl is deleted by the destructor; copying is declared but never defined to rule out a double delete.
+  FixedNgramBase(const FixedNgramBase&);
+  FixedNgramBase& operator=(const FixedNgramBase&);
+  FixedNgramBaseImpl* impl;
+};
+
+#endif
-- 
cgit v1.2.3


From deaec2e8837bcd1bace1527281eef442a1c1030b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 29 Dec 2011 21:10:24 -0500
Subject: forgotten

---
 gi/pf/condnaive.cc | 298 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 298 insertions(+)
 create mode 100644 gi/pf/condnaive.cc

(limited to 'gi/pf')

diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc
new file mode 100644
index 00000000..52ddbbfe
--- /dev/null
+++ b/gi/pf/condnaive.cc
@@ -0,0 +1,298 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "corpus.h"
+
+using namespace std;
+using namespace std::tr1;
+namespace po = boost::program_options;
+
+static unsigned kMAX_SRC_PHRASE;
+static unsigned kMAX_TRG_PHRASE;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // fills *conf from argv plus an optional --config file; exits on --help or missing --input
+  po::options_description opts("Configuration options");  // accepted both on the command line and in the config file
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")  // NOTE(review): nothing visible in this file reads this value; Sample() loops a fixed 1000 times
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
+        ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
+        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");  // accepted on the command line only
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // command line is stored before any config-file values
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): the stream is not checked for open failure
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);  // same exit status for an explicit --help and for a missing --input
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct ModelAndData {  // bundles the segmentation model with the corpus it explains and the per-sentence sampler state
+  explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+     model(m),
+     rng(&*prng),  // the global prng must already be set when this object is built
+     corpuse(ce),
+     corpusf(cf),
+     vocabe(ve),
+     vocabf(vf),
+     mh_samples(),
+     mh_rejects(),
+     kX(-TD::Convert("X")),
+     derivations(corpuse.size()) {}  // one (initially empty) derivation per sentence pair
+
+  void ResampleHyperparameters() {  // intentionally empty placeholder
+  }
+
+  void InstantiateRule(const pair<short,short>& from,  // fills *rule with sentf[from.first, to.first) and sente[from.second, to.second)
+                       const pair<short,short>& to,
+                       const vector<int>& sentf,
+                       const vector<int>& sente,
+                       TRule* rule) const {
+    rule->f_.clear();
+    rule->e_.clear();
+    rule->lhs_ = kX;
+    for (short i = from.first; i < to.first; ++i)
+      rule->f_.push_back(sentf[i]);
+    for (short i = from.second; i < to.second; ++i)
+      rule->e_.push_back(sente[i]);
+  }
+
+  void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+    if (d.size() < 2) return;  // fewer than two nodes means no phrase pair at all
+    TRule x;
+    for (unsigned i = 1; i < d.size(); ++i) {
+      InstantiateRule(d[i], d[i-1], sentf, sente, &x);  // d[i] is the start node and d[i-1] the end node of the i-th phrase pair
+      model.DecrementRule(x);
+      model.DecrementAlign(x.f_.size());
+    }
+  }
+
+  void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+    if (d.size() < 2) return;
+    TRule x;
+    for (unsigned i = 1; i < d.size(); ++i) {
+      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+      cerr << i << '/' << (d.size() - 1) << ": " << x << endl;
+    }
+  }
+
+  void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
+    if (d.size() < 2) return;
+    TRule x;
+    for (unsigned i = 1; i < d.size(); ++i) {
+      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+      model.IncrementRule(x);
+      model.IncrementAlign(x.f_.size());
+    }
+  }
+
+  prob_t Likelihood() const {
+    return model.Likelihood();
+  }
+
+  prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
+    prob_t p = prob_t::One();  // stays One() for derivations with fewer than two nodes
+    TRule x;
+    for (unsigned i = 1; i < d.size(); ++i) {
+      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
+      p *= model.RuleProbability(x);
+      p *= model.AlignProbability(x.f_.size());
+    }
+    return p;
+  }
+
+  void Sample();
+
+  ConditionalParallelSegementationModel<PhraseConditionalBase>& model;
+  MT19937* rng;
+  const vector<vector<int> >& corpuse, &corpusf;  // both are references now: a lone '&' binds only to the first declarator, so corpusf used to be a full copy
+  const set<int>& vocabe, &vocabf;  // likewise; the referenced containers must outlive this object
+  unsigned mh_samples, mh_rejects;
+  const int kX;
+  vector<vector<pair<short, short> > > derivations;  // per sentence: chart nodes as (source position, target position)
+};
+
+void ModelAndData::Sample() {  // Metropolis-Hastings resampling of one monotone phrase segmentation per sentence pair
+  unsigned MAXK = kMAX_SRC_PHRASE;  // longest source phrase considered
+  unsigned MAXL = kMAX_TRG_PHRASE;  // longest target phrase considered
+  TRule x;  // scratch rule reused for every probability lookup
+  x.lhs_ = -TD::Convert("X");
+
+  for (int samples = 0; samples < 1000; ++samples) {  // NOTE(review): fixed pass count; the --samples option is not consulted here
+    if (samples % 1 == 0 && samples > 0) {  // "% 1" is always 0, so this reports before every pass except the first
+      cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n";
+      for (unsigned i = 0; i < 10 && i < corpuse.size(); ++i) {  // at most the first ten pairs; smaller corpora have fewer
+        cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl;
+        PrintDerivation(derivations[i], corpusf[i], corpuse[i]);
+      }
+      static TRule xx("[X] ||| w n ||| s h ||| X=0");  // hard-coded probe context for debug output
+      if (model.tmodel.r.count(xx.f_)) {  // the probe's source side may have no CRP; find() would then return end()
+        const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second;
+        for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it)
+          cerr << "\t" << it->second << "\t" << it->first << endl;
+      }
+    }
+    cerr << '.' << flush;
+    for (int s = 0; s < corpuse.size(); ++s) {
+      const vector<int>& sentf = corpusf[s];
+      const vector<int>& sente = corpuse[s];
+//      cerr << "  CUSTOMERS: " << rules.num_customers() << endl;
+//      cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl;
+
+      vector<pair<short, short> >& deriv = derivations[s];
+      const prob_t p_cur = Likelihood();  // likelihood with the current derivation still counted
+      DecrementDerivation(deriv, sentf, sente);
+
+      boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);  // a[i][j]: inside score of covering the first i source / j target words
+      boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);  // cached phrase-pair scores, indexed by start node and lengths - 1
+      a[0][0] = prob_t::One();
+      for (int i = 0; i < sentf.size(); ++i) {
+        for (int j = 0; j < sente.size(); ++j) {
+          const prob_t src_a = a[i][j];
+          x.f_.clear();
+          for (int k = 1; k <= MAXK; ++k) {
+            if (i + k > sentf.size()) break;
+            x.f_.push_back(sentf[i + k - 1]);
+            x.e_.clear();
+            const prob_t p_span = model.AlignProbability(k);  // prob of consuming this much source
+            for (int l = 1; l <= MAXL; ++l) {
+              if (j + l > sente.size()) break;
+              x.e_.push_back(sente[j + l - 1]);
+              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span;
+              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
+            }
+          }
+        }
+      }
+      if (a[sentf.size()][sente.size()] == prob_t::Zero()) { IncrementDerivation(deriv, sentf, sente); continue; }  // no segmentation fits the phrase limits: restore counts and skip instead of sampling from an empty set
+      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
+
+      vector<pair<short,short> > newderiv;  // sampled back to front: first node is (|f|,|e|), last is (0,0)
+      int cur_i = sentf.size();
+      int cur_j = sente.size();
+      while(cur_i > 0 && cur_j > 0) {
+        newderiv.push_back(pair<short,short>(cur_i, cur_j));
+//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
+        SampleSet<prob_t> ss;
+        vector<pair<short,short> > nexts;  // nexts[n] is the predecessor node belonging to ss entry n
+        for (int k = 1; k <= MAXK; ++k) {
+          const int hyp_i = cur_i - k;
+          if (hyp_i < 0) break;
+          for (int l = 1; l <= MAXL; ++l) {
+            const int hyp_j = cur_j - l;
+            if (hyp_j < 0) break;
+            const prob_t& inside = a[hyp_i][hyp_j];
+            if (inside == prob_t::Zero()) continue;
+            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
+            if (transp == prob_t::Zero()) continue;
+            const prob_t p = inside * transp;
+            ss.add(p);
+            nexts.push_back(pair<short,short>(hyp_i, hyp_j));
+//            cerr << "    (" << hyp_i << "," << hyp_j << ")  <--- " << log(p) << endl;
+          }
+        }
+//        cerr << "  sample set has " << nexts.size() << " elements.\n";
+        const int selected = rng->SelectSample(ss);
+        cur_i = nexts[selected].first;
+        cur_j = nexts[selected].second;
+      }
+      newderiv.push_back(pair<short,short>(0,0));
+      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
+      IncrementDerivation(newderiv, sentf, sente);
+//      cerr << "SANITY: " << q_new << "  " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl;
+      if (deriv.empty()) { deriv = newderiv; continue; }  // first visit: accept unconditionally, not counted as an MH step
+      ++mh_samples;
+
+      if (deriv != newderiv) {
+        const prob_t p_new = Likelihood();
+//        cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl;
+//        cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl;
+        if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
+          ++mh_rejects;
+          DecrementDerivation(newderiv, sentf, sente);  // rejected: swap the counts back to the old derivation
+          IncrementDerivation(deriv, sentf, sente);
+        } else {
+//          cerr << "  ACCEPT\n";
+          deriv = newderiv;
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {  // loads the corpus and Model 1, runs the sampler, then prints a few hard-coded debug probabilities
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+
+  if (!conf.count("model1")) {
+    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n";
+    return 1;
+  }
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+//  MT19937& rng = *prng;
+
+  vector<vector<int> > corpuse, corpusf;
+  set<int> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";  // e-side statistics were previously printed with "f-" labels
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  Model1 m1(conf["model1"].as<string>());
+
+  PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
+  ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0);
+
+  ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf);  // the corpora and vocabularies above must outlive posterior
+  posterior.Sample();
+
+  TRule r1("[X] ||| x ||| l e ||| X=0");  // hard-coded rules used only for the two debug lines below
+  TRule r2("[X] ||| A ||| a d ||| X=0");
+  TRule r3("[X] ||| n ||| e r ||| X=0");
+  TRule r4("[X] ||| x A n ||| b l a g ||| X=0");
+
+  PhraseConditionalUninformativeBase u0(vocabe.size());
+
+  cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl;
+  cerr << (u0(r4)) << endl;
+
+  return 0;
+}
+
-- 
cgit v1.2.3


From 68870390f6c6429af2b9c182ad28e8a78a0f1752 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 29 Dec 2011 21:10:36 -0500
Subject: foo

---
 gi/pf/conditional_pseg.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 gi/pf/conditional_pseg.h

(limited to 'gi/pf')

diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
new file mode 100644
index 00000000..edcdc813
--- /dev/null
+++ b/gi/pf/conditional_pseg.h
@@ -0,0 +1,155 @@
+#ifndef _CONDITIONAL_PSEG_H_
+#define _CONDITIONAL_PSEG_H_
+
+#include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include <iostream>
+
+#include "prob.h"
+#include "ccrp_nt.h"
+#include "trule.h"
+#include "base_measures.h"
+#include "tdict.h"
+
+template <typename ConditionalBaseMeasure>
+struct ConditionalTranslationModel {
+  // rcp0 is stored by reference (rp0); the set of conditioning contexts in r starts out empty.
+  explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : rp0(rcp0) {}
+
+  void Summary() const {
+    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+    for (RuleModelHash::const_iterator ctx = r.begin(); ctx != r.end(); ++ctx) {
+      const CCRP_NoTable<TRule>& crp = ctx->second;
+      std::cerr << TD::GetString(ctx->first) << "   \t(\\alpha = " << crp.concentration() << ") --------------------------" << std::endl;
+      for (CCRP_NoTable<TRule>::const_iterator dish = crp.begin(); dish != crp.end(); ++dish)
+        std::cerr << "   " << dish->second << '\t' << dish->first << std::endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator ctx = r.begin(); ctx != r.end(); ++ctx)
+      ctx->second.resample_hyperparameters(rng);
+  }
+
+  // Removes one observation of rule; a context whose CRP reports zero customers afterwards is erased from r.
+  int DecrementRule(const TRule& rule) {
+    RuleModelHash::iterator ctx = r.find(rule.f_);
+    assert(ctx != r.end());
+    const int count = ctx->second.decrement(rule);
+    if (count && ctx->second.num_customers() == 0) r.erase(ctx);
+    return count;
+  }
+
+  // Adds one observation of rule, creating the CRP for rule.f_ the first time that source side is seen.
+  int IncrementRule(const TRule& rule) {
+    RuleModelHash::iterator ctx = r.find(rule.f_);
+    if (ctx == r.end())
+      ctx = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first;
+    return ctx->second.increment(rule);
+  }
+
+  void IncrementRules(const std::vector<TRulePtr>& rules) {
+    for (std::vector<TRulePtr>::const_iterator rp = rules.begin(); rp != rules.end(); ++rp)
+      IncrementRule(**rp);
+  }
+
+  void DecrementRules(const std::vector<TRulePtr>& rules) {
+    for (std::vector<TRulePtr>::const_iterator rp = rules.begin(); rp != rules.end(); ++rp)
+      DecrementRule(**rp);
+  }
+
+  // Probability of rule given rule.f_: the CRP's logprob when the context exists, otherwise rp0 alone.
+  prob_t RuleProbability(const TRule& rule) const {
+    double lp = log(rp0(rule));
+    RuleModelHash::const_iterator ctx = r.find(rule.f_);
+    if (ctx != r.end()) lp = ctx->second.logprob(rule, lp);
+    prob_t p;
+    p.logeq(lp);
+    return p;
+  }
+
+  // Product over contexts of the CRP's own probability and of rp0 for each entry the CRP iterates over.
+  prob_t Likelihood() const {
+    prob_t total = prob_t::One();
+    for (RuleModelHash::const_iterator ctx = r.begin(); ctx != r.end(); ++ctx) {
+      const CCRP_NoTable<TRule>& crp = ctx->second;
+      prob_t crp_p; crp_p.logeq(crp.log_crp_prob());
+      total *= crp_p;
+      for (CCRP_NoTable<TRule>::const_iterator dish = crp.begin(); dish != crp.end(); ++dish)
+        total *= rp0(dish->first);
+    }
+    return total;
+  }
+
+  const ConditionalBaseMeasure& rp0;  // base measure, not owned
+  typedef std::tr1::unordered_map<std::vector<WordID>,
+                                  CCRP_NoTable<TRule>,
+                                  boost::hash<std::vector<WordID> > > RuleModelHash;
+  RuleModelHash r;  // source side (rule.f_) -> CRP over rules with that source side
+};
+
+template <typename ConditionalBaseMeasure>
+struct ConditionalParallelSegementationModel {
+  // Combines a ConditionalTranslationModel over rules with a CRP (aligns) over source span lengths.
+  explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) :
+    tmodel(rcp0), base(prob_t::One()), aligns(1,1) {}
+
+  ConditionalTranslationModel<ConditionalBaseMeasure> tmodel;
+
+  // Rule counts live entirely in tmodel; these two only forward.
+  void DecrementRule(const TRule& rule) { tmodel.DecrementRule(rule); }
+
+  void IncrementRule(const TRule& rule) { tmodel.IncrementRule(rule); }
+
+  // Adds every rule to tmodel, then records each rule's source length (f_.size()) in aligns.
+  void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
+    tmodel.IncrementRules(rules);
+    for (std::vector<TRulePtr>::const_iterator rp = rules.begin(); rp != rules.end(); ++rp)
+      IncrementAlign((*rp)->f_.size());
+  }
+
+  // Mirror image of IncrementRulesAndAlignments.
+  void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
+    tmodel.DecrementRules(rules);
+    for (std::vector<TRulePtr>::const_iterator rp = rules.begin(); rp != rules.end(); ++rp)
+      DecrementAlign((*rp)->f_.size());
+  }
+
+  // Forwards to tmodel; span lengths are scored separately by AlignProbability.
+  prob_t RuleProbability(const TRule& rule) const { return tmodel.RuleProbability(rule); }
+
+  // The results of aligns.increment/decrement are tested, but nothing is done with them yet (see TODOs).
+  void IncrementAlign(unsigned span) {
+    if (aligns.increment(span)) { /* TODO */ }
+  }
+
+  void DecrementAlign(unsigned span) {
+    if (aligns.decrement(span)) { /* TODO */ }
+  }
+
+  // Probability of a source span of the given length under aligns, backed off to log_poisson(span, 1.0).
+  prob_t AlignProbability(unsigned span) const {
+    const double log_p0 = log_poisson(span, 1.0);
+    prob_t result;
+    result.logeq(aligns.logprob(span, log_p0));
+    return result;
+  }
+
+  // aligns' CRP probability times base times tmodel's likelihood, multiplied in that order.
+  prob_t Likelihood() const {
+    prob_t total;
+    total.logeq(aligns.log_crp_prob());
+    total *= base;
+    total *= tmodel.Likelihood();
+    return total;
+  }
+
+  // base is initialised to One() and never modified by any method of this struct.
+  prob_t base;
+  CCRP_NoTable<unsigned> aligns;
+};
+
+#endif
+
-- 
cgit v1.2.3


From 031dc91814c1b57269b8a789c93aad0da0a46b6b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 29 Dec 2011 23:02:50 -0500
Subject: remove broken prior, add logging

---
 gi/pf/align-lexonly.cc | 36 ++----------------------------------
 1 file changed, 2 insertions(+), 34 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
index 91a3cfcf..7e48b25a 100644
--- a/gi/pf/align-lexonly.cc
+++ b/gi/pf/align-lexonly.cc
@@ -66,41 +66,9 @@ struct AlignedSentencePair {
   Array2D<short> posterior;
 };
 
-struct HierarchicalUnigramBase {
-  explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {}
-
-  // return p0 of rule.e_
-  prob_t operator()(const TRule& rule) const {
-    prob_t p = prob_t::One();
-    prob_t q;
-    for (unsigned i = 0; i < rule.e_.size(); ++i) {
-      q.logeq(r.logprob(rule.e_[i], log(u0)));
-      p *= q;
-    }
-    q.logeq(r.logprob(TD::Convert("</s>"), log(u0)));
-    p *= q;
-    return p;
-  }
-
-  void Increment(const TRule& rule) {
-    for (unsigned i = 0; i < rule.e_.size(); ++i)
-      r.increment(rule.e_[i]);
-    r.increment(TD::Convert("</s>"));
-  }
-
-  void Decrement(const TRule& rule) {
-    for (unsigned i = 0; i < rule.e_.size(); ++i)
-      r.decrement(rule.e_[i]);
-    r.decrement(TD::Convert("</s>"));
-  }
-
-  CCRP_NoTable<WordID> r;
-  prob_t u0;
-};
-
 struct HierarchicalWordBase {
   explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {}
+      base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {}
 
   void ResampleHyperparameters(MT19937* rng) {
     r.resample_hyperparameters(rng);
@@ -137,7 +105,7 @@ struct HierarchicalWordBase {
   }
 
   void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl;
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (\\alpha=" << r.concentration() << ')' << endl;
     for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
   }
-- 
cgit v1.2.3


From 173570597f77da8f0bdb9b5a42baa64675e93b17 Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Fri, 30 Dec 2011 19:23:32 +0000
Subject: logging corpus errors

---
 gi/pf/corpus.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
   istream* in = rf.stream();
   assert(*in);
   string line;
+  unsigned lc = 0;
   const WordID kDIV = TD::Convert("|||");
   vector<WordID> tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
+  while(getline(*in, line)) {
+    ++lc;
     e->push_back(vector<int>());
     f->push_back(vector<int>());
     vector<int>& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
     for (unsigned i = 0; i < tmp.size(); ++i) {
       const int cur = tmp[i];
       if (isf) {
-        if (kDIV == cur) { isf = false; } else {
+        if (kDIV == cur) {
+          isf = false;
+        } else {
           lf.push_back(cur);
           vocab_f->insert(cur);
         }
       } else {
-        assert(cur != kDIV);
+        if (cur == kDIV) {
+          cerr << "ERROR in " << lc << ": " << line << endl << endl;
+          abort();
+        }
         le.push_back(cur);
         vocab_e->insert(cur);
       }
-- 
cgit v1.2.3


From 16994c379cf9944968c778671ecd39a5ee95d43c Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Sat, 31 Dec 2011 17:54:38 +0000
Subject: last change before adding wood&teh stuff

---
 gi/pf/align-lexonly.cc |  6 ++++--
 gi/pf/base_measures.h  | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
index 7e48b25a..e9f1e7b6 100644
--- a/gi/pf/align-lexonly.cc
+++ b/gi/pf/align-lexonly.cc
@@ -117,10 +117,12 @@ struct HierarchicalWordBase {
 
 struct BasicLexicalAlignment {
   explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned words_e,
                                  const unsigned letters_e,
                                  vector<AlignedSentencePair>* corp) :
       letters(lets),
       corpus(*corp),
+      //up0(words_e),
       //up0("en.chars.1gram", letters_e),
       //up0("en.words.1gram"),
       up0(letters_e),
@@ -302,11 +304,11 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  BasicLexicalAlignment x(letters, letset.size(), &corpus);
+  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
   x.InitializeRandom();
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
-    for (int j = 431; j < 433; ++j) Debug(corpus[j]);
+    for (int j = 4995; j < 4997; ++j) Debug(corpus[j]);
     cerr << i << "\t" << x.tmodel.r.size() << "\t";
     if (i % 10 == 0) x.ResampleHyperparemeters();
     x.ResampleCorpus();
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index fbd1c3ad..a4e9ac28 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -11,6 +11,7 @@
 #include "trule.h"
 #include "prob.h"
 #include "tdict.h"
+#include "sampler.h"
 
 inline double log_poisson(unsigned x, const double& lambda) {
   assert(lambda > 0.0);
@@ -55,6 +56,11 @@ struct CompletelyUniformBase {
   prob_t operator()(const TRule&) const {
     return kUNIFORM;
   }
+  void Summary() const {}
+  void ResampleHyperparameters(MT19937*) {}
+  void Increment(const TRule&) {}
+  void Decrement(const TRule&) {}
+  prob_t Likelihood() const { return prob_t::One(); }
   const prob_t kUNIFORM;
 };
 
@@ -79,6 +85,11 @@ struct PhraseConditionalUninformativeBase {
 
   prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
 
+  void Summary() const {}
+  void ResampleHyperparameters(MT19937*) {}
+  void Increment(const TRule&) {}
+  void Decrement(const TRule&) {}
+  prob_t Likelihood() const { return prob_t::One(); }
   const prob_t kUNIFORM_TARGET;
 };
 
-- 
cgit v1.2.3


From 72b0ebee7d3398dfb657f2949b9e5dac82342198 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 11 Jan 2012 01:22:54 -0500
Subject: script to pull out candidate transliterations from a word-aligned
 parallel corpus

---
 gi/pf/guess-translits.pl | 71 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100755 gi/pf/guess-translits.pl

(limited to 'gi/pf')

diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
new file mode 100755
index 00000000..ab737121
--- /dev/null
+++ b/gi/pf/guess-translits.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+my $MIN_PMI = -3;  # pairs whose pointwise mutual information is below this are dropped
+
+my %fs;  # foreign word -> number of extracted links it takes part in
+my %es;  # english word -> number of extracted links it takes part in
+my %ef;  # foreign word -> { english word -> number of links between the two }
+
+die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $tot = 0;  # total number of extracted links
+print STDERR "Reading alignments from STDIN ...\n";
+while(<STDIN>) {
+  chomp;
+  my ($fsent, $esent, $alsent) = split / \|\|\| /;
+  die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent;
+
+  my @fws = split /\s+/, $fsent;
+  my @ews = split /\s+/, $esent;
+  my @as = split /\s+/, $alsent;
+  my %a2b;  # foreign position -> set of linked english positions
+  my %b2a;  # english position -> set of linked foreign positions
+  for my $ap (@as) {
+    my ($a,$b) = split /-/, $ap;
+    $a2b{$a}->{$b} = 1;
+    $b2a{$b}->{$a} = 1;
+  }
+  for my $a (keys %a2b) {
+    my $bref = $a2b{$a};
+    next unless scalar keys %$bref < 2;  # foreign word must be linked to exactly one english word
+    my $b = (keys %$bref)[0];
+    next unless scalar keys %{$b2a{$b}} < 2;  # ... and that english word to no other foreign word
+    my $f = $fws[$a];
+    next unless defined $f;
+    next unless length($f) > 3;  # only words of four or more characters are kept
+    my $e = $ews[$b];
+    next unless defined $e;
+    next unless length($e) > 3;
+
+    $ef{$f}->{$e}++;
+    $es{$e}++;
+    $fs{$f}++;
+    $tot++;
+  }
+}
+my $ltot = $tot > 0 ? log($tot) : 0;  # log(0) is a fatal error in Perl; with no links the loop below does nothing anyway
+my $num = 0;
+print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n";
+for my $f (keys %fs) {
+  my $logf = log($fs{$f});
+  my $esref = $ef{$f};
+  for my $e (keys %$esref) {
+    my $loge = log($es{$e});
+    my $ef = $esref->{$e};
+    my $logef = log($ef);
+    my $pmi = ($logef + $ltot) - ($loge + $logf);  # log( p(e,f) / (p(e) p(f)) ) with p = count / $tot; $ltot was previously left out
+    next if $pmi < $MIN_PMI;
+    my @flets = split //, $f;  # print both words as space-separated characters
+    my @elets = split //, $e;
+    print "@flets ||| @elets\n";
+    $num++;
+  }
+}
+print STDERR "Extracted $num pairs.\n";
+print STDERR "Recommend running:\n   ../../training/model1 -t -99999 output.txt\n";
-- 
cgit v1.2.3


From 4ebb11b25cf87dc5938b5eb65e884d0e3f4ee146 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 23 Jan 2012 15:47:29 -0500
Subject: more alignment stuff

---
 gi/pf/Makefile.am           |   4 +-
 gi/pf/align-lexonly-pyp.cc  | 327 ++++++++++++++++++++++++++++++++++++++++++++
 gi/pf/base_measures.cc      |  47 +++++++
 gi/pf/base_measures.h       |  18 +++
 gi/pf/conditional_pseg.h    |  74 ++++++++++
 word-aligner/stemmers/ur.pl |  38 +++++
 6 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 gi/pf/align-lexonly-pyp.cc
 create mode 100755 word-aligner/stemmers/ur.pl

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 7c8e89d0..28367e67 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,12 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
 
 align_lexonly_SOURCES = align-lexonly.cc
 
+align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+
 itg_SOURCES = itg.cc
 
 condnaive_SOURCES = condnaive.cc
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
new file mode 100644
index 00000000..d2630a2b
--- /dev/null
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -0,0 +1,327 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "mfcr.h"
+#include "corpus.h"
+#include "ngram_base.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // fills *conf from argv plus an optional --config file; exits on --help or missing --input
+  po::options_description opts("Configuration options");  // accepted both on the command line and in the config file
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");  // accepted on the command line only
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // command line is stored before any config-file values
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): the stream is not checked for open failure
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);  // same exit status for an explicit --help and for a missing --input
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {  // alignment decision for one target word
+  unsigned char src_index;  // 0 = aligned to NULL, k > 0 = source word k-1 (see the a_j uses below); NOTE(review): 8 bits cannot hold values above 255
+  bool is_transliteration;
+  vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {  // one sentence pair together with its current word alignment
+  vector<WordID> src;
+  vector<WordID> trg;
+  vector<LexicalAlignment> a;  // resized to trg.size() in InitializeRandom: one entry per target word
+  Array2D<short> posterior;
+};
+
+struct HierarchicalWordBase {  // base measure over target-side strings (rule.e_), backed by the MFCR `r`
+  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+      base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {}
+
+  void ResampleHyperparameters(MT19937* rng) {
+    r.resample_hyperparameters(rng);
+  }
+
+  inline double logp0(const vector<WordID>& s) const {  // s.size() * u0 with u0 = -log(vocab_e_size): the same cost for every symbol of s
+    return s.size() * u0;
+  }
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    v[0] = exp(logp0(rule.e_));  // v is a mutable one-element scratch vector, overwritten on every call
+    return prob_t(r.prob(rule.e_, v, l));
+  }
+
+  void Increment(const TRule& rule) {
+    v[0] = exp(logp0(rule.e_));
+    if (r.increment(rule.e_, v, l, &*prng).count) {  // non-zero .count: fold this string's p0 (times l[0]) into base
+      base *= prob_t(v[0] * l[0]);
+    }
+  }
+
+  void Decrement(const TRule& rule) {
+    if (r.decrement(rule.e_, &*prng).count) {  // non-zero .count: divide the string's p0 back out of base
+      base /= prob_t(exp(logp0(rule.e_)));
+    }
+  }
+
+  prob_t Likelihood() const {  // r.log_crp_prob() as a probability, multiplied by the accumulated base
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const {  // writes the customer count, d, alpha and one line per entry of r to cerr
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl;
+    for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
+  }
+
+  prob_t base;  // running product maintained by Increment/Decrement
+  MFCR<vector<WordID> > r;
+  const double u0;  // -log(vocab_e_size)
+  const vector<double> l;  // single entry fixed at 1.0 by the constructor
+  mutable vector<double> v;  // single-entry scratch buffer passed to r together with l
+};
+
+struct BasicLexicalAlignment {  // word-alignment sampler state: corpus, spelling dictionary, base measure and translation model
+  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned words_e,
+                                 const unsigned letters_e,
+                                 vector<AlignedSentencePair>* corp) :
+      letters(lets),
+      corpus(*corp),
+      //up0(words_e),
+      //up0("en.chars.1gram", letters_e),
+      //up0("en.words.1gram"),
+      up0(letters_e),  // words_e is only referenced by the commented-out alternative above
+      //up0("en.chars.2gram"),
+      tmodel(up0) {
+  }
+
+  void InstantiateRule(const WordID src,  // builds the rule whose f_/e_ are the spellings letters[src] / letters[trg]
+                       const WordID trg,
+                       TRule* rule) const {
+    static const WordID kX = TD::Convert("X") * -1;
+    rule->lhs_ = kX;
+    rule->e_ = letters[trg];
+    rule->f_ = letters[src];
+  }
+
+  void InitializeRandom() {  // assigns every target word a random source position (or NULL) and adds the resulting rules to the model
+    const WordID kNULL = TD::Convert("NULL");
+    cerr << "Initializing with random alignments ...\n";
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      asp.a.resize(asp.trg.size());
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        const unsigned char a_j = prng->next() * (1 + asp.src.size());  // presumably next() is in [0,1) - TODO confirm; NOTE(review): the result is truncated to 8 bits
+        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);  // 0 selects NULL, k selects src[k-1]
+        TRule r;
+        InstantiateRule(f_a_j, asp.trg[j], &r);
+        asp.a[j].is_transliteration = false;
+        asp.a[j].src_index = a_j;
+        if (tmodel.IncrementRule(r, &*prng))  // the base measure is updated only when IncrementRule returns non-zero
+          up0.Increment(r);
+      }
+    }
+    cerr << "  LLH = " << Likelihood() << endl;
+  }
+
+  prob_t Likelihood() const {  // tmodel likelihood times up0 likelihood
+    prob_t p = tmodel.Likelihood();
+    p *= up0.Likelihood();
+    return p;
+  }
+
+  void ResampleHyperparemeters() {  // resamples tmodel and up0 hyperparameters, logging the likelihood before and after
+    cerr << "  LLH_prev = " << Likelihood() << flush;
+    tmodel.ResampleHyperparameters(&*prng);
+    up0.ResampleHyperparameters(&*prng);
+    cerr << "\tLLH_post = " << Likelihood() << endl;
+  }
+
+  void ResampleCorpus();
+
+  const vector<vector<WordID> >& letters; // spelling dictionary
+  vector<AlignedSentencePair>& corpus;  // not owned; alignments are updated in place
+  //PhraseConditionalUninformativeBase up0;
+  //PhraseConditionalUninformativeUnigramBase up0;
+  //UnigramWordBase up0;
+  //HierarchicalUnigramBase up0;
+  HierarchicalWordBase up0;  // declared before tmodel, which is constructed from it
+  //CompletelyUniformBase up0;
+  //FixedNgramBase up0;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+  //ConditionalTranslationModel<UnigramWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+  MConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<FixedNgramBase> tmodel;
+  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+  static const WordID kNULL = TD::Convert("NULL");
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    AlignedSentencePair& asp = corpus[i];
+    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+    for (unsigned j = 0; j < asp.trg.size(); ++j) {
+      TRule r;
+      unsigned char& a_j = asp.a[j].src_index;
+      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.DecrementRule(r, &*prng))
+        up0.Decrement(r);
+
+      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+        InstantiateRule(prop_f, asp.trg[j], &r);
+        ss[prop_a_j] = tmodel.RuleProbability(r);
+      }
+      a_j = prng->SelectSample(ss);
+      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.IncrementRule(r, &*prng))
+        up0.Increment(r);
+    }
+  }
+  cerr << "  LLH = " << tmodel.Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    vector<WordID>& letters = (*l)[*it];
+    if (letters.size()) continue;   // if e and f have the same word
+
+    const string& w = TD::Convert(*it);
+    
+    size_t cur = 0;
+    while (cur < w.size()) {  // split the word's bytes into character-sized chunks, one token per chunk
+      const size_t len = UTF8Len(w[cur]) ? UTF8Len(w[cur]) : 1;  // never advance by 0, or this loop cannot terminate (assumes UTF8Len may return 0 on a malformed byte -- TODO confirm)
+      letters.push_back(TD::Convert(w.substr(cur, len)));
+      if (letset) letset->insert(letters.back());  // optionally collect the set of distinct letters
+      cur += len;
+    }
+  }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+  Array2D<bool> a(asp.src.size(), asp.trg.size());
+  for (unsigned j = 0; j < asp.trg.size(); ++j)
+    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+  for (unsigned j = 0; j < asp->trg.size(); ++j)
+    asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+  bool first = true;
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    int src_index = -1;
+    int mc = -1;
+    for (unsigned i = 0; i <= asp.src.size(); ++i) {
+      if (asp.posterior(i, j) > mc) {
+        mc = asp.posterior(i, j);
+        src_index = i;
+      }
+    }
+
+    if (src_index) {
+      if (first) first = false; else cout << ' ';
+      cout << (src_index - 1) << '-' << j;
+    }
+  }
+  cout << endl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+//  MT19937& rng = *prng;
+
+  vector<vector<int> > corpuse, corpusf;
+  set<int> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  vector<AlignedSentencePair> corpus(corpuse.size());
+  for (unsigned i = 0; i < corpuse.size(); ++i) {
+    corpus[i].src.swap(corpusf[i]);
+    corpus[i].trg.swap(corpuse[i]);
+    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());  // row 0 is the NULL alignment
+  }
+  corpusf.clear(); corpuse.clear();
+
+  vocabf.insert(TD::Convert("NULL"));
+  vector<vector<WordID> > letters(TD::NumWords());
+  set<WordID> letset;
+  ExtractLetters(vocabe, &letters, &letset);
+  ExtractLetters(vocabf, &letters, NULL);
+  letters[TD::Convert("NULL")].clear();  // NULL is spelled as the empty letter sequence
+
+  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
+  x.InitializeRandom();
+  const unsigned samples = conf["samples"].as<unsigned>();
+  for (int i = 0; i < samples; ++i) {
+    for (unsigned j = 65; j < 67 && j < corpus.size(); ++j) Debug(corpus[j]);  // debug peek at two fixed pairs; skipped on small corpora
+    cerr << i << "\t" << x.tmodel.r.size() << "\t";
+    if (i % 10 == 0) x.ResampleHyperparemeters();
+    x.ResampleCorpus();
+    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);  // past the first fifth, record every 10th sample
+  }
+  for (unsigned i = 0; i < corpus.size(); ++i)
+    WriteAlignments(corpus[i]);
+  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
+  x.tmodel.Summary();
+  x.up0.Summary();
+
+  //posterior.Sample();
+
+  return 0;
+}
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 97b4e698..7894d3e7 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -6,6 +6,53 @@
 
 using namespace std;
 
+TableLookupBase::TableLookupBase(const string& fname) {
+  cerr << "TableLookupBase reading from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  vector<int> le, lf;
+  TRule x;
+  x.lhs_ = -TD::Convert("X");
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
+    else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    x.f_.clear();
+    x.e_.clear();
+    size_t pos = 0;
+    int cc = 0;
+    while(pos < tmp.size()) {
+      const WordID cur = tmp[pos++];
+      if (cur == kDIV) {
+        ++cc;
+      } else if (cc == 0) {
+        x.f_.push_back(cur);    
+      } else if (cc == 1) {
+        x.e_.push_back(cur);
+      } else if (cc == 2) {
+        table[x] = atof(TD::Convert(cur));
+        ++cc;
+      } else {
+        if (flag) cerr << endl;
+        cerr << "Bad format in " << lc << ": " << line << endl; abort();
+      }
+    }
+    if (cc != 3) {
+      if (flag) cerr << endl;
+      cerr << "Bad format in " << lc << ": " << line << endl; abort();
+    }
+  }
+  if (flag) cerr << endl;
+  cerr << " read " << lc << " entries\n";
+}
+
 prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
                                                      const vector<WordID>& vtrg,
                                                      int start_src, int start_trg) const {
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index a4e9ac28..7214aa22 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -72,6 +72,24 @@ struct UnigramWordBase {
   const UnigramWordModel un;
 };
 
+struct RuleHasher {
+  size_t operator()(const TRule& r) const {
+    return hash_value(r);
+  }
+};
+
+struct TableLookupBase {
+  TableLookupBase(const std::string& fname);
+
+  prob_t operator()(const TRule& rule) const {
+    const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule);
+    assert(it != table.end());
+    return it->second;
+  }
+
+  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
+};
+
 struct PhraseConditionalUninformativeBase {
   explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
       kUNIFORM_TARGET(1.0 / vocab_e_size) {
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index edcdc813..db951d15 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -8,10 +8,84 @@
 
 #include "prob.h"
 #include "ccrp_nt.h"
+#include "mfcr.h"
 #include "trule.h"
 #include "base_measures.h"
 #include "tdict.h"
 
+template <typename ConditionalBaseMeasure>
+struct MConditionalTranslationModel {
+  explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
+    rp0(rcp0), lambdas(1, 1.0), p0s(1) {}
+
+  void Summary() const {
+    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  } 
+
+  int DecrementRule(const TRule& rule, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    assert(it != r.end());
+    const TableCount delta = it->second.decrement(rule, rng);
+    if (delta.count) {
+      if (it->second.num_customers() == 0) r.erase(it);
+    }
+    return delta.count;
+  }
+
+  int IncrementRule(const TRule& rule, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      it = r.insert(make_pair(rule.f_, MFCR<TRule>(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
+    }
+    p0s[0] = rp0(rule).as_float(); 
+    TableCount delta = it->second.increment(rule, p0s, lambdas, rng);
+    return delta.count;
+  }
+
+  prob_t RuleProbability(const TRule& rule) const {
+    prob_t p;
+    RuleModelHash::const_iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      p.logeq(log(rp0(rule)));
+    } else {
+      p0s[0] = rp0(rule).as_float();
+      p = prob_t(it->second.prob(rule, p0s, lambdas));
+    }
+    return p;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+#if 0
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        p *= rp0(i2->first);
+    }
+#endif
+    return p;
+  }
+
+  const ConditionalBaseMeasure& rp0;
+  typedef std::tr1::unordered_map<std::vector<WordID>,
+                                  MFCR<TRule>,
+                                  boost::hash<std::vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+  std::vector<double> lambdas;
+  mutable std::vector<double> p0s;
+};
+
 template <typename ConditionalBaseMeasure>
 struct ConditionalTranslationModel {
   explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl
new file mode 100755
index 00000000..3a4f5a45
--- /dev/null
+++ b/word-aligner/stemmers/ur.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT,":utf8");
+
+my $vocab = undef;
+if (scalar @ARGV > 0) {
+  die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1);
+  $vocab = 1;
+}
+
+my %dict;
+while(<STDIN>) {
+  chomp;
+  my @words = split /\s+/;
+  my @out = ();
+  for my $w (@words) {
+    my $tw = $dict{$w};
+    if (!defined $tw) {
+      my $el = 4;
+      if ($w =~ /^(al|Al)/) { $el++; }
+      if ($el > length($w)) { $el = length($w); }
+      $tw = substr $w, 0, $el;
+      $dict{$w} = $tw;
+    }
+    push @out, $tw;
+  }
+  if ($vocab) {
+    die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1;
+    print "$_ @out\n";
+  } else {
+    print "@out\n";
+  }
+}
+
-- 
cgit v1.2.3


From f960ab86f4b44bf515af4bf43aa27147a0e7875a Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 24 Jan 2012 22:26:44 -0500
Subject: more models

---
 gi/pf/align-lexonly.cc | 14 +++++++----
 gi/pf/base_measures.cc |  2 +-
 gi/pf/base_measures.h  | 27 ++++++++++++++++++++-
 training/model1.cc     | 64 +++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 98 insertions(+), 9 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
index e9f1e7b6..76e2e009 100644
--- a/gi/pf/align-lexonly.cc
+++ b/gi/pf/align-lexonly.cc
@@ -122,10 +122,11 @@ struct BasicLexicalAlignment {
                                  vector<AlignedSentencePair>* corp) :
       letters(lets),
       corpus(*corp),
+      up0("fr-en.10k.translit-base.txt.gz"),
       //up0(words_e),
       //up0("en.chars.1gram", letters_e),
       //up0("en.words.1gram"),
-      up0(letters_e),
+      //up0(letters_e),
       //up0("en.chars.2gram"),
       tmodel(up0) {
   }
@@ -180,14 +181,18 @@ struct BasicLexicalAlignment {
   //PhraseConditionalUninformativeUnigramBase up0;
   //UnigramWordBase up0;
   //HierarchicalUnigramBase up0;
-  HierarchicalWordBase up0;
+  TableLookupBase up0;
+  //HierarchicalWordBase up0;
+  //PoissonUniformUninformativeBase up0;
   //CompletelyUniformBase up0;
   //FixedNgramBase up0;
   //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
   //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
   //ConditionalTranslationModel<UnigramWordBase> tmodel;
   //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
-  ConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<PoissonUniformUninformativeBase> tmodel;
+  ConditionalTranslationModel<TableLookupBase> tmodel;
   //ConditionalTranslationModel<FixedNgramBase> tmodel;
   //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
 };
@@ -222,6 +227,7 @@ void BasicLexicalAlignment::ResampleCorpus() {
 
 void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
   for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    if (*it >= l->size()) { l->resize(*it + 1); }
     vector<WordID>& letters = (*l)[*it];
     if (letters.size()) continue;   // if e and f have the same word
 
@@ -308,7 +314,7 @@ int main(int argc, char** argv) {
   x.InitializeRandom();
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
-    for (int j = 4995; j < 4997; ++j) Debug(corpus[j]);
+    for (int j = 395; j < 397; ++j) Debug(corpus[j]);
     cerr << i << "\t" << x.tmodel.r.size() << "\t";
     if (i % 10 == 0) x.ResampleHyperparemeters();
     x.ResampleCorpus();
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 7894d3e7..4b1863fa 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -37,7 +37,7 @@ TableLookupBase::TableLookupBase(const string& fname) {
       } else if (cc == 1) {
         x.e_.push_back(cur);
       } else if (cc == 2) {
-        table[x] = atof(TD::Convert(cur));
+        table[x].logeq(atof(TD::Convert(cur)));
         ++cc;
       } else {
         if (flag) cerr << endl;
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index 7214aa22..b0495bfd 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -51,6 +51,22 @@ struct Model1 {
   std::vector<std::map<WordID, prob_t> > ttable;
 };
 
+struct PoissonUniformUninformativeBase {
+  explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
+  prob_t operator()(const TRule& r) const {
+    prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0));
+    prob_t q = kUNIFORM; q.poweq(r.e_.size());
+    p *= q;
+    return p;
+  }
+  void Summary() const {}
+  void ResampleHyperparameters(MT19937*) {}
+  void Increment(const TRule&) {}
+  void Decrement(const TRule&) {}
+  prob_t Likelihood() const { return prob_t::One(); }
+  const prob_t kUNIFORM;
+};
+
 struct CompletelyUniformBase {
   explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
   prob_t operator()(const TRule&) const {
@@ -83,10 +99,19 @@ struct TableLookupBase {
 
   prob_t operator()(const TRule& rule) const {
     const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule);
-    assert(it != table.end());
+    if (it == table.end()) {
+      std::cerr << rule << " not found\n";
+      abort();
+    }
     return it->second;
   }
 
+  void ResampleHyperparameters(MT19937*) {}
+  void Increment(const TRule&) {}
+  void Decrement(const TRule&) {}
+  prob_t Likelihood() const { return prob_t::One(); }
+  void Summary() const {}
+
   std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
 };
 
diff --git a/training/model1.cc b/training/model1.cc
index 346c0033..40249aa3 100644
--- a/training/model1.cc
+++ b/training/model1.cc
@@ -14,6 +14,11 @@
 namespace po = boost::program_options;
 using namespace std;
 
+inline double log_poisson(unsigned x, const double& lambda) {
+  assert(lambda > 0.0);
+  return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
@@ -25,6 +30,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)")
         ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?")
         ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights")
+        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model")
         ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior")
         ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)");
   po::options_description clo("Command line options");
@@ -63,6 +69,8 @@ int main(int argc, char** argv) {
   const bool write_alignments = (conf.count("write_alignments") > 0);
   const double diagonal_tension = conf["diagonal_tension"].as<double>();
   const double prob_align_null = conf["prob_align_null"].as<double>();
+  string testset;
+  if (conf.count("testset")) testset = conf["testset"].as<string>();
   const double prob_align_not_null = 1.0 - prob_align_null;
   const double alpha = conf["alpha"].as<double>();
   const bool favor_diagonal = conf.count("favor_diagonal");
@@ -73,6 +81,8 @@ int main(int argc, char** argv) {
 
   TTable tt;
   TTable::Word2Word2Double was_viterbi;
+  double tot_len_ratio = 0;
+  double mean_srclen_multiplier = 0;
   for (int iter = 0; iter < ITERATIONS; ++iter) {
     const bool final_iteration = (iter == (ITERATIONS - 1));
     cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl;
@@ -83,13 +93,13 @@ int main(int argc, char** argv) {
     int lc = 0;
     bool flag = false;
     string line;
+    string ssrc, strg;
     while(true) {
       getline(in, line);
       if (!in) break;
       ++lc;
       if (lc % 1000 == 0) { cerr << '.'; flag = true; }
       if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
-      string ssrc, strg;
       ParseTranslatorInput(line, &ssrc, &strg);
       Lattice src, trg;
       LatticeTools::ConvertTextToLattice(ssrc, &src);
@@ -99,9 +109,10 @@ int main(int argc, char** argv) {
         assert(src.size() > 0);
         assert(trg.size() > 0);
       }
+      if (iter == 0)
+        tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size());
       denom += trg.size();
       vector<double> probs(src.size() + 1);
-      const double src_logprob = -log(src.size() + 1);
       bool first_al = true;  // used for write_alignments
       for (int j = 0; j < trg.size(); ++j) {
         const WordID& f_j = trg[j][0].label;
@@ -156,7 +167,7 @@ int main(int argc, char** argv) {
           for (int i = 1; i <= src.size(); ++i)
             tt.Increment(src[i-1][0].label, f_j, probs[i] / sum);
         }
-        likelihood += log(sum) + src_logprob;
+        likelihood += log(sum);
       }
       if (write_alignments && final_iteration) cout << endl;
     }
@@ -165,6 +176,10 @@ int main(int argc, char** argv) {
     double base2_likelihood = likelihood / log(2);
 
     if (flag) { cerr << endl; }
+    if (iter == 0) {
+      mean_srclen_multiplier = tot_len_ratio / lc;
+      cerr << "expected target length = source length * " << mean_srclen_multiplier << endl;
+    }
     cerr << "  log_e likelihood: " << likelihood << endl;
     cerr << "  log_2 likelihood: " << base2_likelihood << endl;
     cerr << "   cross entropy: " << (-base2_likelihood / denom) << endl;
@@ -176,6 +191,49 @@ int main(int argc, char** argv) {
         tt.Normalize();
     }
   }
+  if (testset.size()) {
+    ReadFile rf(testset);
+    istream& in = *rf.stream();
+    int lc = 0;
+    double tlp = 0;
+    string ssrc, strg, line;
+    while (getline(in, line)) {
+      ++lc;
+      ParseTranslatorInput(line, &ssrc, &strg);
+      Lattice src, trg;
+      LatticeTools::ConvertTextToLattice(ssrc, &src);
+      LatticeTools::ConvertTextToLattice(strg, &trg);
+      double log_prob = log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier);
+
+      // compute likelihood
+      for (int j = 0; j < trg.size(); ++j) {
+        const WordID& f_j = trg[j][0].label;
+        double sum = 0;
+        const double j_over_ts = double(j) / trg.size();
+        double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
+        if (use_null) {
+          if (favor_diagonal) prob_a_i = prob_align_null;
+          sum += tt.prob(kNULL, f_j) * prob_a_i;
+        }
+        double az = 0;
+        if (favor_diagonal) {
+          for (int ta = 0; ta < src.size(); ++ta)
+            az += exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
+          az /= prob_align_not_null;
+        }
+        for (int i = 1; i <= src.size(); ++i) {
+          if (favor_diagonal)
+            prob_a_i = exp(-fabs(double(i) / src.size() - j_over_ts) * diagonal_tension) / az;
+          sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i;
+        }
+        log_prob += log(sum);
+      }
+      tlp += log_prob;
+      cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl;
+    }
+    cerr << "TOTAL LOG PROB " << tlp << endl;
+  }
+
   if (write_alignments) return 0;
 
   for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) {
-- 
cgit v1.2.3


From 3c1c98b5aec7aec34432ddc37385df06d301bdd5 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 27 Jan 2012 02:31:00 -0500
Subject: migrate mert to the new scorer interface

---
 gi/pf/base_distributions.cc | 241 ++++++++++++++++++++++++++++++++++++++++
 gi/pf/base_distributions.h  | 261 ++++++++++++++++++++++++++++++++++++++++++++
 gi/pf/base_measures.cc      | 241 ----------------------------------------
 gi/pf/base_measures.h       | 247 -----------------------------------------
 mteval/ns.cc                |   4 +
 mteval/ns.h                 |  10 +-
 vest/ces.cc                 |  42 +++----
 vest/ces.h                  |  10 +-
 vest/dist-vest.pl           |   4 +-
 vest/error_surface.cc       |  11 +-
 vest/error_surface.h        |   6 +-
 vest/line_optimizer.cc      |  20 ++--
 vest/line_optimizer.h       |   2 +
 vest/lo_test.cc             |  21 ++--
 vest/mr_vest_map.cc         |  16 +--
 vest/mr_vest_reduce.cc      |  34 +++---
 16 files changed, 602 insertions(+), 568 deletions(-)
 create mode 100644 gi/pf/base_distributions.cc
 create mode 100644 gi/pf/base_distributions.h
 delete mode 100644 gi/pf/base_measures.cc
 delete mode 100644 gi/pf/base_measures.h

(limited to 'gi/pf')

diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc
new file mode 100644
index 00000000..4b1863fa
--- /dev/null
+++ b/gi/pf/base_distributions.cc
@@ -0,0 +1,241 @@
+#include "base_measures.h"
+
+#include <iostream>
+
+#include "filelib.h"
+
+using namespace std;
+
+TableLookupBase::TableLookupBase(const string& fname) {
+  cerr << "TableLookupBase reading from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  vector<int> le, lf;
+  TRule x;
+  x.lhs_ = -TD::Convert("X");
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
+    else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    x.f_.clear();
+    x.e_.clear();
+    size_t pos = 0;
+    int cc = 0;
+    while(pos < tmp.size()) {
+      const WordID cur = tmp[pos++];
+      if (cur == kDIV) {
+        ++cc;
+      } else if (cc == 0) {
+        x.f_.push_back(cur);    
+      } else if (cc == 1) {
+        x.e_.push_back(cur);
+      } else if (cc == 2) {
+        table[x].logeq(atof(TD::Convert(cur)));
+        ++cc;
+      } else {
+        if (flag) cerr << endl;
+        cerr << "Bad format in " << lc << ": " << line << endl; abort();
+      }
+    }
+    if (cc != 3) {
+      if (flag) cerr << endl;
+      cerr << "Bad format in " << lc << ": " << line << endl; abort();
+    }
+  }
+  if (flag) cerr << endl;
+  cerr << " read " << lc << " entries\n";
+}
+
+prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
+                                                     const vector<WordID>& vtrg,
+                                                     int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t p;
+  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
+  //p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
+  for (int i = 0; i < elen; ++i)
+    p *= u(vtrg[i + start_trg]);                        // draw e_i             ~Uniform
+  return p;
+}
+
+prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
+                                              const vector<WordID>& vtrg,
+                                              int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t p;
+  //p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
+  p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
+  for (int i = 0; i < elen; ++i)
+    p *= kUNIFORM_TARGET;                        // draw e_i             ~Uniform
+  return p;
+}
+
+void Model1::LoadModel1(const string& fname) {
+  cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  while(getline(in, line)) {
+    ++lc;
+    int cur = 0;
+    int start = 0;
+    while(cur < line.size() && line[cur] != ' ') { ++cur; }
+    assert(cur != line.size());
+    line[cur] = 0;
+    const WordID src = TD::Convert(&line[0]);
+    ++cur;
+    start = cur;
+    while(cur < line.size() && line[cur] != ' ') { ++cur; }
+    assert(cur != line.size());
+    line[cur] = 0;
+    WordID trg = TD::Convert(&line[start]);
+    const double logprob = strtod(&line[cur + 1], NULL);
+    if (src >= ttable.size()) ttable.resize(src + 1);
+    ttable[src][trg].logeq(logprob);
+  }
+  cerr << "  read " << lc << " parameters.\n";
+}
+
+prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
+                                 const vector<WordID>& vtrg,
+                                 int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+  prob_t p;
+  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
+  for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS
+    const WordID trg = vtrg[i + start_trg];
+    prob_t tp = prob_t::Zero();
+    for (int j = -1; j < flen; ++j) {
+      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+      tp += kM1MIXTURE * model1(src, trg);
+      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+    }
+    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
+    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
+  }
+  if (p.is_0()) {
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+  return p;
+}
+
+prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
+                           const vector<WordID>& vtrg,
+                           int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+  prob_t p;
+  p.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
+                                                 // elen | flen          ~Pois(flen + 0.01)
+  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+  p *= ptrglen;
+  p *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
+  for (int i = 0; i < elen; ++i) {               // for each position i in E
+    const WordID trg = vtrg[i + start_trg];
+    prob_t tp = prob_t::Zero();
+    for (int j = -1; j < flen; ++j) {
+      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+      tp += kM1MIXTURE * model1(src, trg);
+      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+    }
+    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
+    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
+  }
+  if (p.is_0()) {
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+  return p;
+}
+
+prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
+                                 const vector<WordID>& vtrg,
+                                 int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+  prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));
+
+  prob_t p1;
+  p1.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
+                                                 // elen | flen          ~Pois(flen + 0.01)
+  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+  p1 *= ptrglen;
+  p1 *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
+  for (int i = 0; i < elen; ++i) {               // for each position i in E
+    const WordID trg = vtrg[i + start_trg];
+    prob_t tp = prob_t::Zero();
+    for (int j = -1; j < flen; ++j) {
+      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+      tp += kM1MIXTURE * model1(src, trg);
+      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+    }
+    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
+    p1 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
+  }
+  if (p1.is_0()) {
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+
+  prob_t p2;
+  p2.logeq(log_poisson(elen, 1.0));               // elen                 ~Pois(1)
+                                                 // flen | elen          ~Pois(flen + 0.01)
+  prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01));
+  p2 *= psrclen;
+  p2 *= kUNIFORM_TARGET.pow(elen);                // each f in F ~Uniform
+  for (int i = 0; i < flen; ++i) {               // for each position i in E
+    const WordID src = vsrc[i + start_src];
+    prob_t tp = prob_t::Zero();
+    for (int j = -1; j < elen; ++j) {
+      const WordID trg = j < 0 ? 0 : vtrg[j + start_trg];
+      tp += kM1MIXTURE * invmodel1(trg, src);
+      tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE;
+    }
+    tp *= uniform_trg_alignment;                 //     draw a_i         ~uniform
+    p2 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
+  }
+  if (p2.is_0()) {
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+
+  static const prob_t kHALF(0.5);
+  return (p1 + p2) * kHALF;
+}
+
+JumpBase::JumpBase() : p(200) {
+  for (unsigned src_len = 1; src_len < 200; ++src_len) {
+    map<int, prob_t>& cpd = p[src_len];
+    int min_jump = 1 - src_len;
+    int max_jump = src_len;
+    prob_t z;
+    for (int j = min_jump; j <= max_jump; ++j) {
+      prob_t& cp = cpd[j];
+      if (j < 0)
+        cp.logeq(log_poisson(1.5-j, 1));
+      else if (j > 0)
+        cp.logeq(log_poisson(j, 1));
+      cp.poweq(0.2);
+      z += cp;
+    }
+    for (int j = min_jump; j <= max_jump; ++j) {
+      cpd[j] /= z;
+    }
+  }
+}
+
diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h
new file mode 100644
index 00000000..a23ac32b
--- /dev/null
+++ b/gi/pf/base_distributions.h
@@ -0,0 +1,261 @@
+#ifndef _BASE_MEASURES_H_
+#define _BASE_MEASURES_H_
+
+#include <vector>
+#include <map>
+#include <string>
+#include <cmath>
+#include <iostream>
+#include <cassert>
+
+#include "unigrams.h"
+#include "trule.h"
+#include "prob.h"
+#include "tdict.h"
+#include "sampler.h"
+
+inline double log_poisson(unsigned x, const double& lambda) {  // log of the Poisson pmf at count x with rate lambda
+  assert(lambda > 0.0);  // log(lambda) below needs a strictly positive rate
+  return log(lambda) * x - lgamma(x + 1) - lambda;  // x*log(lambda) - log(x!) - lambda
+}
+
+inline double log_binom_coeff(unsigned n, unsigned k) {  // log of the binomial coefficient C(n, k); requires n >= k
+  assert(n >= k);  // n - k below is unsigned and would wrap otherwise
+  if (n == k) return 0.0;  // C(n, n) == 1
+  return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1);  // log n! - log k! - log (n-k)!
+}
+
+// log pmf of the negative binomial at x; see http://en.wikipedia.org/wiki/Negative_binomial_distribution
+inline double log_negative_binom(unsigned x, unsigned r, double p) {
+  assert(p > 0.0);  // log(p) below needs p > 0
+  assert(p < 1.0);  // log(1 - p) below needs p < 1
+  return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p);  // log[ C(x+r-1, x) (1-p)^r p^x ]
+}
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {  // writes p in square brackets and returns os
+  os << '[';
+  for (int i = 0; i < p.size(); ++i)
+    os << (i==0 ? "" : " ") << TD::Convert(p[i]);  // one space between entries; each id goes through TD::Convert
+  return os << ']';
+}
+
+struct Model1 {  // lookup table of prob_t values keyed by (src, trg), filled by LoadModel1
+  explicit Model1(const std::string& fname) :
+      kNULL(TD::Convert("<eps>")),  // id substituted for src == 0 in operator()
+      kZERO() {  // default-constructed value handed back on a miss
+    LoadModel1(fname);
+  }
+
+  void LoadModel1(const std::string& fname);  // defined out of line
+
+  // returns prob 0 (a reference to kZERO) if src or trg is not found; src == 0 is looked up as kNULL
+  const prob_t& operator()(WordID src, WordID trg) const {
+    if (src == 0) src = kNULL;
+    if (src < ttable.size()) {  // ids past the end of the table count as not found
+      const std::map<WordID, prob_t>& cpd = ttable[src];
+      const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg);
+      if (it != cpd.end())
+        return it->second;  // reference into ttable, valid while the table is unchanged
+    }
+    return kZERO;
+  }
+
+  const WordID kNULL;
+  const prob_t kZERO;
+  std::vector<std::map<WordID, prob_t> > ttable;  // indexed by src id, then keyed by trg id
+};
+
+struct PoissonUniformUninformativeBase {  // scores a rule from the length of r.e_ only
+  explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}  // ves is not checked for 0; NOTE(review): presumably the target vocabulary size - confirm
+  prob_t operator()(const TRule& r) const {
+    prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0));  // length term: log_poisson of |e_| with rate 1
+    prob_t q = kUNIFORM; q.poweq(r.e_.size());  // NOTE(review): presumably kUNIFORM raised to |e_| - confirm poweq in prob.h
+    p *= q;
+    return p;
+  }
+  void Summary() const {}  // no-op
+  void ResampleHyperparameters(MT19937*) {}  // no-op
+  void Increment(const TRule&) {}  // no-op
+  void Decrement(const TRule&) {}  // no-op
+  prob_t Likelihood() const { return prob_t::One(); }  // always prob_t::One()
+  const prob_t kUNIFORM;  // 1.0 / ves
+};
+
+struct CompletelyUniformBase {  // returns the same value for every rule
+  explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}  // ves is not checked for 0
+  prob_t operator()(const TRule&) const {  // the rule argument is ignored
+    return kUNIFORM;
+  }
+  void Summary() const {}  // no-op
+  void ResampleHyperparameters(MT19937*) {}  // no-op
+  void Increment(const TRule&) {}  // no-op
+  void Decrement(const TRule&) {}  // no-op
+  prob_t Likelihood() const { return prob_t::One(); }  // always prob_t::One()
+  const prob_t kUNIFORM;  // 1.0 / ves
+};
+
+struct UnigramWordBase {  // scores a rule by passing r.e_ to a UnigramWordModel
+  explicit UnigramWordBase(const std::string& fname) : un(fname) {}  // fname is forwarded to the UnigramWordModel constructor
+  prob_t operator()(const TRule& r) const {
+    return un(r.e_);  // only e_ is consulted here
+  }
+  const UnigramWordModel un;
+};
+
+struct RuleHasher {  // hash functor for TRule keys; used as the hasher of TableLookupBase::table
+  size_t operator()(const TRule& r) const {
+    return hash_value(r);  // delegates to the hash_value overload for TRule
+  }
+};
+
+struct TableLookupBase {  // base distribution that reads each rule's probability from a table
+  TableLookupBase(const std::string& fname);  // defined out of line
+
+  prob_t operator()(const TRule& rule) const {  // returns the stored value for rule; aborts if rule is absent
+    const std::tr1::unordered_map<TRule,prob_t,RuleHasher>::const_iterator it = table.find(rule);  // must name the same map type as table, hasher included
+    if (it == table.end()) {
+      std::cerr << rule << " not found\n";
+      abort();  // a missing rule is treated as fatal
+    }
+    return it->second;
+  }
+
+  void ResampleHyperparameters(MT19937*) {}  // no-op
+  void Increment(const TRule&) {}  // no-op
+  void Decrement(const TRule&) {}  // no-op
+  prob_t Likelihood() const { return prob_t::One(); }  // always prob_t::One()
+  void Summary() const {}  // no-op
+
+  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;  // rule -> probability, hashed with RuleHasher
+};
+
+struct PhraseConditionalUninformativeBase {  // conditional base distribution with no lexical model, only kUNIFORM_TARGET
+  explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(vocab_e_size > 0);  // checked after the division in the initializer above
+  }
+
+  // return p0 of rule.e_ | rule.f_ (scores the whole rule: both start offsets are 0)
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;  // defined out of line
+
+  void Summary() const {}  // no-op
+  void ResampleHyperparameters(MT19937*) {}  // no-op
+  void Increment(const TRule&) {}  // no-op
+  void Decrement(const TRule&) {}  // no-op
+  prob_t Likelihood() const { return prob_t::One(); }  // always prob_t::One()
+  const prob_t kUNIFORM_TARGET;  // 1.0 / vocab_e_size
+};
+
+struct PhraseConditionalUninformativeUnigramBase {  // conditional base distribution backed by a UnigramModel
+  explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}  // both arguments are forwarded to UnigramModel
+
+  // return p0 of rule.e_ | rule.f_ (scores the whole rule: both start offsets are 0)
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;  // defined out of line
+
+  const UnigramModel u;
+};
+
+struct PhraseConditionalBase {  // conditional base distribution mixing a Model1 table with a uniform term
+  explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
+      model1(m1),  // stored by reference: m1 must outlive this object
+      kM1MIXTURE(m1mixture),
+      kUNIFORM_MIXTURE(1.0 - m1mixture),  // the two mixture weights sum to 1
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(m1mixture >= 0.0 && m1mixture <= 1.0);  // weight must lie in [0, 1]
+    assert(vocab_e_size > 0);  // checked after the division in the initializer above
+  }
+
+  // return p0 of rule.e_ | rule.f_ (scores the whole rule: both start offsets are 0)
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;  // defined out of line
+
+  const Model1& model1;
+  const prob_t kM1MIXTURE;  // Model 1 mixture component
+  const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+  const prob_t kUNIFORM_TARGET;  // 1.0 / vocab_e_size
+};
+
+struct PhraseJointBase {  // joint base distribution over (f_, e_) mixing a Model1 table with uniform terms
+  explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) :
+      model1(m1),  // stored by reference: m1 must outlive this object
+      kM1MIXTURE(m1mixture),
+      kUNIFORM_MIXTURE(1.0 - m1mixture),  // the two mixture weights sum to 1
+      kUNIFORM_SOURCE(1.0 / vocab_f_size),
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(m1mixture >= 0.0 && m1mixture <= 1.0);  // weight must lie in [0, 1]
+    assert(vocab_e_size > 0 && vocab_f_size > 0);  // both sizes are divisors in the initializers above
+  }
+
+  // return p0 of rule.e_ , rule.f_ (scores the whole rule: both start offsets are 0)
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;  // defined out of line
+
+  const Model1& model1;
+  const prob_t kM1MIXTURE;  // Model 1 mixture component
+  const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+  const prob_t kUNIFORM_SOURCE;  // 1.0 / vocab_f_size
+  const prob_t kUNIFORM_TARGET;  // 1.0 / vocab_e_size
+};
+
+struct PhraseJointBase_BiDir {  // joint base distribution that uses a Model1 table in each direction
+  explicit PhraseJointBase_BiDir(const Model1& m1,
+                                 const Model1& im1,
+                                 const double m1mixture,
+                                 const unsigned vocab_e_size,
+                                 const unsigned vocab_f_size) :
+      model1(m1),  // stored by reference: m1 must outlive this object
+      invmodel1(im1),  // stored by reference: im1 must outlive this object
+      kM1MIXTURE(m1mixture),
+      kUNIFORM_MIXTURE(1.0 - m1mixture),  // the two mixture weights sum to 1
+      kUNIFORM_SOURCE(1.0 / vocab_f_size),
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(m1mixture >= 0.0 && m1mixture <= 1.0);  // weight must lie in [0, 1]
+    assert(vocab_e_size > 0 && vocab_f_size > 0);  // both sizes are divisors in the initializers above
+  }
+
+  // return p0 of rule.e_ , rule.f_ (scores the whole rule: both start offsets are 0)
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;  // defined out of line
+
+  const Model1& model1;
+  const Model1& invmodel1;
+  const prob_t kM1MIXTURE;  // Model 1 mixture component
+  const prob_t kUNIFORM_MIXTURE; // uniform mixture component
+  const prob_t kUNIFORM_SOURCE;  // 1.0 / vocab_f_size
+  const prob_t kUNIFORM_TARGET;  // 1.0 / vocab_e_size
+};
+
+// Base distribution for jump-size multinomials, one table per source length.
+// Basically p(0) = 0, p(1) is max, and then the mass
+// drops as you move toward the max jump distance.
+struct JumpBase {
+  JumpBase();  // fills p; defined out of line
+
+  const prob_t& operator()(int jump, unsigned src_len) const {  // returns the stored probability of jump for src_len
+    assert(jump != 0 && src_len < p.size());  // p[src_len] below is unchecked, so out-of-range lengths are rejected here
+    const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump);
+    assert(it != p[src_len].end());  // jump must be one of the sizes tabulated for src_len
+    return it->second;
+  }
+  std::vector<std::map<int, prob_t> > p;  // indexed by source length, then keyed by jump size
+};
+
+
+#endif
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
deleted file mode 100644
index 4b1863fa..00000000
--- a/gi/pf/base_measures.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-#include "base_measures.h"
-
-#include <iostream>
-
-#include "filelib.h"
-
-using namespace std;
-
-TableLookupBase::TableLookupBase(const string& fname) {
-  cerr << "TableLookupBase reading from " << fname << " ..." << endl;
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  unsigned lc = 0;
-  const WordID kDIV = TD::Convert("|||");
-  vector<WordID> tmp;
-  vector<int> le, lf;
-  TRule x;
-  x.lhs_ = -TD::Convert("X");
-  bool flag = false;
-  while(getline(in, line)) {
-    ++lc;
-    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
-    else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
-    tmp.clear();
-    TD::ConvertSentence(line, &tmp);
-    x.f_.clear();
-    x.e_.clear();
-    size_t pos = 0;
-    int cc = 0;
-    while(pos < tmp.size()) {
-      const WordID cur = tmp[pos++];
-      if (cur == kDIV) {
-        ++cc;
-      } else if (cc == 0) {
-        x.f_.push_back(cur);    
-      } else if (cc == 1) {
-        x.e_.push_back(cur);
-      } else if (cc == 2) {
-        table[x].logeq(atof(TD::Convert(cur)));
-        ++cc;
-      } else {
-        if (flag) cerr << endl;
-        cerr << "Bad format in " << lc << ": " << line << endl; abort();
-      }
-    }
-    if (cc != 3) {
-      if (flag) cerr << endl;
-      cerr << "Bad format in " << lc << ": " << line << endl; abort();
-    }
-  }
-  if (flag) cerr << endl;
-  cerr << " read " << lc << " entries\n";
-}
-
-prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
-                                                     const vector<WordID>& vtrg,
-                                                     int start_src, int start_trg) const {
-  const int flen = vsrc.size() - start_src;
-  const int elen = vtrg.size() - start_trg;
-  prob_t p;
-  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
-  //p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
-  for (int i = 0; i < elen; ++i)
-    p *= u(vtrg[i + start_trg]);                        // draw e_i             ~Uniform
-  return p;
-}
-
-prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
-                                              const vector<WordID>& vtrg,
-                                              int start_src, int start_trg) const {
-  const int flen = vsrc.size() - start_src;
-  const int elen = vtrg.size() - start_trg;
-  prob_t p;
-  //p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
-  p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
-  for (int i = 0; i < elen; ++i)
-    p *= kUNIFORM_TARGET;                        // draw e_i             ~Uniform
-  return p;
-}
-
-void Model1::LoadModel1(const string& fname) {
-  cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  unsigned lc = 0;
-  while(getline(in, line)) {
-    ++lc;
-    int cur = 0;
-    int start = 0;
-    while(cur < line.size() && line[cur] != ' ') { ++cur; }
-    assert(cur != line.size());
-    line[cur] = 0;
-    const WordID src = TD::Convert(&line[0]);
-    ++cur;
-    start = cur;
-    while(cur < line.size() && line[cur] != ' ') { ++cur; }
-    assert(cur != line.size());
-    line[cur] = 0;
-    WordID trg = TD::Convert(&line[start]);
-    const double logprob = strtod(&line[cur + 1], NULL);
-    if (src >= ttable.size()) ttable.resize(src + 1);
-    ttable[src][trg].logeq(logprob);
-  }
-  cerr << "  read " << lc << " parameters.\n";
-}
-
-prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
-                                 const vector<WordID>& vtrg,
-                                 int start_src, int start_trg) const {
-  const int flen = vsrc.size() - start_src;
-  const int elen = vtrg.size() - start_trg;
-  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
-  prob_t p;
-  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
-  for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS
-    const WordID trg = vtrg[i + start_trg];
-    prob_t tp = prob_t::Zero();
-    for (int j = -1; j < flen; ++j) {
-      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
-      tp += kM1MIXTURE * model1(src, trg);
-      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
-    }
-    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
-    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
-  }
-  if (p.is_0()) {
-    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
-    abort();
-  }
-  return p;
-}
-
-prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
-                           const vector<WordID>& vtrg,
-                           int start_src, int start_trg) const {
-  const int flen = vsrc.size() - start_src;
-  const int elen = vtrg.size() - start_trg;
-  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
-  prob_t p;
-  p.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
-                                                 // elen | flen          ~Pois(flen + 0.01)
-  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
-  p *= ptrglen;
-  p *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
-  for (int i = 0; i < elen; ++i) {               // for each position i in E
-    const WordID trg = vtrg[i + start_trg];
-    prob_t tp = prob_t::Zero();
-    for (int j = -1; j < flen; ++j) {
-      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
-      tp += kM1MIXTURE * model1(src, trg);
-      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
-    }
-    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
-    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
-  }
-  if (p.is_0()) {
-    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
-    abort();
-  }
-  return p;
-}
-
-prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
-                                 const vector<WordID>& vtrg,
-                                 int start_src, int start_trg) const {
-  const int flen = vsrc.size() - start_src;
-  const int elen = vtrg.size() - start_trg;
-  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
-  prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));
-
-  prob_t p1;
-  p1.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
-                                                 // elen | flen          ~Pois(flen + 0.01)
-  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
-  p1 *= ptrglen;
-  p1 *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
-  for (int i = 0; i < elen; ++i) {               // for each position i in E
-    const WordID trg = vtrg[i + start_trg];
-    prob_t tp = prob_t::Zero();
-    for (int j = -1; j < flen; ++j) {
-      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
-      tp += kM1MIXTURE * model1(src, trg);
-      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
-    }
-    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
-    p1 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
-  }
-  if (p1.is_0()) {
-    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
-    abort();
-  }
-
-  prob_t p2;
-  p2.logeq(log_poisson(elen, 1.0));               // elen                 ~Pois(1)
-                                                 // flen | elen          ~Pois(flen + 0.01)
-  prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01));
-  p2 *= psrclen;
-  p2 *= kUNIFORM_TARGET.pow(elen);                // each f in F ~Uniform
-  for (int i = 0; i < flen; ++i) {               // for each position i in E
-    const WordID src = vsrc[i + start_src];
-    prob_t tp = prob_t::Zero();
-    for (int j = -1; j < elen; ++j) {
-      const WordID trg = j < 0 ? 0 : vtrg[j + start_trg];
-      tp += kM1MIXTURE * invmodel1(trg, src);
-      tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE;
-    }
-    tp *= uniform_trg_alignment;                 //     draw a_i         ~uniform
-    p2 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
-  }
-  if (p2.is_0()) {
-    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
-    abort();
-  }
-
-  static const prob_t kHALF(0.5);
-  return (p1 + p2) * kHALF;
-}
-
-JumpBase::JumpBase() : p(200) {
-  for (unsigned src_len = 1; src_len < 200; ++src_len) {
-    map<int, prob_t>& cpd = p[src_len];
-    int min_jump = 1 - src_len;
-    int max_jump = src_len;
-    prob_t z;
-    for (int j = min_jump; j <= max_jump; ++j) {
-      prob_t& cp = cpd[j];
-      if (j < 0)
-        cp.logeq(log_poisson(1.5-j, 1));
-      else if (j > 0)
-        cp.logeq(log_poisson(j, 1));
-      cp.poweq(0.2);
-      z += cp;
-    }
-    for (int j = min_jump; j <= max_jump; ++j) {
-      cpd[j] /= z;
-    }
-  }
-}
-
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
deleted file mode 100644
index b0495bfd..00000000
--- a/gi/pf/base_measures.h
+++ /dev/null
@@ -1,247 +0,0 @@
-#ifndef _BASE_MEASURES_H_
-#define _BASE_MEASURES_H_
-
-#include <vector>
-#include <map>
-#include <string>
-#include <cmath>
-#include <iostream>
-
-#include "unigrams.h"
-#include "trule.h"
-#include "prob.h"
-#include "tdict.h"
-#include "sampler.h"
-
-inline double log_poisson(unsigned x, const double& lambda) {
-  assert(lambda > 0.0);
-  return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
-  os << '[';
-  for (int i = 0; i < p.size(); ++i)
-    os << (i==0 ? "" : " ") << TD::Convert(p[i]);
-  return os << ']';
-}
-
-struct Model1 {
-  explicit Model1(const std::string& fname) :
-      kNULL(TD::Convert("<eps>")),
-      kZERO() {
-    LoadModel1(fname);
-  }
-
-  void LoadModel1(const std::string& fname);
-
-  // returns prob 0 if src or trg is not found
-  const prob_t& operator()(WordID src, WordID trg) const {
-    if (src == 0) src = kNULL;
-    if (src < ttable.size()) {
-      const std::map<WordID, prob_t>& cpd = ttable[src];
-      const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg);
-      if (it != cpd.end())
-        return it->second;
-    }
-    return kZERO;
-  }
-
-  const WordID kNULL;
-  const prob_t kZERO;
-  std::vector<std::map<WordID, prob_t> > ttable;
-};
-
-struct PoissonUniformUninformativeBase {
-  explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
-  prob_t operator()(const TRule& r) const {
-    prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0));
-    prob_t q = kUNIFORM; q.poweq(r.e_.size());
-    p *= q;
-    return p;
-  }
-  void Summary() const {}
-  void ResampleHyperparameters(MT19937*) {}
-  void Increment(const TRule&) {}
-  void Decrement(const TRule&) {}
-  prob_t Likelihood() const { return prob_t::One(); }
-  const prob_t kUNIFORM;
-};
-
-struct CompletelyUniformBase {
-  explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
-  prob_t operator()(const TRule&) const {
-    return kUNIFORM;
-  }
-  void Summary() const {}
-  void ResampleHyperparameters(MT19937*) {}
-  void Increment(const TRule&) {}
-  void Decrement(const TRule&) {}
-  prob_t Likelihood() const { return prob_t::One(); }
-  const prob_t kUNIFORM;
-};
-
-struct UnigramWordBase {
-  explicit UnigramWordBase(const std::string& fname) : un(fname) {}
-  prob_t operator()(const TRule& r) const {
-    return un(r.e_);
-  }
-  const UnigramWordModel un;
-};
-
-struct RuleHasher {
-  size_t operator()(const TRule& r) const {
-    return hash_value(r);
-  }
-};
-
-struct TableLookupBase {
-  TableLookupBase(const std::string& fname);
-
-  prob_t operator()(const TRule& rule) const {
-    const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule);
-    if (it == table.end()) {
-      std::cerr << rule << " not found\n";
-      abort();
-    }
-    return it->second;
-  }
-
-  void ResampleHyperparameters(MT19937*) {}
-  void Increment(const TRule&) {}
-  void Decrement(const TRule&) {}
-  prob_t Likelihood() const { return prob_t::One(); }
-  void Summary() const {}
-
-  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
-};
-
-struct PhraseConditionalUninformativeBase {
-  explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
-      kUNIFORM_TARGET(1.0 / vocab_e_size) {
-    assert(vocab_e_size > 0);
-  }
-
-  // return p0 of rule.e_ | rule.f_
-  prob_t operator()(const TRule& rule) const {
-    return p0(rule.f_, rule.e_, 0, 0);
-  }
-
-  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
-  void Summary() const {}
-  void ResampleHyperparameters(MT19937*) {}
-  void Increment(const TRule&) {}
-  void Decrement(const TRule&) {}
-  prob_t Likelihood() const { return prob_t::One(); }
-  const prob_t kUNIFORM_TARGET;
-};
-
-struct PhraseConditionalUninformativeUnigramBase {
-  explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}
-
-  // return p0 of rule.e_ | rule.f_
-  prob_t operator()(const TRule& rule) const {
-    return p0(rule.f_, rule.e_, 0, 0);
-  }
-
-  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
-  const UnigramModel u;
-};
-
-struct PhraseConditionalBase {
-  explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
-      model1(m1),
-      kM1MIXTURE(m1mixture),
-      kUNIFORM_MIXTURE(1.0 - m1mixture),
-      kUNIFORM_TARGET(1.0 / vocab_e_size) {
-    assert(m1mixture >= 0.0 && m1mixture <= 1.0);
-    assert(vocab_e_size > 0);
-  }
-
-  // return p0 of rule.e_ | rule.f_
-  prob_t operator()(const TRule& rule) const {
-    return p0(rule.f_, rule.e_, 0, 0);
-  }
-
-  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
-  const Model1& model1;
-  const prob_t kM1MIXTURE;  // Model 1 mixture component
-  const prob_t kUNIFORM_MIXTURE; // uniform mixture component
-  const prob_t kUNIFORM_TARGET;
-};
-
-struct PhraseJointBase {
-  explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) :
-      model1(m1),
-      kM1MIXTURE(m1mixture),
-      kUNIFORM_MIXTURE(1.0 - m1mixture),
-      kUNIFORM_SOURCE(1.0 / vocab_f_size),
-      kUNIFORM_TARGET(1.0 / vocab_e_size) {
-    assert(m1mixture >= 0.0 && m1mixture <= 1.0);
-    assert(vocab_e_size > 0);
-  }
-
-  // return p0 of rule.e_ , rule.f_
-  prob_t operator()(const TRule& rule) const {
-    return p0(rule.f_, rule.e_, 0, 0);
-  }
-
-  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
-  const Model1& model1;
-  const prob_t kM1MIXTURE;  // Model 1 mixture component
-  const prob_t kUNIFORM_MIXTURE; // uniform mixture component
-  const prob_t kUNIFORM_SOURCE;
-  const prob_t kUNIFORM_TARGET;
-};
-
-struct PhraseJointBase_BiDir {
-  explicit PhraseJointBase_BiDir(const Model1& m1,
-                                 const Model1& im1,
-                                 const double m1mixture,
-                                 const unsigned vocab_e_size,
-                                 const unsigned vocab_f_size) :
-      model1(m1),
-      invmodel1(im1),
-      kM1MIXTURE(m1mixture),
-      kUNIFORM_MIXTURE(1.0 - m1mixture),
-      kUNIFORM_SOURCE(1.0 / vocab_f_size),
-      kUNIFORM_TARGET(1.0 / vocab_e_size) {
-    assert(m1mixture >= 0.0 && m1mixture <= 1.0);
-    assert(vocab_e_size > 0);
-  }
-
-  // return p0 of rule.e_ , rule.f_
-  prob_t operator()(const TRule& rule) const {
-    return p0(rule.f_, rule.e_, 0, 0);
-  }
-
-  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
-
-  const Model1& model1;
-  const Model1& invmodel1;
-  const prob_t kM1MIXTURE;  // Model 1 mixture component
-  const prob_t kUNIFORM_MIXTURE; // uniform mixture component
-  const prob_t kUNIFORM_SOURCE;
-  const prob_t kUNIFORM_TARGET;
-};
-
-// base distribution for jump size multinomials
-// basically p(0) = 0 and then, p(1) is max, and then
-// you drop as you move to the max jump distance
-struct JumpBase {
-  JumpBase();
-
-  const prob_t& operator()(int jump, unsigned src_len) const {
-    assert(jump != 0);
-    const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump);
-    assert(it != p[src_len].end());
-    return it->second;
-  }
-  std::vector<std::map<int, prob_t> > p;
-};
-
-
-#endif
diff --git a/mteval/ns.cc b/mteval/ns.cc
index 68c8deaa..da678b84 100644
--- a/mteval/ns.cc
+++ b/mteval/ns.cc
@@ -136,6 +136,10 @@ struct BleuSegmentEvaluator : public SegmentEvaluator {
                          float* correct,  // N elements reserved
                          float* hyp,      // N elements reserved
                          bool clip_counts = true) const {
+    // clear clipping stats
+    for (typename NGramCountMap::iterator it = ngrams_.begin(); it != ngrams_.end(); ++it)
+      it->second.second = 0;
+
     vector<WordID> ngram(N);
     *correct *= 0;
     *hyp *= 0;
diff --git a/mteval/ns.h b/mteval/ns.h
index 622265db..d88c263b 100644
--- a/mteval/ns.h
+++ b/mteval/ns.h
@@ -6,6 +6,7 @@
 #include <map>
 #include <boost/shared_ptr.hpp>
 #include "wordid.h"
+#include <iostream>
 
 class SufficientStats {
  public:
@@ -43,6 +44,11 @@ class SufficientStats {
   bool operator==(const SufficientStats& other) const {
     return other.fields == fields;
   }
+  bool IsAdditiveIdentity() const {
+    for (unsigned i = 0; i < fields.size(); ++i)
+      if (fields[i]) return false;
+    return true;
+  }
   size_t size() const { return fields.size(); }
   float operator[](size_t i) const {
     if (i < fields.size()) return fields[i];
@@ -54,12 +60,12 @@ class SufficientStats {
   std::vector<float> fields;
 };
 
-inline const SufficientStats& operator+(const SufficientStats& a, const SufficientStats& b) {
+inline const SufficientStats operator+(const SufficientStats& a, const SufficientStats& b) {
   SufficientStats res(a);
   return res += b;
 }
 
-inline const SufficientStats& operator-(const SufficientStats& a, const SufficientStats& b) {
+inline const SufficientStats operator-(const SufficientStats& a, const SufficientStats& b) {
   SufficientStats res(a);
   return res -= b;
 }
diff --git a/vest/ces.cc b/vest/ces.cc
index 4ae6b695..cd89aa69 100644
--- a/vest/ces.cc
+++ b/vest/ces.cc
@@ -4,25 +4,32 @@
 #include <sstream>
 #include <boost/shared_ptr.hpp>
 
-#include "aligner.h"
+// TODO, if AER is to be optimized again, we will need this
+// #include "aligner.h"
 #include "lattice.h"
 #include "viterbi_envelope.h"
 #include "error_surface.h"
+#include "ns.h"
 
 using boost::shared_ptr;
 using namespace std;
 
 const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them
 
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) {
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+                         const ViterbiEnvelope& ve,
+                         ErrorSurface* env,
+                         const EvaluationMetric* metric,
+                         const Hypergraph& hg) {
   vector<WordID> prev_trans;
   const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
   env->resize(ienv.size());
-  ScoreP prev_score;
+  SufficientStats prev_score; // defaults to 0
   int j = 0;
   for (int i = 0; i < ienv.size(); ++i) {
     const Segment& seg = *ienv[i];
     vector<WordID> trans;
+#if 0
     if (type == AER) {
       vector<bool> edges(hg.edges_.size(), false);
       seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi
@@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er
       string tstr = os.str();
       TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
     } else {
+#endif
       seg.ConstructTranslation(&trans);
-    }
-    // cerr << "Scoring: " << TD::GetString(trans) << endl;
+    //}
+    //cerr << "Scoring: " << TD::GetString(trans) << endl;
     if (trans == prev_trans) {
       if (!minimize_segments) {
-        assert(prev_score); // if this fails, it means
-	                    // the decoder can generate null translations
         ErrorSegment& out = (*env)[j];
-        out.delta = prev_score->GetZero();
+        out.delta.fields.clear();
         out.x = seg.x;
 	++j;
       }
-      // cerr << "Identical translation, skipping scoring\n";
+      //cerr << "Identical translation, skipping scoring\n";
     } else {
-      ScoreP score = ss.ScoreCandidate(trans);
+      SufficientStats score;
+      ss.Evaluate(trans, &score);
       // cerr << "score= " << score->ComputeScore() << "\n";
-      ScoreP cur_delta_p = score->GetZero();
-      Score* cur_delta = cur_delta_p.get();
-      // just record the score diffs
-      if (!prev_score)
-        prev_score = score->GetZero();
-
-      score->Subtract(*prev_score, cur_delta);
+      //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl;
+      const SufficientStats delta = score - prev_score;
+      //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl;
+      //string xx; delta.Encode(&xx); cerr << xx << endl;
       prev_trans.swap(trans);
       prev_score = score;
-      if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
+      if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {
         ErrorSegment& out = (*env)[j];
-        out.delta = cur_delta_p;
+        out.delta = delta;
         out.x = seg.x;
         ++j;
       }
diff --git a/vest/ces.h b/vest/ces.h
index 2f098990..e021e715 100644
--- a/vest/ces.h
+++ b/vest/ces.h
@@ -1,12 +1,16 @@
 #ifndef _CES_H_
 #define _CES_H_
 
-#include "scorer.h"
-
 class ViterbiEnvelope;
 class Hypergraph;
+class SegmentEvaluator;
 class ErrorSurface;
+class EvaluationMetric;
 
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg);
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+                         const ViterbiEnvelope& ve,
+                         ErrorSurface* es,
+                         const EvaluationMetric* metric,
+                         const Hypergraph& hg);
 
 #endif
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index c382a972..8cde748b 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -364,7 +364,7 @@ while (1){
 			$mapoutput =~ s/mapinput/mapoutput/;
 			push @mapoutputs, "$dir/splag.$im1/$mapoutput";
 			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-			my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+			my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
 			if ($use_make) {
 				my $script_file = "$dir/scripts/map.$shard";
 				open F, ">$script_file" or die "Can't write $script_file: $!";
@@ -424,7 +424,7 @@ while (1){
 		print STDERR "Results for $tol/$til lines\n";
 		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
 		print STDERR unchecked_output("date");
-		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
+		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";
 		print STDERR "COMMAND:\n$cmd\n";
 		check_bash_call($cmd);
 		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
diff --git a/vest/error_surface.cc b/vest/error_surface.cc
index 754aa8de..515b67f8 100644
--- a/vest/error_surface.cc
+++ b/vest/error_surface.cc
@@ -5,8 +5,7 @@
 
 using namespace std;
 
-ErrorSurface::~ErrorSurface() {
-}
+ErrorSurface::~ErrorSurface() {}
 
 void ErrorSurface::Serialize(std::string* out) const {
   const int segments = this->size();
@@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const {
   for (int i = 0; i < segments; ++i) {
     const ErrorSegment& cur = (*this)[i];
     string senc;
-    cur.delta->Encode(&senc);
-    assert(senc.size() < 256);
+    cur.delta.Encode(&senc);
+    assert(senc.size() < 1024);  // FIXME: len below is an unsigned char, so any size >= 256 is truncated when stored
     unsigned char len = senc.size();
     os.write((const char*)&cur.x, sizeof(cur.x));
     os.write((const char*)&len, sizeof(len));
@@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const {
   *out = os.str();
 }
 
-void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
+void ErrorSurface::Deserialize(const std::string& in) {
   istringstream is(in, ios::binary);
   int segments;
   is.read((char*)&segments, sizeof(segments));
@@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
     is.read((char*)&len, sizeof(len));
     string senc(len, '\0'); assert(senc.size() == len);
     is.read((char*)&senc[0], len);
-    cur.delta = SentenceScorer::CreateScoreFromString(type, senc);
+    cur.delta = SufficientStats(senc);
   }
 }
 
diff --git a/vest/error_surface.h b/vest/error_surface.h
index ad728cfa..bb65847b 100644
--- a/vest/error_surface.h
+++ b/vest/error_surface.h
@@ -4,13 +4,13 @@
 #include <vector>
 #include <string>
 
-#include "scorer.h"
+#include "ns.h"
 
 class Score;
 
 struct ErrorSegment {
   double x;
-  ScoreP delta;
+  SufficientStats delta;
   ErrorSegment() : x(0), delta() {}
 };
 
@@ -18,7 +18,7 @@ class ErrorSurface : public std::vector<ErrorSegment> {
  public:
   ~ErrorSurface();
   void Serialize(std::string* out) const;
-  void Deserialize(ScoreType type, const std::string& in);
+  void Deserialize(const std::string& in);
 };
 
 #endif
diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc
index 7303df8d..49443fbe 100644
--- a/vest/line_optimizer.cc
+++ b/vest/line_optimizer.cc
@@ -4,7 +4,7 @@
 #include <algorithm>
 
 #include "sparse_vector.h"
-#include "scorer.h"
+#include "ns.h"
 
 using namespace std;
 
@@ -18,6 +18,7 @@ struct IntervalComp {
 };
 
 double LineOptimizer::LineOptimize(
+    const EvaluationMetric* metric,
     const vector<ErrorSurface>& surfaces,
     const LineOptimizer::ScoreType type,
     float* best_score,
@@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize(
   }
   sort(all_ints.begin(), all_ints.end(), IntervalComp());
   double last_boundary = all_ints.front()->x;
-  ScoreP accp = all_ints.front()->delta->GetZero();
-  Score *acc=accp.get();
+  SufficientStats acc;
   float& cur_best_score = *best_score;
   cur_best_score = (type == MAXIMIZE_SCORE ?
     -numeric_limits<float>::max() : numeric_limits<float>::max());
@@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize(
   for (vector<ErrorIter>::iterator i = all_ints.begin();
        i != all_ints.end(); ++i) {
     const ErrorSegment& seg = **i;
-    assert(seg.delta);
     if (seg.x - last_boundary > epsilon) {
-      float sco = acc->ComputeScore();
+      float sco = metric->ComputeScore(acc);
       if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
           (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
         cur_best_score = sco;
@@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize(
 	} else {
 	  pos = last_boundary + (seg.x - last_boundary) / 2;
 	}
-	// cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n";
+	//cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n";
       }
-      // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx;
+      // string xx = metric->DetailedScore(acc); cerr << "---- " << xx;
       // cerr << "---- s=" << sco << "\n";
       last_boundary = seg.x;
     }
     // cerr << "x-boundary=" << seg.x << "\n";
-    acc->PlusEquals(*seg.delta);
+    //string x2; acc.Encode(&x2); cerr << "   ACC: " << x2 << endl;
+    //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl;
+    acc += seg.delta;
   }
-  float sco = acc->ComputeScore();
+  float sco = metric->ComputeScore(acc);
   if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
       (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
     cur_best_score = sco;
@@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections(
      RandomUnitVector(features_to_optimize, &out[i], rng);
   cerr << "Generated " << out.size() << " total axes to optimize along.\n";
 }
+
diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h
index 99a591f4..83819f41 100644
--- a/vest/line_optimizer.h
+++ b/vest/line_optimizer.h
@@ -7,6 +7,7 @@
 #include "error_surface.h"
 #include "sampler.h"
 
+class EvaluationMetric;
 class Weights;
 
 struct LineOptimizer {
@@ -18,6 +19,7 @@ struct LineOptimizer {
   // merge all the error surfaces together into a global
   // error surface and find (the middle of) the best segment
   static double LineOptimize(
+     const EvaluationMetric* metric,
      const std::vector<ErrorSurface>& envs,
      const LineOptimizer::ScoreType type,
      float* best_score,
diff --git a/vest/lo_test.cc b/vest/lo_test.cc
index f5638600..a67f65e1 100644
--- a/vest/lo_test.cc
+++ b/vest/lo_test.cc
@@ -5,6 +5,8 @@
 #include <boost/shared_ptr.hpp>
 #include <gtest/gtest.h>
 
+#include "ns.h"
+#include "ns_docscorer.h"
 #include "ces.h"
 #include "fdict.h"
 #include "hg.h"
@@ -15,7 +17,6 @@
 #include "viterbi.h"
 #include "viterbi_envelope.h"
 #include "line_optimizer.h"
-#include "scorer.h"
 
 using namespace std;
 using boost::shared_ptr;
@@ -141,9 +142,6 @@ TEST_F(OptTest, TestS1) {
   TD::ConvertSentence(ref22, &refs2[1]);
   TD::ConvertSentence(ref32, &refs2[2]);
   TD::ConvertSentence(ref42, &refs2[3]);
-  ScoreType type = ScoreTypeFromString("ibm_bleu");
-  ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1);
-  ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2);
   vector<ViterbiEnvelope> envs(2);
 
   RandomNumberGenerator<boost::mt19937> rng;
@@ -167,14 +165,17 @@ TEST_F(OptTest, TestS1) {
   envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);
 
   vector<ErrorSurface> es(2);
-  ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
-  ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2);
+  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1);
+  boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2);
+  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+  ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2);
   cerr << envs[0].size() << " " << envs[1].size() << endl;
   cerr << es[0].size() << " " << es[1].size() << endl;
   envs.clear();
   clock_t t_env=clock();
   float score;
-  double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score);
+  double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score);
   clock_t t_opt=clock();
   cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
   EXPECT_FLOAT_EQ(0.48719698, score);
@@ -217,15 +218,15 @@ TEST_F(OptTest,TestZeroOrigin) {
   vector<ViterbiEnvelope> envs(1);
   envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
 
-  ScoreType type = ScoreTypeFromString("ibm_bleu");
   vector<vector<WordID> > mr(4);
   TD::ConvertSentence("untitled", &mr[0]);
   TD::ConvertSentence("with no title", &mr[1]);
   TD::ConvertSentence("without a title", &mr[2]);
   TD::ConvertSentence("without title", &mr[3]);
-  ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr);
+  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr);
   vector<ErrorSurface> es(1);
-  ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
+  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
 }
 
 int main(int argc, char **argv) {
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
index 71dda6d7..8f6e085d 100644
--- a/vest/mr_vest_map.cc
+++ b/vest/mr_vest_map.cc
@@ -6,11 +6,12 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "ns.h"
+#include "ns_docscorer.h"
 #include "ces.h"
 #include "filelib.h"
 #include "stringlib.h"
 #include "sparse_vector.h"
-#include "scorer.h"
 #include "viterbi_envelope.h"
 #include "inside_outside.h"
 #include "error_surface.h"
@@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   opts.add_options()
         ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
         ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
-        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+        ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized")
         ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
         ("help,h", "Help");
   po::options_description dcmdline_options;
@@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
-  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
   Hypergraph hg;
   string last_file;
   ReadFile in_read(conf["input"].as<string>());
@@ -97,7 +98,8 @@ int main(int argc, char** argv) {
     ViterbiEnvelopeWeightFunction wf(origin, axis);
     ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
     ErrorSurface es;
-    ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg);
+
+    ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg);
     //cerr << "Viterbi envelope has " << ve.size() << " segments\n";
     // cerr << "Error surface has " << es.size() << " segments\n";
     string val;
diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc
index 3df52020..dda61f88 100644
--- a/vest/mr_vest_reduce.cc
+++ b/vest/mr_vest_reduce.cc
@@ -10,6 +10,7 @@
 #include "error_surface.h"
 #include "line_optimizer.h"
 #include "b64tools.h"
+#include "stringlib.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -17,12 +18,12 @@ namespace po = boost::program_options;
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
   po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  bool flag = conf->count("loss_function") == 0;
+  bool flag = conf->count("evaluation_metric") == 0;
   if (flag || conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
@@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
   LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
-  if (type == TER || type == AER) {
+  if (UppercaseString(evaluation_metric) == "TER")
     opt_type = LineOptimizer::MINIMIZE_SCORE;
-  }
-  string last_key;
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+
   vector<ErrorSurface> esv;
-  while(cin) {
-    string line;
-    getline(cin, line);
-    if (line.empty()) continue;
+  string last_key, line, key, val;
+  while(getline(cin, line)) {
     size_t ks = line.find("\t");
     assert(string::npos != ks);
     assert(ks > 2);
-    string key = line.substr(2, ks - 2);
-    string val = line.substr(ks + 1);
+    key = line.substr(2, ks - 2);
+    val = line.substr(ks + 1);
     if (key != last_key) {
       if (!last_key.empty()) {
 	float score;
-        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+        double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
 	cout << last_key << "|" << x << "|" << score << endl;
       }
-      last_key = key;
+      last_key.swap(key);
       esv.clear();
     }
     if (val.size() % 4 != 0) {
@@ -68,13 +66,11 @@ int main(int argc, char** argv) {
       continue;
     }
     esv.push_back(ErrorSurface());
-    esv.back().Deserialize(type, encoded);
+    esv.back().Deserialize(encoded);
   }
   if (!esv.empty()) {
-    // cerr << "ESV=" << esv.size() << endl;
-    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
     float score;
-    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+    double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
     cout << last_key << "|" << x << "|" << score << endl;
   }
   return 0;
-- 
cgit v1.2.3


From 5b910a0021a1b454b89c31046cbfd37a35f5ffab Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 2 Feb 2012 12:33:13 -0500
Subject: fix broken build

---
 gi/pf/Makefile.am           | 2 +-
 gi/pf/base_distributions.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 28367e67..8d43f36d 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp
 
 noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
+libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
 
 align_lexonly_SOURCES = align-lexonly.cc
 
diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc
index 4b1863fa..d362fd76 100644
--- a/gi/pf/base_distributions.cc
+++ b/gi/pf/base_distributions.cc
@@ -1,4 +1,4 @@
-#include "base_measures.h"
+#include "base_distributions.h"
 
 #include <iostream>
 
-- 
cgit v1.2.3


From 2c3ee44cea2c46c6c1cdd21bc20568142181937b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 3 Feb 2012 21:11:40 -0500
Subject: fix broken build

---
 gi/pf/align-lexonly-pyp.cc | 2 +-
 gi/pf/align-lexonly.cc     | 2 +-
 gi/pf/conditional_pseg.h   | 2 +-
 gi/pf/condnaive.cc         | 2 +-
 gi/pf/dpnaive.cc           | 2 +-
 gi/pf/monotonic_pseg.h     | 2 +-
 gi/pf/pfdist.cc            | 2 +-
 gi/pf/pfnaive.cc           | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index d2630a2b..e24cb457 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -7,7 +7,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "array2d.h"
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "monotonic_pseg.h"
 #include "conditional_pseg.h"
 #include "trule.h"
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
index 76e2e009..8c1d689f 100644
--- a/gi/pf/align-lexonly.cc
+++ b/gi/pf/align-lexonly.cc
@@ -7,7 +7,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "array2d.h"
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "monotonic_pseg.h"
 #include "conditional_pseg.h"
 #include "trule.h"
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index db951d15..0aa5e8e0 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -10,7 +10,7 @@
 #include "ccrp_nt.h"
 #include "mfcr.h"
 #include "trule.h"
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "tdict.h"
 
 template <typename ConditionalBaseMeasure>
diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc
index 52ddbbfe..3ea88016 100644
--- a/gi/pf/condnaive.cc
+++ b/gi/pf/condnaive.cc
@@ -6,7 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "monotonic_pseg.h"
 #include "conditional_pseg.h"
 #include "trule.h"
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index db1c43c7..469dff5c 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -6,7 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "monotonic_pseg.h"
 #include "trule.h"
 #include "tdict.h"
diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
index 301aa6d8..10d171fe 100644
--- a/gi/pf/monotonic_pseg.h
+++ b/gi/pf/monotonic_pseg.h
@@ -6,7 +6,7 @@
 #include "prob.h"
 #include "ccrp_nt.h"
 #include "trule.h"
-#include "base_measures.h"
+#include "base_distributions.h"
 
 template <typename BaseMeasure>
 struct MonotonicParallelSegementationModel {
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
index aae5f798..ef08a165 100644
--- a/gi/pf/pfdist.cc
+++ b/gi/pf/pfdist.cc
@@ -7,7 +7,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "pf.h"
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "reachability.h"
 #include "viterbi.h"
 #include "hg.h"
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index 728ec00d..acba9d22 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -7,7 +7,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "pf.h"
-#include "base_measures.h"
+#include "base_distributions.h"
 #include "monotonic_pseg.h"
 #include "reachability.h"
 #include "viterbi.h"
-- 
cgit v1.2.3


From 400d60b20e9e480b0eff9843404a4cb9f8bd02cc Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 8 Feb 2012 16:22:55 -0500
Subject: move widely duplicated math functions into m.h header

---
 .gitignore                     |  1 +
 gi/pf/base_distributions.cc    | 22 +++++------
 gi/pf/base_distributions.h     | 21 +---------
 gi/pf/conditional_pseg.h       |  3 +-
 gi/pf/pfdist.cc                |  6 +--
 gi/pf/pfnaive.cc               |  4 +-
 phrasinator/gibbs_train_plm.cc |  8 +---
 utils/Makefile.am              |  5 ++-
 utils/m.h                      | 89 ++++++++++++++++++++++++++++++++++++++++++
 utils/m_test.cc                | 75 +++++++++++++++++++++++++++++++++++
 utils/mfcr.h                   | 22 ++---------
 11 files changed, 194 insertions(+), 62 deletions(-)
 create mode 100644 utils/m.h
 create mode 100644 utils/m_test.cc

(limited to 'gi/pf')

diff --git a/.gitignore b/.gitignore
index ab8bf2c7..4f75d153 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 mira/kbest_mira
+utils/m_test
 sa-extract/calignment.c
 sa-extract/calignment.so
 sa-extract/cdat.c
diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc
index d362fd76..d9761005 100644
--- a/gi/pf/base_distributions.cc
+++ b/gi/pf/base_distributions.cc
@@ -59,7 +59,7 @@ prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
   const int flen = vsrc.size() - start_src;
   const int elen = vtrg.size() - start_trg;
   prob_t p;
-  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
+  p.logeq(Md::log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
   //p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
   for (int i = 0; i < elen; ++i)
     p *= u(vtrg[i + start_trg]);                        // draw e_i             ~Uniform
@@ -73,7 +73,7 @@ prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
   const int elen = vtrg.size() - start_trg;
   prob_t p;
   //p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
-  p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01)
+  p.logeq(Md::log_poisson(elen, 1));       // elen                 ~Pois(1), independent of flen
   for (int i = 0; i < elen; ++i)
     p *= kUNIFORM_TARGET;                        // draw e_i             ~Uniform
   return p;
@@ -113,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,
   const int elen = vtrg.size() - start_trg;
   prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
   prob_t p;
-  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
+  p.logeq(Md::log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)
   for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS
     const WordID trg = vtrg[i + start_trg];
     prob_t tp = prob_t::Zero();
@@ -139,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
   const int elen = vtrg.size() - start_trg;
   prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
   prob_t p;
-  p.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
+  p.logeq(Md::log_poisson(flen, 1.0));               // flen                 ~Pois(1)
                                                  // elen | flen          ~Pois(flen + 0.01)
-  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+  prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));
   p *= ptrglen;
   p *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
   for (int i = 0; i < elen; ++i) {               // for each position i in E
@@ -171,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
   prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));
 
   prob_t p1;
-  p1.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
+  p1.logeq(Md::log_poisson(flen, 1.0));               // flen                 ~Pois(1)
                                                  // elen | flen          ~Pois(flen + 0.01)
-  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+  prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));
   p1 *= ptrglen;
   p1 *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
   for (int i = 0; i < elen; ++i) {               // for each position i in E
@@ -193,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
   }
 
   prob_t p2;
-  p2.logeq(log_poisson(elen, 1.0));               // elen                 ~Pois(1)
+  p2.logeq(Md::log_poisson(elen, 1.0));               // elen                 ~Pois(1)
                                                  // flen | elen          ~Pois(flen + 0.01)
-  prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01));
+  prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01));
   p2 *= psrclen;
   p2 *= kUNIFORM_TARGET.pow(elen);                // each f in F ~Uniform
   for (int i = 0; i < flen; ++i) {               // for each position i in E
@@ -227,9 +227,9 @@ JumpBase::JumpBase() : p(200) {
     for (int j = min_jump; j <= max_jump; ++j) {
       prob_t& cp = cpd[j];
       if (j < 0)
-        cp.logeq(log_poisson(1.5-j, 1));
+        cp.logeq(Md::log_poisson(1.5-j, 1));
       else if (j > 0)
-        cp.logeq(log_poisson(j, 1));
+        cp.logeq(Md::log_poisson(j, 1));
       cp.poweq(0.2);
       z += cp;
     }
diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h
index a23ac32b..0d597c5c 100644
--- a/gi/pf/base_distributions.h
+++ b/gi/pf/base_distributions.h
@@ -13,24 +13,7 @@
 #include "prob.h"
 #include "tdict.h"
 #include "sampler.h"
-
-inline double log_poisson(unsigned x, const double& lambda) {
-  assert(lambda > 0.0);
-  return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-inline double log_binom_coeff(unsigned n, unsigned k) {
-  assert(n >= k);
-  if (n == k) return 0.0;
-  return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1);
-}
-
-// http://en.wikipedia.org/wiki/Negative_binomial_distribution
-inline double log_negative_binom(unsigned x, unsigned r, double p) {
-  assert(p > 0.0);
-  assert(p < 1.0);
-  return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p);
-}
+#include "m.h"
 
 inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
   os << '[';
@@ -68,7 +51,7 @@ struct Model1 {
 struct PoissonUniformUninformativeBase {
   explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
   prob_t operator()(const TRule& r) const {
-    prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0));
+    prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0));
     prob_t q = kUNIFORM; q.poweq(r.e_.size());
     p *= q;
     return p;
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index 0aa5e8e0..2e9e38fc 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -6,6 +6,7 @@
 #include <boost/functional/hash.hpp>
 #include <iostream>
 
+#include "m.h"
 #include "prob.h"
 #include "ccrp_nt.h"
 #include "mfcr.h"
@@ -210,7 +211,7 @@ struct ConditionalParallelSegementationModel {
 
   prob_t AlignProbability(unsigned span) const {
     prob_t p;
-    p.logeq(aligns.logprob(span, log_poisson(span, 1.0)));
+    p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0)));
     return p;
   }
 
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
index ef08a165..3d578db2 100644
--- a/gi/pf/pfdist.cc
+++ b/gi/pf/pfdist.cc
@@ -315,7 +315,7 @@ struct BackwardEstimate {
       for (int i = 0; i < src_cov.size(); ++i)
         if (!src_cov[i]) r.push_back(src_[i]);
       const prob_t uniform_alignment(1.0 / r.size());
-      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
       for (unsigned j = trg_cov; j < trg_.size(); ++j) {
         prob_t p;
         for (unsigned i = 0; i < r.size(); ++i)
@@ -352,7 +352,7 @@ struct BackwardEstimateSym {
         if (!src_cov[i]) r.push_back(src_[i]);
       r.push_back(0);  // NULL word
       const prob_t uniform_alignment(1.0 / r.size());
-      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
       for (unsigned j = trg_cov; j < trg_.size(); ++j) {
         prob_t p;
         for (unsigned i = 0; i < r.size(); ++i)
@@ -367,7 +367,7 @@ struct BackwardEstimateSym {
       r.pop_back();
       const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
       prob_t inv;
-      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+      inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));
       for (unsigned i = 0; i < r.size(); ++i) {
         prob_t p;
         for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index acba9d22..e1a53f5c 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -77,7 +77,7 @@ struct BackwardEstimateSym {
         r.push_back(src_[i]);
       r.push_back(0);  // NULL word
       const prob_t uniform_alignment(1.0 / r.size());
-      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
+      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)
       for (unsigned j = trg_cov; j < trg_.size(); ++j) {
         prob_t p;
         for (unsigned i = 0; i < r.size(); ++i)
@@ -92,7 +92,7 @@ struct BackwardEstimateSym {
       r.pop_back();
       const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));
       prob_t inv;
-      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov));
+      inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));
       for (unsigned i = 0; i < r.size(); ++i) {
         prob_t p;
         for (unsigned j = trg_cov - 1; j < trg_.size(); ++j)
diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc
index 29b3d7ea..66b46011 100644
--- a/phrasinator/gibbs_train_plm.cc
+++ b/phrasinator/gibbs_train_plm.cc
@@ -8,6 +8,7 @@
 #include "dict.h"
 #include "sampler.h"
 #include "ccrp.h"
+#include "m.h"
 
 using namespace std;
 using namespace std::tr1;
@@ -95,11 +96,6 @@ void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab
   if (in != &cin) delete in;
 }
 
-double log_poisson(unsigned x, const double& lambda) {
-  assert(lambda > 0.0);
-  return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
 struct UniphraseLM {
   UniphraseLM(const vector<vector<int> >& corpus,
               const set<int>& vocab,
@@ -128,7 +124,7 @@ struct UniphraseLM {
   double log_p0(const vector<int>& phrase) const {
     double len_logprob;
     if (use_poisson_)
-      len_logprob = log_poisson(phrase.size(), 1.0);
+      len_logprob = Md::log_poisson(phrase.size(), 1.0);
     else
       len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_);
     return log(uniform_word_) * phrase.size() + len_logprob;
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 3e559c75..a1ea8270 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -7,11 +7,12 @@ TESTS = ts phmt mfcr_test
 if HAVE_GTEST
 noinst_PROGRAMS += \
   dict_test \
+  m_test \
   weights_test \
   logval_test \
   small_vector_test
 
-TESTS += small_vector_test logval_test weights_test dict_test
+TESTS += small_vector_test logval_test weights_test dict_test m_test
 endif
 
 reconstruct_weights_SOURCES = reconstruct_weights.cc
@@ -38,6 +39,8 @@ endif
 
 phmt_SOURCES = phmt.cc
 ts_SOURCES = ts.cc
+m_test_SOURCES = m_test.cc
+m_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
 dict_test_SOURCES = dict_test.cc
 dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
 mfcr_test_SOURCES = mfcr_test.cc
diff --git a/utils/m.h b/utils/m.h
new file mode 100644
index 00000000..b25248c2
--- /dev/null
+++ b/utils/m.h
@@ -0,0 +1,89 @@
+#ifndef _M_H_
+#define _M_H_
+
+#include <cassert>
+#include <cmath>
+
+template <typename F>
+struct M {  // log-space probability mass/density functions and combinatorial helpers; F is the floating-point type
+  // log Poisson pmf: x*log(lambda) - log(x!) - lambda; support [0, 1, 2 ...), requires lambda > 0
+  static inline F log_poisson(unsigned x, const F& lambda) {
+    assert(lambda > 0.0);
+    return std::log(lambda) * x - lgamma(x + 1) - lambda;
+  }
+
+  // log geometric pmf: x*log(1-p) + log(p); support [0, 1, 2 ...), requires 0 < p < 1
+  static inline F log_geometric(unsigned x, const F& p) {
+    assert(p > 0.0);
+    assert(p < 1.0);
+    return std::log(1 - p) * x + std::log(p);
+  }
+
+  // log of the binomial coefficient C(n, k); requires n >= k
+  static inline F log_binom_coeff(unsigned n, unsigned k) {
+    assert(n >= k);
+    if (n == k) return 0.0;
+    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1);
+  }
+
+  // http://en.wikipedia.org/wiki/Negative_binomial_distribution
+  // support [0, 1, 2 ...); returns log C(x+r-1, x) + r*log(1-p) + x*log(p), requires 0 < p < 1
+  static inline F log_negative_binom(unsigned x, unsigned r, const F& p) {
+    assert(p > 0.0);
+    assert(p < 1.0);
+    return log_binom_coeff(x + r - 1u, x) + r * std::log(F(1) - p) + x * std::log(p);
+  }
+
+  // this is the Beta function, *not* the beta probability density
+  // http://mathworld.wolfram.com/BetaFunction.html
+  static inline F log_beta_fn(const F& x, const F& y) {
+    return lgamma(x) + lgamma(y) - lgamma(x + y);
+  }
+
+  // support x >= 0.0; NOTE(review): the formula divides x by `rate`, i.e. it uses `rate` as a scale parameter - confirm intent
+  static F log_gamma_density(const F& x, const F& shape, const F& rate) {
+    assert(x >= 0.0);
+    assert(shape > 0.0);
+    assert(rate > 0.0);
+    return (shape-1)*std::log(x) - shape*std::log(rate) - x/rate - lgamma(shape);
+  }
+
+  // this is the Beta *density* p(x ; alpha, beta)
+  // support x \in (0,1); requires alpha > 0 and beta > 0
+  static inline F log_beta_density(const F& x, const F& alpha, const F& beta) {
+    assert(x > 0.0);
+    assert(x < 1.0);
+    assert(alpha > 0.0);
+    assert(beta > 0.0);
+    return (alpha-1)*std::log(x)+(beta-1)*std::log(1-x) - log_beta_fn(alpha, beta);
+  }
+
+  // note: this has been adapted so that 0 is in the support of the distribution
+  // support [0, 1, 2 ...); returns log(rho) + log B(x + 1, rho + 1), requires rho > 0
+  static inline F log_yule_simon(unsigned x, const F& rho) {
+    assert(rho > 0.0);
+    return std::log(rho) + log_beta_fn(x + 1, rho + 1);
+  }
+
+  // see http://www.gatsby.ucl.ac.uk/~ywteh/research/compling/hpylm.pdf
+  // log of x(x+y)(x+2y)...(x+(n-1)y); when y=1, sometimes written x^{\overline{n}} or x^{(n)} "Pochhammer symbol"
+  static inline F log_generalized_factorial(const F& x, const F& n, const F& y = 1.0) {
+    assert(x > 0.0);
+    assert(y >= 0.0);
+    assert(n >= 0.0);  // n == 0 is the empty product, handled on the next line (previously rejected by assert(n > 0.0))
+    if (!n) return 0.0;
+    if (y == F(1)) {
+      return lgamma(x + n) - lgamma(x);
+    } else if (y) {
+      return n * std::log(y) + lgamma(x/y + n) - lgamma(x/y);
+    } else {  // y == 0.0
+      return n * std::log(x);
+    }
+  }
+
+};
+
+typedef M<double> Md;  // double-precision instantiation
+typedef M<float> Mf;   // single-precision instantiation (was M<double>, which merely duplicated Md)
+
+#endif
diff --git a/utils/m_test.cc b/utils/m_test.cc
new file mode 100644
index 00000000..fca8f895
--- /dev/null
+++ b/utils/m_test.cc
@@ -0,0 +1,75 @@
+#include "m.h"
+
+#include <iostream>
+#include <gtest/gtest.h>
+#include <cassert>
+
+using namespace std;
+
+class MTest : public testing::Test {  // gtest fixture for the M<> helpers; it holds no state
+ public:
+  MTest() {}
+ protected:
+  virtual void SetUp() { }     // nothing to prepare before a test
+  virtual void TearDown() { }  // nothing to release after a test
+};
+
+TEST_F(MTest, Poisson) {  // log_poisson(i, 0.99) must strictly decrease for i = 0..9 and the summed mass must not exceed 1
+  double prev = 1.0;  // starts above any log-probability so the first EXPECT_LT passes
+  double tot = 0;     // running sum of exp(p) over the values visited
+  for (int i = 0; i < 10; ++i) {
+    double p = Md::log_poisson(i, 0.99);
+    cerr << "p(i=" << i << ") = " << exp(p) << endl;
+    EXPECT_LT(p, prev);
+    tot += exp(p);
+    prev = p;
+  }
+  cerr << "  tot=" << tot << endl;
+  EXPECT_LE(tot, 1.0);  // only the first 10 support points are summed
+}
+
+TEST_F(MTest, YuleSimon) {  // log_yule_simon(i, 1.0) must strictly decrease for i = 0..9 and the summed mass must not exceed 1
+  double prev = 1.0;  // starts above any log-probability so the first EXPECT_LT passes
+  double tot = 0;     // running sum of exp(p) over the values visited
+  for (int i = 0; i < 10; ++i) {
+    double p = Md::log_yule_simon(i, 1.0);
+    cerr << "p(i=" << i << ") = " << exp(p) << endl;
+    EXPECT_LT(p, prev);
+    tot += exp(p);
+    prev = p;
+  }
+  cerr << "  tot=" << tot << endl;
+  EXPECT_LE(tot, 1.0);  // only the first 10 support points are summed
+}
+
+TEST_F(MTest, LogGeometric) {  // log_geometric(i, 0.5) must strictly decrease for i = 0..9 and the summed mass must not exceed 1
+  double prev = 1.0;  // starts above any log-probability so the first EXPECT_LT passes
+  double tot = 0;     // running sum of exp(p) over the values visited
+  for (int i = 0; i < 10; ++i) {
+    double p = Md::log_geometric(i, 0.5);
+    cerr << "p(i=" << i << ") = " << exp(p) << endl;
+    EXPECT_LT(p, prev);
+    tot += exp(p);
+    prev = p;
+  }
+  cerr << "  tot=" << tot << endl;
+  EXPECT_LE(tot, 1.0);  // only the first 10 support points are summed
+}
+
+TEST_F(MTest, GeneralizedFactorial) {  // checks log_generalized_factorial against lgamma and two hand-expanded products
+  for (double i = 0.3; i < 10000; i += 0.4) {  // non-integer n values, default y = 1
+    double a = Md::log_generalized_factorial(1.0, i);
+    double b = lgamma(1.0 + i);  // with x = 1 the helper computes lgamma(1 + n) - lgamma(1)
+    EXPECT_FLOAT_EQ(a,b);
+  }
+  double gf_3_6 = 3.0 * 4.0 * 5.0 * 6.0 * 7.0 * 8.0;  // six factors starting at x = 3, step 1
+  EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.0, 6.0), std::log(gf_3_6));
+  double gf_314_6 = 3.14 * 4.14 * 5.14 * 6.14 * 7.14 * 8.14;  // same, with a non-integer start x = 3.14
+  EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.14, 6.0), std::log(gf_314_6));
+}
+
+int main(int argc, char** argv) {  // test-runner entry point
+  testing::InitGoogleTest(&argc, argv);  // hands the command-line arguments to gtest
+  return RUN_ALL_TESTS();  // the gtest result becomes the process exit status
+}
+
diff --git a/utils/mfcr.h b/utils/mfcr.h
index 3eb133fc..396d0205 100644
--- a/utils/mfcr.h
+++ b/utils/mfcr.h
@@ -12,6 +12,7 @@
 #include <boost/functional/hash.hpp>
 #include "sampler.h"
 #include "slice_sampler.h"
+#include "m.h"
 
 struct TableCount {
   TableCount() : count(), floor() {}
@@ -218,31 +219,14 @@ class MFCR {
     return log_crp_prob(d_, alpha_);
   }
 
-  static double log_beta_density(const double& x, const double& alpha, const double& beta) {
-    assert(x > 0.0);
-    assert(x < 1.0);
-    assert(alpha > 0.0);
-    assert(beta > 0.0);
-    const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta);
-    return lp;
-  }
-
-  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
-    assert(x >= 0.0);
-    assert(shape > 0.0);
-    assert(rate > 0.0);
-    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
-    return lp;
-  }
-
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include draws from G_w's
   double log_crp_prob(const double& d, const double& alpha) const {
     double lp = 0.0;
     if (has_d_prior())
-      lp = log_beta_density(d, d_prior_alpha_, d_prior_beta_);
+      lp = Md::log_beta_density(d, d_prior_alpha_, d_prior_beta_);
     if (has_alpha_prior())
-      lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
+      lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
       if (d > 0.0) {
-- 
cgit v1.2.3


From 3b1851c8c4e5e7bec3e83177b5c08e566890517c Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Fri, 10 Feb 2012 18:31:05 +0000
Subject: better error checking

---
 gi/pf/guess-translits.pl | 1 +
 1 file changed, 1 insertion(+)

(limited to 'gi/pf')

diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
index ab737121..aafec13a 100755
--- a/gi/pf/guess-translits.pl
+++ b/gi/pf/guess-translits.pl
@@ -28,6 +28,7 @@ while(<STDIN>) {
   my %b2a;
   for my $ap (@as) {
     my ($a,$b) = split /-/, $ap;
+    die "BAD INPUT: $_\n" unless defined $a && defined $b;
     $a2b{$a}->{$b} = 1;
     $b2a{$b}->{$a} = 1;
   }
-- 
cgit v1.2.3


From e279f1fd267bc18763fa8ff456462c5e677689e9 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 25 Feb 2012 21:22:27 -0500
Subject: really slow hiero lm

---
 gi/pf/Makefile.am     |   4 +-
 gi/pf/hierolm.cc      | 309 +++++++++++++++++++++++++++++++++++++++++++++
 phrasinator/ccrp.h    | 294 -------------------------------------------
 utils/ccrp.h          | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++
 utils/ccrp_onetable.h |  12 ++
 utils/sampler.h       |   2 +-
 6 files changed, 665 insertions(+), 296 deletions(-)
 create mode 100644 gi/pf/hierolm.cc
 delete mode 100644 phrasinator/ccrp.h
 create mode 100644 utils/ccrp.h

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 8d43f36d..ed5b6fd3 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp hierolm
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
@@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
 itg_SOURCES = itg.cc
 
+hierolm_SOURCES = hierolm.cc
+
 condnaive_SOURCES = condnaive.cc
 
 dpnaive_SOURCES = dpnaive.cc
diff --git a/gi/pf/hierolm.cc b/gi/pf/hierolm.cc
new file mode 100644
index 00000000..afb12fef
--- /dev/null
+++ b/gi/pf/hierolm.cc
@@ -0,0 +1,309 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "inside_outside.h"
+#include "hg.h"
+#include "bottom_up_parser.h"
+#include "fdict.h"
+#include "grammar.h"
+#include "m.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;  // file-level RNG handle; main() creates it, seeded from --random_seed when that option is given
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses options into *conf; prints usage and exits if --help is given or --input is missing
+  po::options_description opts("Configuration options");  // accepted on the command line and in the config file
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");  // accepted on the command line only
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // command line is stored before the config file
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): the stream is not checked for open failure
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);  // exit status is 1 even when help was requested explicitly
+  }
+}
+
+void ReadCorpus(const string& filename,  // reads one sentence per line from `filename`; "-" means standard input
+                vector<vector<WordID> >* e,  // out: cleared, then receives one word-id vector per line read
+                set<WordID>* vocab_e) {  // out: cleared, then receives every word id that occurs in *e
+  e->clear();
+  vocab_e->clear();
+  istream* in;
+  if (filename == "-")
+    in = &cin;
+  else
+    in = new ifstream(filename.c_str());
+  if (!*in) { cerr << "Failed to read corpus from '" << filename << "'" << endl; exit(1); }  // was assert(*in), which vanishes under NDEBUG
+  string line;
+  while(*in) {
+    getline(*in, line);
+    if (line.empty() && !*in) break;  // end of input with nothing read; a final line without a newline is still kept
+    e->push_back(vector<int>());
+    vector<int>& le = e->back();
+    TD::ConvertSentence(line, &le);
+    for (unsigned i = 0; i < le.size(); ++i)
+      vocab_e->insert(le[i]);
+  }
+  if (in != &cin) delete in;  // only the heap-allocated ifstream is owned here
+}
+
+struct Grid {  // NOTE(review): not referenced anywhere else in this file
+  // a b c d e
+  // 0 - 0 - -
+  vector<int> grid;  // presumably one entry per token, as sketched above - confirm before use
+};
+
+struct BaseRuleModel {  // base distribution over rules; only the source side r.f_ is scored
+  explicit BaseRuleModel(unsigned term_size,  // number of terminal types (uniform choice among them)
+                         unsigned nonterm_size = 1) :  // number of nonterminal types
+      unif_term(1.0 / term_size),
+      unif_nonterm(1.0 / nonterm_size) {}
+  prob_t operator()(const TRule& r) const {
+    prob_t p; p.logeq(Md::log_poisson(r.f_.size(), 1.0));  // rule length ~ Poisson(lambda = 1); arguments are (x, lambda) and were swapped before
+    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));  // per-symbol probability of emitting a terminal
+    const prob_t nonterm_prob(1.0 - term_prob.as_float());
+    for (unsigned i = 0; i < r.f_.size(); ++i) {
+      if (r.f_[i] <= 0) {     // nonterminal
+        p *= nonterm_prob;
+        p *= unif_nonterm;
+      } else {                // terminal
+        p *= term_prob;
+        p *= unif_term;
+      }
+    }
+    return p;
+  }
+  const prob_t unif_term, unif_nonterm;  // 1/term_size and 1/nonterm_size
+};
+
+struct HieroLMModel {  // rule model: a CCRP over TRule dishes backed by the BaseRuleModel base distribution
+  explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {}  // x(1,1,1,1): the four CCRP prior parameters, all set to 1
+
+  prob_t Prob(const TRule& r) const {  // probability of r under the CRP, with p0(r) as the base probability
+    return x.probT<prob_t>(r, p0(r));
+  }
+
+  int Increment(const TRule& r, MT19937* rng) {  // adds one observation of r; forwards CCRP::incrementT's return value
+    return x.incrementT<prob_t>(r, p0(r), rng);
+    // return x.increment(r);
+  }
+
+  int Decrement(const TRule& r, MT19937* rng) {  // removes one observation of r; forwards CCRP::decrement's return value
+    return x.decrement(r, rng);
+    //return x.decrement(r);
+  }
+
+  prob_t Likelihood() const {  // CRP seating probability times p0(rule) once per table serving that rule
+    prob_t p;
+    p.logeq(x.log_crp_prob());
+    for (CCRP<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) {
+      prob_t tp = p0(it->first);
+      tp.poweq(it->second.table_counts_.size());  // raise to the number of tables for this rule
+      p *= tp;
+    }
+    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
+    //    p *= p0(it->first);
+    return p;
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {  // NOTE(review): not called from main() in this file
+    x.resample_hyperparameters(rng);
+    cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl;
+  }
+
+  const BaseRuleModel p0;  // base distribution over rules
+  CCRP<TRule> x;           // restaurant holding the rule counts
+  //CCRP_OneTable<TRule> x;
+};
+
+vector<GrammarIter* > tofreelist;  // every NPGrammarIter registers itself here on construction; main() deletes them in batches
+
+HieroLMModel* plm;  // set to &lm in main(); the functions below receive their model as a parameter that shadows this name
+
+struct NPGrammarIter : public GrammarIter, public RuleBin {  // grammar-trie node that builds its single rule on the fly from the symbols it was extended with
+  NPGrammarIter() : arity() { tofreelist.push_back(this); }  // root node: arity 0, no rule yet
+  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) {  // symbol < 0 is treated as a nonterminal and bumps the arity
+    if (inr) {
+      r.reset(new TRule(*inr));  // copy the parent's rule so the parent stays unchanged
+    } else {
+      static const int kLHS = -TD::Convert("X");
+      r.reset(new TRule);
+      r->lhs_ = kLHS;
+    }
+    TRule& rr = *r;
+    rr.f_.push_back(symbol);
+    rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);  // terminals are copied; the k-th nonterminal is written as 1-k (0, -1, ...)
+    tofreelist.push_back(this);  // lifetime is managed through the global free list
+  }
+  virtual int GetNumRules() const {  // 1 once a rule exists, otherwise 0
+    if (r) return 1; else return 0;
+  }
+  virtual TRulePtr GetIthRule(int) const {  // the index is ignored: there is at most one rule
+    return r;
+  }
+  virtual int Arity() const {
+    return arity;
+  }
+  virtual const RuleBin* GetRules() const {  // this object acts as its own RuleBin
+    if (!r) return NULL; else return this;
+  }
+  virtual const GrammarIter* Extend(int symbol) const {  // always succeeds: allocates a child whose rule is this rule plus `symbol`
+    return new NPGrammarIter(r, arity, symbol);
+  }
+  const unsigned char arity;  // number of nonterminals in r's source side
+  TRulePtr r;                 // null only for the root node
+};
+
+struct NPGrammar : public Grammar {  // grammar whose trie is generated lazily by NPGrammarIter
+  virtual const GrammarIter* GetRoot() const {
+    return new NPGrammarIter;  // a fresh root on every call; it registers itself in tofreelist
+  }
+};
+
+void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv, HieroLMModel* plm) {  // samples one derivation top-down and appends its edge ids to *sampled_deriv
+  HieroLMModel& lm = *plm;  // NOTE(review): lm is not used in this function
+  vector<prob_t> node_probs;
+  const prob_t total_prob = Inside<prob_t, EdgeProb>(hg, &node_probs);  // fills node_probs; total_prob itself is not used below
+  queue<unsigned> q;
+  q.push(hg.nodes_.size() - 3);  // NOTE(review): starts at the third-from-last node - presumably the X node covering the whole input; confirm
+  while(!q.empty()) {
+    unsigned cur_node_id = q.front();
+//    cerr << "NODE=" << cur_node_id << endl;
+    q.pop();
+    const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+    const unsigned num_in_edges = node.in_edges_.size();
+    unsigned sampled_edge = 0;
+    if (num_in_edges == 1) {
+      sampled_edge = node.in_edges_[0];  // a single incoming edge needs no sampling
+    } else {
+      //prob_t z;
+      assert(num_in_edges > 1);
+      SampleSet<prob_t> ss;
+      for (unsigned j = 0; j < num_in_edges; ++j) {
+        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+        prob_t p = edge.edge_prob_;  // edge weight times the inside score of each tail node
+        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+          p *= node_probs[edge.tail_nodes_[k]];
+        ss.add(p);
+//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
+        //z += p;
+      }
+//      for (unsigned j = 0; j < num_in_edges; ++j) {
+//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
+//      }
+//      cerr << " --- \n";
+      sampled_edge = node.in_edges_[rng->SelectSample(ss)];
+    }
+    sampled_deriv->push_back(sampled_edge);
+    const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
+    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
+      q.push(edge.tail_nodes_[j]);  // continue with the children of the chosen edge
+    }
+  }
+  for (unsigned i = 0; i < sampled_deriv->size(); ++i) {  // writes every sampled rule to stderr
+    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
+  }
+}
+
+void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {  // adds the rule of every edge id in d to the model
+  for (unsigned i = 0; i < d.size(); ++i)
+    plm->Increment(*hg.edges_[d[i]].rule_, rng);  // d holds indices into hg.edges_
+}
+
+void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {  // removes the rule of every edge id in d from the model
+  for (unsigned i = 0; i < d.size(); ++i)
+    plm->Decrement(*hg.edges_[d[i]].rule_, rng);  // d holds indices into hg.edges_
+}
+
+int main(int argc, char** argv) {  // sampler driver: repeatedly re-parses each sentence and resamples its derivation
+  po::variables_map conf;
+  vector<GrammarPtr> grammars;
+  grammars.push_back(GrammarPtr(new NPGrammar));
+
+  InitCommandLine(argc, argv, &conf);
+  const unsigned samples = conf["samples"].as<unsigned>();
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  cerr << "Reading corpus...\n";
+  ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+  HieroLMModel lm(vocabe.size());
+
+  plm = &lm;
+  ExhaustiveBottomUpParser parser("X", grammars);
+
+  Hypergraph hg;
+  const int kX = -TD::Convert("X");
+  const int kLP = FD::Convert("LogProb");
+  SparseVector<double> v; v.set_value(kLP, 1.0);  // NOTE(review): v is not used below
+  vector<vector<unsigned> > derivs(corpuse.size());  // current derivation (edge ids) per sentence; empty before the first sweep
+  for (int SS=0; SS < samples; ++SS) {  // one full sweep over the corpus per sample
+    for (int ci = 0; ci < corpuse.size(); ++ci) {
+      vector<int>& src = corpuse[ci];
+      Lattice lat(src.size());
+      for (unsigned i = 0; i < src.size(); ++i)
+        lat[i].push_back(LatticeArc(src[i], 0.0, 1));  // linear-chain lattice: one arc per word
+      cerr << TD::GetString(src) << endl;
+      hg.clear();
+      parser.Parse(lat, &hg);  // exhaustive parse
+      DecrementDerivation(hg, derivs[ci], &lm, &rng);  // NOTE(review): reuses edge ids from the previous sweep's parse, so it relies on identical edge numbering
+      for (unsigned i = 0; i < hg.edges_.size(); ++i) {
+        TRule& r = *hg.edges_[i].rule_;
+        if (r.lhs_ == kX)
+          hg.edges_[i].edge_prob_ = lm.Prob(r);  // only edges whose rule has LHS X are rescored
+      }
+      vector<unsigned> d;
+      SampleDerivation(hg, &rng, &d, &lm);
+      derivs[ci] = d;
+      IncrementDerivation(hg, derivs[ci], &lm, &rng);
+      if (tofreelist.size() > 100000) {  // release grammar iterators in batches
+        cerr << "Freeing ... ";
+        for (unsigned i = 0; i < tofreelist.size(); ++i)
+          delete tofreelist[i];
+        tofreelist.clear();
+        cerr << "Freed.\n";
+      }
+    }
+    cerr << "LLH=" << lm.Likelihood() << endl;
+  }
+  return 0;
+}
+
diff --git a/phrasinator/ccrp.h b/phrasinator/ccrp.h
deleted file mode 100644
index 9acf12ab..00000000
--- a/phrasinator/ccrp.h
+++ /dev/null
@@ -1,294 +0,0 @@
-#ifndef _CCRP_H_
-#define _CCRP_H_
-
-#include <numeric>
-#include <cassert>
-#include <cmath>
-#include <list>
-#include <iostream>
-#include <vector>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include "sampler.h"
-#include "slice_sampler.h"
-
-// Chinese restaurant process (Pitman-Yor parameters) with table tracking.
-
-template <typename Dish, typename DishHash = boost::hash<Dish> >
-class CCRP {
- public:
-  CCRP(double disc, double conc) :
-    num_tables_(),
-    num_customers_(),
-    discount_(disc),
-    concentration_(conc),
-    discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
-    discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
-
-  CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) :
-    num_tables_(),
-    num_customers_(),
-    discount_(d),
-    concentration_(c),
-    discount_prior_alpha_(d_alpha),
-    discount_prior_beta_(d_beta),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
-
-  double discount() const { return discount_; }
-  double concentration() const { return concentration_; }
-
-  bool has_discount_prior() const {
-    return !std::isnan(discount_prior_alpha_);
-  }
-
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
-  }
-
-  void clear() {
-    num_tables_ = 0;
-    num_customers_ = 0;
-    dish_locs_.clear();
-  }
-
-  unsigned num_tables() const {
-    return num_tables_;
-  }
-
-  unsigned num_tables(const Dish& dish) const {
-    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    if (it == dish_locs_.end()) return 0;
-    return it->second.table_counts_.size();
-  }
-
-  unsigned num_customers() const {
-    return num_customers_;
-  }
-
-  unsigned num_customers(const Dish& dish) const {
-    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    if (it == dish_locs_.end()) return 0;
-    return it->total_dish_count_;
-  }
-
-  // returns +1 or 0 indicating whether a new table was opened
-  int increment(const Dish& dish, const double& p0, MT19937* rng) {
-    DishLocations& loc = dish_locs_[dish];
-    bool share_table = false;
-    if (loc.total_dish_count_) {
-      const double p_empty = (concentration_ + num_tables_ * discount_) * p0;
-      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
-      share_table = rng->SelectSample(p_empty, p_share);
-    }
-    if (share_table) {
-      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
-      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
-           ti != loc.table_counts_.end(); ++ti) {
-        r -= (*ti - discount_);
-        if (r <= 0.0) {
-          ++(*ti);
-          break;
-        }
-      }
-      if (r > 0.0) {
-        std::cerr << "Serious error: r=" << r << std::endl;
-        Print(&std::cerr);
-        assert(r <= 0.0);
-      }
-    } else {
-      loc.table_counts_.push_back(1u);
-      ++num_tables_;
-    }
-    ++loc.total_dish_count_;
-    ++num_customers_;
-    return (share_table ? 0 : 1);
-  }
-
-  // returns -1 or 0, indicating whether a table was closed
-  int decrement(const Dish& dish, MT19937* rng) {
-    DishLocations& loc = dish_locs_[dish];
-    assert(loc.total_dish_count_);
-    if (loc.total_dish_count_ == 1) {
-      dish_locs_.erase(dish);
-      --num_tables_;
-      --num_customers_;
-      return -1;
-    } else {
-      int delta = 0;
-      // sample customer to remove UNIFORMLY. that is, do NOT use the discount
-      // here. if you do, it will introduce (unwanted) bias!
-      double r = rng->next() * loc.total_dish_count_;
-      --loc.total_dish_count_;
-      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
-           ti != loc.table_counts_.end(); ++ti) {
-        r -= *ti;
-        if (r <= 0.0) {
-          if ((--(*ti)) == 0) {
-            --num_tables_;
-            delta = -1;
-            loc.table_counts_.erase(ti);
-          }
-          break;
-        }
-      }
-      if (r > 0.0) {
-        std::cerr << "Serious error: r=" << r << std::endl;
-        Print(&std::cerr);
-        assert(r <= 0.0);
-      }
-      --num_customers_;
-      return delta;
-    }
-  }
-
-  double prob(const Dish& dish, const double& p0) const {
-    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * discount_ + concentration_;
-    if (it == dish_locs_.end()) {
-      return r * p0 / (num_customers_ + concentration_);
-    } else {
-      return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) /
-               (num_customers_ + concentration_);
-    }
-  }
-
-  double log_crp_prob() const {
-    return log_crp_prob(discount_, concentration_);
-  }
-
-  static double log_beta_density(const double& x, const double& alpha, const double& beta) {
-    assert(x > 0.0);
-    assert(x < 1.0);
-    assert(alpha > 0.0);
-    assert(beta > 0.0);
-    const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta);
-    return lp;
-  }
-
-  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
-    assert(x >= 0.0);
-    assert(shape > 0.0);
-    assert(rate > 0.0);
-    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
-    return lp;
-  }
-
-  // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
-  // does not include P_0's
-  double log_crp_prob(const double& discount, const double& concentration) const {
-    double lp = 0.0;
-    if (has_discount_prior())
-      lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
-    if (has_concentration_prior())
-      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
-    assert(lp <= 0.0);
-    if (num_customers_) {
-      if (discount > 0.0) {
-        const double r = lgamma(1.0 - discount);
-        lp += lgamma(concentration) - lgamma(concentration + num_customers_)
-             + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_)
-             - lgamma(concentration / discount);
-        assert(std::isfinite(lp));
-        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
-             it != dish_locs_.end(); ++it) {
-          const DishLocations& cur = it->second;
-          for (std::list<unsigned>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) {
-            lp += lgamma(*ti - discount) - r;
-          }
-        }
-      } else {
-        assert(!"not implemented yet");
-      }
-    }
-    assert(std::isfinite(lp));
-    return lp;
-  }
-
-  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_discount_prior() || has_concentration_prior());
-    DiscountResampler dr(*this);
-    ConcentrationResampler cr(*this);
-    for (int iter = 0; iter < nloop; ++iter) {
-      if (has_concentration_prior()) {
-        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
-                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      }
-      if (has_discount_prior()) {
-        discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(),
-                               1.0, 0.0, niterations, 100*niterations);
-      }
-    }
-    concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
-                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-  }
-
-  struct DiscountResampler {
-    DiscountResampler(const CCRP& crp) : crp_(crp) {}
-    const CCRP& crp_;
-    double operator()(const double& proposed_discount) const {
-      return crp_.log_crp_prob(proposed_discount, crp_.concentration_);
-    }
-  };
-
-  struct ConcentrationResampler {
-    ConcentrationResampler(const CCRP& crp) : crp_(crp) {}
-    const CCRP& crp_;
-    double operator()(const double& proposed_concentration) const {
-      return crp_.log_crp_prob(crp_.discount_, proposed_concentration);
-    }
-  };
-
-  struct DishLocations {
-    DishLocations() : total_dish_count_() {}
-    unsigned total_dish_count_;        // customers at all tables with this dish
-    std::list<unsigned> table_counts_; // list<> gives O(1) deletion and insertion, which we want
-                                       // .size() is the number of tables for this dish
-  };
-
-  void Print(std::ostream* out) const {
-    std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl;
-    for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
-         it != dish_locs_.end(); ++it) {
-      (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
-      for (typename std::list<unsigned>::const_iterator i = it->second.table_counts_.begin();
-           i != it->second.table_counts_.end(); ++i) {
-        (*out) << " " << *i;
-      }
-      (*out) << std::endl;
-    }
-  }
-
-  typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator;
-  const_iterator begin() const {
-    return dish_locs_.begin();
-  }
-  const_iterator end() const {
-    return dish_locs_.end();
-  }
-
-  unsigned num_tables_;
-  unsigned num_customers_;
-  std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
-
-  double discount_;
-  double concentration_;
-
-  // optional beta prior on discount_ (NaN if no prior)
-  double discount_prior_alpha_;
-  double discount_prior_beta_;
-
-  // optional gamma prior on concentration_ (NaN if no prior)
-  double concentration_prior_shape_;
-  double concentration_prior_rate_;
-};
-
-template <typename T,typename H>
-std::ostream& operator<<(std::ostream& o, const CCRP<T,H>& c) {
-  c.Print(&o);
-  return o;
-}
-
-#endif
diff --git a/utils/ccrp.h b/utils/ccrp.h
new file mode 100644
index 00000000..1a9e3ed5
--- /dev/null
+++ b/utils/ccrp.h
@@ -0,0 +1,340 @@
+#ifndef _CCRP_H_
+#define _CCRP_H_
+
+#include <numeric>
+#include <cassert>
+#include <cmath>
+#include <list>
+#include <iostream>
+#include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include "sampler.h"
+#include "slice_sampler.h"
+
+// Chinese restaurant process (Pitman-Yor parameters) with table tracking.
+
+template <typename Dish, typename DishHash = boost::hash<Dish> >
+class CCRP {
+ public:
+  CCRP(double disc, double conc) :
+    num_tables_(),
+    num_customers_(),
+    discount_(disc),
+    concentration_(conc),
+    discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
+    discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
+    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+
+  CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
+    num_tables_(),
+    num_customers_(),
+    discount_(d),
+    concentration_(c),
+    discount_prior_alpha_(d_alpha),
+    discount_prior_beta_(d_beta),
+    concentration_prior_shape_(c_shape),
+    concentration_prior_rate_(c_rate) {}
+
+  double discount() const { return discount_; }
+  double concentration() const { return concentration_; }
+
+  bool has_discount_prior() const {
+    return !std::isnan(discount_prior_alpha_);
+  }
+
+  bool has_concentration_prior() const {
+    return !std::isnan(concentration_prior_shape_);
+  }
+
+  void clear() {
+    num_tables_ = 0;
+    num_customers_ = 0;
+    dish_locs_.clear();
+  }
+
+  unsigned num_tables() const {
+    return num_tables_;
+  }
+
+  unsigned num_tables(const Dish& dish) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    return it->second.table_counts_.size();
+  }
+
+  unsigned num_customers() const {
+    return num_customers_;
+  }
+
+  unsigned num_customers(const Dish& dish) const {  // customers eating `dish` over all of its tables; 0 if the dish is unseen
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    return it->second.total_dish_count_;  // the iterator points at a (dish, DishLocations) pair; the count lives in ->second
+  }
+
+  // returns +1 or 0 indicating whether a new table was opened
+  int increment(const Dish& dish, const double& p0, MT19937* rng) {  // seat one new customer eating `dish`
+    DishLocations& loc = dish_locs_[dish];  // operator[] creates an empty entry for a dish not seen before
+    bool share_table = false;  // stays false for a dish with no customers, which therefore always opens a table
+    if (loc.total_dish_count_) {
+      const double p_empty = (concentration_ + num_tables_ * discount_) * p0;  // weight of opening a new table
+      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);  // weight of joining an existing table
+      share_table = rng->SelectSample(p_empty, p_share);  // a nonzero result means the second weight (p_share) was drawn
+    }
+    if (share_table) {
+      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= (*ti - discount_);  // each existing table is weighted by (its customer count - discount)
+        if (r <= 0.0) {
+          ++(*ti);
+          break;
+        }
+      }
+      if (r > 0.0) {  // walked past every table without picking one
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+    } else {
+      loc.table_counts_.push_back(1u);  // new table holding just this customer
+      ++num_tables_;
+    }
+    ++loc.total_dish_count_;
+    ++num_customers_;
+    return (share_table ? 0 : 1);
+  }
+
+  // returns +1 or 0 indicating whether a new table was opened
+  template <typename T>
+  int incrementT(const Dish& dish, const T& p0, MT19937* rng) {
+    DishLocations& loc = dish_locs_[dish];
+    bool share_table = false;
+    if (loc.total_dish_count_) {
+      const T p_empty = T(concentration_ + num_tables_ * discount_) * p0;
+      const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      share_table = rng->SelectSample(p_empty, p_share);
+    }
+    if (share_table) {
+      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= (*ti - discount_);
+        if (r <= 0.0) {
+          ++(*ti);
+          break;
+        }
+      }
+      if (r > 0.0) {
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+    } else {
+      loc.table_counts_.push_back(1u);
+      ++num_tables_;
+    }
+    ++loc.total_dish_count_;
+    ++num_customers_;
+    return (share_table ? 0 : 1);
+  }
+
+  // returns -1 or 0, indicating whether a table was closed
+  int decrement(const Dish& dish, MT19937* rng) {  // remove one customer eating `dish`
+    DishLocations& loc = dish_locs_[dish];  // NOTE(review): operator[] inserts an empty entry if `dish` is absent
+    assert(loc.total_dish_count_);  // only guard against removing a dish that has no customers
+    if (loc.total_dish_count_ == 1) {  // last customer of this dish: drop the whole entry
+      dish_locs_.erase(dish);
+      --num_tables_;
+      --num_customers_;
+      return -1;
+    } else {
+      int delta = 0;  // becomes -1 if the chosen customer's table empties
+      // sample customer to remove UNIFORMLY. that is, do NOT use the discount
+      // here. if you do, it will introduce (unwanted) bias!
+      double r = rng->next() * loc.total_dish_count_;  // drawn against the count before it is decremented
+      --loc.total_dish_count_;
+      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= *ti;  // tables are weighted by their customer count
+        if (r <= 0.0) {
+          if ((--(*ti)) == 0) {
+            --num_tables_;
+            delta = -1;
+            loc.table_counts_.erase(ti);  // ti is not used again: the loop breaks right after
+          }
+          break;
+        }
+      }
+      if (r > 0.0) {  // walked past every table without picking one
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+      --num_customers_;
+      return delta;
+    }
+  }
+
+  double prob(const Dish& dish, const double& p0) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    const double r = num_tables_ * discount_ + concentration_;
+    if (it == dish_locs_.end()) {
+      return r * p0 / (num_customers_ + concentration_);
+    } else {
+      return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) /
+               (num_customers_ + concentration_);
+    }
+  }
+
+  template <typename T>
+  T probT(const Dish& dish, const T& p0) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    const T r = T(num_tables_ * discount_ + concentration_);
+    if (it == dish_locs_.end()) {
+      return r * p0 / T(num_customers_ + concentration_);
+    } else {
+      return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) /
+               T(num_customers_ + concentration_);
+    }
+  }
+
+  double log_crp_prob() const {
+    return log_crp_prob(discount_, concentration_);
+  }
+
+  static double log_beta_density(const double& x, const double& alpha, const double& beta) {
+    assert(x > 0.0);
+    assert(x < 1.0);
+    assert(alpha > 0.0);
+    assert(beta > 0.0);
+    const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta);
+    return lp;
+  }
+
+  static double log_gamma_density(const double& x, const double& shape, const double& rate) {  // log of a Gamma pdf at x
+    assert(x >= 0.0);  // x == 0 passes this check, but log(x) below is then -infinity
+    assert(shape > 0.0);
+    assert(rate > 0.0);
+    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);  // NOTE(review): x/rate and -shape*log(rate) are the *scale* form of the density; `rate` acts as a scale here — confirm intent
+    return lp;
+  }
+
+  // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
+  // does not include P_0's
+  double log_crp_prob(const double& discount, const double& concentration) const {  // log prior density (if priors are set) + log prob of the current seating
+    double lp = 0.0;
+    if (has_discount_prior())
+      lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
+    if (has_concentration_prior())
+      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    // no sign check on lp here: the prior terms are log *densities*, which are > 0 wherever the pdf exceeds 1
+    if (num_customers_) {
+      if (discount > 0.0) {
+        const double r = lgamma(1.0 - discount);  // loop-invariant normalizer subtracted once per table below
+        lp += lgamma(concentration) - lgamma(concentration + num_customers_)
+             + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_)
+             - lgamma(concentration / discount);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          const DishLocations& cur = it->second;
+          for (std::list<unsigned>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) {
+            lp += lgamma(*ti - discount) - r;  // one term per table, driven by that table's customer count
+          }
+        }
+      } else {
+        assert(!"not implemented yet");  // discount <= 0 is unsupported; with NDEBUG only the prior terms are returned
+      }
+    }
+    assert(std::isfinite(lp));
+    return lp;
+  }
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {  // resample discount_/concentration_ in place
+    assert(has_discount_prior() || has_concentration_prior());  // at least one prior must have been supplied
+    DiscountResampler dr(*this);  // functor: log_crp_prob at a proposed discount, concentration held fixed
+    ConcentrationResampler cr(*this);  // functor: log_crp_prob at a proposed concentration, discount held fixed
+    for (int iter = 0; iter < nloop; ++iter) {  // alternate the two updates nloop times
+      if (has_concentration_prior()) {
+        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);  // presumably bounds (0, +inf) — confirm in slice_sampler.h
+      }
+      if (has_discount_prior()) {
+        discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(),
+                               1.0, 0.0, niterations, 100*niterations);  // presumably bounds (smallest positive double, 1) — confirm
+      }
+    }
+    concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,  // NOTE(review): this final pass runs even when no concentration prior is set
+                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);  // NOTE(review): std::numeric_limits needs <limits>, not included directly by this header
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const CCRP& crp) : crp_(crp) {}
+    const CCRP& crp_;
+    double operator()(const double& proposed_discount) const {
+      return crp_.log_crp_prob(proposed_discount, crp_.concentration_);
+    }
+  };
+
+  struct ConcentrationResampler {
+    ConcentrationResampler(const CCRP& crp) : crp_(crp) {}
+    const CCRP& crp_;
+    double operator()(const double& proposed_concentration) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_concentration);
+    }
+  };
+
+  struct DishLocations {
+    DishLocations() : total_dish_count_() {}
+    unsigned total_dish_count_;        // customers at all tables with this dish
+    std::list<unsigned> table_counts_; // list<> gives O(1) deletion and insertion, which we want
+                                       // .size() is the number of tables for this dish
+  };
+
+  void Print(std::ostream* out) const {  // dump hyperparameters, then one line per dish with its per-table counts, all to *out
+    (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl;  // header goes to the caller's stream, like the rest
+    for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+         it != dish_locs_.end(); ++it) {
+      (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
+      for (typename std::list<unsigned>::const_iterator i = it->second.table_counts_.begin();
+           i != it->second.table_counts_.end(); ++i) {
+        (*out) << " " << *i;  // customer count of one table
+      }
+      (*out) << std::endl;
+    }
+  }
+
+  typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator;
+  const_iterator begin() const {
+    return dish_locs_.begin();
+  }
+  const_iterator end() const {
+    return dish_locs_.end();
+  }
+
+  unsigned num_tables_;
+  unsigned num_customers_;
+  std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
+
+  double discount_;
+  double concentration_;
+
+  // optional beta prior on discount_ (NaN if no prior)
+  double discount_prior_alpha_;
+  double discount_prior_beta_;
+
+  // optional gamma prior on concentration_ (NaN if no prior)
+  double concentration_prior_shape_;
+  double concentration_prior_rate_;
+};
+
+template <typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const CCRP<T,H>& c) {
+  c.Print(&o);
+  return o;
+}
+
+#endif
diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h
index a868af9a..b63737d1 100644
--- a/utils/ccrp_onetable.h
+++ b/utils/ccrp_onetable.h
@@ -117,6 +117,18 @@ class CCRP_OneTable {
     }
   }
 
+  template <typename T>  // T: numeric type the probability is computed in
+  T probT(const Dish& dish, const T& p0) const {  // predictive probability of `dish`, with p0 scaling the new-table mass
+    const typename DishMapType::const_iterator it = dish_counts_.find(dish);
+    const T r(num_tables_ * discount_ + concentration_);  // total weight assigned to opening a new table
+    if (it == dish_counts_.end()) {  // unseen dish: only the new-table term contributes
+      return r * p0 / T(num_customers_ + concentration_);
+    } else {
+      return (T(it->second - discount_) + r * p0) /  // a single discount is subtracted from the dish's count
+               T(num_customers_ + concentration_);
+    }
+  }
+
   double log_crp_prob() const {
     return log_crp_prob(discount_, concentration_);
   }
diff --git a/utils/sampler.h b/utils/sampler.h
index 153e7ef1..22c873d4 100644
--- a/utils/sampler.h
+++ b/utils/sampler.h
@@ -48,7 +48,7 @@ struct RandomNumberGenerator {
   template <typename F>
   size_t SelectSample(const F& a, const F& b, double T = 1.0) {
     if (T == 1.0) {
-      if (this->next() > (a / (a + b))) return 1; else return 0;
+      if (F(this->next()) > (a / (a + b))) return 1; else return 0;
     } else {
       assert(!"not implemented");
     }
-- 
cgit v1.2.3


From dc2b2fc395ad496851f723c4da59181445c07047 Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Mon, 27 Feb 2012 02:19:34 +0000
Subject: generic bayesian cfg learner with a bunch of cfg grammar types

---
 .gitignore         |   1 +
 decoder/trule.cc   |  16 +--
 gi/pf/Makefile.am  |   4 +-
 gi/pf/hierolm.cc   | 309 -----------------------------------------
 gi/pf/learn_cfg.cc | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 398 insertions(+), 326 deletions(-)
 delete mode 100644 gi/pf/hierolm.cc
 create mode 100644 gi/pf/learn_cfg.cc

(limited to 'gi/pf')

diff --git a/.gitignore b/.gitignore
index 327f7261..28d5a60a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@ training/mpi_extract_reachable
 klm/lm/build_binary
 extools/extractor_monolingual
 gi/pf/.deps
+gi/pf/learn_cfg
 gi/pf/brat
 gi/pf/cbgi
 gi/pf/dpnaive
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 40235542..141b8faa 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -232,16 +232,6 @@ void TRule::ComputeArity() {
   arity_ = 1 - min;
 }
 
-static string AnonymousStrVar(int i) {
-  string res("[v]");
-  if(!(i <= 0 && i >= -8)) {
-    cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl;
-    abort();
-  }
-  res[1] = '1' - i;
-  return res;
-}
-
 string TRule::AsString(bool verbose) const {
   ostringstream os;
   int idx = 0;
@@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const {
     }
   }
   os << " ||| ";
-  if (idx > 9) {
-    cerr << "Too many non-terminals!\n partial: " << os.str() << endl;
-    exit(1);
-  }
   for (int i =0; i<e_.size(); ++i) {
     if (i) os << ' ';
     const WordID& w = e_[i];
     if (w < 1)
-      os << AnonymousStrVar(w);
+      os << '[' << (1-w) << ']';
     else
       os << TD::Convert(w);
   }
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index ed5b6fd3..0cf0bc63 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp hierolm
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
@@ -9,7 +9,7 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
 itg_SOURCES = itg.cc
 
-hierolm_SOURCES = hierolm.cc
+learn_cfg_SOURCES = learn_cfg.cc
 
 condnaive_SOURCES = condnaive.cc
 
diff --git a/gi/pf/hierolm.cc b/gi/pf/hierolm.cc
deleted file mode 100644
index afb12fef..00000000
--- a/gi/pf/hierolm.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "inside_outside.h"
-#include "hg.h"
-#include "bottom_up_parser.h"
-#include "fdict.h"
-#include "grammar.h"
-#include "m.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read parallel data from")
-        ("random_seed,S",po::value<uint32_t>(), "Random seed");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || (conf->count("input") == 0)) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-void ReadCorpus(const string& filename,
-                vector<vector<WordID> >* e,
-                set<WordID>* vocab_e) {
-  e->clear();
-  vocab_e->clear();
-  istream* in;
-  if (filename == "-")
-    in = &cin;
-  else
-    in = new ifstream(filename.c_str());
-  assert(*in);
-  string line;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
-    e->push_back(vector<int>());
-    vector<int>& le = e->back();
-    TD::ConvertSentence(line, &le);
-    for (unsigned i = 0; i < le.size(); ++i)
-      vocab_e->insert(le[i]);
-  }
-  if (in != &cin) delete in;
-}
-
-struct Grid {
-  // a b c d e
-  // 0 - 0 - -
-  vector<int> grid;
-};
-
-struct BaseRuleModel {
-  explicit BaseRuleModel(unsigned term_size,
-                         unsigned nonterm_size = 1) :
-      unif_term(1.0 / term_size),
-      unif_nonterm(1.0 / nonterm_size) {}
-  prob_t operator()(const TRule& r) const {
-    prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size()));
-    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));
-    const prob_t nonterm_prob(1.0 - term_prob.as_float());
-    for (unsigned i = 0; i < r.f_.size(); ++i) {
-      if (r.f_[i] <= 0) {     // nonterminal
-        p *= nonterm_prob;
-        p *= unif_nonterm;
-      } else {                // terminal
-        p *= term_prob;
-        p *= unif_term;
-      }
-    }
-    return p;
-  }
-  const prob_t unif_term, unif_nonterm;
-};
-
-struct HieroLMModel {
-  explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {}
-
-  prob_t Prob(const TRule& r) const {
-    return x.probT<prob_t>(r, p0(r));
-  }
-
-  int Increment(const TRule& r, MT19937* rng) {
-    return x.incrementT<prob_t>(r, p0(r), rng);
-    // return x.increment(r);
-  }
-
-  int Decrement(const TRule& r, MT19937* rng) {
-    return x.decrement(r, rng);
-    //return x.decrement(r);
-  }
-
-  prob_t Likelihood() const {
-    prob_t p;
-    p.logeq(x.log_crp_prob());
-    for (CCRP<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) {
-      prob_t tp = p0(it->first);
-      tp.poweq(it->second.table_counts_.size());
-      p *= tp;
-    }
-    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
-    //    p *= p0(it->first);
-    return p;
-  }
-
-  void ResampleHyperparameters(MT19937* rng) {
-    x.resample_hyperparameters(rng);
-    cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl;
-  }
-
-  const BaseRuleModel p0;
-  CCRP<TRule> x;
-  //CCRP_OneTable<TRule> x;
-};
-
-vector<GrammarIter* > tofreelist;
-
-HieroLMModel* plm;
-
-struct NPGrammarIter : public GrammarIter, public RuleBin {
-  NPGrammarIter() : arity() { tofreelist.push_back(this); }
-  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) {
-    if (inr) {
-      r.reset(new TRule(*inr));
-    } else {
-      static const int kLHS = -TD::Convert("X");
-      r.reset(new TRule);
-      r->lhs_ = kLHS;
-    }
-    TRule& rr = *r;
-    rr.f_.push_back(symbol);
-    rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);
-    tofreelist.push_back(this);
-  }
-  virtual int GetNumRules() const {
-    if (r) return 1; else return 0;
-  }
-  virtual TRulePtr GetIthRule(int) const {
-    return r;
-  }
-  virtual int Arity() const {
-    return arity;
-  }
-  virtual const RuleBin* GetRules() const {
-    if (!r) return NULL; else return this;
-  }
-  virtual const GrammarIter* Extend(int symbol) const {
-    return new NPGrammarIter(r, arity, symbol);
-  }
-  const unsigned char arity;
-  TRulePtr r;
-};
-
-struct NPGrammar : public Grammar {
-  virtual const GrammarIter* GetRoot() const {
-    return new NPGrammarIter;
-  }
-};
-
-void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv, HieroLMModel* plm) {
-  HieroLMModel& lm = *plm;
-  vector<prob_t> node_probs;
-  const prob_t total_prob = Inside<prob_t, EdgeProb>(hg, &node_probs);
-  queue<unsigned> q;
-  q.push(hg.nodes_.size() - 3);
-  while(!q.empty()) {
-    unsigned cur_node_id = q.front();
-//    cerr << "NODE=" << cur_node_id << endl;
-    q.pop();
-    const Hypergraph::Node& node = hg.nodes_[cur_node_id];
-    const unsigned num_in_edges = node.in_edges_.size();
-    unsigned sampled_edge = 0;
-    if (num_in_edges == 1) {
-      sampled_edge = node.in_edges_[0];
-    } else {
-      //prob_t z;
-      assert(num_in_edges > 1);
-      SampleSet<prob_t> ss;
-      for (unsigned j = 0; j < num_in_edges; ++j) {
-        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-        prob_t p = edge.edge_prob_;
-        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
-          p *= node_probs[edge.tail_nodes_[k]];
-        ss.add(p);
-//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
-        //z += p;
-      }
-//      for (unsigned j = 0; j < num_in_edges; ++j) {
-//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
-//      }
-//      cerr << " --- \n";
-      sampled_edge = node.in_edges_[rng->SelectSample(ss)];
-    }
-    sampled_deriv->push_back(sampled_edge);
-    const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
-    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
-      q.push(edge.tail_nodes_[j]);
-    }
-  }
-  for (unsigned i = 0; i < sampled_deriv->size(); ++i) {
-    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
-  }
-}
-
-void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
-  for (unsigned i = 0; i < d.size(); ++i)
-    plm->Increment(*hg.edges_[d[i]].rule_, rng);
-}
-
-void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
-  for (unsigned i = 0; i < d.size(); ++i)
-    plm->Decrement(*hg.edges_[d[i]].rule_, rng);
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  vector<GrammarPtr> grammars;
-  grammars.push_back(GrammarPtr(new NPGrammar));
-
-  InitCommandLine(argc, argv, &conf);
-  const unsigned samples = conf["samples"].as<unsigned>();
-
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-  MT19937& rng = *prng;
-
-  vector<vector<WordID> > corpuse;
-  set<WordID> vocabe;
-  cerr << "Reading corpus...\n";
-  ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
-  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
-  HieroLMModel lm(vocabe.size());
-
-  plm = &lm;
-  ExhaustiveBottomUpParser parser("X", grammars);
-
-  Hypergraph hg;
-  const int kX = -TD::Convert("X");
-  const int kLP = FD::Convert("LogProb");
-  SparseVector<double> v; v.set_value(kLP, 1.0);
-  vector<vector<unsigned> > derivs(corpuse.size());
-  for (int SS=0; SS < samples; ++SS) {
-    for (int ci = 0; ci < corpuse.size(); ++ci) {
-      vector<int>& src = corpuse[ci];
-      Lattice lat(src.size());
-      for (unsigned i = 0; i < src.size(); ++i)
-        lat[i].push_back(LatticeArc(src[i], 0.0, 1));
-      cerr << TD::GetString(src) << endl;
-      hg.clear();
-      parser.Parse(lat, &hg);  // exhaustive parse
-      DecrementDerivation(hg, derivs[ci], &lm, &rng);
-      for (unsigned i = 0; i < hg.edges_.size(); ++i) {
-        TRule& r = *hg.edges_[i].rule_;
-        if (r.lhs_ == kX)
-          hg.edges_[i].edge_prob_ = lm.Prob(r);
-      }
-      vector<unsigned> d;
-      SampleDerivation(hg, &rng, &d, &lm);
-      derivs[ci] = d;
-      IncrementDerivation(hg, derivs[ci], &lm, &rng);
-      if (tofreelist.size() > 100000) {
-        cerr << "Freeing ... ";
-        for (unsigned i = 0; i < tofreelist.size(); ++i)
-          delete tofreelist[i];
-        tofreelist.clear();
-        cerr << "Freed.\n";
-      }
-    }
-    cerr << "LLH=" << lm.Likelihood() << endl;
-  }
-  return 0;
-}
-
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
new file mode 100644
index 00000000..3d202816
--- /dev/null
+++ b/gi/pf/learn_cfg.cc
@@ -0,0 +1,394 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "inside_outside.h"
+#include "hg.h"
+#include "bottom_up_parser.h"
+#include "fdict.h"
+#include "grammar.h"
+#include "m.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+vector<int> nt_vocab;
+vector<int> nt_id_to_index;
+static unsigned kMAX_RULE_SIZE = 0;
+static unsigned kMAX_ARITY = 0;
+static bool kALLOW_MIXED = true;  // allow rules with mixed terminals and NTs
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)")
+        ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)")
+        ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS")
+        ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+unsigned ReadCorpus(const string& filename,  // "-" reads from standard input
+                    vector<vector<WordID> >* e,  // out: one vector of word ids per input line
+                    set<WordID>* vocab_e) {  // out: set of distinct word ids seen
+  e->clear();
+  vocab_e->clear();
+  istream* in;
+  if (filename == "-")
+    in = &cin;
+  else
+    in = new ifstream(filename.c_str());  // owned here; deleted before returning
+  assert(*in);  // the only check that the stream is usable (compiled out under NDEBUG)
+  string line;
+  unsigned toks = 0;  // running token count; this is the return value
+  while(*in) {
+    getline(*in, line);
+    if (line.empty() && !*in) break;  // failed read with nothing extracted: end of input
+    e->push_back(vector<int>());  // an empty line on a good stream still yields an (empty) sentence
+    vector<int>& le = e->back();
+    TD::ConvertSentence(line, &le);
+    for (unsigned i = 0; i < le.size(); ++i)
+      vocab_e->insert(le[i]);
+    toks += le.size();
+  }
+  if (in != &cin) delete in;
+  return toks;
+}
+
+struct Grid {
+  // a b c d e
+  // 0 - 0 - -
+  vector<int> grid;
+};
+
+struct BaseRuleModel {  // base distribution over rules, computed from the rule's source side (f_) only
+  explicit BaseRuleModel(unsigned term_size,
+                         unsigned nonterm_size = 1) :
+      unif_term(1.0 / term_size),  // uniform probability of one particular terminal
+      unif_nonterm(1.0 / nonterm_size) {}  // uniform probability of one particular nonterminal
+  prob_t operator()(const TRule& r) const {  // probability assigned to rule r
+    prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size()));  // length term — presumably log Poisson(mean 1.0) at the RHS length; confirm Md::log_poisson's argument order
+    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));  // per-symbol terminal probability; depends on the RHS length
+    const prob_t nonterm_prob(1.0 - term_prob.as_float());  // complement of term_prob
+    for (unsigned i = 0; i < r.f_.size(); ++i) {
+      if (r.f_[i] <= 0) {     // nonterminal
+        p *= nonterm_prob;
+        p *= unif_nonterm;
+      } else {                // terminal
+        p *= term_prob;
+        p *= unif_term;
+      }
+    }
+    return p;
+  }
+  const prob_t unif_term, unif_nonterm;
+};
+
+struct HieroLMModel {  // one CCRP over rules per nonterminal, all sharing the base model p0
+  explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP<TRule>(1,1,1,1)) {}  // four args: presumably CCRP's prior-taking constructor — TODO confirm
+
+  prob_t Prob(const TRule& r) const {  // probability of r under the CCRP selected by r's LHS
+    return nts[nt_id_to_index[-r.lhs_]].probT<prob_t>(r, p0(r));  // the global nt_id_to_index maps the negated LHS id to a slot in nts
+  }
+
+  int Increment(const TRule& r, MT19937* rng) {  // add one observation of r; forwards incrementT's return value
+    return nts[nt_id_to_index[-r.lhs_]].incrementT<prob_t>(r, p0(r), rng);
+    // return x.increment(r);
+  }
+
+  int Decrement(const TRule& r, MT19937* rng) {  // remove one observation of r; forwards decrement's return value
+    return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng);
+    //return x.decrement(r);
+  }
+
+  prob_t Likelihood() const {  // product over all CCRPs of the seating probability and the base probabilities
+    prob_t p = prob_t::One();
+    for (unsigned i = 0; i < nts.size(); ++i) {
+      prob_t q; q.logeq(nts[i].log_crp_prob());
+      p *= q;
+      for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) {
+        prob_t tp = p0(it->first);
+        tp.poweq(it->second.table_counts_.size());  // base probability counted once per table serving this rule
+        p *= tp;
+      }
+    }
+    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
+    //    p *= p0(it->first);
+    return p;
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {  // resample every CCRP's hyperparameters
+    for (unsigned i = 0; i < nts.size(); ++i)
+      nts[i].resample_hyperparameters(rng);
+    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl;  // NOTE(review): only the first CCRP's values are logged
+  }
+
+  const BaseRuleModel p0;
+  vector<CCRP<TRule> > nts;  // indexed via nt_id_to_index
+  //CCRP_OneTable<TRule> x;
+};
+
+vector<GrammarIter* > tofreelist;
+
+HieroLMModel* plm;
+
+struct NPGrammarIter : public GrammarIter, public RuleBin {  // grammar-trie node that is also its own rule bin; builds rules symbol by symbol
+  NPGrammarIter() : arity() { tofreelist.push_back(this); }  // root node: no rule yet; registers itself in the global tofreelist
+  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) {  // copy of inr (or an empty rule) extended by `symbol`
+    if (inr) {
+      r.reset(new TRule(*inr));
+    } else {
+      r.reset(new TRule);
+    }
+    TRule& rr = *r;
+    rr.lhs_ = nt_vocab[0];  // the stored rule carries the first nonterminal as LHS; see GetIthRule for the others
+    rr.f_.push_back(symbol);
+    rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);  // NOTE(review): tests symbol < 0 while NextArity/Extend test symbol <= 0 — confirm 0 never occurs
+    tofreelist.push_back(this);  // presumably freed in bulk by the driver — confirm against main()
+  }
+  inline static unsigned NextArity(int cur_a, int symbol) {  // arity after appending `symbol`
+    return cur_a + (symbol <= 0 ? 1 : 0);
+  }
+  virtual int GetNumRules() const {  // one variant of the rule per nonterminal in nt_vocab
+    if (r) return nt_vocab.size(); else return 0;
+  }
+  virtual TRulePtr GetIthRule(int i) const {  // i == 0: the stored rule; otherwise a fresh copy with LHS nt_vocab[i]
+    if (i == 0) return r;
+    TRulePtr nr(new TRule(*r));
+    nr->lhs_ = nt_vocab[i];
+    return nr;
+  }
+  virtual int Arity() const {
+    return arity;
+  }
+  virtual const RuleBin* GetRules() const {  // NULL at the root, where no rule has been built
+    if (!r) return NULL; else return this;
+  }
+  virtual const GrammarIter* Extend(int symbol) const {  // child node for `symbol`, or NULL if a limit forbids it
+    const int next_arity = NextArity(arity, symbol);
+    if (kMAX_ARITY && next_arity > kMAX_ARITY)  // kMAX_ARITY == 0 means no limit
+      return NULL;
+    if (!kALLOW_MIXED && r) {
+      bool t1 = r->f_.front() <= 0;  // is the rule's first symbol a nonterminal?
+      bool t2 = symbol <= 0;  // is the new symbol a nonterminal?
+      if (t1 != t2) return NULL;
+    }
+    if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE))  // kMAX_RULE_SIZE == 0 means no limit
+      return new NPGrammarIter(r, next_arity, symbol);
+    else
+      return NULL;
+  }
+  const unsigned char arity;  // value passed at construction (NextArity's result when built via Extend)
+  TRulePtr r;  // null only for the root node
+};
+
+struct NPGrammar : public Grammar {  // grammar whose trie is generated on demand by NPGrammarIter
+  virtual const GrammarIter* GetRoot() const {  // returns a new root node on every call
+    return new NPGrammarIter;  // not leaked: the NPGrammarIter constructor registers the node in tofreelist
+  }
+};
+
+prob_t TotalProb(const Hypergraph& hg) {  // thin wrapper around Inside with prob_t weights and EdgeProb as the edge weight
+  return Inside<prob_t, EdgeProb>(hg);  // presumably the inside score of the whole forest (Inside is defined elsewhere) -- confirm
+}
+
+void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) {  // appends sampled edge ids to *sampled_deriv (not cleared here)
+  vector<prob_t> node_probs;
+  Inside<prob_t, EdgeProb>(hg, &node_probs);  // fills node_probs, indexed by node id below; presumably per-node inside scores
+  queue<unsigned> q;
+  q.push(hg.nodes_.size() - 2);  // NOTE(review): starts at the second-to-last node, presumably the one below the goal; wraps if hg has < 2 nodes
+  while(!q.empty()) {
+    unsigned cur_node_id = q.front();
+//    cerr << "NODE=" << cur_node_id << endl;
+    q.pop();
+    const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+    const unsigned num_in_edges = node.in_edges_.size();
+    unsigned sampled_edge = 0;
+    if (num_in_edges == 1) {  // only one way to build this node: no random draw is made
+      sampled_edge = node.in_edges_[0];
+    } else {
+      //prob_t z;
+      assert(num_in_edges > 1);  // a node with no incoming edge is treated as a programming error
+      SampleSet<prob_t> ss;
+      for (unsigned j = 0; j < num_in_edges; ++j) {
+        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+        prob_t p = edge.edge_prob_;
+        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+          p *= node_probs[edge.tail_nodes_[k]];  // edge weight = edge_prob_ times node_probs of each tail node
+        ss.add(p);
+//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
+        //z += p;
+      }
+//      for (unsigned j = 0; j < num_in_edges; ++j) {
+//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
+//      }
+//      cerr << " --- \n";
+      sampled_edge = node.in_edges_[rng->SelectSample(ss)];  // SelectSample's result is used as an index into in_edges_
+    }
+    sampled_deriv->push_back(sampled_edge);
+    const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
+    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {  // queue the children of the chosen edge
+      q.push(edge.tail_nodes_[j]);
+    }
+  }
+  for (unsigned i = 0; i < sampled_deriv->size(); ++i) {  // prints the rule of every edge in *sampled_deriv to cerr on each call
+    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
+  }
+}
+
+void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
+  for (vector<unsigned>::const_iterator edge_id = d.begin(); edge_id != d.end(); ++edge_id)  // d lists edge ids of hg, visited in order
+    plm->Increment(*hg.edges_[*edge_id].rule_, rng);  // add that edge's rule to the model (plm here is the parameter, not the global)
+}
+
+void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
+  for (vector<unsigned>::const_iterator edge_id = d.begin(); edge_id != d.end(); ++edge_id)  // d lists edge ids of hg, visited in order
+    plm->Decrement(*hg.edges_[*edge_id].rule_, rng);  // remove that edge's rule from the model (plm here is the parameter, not the global)
+}
+
+int main(int argc, char** argv) {  // driver: per sweep, re-parse each sentence, resample its derivation, then report likelihood
+  po::variables_map conf;
+
+  InitCommandLine(argc, argv, &conf);
+  nt_vocab.resize(conf["nonterminals"].as<unsigned>());
+  assert(nt_vocab.size() > 0);
+  assert(nt_vocab.size() < 26);  // nonterminal names below are single letters starting at 'A'
+  {
+    string nt = "X";
+    for (unsigned i = 0; i < nt_vocab.size(); ++i) {
+      if (nt_vocab.size() > 1) nt[0] = ('A' + i);  // a single nonterminal keeps the name "X"; otherwise A, B, C, ...
+      int pid = TD::Convert(nt);
+      nt_vocab[i] = -pid;  // nonterminals are stored as negated dictionary ids
+      if (pid >= nt_id_to_index.size()) {
+        nt_id_to_index.resize(pid + 1, -1);  // new slots are filled with -1
+      }
+      nt_id_to_index[pid] = i;  // maps the positive id back to its position in nt_vocab
+    }
+  }
+  vector<GrammarPtr> grammars;
+  grammars.push_back(GrammarPtr(new NPGrammar));
+
+  const unsigned samples = conf["samples"].as<unsigned>();
+  kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>();
+  if (kMAX_RULE_SIZE == 1) {
+    cerr << "Invalid maximum rule size: must be 0 or >1\n";
+    return 1;
+  }
+  kMAX_ARITY = conf["max_arity"].as<unsigned>();
+  if (kMAX_ARITY == 1) {
+    cerr << "Invalid maximum arity: must be 0 or >1\n";
+    return 1;
+  }
+  kALLOW_MIXED = !conf.count("no_mixed_rules");
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  cerr << "Reading corpus...\n";
+  const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);  // toks is the divisor for the per-token figures below
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+  HieroLMModel lm(vocabe.size(), nt_vocab.size());
+
+  plm = &lm;  // publish the model through the file-level pointer
+  ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars);
+
+  Hypergraph hg;
+  const int kGoal = -TD::Convert("Goal");
+  const int kLP = FD::Convert("LogProb");
+  SparseVector<double> v; v.set_value(kLP, 1.0);
+  vector<vector<unsigned> > derivs(corpuse.size());  // current derivation (edge ids) of each sentence
+  vector<Lattice> cl(corpuse.size());
+  for (int ci = 0; ci < corpuse.size(); ++ci) {  // turn every sentence into a lattice with one arc per word
+    vector<int>& src = corpuse[ci];
+    Lattice& lat = cl[ci];
+    lat.resize(src.size());
+    for (unsigned i = 0; i < src.size(); ++i)
+      lat[i].push_back(LatticeArc(src[i], 0.0, 1));
+  }
+  for (int SS=0; SS < samples; ++SS) {
+    const bool is_last = ((samples - 1) == SS);  // the final sweep only scores; it does not resample
+    prob_t dlh = prob_t::One();
+    for (int ci = 0; ci < corpuse.size(); ++ci) {
+      const vector<int>& src = corpuse[ci];
+      const Lattice& lat = cl[ci];
+      cerr << TD::GetString(src) << endl;
+      hg.clear();
+      parser.Parse(lat, &hg);  // exhaustive parse
+      vector<unsigned>& d = derivs[ci];
+      if (!is_last) DecrementDerivation(hg, d, &lm, &rng);  // take the old derivation out of the model (d is empty on the first sweep)
+      for (unsigned i = 0; i < hg.edges_.size(); ++i) {  // rescore every edge under the current model
+        TRule& r = *hg.edges_[i].rule_;
+        if (r.lhs_ == kGoal)
+          hg.edges_[i].edge_prob_ = prob_t::One();  // goal edges carry no model cost
+        else
+          hg.edges_[i].edge_prob_ = lm.Prob(r);
+      }
+      if (!is_last) {
+        d.clear();
+        SampleDerivation(hg, &rng, &d);
+        IncrementDerivation(hg, derivs[ci], &lm, &rng);  // derivs[ci] is the same vector as d
+      } else {
+        prob_t p = TotalProb(hg);
+        dlh *= p;
+        cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl;
+      }
+      if (tofreelist.size() > 200000) {  // grammar nodes are freed in bulk once more than 200000 have accumulated
+        cerr << "Freeing ... ";
+        for (unsigned i = 0; i < tofreelist.size(); ++i)
+          delete tofreelist[i];
+        tofreelist.clear();
+        cerr << "Freed.\n";
+      }
+    }
+    double llh = log(lm.Likelihood());
+    cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl;  // entropy in bits per token
+    if (SS % 10 == 9) lm.ResampleHyperparameters(&rng);  // every tenth sweep
+    if (is_last) {
+      double z = log(dlh);
+      cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl;
+    }
+  }
+  for (unsigned i = 0; i < nt_vocab.size(); ++i)  // dump each nonterminal's CRP to cerr
+    cerr << lm.nts[i] << endl;
+  return 0;
+}
+
-- 
cgit v1.2.3


From c9fecc7613c075dc2e998479a9d39a538807e609 Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Mon, 27 Feb 2012 02:40:00 +0000
Subject: fix base distribution, partially

---
 gi/pf/learn_cfg.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
index 3d202816..6e574035 100644
--- a/gi/pf/learn_cfg.cc
+++ b/gi/pf/learn_cfg.cc
@@ -106,10 +106,10 @@ struct BaseRuleModel {
     const prob_t nonterm_prob(1.0 - term_prob.as_float());
     for (unsigned i = 0; i < r.f_.size(); ++i) {
       if (r.f_[i] <= 0) {     // nonterminal
-        p *= nonterm_prob;
+        if (kALLOW_MIXED) p *= nonterm_prob;
         p *= unif_nonterm;
       } else {                // terminal
-        p *= term_prob;
+        if (kALLOW_MIXED) p *= term_prob;
         p *= unif_term;
       }
     }
-- 
cgit v1.2.3


From 5c63dae2edca73b2fa1c668d708b8b0c3ff1f7dc Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 28 Feb 2012 00:47:20 -0500
Subject: optional hierarchical prior

---
 gi/pf/learn_cfg.cc | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
index 6e574035..b2ca029a 100644
--- a/gi/pf/learn_cfg.cc
+++ b/gi/pf/learn_cfg.cc
@@ -30,6 +30,7 @@ vector<int> nt_id_to_index;
 static unsigned kMAX_RULE_SIZE = 0;
 static unsigned kMAX_ARITY = 0;
 static bool kALLOW_MIXED = true;  // allow rules with mixed terminals and NTs
+static bool kHIERARCHICAL_PRIOR = false;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
@@ -40,11 +41,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)")
         ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS")
         ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary")
+        ("hierarchical_prior,h", "Use hierarchical prior")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
         ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
+        ("help", "Print this help message and exit");
   po::options_description dconfig_options, dcmdline_options;
   dconfig_options.add(opts);
   dcmdline_options.add(opts).add(clo);
@@ -119,19 +121,35 @@ struct BaseRuleModel {
 };
 
 struct HieroLMModel {
-  explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP<TRule>(1,1,1,1)) {}
+  explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) :
+      base(vocab_size, num_nts),
+      q0(1,1,1,1),
+      nts(num_nts, CCRP<TRule>(1,1,1,1)) {}
 
   prob_t Prob(const TRule& r) const {
     return nts[nt_id_to_index[-r.lhs_]].probT<prob_t>(r, p0(r));
   }
 
+  inline prob_t p0(const TRule& r) const {
+    if (kHIERARCHICAL_PRIOR)
+      return q0.probT<prob_t>(r, base(r));
+    else
+      return base(r);
+  }
+
   int Increment(const TRule& r, MT19937* rng) {
-    return nts[nt_id_to_index[-r.lhs_]].incrementT<prob_t>(r, p0(r), rng);
+    const int delta = nts[nt_id_to_index[-r.lhs_]].incrementT<prob_t>(r, p0(r), rng);
+    if (kHIERARCHICAL_PRIOR && delta)
+      q0.incrementT<prob_t>(r, base(r), rng);
+    return delta;
     // return x.increment(r);
   }
 
   int Decrement(const TRule& r, MT19937* rng) {
-    return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng);
+    const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng);
+    if (kHIERARCHICAL_PRIOR && delta)
+      q0.decrement(r, rng);
+    return delta;
     //return x.decrement(r);
   }
 
@@ -146,18 +164,32 @@ struct HieroLMModel {
         p *= tp;
       }
     }
+    if (kHIERARCHICAL_PRIOR) {
+      prob_t q; q.logeq(q0.log_crp_prob());
+      p *= q;
+      for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) {
+        prob_t tp = base(it->first);
+        tp.poweq(it->second.table_counts_.size());
+        p *= tp;
+      }
+    }
     //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
-    //    p *= p0(it->first);
+    //    p *= base(it->first);
     return p;
   }
 
   void ResampleHyperparameters(MT19937* rng) {
     for (unsigned i = 0; i < nts.size(); ++i)
       nts[i].resample_hyperparameters(rng);
+    if (kHIERARCHICAL_PRIOR) {
+      q0.resample_hyperparameters(rng);
+      cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]";
+    }
     cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl;
   }
 
-  const BaseRuleModel p0;
+  const BaseRuleModel base;
+  CCRP<TRule> q0;
   vector<CCRP<TRule> > nts;
   //CCRP_OneTable<TRule> x;
 };
@@ -316,6 +348,8 @@ int main(int argc, char** argv) {
   }
   kALLOW_MIXED = !conf.count("no_mixed_rules");
 
+  kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior");
+
   if (conf.count("random_seed"))
     prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
   else
-- 
cgit v1.2.3


From b3c0b5e4a05019045e6a81209741b60e0f20b073 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 3 Mar 2012 03:24:53 -0500
Subject: PYP language model (Teh 2006)

---
 decoder/fst_translator.cc |   5 +-
 gi/pf/Makefile.am         |   4 +-
 gi/pf/pyp_lm.cc           | 150 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 gi/pf/pyp_lm.cc

(limited to 'gi/pf')

diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc
index 38dbd717..074de4c9 100644
--- a/decoder/fst_translator.cc
+++ b/decoder/fst_translator.cc
@@ -30,7 +30,10 @@ struct FSTTranslatorImpl {
     if (input.find("{\"rules\"") == 0) {
       istringstream is(input);
       Hypergraph src_cfg_hg;
-      assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg));
+      if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) {
+        cerr << "Failed to read HG from JSON.\n";
+        abort();
+      }
       if (add_pass_through_rules) {
         SparseVector<double> feats;
         feats.set_value(FD::Convert("PassThrough"), 1);
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 0cf0bc63..7cf9c14d 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
@@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
 itg_SOURCES = itg.cc
 
+pyp_lm_SOURCES = pyp_lm.cc
+
 learn_cfg_SOURCES = learn_cfg.cc
 
 condnaive_SOURCES = condnaive.cc
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
new file mode 100644
index 00000000..2837e33c
--- /dev/null
+++ b/gi/pf/pyp_lm.cc
@@ -0,0 +1,150 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "corpus_tools.h"
+#include "m.h"
+#include "tdict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parse options into *conf; prints usage and exits with status 1 on --help or missing --input
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);  // a config file may set only the "Configuration options" group
+  dcmdline_options.add(opts).add(clo);  // the command line accepts both groups
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // stored first; Boost keeps the first stored value, so the command line wins
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): a failed open is not checked
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+template <unsigned N> struct PYPLM;
+
+// uniform base distribution
+template<> struct PYPLM<0> {  // order-0 specialization that terminates the PYPLM<N> -> PYPLM<N-1> backoff chain
+  PYPLM(unsigned vs) : p0(1.0 / vs) {}  // main passes the vocabulary size as vs, so every word gets probability 1/vs
+  void increment(WordID w, const vector<WordID>& context, MT19937* rng) const {}  // no-op: nothing is counted at this level
+  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) const {}  // no-op
+  double prob(WordID w, const vector<WordID>& context) const { return p0; }  // ignores both w and context
+  const double p0;  // the constant probability returned by prob()
+};
+
+// represents an N-gram LM
+template <unsigned N> struct PYPLM {  // one CCRP<WordID> per (N-1)-word context, backing off to PYPLM<N-1>
+  PYPLM(unsigned vs) : backoff(vs) {}  // vs is only forwarded down the backoff chain
+  void increment(WordID w, const vector<WordID>& context, MT19937* rng) {  // add one observation of w after context
+    const double bo = backoff.prob(w, context);  // lower-order probability, used as the base probability of this level's CRP
+    static vector<WordID> lookup(N-1);  // function-local static: one key buffer shared by all calls of this instantiation
+    for (unsigned i = 0; i < N-1; ++i)
+      lookup[i] = context[context.size() - 1 - i];  // key = last N-1 context words, most recent first; needs context.size() >= N-1
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
+    if (it == p.end())
+      it = p.insert(make_pair(lookup, CCRP<WordID>(1,1,1,1))).first;  // first time this context is seen: create its CRP
+    if (it->second.increment(w, bo, rng))  // nonzero result presumably means a new table was opened (see ccrp.h) -- confirm
+      backoff.increment(w, context, rng);  // in that case the observation is also passed to the lower order
+  }
+  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) {  // undo one increment(w, context)
+    static vector<WordID> lookup(N-1);
+    for (unsigned i = 0; i < N-1; ++i)
+      lookup[i] = context[context.size() - 1 - i];  // same key construction as increment
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
+    assert(it != p.end());  // the context must have been incremented before
+    if (it->second.decrement(w, rng))  // nonzero result presumably means a table was closed (see ccrp.h) -- confirm
+      backoff.decrement(w, context, rng);
+  }
+  double prob(WordID w, const vector<WordID>& context) const {  // probability of w after context under this level
+    const double bo = backoff.prob(w, context);
+    static vector<WordID> lookup(N-1);
+    for (unsigned i = 0; i < N-1; ++i)
+      lookup[i] = context[context.size() - 1 - i];  // same key construction as increment
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup);
+    if (it == p.end()) return bo;  // unseen context: fall back to the lower-order probability unchanged
+    return it->second.prob(w, bo);
+  }
+  PYPLM<N-1> backoff;  // next lower order; PYPLM<0> ends the recursion
+  unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;  // reversed (N-1)-word context -> its CRP
+};
+
+int main(int argc, char** argv) {  // train a PYP n-gram LM by Gibbs sampling on the first 99% of sentences, report perplexity on the rest
+  po::variables_map conf;
+
+  InitCommandLine(argc, argv, &conf);
+  const unsigned samples = conf["samples"].as<unsigned>();
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  const WordID kEOS = TD::Convert("</s>");
+  cerr << "Reading corpus...\n";
+  CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe);
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+#define kORDER 5
+  PYPLM<kORDER> lm(vocabe.size());
+  vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));  // every sentence starts from kORDER-1 copies of <s>
+  int mci = corpuse.size() * 99 / 100;  // sentences [0, mci) are training data, [mci, size) are held out
+  for (int SS=0; SS < samples; ++SS) {
+    for (int ci = 0; ci < mci; ++ci) {
+      ctx.resize(kORDER - 1);  // drop the previous sentence's words, keeping only the <s> padding
+      const vector<WordID>& s = corpuse[ci];
+      for (int i = 0; i <= s.size(); ++i) {
+        WordID w = (i < s.size() ? s[i] : kEOS);  // the extra i == s.size() pass emits </s>
+        if (SS > 0) lm.decrement(w, ctx, &rng);   // after the first sweep, remove the old seating before re-adding
+        lm.increment(w, ctx, &rng);
+        ctx.push_back(w);
+      }
+      // </s> was already counted by the i == s.size() pass above, so it is not incremented again here;
+      // doing so added a spurious </s>-after-</s> event per sentence that the held-out scoring never uses.
+    }
+  }
+  double llh = 0;    // accumulates -log2 p(w | ctx) over held-out tokens
+  unsigned cnt = 0;  // number of held-out tokens, including one </s> per sentence
+  for (int ci = mci; ci < corpuse.size(); ++ci) {
+    ctx.resize(kORDER - 1);
+    const vector<WordID>& s = corpuse[ci];
+    for (int i = 0; i <= s.size(); ++i) {
+      WordID w = (i < s.size() ? s[i] : kEOS);
+      double lp = log(lm.prob(w, ctx)) / log(2);  // log2 probability of this token
+      cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl;
+      ctx.push_back(w);
+      llh -= lp;
+      cnt++;
+    }
+  }
+  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl;  // llh is a negative log2 sum, so negate to print a log probability
+  cerr << "        Count: " << (cnt) << endl;
+  cerr << "Cross-entropy: " << (llh / cnt) << endl;  // bits per token; NOTE(review): cnt is 0 when no sentence is held out
+  cerr << "   Perplexity: " << pow(2, llh / cnt) << endl;
+  return 0;
+}
+
-- 
cgit v1.2.3


From 8f6006cabee490a956940765c30cdd720d2e9161 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 3 Mar 2012 17:16:58 -0500
Subject: pyp lm, fixed hyperparameters inference

---
 gi/pf/align-lexonly-pyp.cc     |  2 +-
 gi/pf/align-lexonly.cc         |  2 +-
 gi/pf/brat.cc                  |  2 +-
 gi/pf/conditional_pseg.h       |  4 +-
 gi/pf/learn_cfg.cc             |  4 +-
 gi/pf/pfbrat.cc                |  2 +-
 gi/pf/pyp_lm.cc                | 70 ++++++++++++++++++++++++++++---
 phrasinator/gibbs_train_plm.cc |  2 +-
 utils/ccrp.h                   | 95 ++++++++++++++++++------------------------
 utils/ccrp_nt.h                | 52 +++++++++++------------
 utils/ccrp_onetable.h          | 70 +++++++++++++++----------------
 utils/mfcr.h                   | 58 +++++++++++++-------------
 12 files changed, 203 insertions(+), 160 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index e24cb457..4ce7cf62 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -104,7 +104,7 @@ struct HierarchicalWordBase {
   }
 
   void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl;
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl;
     for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
   }
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
index 8c1d689f..dbc9dc07 100644
--- a/gi/pf/align-lexonly.cc
+++ b/gi/pf/align-lexonly.cc
@@ -105,7 +105,7 @@ struct HierarchicalWordBase {
   }
 
   void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (\\alpha=" << r.concentration() << ')' << endl;
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (\\alpha=" << r.alpha() << ')' << endl;
     for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
   }
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
index 7b60ef23..c2c52760 100644
--- a/gi/pf/brat.cc
+++ b/gi/pf/brat.cc
@@ -191,7 +191,7 @@ struct UniphraseLM {
   void ResampleHyperparameters(MT19937* rng) {
     phrases_.resample_hyperparameters(rng);
     gen_.resample_hyperparameters(rng);
-    cerr << " " << phrases_.concentration();
+    cerr << " " << phrases_.alpha();
   }
 
   CCRP_NoTable<vector<int> > phrases_;
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index 2e9e38fc..f9841cbf 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -22,7 +22,7 @@ struct MConditionalTranslationModel {
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
       for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
         std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
     }
@@ -95,7 +95,7 @@ struct ConditionalTranslationModel {
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.concentration() << ") --------------------------" << std::endl;
+      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
       for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
         std::cerr << "   " << i2->second << '\t' << i2->first << std::endl;
     }
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
index b2ca029a..5b748311 100644
--- a/gi/pf/learn_cfg.cc
+++ b/gi/pf/learn_cfg.cc
@@ -183,9 +183,9 @@ struct HieroLMModel {
       nts[i].resample_hyperparameters(rng);
     if (kHIERARCHICAL_PRIOR) {
       q0.resample_hyperparameters(rng);
-      cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]";
+      cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]";
     }
-    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl;
+    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl;
   }
 
   const BaseRuleModel base;
diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc
index 7b60ef23..c2c52760 100644
--- a/gi/pf/pfbrat.cc
+++ b/gi/pf/pfbrat.cc
@@ -191,7 +191,7 @@ struct UniphraseLM {
   void ResampleHyperparameters(MT19937* rng) {
     phrases_.resample_hyperparameters(rng);
     gen_.resample_hyperparameters(rng);
-    cerr << " " << phrases_.concentration();
+    cerr << " " << phrases_.alpha();
   }
 
   CCRP_NoTable<vector<int> > phrases_;
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 2837e33c..0d85536c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -50,16 +50,19 @@ template <unsigned N> struct PYPLM;
 
 // uniform base distribution
 template<> struct PYPLM<0> {
-  PYPLM(unsigned vs) : p0(1.0 / vs) {}
-  void increment(WordID w, const vector<WordID>& context, MT19937* rng) const {}
-  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) const {}
+  PYPLM(unsigned vs) : p0(1.0 / vs), draws() {}
+  void increment(WordID w, const vector<WordID>& context, MT19937* rng) { ++draws; }
+  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { --draws; assert(draws >= 0); }
   double prob(WordID w, const vector<WordID>& context) const { return p0; }
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {}
+  double log_likelihood() const { return draws * log(p0); }
   const double p0;
+  int draws;
 };
 
 // represents an N-gram LM
 template <unsigned N> struct PYPLM {
-  PYPLM(unsigned vs) : backoff(vs) {}
+  PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
     static vector<WordID> lookup(N-1);
@@ -67,7 +70,7 @@ template <unsigned N> struct PYPLM {
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
     if (it == p.end())
-      it = p.insert(make_pair(lookup, CCRP<WordID>(1,1,1,1))).first;
+      it = p.insert(make_pair(lookup, CCRP<WordID>(d,alpha))).first;
     if (it->second.increment(w, bo, rng))
       backoff.increment(w, context, rng);
   }
@@ -89,7 +92,58 @@ template <unsigned N> struct PYPLM {
     if (it == p.end()) return bo;
     return it->second.prob(w, bo);
   }
+
+  double log_likelihood() const {
+    return log_likelihood(d, alpha) + backoff.log_likelihood();
+  }
+
+  double log_likelihood(const double& dd, const double& aa) const {
+    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+    double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1);
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
+    for (it = p.begin(); it != p.end(); ++it)
+      llh += it->second.log_crp_prob(dd, aa);
+    return llh;
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const PYPLM& m) : m_(m) {}
+    const PYPLM& m_;
+    double operator()(const double& proposed_discount) const {
+      return m_.log_likelihood(proposed_discount, m_.alpha);
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const PYPLM& m) : m_(m) {}
+    const PYPLM& m_;
+    double operator()(const double& proposed_alpha) const {
+      return m_.log_likelihood(m_.d, proposed_alpha);
+    }
+  };
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    DiscountResampler dr(*this);
+    AlphaResampler ar(*this);
+    for (int iter = 0; iter < nloop; ++iter) {
+      alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(),
+                          1.0, 0.0, niterations, 100*niterations);
+    }
+    alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
+    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl;
+    for (it = p.begin(); it != p.end(); ++it) {
+      it->second.set_discount(d);
+      it->second.set_alpha(alpha);
+    }
+    backoff.resample_hyperparameters(rng, nloop, niterations);
+  }
+
   PYPLM<N-1> backoff;
+  double d, alpha;
   unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
 };
 
@@ -109,7 +163,7 @@ int main(int argc, char** argv) {
   cerr << "Reading corpus...\n";
   CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe);
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
-#define kORDER 5
+#define kORDER 3
   PYPLM<kORDER> lm(vocabe.size());
   vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));
   int mci = corpuse.size() * 99 / 100;
@@ -126,6 +180,10 @@ int main(int argc, char** argv) {
       if (SS > 0) lm.decrement(kEOS, ctx, &rng);
       lm.increment(kEOS, ctx, &rng);
     }
+    if (SS % 10 == 9) {
+      cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
+      if (SS % 20 == 19) lm.resample_hyperparameters(&rng);
+    } else { cerr << '.' << flush; }
   }
   double llh = 0;
   unsigned cnt = 0;
diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc
index 66b46011..54861dcb 100644
--- a/phrasinator/gibbs_train_plm.cc
+++ b/phrasinator/gibbs_train_plm.cc
@@ -252,7 +252,7 @@ struct UniphraseLM {
   void ResampleHyperparameters(MT19937* rng) {
     phrases_.resample_hyperparameters(rng);
     gen_.resample_hyperparameters(rng);
-    cerr << " d=" << phrases_.discount() << ",c=" << phrases_.concentration();
+    cerr << " d=" << phrases_.discount() << ",a=" << phrases_.alpha();
   }
 
   CCRP<vector<int> > phrases_;
diff --git a/utils/ccrp.h b/utils/ccrp.h
index 1a9e3ed5..d9a38089 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -17,35 +17,37 @@
 template <typename Dish, typename DishHash = boost::hash<Dish> >
 class CCRP {
  public:
-  CCRP(double disc, double conc) :
+  CCRP(double disc, double alpha) :
     num_tables_(),
     num_customers_(),
     discount_(disc),
-    concentration_(conc),
+    alpha_(alpha),
     discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
     discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
   CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
     num_tables_(),
     num_customers_(),
     discount_(d),
-    concentration_(c),
+    alpha_(c),
     discount_prior_alpha_(d_alpha),
     discount_prior_beta_(d_beta),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
+    alpha_prior_shape_(c_shape),
+    alpha_prior_rate_(c_rate) {}
 
   double discount() const { return discount_; }
-  double concentration() const { return concentration_; }
+  double alpha() const { return alpha_; }
+  void set_discount(double d) { discount_ = d; }
+  void set_alpha(double a) { alpha_ = a; }
 
   bool has_discount_prior() const {
     return !std::isnan(discount_prior_alpha_);
   }
 
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
+  bool has_alpha_prior() const {
+    return !std::isnan(alpha_prior_shape_);
   }
 
   void clear() {
@@ -79,7 +81,7 @@ class CCRP {
     DishLocations& loc = dish_locs_[dish];
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const double p_empty = (concentration_ + num_tables_ * discount_) * p0;
+      const double p_empty = (alpha_ + num_tables_ * discount_) * p0;
       const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
@@ -113,7 +115,7 @@ class CCRP {
     DishLocations& loc = dish_locs_[dish];
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const T p_empty = T(concentration_ + num_tables_ * discount_) * p0;
+      const T p_empty = T(alpha_ + num_tables_ * discount_) * p0;
       const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
@@ -180,63 +182,46 @@ class CCRP {
 
   double prob(const Dish& dish, const double& p0) const {
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * discount_ + concentration_;
+    const double r = num_tables_ * discount_ + alpha_;
     if (it == dish_locs_.end()) {
-      return r * p0 / (num_customers_ + concentration_);
+      return r * p0 / (num_customers_ + alpha_);
     } else {
       return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) /
-               (num_customers_ + concentration_);
+               (num_customers_ + alpha_);
     }
   }
 
   template <typename T>
   T probT(const Dish& dish, const T& p0) const {
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const T r = T(num_tables_ * discount_ + concentration_);
+    const T r = T(num_tables_ * discount_ + alpha_);
     if (it == dish_locs_.end()) {
-      return r * p0 / T(num_customers_ + concentration_);
+      return r * p0 / T(num_customers_ + alpha_);
     } else {
       return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) /
-               T(num_customers_ + concentration_);
+               T(num_customers_ + alpha_);
     }
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(discount_, concentration_);
-  }
-
-  static double log_beta_density(const double& x, const double& alpha, const double& beta) {
-    assert(x > 0.0);
-    assert(x < 1.0);
-    assert(alpha > 0.0);
-    assert(beta > 0.0);
-    const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta);
-    return lp;
-  }
-
-  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
-    assert(x >= 0.0);
-    assert(shape > 0.0);
-    assert(rate > 0.0);
-    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
-    return lp;
+    return log_crp_prob(discount_, alpha_);
   }
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
-  double log_crp_prob(const double& discount, const double& concentration) const {
+  double log_crp_prob(const double& discount, const double& alpha) const {
     double lp = 0.0;
     if (has_discount_prior())
-      lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
-    if (has_concentration_prior())
-      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+      lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
+    if (has_alpha_prior())
+      lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
       if (discount > 0.0) {
         const double r = lgamma(1.0 - discount);
-        lp += lgamma(concentration) - lgamma(concentration + num_customers_)
-             + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_)
-             - lgamma(concentration / discount);
+        lp += lgamma(alpha) - lgamma(alpha + num_customers_)
+             + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_)
+             - lgamma(alpha / discount);
         assert(std::isfinite(lp));
         for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
              it != dish_locs_.end(); ++it) {
@@ -254,12 +239,12 @@ class CCRP {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_discount_prior() || has_concentration_prior());
+    assert(has_discount_prior() || has_alpha_prior());
     DiscountResampler dr(*this);
     ConcentrationResampler cr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      if (has_concentration_prior()) {
-        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+      if (has_alpha_prior()) {
+        alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
@@ -267,7 +252,7 @@ class CCRP {
                                1.0, 0.0, niterations, 100*niterations);
       }
     }
-    concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+    alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
   }
 
@@ -275,15 +260,15 @@ class CCRP {
     DiscountResampler(const CCRP& crp) : crp_(crp) {}
     const CCRP& crp_;
     double operator()(const double& proposed_discount) const {
-      return crp_.log_crp_prob(proposed_discount, crp_.concentration_);
+      return crp_.log_crp_prob(proposed_discount, crp_.alpha_);
     }
   };
 
   struct ConcentrationResampler {
     ConcentrationResampler(const CCRP& crp) : crp_(crp) {}
     const CCRP& crp_;
-    double operator()(const double& proposed_concentration) const {
-      return crp_.log_crp_prob(crp_.discount_, proposed_concentration);
+    double operator()(const double& proposed_alpha) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_alpha);
     }
   };
 
@@ -295,7 +280,7 @@ class CCRP {
   };
 
   void Print(std::ostream* out) const {
-    std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl;  // header goes to the caller's stream, like the per-dish lines below
     for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
          it != dish_locs_.end(); ++it) {
       (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
@@ -320,15 +305,15 @@ class CCRP {
   std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
 
   double discount_;
-  double concentration_;
+  double alpha_;
 
   // optional beta prior on discount_ (NaN if no prior)
   double discount_prior_alpha_;
   double discount_prior_beta_;
 
-  // optional gamma prior on concentration_ (NaN if no prior)
-  double concentration_prior_shape_;
-  double concentration_prior_rate_;
+  // optional gamma prior on alpha_ (NaN if no prior)
+  double alpha_prior_shape_;
+  double alpha_prior_rate_;
 };
 
 template <typename T,typename H>
diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h
index 63b6f4c2..79321493 100644
--- a/utils/ccrp_nt.h
+++ b/utils/ccrp_nt.h
@@ -18,20 +18,20 @@ class CCRP_NoTable {
  public:
   explicit CCRP_NoTable(double conc) :
     num_customers_(),
-    concentration_(conc),
-    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    alpha_(conc),
+    alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
   CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) :
     num_customers_(),
-    concentration_(c),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
+    alpha_(c),
+    alpha_prior_shape_(c_shape),
+    alpha_prior_rate_(c_rate) {}
 
-  double concentration() const { return concentration_; }
+  double alpha() const { return alpha_; }
 
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
+  bool has_alpha_prior() const {
+    return !std::isnan(alpha_prior_shape_);
   }
 
   void clear() {
@@ -73,16 +73,16 @@ class CCRP_NoTable {
 
   double prob(const Dish& dish, const double& p0) const {
     const unsigned at_table = num_customers(dish);
-    return (at_table + p0 * concentration_) / (num_customers_ + concentration_);
+    return (at_table + p0 * alpha_) / (num_customers_ + alpha_);
   }
 
   double logprob(const Dish& dish, const double& logp0) const {
     const unsigned at_table = num_customers(dish);
-    return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_);
+    return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_);
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(concentration_);
+    return log_crp_prob(alpha_);
   }
 
   static double log_gamma_density(const double& x, const double& shape, const double& rate) {
@@ -95,14 +95,14 @@ class CCRP_NoTable {
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
-  double log_crp_prob(const double& concentration) const {
+  double log_crp_prob(const double& alpha) const {
     double lp = 0.0;
-    if (has_concentration_prior())
-      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    if (has_alpha_prior())
+      lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
-      lp += lgamma(concentration) - lgamma(concentration + num_customers_) +
-        custs_.size() * log(concentration);
+      lp += lgamma(alpha) - lgamma(alpha + num_customers_) +
+        custs_.size() * log(alpha);
       assert(std::isfinite(lp));
       for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
              it != custs_.end(); ++it) {
@@ -114,10 +114,10 @@ class CCRP_NoTable {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_concentration_prior());
+    assert(has_alpha_prior());
     ConcentrationResampler cr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+        alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
     }
   }
@@ -125,13 +125,13 @@ class CCRP_NoTable {
   struct ConcentrationResampler {
     ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {}
     const CCRP_NoTable& crp_;
-    double operator()(const double& proposed_concentration) const {
-      return crp_.log_crp_prob(proposed_concentration);
+    double operator()(const double& proposed_alpha) const {
+      return crp_.log_crp_prob(proposed_alpha);
     }
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl;
     int cc = 0;
     for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
          it != custs_.end(); ++it) {
@@ -153,11 +153,11 @@ class CCRP_NoTable {
     return custs_.end();
   }
 
-  double concentration_;
+  double alpha_;
 
-  // optional gamma prior on concentration_ (NaN if no prior)
-  double concentration_prior_shape_;
-  double concentration_prior_rate_;
+  // optional gamma prior on alpha_ (NaN if no prior)
+  double alpha_prior_shape_;
+  double alpha_prior_rate_;
 };
 
 template <typename T,typename H>
diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h
index b63737d1..1fe01b0e 100644
--- a/utils/ccrp_onetable.h
+++ b/utils/ccrp_onetable.h
@@ -21,33 +21,33 @@ class CCRP_OneTable {
     num_tables_(),
     num_customers_(),
     discount_(disc),
-    concentration_(conc),
+    alpha_(conc),
     discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
     discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
   CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
     num_tables_(),
     num_customers_(),
     discount_(d),
-    concentration_(c),
+    alpha_(c),
     discount_prior_alpha_(d_alpha),
     discount_prior_beta_(d_beta),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
+    alpha_prior_shape_(c_shape),
+    alpha_prior_rate_(c_rate) {}
 
   double discount() const { return discount_; }
-  double concentration() const { return concentration_; }
-  void set_concentration(double c) { concentration_ = c; }
+  double alpha() const { return alpha_; }
+  void set_alpha(double c) { alpha_ = c; }
   void set_discount(double d) { discount_ = d; }
 
   bool has_discount_prior() const {
     return !std::isnan(discount_prior_alpha_);
   }
 
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
+  bool has_alpha_prior() const {
+    return !std::isnan(alpha_prior_shape_);
   }
 
   void clear() {
@@ -108,29 +108,29 @@ class CCRP_OneTable {
 
   double prob(const Dish& dish, const double& p0) const {
     const typename DishMapType::const_iterator it = dish_counts_.find(dish);
-    const double r = num_tables_ * discount_ + concentration_;
+    const double r = num_tables_ * discount_ + alpha_;
     if (it == dish_counts_.end()) {
-      return r * p0 / (num_customers_ + concentration_);
+      return r * p0 / (num_customers_ + alpha_);
     } else {
       return (it->second - discount_ + r * p0) /
-               (num_customers_ + concentration_);
+               (num_customers_ + alpha_);
     }
   }
 
   template <typename T>
   T probT(const Dish& dish, const T& p0) const {
     const typename DishMapType::const_iterator it = dish_counts_.find(dish);
-    const T r(num_tables_ * discount_ + concentration_);
+    const T r(num_tables_ * discount_ + alpha_);
     if (it == dish_counts_.end()) {
-      return r * p0 / T(num_customers_ + concentration_);
+      return r * p0 / T(num_customers_ + alpha_);
     } else {
       return (T(it->second - discount_) + r * p0) /
-               T(num_customers_ + concentration_);
+               T(num_customers_ + alpha_);
     }
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(discount_, concentration_);
+    return log_crp_prob(discount_, alpha_);
   }
 
   static double log_beta_density(const double& x, const double& alpha, const double& beta) {
@@ -152,19 +152,19 @@ class CCRP_OneTable {
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
-  double log_crp_prob(const double& discount, const double& concentration) const {
+  double log_crp_prob(const double& discount, const double& alpha) const {
     double lp = 0.0;
     if (has_discount_prior())
       lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
-    if (has_concentration_prior())
-      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    if (has_alpha_prior())
+      lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
       if (discount > 0.0) {
         const double r = lgamma(1.0 - discount);
-        lp += lgamma(concentration) - lgamma(concentration + num_customers_)
-             + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_)
-             - lgamma(concentration / discount);
+        lp += lgamma(alpha) - lgamma(alpha + num_customers_)
+             + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_)
+             - lgamma(alpha / discount);
         assert(std::isfinite(lp));
         for (typename DishMapType::const_iterator it = dish_counts_.begin();
              it != dish_counts_.end(); ++it) {
@@ -180,12 +180,12 @@ class CCRP_OneTable {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_discount_prior() || has_concentration_prior());
+    assert(has_discount_prior() || has_alpha_prior());
     DiscountResampler dr(*this);
     ConcentrationResampler cr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      if (has_concentration_prior()) {
-        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+      if (has_alpha_prior()) {
+        alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
@@ -193,7 +193,7 @@ class CCRP_OneTable {
                                1.0, 0.0, niterations, 100*niterations);
       }
     }
-    concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+    alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
   }
 
@@ -201,20 +201,20 @@ class CCRP_OneTable {
     DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {}
     const CCRP_OneTable& crp_;
     double operator()(const double& proposed_discount) const {
-      return crp_.log_crp_prob(proposed_discount, crp_.concentration_);
+      return crp_.log_crp_prob(proposed_discount, crp_.alpha_);
     }
   };
 
   struct ConcentrationResampler {
     ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {}
     const CCRP_OneTable& crp_;
-    double operator()(const double& proposed_concentration) const {
-      return crp_.log_crp_prob(crp_.discount_, proposed_concentration);
+    double operator()(const double& proposed_alpha) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_alpha);
     }
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl;
     for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) {
       (*out) << "  " << it->first << " = " << it->second << std::endl;
     }
@@ -233,15 +233,15 @@ class CCRP_OneTable {
   DishMapType dish_counts_;
 
   double discount_;
-  double concentration_;
+  double alpha_;
 
   // optional beta prior on discount_ (NaN if no prior)
   double discount_prior_alpha_;
   double discount_prior_beta_;
 
-  // optional gamma prior on concentration_ (NaN if no prior)
-  double concentration_prior_shape_;
-  double concentration_prior_rate_;
+  // optional gamma prior on alpha_ (NaN if no prior)
+  double alpha_prior_shape_;
+  double alpha_prior_rate_;
 };
 
 template <typename T,typename H>
diff --git a/utils/mfcr.h b/utils/mfcr.h
index 396d0205..df988f51 100644
--- a/utils/mfcr.h
+++ b/utils/mfcr.h
@@ -43,29 +43,29 @@ class MFCR {
     num_floors_(num_floors),
     num_tables_(),
     num_customers_(),
-    d_(d),
+    discount_(d),
     alpha_(alpha),
-    d_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
-    d_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
+    discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
+    discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
     alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
     alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
-  MFCR(unsigned num_floors, double d_alpha, double d_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) :
+  MFCR(unsigned num_floors, double discount_alpha, double discount_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) :
     num_floors_(num_floors),
     num_tables_(),
     num_customers_(),
-    d_(d),
+    discount_(d),
     alpha_(alpha),
-    d_prior_alpha_(d_alpha),
-    d_prior_beta_(d_beta),
+    discount_prior_alpha_(discount_alpha),
+    discount_prior_beta_(discount_beta),
     alpha_prior_shape_(alpha_shape),
     alpha_prior_rate_(alpha_rate) {}
 
-  double d() const { return d_; }
+  double discount() const { return discount_; }
   double alpha() const { return alpha_; }
 
-  bool has_d_prior() const {
-    return !std::isnan(d_prior_alpha_);
+  bool has_discount_prior() const {
+    return !std::isnan(discount_prior_alpha_);
   }
 
   bool has_alpha_prior() const {
@@ -122,15 +122,15 @@ class MFCR {
     int floor = -1;
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const double p_empty = (alpha_ + num_tables_ * d_) * marg_p0;
-      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * d_);
+      const double p_empty = (alpha_ + num_tables_ * discount_) * marg_p0;
+      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
     if (share_table) {
-      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * d_);
+      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin();
            ti != loc.table_counts_.end(); ++ti) {
-        r -= ti->count - d_;
+        r -= ti->count - discount_;
         if (r <= 0.0) {
           ++ti->count;
           floor = ti->floor;
@@ -206,25 +206,25 @@ class MFCR {
     const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0);
     assert(marg_p0 <= 1.0);
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * d_ + alpha_;
+    const double r = num_tables_ * discount_ + alpha_;
     if (it == dish_locs_.end()) {
       return r * marg_p0 / (num_customers_ + alpha_);
     } else {
-      return (it->second.total_dish_count_ - d_ * it->second.table_counts_.size() + r * marg_p0) /
+      return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) /
                (num_customers_ + alpha_);
     }
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(d_, alpha_);
+    return log_crp_prob(discount_, alpha_);
   }
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include draws from G_w's
   double log_crp_prob(const double& d, const double& alpha) const {
     double lp = 0.0;
-    if (has_d_prior())
-      lp = Md::log_beta_density(d, d_prior_alpha_, d_prior_beta_);
+    if (has_discount_prior())
+      lp = Md::log_beta_density(d, discount_prior_alpha_, discount_prior_beta_);
     if (has_alpha_prior())
       lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
@@ -251,7 +251,7 @@ class MFCR {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_d_prior() || has_alpha_prior());
+    assert(has_discount_prior() || has_alpha_prior());
     DiscountResampler dr(*this);
     ConcentrationResampler cr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
@@ -259,8 +259,8 @@ class MFCR {
         alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
-      if (has_d_prior()) {
-        d_ = slice_sampler1d(dr, d_, *rng, std::numeric_limits<double>::min(),
+      if (has_discount_prior()) {
+        discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(),
                                1.0, 0.0, niterations, 100*niterations);
       }
     }
@@ -279,8 +279,8 @@ class MFCR {
   struct ConcentrationResampler {
     ConcentrationResampler(const MFCR& crp) : crp_(crp) {}
     const MFCR& crp_;
     double operator()(const double& proposed_alpha) const {
-      return crp_.log_crp_prob(crp_.d_, proposed_alpha);
+      return crp_.log_crp_prob(crp_.discount_, proposed_alpha);  // discount held at its current value; only alpha is proposed
     }
   };
 
@@ -292,7 +292,7 @@ class MFCR {
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "MFCR(d=" << d_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "MFCR(d=" << discount_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl;
     for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
          it != dish_locs_.end(); ++it) {
       (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
@@ -317,12 +317,12 @@ class MFCR {
   unsigned num_customers_;
   std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
 
-  double d_;
+  double discount_;
   double alpha_;
 
-  // optional beta prior on d_ (NaN if no prior)
-  double d_prior_alpha_;
-  double d_prior_beta_;
+  // optional beta prior on discount_ (NaN if no prior)
+  double discount_prior_alpha_;
+  double discount_prior_beta_;
 
   // optional gamma prior on alpha_ (NaN if no prior)
   double alpha_prior_shape_;
-- 
cgit v1.2.3


From 9aca7f30dda576a453eee64bb4ff0e8bd11a9b85 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sun, 4 Mar 2012 14:33:11 -0500
Subject: clean up pyp lm code

---
 gi/pf/pyp_lm.cc | 85 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 25 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 0d85536c..88dfcc7c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -11,7 +11,14 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "ccrp.h"
-#include "ccrp_onetable.h"
+
+// A not very memory-efficient implementation of an N-gram LM based on PYPs
+// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
+// based on Pitman-Yor Processes. In Proc. ACL.
+
+// I use templates to handle the recursive formulation of the prior, so
+// the order of the model has to be specified here, at compile time:
+#define kORDER 3
 
 using namespace std;
 using namespace tr1;
@@ -22,8 +29,13 @@ shared_ptr<MT19937> prng;
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read data from")
+        ("samples,s",po::value<unsigned>()->default_value(300),"Number of samples")
+        ("train,i",po::value<string>(),"Training data file")
+        ("test,T",po::value<string>(),"Test data file")
+        ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
+        ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this")
+        ("strength_prior_s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this")  // long form only: -s already belongs to --samples
+        ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -40,7 +52,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
   po::notify(*conf);
 
-  if (conf->count("help") || (conf->count("input") == 0)) {
+  if (conf->count("help") || (conf->count("train") == 0)) {
     cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -48,13 +60,13 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 
 template <unsigned N> struct PYPLM;
 
-// uniform base distribution
+// uniform base distribution (0-gram model)
 template<> struct PYPLM<0> {
-  PYPLM(unsigned vs) : p0(1.0 / vs), draws() {}
-  void increment(WordID w, const vector<WordID>& context, MT19937* rng) { ++draws; }
-  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { --draws; assert(draws >= 0); }
-  double prob(WordID w, const vector<WordID>& context) const { return p0; }
-  void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {}
+  PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
+  void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
+  void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
+  double prob(WordID, const vector<WordID>&) const { return p0; }
+  void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {}
   double log_likelihood() const { return draws * log(p0); }
   const double p0;
   int draws;
@@ -62,10 +74,13 @@ template<> struct PYPLM<0> {
 
 // represents an N-gram LM
 template <unsigned N> struct PYPLM {
-  PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {}
+  PYPLM(unsigned vs, double da, double db, double ss, double sr) :
+      backoff(vs, da, db, ss, sr),
+      discount_a(da), discount_b(db),
+      strength_s(ss), strength_r(sr),
+      d(0.8), alpha(1.0), lookup(N-1) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
-    static vector<WordID> lookup(N-1);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
@@ -75,7 +90,6 @@ template <unsigned N> struct PYPLM {
       backoff.increment(w, context, rng);
   }
   void decrement(WordID w, const vector<WordID>& context, MT19937* rng) {
-    static vector<WordID> lookup(N-1);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
@@ -85,7 +99,6 @@ template <unsigned N> struct PYPLM {
   }
   double prob(WordID w, const vector<WordID>& context) const {
     const double bo = backoff.prob(w, context);
-    static vector<WordID> lookup(N-1);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup);
@@ -99,7 +112,9 @@ template <unsigned N> struct PYPLM {
 
   double log_likelihood(const double& dd, const double& aa) const {
     if (aa <= -dd) return -std::numeric_limits<double>::infinity();
-    double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1);
+    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+    double llh = Md::log_beta_density(dd, discount_a, discount_b) +
+                 Md::log_gamma_density(aa, strength_s, strength_r);
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
       llh += it->second.log_crp_prob(dd, aa);
@@ -143,7 +158,9 @@ template <unsigned N> struct PYPLM {
   }
 
   PYPLM<N-1> backoff;
+  double discount_a, discount_b, strength_s, strength_r;
   double d, alpha;
+  mutable vector<WordID> lookup;  // thread-local
   unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
 };
 
@@ -161,14 +178,21 @@ int main(int argc, char** argv) {
   set<WordID> vocabe;
   const WordID kEOS = TD::Convert("</s>");
   cerr << "Reading corpus...\n";
-  CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe);
+  CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe);
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
-#define kORDER 3
-  PYPLM<kORDER> lm(vocabe.size());
+  vector<vector<WordID> > test;
+  if (conf.count("test"))
+    CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
+  else
+    test = corpuse;
+  PYPLM<kORDER> lm(vocabe.size(),
+                   conf["discount_prior_a"].as<double>(),
+                   conf["discount_prior_b"].as<double>(),
+                   conf["strength_prior_s"].as<double>(),
+                   conf["strength_prior_r"].as<double>());
   vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));
-  int mci = corpuse.size() * 99 / 100;
   for (int SS=0; SS < samples; ++SS) {
-    for (int ci = 0; ci < mci; ++ci) {
+    for (int ci = 0; ci < corpuse.size(); ++ci) {
       ctx.resize(kORDER - 1);
       const vector<WordID>& s = corpuse[ci];
       for (int i = 0; i <= s.size(); ++i) {
@@ -187,22 +211,33 @@ int main(int argc, char** argv) {
   }
   double llh = 0;
   unsigned cnt = 0;
-  for (int ci = mci; ci < corpuse.size(); ++ci) {
+  unsigned oovs = 0;
+  for (int ci = 0; ci < test.size(); ++ci) {
     ctx.resize(kORDER - 1);
-    const vector<WordID>& s = corpuse[ci];
+    const vector<WordID>& s = test[ci];
     for (int i = 0; i <= s.size(); ++i) {
       WordID w = (i < s.size() ? s[i] : kEOS);
       double lp = log(lm.prob(w, ctx)) / log(2);
-      cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl;
+      if (i < s.size() && vocabe.count(w) == 0) {
+        cerr << "**OOV ";
+        ++oovs;
+        lp = 0;
+      }
+      cerr << "p(" << TD::Convert(w) << " |";
+      for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j)
+        cerr << ' ' << TD::Convert(ctx[j]);
+      cerr << ") = " << lp << endl;
       ctx.push_back(w);
       llh -= lp;
       cnt++;
     }
   }
-  cerr << "  Log_10 prob: " << (llh * log(2) / log(10)) << endl;
-  cerr << "        Count: " << (cnt) << endl;
+  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl;
+  cerr << "        Count: " << cnt << endl;
+  cerr << "         OOVs: " << oovs << endl;
   cerr << "Cross-entropy: " << (llh / cnt) << endl;
   cerr << "   Perplexity: " << pow(2, llh / cnt) << endl;
   return 0;
 }
 
+
-- 
cgit v1.2.3


From 441c4e5d45d989bee8405e410ab343d11cae8164 Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Sun, 4 Mar 2012 23:26:17 +0000
Subject: fix parameter name clash

---
 gi/pf/pyp_lm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'gi/pf')

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 88dfcc7c..e5c44c8b 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -29,7 +29,7 @@ shared_ptr<MT19937> prng;
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(300),"Number of samples")
+        ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples")
         ("train,i",po::value<string>(),"Training data file")
         ("test,T",po::value<string>(),"Test data file")
         ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
-- 
cgit v1.2.3


From 1d5a0055a948663d799b4c5b1380ce1d9742bf6b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 5 Mar 2012 14:51:04 -0500
Subject: support strength=0 PYPs, final notation clean-up

---
 gi/pf/align-lexonly-pyp.cc     |   2 +-
 gi/pf/conditional_pseg.h       |   2 +-
 gi/pf/learn_cfg.cc             |   4 +-
 gi/pf/pyp_lm.cc                |  22 ++++-----
 phrasinator/gibbs_train_plm.cc |   2 +-
 utils/ccrp.h                   | 106 ++++++++++++++++++++++-------------------
 utils/mfcr.h                   | 105 ++++++++++++++++++++++------------------
 7 files changed, 131 insertions(+), 112 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 4ce7cf62..87f7f6b5 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -104,7 +104,7 @@ struct HierarchicalWordBase {
   }
 
   void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl;
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
     for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
   }
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index f9841cbf..86403d8d 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -22,7 +22,7 @@ struct MConditionalTranslationModel {
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
       for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
         std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
     }
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
index 5b748311..bf157828 100644
--- a/gi/pf/learn_cfg.cc
+++ b/gi/pf/learn_cfg.cc
@@ -183,9 +183,9 @@ struct HieroLMModel {
       nts[i].resample_hyperparameters(rng);
     if (kHIERARCHICAL_PRIOR) {
       q0.resample_hyperparameters(rng);
-      cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]";
+      cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]";
     }
-    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl;
+    cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl;
   }
 
   const BaseRuleModel base;
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index e5c44c8b..7ebada13 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -78,14 +78,14 @@ template <unsigned N> struct PYPLM {
       backoff(vs, da, db, ss, sr),
       discount_a(da), discount_b(db),
       strength_s(ss), strength_r(sr),
-      d(0.8), alpha(1.0), lookup(N-1) {}
+      d(0.8), strength(1.0), lookup(N-1) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
     if (it == p.end())
-      it = p.insert(make_pair(lookup, CCRP<WordID>(d,alpha))).first;
+      it = p.insert(make_pair(lookup, CCRP<WordID>(d,strength))).first;
     if (it->second.increment(w, bo, rng))
       backoff.increment(w, context, rng);
   }
@@ -107,7 +107,7 @@ template <unsigned N> struct PYPLM {
   }
 
   double log_likelihood() const {
-    return log_likelihood(d, alpha) + backoff.log_likelihood();
+    return log_likelihood(d, strength) + backoff.log_likelihood();
   }
 
   double log_likelihood(const double& dd, const double& aa) const {
@@ -125,15 +125,15 @@ template <unsigned N> struct PYPLM {
     DiscountResampler(const PYPLM& m) : m_(m) {}
     const PYPLM& m_;
     double operator()(const double& proposed_discount) const {
-      return m_.log_likelihood(proposed_discount, m_.alpha);
+      return m_.log_likelihood(proposed_discount, m_.strength);
     }
   };
 
   struct AlphaResampler {
     AlphaResampler(const PYPLM& m) : m_(m) {}
     const PYPLM& m_;
-    double operator()(const double& proposed_alpha) const {
-      return m_.log_likelihood(m_.d, proposed_alpha);
+    double operator()(const double& proposed_strength) const {
+      return m_.log_likelihood(m_.d, proposed_strength);
     }
   };
 
@@ -141,25 +141,25 @@ template <unsigned N> struct PYPLM {
     DiscountResampler dr(*this);
     AlphaResampler ar(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+      strength = slice_sampler1d(ar, strength, *rng, 0.0,
                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(),
                           1.0, 0.0, niterations, 100*niterations);
     }
-    alpha = slice_sampler1d(ar, alpha, *rng, 0.0,
+    strength = slice_sampler1d(ar, strength, *rng, 0.0,
                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
-    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl;
+    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl;
     for (it = p.begin(); it != p.end(); ++it) {
       it->second.set_discount(d);
-      it->second.set_alpha(alpha);
+      it->second.set_strength(strength);
     }
     backoff.resample_hyperparameters(rng, nloop, niterations);
   }
 
   PYPLM<N-1> backoff;
   double discount_a, discount_b, strength_s, strength_r;
-  double d, alpha;
+  double d, strength;
   mutable vector<WordID> lookup;  // thread-local
   unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;
 };
diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc
index 54861dcb..3b99e1b6 100644
--- a/phrasinator/gibbs_train_plm.cc
+++ b/phrasinator/gibbs_train_plm.cc
@@ -252,7 +252,7 @@ struct UniphraseLM {
   void ResampleHyperparameters(MT19937* rng) {
     phrases_.resample_hyperparameters(rng);
     gen_.resample_hyperparameters(rng);
-    cerr << " d=" << phrases_.discount() << ",a=" << phrases_.alpha();
+    cerr << " d=" << phrases_.discount() << ",s=" << phrases_.strength();
   }
 
   CCRP<vector<int> > phrases_;
diff --git a/utils/ccrp.h b/utils/ccrp.h
index c883c027..5f9db7a6 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -18,27 +18,27 @@
 template <typename Dish, typename DishHash = boost::hash<Dish> >
 class CCRP {
  public:
-  CCRP(double disc, double alpha) :
+  CCRP(double disc, double strength) :
       num_tables_(),
       num_customers_(),
       discount_(disc),
-      alpha_(alpha),
-      discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
+      strength_(strength),
+      discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
       discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
-      alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-      alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {
+      strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+      strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {
     check_hyperparameters();
   }
 
-  CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
+  CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
       num_tables_(),
       num_customers_(),
       discount_(d),
-      alpha_(c),
-      discount_prior_alpha_(d_alpha),
+      strength_(c),
+      discount_prior_strength_(d_strength),
       discount_prior_beta_(d_beta),
-      alpha_prior_shape_(c_shape),
-      alpha_prior_rate_(c_rate) {
+      strength_prior_shape_(c_shape),
+      strength_prior_rate_(c_rate) {
     check_hyperparameters();
   }
 
@@ -47,23 +47,23 @@ class CCRP {
       std::cerr << "Bad discount: " << discount_ << std::endl;
       abort();
     }
-    if (alpha_ <= -discount_) {
-      std::cerr << "Bad strength: " << alpha_ << " (discount=" << discount_ << ")" << std::endl;
+    if (strength_ <= -discount_) {
+      std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
       abort();
     }
   }
 
   double discount() const { return discount_; }
-  double alpha() const { return alpha_; }
+  double strength() const { return strength_; }
   void set_discount(double d) { discount_ = d; check_hyperparameters(); }
-  void set_alpha(double a) { alpha_ = a; check_hyperparameters(); }
+  void set_strength(double a) { strength_ = a; check_hyperparameters(); }
 
   bool has_discount_prior() const {
-    return !std::isnan(discount_prior_alpha_);
+    return !std::isnan(discount_prior_strength_);
   }
 
-  bool has_alpha_prior() const {
-    return !std::isnan(alpha_prior_shape_);
+  bool has_strength_prior() const {
+    return !std::isnan(strength_prior_shape_);
   }
 
   void clear() {
@@ -97,7 +97,7 @@ class CCRP {
     DishLocations& loc = dish_locs_[dish];
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const double p_empty = (alpha_ + num_tables_ * discount_) * p0;
+      const double p_empty = (strength_ + num_tables_ * discount_) * p0;
       const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
@@ -131,7 +131,7 @@ class CCRP {
     DishLocations& loc = dish_locs_[dish];
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const T p_empty = T(alpha_ + num_tables_ * discount_) * p0;
+      const T p_empty = T(strength_ + num_tables_ * discount_) * p0;
       const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
@@ -198,47 +198,47 @@ class CCRP {
 
   double prob(const Dish& dish, const double& p0) const {
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * discount_ + alpha_;
+    const double r = num_tables_ * discount_ + strength_;
     if (it == dish_locs_.end()) {
-      return r * p0 / (num_customers_ + alpha_);
+      return r * p0 / (num_customers_ + strength_);
     } else {
       return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) /
-               (num_customers_ + alpha_);
+               (num_customers_ + strength_);
     }
   }
 
   template <typename T>
   T probT(const Dish& dish, const T& p0) const {
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const T r = T(num_tables_ * discount_ + alpha_);
+    const T r = T(num_tables_ * discount_ + strength_);
     if (it == dish_locs_.end()) {
-      return r * p0 / T(num_customers_ + alpha_);
+      return r * p0 / T(num_customers_ + strength_);
     } else {
       return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) /
-               T(num_customers_ + alpha_);
+               T(num_customers_ + strength_);
     }
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(discount_, alpha_);
+    return log_crp_prob(discount_, strength_);
   }
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
-  double log_crp_prob(const double& discount, const double& alpha) const {
+  double log_crp_prob(const double& discount, const double& strength) const {
     double lp = 0.0;
     if (has_discount_prior())
-      lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
-    if (has_alpha_prior())
-      lp += Md::log_gamma_density(alpha + discount, alpha_prior_shape_, alpha_prior_rate_);
+      lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_);
+    if (has_strength_prior())
+      lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
       if (discount > 0.0) {
         const double r = lgamma(1.0 - discount);
-        if (alpha)
-          lp += lgamma(alpha) - lgamma(alpha / discount);
-        lp += - lgamma(alpha + num_customers_)
-             + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_);
+        if (strength)
+          lp += lgamma(strength) - lgamma(strength / discount);
+        lp += - lgamma(strength + num_customers_)
+             + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_);
         assert(std::isfinite(lp));
         for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
              it != dish_locs_.end(); ++it) {
@@ -247,8 +247,16 @@ class CCRP {
             lp += lgamma(*ti - discount) - r;
           }
         }
+      } else if (!discount) { // discount == 0.0
+        lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          const DishLocations& cur = it->second;
+          lp += lgamma(cur.table_counts_.size());
+        }
       } else {
-        assert(!"not implemented yet");
+        assert(!"discount less than 0 detected!");
       }
     }
     assert(std::isfinite(lp));
@@ -256,22 +264,22 @@ class CCRP {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_discount_prior() || has_alpha_prior());
+    assert(has_discount_prior() || has_strength_prior());
     DiscountResampler dr(*this);
     StrengthResampler sr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      if (has_alpha_prior()) {
-        alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_,
+      if (has_strength_prior()) {
+        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
         double min_discount = std::numeric_limits<double>::min();
-        if (alpha_ < 0.0) min_discount = -alpha_;
+        if (strength_ < 0.0) min_discount = -strength_;
         discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
                                1.0, 0.0, niterations, 100*niterations);
       }
     }
-    alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_,
+    strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
   }
 
@@ -279,15 +287,15 @@ class CCRP {
     DiscountResampler(const CCRP& crp) : crp_(crp) {}
     const CCRP& crp_;
     double operator()(const double& proposed_discount) const {
-      return crp_.log_crp_prob(proposed_discount, crp_.alpha_);
+      return crp_.log_crp_prob(proposed_discount, crp_.strength_);
     }
   };
 
   struct StrengthResampler {
     StrengthResampler(const CCRP& crp) : crp_(crp) {}
     const CCRP& crp_;
-    double operator()(const double& proposed_alpha) const {
-      return crp_.log_crp_prob(crp_.discount_, proposed_alpha);
+    double operator()(const double& proposed_strength) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_strength);
     }
   };
 
@@ -299,7 +307,7 @@ class CCRP {
   };
 
   void Print(std::ostream* out) const {
-    std::cerr << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl;
+    std::cerr << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl;
     for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
          it != dish_locs_.end(); ++it) {
       (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
@@ -324,15 +332,15 @@ class CCRP {
   std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
 
   double discount_;
-  double alpha_;
+  double strength_;
 
   // optional beta prior on discount_ (NaN if no prior)
-  double discount_prior_alpha_;
+  double discount_prior_strength_;
   double discount_prior_beta_;
 
-  // optional gamma prior on alpha_ (NaN if no prior)
-  double alpha_prior_shape_;
-  double alpha_prior_rate_;
+  // optional gamma prior on strength_ (NaN if no prior)
+  double strength_prior_shape_;
+  double strength_prior_rate_;
 };
 
 template <typename T,typename H>
diff --git a/utils/mfcr.h b/utils/mfcr.h
index df988f51..aeaf599d 100644
--- a/utils/mfcr.h
+++ b/utils/mfcr.h
@@ -39,37 +39,37 @@ template <typename Dish, typename DishHash = boost::hash<Dish> >
 class MFCR {
  public:
 
-  MFCR(unsigned num_floors, double d, double alpha) :
+  MFCR(unsigned num_floors, double d, double strength) :
     num_floors_(num_floors),
     num_tables_(),
     num_customers_(),
     discount_(d),
-    alpha_(alpha),
-    discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
+    strength_(strength),
+    discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
     discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
-    alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
-  MFCR(unsigned num_floors, double discount_alpha, double discount_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) :
+  MFCR(unsigned num_floors, double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
     num_floors_(num_floors),
     num_tables_(),
     num_customers_(),
     discount_(d),
-    alpha_(alpha),
-    discount_prior_alpha_(discount_alpha),
+    strength_(strength),
+    discount_prior_strength_(discount_strength),
     discount_prior_beta_(discount_beta),
-    alpha_prior_shape_(alpha_shape),
-    alpha_prior_rate_(alpha_rate) {}
+    strength_prior_shape_(strength_shape),
+    strength_prior_rate_(strength_rate) {}
 
   double discount() const { return discount_; }
-  double alpha() const { return alpha_; }
+  double strength() const { return strength_; }
 
   bool has_discount_prior() const {
-    return !std::isnan(discount_prior_alpha_);
+    return !std::isnan(discount_prior_strength_);
   }
 
-  bool has_alpha_prior() const {
-    return !std::isnan(alpha_prior_shape_);
+  bool has_strength_prior() const {
+    return !std::isnan(strength_prior_shape_);
   }
 
   void clear() {
@@ -122,7 +122,7 @@ class MFCR {
     int floor = -1;
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const double p_empty = (alpha_ + num_tables_ * discount_) * marg_p0;
+      const double p_empty = (strength_ + num_tables_ * discount_) * marg_p0;
       const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
@@ -206,44 +206,53 @@ class MFCR {
     const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0);
     assert(marg_p0 <= 1.0);
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * discount_ + alpha_;
+    const double r = num_tables_ * discount_ + strength_;
     if (it == dish_locs_.end()) {
-      return r * marg_p0 / (num_customers_ + alpha_);
+      return r * marg_p0 / (num_customers_ + strength_);
     } else {
       return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) /
-               (num_customers_ + alpha_);
+               (num_customers_ + strength_);
     }
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(discount_, alpha_);
+    return log_crp_prob(discount_, strength_);
   }
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include draws from G_w's
-  double log_crp_prob(const double& d, const double& alpha) const {
+  double log_crp_prob(const double& discount, const double& strength) const {
     double lp = 0.0;
     if (has_discount_prior())
-      lp = Md::log_beta_density(d, discount_prior_alpha_, discount_prior_beta_);
-    if (has_alpha_prior())
-      lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
+      lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_);
+    if (has_strength_prior())
+      lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
-      if (d > 0.0) {
-        const double r = lgamma(1.0 - d);
-        lp += lgamma(alpha) - lgamma(alpha + num_customers_)
-             + num_tables_ * log(d) + lgamma(alpha / d + num_tables_)
-             - lgamma(alpha / d);
+      if (discount > 0.0) {
+        const double r = lgamma(1.0 - discount);
+        if (strength)
+          lp += lgamma(strength) - lgamma(strength / discount);
+        lp += - lgamma(strength + num_customers_)
+             + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_);
         assert(std::isfinite(lp));
         for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
              it != dish_locs_.end(); ++it) {
           const DishLocations& cur = it->second;
           for (std::list<TableCount>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) {
-            lp += lgamma(ti->count - d) - r;
+            lp += lgamma(ti->count - discount) - r;
           }
         }
+      } else if (!discount) { // discount == 0.0
+        lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          const DishLocations& cur = it->second;
+          lp += lgamma(cur.table_counts_.size());
+        }
       } else {
-        assert(!"not implemented yet");
+        assert(!"discount less than 0 detected!");
       }
     }
     assert(std::isfinite(lp));
@@ -251,20 +260,22 @@ class MFCR {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_discount_prior() || has_alpha_prior());
+    assert(has_discount_prior() || has_strength_prior());
     DiscountResampler dr(*this);
-    ConcentrationResampler cr(*this);
+    StrengthResampler sr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      if (has_alpha_prior()) {
-        alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
+      if (has_strength_prior()) {
+        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
-        discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(),
+        double min_discount = std::numeric_limits<double>::min();
+        if (strength_ < 0.0) min_discount = -strength_;
+        discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
                                1.0, 0.0, niterations, 100*niterations);
       }
     }
-    alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
+    strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
   }
 
@@ -272,15 +283,15 @@ class MFCR {
     DiscountResampler(const MFCR& crp) : crp_(crp) {}
     const MFCR& crp_;
     double operator()(const double& proposed_d) const {
-      return crp_.log_crp_prob(proposed_d, crp_.alpha_);
+      return crp_.log_crp_prob(proposed_d, crp_.strength_);
     }
   };
 
-  struct ConcentrationResampler {
-    ConcentrationResampler(const MFCR& crp) : crp_(crp) {}
+  struct StrengthResampler {
+    StrengthResampler(const MFCR& crp) : crp_(crp) {}
     const MFCR& crp_;
-    double operator()(const double& proposediscount_alpha) const {
-      return crp_.log_crp_prob(crp_.discount_, proposediscount_alpha);
+    double operator()(const double& proposediscount_strength) const {
+      return crp_.log_crp_prob(crp_.discount_, proposediscount_strength);
     }
   };
 
@@ -292,7 +303,7 @@ class MFCR {
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "MFCR(d=" << discount_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "MFCR(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl;
     for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
          it != dish_locs_.end(); ++it) {
       (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
@@ -318,15 +329,15 @@ class MFCR {
   std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
 
   double discount_;
-  double alpha_;
+  double strength_;
 
   // optional beta prior on discount_ (NaN if no prior)
-  double discount_prior_alpha_;
+  double discount_prior_strength_;
   double discount_prior_beta_;
 
-  // optional gamma prior on alpha_ (NaN if no prior)
-  double alpha_prior_shape_;
-  double alpha_prior_rate_;
+  // optional gamma prior on strength_ (NaN if no prior)
+  double strength_prior_shape_;
+  double strength_prior_rate_;
 };
 
 template <typename T,typename H>
-- 
cgit v1.2.3


From 4c007d48d5829233d0ae3c3c8b48f8c25631bf81 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 5 Mar 2012 16:06:45 -0500
Subject: use template parameter inference to figure out what type to use for
 probability computations, templatize the number of floors in MFCR
 (compile-time) rather than passing it to the constructor

---
 gi/pf/align-lexonly-pyp.cc | 20 +++++++-------
 gi/pf/conditional_pseg.h   | 22 +++++++--------
 gi/pf/learn_cfg.cc         |  8 +++---
 utils/ccrp.h               | 48 ++------------------------------
 utils/mfcr.h               | 68 ++++++++++++++++++++++++----------------------
 utils/mfcr_test.cc         | 10 +++----
 6 files changed, 68 insertions(+), 108 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 87f7f6b5..ac0590e0 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -68,7 +68,7 @@ struct AlignedSentencePair {
 
 struct HierarchicalWordBase {
   explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {}
+      base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
 
   void ResampleHyperparameters(MT19937* rng) {
     r.resample_hyperparameters(rng);
@@ -80,14 +80,14 @@ struct HierarchicalWordBase {
 
   // return p0 of rule.e_
   prob_t operator()(const TRule& rule) const {
-    v[0] = exp(logp0(rule.e_));
-    return prob_t(r.prob(rule.e_, v, l));
+    v[0].logeq(logp0(rule.e_));
+    return r.prob(rule.e_, v.begin(), l.begin());
   }
 
   void Increment(const TRule& rule) {
-    v[0] = exp(logp0(rule.e_));
-    if (r.increment(rule.e_, v, l, &*prng).count) {
-      base *= prob_t(v[0] * l[0]);
+    v[0].logeq(logp0(rule.e_));
+    if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
+      base *= v[0] * l[0];
     }
   }
 
@@ -105,15 +105,15 @@ struct HierarchicalWordBase {
 
   void Summary() const {
     cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
-    for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+    for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
       cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
   }
 
   prob_t base;
-  MFCR<vector<WordID> > r;
+  MFCR<1,vector<WordID> > r;
   const double u0;
-  const vector<double> l;
-  mutable vector<double> v;
+  const vector<prob_t> l;
+  mutable vector<prob_t> v;
 };
 
 struct BasicLexicalAlignment {
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index 86403d8d..ef73e332 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -17,13 +17,13 @@
 template <typename ConditionalBaseMeasure>
 struct MConditionalTranslationModel {
   explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
-    rp0(rcp0), lambdas(1, 1.0), p0s(1) {}
+    rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {}
 
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
       std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
-      for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+      for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
         std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
     }
   }
@@ -46,10 +46,10 @@ struct MConditionalTranslationModel {
   int IncrementRule(const TRule& rule, MT19937* rng) {
     RuleModelHash::iterator it = r.find(rule.f_);
     if (it == r.end()) {
-      it = r.insert(make_pair(rule.f_, MFCR<TRule>(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
+      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
     }
-    p0s[0] = rp0(rule).as_float(); 
-    TableCount delta = it->second.increment(rule, p0s, lambdas, rng);
+    p0s[0] = rp0(rule); 
+    TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
     return delta.count;
   }
 
@@ -57,10 +57,10 @@ struct MConditionalTranslationModel {
     prob_t p;
     RuleModelHash::const_iterator it = r.find(rule.f_);
     if (it == r.end()) {
-      p.logeq(log(rp0(rule)));
+      p = rp0(rule);
     } else {
-      p0s[0] = rp0(rule).as_float();
-      p = prob_t(it->second.prob(rule, p0s, lambdas));
+      p0s[0] = rp0(rule);
+      p = it->second.prob(rule, p0s.begin(), lambdas.begin());
     }
     return p;
   }
@@ -80,11 +80,11 @@ struct MConditionalTranslationModel {
 
   const ConditionalBaseMeasure& rp0;
   typedef std::tr1::unordered_map<std::vector<WordID>,
-                                  MFCR<TRule>,
+                                  MFCR<1, TRule>,
                                   boost::hash<std::vector<WordID> > > RuleModelHash;
   RuleModelHash r;
-  std::vector<double> lambdas;
-  mutable std::vector<double> p0s;
+  std::vector<prob_t> lambdas;
+  mutable std::vector<prob_t> p0s;
 };
 
 template <typename ConditionalBaseMeasure>
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
index bf157828..ed1772bf 100644
--- a/gi/pf/learn_cfg.cc
+++ b/gi/pf/learn_cfg.cc
@@ -127,20 +127,20 @@ struct HieroLMModel {
       nts(num_nts, CCRP<TRule>(1,1,1,1)) {}
 
   prob_t Prob(const TRule& r) const {
-    return nts[nt_id_to_index[-r.lhs_]].probT<prob_t>(r, p0(r));
+    return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r));
   }
 
   inline prob_t p0(const TRule& r) const {
     if (kHIERARCHICAL_PRIOR)
-      return q0.probT<prob_t>(r, base(r));
+      return q0.prob(r, base(r));
     else
       return base(r);
   }
 
   int Increment(const TRule& r, MT19937* rng) {
-    const int delta = nts[nt_id_to_index[-r.lhs_]].incrementT<prob_t>(r, p0(r), rng);
+    const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng);
     if (kHIERARCHICAL_PRIOR && delta)
-      q0.incrementT<prob_t>(r, base(r), rng);
+      q0.increment(r, base(r), rng);
     return delta;
     // return x.increment(r);
   }
diff --git a/utils/ccrp.h b/utils/ccrp.h
index 5f9db7a6..e24130ac 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -92,42 +92,9 @@ class CCRP {
     return it->total_dish_count_;
   }
 
-  // returns +1 or 0 indicating whether a new table was opened
-  int increment(const Dish& dish, const double& p0, MT19937* rng) {
-    DishLocations& loc = dish_locs_[dish];
-    bool share_table = false;
-    if (loc.total_dish_count_) {
-      const double p_empty = (strength_ + num_tables_ * discount_) * p0;
-      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
-      share_table = rng->SelectSample(p_empty, p_share);
-    }
-    if (share_table) {
-      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
-      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
-           ti != loc.table_counts_.end(); ++ti) {
-        r -= (*ti - discount_);
-        if (r <= 0.0) {
-          ++(*ti);
-          break;
-        }
-      }
-      if (r > 0.0) {
-        std::cerr << "Serious error: r=" << r << std::endl;
-        Print(&std::cerr);
-        assert(r <= 0.0);
-      }
-    } else {
-      loc.table_counts_.push_back(1u);
-      ++num_tables_;
-    }
-    ++loc.total_dish_count_;
-    ++num_customers_;
-    return (share_table ? 0 : 1);
-  }
-
   // returns +1 or 0 indicating whether a new table was opened
   template <typename T>
-  int incrementT(const Dish& dish, const T& p0, MT19937* rng) {
+  int increment(const Dish& dish, const T& p0, MT19937* rng) {
     DishLocations& loc = dish_locs_[dish];
     bool share_table = false;
     if (loc.total_dish_count_) {
@@ -196,19 +163,8 @@ class CCRP {
     }
   }
 
-  double prob(const Dish& dish, const double& p0) const {
-    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * discount_ + strength_;
-    if (it == dish_locs_.end()) {
-      return r * p0 / (num_customers_ + strength_);
-    } else {
-      return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) /
-               (num_customers_ + strength_);
-    }
-  }
-
   template <typename T>
-  T probT(const Dish& dish, const T& p0) const {
+  T prob(const Dish& dish, const T& p0) const {
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
     const T r = T(num_tables_ * discount_ + strength_);
     if (it == dish_locs_.end()) {
diff --git a/utils/mfcr.h b/utils/mfcr.h
index aeaf599d..6cc0ebf1 100644
--- a/utils/mfcr.h
+++ b/utils/mfcr.h
@@ -8,6 +8,7 @@
 #include <list>
 #include <iostream>
 #include <vector>
+#include <iterator>
 #include <tr1/unordered_map>
 #include <boost/functional/hash.hpp>
 #include "sampler.h"
@@ -35,12 +36,11 @@ std::ostream& operator<<(std::ostream& o, const TableCount& tc) {
 // referenced therein.
 // http://www.aclweb.org/anthology/P/P09/P09-2085.pdf
 //
-template <typename Dish, typename DishHash = boost::hash<Dish> >
+template <unsigned Floors, typename Dish, typename DishHash = boost::hash<Dish> >
 class MFCR {
  public:
 
-  MFCR(unsigned num_floors, double d, double strength) :
-    num_floors_(num_floors),
+  MFCR(double d, double strength) :
     num_tables_(),
     num_customers_(),
     discount_(d),
@@ -50,8 +50,7 @@ class MFCR {
     strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
     strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
-  MFCR(unsigned num_floors, double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
-    num_floors_(num_floors),
+  MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
     num_tables_(),
     num_customers_(),
     discount_(d),
@@ -111,22 +110,22 @@ class MFCR {
   }
 
   // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor
-  TableCount increment(const Dish& dish, const std::vector<double>& p0s, const std::vector<double>& lambdas, MT19937* rng) {
-    assert(p0s.size() == num_floors_);
-    assert(lambdas.size() == num_floors_);
-
+  template <class InputIterator, class InputIterator2>
+  TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) {
     DishLocations& loc = dish_locs_[dish];
     // marg_p0 = marginal probability of opening a new table on any floor with label dish
-    const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0);
-    assert(marg_p0 <= 1.0);
+    typedef typename std::iterator_traits<InputIterator>::value_type F;
+    const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0));
+    assert(marg_p0 <= F(1.0001));
     int floor = -1;
     bool share_table = false;
     if (loc.total_dish_count_) {
-      const double p_empty = (strength_ + num_tables_ * discount_) * marg_p0;
-      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0;
+      const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       share_table = rng->SelectSample(p_empty, p_share);
     }
     if (share_table) {
+      // this can be done with doubles since P0 (which may be tiny) is not involved
       double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
       for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin();
            ti != loc.table_counts_.end(); ++ti) {
@@ -143,12 +142,18 @@ class MFCR {
         assert(r <= 0.0);
       }
     } else { // sit at currently empty table -- must sample what floor
-      double r = rng->next() * marg_p0;
-      for (unsigned i = 0; i < p0s.size(); ++i) {
-        r -= p0s[i] * lambdas[i];
-        if (r <= 0.0) {
-          floor = i;
-          break;
+      if (Floors == 1) {
+        floor = 0;
+      } else {
+        F r = F(rng->next()) * marg_p0;
+        for (unsigned i = 0; i < Floors; ++i) {
+          r -= (*p0s) * (*lambdas);
+          ++p0s;
+          ++lambdas;
+          if (r <= F(0.0)) {
+            floor = i;
+            break;
+          }
         }
       }
       assert(floor >= 0);
@@ -200,18 +205,18 @@ class MFCR {
     return TableCount(delta, floor);
   }
 
-  double prob(const Dish& dish, const std::vector<double>& p0s, const std::vector<double>& lambdas) const {
-    assert(p0s.size() == num_floors_);
-    assert(lambdas.size() == num_floors_);
-    const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0);
-    assert(marg_p0 <= 1.0);
+  template <class InputIterator, class InputIterator2>
+  typename std::iterator_traits<InputIterator>::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const {
+    typedef typename std::iterator_traits<InputIterator>::value_type F;
+    const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0));
+    assert(marg_p0 <= F(1.0001));
     const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
-    const double r = num_tables_ * discount_ + strength_;
+    const F r = F(num_tables_ * discount_ + strength_);
     if (it == dish_locs_.end()) {
-      return r * marg_p0 / (num_customers_ + strength_);
+      return r * marg_p0 / F(num_customers_ + strength_);
     } else {
-      return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) /
-               (num_customers_ + strength_);
+      return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) /
+               F(num_customers_ + strength_);
     }
   }
 
@@ -303,7 +308,7 @@ class MFCR {
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "MFCR(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl;
     for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
          it != dish_locs_.end(); ++it) {
       (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
@@ -323,7 +328,6 @@ class MFCR {
     return dish_locs_.end();
   }
 
-  unsigned num_floors_;
   unsigned num_tables_;
   unsigned num_customers_;
   std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
@@ -340,8 +344,8 @@ class MFCR {
   double strength_prior_rate_;
 };
 
-template <typename T,typename H>
-std::ostream& operator<<(std::ostream& o, const MFCR<T,H>& c) {
+template <unsigned N,typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const MFCR<N,T,H>& c) {
   c.Print(&o);
   return o;
 }
diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc
index 7c45a37c..cc886335 100644
--- a/utils/mfcr_test.cc
+++ b/utils/mfcr_test.cc
@@ -9,7 +9,7 @@
 using namespace std;
 
 void test_exch(MT19937* rng) {
-  MFCR<int> crp(2, 0.5, 3.0);
+  MFCR<2, int> crp(0.5, 3.0);
   vector<double> lambdas(2);
   vector<double> p0s(2);
   lambdas[0] = 0.2;
@@ -22,23 +22,23 @@ void test_exch(MT19937* rng) {
   double xt = 0;
   int cust = 10;
   vector<int> hist(cust + 1, 0), hist2(cust + 1, 0);
-  for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); }
+  for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
   const int samples = 100000;
   const bool simulate = true;
   for (int k = 0; k < samples; ++k) {
     if (!simulate) {
       crp.clear();
-      for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); }
+      for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
     } else {
       int da = rng->next() * cust;
       bool a = rng->next() < 0.45;
       if (a) {
-        for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); }
+        for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
         for (int i = 0; i < da; ++i) { crp.decrement(1, rng); }
         xt += 1.0;
       } else {
         for (int i = 0; i < da; ++i) { crp.decrement(1, rng); }
-        for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); }
+        for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
       }
     }
     int c = crp.num_tables(1);
-- 
cgit v1.2.3


From 7b3936660fb777b455079c63c23aec00f60f98ea Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 5 Mar 2012 21:36:07 -0500
Subject: tie hyperparameters for translation distributions; support theta < 0
 for PYPLM

---
 gi/pf/align-lexonly-pyp.cc | 13 ++++-----
 gi/pf/conditional_pseg.h   | 68 ++++++++++++++++++++++++++++++++++++----------
 gi/pf/pyp_lm.cc            | 12 ++++----
 utils/ccrp.h               |  4 +--
 utils/mfcr.h               | 19 +++++++++++--
 5 files changed, 84 insertions(+), 32 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index ac0590e0..13a3a487 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -68,14 +68,14 @@ struct AlignedSentencePair {
 
 struct HierarchicalWordBase {
   explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
+      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
 
   void ResampleHyperparameters(MT19937* rng) {
     r.resample_hyperparameters(rng);
   }
 
   inline double logp0(const vector<WordID>& s) const {
-    return s.size() * u0;
+    return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
   }
 
   // return p0 of rule.e_
@@ -106,7 +106,7 @@ struct HierarchicalWordBase {
   void Summary() const {
     cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
     for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
+      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
   }
 
   prob_t base;
@@ -167,10 +167,9 @@ struct BasicLexicalAlignment {
   }
 
   void ResampleHyperparemeters() {
-    cerr << "  LLH_prev = " << Likelihood() << flush;
     tmodel.ResampleHyperparameters(&*prng);
     up0.ResampleHyperparameters(&*prng);
-    cerr << "\tLLH_post = " << Likelihood() << endl;
+    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
   }
 
   void ResampleCorpus();
@@ -218,7 +217,7 @@ void BasicLexicalAlignment::ResampleCorpus() {
         up0.Increment(r);
     }
   }
-  cerr << "  LLH = " << tmodel.Likelihood() << endl;
+  cerr << "  LLH = " << Likelihood() << endl;
 }
 
 void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
@@ -311,7 +310,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
     cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 10 == 0) x.ResampleHyperparemeters();
+    if (i % 7 == 6) x.ResampleHyperparemeters();
     x.ResampleCorpus();
     if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index ef73e332..8202778b 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -17,21 +17,66 @@
 template <typename ConditionalBaseMeasure>
 struct MConditionalTranslationModel {
   explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
-    rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {}
+    rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {}
 
   void Summary() const {
     std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
     for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
       std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
       for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
+        std::cerr << "   " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl;
     }
   }
 
+  double log_likelihood(const double& dd, const double& aa) const {
+    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+    double llh = Md::log_beta_density(dd, 1, 1) +
+                 Md::log_gamma_density(dd + aa, 1, 1);
+    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
+    for (it = r.begin(); it != r.end(); ++it)
+      llh += it->second.log_crp_prob(dd, aa);
+    return llh;
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {}
+    const MConditionalTranslationModel& m_;
+    double operator()(const double& proposed_discount) const {
+      return m_.log_likelihood(proposed_discount, m_.strength);
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {}
+    const MConditionalTranslationModel& m_;
+    double operator()(const double& proposed_strength) const {
+      return m_.log_likelihood(m_.d, proposed_strength);
+    }
+  };
+
   void ResampleHyperparameters(MT19937* rng) {
-    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
-      it->second.resample_hyperparameters(rng);
-  } 
+    const unsigned nloop = 5;
+    const unsigned niterations = 10;
+    DiscountResampler dr(*this);
+    AlphaResampler ar(*this);
+    for (int iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      d = slice_sampler1d(dr, d, *rng, min_discount,
+                          1.0, 0.0, niterations, 100*niterations);
+    }
+    strength = slice_sampler1d(ar, strength, *rng, -d,
+                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+    std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl;
+    for (it = r.begin(); it != r.end(); ++it) {
+      it->second.set_discount(d);
+      it->second.set_strength(strength);
+    }
+  }
 
   int DecrementRule(const TRule& rule, MT19937* rng) {
     RuleModelHash::iterator it = r.find(rule.f_);
@@ -46,7 +91,7 @@ struct MConditionalTranslationModel {
   int IncrementRule(const TRule& rule, MT19937* rng) {
     RuleModelHash::iterator it = r.find(rule.f_);
     if (it == r.end()) {
-      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
+      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
     }
     p0s[0] = rp0(rule); 
     TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
@@ -66,15 +111,7 @@ struct MConditionalTranslationModel {
   }
 
   prob_t Likelihood() const {
-    prob_t p = prob_t::One();
-#if 0
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      prob_t q; q.logeq(it->second.log_crp_prob());
-      p *= q;
-      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        p *= rp0(i2->first);
-    }
-#endif
+    prob_t p; p.logeq(log_likelihood(d, strength));
     return p;
   }
 
@@ -83,6 +120,7 @@ struct MConditionalTranslationModel {
                                   MFCR<1, TRule>,
                                   boost::hash<std::vector<WordID> > > RuleModelHash;
   RuleModelHash r;
+  double d, strength;
   std::vector<prob_t> lambdas;
   mutable std::vector<prob_t> p0s;
 };
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 7ebada13..104f356b 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -18,7 +18,7 @@
 
 // I use templates to handle the recursive formalation of the prior, so
 // the order of the model has to be specified here, at compile time:
-#define kORDER 3
+#define kORDER 4
 
 using namespace std;
 using namespace tr1;
@@ -114,7 +114,7 @@ template <unsigned N> struct PYPLM {
     if (aa <= -dd) return -std::numeric_limits<double>::infinity();
     //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
     double llh = Md::log_beta_density(dd, discount_a, discount_b) +
-                 Md::log_gamma_density(aa, strength_s, strength_r);
+                 Md::log_gamma_density(aa + dd, strength_s, strength_r);
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
       llh += it->second.log_crp_prob(dd, aa);
@@ -141,12 +141,14 @@ template <unsigned N> struct PYPLM {
     DiscountResampler dr(*this);
     AlphaResampler ar(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      strength = slice_sampler1d(ar, strength, *rng, 0.0,
+      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      d = slice_sampler1d(dr, d, *rng, std::numeric_limits<double>::min(),
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      d = slice_sampler1d(dr, d, *rng, min_discount,
                           1.0, 0.0, niterations, 100*niterations);
     }
-    strength = slice_sampler1d(ar, strength, *rng, 0.0,
+    strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
     cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl;
diff --git a/utils/ccrp.h b/utils/ccrp.h
index e24130ac..439d7e1e 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -225,12 +225,12 @@ class CCRP {
     StrengthResampler sr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
       if (has_strength_prior()) {
-        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
+        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(),
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
         double min_discount = std::numeric_limits<double>::min();
-        if (strength_ < 0.0) min_discount = -strength_;
+        if (strength_ < 0.0) min_discount -= strength_;
         discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
                                1.0, 0.0, niterations, 100*niterations);
       }
diff --git a/utils/mfcr.h b/utils/mfcr.h
index 6cc0ebf1..886f01ef 100644
--- a/utils/mfcr.h
+++ b/utils/mfcr.h
@@ -48,7 +48,7 @@ class MFCR {
     discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
     discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
     strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { check_hyperparameters(); }
 
   MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
     num_tables_(),
@@ -58,10 +58,23 @@ class MFCR {
     discount_prior_strength_(discount_strength),
     discount_prior_beta_(discount_beta),
     strength_prior_shape_(strength_shape),
-    strength_prior_rate_(strength_rate) {}
+    strength_prior_rate_(strength_rate) { check_hyperparameters(); }
+
+  void check_hyperparameters() {
+    if (discount_ < 0.0 || discount_ >= 1.0) {
+      std::cerr << "Bad discount: " << discount_ << std::endl;
+      abort();
+    }
+    if (strength_ <= -discount_) {
+      std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
+      abort();
+    }
+  }
 
   double discount() const { return discount_; }
   double strength() const { return strength_; }
+  void set_discount(double d) { discount_ = d; check_hyperparameters(); }
+  void set_strength(double a) { strength_ = a; check_hyperparameters(); }
 
   bool has_discount_prior() const {
     return !std::isnan(discount_prior_strength_);
@@ -275,7 +288,7 @@ class MFCR {
       }
       if (has_discount_prior()) {
         double min_discount = std::numeric_limits<double>::min();
-        if (strength_ < 0.0) min_discount = -strength_;
+        if (strength_ < 0.0) min_discount -= strength_;
         discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
                                1.0, 0.0, niterations, 100*niterations);
       }
-- 
cgit v1.2.3


From b355e5e8a2a17ae6d8f241d8f6e19e4b4c528397 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 7 Mar 2012 20:25:53 -0500
Subject: lattice builder

---
 gi/pf/Makefile.am         |   7 +-
 gi/pf/align-tl.cc         | 334 ++++++++++++++++++++++++++++++++++++++++++++++
 gi/pf/conditional_pseg.h  |  11 +-
 gi/pf/nuisance_test.cc    | 161 ++++++++++++++++++++++
 gi/pf/transliterations.cc | 193 +++++++++++++++++++++++++++
 gi/pf/transliterations.h  |  20 +++
 6 files changed, 723 insertions(+), 3 deletions(-)
 create mode 100644 gi/pf/align-tl.cc
 create mode 100644 gi/pf/nuisance_test.cc
 create mode 100644 gi/pf/transliterations.cc
 create mode 100644 gi/pf/transliterations.h

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 7cf9c14d..5e89f02a 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,12 +1,17 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
 
 noinst_LIBRARIES = libpf.a
+
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
 
+nuisance_test_SOURCES = nuisance_test.cc transliterations.cc
+
 align_lexonly_SOURCES = align-lexonly.cc
 
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
+align_tl_SOURCES = align-tl.cc transliterations.cc
+
 itg_SOURCES = itg.cc
 
 pyp_lm_SOURCES = pyp_lm.cc
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
new file mode 100644
index 00000000..0e0454e5
--- /dev/null
+++ b/gi/pf/align-tl.cc
@@ -0,0 +1,334 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "mfcr.h"
+#include "corpus.h"
+#include "ngram_base.h"
+#include "transliterations.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {  // alignment state kept for one target word
+  unsigned char src_index;  // 0 = aligned to NULL; otherwise 1-based position in the source sentence
+  bool is_transliteration;  // set to false by InitializeRandom; not read anywhere else in this file
+  vector<pair<short, short> > derivation;  // unused in this file so far; presumably a transliteration derivation -- TODO confirm
+};
+
+struct AlignedSentencePair {  // one sentence pair together with its current alignment and sample counts
+  vector<WordID> src;  // source-side word ids (the f side of the corpus, see main)
+  vector<WordID> trg;  // target-side word ids (the e side of the corpus, see main)
+  vector<LexicalAlignment> a;  // a[j] is the alignment of trg[j]; resized to trg.size() by InitializeRandom
+  Array2D<short> posterior;  // (src.size() + 1) x trg.size() counts; AddSample increments cell (a[j].src_index, j)
+};
+
+struct HierarchicalWordBase {
+  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
+
+  void ResampleHyperparameters(MT19937* rng) {
+    r.resample_hyperparameters(rng);
+  }
+
+  inline double logp0(const vector<WordID>& s) const {
+    return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
+  }
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    v[0].logeq(logp0(rule.e_));
+    return r.prob(rule.e_, v.begin(), l.begin());
+  }
+
+  void Increment(const TRule& rule) {
+    v[0].logeq(logp0(rule.e_));
+    if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
+      base *= v[0] * l[0];
+    }
+  }
+
+  void Decrement(const TRule& rule) {
+    if (r.decrement(rule.e_, &*prng).count) {
+      base /= prob_t(exp(logp0(rule.e_)));
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const {
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
+    for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
+  }
+
+  prob_t base;
+  MFCR<1,vector<WordID> > r;
+  const double u0;
+  const vector<prob_t> l;
+  mutable vector<prob_t> v;
+};
+
+struct BasicLexicalAlignment {
+  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned words_e,
+                                 const unsigned letters_e,
+                                 vector<AlignedSentencePair>* corp) :
+      letters(lets),
+      corpus(*corp),
+      //up0(words_e),
+      //up0("en.chars.1gram", letters_e),
+      //up0("en.words.1gram"),
+      up0(letters_e),
+      //up0("en.chars.2gram"),
+      tmodel(up0) {
+  }
+
+  void InstantiateRule(const WordID src,
+                       const WordID trg,
+                       TRule* rule) const {
+    static const WordID kX = TD::Convert("X") * -1;
+    rule->lhs_ = kX;
+    rule->e_ = letters[trg];
+    rule->f_ = letters[src];
+  }
+
+  void InitializeRandom() {
+    const WordID kNULL = TD::Convert("NULL");
+    cerr << "Initializing with random alignments ...\n";
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      asp.a.resize(asp.trg.size());
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        TRule r;
+        InstantiateRule(f_a_j, asp.trg[j], &r);
+        asp.a[j].is_transliteration = false;
+        asp.a[j].src_index = a_j;
+        if (tmodel.IncrementRule(r, &*prng))
+          up0.Increment(r);
+      }
+    }
+    cerr << "  LLH = " << Likelihood() << endl;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = tmodel.Likelihood();
+    p *= up0.Likelihood();
+    return p;
+  }
+
+  void ResampleHyperparemeters() {
+    tmodel.ResampleHyperparameters(&*prng);
+    up0.ResampleHyperparameters(&*prng);
+    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
+  }
+
+  void ResampleCorpus();
+
+  const vector<vector<WordID> >& letters; // spelling dictionary
+  vector<AlignedSentencePair>& corpus;
+  //PhraseConditionalUninformativeBase up0;
+  //PhraseConditionalUninformativeUnigramBase up0;
+  //UnigramWordBase up0;
+  //HierarchicalUnigramBase up0;
+  HierarchicalWordBase up0;
+  //CompletelyUniformBase up0;
+  //FixedNgramBase up0;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+  //ConditionalTranslationModel<UnigramWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+  MConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<FixedNgramBase> tmodel;
+  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+  static const WordID kNULL = TD::Convert("NULL");
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    AlignedSentencePair& asp = corpus[i];
+    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+    for (unsigned j = 0; j < asp.trg.size(); ++j) {
+      TRule r;
+      unsigned char& a_j = asp.a[j].src_index;
+      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.DecrementRule(r, &*prng))
+        up0.Decrement(r);
+
+      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+        InstantiateRule(prop_f, asp.trg[j], &r);
+        ss[prop_a_j] = tmodel.RuleProbability(r);
+      }
+      a_j = prng->SelectSample(ss);
+      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.IncrementRule(r, &*prng))
+        up0.Increment(r);
+    }
+  }
+  cerr << "  LLH = " << Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    vector<WordID>& letters = (*l)[*it];
+    if (letters.size()) continue;   // if e and f have the same word
+
+    const string& w = TD::Convert(*it);
+    
+    size_t cur = 0;
+    while (cur < w.size()) {
+      const size_t len = UTF8Len(w[cur]);
+      letters.push_back(TD::Convert(w.substr(cur, len)));
+      if (letset) letset->insert(letters.back());
+      cur += len;
+    }
+  }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+  Array2D<bool> a(asp.src.size(), asp.trg.size());
+  for (unsigned j = 0; j < asp.trg.size(); ++j)
+    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {  // count the current alignment of every target word in asp->posterior
+  for (unsigned j = 0; j < asp->trg.size(); ++j)
+    asp->posterior(asp->a[j].src_index, j)++;  // row 0 is the NULL alignment, row i is source word i - 1
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {  // print, for each target word, its most frequently sampled source link to stdout
+  bool first = true;  // suppresses the separating space before the first printed link
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    int src_index = -1;
+    int mc = -1;  // highest count seen so far in column j of the posterior
+    for (unsigned i = 0; i <= asp.src.size(); ++i) {  // i == 0 is the NULL row
+      if (asp.posterior(i, j) > mc) {  // strict '>' keeps the lowest row index on ties
+        mc = asp.posterior(i, j);
+        src_index = i;
+      }
+    }
+
+    if (src_index) {  // a winning NULL row (index 0) produces no output
+      if (first) first = false; else cout << ' ';
+      cout << (src_index - 1) << '-' << j;  // zero-based "source-target" pair
+    }
+  }
+  cout << endl;
+}
+
+int main(int argc, char** argv) {  // reads the parallel corpus, spells out every word, and builds the transliteration graphs
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+//  MT19937& rng = *prng;
+
+  vector<vector<int> > corpuse, corpusf;
+  set<int> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";  // e-side statistics get the e- label
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  vector<AlignedSentencePair> corpus(corpuse.size());
+  for (unsigned i = 0; i < corpuse.size(); ++i) {
+    corpus[i].src.swap(corpusf[i]);  // the f side becomes the source ...
+    corpus[i].trg.swap(corpuse[i]);  // ... and the e side becomes the target
+    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());  // one extra row for the NULL alignment
+  }
+  corpusf.clear(); corpuse.clear();
+
+  vocabf.insert(TD::Convert("NULL"));
+  vector<vector<WordID> > letters(TD::NumWords());
+  set<WordID> letset;
+  ExtractLetters(vocabe, &letters, &letset);
+  ExtractLetters(vocabf, &letters, NULL);
+  letters[TD::Convert("NULL")].clear();  // the NULL word gets an empty spelling
+
+  Transliterations tl;
+
+  // TODO CONFIGURE THIS
+  int min_trans_src = 4;
+
+  cerr << "Initializing transliteration DPs ...\n";
+  for (int i = 0; i < corpus.size(); ++i) {
+    const vector<int>& src = corpus[i].src;
+    const vector<int>& trg = corpus[i].trg;
+    cerr << '.' << flush;
+    if (i % 80 == 79) cerr << endl;
+    for (int j = 0; j < src.size(); ++j) {
+      const vector<int>& src_let = letters[src[j]];
+      for (int k = 0; k < trg.size(); ++k) {
+        const vector<int>& trg_let = letters[trg[k]];
+        if (src_let.size() < min_trans_src)
+          tl.Forbid(src[j], trg[k]);
+        else
+          tl.Initialize(src[j], src_let, trg[k], trg_let);
+      }
+    }
+  }
+  cerr << endl;
+  tl.GraphSummary();
+
+  return 0;
+}
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index 8202778b..81ddb206 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -56,6 +56,12 @@ struct MConditionalTranslationModel {
   };
 
   void ResampleHyperparameters(MT19937* rng) {
+    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+#if 1
+    for (it = r.begin(); it != r.end(); ++it) {
+      it->second.resample_hyperparameters(rng);
+    }
+#else
     const unsigned nloop = 5;
     const unsigned niterations = 10;
     DiscountResampler dr(*this);
@@ -70,12 +76,12 @@ struct MConditionalTranslationModel {
     }
     strength = slice_sampler1d(ar, strength, *rng, -d,
                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
     std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl;
     for (it = r.begin(); it != r.end(); ++it) {
       it->second.set_discount(d);
       it->second.set_strength(strength);
     }
+#endif
   }
 
   int DecrementRule(const TRule& rule, MT19937* rng) {
@@ -91,7 +97,8 @@ struct MConditionalTranslationModel {
   int IncrementRule(const TRule& rule, MT19937* rng) {
     RuleModelHash::iterator it = r.find(rule.f_);
     if (it == r.end()) {
-      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
+      //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
+      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first;
     }
     p0s[0] = rp0(rule); 
     TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc
new file mode 100644
index 00000000..0f44fe95
--- /dev/null
+++ b/gi/pf/nuisance_test.cc
@@ -0,0 +1,161 @@
+#include "ccrp.h"
+
+#include <vector>
+#include <iostream>
+
+#include "tdict.h"
+#include "transliterations.h"
+
+using namespace std;
+
+MT19937 rng;
+
+ostream& operator<<(ostream&os, const vector<int>& v) {  // prints v as "[a b ...]"; an empty v prints "[]"
+  os << '[';
+  for (unsigned i = 0; i < v.size(); ++i) { if (i) os << ' '; os << v[i]; }  // never touches v[0] of an empty vector
+  return os << ']';
+}
+
+struct Base {
+  Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {}
+  inline double p0(const vector<int>& x) const {
+    double p = 0.75;
+    if (x.size() == 2) p = 0.25;
+    p *= 1.0 / 3.0;
+    if (x.size() == 2) p *= 1.0 / 3.0;
+    return p;
+  }
+  double est_deriv_prob(int a, int b, int seg) const {
+    assert(a > 0 && a < 4);  // a \in {1,2,3}
+    assert(b > 0 && b < 4);  // b \in {1,2,3}
+    assert(seg == 0 || seg == 1);   // seg \in {0,1}
+    if (seg == 0) {
+      v[0] = a;
+      v[1] = b;
+      return crp.prob(v, p0(v));
+    } else {
+      v1[0] = a;
+      v2[0] = b;
+      return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2));
+    }
+  }
+  double est_marginal_prob(int a, int b) const {
+    return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1);
+  }
+  int increment(int a, int b, double* pw = NULL) {
+    double p1 = est_deriv_prob(a, b, 0);
+    double p2 = est_deriv_prob(a, b, 1);
+    //p1 = 0.5; p2 = 0.5;
+    int seg = rng.SelectSample(p1,p2);
+    double tmp = 0;
+    if (!pw) pw = &tmp;
+    double& w = *pw;
+    if (seg == 0) {
+      v[0] = a;
+      v[1] = b;
+      w = crp.prob(v, p0(v)) / p1;
+      if (crp.increment(v, p0(v), &rng)) {
+        llh += log(p0(v));
+      }
+    } else {
+      v1[0] = a;
+      w = crp.prob(v1, p0(v1)) / p2;
+      if (crp.increment(v1, p0(v1), &rng)) {
+        llh += log(p0(v1));
+      }
+      v2[0] = b;
+      w *= crp.prob(v2, p0(v2));
+      if (crp.increment(v2, p0(v2), &rng)) {
+        llh += log(p0(v2));
+      }
+    }
+    return seg;
+  }
+  void increment(int a, int b, int seg) {
+    if (seg == 0) {
+      v[0] = a;
+      v[1] = b;
+      if (crp.increment(v, p0(v), &rng)) {
+        llh += log(p0(v));
+      }
+    } else {
+      v1[0] = a;
+      if (crp.increment(v1, p0(v1), &rng)) {
+        llh += log(p0(v1));
+      }
+      v2[0] = b;
+      if (crp.increment(v2, p0(v2), &rng)) {
+        llh += log(p0(v2));
+      }
+    }
+  }
+  void decrement(int a, int b, int seg) {
+    if (seg == 0) {
+      v[0] = a;
+      v[1] = b;
+      if (crp.decrement(v, &rng)) {
+        llh -= log(p0(v));
+      }
+    } else {
+      v1[0] = a;
+      if (crp.decrement(v1, &rng)) {
+        llh -= log(p0(v1));
+      }
+      v2[0] = b;
+      if (crp.decrement(v2, &rng)) {
+        llh -= log(p0(v2));
+      }
+    }
+  }
+  double log_likelihood() const {
+    return llh + crp.log_crp_prob();
+  }
+  double llh;
+  mutable vector<int> v, v1, v2;
+  CCRP<vector<int> > crp;
+};
+
+int main(int argc, char** argv) {
+  double tl = 0;
+  const int ITERS = 1000;
+  const int PARTICLES = 20;
+  const int DATAPOINTS = 50;
+  WordID x = TD::Convert("souvenons");
+  WordID y = TD::Convert("remember");
+  vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src);
+  vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg);
+  Transliterations xx;
+  xx.Initialize(x, src, y, trg);
+  return 1;
+
+ for (int j = 0; j < ITERS; ++j) {
+  Base b;
+  vector<int> segs(DATAPOINTS);
+  SampleSet<double> ss;
+  vector<int> sss;
+  for (int i = 0; i < DATAPOINTS; i++) {
+    ss.clear();
+    sss.clear();
+    int x = ((i / 10) % 3) + 1;
+    int y = (i % 3) + 1;
+    //double ep = b.est_marginal_prob(x,y);
+    //cerr << "est p(" << x << "," << y << ") = " << ep << endl;
+    for (int n = 0; n < PARTICLES; ++n) {
+      double w;
+      int seg = b.increment(x,y,&w);
+      //cerr << seg << " w=" << w << endl;
+      ss.add(w);
+      sss.push_back(seg);
+      b.decrement(x,y,seg);
+    }
+    int seg = sss[rng.SelectSample(ss)];
+    b.increment(x, y, seg);
+    //cerr << "Selected: " << seg << endl;
+    //return 1;
+    segs[i] = seg;
+  }
+  tl += b.log_likelihood();
+ }
+  cerr << "LLH=" << tl / ITERS << endl;
+}
+
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
new file mode 100644
index 00000000..6e0c2e93
--- /dev/null
+++ b/gi/pf/transliterations.cc
@@ -0,0 +1,193 @@
+#include "transliterations.h"
+
+#include <iostream>
+#include <vector>
+#include <tr1/unordered_map>
+
+#include "grammar.h"
+#include "bottom_up_parser.h"
+#include "hg.h"
+#include "hg_intersect.h"
+#include "filelib.h"
+#include "ccrp.h"
+#include "m.h"
+#include "lattice.h"
+#include "verbose.h"
+
+using namespace std;
+using namespace std::tr1;
+
+static WordID kX;
+static int kMAX_SRC_SIZE = 0;
+static vector<vector<WordID> > cur_trg_chunks;
+
+vector<GrammarIter*> tlttofreelist;
+
+static void InitTargetChunks(int max_size, const vector<WordID>& trg) {
+  cur_trg_chunks.clear();
+  vector<WordID> tmp;
+  unordered_set<vector<WordID>, boost::hash<vector<WordID> > > u;
+  for (int len = 1; len <= max_size; ++len) {
+    int end = trg.size() + 1;
+    end -= len;
+    for (int i = 0; i < end; ++i) {
+      tmp.clear();
+      for (int j = 0; j < len; ++j)
+        tmp.push_back(trg[i + j]);
+      if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp);
+    }
+  }
+}
+
+struct TransliterationGrammarIter : public GrammarIter, public RuleBin {
+  TransliterationGrammarIter() { tlttofreelist.push_back(this); }
+  TransliterationGrammarIter(const TRulePtr& inr, int symbol) {
+    if (inr) {
+      r.reset(new TRule(*inr));
+    } else {
+      r.reset(new TRule);
+    }
+    TRule& rr = *r;
+    rr.lhs_ = kX;
+    rr.f_.push_back(symbol);
+    tlttofreelist.push_back(this);
+  }
+  virtual int GetNumRules() const {
+    if (!r) return 0;
+    return cur_trg_chunks.size();
+  }
+  virtual TRulePtr GetIthRule(int i) const {
+    TRulePtr nr(new TRule(*r));
+    nr->e_ = cur_trg_chunks[i];
+    //cerr << nr->AsString() << endl;
+    return nr;
+  }
+  virtual int Arity() const {
+    return 0;
+  }
+  virtual const RuleBin* GetRules() const {
+    if (!r) return NULL; else return this;
+  }
+  virtual const GrammarIter* Extend(int symbol) const {
+    if (symbol <= 0) return NULL;
+    if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE)
+      return new TransliterationGrammarIter(r, symbol);
+    else
+      return NULL;
+  }
+  TRulePtr r;
+};
+
+struct TransliterationGrammar : public Grammar {
+  virtual const GrammarIter* GetRoot() const {
+    return new TransliterationGrammarIter;
+  }
+  virtual bool HasRuleForSpan(int, int, int distance) const {
+    return (distance < kMAX_SRC_SIZE);
+  }
+};
+
+struct TInfo {
+  TInfo() : initialized(false) {}
+  bool initialized;
+  Hypergraph lattice;   // may be empty if transliteration is not possible
+  prob_t est_prob;      // will be zero if not possible
+};
+
+struct TransliterationsImpl {
+  TransliterationsImpl() {
+    kX = TD::Convert("X")*-1;
+    kMAX_SRC_SIZE = 4;
+    grammars.push_back(GrammarPtr(new TransliterationGrammar));
+    grammars.push_back(GrammarPtr(new GlueGrammar("S", "X")));
+    SetSilent(true);
+  }
+
+  void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+    if (src >= graphs.size()) graphs.resize(src + 1);
+    if (graphs[src][trg].initialized) return;
+    int kMAX_TRG_SIZE = 4;
+    InitTargetChunks(kMAX_TRG_SIZE, trg_lets);
+    ExhaustiveBottomUpParser parser("S", grammars);
+    Lattice lat(src_lets.size()), tlat(trg_lets.size());
+    for (unsigned i = 0; i < src_lets.size(); ++i)
+      lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1));
+    for (unsigned i = 0; i < trg_lets.size(); ++i)
+      tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1));
+    //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl;
+    //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl;
+    if (!parser.Parse(lat, &graphs[src][trg].lattice)) {
+      //cerr << "Failed to parse " << TD::GetString(src_lets) << endl;
+      abort();
+    }
+    if (HG::Intersect(tlat, &graphs[src][trg].lattice)) {
+      graphs[src][trg].est_prob = prob_t(1e-4);
+    } else {
+      graphs[src][trg].lattice.clear();
+      //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl;
+      graphs[src][trg].est_prob = prob_t::Zero();
+    }
+    for (unsigned i = 0; i < tlttofreelist.size(); ++i)
+      delete tlttofreelist[i];
+    tlttofreelist.clear();
+    //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl;
+    graphs[src][trg].initialized = true;
+  }
+
+  const prob_t& EstimateProbability(WordID src, WordID trg) const {
+    assert(src < graphs.size());
+    const unordered_map<WordID, TInfo>& um = graphs[src];
+    const unordered_map<WordID, TInfo>::const_iterator it = um.find(trg);
+    assert(it != um.end());
+    assert(it->second.initialized);
+    return it->second.est_prob;
+  }
+
+  void Forbid(WordID src, WordID trg) {
+    if (src >= graphs.size()) graphs.resize(src + 1);
+    graphs[src][trg].est_prob = prob_t::Zero();
+    graphs[src][trg].initialized = true;
+  }
+
+  void GraphSummary() const {
+    double tlp = 0;
+    int tt = 0;
+    for (int i = 0; i < graphs.size(); ++i) {
+      const unordered_map<WordID, TInfo>& um = graphs[i];
+      unordered_map<WordID, TInfo>::const_iterator it;
+      for (it = um.begin(); it != um.end(); ++it) {
+        if (it->second.lattice.empty()) continue;
+        //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl;
+        tlp += log(it->second.lattice.NumberOfPaths());
+        tt++;
+      }
+    }
+    tlp /= tt;
+    cerr << "E[log paths] = " << tlp << endl;
+    cerr << "exp(E[log paths]) = " << exp(tlp) << endl;
+  }
+
+  vector<unordered_map<WordID, TInfo> > graphs;
+  vector<GrammarPtr> grammars;
+};
+
+Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {}
+Transliterations::~Transliterations() { delete pimpl_; }
+
+void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+  pimpl_->Initialize(src, src_lets, trg, trg_lets);
+}
+
+prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const {
+  return pimpl_->EstimateProbability(src,trg);
+}
+
+void Transliterations::Forbid(WordID src, WordID trg) {
+  pimpl_->Forbid(src, trg);
+}
+
+void Transliterations::GraphSummary() const {
+  pimpl_->GraphSummary();
+}
+
+
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
new file mode 100644
index 00000000..a548aacf
--- /dev/null
+++ b/gi/pf/transliterations.h
@@ -0,0 +1,20 @@
+#ifndef _TRANSLITERATIONS_H_
+#define _TRANSLITERATIONS_H_
+
+#include <vector>
+#include "wordid.h"
+#include "prob.h"
+
+struct TransliterationsImpl;
+struct Transliterations {
+  explicit Transliterations();
+  ~Transliterations();
+  void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
+  void Forbid(WordID src, WordID trg);
+  void GraphSummary() const;
+  prob_t EstimateProbability(WordID src, WordID trg) const;
+  TransliterationsImpl* pimpl_;
+};
+
+#endif
+
-- 
cgit v1.2.3


From 9399d6e1f1112d67dd842086a3225387ea55725c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 8 Mar 2012 01:46:32 -0500
Subject: simple context feature for tagger

---
 decoder/Makefile.am       |   1 +
 decoder/cdec_ff.cc        |   2 +
 decoder/ff_context.cc     |  99 +++++++++++++++++++++++
 decoder/ff_context.h      |  23 ++++++
 gi/pf/align-tl.cc         |   6 +-
 gi/pf/reachability.cc     |   2 +
 gi/pf/reachability.h      |   6 +-
 gi/pf/transliterations.cc | 198 ++++++++++++++--------------------------------
 gi/pf/transliterations.h  |   5 +-
 9 files changed, 194 insertions(+), 148 deletions(-)
 create mode 100644 decoder/ff_context.cc
 create mode 100644 decoder/ff_context.h

(limited to 'gi/pf')

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 30eaf04d..a00b18af 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -63,6 +63,7 @@ libcdec_a_SOURCES = \
   ff.cc \
   ff_rules.cc \
   ff_wordset.cc \
+  ff_context.cc \
   ff_charset.cc \
   ff_lm.cc \
   ff_klm.cc \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 4ce5749e..b516c386 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -1,6 +1,7 @@
 #include <boost/shared_ptr.hpp>
 
 #include "ff.h"
+#include "ff_context.h"
 #include "ff_spans.h"
 #include "ff_lm.h"
 #include "ff_klm.h"
@@ -42,6 +43,7 @@ void register_feature_functions() {
 #endif
   ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>());
   ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>());
+  ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>());
   ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>());
   ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>);
   ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>);
diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc
new file mode 100644
index 00000000..19f9a413
--- /dev/null
+++ b/decoder/ff_context.cc
@@ -0,0 +1,99 @@
+#include "ff_context.h"
+
+#include <sstream>
+#include <cassert>
+#include <cmath>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "sentence_metadata.h"
+#include "lattice.h"
+#include "fdict.h"
+#include "verbose.h"
+
+using namespace std;
+
+namespace {
+  // Returns a copy of x in which every '=' and every ';' has been
+  // replaced by '_'; all other characters are left untouched.
+  string Escape(const string& x) {
+    string cleaned(x);
+    for (string::iterator c = cleaned.begin(); c != cleaned.end(); ++c)
+      if (*c == '=' || *c == ';') *c = '_';
+    return cleaned;
+  }
+}
+
+RuleContextFeatures::RuleContextFeatures(const std::string& param) {
+  kSOS = TD::Convert("<s>");
+  kEOS = TD::Convert("</s>");
+
+  // TODO param lets you pass in a string from the cdec.ini file
+}
+
+void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) {
+  const Lattice& sl = smeta.GetSourceLattice();
+  current_input.resize(sl.size());
+  for (unsigned i = 0; i < sl.size(); ++i) {
+    if (sl[i].size() != 1) {
+      cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl;
+      abort();
+    }
+    current_input[i] = sl[i][0].label;
+  }
+}
+
+void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                const Hypergraph::Edge& edge,
+                                                const vector<const void*>& ant_contexts,
+                                                SparseVector<double>* features,
+                                                SparseVector<double>* estimated_features,
+                                                void* context) const {
+  const TRule& rule = *edge.rule_;
+
+  if (rule.Arity() != 0 || // arity = 0, no nonterminals
+      rule.e_.size() != 1) return; // size = 1, predicted label is a single token
+
+
+  // you can see the current label "for free"
+  const WordID cur_label = rule.e_[0];
+  // (if you want to see more labels, you have to be very careful, and muck
+  //  about with contexts and ant_contexts)
+
+  // but... you can look at as much of the source as you want!
+  const int from_src_index = edge.i_;   // start of the span in the input being labeled
+  const int to_src_index = edge.j_;     // end of the span in the input
+  // (note: in the case of tagging the size of the spans being labeled will
+  //  always be 1, but in other formalisms, you can have bigger spans.)
+
+  // this is the current token being labeled:
+  const WordID cur_input = current_input[from_src_index];
+
+  // let's get the previous token in the input (may be to the left of the start
+  // of the sentence!)
+  WordID prev_input = kSOS;
+  if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; }
+  // let's get the next token (may be to the right of the end of the sentence!)
+  WordID next_input = kEOS;
+  if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; }
+
+  // now, build a feature string
+  ostringstream os;
+  // TD::Convert converts from the internal integer representation of a token
+  // to the actual token
+  os << "C1:" << TD::Convert(prev_input) << '_' 
+     << TD::Convert(cur_input) << '|' << TD::Convert(cur_label);
+  // C1 is just to prevent a name clash
+
+  // pick a value
+  double fval = 1.0; // can be any real value
+
+  // add it to the feature vector: FD::Convert converts the feature string to
+  // a feature int, and Escape makes sure the feature string doesn't have any
+  // bad symbols that could confuse a parser somewhere
+  features->add_value(FD::Convert(Escape(os.str())), fval);
+  // that's it!
+
+  // create more features if you like...
+}
+
diff --git a/decoder/ff_context.h b/decoder/ff_context.h
new file mode 100644
index 00000000..0d22b027
--- /dev/null
+++ b/decoder/ff_context.h
@@ -0,0 +1,23 @@
+#ifndef _FF_CONTEXT_H_
+#define _FF_CONTEXT_H_
+
+#include <vector>
+#include "ff.h"
+
+class RuleContextFeatures : public FeatureFunction {
+ public:
+  RuleContextFeatures(const std::string& param);
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;
+  virtual void PrepareForInput(const SentenceMetadata& smeta);
+ private:
+  std::vector<WordID> current_input;
+  WordID kSOS, kEOS;
+};
+
+#endif
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index 0e0454e5..6bb8c886 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -310,18 +310,16 @@ int main(int argc, char** argv) {
   // TODO CONFIGURE THIS
   int min_trans_src = 4;
 
-  cerr << "Initializing transliteration DPs ...\n";
+  cerr << "Initializing transliteration graph structures ...\n";
   for (int i = 0; i < corpus.size(); ++i) {
     const vector<int>& src = corpus[i].src;
     const vector<int>& trg = corpus[i].trg;
-    cerr << '.' << flush;
-    if (i % 80 == 79) cerr << endl;
     for (int j = 0; j < src.size(); ++j) {
       const vector<int>& src_let = letters[src[j]];
       for (int k = 0; k < trg.size(); ++k) {
         const vector<int>& trg_let = letters[trg[k]];
         if (src_let.size() < min_trans_src)
-          tl.Forbid(src[j], trg[k]);
+          tl.Forbid(src[j], src_let, trg[k], trg_let);
         else
           tl.Initialize(src[j], src_let, trg[k], trg_let);
       }
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index 73dd8d39..70fb76da 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -47,6 +47,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
           r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
           int src_delta = i - prevs[k].prev_src_covered;
           edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+          valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered));
           short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
           if (src_delta > msd) msd = src_delta;
         }
@@ -56,6 +57,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
     assert(!edges[0][0][0][1]);
     assert(!edges[0][0][0][0]);
     assert(max_src_delta[0][0] > 0);
+    cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n";
     //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
     //for (int i = 0; i < b[0][0].size(); ++i) {
     //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 98450ec1..fb2f4965 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -12,12 +12,14 @@
 // currently forbids 0 -> n and n -> 0 alignments
 
 struct Reachability {
-  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring?
+  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
   boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+  boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
 
   Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
       edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
-      max_src_delta(boost::extents[srclen][trglen]) {
+      max_src_delta(boost::extents[srclen][trglen]),
+      valid_deltas(boost::extents[srclen][trglen]) {
     ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
   }
 
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 6e0c2e93..e29334fd 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -2,173 +2,92 @@
 
 #include <iostream>
 #include <vector>
-#include <tr1/unordered_map>
 
-#include "grammar.h"
-#include "bottom_up_parser.h"
-#include "hg.h"
-#include "hg_intersect.h"
+#include "boost/shared_ptr.hpp"
+
 #include "filelib.h"
 #include "ccrp.h"
 #include "m.h"
-#include "lattice.h"
-#include "verbose.h"
+#include "reachability.h"
 
 using namespace std;
 using namespace std::tr1;
 
-static WordID kX;
-static int kMAX_SRC_SIZE = 0;
-static vector<vector<WordID> > cur_trg_chunks;
-
-vector<GrammarIter*> tlttofreelist;
-
-static void InitTargetChunks(int max_size, const vector<WordID>& trg) {
-  cur_trg_chunks.clear();
-  vector<WordID> tmp;
-  unordered_set<vector<WordID>, boost::hash<vector<WordID> > > u;
-  for (int len = 1; len <= max_size; ++len) {
-    int end = trg.size() + 1;
-    end -= len;
-    for (int i = 0; i < end; ++i) {
-      tmp.clear();
-      for (int j = 0; j < len; ++j)
-        tmp.push_back(trg[i + j]);
-      if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp);
-    }
-  }
-}
-
-struct TransliterationGrammarIter : public GrammarIter, public RuleBin {
-  TransliterationGrammarIter() { tlttofreelist.push_back(this); }
-  TransliterationGrammarIter(const TRulePtr& inr, int symbol) {
-    if (inr) {
-      r.reset(new TRule(*inr));
-    } else {
-      r.reset(new TRule);
-    }
-    TRule& rr = *r;
-    rr.lhs_ = kX;
-    rr.f_.push_back(symbol);
-    tlttofreelist.push_back(this);
-  }
-  virtual int GetNumRules() const {
-    if (!r) return 0;
-    return cur_trg_chunks.size();
-  }
-  virtual TRulePtr GetIthRule(int i) const {
-    TRulePtr nr(new TRule(*r));
-    nr->e_ = cur_trg_chunks[i];
-    //cerr << nr->AsString() << endl;
-    return nr;
-  }
-  virtual int Arity() const {
-    return 0;
-  }
-  virtual const RuleBin* GetRules() const {
-    if (!r) return NULL; else return this;
-  }
-  virtual const GrammarIter* Extend(int symbol) const {
-    if (symbol <= 0) return NULL;
-    if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE)
-      return new TransliterationGrammarIter(r, symbol);
-    else
-      return NULL;
-  }
-  TRulePtr r;
-};
-
-struct TransliterationGrammar : public Grammar {
-  virtual const GrammarIter* GetRoot() const {
-    return new TransliterationGrammarIter;
-  }
-  virtual bool HasRuleForSpan(int, int, int distance) const {
-    return (distance < kMAX_SRC_SIZE);
-  }
-};
-
-struct TInfo {
-  TInfo() : initialized(false) {}
+struct GraphStructure {
+  GraphStructure() : initialized(false) {}
+  boost::shared_ptr<Reachability> r;
   bool initialized;
-  Hypergraph lattice;   // may be empty if transliteration is not possible
-  prob_t est_prob;      // will be zero if not possible
 };
 
 struct TransliterationsImpl {
   TransliterationsImpl() {
-    kX = TD::Convert("X")*-1;
-    kMAX_SRC_SIZE = 4;
-    grammars.push_back(GrammarPtr(new TransliterationGrammar));
-    grammars.push_back(GrammarPtr(new GlueGrammar("S", "X")));
-    SetSilent(true);
   }
 
   void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
-    if (src >= graphs.size()) graphs.resize(src + 1);
-    if (graphs[src][trg].initialized) return;
-    int kMAX_TRG_SIZE = 4;
-    InitTargetChunks(kMAX_TRG_SIZE, trg_lets);
-    ExhaustiveBottomUpParser parser("S", grammars);
-    Lattice lat(src_lets.size()), tlat(trg_lets.size());
-    for (unsigned i = 0; i < src_lets.size(); ++i)
-      lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1));
-    for (unsigned i = 0; i < trg_lets.size(); ++i)
-      tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1));
-    //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl;
-    //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl;
-    if (!parser.Parse(lat, &graphs[src][trg].lattice)) {
-      //cerr << "Failed to parse " << TD::GetString(src_lets) << endl;
-      abort();
-    }
-    if (HG::Intersect(tlat, &graphs[src][trg].lattice)) {
-      graphs[src][trg].est_prob = prob_t(1e-4);
+    const size_t src_len = src_lets.size();
+    const size_t trg_len = trg_lets.size();
+    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
+    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
+    if (graphs[src_len][trg_len].initialized) return;
+    graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4));
+
+#if 0
+    if (HG::Intersect(tlat, &hg)) {
+      // TODO
     } else {
-      graphs[src][trg].lattice.clear();
-      //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl;
-      graphs[src][trg].est_prob = prob_t::Zero();
+      cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl;
+      hg.clear();
     }
-    for (unsigned i = 0; i < tlttofreelist.size(); ++i)
-      delete tlttofreelist[i];
-    tlttofreelist.clear();
     //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl;
-    graphs[src][trg].initialized = true;
+#endif
+    graphs[src_len][trg_len].initialized = true;
   }
 
-  const prob_t& EstimateProbability(WordID src, WordID trg) const {
-    assert(src < graphs.size());
-    const unordered_map<WordID, TInfo>& um = graphs[src];
-    const unordered_map<WordID, TInfo>::const_iterator it = um.find(trg);
-    assert(it != um.end());
-    assert(it->second.initialized);
-    return it->second.est_prob;
+  void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+    const size_t src_len = src_lets.size();
+    const size_t trg_len = trg_lets.size();
+    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
+    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
+    graphs[src_len][trg_len].r.reset();
+    graphs[src_len][trg_len].initialized = true;
   }
 
-  void Forbid(WordID src, WordID trg) {
-    if (src >= graphs.size()) graphs.resize(src + 1);
-    graphs[src][trg].est_prob = prob_t::Zero();
-    graphs[src][trg].initialized = true;
+  prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
+    assert(src.size() < graphs.size());
+    const vector<GraphStructure>& tv = graphs[src.size()];
+    assert(trg.size() < tv.size());
+    const GraphStructure& gs = tv[trg.size()];
+    // TODO: do prob
+    return prob_t::Zero();
   }
 
   void GraphSummary() const {
-    double tlp = 0;
-    int tt = 0;
+    double to = 0;
+    double tn = 0;
+    double tt = 0;
     for (int i = 0; i < graphs.size(); ++i) {
-      const unordered_map<WordID, TInfo>& um = graphs[i];
-      unordered_map<WordID, TInfo>::const_iterator it;
-      for (it = um.begin(); it != um.end(); ++it) {
-        if (it->second.lattice.empty()) continue;
-        //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl;
-        tlp += log(it->second.lattice.NumberOfPaths());
+      const vector<GraphStructure>& vt = graphs[i];
+      for (int j = 0; j < vt.size(); ++j) {
+        const GraphStructure& gs = vt[j];
+        if (!gs.r) continue;
         tt++;
+        for (int k = 0; k < i; ++k) {
+          for (int l = 0; l < j; ++l) {
+            size_t c = gs.r->valid_deltas[k][l].size();
+            if (c) {
+              tn += 1;
+              to += c;
+            }
+          }
+        }
       }
     }
-    tlp /= tt;
-    cerr << "E[log paths] = " << tlp << endl;
-    cerr << "exp(E[log paths]) = " << exp(tlp) << endl;
+    cerr << "     Average nodes = " << (tn / tt) << endl;
+    cerr << "Average out-degree = " << (to / tn) << endl;
+    cerr << " Unique structures = " << tt << endl;
   }
 
-  vector<unordered_map<WordID, TInfo> > graphs;
-  vector<GrammarPtr> grammars;
+  vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
 };
 
 Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {}
@@ -178,16 +97,15 @@ void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, Wo
   pimpl_->Initialize(src, src_lets, trg, trg_lets);
 }
 
-prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const {
-  return pimpl_->EstimateProbability(src,trg);
+prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
+  return pimpl_->EstimateProbability(s, src,t, trg);
 }
 
-void Transliterations::Forbid(WordID src, WordID trg) {
-  pimpl_->Forbid(src, trg);
+void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+  pimpl_->Forbid(src, src_lets, trg, trg_lets);
 }
 
 void Transliterations::GraphSummary() const {
   pimpl_->GraphSummary();
 }
 
-
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index a548aacf..76eb2a05 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -10,9 +10,10 @@ struct Transliterations {
   explicit Transliterations();
   ~Transliterations();
   void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
-  void Forbid(WordID src, WordID trg);
+  void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
   void GraphSummary() const;
-  prob_t EstimateProbability(WordID src, WordID trg) const;
+  prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const;
+ private:
   TransliterationsImpl* pimpl_;
 };
 
-- 
cgit v1.2.3


From 2347c5c81ca2873e634975ea5197b0926a69ce53 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 8 Mar 2012 13:32:41 -0500
Subject: tl stuff

---
 gi/pf/Makefile.am         |  8 +++--
 gi/pf/align-tl.cc         |  8 +++--
 gi/pf/reachability.cc     | 17 +++++++---
 gi/pf/reachability.h      |  4 +++
 gi/pf/transliterations.cc | 82 ++++++++++++++++++++++++++++++++++-------------
 gi/pf/transliterations.h  |  3 +-
 6 files changed, 88 insertions(+), 34 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 5e89f02a..9888a70b 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -2,15 +2,17 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon
 
 noinst_LIBRARIES = libpf.a
 
-libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
+libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc
 
-nuisance_test_SOURCES = nuisance_test.cc transliterations.cc
+nuisance_test_SOURCES = nuisance_test.cc
+nuisance_test_LDADD = libpf.a
 
 align_lexonly_SOURCES = align-lexonly.cc
 
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
-align_tl_SOURCES = align-tl.cc transliterations.cc
+align_tl_SOURCES = align-tl.cc
+align_tl_LDADD = libpf.a
 
 itg_SOURCES = itg.cc
 
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index 6bb8c886..fe8950b5 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -305,7 +305,10 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Transliterations tl;
+  // TODO configure this
+  int max_src_chunk = 4;
+  int max_trg_chunk = 4;
+  Transliterations tl(max_src_chunk, max_trg_chunk);
 
   // TODO CONFIGURE THIS
   int min_trans_src = 4;
@@ -318,10 +321,9 @@ int main(int argc, char** argv) {
       const vector<int>& src_let = letters[src[j]];
       for (int k = 0; k < trg.size(); ++k) {
         const vector<int>& trg_let = letters[trg[k]];
+        tl.Initialize(src[j], src_let, trg[k], trg_let);
         if (src_let.size() < min_trans_src)
           tl.Forbid(src[j], src_let, trg[k], trg_let);
-        else
-          tl.Initialize(src[j], src_let, trg[k], trg_let);
       }
     }
   }
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index 70fb76da..59bc6ace 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -39,6 +39,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
     typedef boost::multi_array<bool, 2> rarray_type;
     rarray_type r(boost::extents[srclen + 1][trglen + 1]);
     r[srclen][trglen] = true;
+    nodes = 0;
     for (int i = srclen; i >= 0; --i) {
       for (int j = trglen; j >= 0; --j) {
         vector<SState>& prevs = a[i][j];
@@ -57,10 +58,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
     assert(!edges[0][0][0][1]);
     assert(!edges[0][0][0][0]);
     assert(max_src_delta[0][0] > 0);
-    cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n";
-    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
-    //for (int i = 0; i < b[0][0].size(); ++i) {
-    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
-    //}
+    nodes = 0;
+    for (int i = 0; i < srclen; ++i) {
+      for (int j = 0; j < trglen; ++j) {
+        if (valid_deltas[i][j].size() > 0) {
+          node_addresses[i][j] = nodes++;
+        } else {
+          node_addresses[i][j] = -1;
+        }
+      }
+    }
+    cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n";
   }
 
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index fb2f4965..1e22c76a 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -12,13 +12,17 @@
 // currently forbids 0 -> n and n -> 0 alignments
 
 struct Reachability {
+  unsigned nodes;
   boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
   boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+  boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
   boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
 
   Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+      nodes(),
       edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
       max_src_delta(boost::extents[srclen][trglen]),
+      node_addresses(boost::extents[srclen][trglen]),
       valid_deltas(boost::extents[srclen][trglen]) {
     ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
   }
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index e29334fd..61e95b82 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -14,42 +14,75 @@ using namespace std;
 using namespace std::tr1;
 
 struct GraphStructure {
-  GraphStructure() : initialized(false) {}
-  boost::shared_ptr<Reachability> r;
-  bool initialized;
+  GraphStructure() : r() {}
+  // leak memory - these are basically static
+  const Reachability* r;
+  bool IsReachable() const { return r->nodes > 0; }
+};
+
+struct BackwardEstimates {
+  BackwardEstimates() : gs(), backward() {}
+  explicit BackwardEstimates(const GraphStructure& g) :
+      gs(&g), backward() {
+    if (g.r->nodes > 0)
+      backward = new float[g.r->nodes];
+  }
+  // leak memory, these are static
+
+  // returns an estimate of the marginal probability
+  double MarginalEstimate() const {
+    if (!backward) return 0;
+    return backward[0];
+  }
+
+  // returns a backward estimate
+  double operator()(int src_covered, int trg_covered) const {
+    if (!backward) return 0;
+    int ind = gs->r->node_addresses[src_covered][trg_covered];
+    if (ind < 0) return 0;
+    return backward[ind];
+  }
+ private:
+  const GraphStructure* gs;
+  float* backward;
 };
 
 struct TransliterationsImpl {
-  TransliterationsImpl() {
+  TransliterationsImpl(int max_src, int max_trg) :
+      kMAX_SRC_CHUNK(max_src),
+      kMAX_TRG_CHUNK(max_trg),
+      tot_pairs() {
   }
 
   void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
     const size_t src_len = src_lets.size();
     const size_t trg_len = trg_lets.size();
+
+    // init graph structure
     if (src_len >= graphs.size()) graphs.resize(src_len + 1);
     if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
-    if (graphs[src_len][trg_len].initialized) return;
-    graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4));
-
-#if 0
-    if (HG::Intersect(tlat, &hg)) {
-      // TODO
-    } else {
-      cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl;
-      hg.clear();
-    }
-    //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl;
-#endif
-    graphs[src_len][trg_len].initialized = true;
+    GraphStructure& gs = graphs[src_len][trg_len];
+    if (!gs.r)
+      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+    const Reachability& r = *gs.r;
+
+    // init backward estimates
+    if (src >= bes.size()) bes.resize(src + 1);
+    unordered_map<WordID, BackwardEstimates>::iterator it = bes[src].find(trg);
+    if (it != bes[src].end()) return; // already initialized
+
+    it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first;
+    BackwardEstimates& b = it->second;
+    if (!gs.r->nodes) return;  // not derivable subject to length constraints
+
+    // TODO
+    tot_pairs++;
   }
 
   void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
     const size_t src_len = src_lets.size();
     const size_t trg_len = trg_lets.size();
-    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
-    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
-    graphs[src_len][trg_len].r.reset();
-    graphs[src_len][trg_len].initialized = true;
+    // TODO
   }
 
   prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
@@ -85,12 +118,17 @@ struct TransliterationsImpl {
     cerr << "     Average nodes = " << (tn / tt) << endl;
     cerr << "Average out-degree = " << (to / tn) << endl;
     cerr << " Unique structures = " << tt << endl;
+    cerr << "      Unique pairs = " << tot_pairs << endl;
   }
 
+  const int kMAX_SRC_CHUNK;
+  const int kMAX_TRG_CHUNK;
+  unsigned tot_pairs;
   vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
+  vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
 };
 
-Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {}
+Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {}
 Transliterations::~Transliterations() { delete pimpl_; }
 
 void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index 76eb2a05..e025547e 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -7,7 +7,8 @@
 
 struct TransliterationsImpl;
 struct Transliterations {
-  explicit Transliterations();
+  // max_src and max_trg indicate how big the transliteration phrases can be
+  explicit Transliterations(int max_src, int max_trg);
   ~Transliterations();
   void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
   void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
-- 
cgit v1.2.3


From 4695353e8e37c49c1cc4f91ff8679fff17aac404 Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Thu, 8 Mar 2012 18:47:23 +0000
Subject: fix link error on linux

---
 gi/pf/Makefile.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 9888a70b..94364c3d 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -5,14 +5,14 @@ noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc
 
 nuisance_test_SOURCES = nuisance_test.cc
-nuisance_test_LDADD = libpf.a
+nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
 align_lexonly_SOURCES = align-lexonly.cc
 
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
 align_tl_SOURCES = align-tl.cc
-align_tl_LDADD = libpf.a
+align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
 itg_SOURCES = itg.cc
 
-- 
cgit v1.2.3


From 0ab9e175f86b3fd02a4a94f350282210aba054e3 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 8 Mar 2012 14:29:42 -0500
Subject: moar

---
 gi/pf/align-tl.cc         | 15 +++++++++------
 gi/pf/reachability.cc     |  9 +++++----
 gi/pf/reachability.h      |  8 +++++---
 gi/pf/transliterations.cc | 14 ++++++++++----
 gi/pf/transliterations.h  |  3 ++-
 5 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index fe8950b5..fc9b7ca5 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -30,6 +30,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
         ("input,i",po::value<string>(),"Read parallel data from")
+        ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source")
+        ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target")
+        ("min_transliterated_src_length", po::value<unsigned>()->default_value(3), "Minimum length of source words considered for transliteration")
+        ("filter_ratio", po::value<double>()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -306,12 +310,11 @@ int main(int argc, char** argv) {
   letters[TD::Convert("NULL")].clear();
 
   // TODO configure this
-  int max_src_chunk = 4;
-  int max_trg_chunk = 4;
-  Transliterations tl(max_src_chunk, max_trg_chunk);
-
-  // TODO CONFIGURE THIS
-  int min_trans_src = 4;
+  const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
+  const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
+  const double filter_rat = conf["filter_ratio"].as<double>();
+  const int min_trans_src = conf["min_transliterated_src_length"].as<unsigned>();
+  Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat);
 
   cerr << "Initializing transliteration graph structures ...\n";
   for (int i = 0; i < corpus.size(); ++i) {
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index 59bc6ace..c10000f2 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -12,7 +12,7 @@ struct SState {
   int prev_trg_covered;
 };
 
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
+void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) {
     typedef boost::multi_array<vector<SState>, 2> array_type;
     array_type a(boost::extents[srclen + 1][trglen + 1]);
     a[0][0].push_back(SState());
@@ -30,9 +30,10 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
       }
     }
     a[0][0].clear();
-    //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
-    if (a[srclen][trglen].size() == 0) {
-      cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+    //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+    size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio);
+    if (a[srclen][trglen].size() < min_allowed) {
+      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n";
       return;
     }
 
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 1e22c76a..03967d44 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -18,17 +18,19 @@ struct Reachability {
   boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
   boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
 
-  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+  // filter_ratio says if the number of incoming edges to the final cell is less than
+  //    (src_max+1) * (trg_max+1) * filter_ratio^2 then mark as non reachable
+  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) :
       nodes(),
       edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
       max_src_delta(boost::extents[srclen][trglen]),
       node_addresses(boost::extents[srclen][trglen]),
       valid_deltas(boost::extents[srclen][trglen]) {
-    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio);
   }
 
  private:
-  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
+  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio);
 };
 
 #endif
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 61e95b82..8ea4ebd2 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -48,10 +48,11 @@ struct BackwardEstimates {
 };
 
 struct TransliterationsImpl {
-  TransliterationsImpl(int max_src, int max_trg) :
+  TransliterationsImpl(int max_src, int max_trg, double fr) :
       kMAX_SRC_CHUNK(max_src),
       kMAX_TRG_CHUNK(max_trg),
-      tot_pairs() {
+      kFILTER_RATIO(fr),
+      tot_pairs(), tot_mem() {
   }
 
   void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
@@ -63,7 +64,7 @@ struct TransliterationsImpl {
     if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
     GraphStructure& gs = graphs[src_len][trg_len];
     if (!gs.r)
-      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
     const Reachability& r = *gs.r;
 
     // init backward estimates
@@ -77,6 +78,7 @@ struct TransliterationsImpl {
 
     // TODO
     tot_pairs++;
+    tot_mem += sizeof(float) * gs.r->nodes;
   }
 
   void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
@@ -119,16 +121,20 @@ struct TransliterationsImpl {
     cerr << "Average out-degree = " << (to / tn) << endl;
     cerr << " Unique structures = " << tt << endl;
     cerr << "      Unique pairs = " << tot_pairs << endl;
+    cerr << "          BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;
   }
 
   const int kMAX_SRC_CHUNK;
   const int kMAX_TRG_CHUNK;
+  const double kFILTER_RATIO;
   unsigned tot_pairs;
+  size_t tot_mem;
   vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
   vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
 };
 
-Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {}
+Transliterations::Transliterations(int max_src, int max_trg, double fr) :
+    pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
 Transliterations::~Transliterations() { delete pimpl_; }
 
 void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index e025547e..ea9f9d3f 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -8,7 +8,8 @@
 struct TransliterationsImpl;
 struct Transliterations {
   // max_src and max_trg indicate how big the transliteration phrases can be
-  explicit Transliterations(int max_src, int max_trg);
+  // see reachability.h for information about filter_ratio
+  explicit Transliterations(int max_src, int max_trg, double filter_ratio);
   ~Transliterations();
   void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
   void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
-- 
cgit v1.2.3


From 89d63600524bc042b6c2741d7d67db6a3a74dc8c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 9 Mar 2012 22:23:50 -0500
Subject: moar

---
 gi/pf/Makefile.am          |   3 +-
 gi/pf/align-lexonly-pyp.cc | 207 ++++++++++-------------------------------
 gi/pf/align-tl.cc          |  18 ++--
 gi/pf/backward.cc          |  89 ++++++++++++++++++
 gi/pf/backward.h           |  33 +++++++
 gi/pf/base_distributions.h |   8 +-
 gi/pf/guess-translits.pl   |   2 +-
 gi/pf/nuisance_test.cc     |   6 +-
 gi/pf/pyp_lm.cc            |   2 +-
 gi/pf/pyp_tm.cc            | 113 +++++++++++++++++++++++
 gi/pf/pyp_tm.h             |  34 +++++++
 gi/pf/pyp_word_model.cc    |  20 ++++
 gi/pf/pyp_word_model.h     |  58 ++++++++++++
 gi/pf/reachability.cc      |   8 +-
 gi/pf/reachability.h       |   8 +-
 gi/pf/transliterations.cc  | 223 ++++++++++++++++++++++++++++++++++++++++-----
 gi/pf/transliterations.h   |   3 +-
 utils/ccrp_nt.h            |  17 ++--
 18 files changed, 628 insertions(+), 224 deletions(-)
 create mode 100644 gi/pf/backward.cc
 create mode 100644 gi/pf/backward.h
 create mode 100644 gi/pf/pyp_tm.cc
 create mode 100644 gi/pf/pyp_tm.h
 create mode 100644 gi/pf/pyp_word_model.cc
 create mode 100644 gi/pf/pyp_word_model.h

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 94364c3d..4ce72ba1 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -2,7 +2,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon
 
 noinst_LIBRARIES = libpf.a
 
-libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc
+libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc
 
 nuisance_test_SOURCES = nuisance_test.cc
 nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
@@ -10,6 +10,7 @@ nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev
 align_lexonly_SOURCES = align-lexonly.cc
 
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
 align_tl_SOURCES = align-tl.cc
 align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 13a3a487..d68a4b8f 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -1,27 +1,18 @@
 #include <iostream>
-#include <tr1/memory>
 #include <queue>
 
-#include <boost/multi_array.hpp>
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
 #include "tdict.h"
 #include "stringlib.h"
 #include "filelib.h"
-#include "dict.h"
+#include "array2d.h"
 #include "sampler.h"
-#include "mfcr.h"
 #include "corpus.h"
-#include "ngram_base.h"
+#include "pyp_tm.h"
 
 using namespace std;
-using namespace tr1;
 namespace po = boost::program_options;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-shared_ptr<MT19937> prng;
+MT19937* prng;
 
 struct LexicalAlignment {
   unsigned char src_index;
@@ -66,159 +57,59 @@ struct AlignedSentencePair {
   Array2D<short> posterior;
 };
 
-struct HierarchicalWordBase {
-  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
-
-  void ResampleHyperparameters(MT19937* rng) {
-    r.resample_hyperparameters(rng);
-  }
-
-  inline double logp0(const vector<WordID>& s) const {
-    return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
-  }
-
-  // return p0 of rule.e_
-  prob_t operator()(const TRule& rule) const {
-    v[0].logeq(logp0(rule.e_));
-    return r.prob(rule.e_, v.begin(), l.begin());
-  }
-
-  void Increment(const TRule& rule) {
-    v[0].logeq(logp0(rule.e_));
-    if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
-      base *= v[0] * l[0];
-    }
-  }
-
-  void Decrement(const TRule& rule) {
-    if (r.decrement(rule.e_, &*prng).count) {
-      base /= prob_t(exp(logp0(rule.e_)));
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(r.log_crp_prob());
-    p *= base;
-    return p;
+struct Aligner {
+  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :  // lets: per-word spellings indexed by WordID; c: corpus whose alignments are edited in place
+      corpus(*c),  // stored as a reference, so *c has to outlive this object
+      model(lets, num_letters),
+      kNULL(TD::Convert("NULL")) {
+    assert(lets[kNULL].size() == 0);  // the NULL source word is expected to have an empty spelling
   }
 
-  void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
-    for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
-  }
-
-  prob_t base;
-  MFCR<1,vector<WordID> > r;
-  const double u0;
-  const vector<prob_t> l;
-  mutable vector<prob_t> v;
-};
-
-struct BasicLexicalAlignment {
-  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
-                                 const unsigned words_e,
-                                 const unsigned letters_e,
-                                 vector<AlignedSentencePair>* corp) :
-      letters(lets),
-      corpus(*corp),
-      //up0(words_e),
-      //up0("en.chars.1gram", letters_e),
-      //up0("en.words.1gram"),
-      up0(letters_e),
-      //up0("en.chars.2gram"),
-      tmodel(up0) {
-  }
+  vector<AlignedSentencePair>& corpus;
+  PYPLexicalTranslation model;
+  const WordID kNULL;
 
-  void InstantiateRule(const WordID src,
-                       const WordID trg,
-                       TRule* rule) const {
-    static const WordID kX = TD::Convert("X") * -1;
-    rule->lhs_ = kX;
-    rule->e_ = letters[trg];
-    rule->f_ = letters[src];
+  void ResampleHyperparameters() {  // delegates to the translation model, drawing randomness from the global prng
+    model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
-    const WordID kNULL = TD::Convert("NULL");
     cerr << "Initializing with random alignments ...\n";
     for (unsigned i = 0; i < corpus.size(); ++i) {
       AlignedSentencePair& asp = corpus[i];
       asp.a.resize(asp.trg.size());
       for (unsigned j = 0; j < asp.trg.size(); ++j) {
-        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        unsigned char& a_j = asp.a[j].src_index;  // alignment slot of target word j: 0 = NULL, k > 0 = source word k-1
+        a_j = prng->next() * (1 + asp.src.size());  // random slot in 0..|src| (assumes next() is in [0,1) -- TODO confirm); NOTE(review): an unsigned char cannot hold slots for sources longer than 255 words
         const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-        TRule r;
-        InstantiateRule(f_a_j, asp.trg[j], &r);
-        asp.a[j].is_transliteration = false;
-        asp.a[j].src_index = a_j;
-        if (tmodel.IncrementRule(r, &*prng))
-          up0.Increment(r);
+        model.Increment(f_a_j, asp.trg[j], &*prng);  // register the chosen (source, target) pair with the model
       }
     }
-    cerr << "  LLH = " << Likelihood() << endl;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = tmodel.Likelihood();
-    p *= up0.Likelihood();
-    return p;
-  }
-
-  void ResampleHyperparemeters() {
-    tmodel.ResampleHyperparameters(&*prng);
-    up0.ResampleHyperparameters(&*prng);
-    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
+    cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl;  // report the model likelihood after initialization
   }
 
-  void ResampleCorpus();
-
-  const vector<vector<WordID> >& letters; // spelling dictionary
-  vector<AlignedSentencePair>& corpus;
-  //PhraseConditionalUninformativeBase up0;
-  //PhraseConditionalUninformativeUnigramBase up0;
-  //UnigramWordBase up0;
-  //HierarchicalUnigramBase up0;
-  HierarchicalWordBase up0;
-  //CompletelyUniformBase up0;
-  //FixedNgramBase up0;
-  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
-  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
-  //ConditionalTranslationModel<UnigramWordBase> tmodel;
-  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
-  MConditionalTranslationModel<HierarchicalWordBase> tmodel;
-  //ConditionalTranslationModel<FixedNgramBase> tmodel;
-  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
-  static const WordID kNULL = TD::Convert("NULL");
-  for (unsigned i = 0; i < corpus.size(); ++i) {
-    AlignedSentencePair& asp = corpus[i];
-    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
-    for (unsigned j = 0; j < asp.trg.size(); ++j) {
-      TRule r;
-      unsigned char& a_j = asp.a[j].src_index;
-      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.DecrementRule(r, &*prng))
-        up0.Decrement(r);
-
-      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
-        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
-        InstantiateRule(prop_f, asp.trg[j], &r);
-        ss[prop_a_j] = tmodel.RuleProbability(r);
+  void ResampleCorpus() {  // one sweep over the corpus: re-draws the alignment of every target word, then logs the likelihood
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);  // one weight per candidate: index 0 is NULL, index k is source word k-1
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        unsigned char& a_j = asp.a[j].src_index;
+        const WordID e_j = asp.trg[j];
+        WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        model.Decrement(f_a_j, e_j, prng);  // take the current link out of the model before scoring alternatives
+        // Score every candidate source slot (0 = NULL) for e_j under the model.
+        for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+          const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+          ss[prop_a_j] = model.Prob(prop_f, e_j);
+        }
+        a_j = prng->SelectSample(ss);  // pick the new slot from the weights in ss (presumably proportionally -- see sampler.h)
+        f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        model.Increment(f_a_j, e_j, prng);  // put the newly chosen link back into the model
       }
-      a_j = prng->SelectSample(ss);
-      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.IncrementRule(r, &*prng))
-        up0.Increment(r);
     }
+    cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl;  // second number: count of conditioning contexts held by the model
   }
-  cerr << "  LLH = " << Likelihood() << endl;
-}
+};
 
 void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
   for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
@@ -240,8 +131,10 @@ void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID
 void Debug(const AlignedSentencePair& asp) {
   cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
   Array2D<bool> a(asp.src.size(), asp.trg.size());
-  for (unsigned j = 0; j < asp.trg.size(); ++j)
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {  // mark every non-NULL link in the src x trg grid
+    assert(asp.a[j].src_index <= asp.src.size());  // valid values are 0 (NULL) through |src|
     if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  }
   cerr << a << endl;
 }
 
@@ -275,10 +168,9 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
 
   if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+    prng = new MT19937(conf["random_seed"].as<uint32_t>());
   else
-    prng.reset(new MT19937);
-//  MT19937& rng = *prng;
+    prng = new MT19937;
 
   vector<vector<int> > corpuse, corpusf;
   set<int> vocabe, vocabf;
@@ -304,23 +196,18 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
-  x.InitializeRandom();
+  Aligner aligner(letters, letset.size(), &corpus);
+  aligner.InitializeRandom();
+
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 7 == 6) x.ResampleHyperparemeters();
-    x.ResampleCorpus();
+    if (i % 7 == 6) aligner.ResampleHyperparameters();
+    aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);
-  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
-  x.tmodel.Summary();
-  x.up0.Summary();
-
-  //posterior.Sample();
 
   return 0;
 }
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index fc9b7ca5..cbe8c6c8 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -6,6 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "backward.h"
 #include "array2d.h"
 #include "base_distributions.h"
 #include "monotonic_pseg.h"
@@ -30,10 +31,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
         ("input,i",po::value<string>(),"Read parallel data from")
+        ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities")
+        ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities")
         ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source")
         ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target")
-        ("min_transliterated_src_length", po::value<unsigned>()->default_value(3), "Minimum length of source words considered for transliteration")
-        ("filter_ratio", po::value<double>()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable")
+        ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -303,7 +305,7 @@ int main(int argc, char** argv) {
   corpusf.clear(); corpuse.clear();
 
   vocabf.insert(TD::Convert("NULL"));
-  vector<vector<WordID> > letters(TD::NumWords());
+  vector<vector<WordID> > letters(TD::NumWords() + 1);
   set<WordID> letset;
   ExtractLetters(vocabe, &letters, &letset);
   ExtractLetters(vocabf, &letters, NULL);
@@ -312,9 +314,9 @@ int main(int argc, char** argv) {
   // TODO configure this
   const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
   const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
-  const double filter_rat = conf["filter_ratio"].as<double>();
-  const int min_trans_src = conf["min_transliterated_src_length"].as<unsigned>();
-  Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat);
+  const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>();
+  const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>());
+  Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); 
 
   cerr << "Initializing transliteration graph structures ...\n";
   for (int i = 0; i < corpus.size(); ++i) {
@@ -325,8 +327,8 @@ int main(int argc, char** argv) {
       for (int k = 0; k < trg.size(); ++k) {
         const vector<int>& trg_let = letters[trg[k]];
         tl.Initialize(src[j], src_let, trg[k], trg_let);
-        if (src_let.size() < min_trans_src)
-          tl.Forbid(src[j], src_let, trg[k], trg_let);
+        //if (src_let.size() < min_trans_src)
+        //  tl.Forbid(src[j], src_let, trg[k], trg_let);
       }
     }
   }
diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc
new file mode 100644
index 00000000..b92629fd
--- /dev/null
+++ b/gi/pf/backward.cc
@@ -0,0 +1,89 @@
+#include "backward.h"
+
+#include <queue>
+#include <utility>
+
+#include "array2d.h"
+#include "reachability.h"
+#include "base_distributions.h"
+
+using namespace std;
+// Builds the forward (m1) and inverse (m1inv) Model1 tables from s2t and t2s; Model1's constructor takes a file name.
+BackwardEstimator::BackwardEstimator(const string& s2t,
+                    const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {}
+// Releases the two Model1 tables allocated by the constructor.
+BackwardEstimator::~BackwardEstimator() {
+  delete m1; m1 = NULL;
+  delete m1inv; m1inv = NULL;
+}
+// Log score for the uncovered suffixes src[src_covered..] and trg[trg_covered..]: the mean of a forward (m1) and an inverse (m1inv) log probability.
+float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src,
+                                             const std::vector<WordID>& trg,
+                                             unsigned src_covered,
+                                             unsigned trg_covered,
+                                             double s2t_ratio) const {
+  if (src_covered == src.size() || trg_covered == trg.size()) {  // one side is exhausted ...
+    assert(src_covered == src.size());  // ... so the other side must be exhausted as well
+    assert(trg_covered == trg.size());
+    return 0;  // nothing left to generate
+  }
+  static const WordID kNULL = TD::Convert("<eps>");  // symbol used as the null word when querying the Model1 tables
+  const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1));  // 1 / (remaining source symbols + 1)
+  // TODO factor in expected length ratio
+  prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining)
+  for (unsigned j = trg_covered; j < trg.size(); ++j) {  // forward direction: one factor per remaining target symbol
+    prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12);  // null contribution plus a small constant
+    for (unsigned i = src_covered; i < src.size(); ++i)
+      p += (*m1)(src[i], trg[j]);  // sum over the remaining source symbols
+    if (p.is_0()) {
+      cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n";
+      assert(!"failed");
+    }
+    p *= uniform_alignment;
+    e *= p;
+  }
+  // TODO factor in expected length ratio
+  const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0));  // 1 / (remaining target symbols + 1)
+  prob_t inv;
+  inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio));  // length term for the inverse direction
+  for (unsigned i = src_covered; i < src.size(); ++i) {  // inverse direction: one factor per remaining source symbol
+    prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12);
+    for (unsigned j = trg_covered; j < trg.size(); ++j)
+      p += (*m1inv)(trg[j], src[i]);
+    if (p.is_0()) {
+      cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n";
+      assert(!"failed");
+    }
+    p *= inv_uniform;
+    inv *= p;
+  }
+  return (log(e) + log(inv)) / 2;  // arithmetic mean of the two log scores
+}
+// Visits every node reachable from (0,0) via r.valid_deltas and scores it with ComputeBackwardProb. NOTE(review): only the (0,0) score is stored, in grid[0].
+void BackwardEstimator::InitializeGrid(const vector<WordID>& src,
+                      const vector<WordID>& trg,
+                      const Reachability& r,
+                      double s2t_ratio,
+                      float* grid) const {
+  queue<pair<int,int> > q;  // work list of (src_covered, trg_covered) nodes
+  q.push(make_pair(0,0));
+  Array2D<bool> done(src.size()+1, trg.size()+1, false);  // visited flag per (src_covered, trg_covered) cell
+  //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl;
+  while(!q.empty()) {
+    const pair<int,int> n = q.front();
+    q.pop();
+    if (done(n.first,n.second)) continue;  // a node may be queued once per incoming edge; handle it only once
+    done(n.first,n.second) = true;
+
+    float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio);
+    if (n.first == 0 && n.second == 0) grid[0] = lp;  // the only write to grid in this function
+    //cerr << "  " << n.first << "," << n.second << "\t" << lp << endl;
+
+    if (n.first == src.size() || n.second == trg.size()) continue;  // do not expand nodes on the last row/column
+    const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second];
+    for (int i = 0; i < edges.size(); ++i)
+      q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second));  // successor = current node + (src delta, trg delta)
+  }
+  //static int cc = 0; ++cc; if (cc == 80) exit(1);
+}
+
diff --git a/gi/pf/backward.h b/gi/pf/backward.h
new file mode 100644
index 00000000..e67eff0c
--- /dev/null
+++ b/gi/pf/backward.h
@@ -0,0 +1,33 @@
+#ifndef _BACKWARD_H_
+#define _BACKWARD_H_
+
+#include <vector>
+#include <string>
+#include "wordid.h"
+
+struct Reachability;
+struct Model1;
+// Scores pairs of WordID sequences using a forward and an inverse Model1 table; definitions are in backward.cc.
+struct BackwardEstimator {
+  BackwardEstimator(const std::string& s2t,  // s2t / t2s are handed to the Model1 constructors
+                    const std::string& t2s);
+  ~BackwardEstimator();  // deletes m1 and m1inv
+  // NOTE(review): owns raw pointers but declares no copy constructor/assignment, so a copy would delete the tables twice.
+  void InitializeGrid(const std::vector<WordID>& src,
+                      const std::vector<WordID>& trg,
+                      const Reachability& r,  // supplies the valid transitions between grid nodes
+                      double src2trg_ratio,
+                      float* grid) const;  // output buffer written by the implementation
+
+ private:
+  float ComputeBackwardProb(const std::vector<WordID>& src,
+                            const std::vector<WordID>& trg,
+                            unsigned src_covered,  // number of leading src symbols to skip
+                            unsigned trg_covered,  // number of leading trg symbols to skip
+                            double src2trg_ratio) const;
+  // Both tables are allocated in the constructor and owned by this object.
+  Model1* m1;  // queried as (*m1)(src_symbol, trg_symbol)
+  Model1* m1inv;  // queried as (*m1inv)(trg_symbol, src_symbol)
+};
+
+#endif
diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h
index 0d597c5c..84dacdf2 100644
--- a/gi/pf/base_distributions.h
+++ b/gi/pf/base_distributions.h
@@ -14,13 +14,7 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "m.h"
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
-  os << '[';
-  for (int i = 0; i < p.size(); ++i)
-    os << (i==0 ? "" : " ") << TD::Convert(p[i]);
-  return os << ']';
-}
+#include "os_phrase.h"
 
 struct Model1 {
   explicit Model1(const std::string& fname) :
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
index aafec13a..d00c2168 100755
--- a/gi/pf/guess-translits.pl
+++ b/gi/pf/guess-translits.pl
@@ -69,4 +69,4 @@ for my $f (keys %fs) {
   }
 }
 print STDERR "Extracted $num pairs.\n";
-print STDERR "Recommend running:\n   ../../training/model1 -t -99999 output.txt\n";
+print STDERR "Recommend running:\n   ../../training/model1 -v -d -t -99999 output.txt\n";
diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc
index 0f44fe95..fc0af9cb 100644
--- a/gi/pf/nuisance_test.cc
+++ b/gi/pf/nuisance_test.cc
@@ -124,9 +124,9 @@ int main(int argc, char** argv) {
   WordID y = TD::Convert("remember");
   vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src);
   vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg);
-  Transliterations xx;
-  xx.Initialize(x, src, y, trg);
-  return 1;
+//  Transliterations xx;
+//  xx.Initialize(x, src, y, trg);
+//  return 1;
 
  for (int j = 0; j < ITERS; ++j) {
   Base b;
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 104f356b..52e6be2c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -18,7 +18,7 @@
 
 // I use templates to handle the recursive formalation of the prior, so
 // the order of the model has to be specified here, at compile time:
-#define kORDER 4
+#define kORDER 3
 
 using namespace std;
 using namespace tr1;
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
new file mode 100644
index 00000000..94cbe7c3
--- /dev/null
+++ b/gi/pf/pyp_tm.cc
@@ -0,0 +1,113 @@
+#include "pyp_tm.h"
+
+#include <tr1/unordered_map>
+#include <iostream>
+#include <queue>
+
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "tdict.h"
+#include "ccrp.h"
+#include "pyp_word_model.h"
+
+using namespace std;
+using namespace std::tr1;
+// One CCRP over target spellings per source word, all backed by a shared Base distribution.
+template <typename Base>
+struct ConditionalPYPWordModel {
+  ConditionalPYPWordModel(Base* b) : base(*b) {}  // keeps a reference to *b; the base is not owned here
+  // Prints every conditioning context with its discount/strength and the dishes it holds.
+  void Summary() const {
+    cerr << "Number of conditioning contexts: " << r.size() << endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
+      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        cerr << "   " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl;
+    }
+  }
+  // Resamples the hyperparameters of each per-source CCRP independently.
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  } 
+  // p(trglets | src): falls back to the base probability when src has no CCRP yet.
+  prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
+    RuleModelHash::const_iterator it = r.find(src);
+    if (it == r.end()) {
+      return base(trglets);
+    } else {
+      return it->second.prob(trglets, base(trglets));  // the base probability is passed as the CCRP's p0
+    }
+  }
+  // Adds one observation of trglets under src, creating the CCRP for src on first use.
+  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    if (it == r.end())
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;  // constructor arguments: see ccrp.h (presumably hyperpriors, then initial discount and strength -- TODO confirm)
+    if (it->second.increment(trglets, base(trglets), rng))  // true presumably means a new table was opened -- confirm in ccrp.h
+      base.Increment(trglets, rng);
+  }
+  // Removes one observation of trglets under src; src must already have a CCRP.
+  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    assert(it != r.end());
+    if (it->second.decrement(trglets, rng)) {  // true presumably means a table was removed -- confirm in ccrp.h
+      base.Decrement(trglets, rng);
+      if (it->second.num_customers() == 0)
+        r.erase(it);  // drop contexts that no longer hold any customers
+    }
+  }
+  // Product over all contexts of exp(log_crp_prob()); the base likelihood is not included here.
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+    }
+    return p;
+  }
+  // Number of source words that currently have a CCRP.
+  unsigned UniqueConditioningContexts() const {
+    return r.size();
+  }
+  // Shared base distribution, then the map from source word to its CCRP.
+  Base& base;
+  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+};
+// Allocates the base word model and the conditional model on top of it. NOTE(review): no destructor is declared in pyp_tm.h, so up0 and tmodel are never deleted.
+PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
+                                             const unsigned num_letters) :
+    letters(lets),  // reference member: lets must outlive this object
+    up0(new PYPWordModel(num_letters)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),  // relies on up0 being declared (and thus initialized) before tmodel
+    kX(-TD::Convert("X")) {}
+// Joint likelihood: base model likelihood times conditional model likelihood.
+prob_t PYPLexicalTranslation::Likelihood() const {
+  prob_t p = up0->Likelihood();
+  p *= tmodel->Likelihood();
+  return p;
+}
+// Resamples the conditional model's hyperparameters first, then the base model's.
+void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
+  tmodel->ResampleHyperparameters(rng);
+  up0->ResampleHyperparameters(rng);
+}
+// Forwards to the conditional model's count of conditioning contexts.
+unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
+  return tmodel->UniqueConditioningContexts();
+}
+// p(trg | src), with trg looked up in the spelling dictionary; trg must be a valid index into letters.
+prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
+  return tmodel->Prob(src, letters[trg]);
+}
+// Adds one (src, trg) observation, passing the spelling of trg to the conditional model.
+void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Increment(src, letters[trg], rng);
+}
+// Removes one (src, trg) observation; the conditional model asserts that src is already known.
+void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Decrement(src, letters[trg], rng);
+}
+
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
new file mode 100644
index 00000000..fa0fb28f
--- /dev/null
+++ b/gi/pf/pyp_tm.h
@@ -0,0 +1,34 @@
+#ifndef PYP_LEX_TRANS
+#define PYP_LEX_TRANS
+
+#include <vector>
+#include "wordid.h"
+#include "prob.h"
+#include "sampler.h"
+
+struct TRule;
+struct PYPWordModel;
+template <typename T> struct ConditionalPYPWordModel;
+// Lexical translation model p(trg word | src word) that scores target words through their spellings.
+struct PYPLexicalTranslation {
+  explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,  // lets: spelling of each word, indexed by WordID
+                                 const unsigned num_letters);  // num_letters is passed to the base word model
+  // NOTE(review): up0 and tmodel are raw pointers and no destructor is declared here.
+  prob_t Likelihood() const;
+
+  void ResampleHyperparameters(MT19937* rng);
+  prob_t Prob(WordID src, WordID trg) const;  // return p(trg | src)
+  void Summary() const;  // NOTE(review): pyp_tm.cc as added in this patch contains no definition of this member
+  void Increment(WordID src, WordID trg, MT19937* rng);
+  void Decrement(WordID src, WordID trg, MT19937* rng);
+  unsigned UniqueConditioningContexts() const;
+
+ private:
+  const std::vector<std::vector<WordID> >& letters;   // spelling dictionary
+  PYPWordModel* up0;  // base distribution (model English word)
+  ConditionalPYPWordModel<PYPWordModel>* tmodel;  // translation distributions
+                      // (model English word | French word)
+  const WordID kX;  // initialized to -TD::Convert("X") by the constructor
+};
+
+#endif
diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc
new file mode 100644
index 00000000..12df4abf
--- /dev/null
+++ b/gi/pf/pyp_word_model.cc
@@ -0,0 +1,20 @@
+#include "pyp_word_model.h"
+
+#include <iostream>
+
+using namespace std;
+// Resamples the CRP's hyperparameters and logs the resulting discount and strength to stderr.
+void PYPWordModel::ResampleHyperparameters(MT19937* rng) {
+  r.resample_hyperparameters(rng);
+  cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n";
+}
+// Dumps the customer count, the hyperparameters, and every dish with its counts to stderr.
+void PYPWordModel::Summary() const {
+  cerr << "PYPWordModel: generations=" << r.num_customers()
+       << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
+  for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+    cerr << "   " << it->second.total_dish_count_
+              << " (on " << it->second.table_counts_.size() << " tables) "
+              << TD::GetString(it->first) << endl;
+}
+
diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
new file mode 100644
index 00000000..800a4fd7
--- /dev/null
+++ b/gi/pf/pyp_word_model.h
@@ -0,0 +1,58 @@
+#ifndef _PYP_WORD_MODEL_H_
+#define _PYP_WORD_MODEL_H_
+
+#include <iostream>
+#include <cmath>
+#include <vector>
+#include "prob.h"
+#include "ccrp.h"
+#include "m.h"
+#include "tdict.h"
+#include "os_phrase.h"
+
+// PYP(d,s,poisson-uniform) represented as a CRP: the base measure is Poisson over word length times a uniform choice per letter.
+struct PYPWordModel {
+  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) :  // vocab_e_size: number of distinct letters; mean_len: Poisson mean for the word length
+      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}  // u0 = log(1 / vocab_e_size); CCRP arguments: see ccrp.h
+  // Definition is in pyp_word_model.cc.
+  void ResampleHyperparameters(MT19937* rng);
+  // Probability of the letter sequence s under the CRP, with p0(s) as the base probability.
+  inline prob_t operator()(const std::vector<WordID>& s) const {
+    return r.prob(s, p0(s));
+  }
+  // Adds one observation of s; base is updated only when r.increment() reports true.
+  inline void Increment(const std::vector<WordID>& s, MT19937* rng) {
+    if (r.increment(s, p0(s), rng))  // true presumably means a new table was opened -- confirm in ccrp.h
+      base *= p0(s);
+  }
+  // Removes one observation of s; base is updated only when r.decrement() reports true.
+  inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) {
+    if (r.decrement(s, rng))  // true presumably means a table was removed -- confirm in ccrp.h
+      base /= p0(s);
+  }
+  // exp(r.log_crp_prob()) multiplied by the accumulated base-draw probability.
+  inline prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+  // Definition is in pyp_word_model.cc.
+  void Summary() const;
+
+ private:
+  inline double logp0(const std::vector<WordID>& s) const {  // log base probability: Poisson length term plus one uniform letter term per symbol
+    return Md::log_poisson(s.size(), mean_length) + s.size() * u0;
+  }
+  // Same quantity as logp0, wrapped in a prob_t.
+  inline prob_t p0(const std::vector<WordID>& s) const {
+    prob_t p; p.logeq(logp0(s));
+    return p;
+  }
+
+  prob_t base;  // keeps track of the draws from the base distribution
+  CCRP<std::vector<WordID> > r;
+  const double u0;  // uniform log prob of generating a letter
+  const double mean_length;  // mean length of a word in the base distribution
+};
+
+#endif
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index c10000f2..7d0d04ac 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -12,7 +12,7 @@ struct SState {
   int prev_trg_covered;
 };
 
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) {
+void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
     typedef boost::multi_array<vector<SState>, 2> array_type;
     array_type a(boost::extents[srclen + 1][trglen + 1]);
     a[0][0].push_back(SState());
@@ -31,9 +31,9 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
     }
     a[0][0].clear();
     //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
-    size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio);
-    if (a[srclen][trglen].size() < min_allowed) {
-      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n";
+    if (a[srclen][trglen].empty()) {
+      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+      nodes = 0;
       return;
     }
 
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 03967d44..1e22c76a 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -18,19 +18,17 @@ struct Reachability {
   boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
   boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
 
-  // filter_ratio says if the number of outgoing edges from the first cell is less than
-  //    src_max * trg_max * filter_rat^2 then mark as non reachable
-  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) :
+  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
       nodes(),
       edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
       max_src_delta(boost::extents[srclen][trglen]),
       node_addresses(boost::extents[srclen][trglen]),
       valid_deltas(boost::extents[srclen][trglen]) {
-    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio);
+    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
   }
 
  private:
-  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio);
+  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
 };
 
 #endif
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 8ea4ebd2..2200715e 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -5,14 +5,173 @@
 
 #include "boost/shared_ptr.hpp"
 
+#include "backward.h"
 #include "filelib.h"
-#include "ccrp.h"
+#include "tdict.h"
+#include "trule.h"
+#include "filelib.h"
+#include "ccrp_nt.h"
 #include "m.h"
 #include "reachability.h"
 
 using namespace std;
 using namespace std::tr1;
 
+struct TruncatedConditionalLengthModel {
+  TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+      plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
+    // every row slen >= 1 is filled for tlen = 1..max_trg_size and then normalised; index 0 keeps its initial 0.0
+    for (unsigned slen = 1; slen <= max_src_size; ++slen) {
+      vector<prob_t>& row = plens[slen];
+      prob_t total = prob_t::Zero();
+      for (unsigned tlen = 1; tlen <= max_trg_size; ++tlen)
+        total += (row[tlen] = prob_t(0.01 + exp(Md::log_poisson(tlen, slen * expected_src_to_trg_ratio))));
+      for (unsigned tlen = 1; tlen <= max_trg_size; ++tlen)
+        row[tlen] /= total;
+    }
+  }
+
+  // table lookup of p(tlen | slen); both lengths refer to *chunks*, not full words
+  inline const prob_t& operator()(int slen, int tlen) const {
+    return plens[slen][tlen];
+  }
+
+  vector<vector<prob_t> > plens;
+};
+
+struct CondBaseDist {
+  CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+    tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
+  // unfinished: only the chunk-length factor is computed; the assert marks the rest as not implemented
+  prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
+                    const vector<WordID>& trg, unsigned tf, unsigned tt) const {
+    prob_t length_factor = tclm(st - sf, tt - tf);  // p(target chunk length | source chunk length)
+    assert(!"not impl");
+    return length_factor;
+  }
+  inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
+    return this->operator()(src, 0, src.size(), trg, 0, trg.size());
+  }
+  TruncatedConditionalLengthModel tclm;
+};
+
+// represents transliteration phrase probabilities, e.g.
+//   p( a l - | A l ) , p( o | A w ) , ...
+struct TransliterationChunkConditionalModel {
+  explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
+      rp0(pp0),  // initialisers follow the member declaration order (rp0, r, d, strength)
+      d(0.0),
+      strength(1.0) {}
+
+  // Writes every conditioning context, its alpha and the entries of its restaurant to stderr.
+  void Summary() const {
+    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        std::cerr << "   " << i2->second << '\t' << i2->first << std::endl;
+    }
+  }
+
+  // Removes one observation of rule from the restaurant of context rule.f_, which must exist.
+  // The restaurant is erased when a nonzero decrement leaves it without customers.
+  int DecrementRule(const TRule& rule) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    assert(it != r.end());
+    const int count = it->second.decrement(rule);
+    if (count && it->second.num_customers() == 0) r.erase(it);
+    return count;
+  }
+
+  // Adds one observation of rule; an unseen context rule.f_ first gets a restaurant built from the current strength.
+  int IncrementRule(const TRule& rule) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    if (it == r.end())
+      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
+    return it->second.increment(rule);
+  }
+
+  void IncrementRules(const std::vector<TRulePtr>& rules) {
+    for (unsigned i = 0; i < rules.size(); ++i)
+      IncrementRule(*rules[i]);
+  }
+
+  void DecrementRules(const std::vector<TRulePtr>& rules) {
+    for (unsigned i = 0; i < rules.size(); ++i)
+      DecrementRule(*rules[i]);
+  }
+
+  // Probability of rule given its context rule.f_: the base probability rp0 alone when the
+  // context has no restaurant yet, otherwise that restaurant's prob() with rp0 as base probability.
+  prob_t RuleProbability(const TRule& rule) const {
+    const prob_t p0 = rp0(rule.f_, rule.e_);
+    const RuleModelHash::const_iterator it = r.find(rule.f_);
+    if (it == r.end()) return p0;
+    return it->second.prob(rule, p0);
+  }
+
+  // Md::log_gamma_density(dd + aa, 1, 1) plus log_crp_prob(aa) of every restaurant;
+  // returns -infinity when aa <= -dd (dd = discount, aa = strength).
+  double LogLikelihood(const double& dd, const double& aa) const {
+    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+    double llh = //Md::log_beta_density(dd, 1, 1) +
+                 Md::log_gamma_density(dd + aa, 1, 1);
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it)
+      llh += it->second.log_crp_prob(aa);
+    return llh;
+  }
+
+  // Functor handed to slice_sampler1d: LogLikelihood as a function of the strength, with d held fixed.
+  struct AlphaResampler {
+    AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
+    const TransliterationChunkConditionalModel& m_;
+    double operator()(const double& proposed_strength) const {
+      return m_.LogLikelihood(m_.d, proposed_strength);
+    }
+  };
+
+  // Resamples the shared strength with slice_sampler1d, logs it, and pushes it into every restaurant.
+  // The discount resampling is still disabled (#if 0), so this method never changes d.
+  void ResampleHyperparameters(MT19937* rng) {
+    //const unsigned nloop = 5;
+    const unsigned niterations = 10;
+    //DiscountResampler dr(*this);
+    AlphaResampler ar(*this);
+#if 0
+    for (int iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      d = slice_sampler1d(dr, d, *rng, min_discount,
+                          1.0, 0.0, niterations, 100*niterations);
+    }
+#endif
+    strength = slice_sampler1d(ar, strength, *rng, -d,
+                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) {
+#if 0
+      it->second.set_discount(d);
+#endif
+      it->second.set_alpha(strength);
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(LogLikelihood(d, strength));
+    return p;
+  }
+
+  const CondBaseDist& rp0;
+  typedef std::tr1::unordered_map<std::vector<WordID>,
+                                  CCRP_NoTable<TRule>,
+                                  boost::hash<std::vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+  double d, strength;
+};
+
 struct GraphStructure {
   GraphStructure() : r() {}
   // leak memory - these are basically static
@@ -20,9 +179,9 @@ struct GraphStructure {
   bool IsReachable() const { return r->nodes > 0; }
 };
 
-struct BackwardEstimates {
-  BackwardEstimates() : gs(), backward() {}
-  explicit BackwardEstimates(const GraphStructure& g) :
+struct ProbabilityEstimates {
+  ProbabilityEstimates() : gs(), backward() {}
+  explicit ProbabilityEstimates(const GraphStructure& g) :
       gs(&g), backward() {
     if (g.r->nodes > 0)
       backward = new float[g.r->nodes];
@@ -36,24 +195,32 @@ struct BackwardEstimates {
   }
 
   // returns an backward estimate
-  double operator()(int src_covered, int trg_covered) const {
+  double Backward(int src_covered, int trg_covered) const {
     if (!backward) return 0;
     int ind = gs->r->node_addresses[src_covered][trg_covered];
     if (ind < 0) return 0;
     return backward[ind];
   }
+
+  prob_t estp;
+  float* backward;
  private:
   const GraphStructure* gs;
-  float* backward;
 };
 
 struct TransliterationsImpl {
-  TransliterationsImpl(int max_src, int max_trg, double fr) :
+  TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
+      cp0(max_src, max_trg, sr),
+      tccm(cp0),
+      be(b),
       kMAX_SRC_CHUNK(max_src),
       kMAX_TRG_CHUNK(max_trg),
-      kFILTER_RATIO(fr),
+      kS2T_RATIO(sr),
       tot_pairs(), tot_mem() {
   }
+  const CondBaseDist cp0;
+  TransliterationChunkConditionalModel tccm;
+  const BackwardEstimator& be;
 
   void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
     const size_t src_len = src_lets.size();
@@ -63,20 +230,29 @@ struct TransliterationsImpl {
     if (src_len >= graphs.size()) graphs.resize(src_len + 1);
     if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
     GraphStructure& gs = graphs[src_len][trg_len];
-    if (!gs.r)
-      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
+    if (!gs.r) {
+      double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
+      if (rat > 1.5 || (rat > 2.4 && src_len < 6)) {
+        cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
+        gs.r = new Reachability(src_len, trg_len, 0, 0);
+      } else {
+        gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+      }
+    }
+
     const Reachability& r = *gs.r;
 
     // init backward estimates
-    if (src >= bes.size()) bes.resize(src + 1);
-    unordered_map<WordID, BackwardEstimates>::iterator it = bes[src].find(trg);
-    if (it != bes[src].end()) return; // already initialized
+    if (src >= ests.size()) ests.resize(src + 1);
+    unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
+    if (it != ests[src].end()) return; // already initialized
 
-    it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first;
-    BackwardEstimates& b = it->second;
+    it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
+    ProbabilityEstimates& est = it->second;
     if (!gs.r->nodes) return;  // not derivable subject to length constraints
 
-    // TODO
+    be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
+    cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
     tot_pairs++;
     tot_mem += sizeof(float) * gs.r->nodes;
   }
@@ -92,8 +268,11 @@ struct TransliterationsImpl {
     const vector<GraphStructure>& tv = graphs[src.size()];
     assert(trg.size() < tv.size());
     const GraphStructure& gs = tv[trg.size()];
-    // TODO: do prob
-    return prob_t::Zero();
+    if (gs.r->nodes == 0)
+      return prob_t::Zero();
+    const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
+    assert(it != ests[s].end());
+    return it->second.estp;
   }
 
   void GraphSummary() const {
@@ -126,15 +305,15 @@ struct TransliterationsImpl {
 
   const int kMAX_SRC_CHUNK;
   const int kMAX_TRG_CHUNK;
-  const double kFILTER_RATIO;
+  const double kS2T_RATIO;
   unsigned tot_pairs;
   size_t tot_mem;
   vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
-  vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
+  vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
 };
 
-Transliterations::Transliterations(int max_src, int max_trg, double fr) :
-    pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
+Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
+    pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
 Transliterations::~Transliterations() { delete pimpl_; }
 
 void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index ea9f9d3f..49d14684 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -5,11 +5,12 @@
 #include "wordid.h"
 #include "prob.h"
 
+struct BackwardEstimator;
 struct TransliterationsImpl;
 struct Transliterations {
   // max_src and max_trg indicate how big the transliteration phrases can be
   // see reachability.h for information about filter_ratio
-  explicit Transliterations(int max_src, int max_trg, double filter_ratio);
+  explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be);
   ~Transliterations();
   void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
   void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h
index 79321493..6efbfc78 100644
--- a/utils/ccrp_nt.h
+++ b/utils/ccrp_nt.h
@@ -11,6 +11,7 @@
 #include <boost/functional/hash.hpp>
 #include "sampler.h"
 #include "slice_sampler.h"
+#include "m.h"
 
 // Chinese restaurant process (1 parameter)
 template <typename Dish, typename DishHash = boost::hash<Dish> >
@@ -29,6 +30,7 @@ class CCRP_NoTable {
     alpha_prior_rate_(c_rate) {}
 
   double alpha() const { return alpha_; }
+  void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); }
 
   bool has_alpha_prior() const {
     return !std::isnan(alpha_prior_shape_);
@@ -71,9 +73,10 @@ class CCRP_NoTable {
     return table_diff;
   }
 
-  double prob(const Dish& dish, const double& p0) const {
+  template <typename F>
+  F prob(const Dish& dish, const F& p0) const {
     const unsigned at_table = num_customers(dish);
-    return (at_table + p0 * alpha_) / (num_customers_ + alpha_);
+    return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_);
   }
 
   double logprob(const Dish& dish, const double& logp0) const {
@@ -85,20 +88,12 @@ class CCRP_NoTable {
     return log_crp_prob(alpha_);
   }
 
-  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
-    assert(x >= 0.0);
-    assert(shape > 0.0);
-    assert(rate > 0.0);
-    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
-    return lp;
-  }
-
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
   double log_crp_prob(const double& alpha) const {
     double lp = 0.0;
     if (has_alpha_prior())
-      lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
+      lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
       lp += lgamma(alpha) - lgamma(alpha + num_customers_) +
-- 
cgit v1.2.3


From e8f1e4e75dd8e27b81fd4e68116a38830547c2c4 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 9 Mar 2012 22:27:12 -0500
Subject: forgotten file

---
 gi/pf/os_phrase.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 gi/pf/os_phrase.h

(limited to 'gi/pf')

diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h
new file mode 100644
index 00000000..dfe40cb1
--- /dev/null
+++ b/gi/pf/os_phrase.h
@@ -0,0 +1,15 @@
+#ifndef _OS_PHRASE_H_
+#define _OS_PHRASE_H_
+
+#include <iostream>
+#include <vector>
+#include "tdict.h"
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
+  os << '[';  // output has the form "[w1 w2 ... wn]"; an empty phrase prints "[]"
+  for (std::vector<WordID>::size_type k = 0, n = p.size(); k != n; ++k)
+    os << (k ? " " : "") << TD::Convert(p[k]);
+  return os << ']';
+}
+
+#endif
-- 
cgit v1.2.3


From 5f9f400f4359bc14f7231d6eabd76b7ceee737aa Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 9 Mar 2012 23:13:09 -0500
Subject: logging after alignment

---
 gi/pf/align-lexonly-pyp.cc | 1 +
 gi/pf/pyp_tm.cc            | 7 +++++--
 gi/pf/pyp_word_model.h     | 2 +-
 utils/ccrp.h               | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index d68a4b8f..4a1d1db6 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -208,6 +208,7 @@ int main(int argc, char** argv) {
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);
+  aligner.model.Summary();
 
   return 0;
 }
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 94cbe7c3..b5262f47 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -54,8 +54,6 @@ struct ConditionalPYPWordModel {
     assert(it != r.end());
     if (it->second.decrement(trglets, rng)) {
       base.Decrement(trglets, rng);
-      if (it->second.num_customers() == 0)
-        r.erase(it);
     }
   }
 
@@ -84,6 +82,11 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
     tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
     kX(-TD::Convert("X")) {}
 
+void PYPLexicalTranslation::Summary() const {  // forwards to the Summary() of tmodel, then of up0
+  tmodel->Summary();
+  up0->Summary();
+}
+
 prob_t PYPLexicalTranslation::Likelihood() const {
   prob_t p = up0->Likelihood();
   p *= tmodel->Likelihood();
diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
index 800a4fd7..ff366865 100644
--- a/gi/pf/pyp_word_model.h
+++ b/gi/pf/pyp_word_model.h
@@ -12,7 +12,7 @@
 
 // PYP(d,s,poisson-uniform) represented as a CRP
 struct PYPWordModel {
-  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) :
+  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) :
       base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}
 
   void ResampleHyperparameters(MT19937* rng);
diff --git a/utils/ccrp.h b/utils/ccrp.h
index 439d7e1e..4a8b80e7 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -221,6 +221,7 @@ class CCRP {
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
     assert(has_discount_prior() || has_strength_prior());
+    if (num_customers() == 0) return;
     DiscountResampler dr(*this);
     StrengthResampler sr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-- 
cgit v1.2.3


From 2e9006a5b153dfe3c0fcedf9f1eaea8866f518a8 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 00:00:27 -0500
Subject: use quasi model 2 instead of uniform alignments

---
 gi/pf/align-lexonly-pyp.cc |  6 ++++++
 gi/pf/quasi_model2.h       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 gi/pf/quasi_model2.h

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 4a1d1db6..0c90b6ce 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -11,6 +11,7 @@
 #include "sampler.h"
 #include "corpus.h"
 #include "pyp_tm.h"
+#include "quasi_model2.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -61,12 +62,14 @@ struct Aligner {
   Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
       corpus(*c),
       model(lets, num_letters),
+      paj(4, 0.08),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
   }
 
   vector<AlignedSentencePair>& corpus;
   PYPLexicalTranslation model;
+  const QuasiModel2 paj;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
@@ -83,6 +86,7 @@ struct Aligner {
         a_j = prng->next() * (1 + asp.src.size());
         const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, asp.trg[j], &*prng);
+        // TODO factor in alignment prob
       }
     }
     cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl;
@@ -101,6 +105,8 @@ struct Aligner {
         for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
           const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
           ss[prop_a_j] = model.Prob(prop_f, e_j);
+          // TODO configurable
+          ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size());
         }
         a_j = prng->SelectSample(ss);
         f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
new file mode 100644
index 00000000..0095289f
--- /dev/null
+++ b/gi/pf/quasi_model2.h
@@ -0,0 +1,46 @@
+#ifndef _QUASI_MODEL2_H_
+#define _QUASI_MODEL2_H_
+
+#include <vector>
+#include <cmath>
+#include "prob.h"
+#include "array2d.h"
+
+struct QuasiModel2 {  // alignment distribution p(a_j | j, src_len, trg_len) with a fixed NULL probability
+  explicit QuasiModel2(double alpha, double pnull = 0.1) :
+      alpha_(alpha),
+      pnull_(pnull),
+      pnotnull_(1 - pnull),
+      z_(1000,1000) {}  // normaliser cache indexed by (src_len, trg_len); NOTE(review): assumes both stay below 1000 -- confirm against Array2D
+  // a_j = 0 => NULL; src_len does *not* include null
+  prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
+    if (!a_j) return pnull_;
+    std::vector<prob_t>& zv = z_(src_len, trg_len);
+    if (zv.size() == 0)
+      zv.resize(trg_len);
+    // a zero entry is treated as "normaliser for this j not computed yet" and is filled on first use
+    prob_t& z = zv[j];
+    if (z.is_0()) z = ComputeZ(j, src_len, trg_len);
+    // result = pnotnull_ * exp(-alpha_ * |(a_j - 1)/src_len - j/trg_len|) / z
+    prob_t p;
+    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
+    p *= pnotnull_;
+    p /= z;
+    return p;
+  }
+ private:
+  prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {  // sums the unnormalised weight over a_j = 1..src_len
+    prob_t p, z = prob_t::Zero();
+    for (int a_j = 1; a_j <= src_len; ++a_j) {
+      p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
+      z += p;
+    }
+    return z;
+  }
+  double alpha_;  // multiplies the position distance inside the exponent
+  const prob_t pnull_;  // returned directly for a_j == 0
+  const prob_t pnotnull_;  // 1 - pnull, scales every non-NULL alignment
+  mutable Array2D<std::vector<prob_t> > z_;  // mutable so the const Pa_j can fill it lazily
+};
+
+#endif
-- 
cgit v1.2.3


From 1c92df11360cda4be57183bfb4efa2d62107c651 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 01:08:23 -0500
Subject: tie params

---
 gi/pf/pyp_lm.cc        | 66 +++++++++-------------------------------
 gi/pf/pyp_tm.cc        |  2 ++
 gi/pf/tied_resampler.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 51 deletions(-)
 create mode 100644 gi/pf/tied_resampler.h

(limited to 'gi/pf')

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 52e6be2c..85635b8f 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -11,6 +11,7 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "ccrp.h"
+#include "tied_resampler.h"
 
 // A not very memory-efficient implementation of an N-gram LM based on PYPs
 // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
@@ -66,7 +67,7 @@ template<> struct PYPLM<0> {
   void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
   void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
   double prob(WordID, const vector<WordID>&) const { return p0; }
-  void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {}
+  void resample_hyperparameters(MT19937*) {}
   double log_likelihood() const { return draws * log(p0); }
   const double p0;
   int draws;
@@ -76,16 +77,17 @@ template<> struct PYPLM<0> {
 template <unsigned N> struct PYPLM {
   PYPLM(unsigned vs, double da, double db, double ss, double sr) :
       backoff(vs, da, db, ss, sr),
-      discount_a(da), discount_b(db),
-      strength_s(ss), strength_r(sr),
-      d(0.8), strength(1.0), lookup(N-1) {}
+      tr(da, db, ss, sr, 0.8, 1.0),
+      lookup(N-1) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
-    if (it == p.end())
-      it = p.insert(make_pair(lookup, CCRP<WordID>(d,strength))).first;
+    if (it == p.end()) {
+      it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first;
+      tr.Add(&it->second);  // add to resampler
+    }
     if (it->second.increment(w, bo, rng))
       backoff.increment(w, context, rng);
   }
@@ -107,59 +109,21 @@ template <unsigned N> struct PYPLM {
   }
 
   double log_likelihood() const {
-    return log_likelihood(d, strength) + backoff.log_likelihood();
-  }
-
-  double log_likelihood(const double& dd, const double& aa) const {
-    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
-    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
-    double llh = Md::log_beta_density(dd, discount_a, discount_b) +
-                 Md::log_gamma_density(aa + dd, strength_s, strength_r);
+    double llh = backoff.log_likelihood();
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
-      llh += it->second.log_crp_prob(dd, aa);
+      llh += it->second.log_crp_prob();
+    // TODO parametric likelihood from TiedResampler
     return llh;
   }
 
-  struct DiscountResampler {
-    DiscountResampler(const PYPLM& m) : m_(m) {}
-    const PYPLM& m_;
-    double operator()(const double& proposed_discount) const {
-      return m_.log_likelihood(proposed_discount, m_.strength);
-    }
-  };
-
-  struct AlphaResampler {
-    AlphaResampler(const PYPLM& m) : m_(m) {}
-    const PYPLM& m_;
-    double operator()(const double& proposed_strength) const {
-      return m_.log_likelihood(m_.d, proposed_strength);
-    }
-  };
-
-  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    DiscountResampler dr(*this);
-    AlphaResampler ar(*this);
-    for (int iter = 0; iter < nloop; ++iter) {
-      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      double min_discount = std::numeric_limits<double>::min();
-      if (strength < 0.0) min_discount -= strength;
-      d = slice_sampler1d(dr, d, *rng, min_discount,
-                          1.0, 0.0, niterations, 100*niterations);
-    }
-    strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
-    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl;
-    for (it = p.begin(); it != p.end(); ++it) {
-      it->second.set_discount(d);
-      it->second.set_strength(strength);
-    }
-    backoff.resample_hyperparameters(rng, nloop, niterations);
+  void resample_hyperparameters(MT19937* rng) {
+    tr.ResampleHyperparameters(rng);
+    backoff.resample_hyperparameters(rng);
   }
 
   PYPLM<N-1> backoff;
+  TiedResampler<CCRP<WordID> > tr;
   double discount_a, discount_b, strength_s, strength_r;
   double d, strength;
   mutable vector<WordID> lookup;  // thread-local
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index b5262f47..73104fe9 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -11,6 +11,8 @@
 #include "ccrp.h"
 #include "pyp_word_model.h"
 
+#include "tied_resampler.h"
+
 using namespace std;
 using namespace std::tr1;
 
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
new file mode 100644
index 00000000..208fb9c7
--- /dev/null
+++ b/gi/pf/tied_resampler.h
@@ -0,0 +1,82 @@
+#ifndef _TIED_RESAMPLER_H_
+#define _TIED_RESAMPLER_H_
+
+#include <set>
+#include "sampler.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+template <class CRP>
+struct TiedResampler {  // keeps one (discount, strength) pair for a set of registered CRPs and resamples it jointly
+  explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) :
+      d_alpha(da),  // da, db: passed to Md::log_beta_density for the discount
+      d_beta(db),
+      s_shape(ss),  // ss, sr: passed to Md::log_gamma_density for discount + strength
+      s_rate(sr),
+      discount(d),  // d, s: initial tied values
+      strength(s) {}
+  // registers crp and overwrites its discount/strength with the tied values; crp must not carry its own priors
+  void Add(CRP* crp) {
+    crps.insert(crp);
+    crp->set_discount(discount);
+    crp->set_strength(strength);
+    assert(!crp->has_discount_prior());
+    assert(!crp->has_strength_prior());
+  }
+  // unregisters crp; the hyperparameters already stored in crp are not touched
+  void Remove(CRP* crp) {
+    crps.erase(crp);
+  }
+  // log_beta_density(d) + log_gamma_density(d + s) + log_crp_prob(d, s) of every registered CRP; -infinity when s <= -d
+  double LogLikelihood(double d, double s) const {
+    if (s <= -d) return -std::numeric_limits<double>::infinity();
+    double llh = Md::log_beta_density(d, d_alpha, d_beta) +
+                 Md::log_gamma_density(d + s, s_shape, s_rate);
+    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it)
+      llh += (*it)->log_crp_prob(d, s);
+    return llh;
+  }
+  // functor for slice_sampler1d: LogLikelihood as a function of the discount, strength held fixed
+  struct DiscountResampler {
+    DiscountResampler(const TiedResampler& m) : m_(m) {}
+    const TiedResampler& m_;
+    double operator()(const double& proposed_discount) const {
+      return m_.LogLikelihood(proposed_discount, m_.strength);
+    }
+  };
+  // functor for slice_sampler1d: LogLikelihood as a function of the strength, discount held fixed
+  struct AlphaResampler {
+    AlphaResampler(const TiedResampler& m) : m_(m) {}
+    const TiedResampler& m_;
+    double operator()(const double& proposed_strength) const {
+      return m_.LogLikelihood(m_.discount, proposed_strength);
+    }
+  };
+  // nloop rounds of (strength, then discount) slice sampling, one final strength draw, a log line, then a write-back to all CRPs
+  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    const DiscountResampler dr(*this);
+    const AlphaResampler ar(*this);
+    for (int iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;  // lower bound for the discount is raised by -strength when strength is negative
+      discount = slice_sampler1d(dr, discount, *rng, min_discount,
+                          1.0, 0.0, niterations, 100*niterations);
+    }
+    strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    std::cerr << "TiedCRPs(d=" << discount << ",s="
+              << strength << ") = " << LogLikelihood(discount, strength) << std::endl;
+    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) {
+      (*it)->set_discount(discount);
+      (*it)->set_strength(strength);
+    }
+  }
+ private:
+  std::set<CRP*> crps;  // registered CRPs; this struct never deletes them
+  const double d_alpha, d_beta, s_shape, s_rate;  // fixed prior parameters used in LogLikelihood
+  double discount, strength;  // current tied hyperparameter values
+};
+
+#endif
-- 
cgit v1.2.3


From f06c3f8d9dc2ce66153890809a7fc9b296ee625e Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 12:56:15 -0500
Subject: ready to infer alignment parameters

---
 gi/pf/Makefile.am          |   4 +-
 gi/pf/align-lexonly-pyp.cc |  22 ++-
 gi/pf/align-lexonly.cc     | 332 ---------------------------------------------
 gi/pf/pyp_tm.cc            |   6 +-
 gi/pf/quasi_model2.h       | 115 ++++++++++++----
 gi/pf/tied_resampler.h     |  31 +++++
 6 files changed, 143 insertions(+), 367 deletions(-)
 delete mode 100644 gi/pf/align-lexonly.cc

(limited to 'gi/pf')

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 4ce72ba1..f9c979d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
 
 noinst_LIBRARIES = libpf.a
 
@@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor
 nuisance_test_SOURCES = nuisance_test.cc
 nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
-align_lexonly_SOURCES = align-lexonly.cc
-
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 0c90b6ce..68cb9192 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -61,15 +61,15 @@ struct AlignedSentencePair {
 struct Aligner {
   Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
       corpus(*c),
+      paj_model(4, 0.08),
       model(lets, num_letters),
-      paj(4, 0.08),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
   }
 
   vector<AlignedSentencePair>& corpus;
+  QuasiModel2 paj_model;
   PYPLexicalTranslation model;
-  const QuasiModel2 paj;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
@@ -86,10 +86,12 @@ struct Aligner {
         a_j = prng->next() * (1 + asp.src.size());
         const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, asp.trg[j], &*prng);
-        // TODO factor in alignment prob
+        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl;
+    cerr << "Corpus intialized randomly." << endl;
+    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood()
+         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   void ResampleCorpus() {
@@ -101,19 +103,25 @@ struct Aligner {
         const WordID e_j = asp.trg[j];
         WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Decrement(f_a_j, e_j, prng);
+        paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size());
 
         for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
           const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
           ss[prop_a_j] = model.Prob(prop_f, e_j);
-          // TODO configurable
-          ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size());
+          ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size());
         }
         a_j = prng->SelectSample(ss);
         f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, e_j, prng);
+        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl;
+    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood()
+         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
+  }
+
+  prob_t Likelihood() const {
+    return model.Likelihood() * paj_model.Likelihood();
   }
 };
 
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
deleted file mode 100644
index dbc9dc07..00000000
--- a/gi/pf/align-lexonly.cc
+++ /dev/null
@@ -1,332 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "corpus.h"
-#include "ngram_base.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read parallel data from")
-        ("random_seed,S",po::value<uint32_t>(), "Random seed");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || (conf->count("input") == 0)) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-shared_ptr<MT19937> prng;
-
-struct LexicalAlignment {
-  unsigned char src_index;
-  bool is_transliteration;
-  vector<pair<short, short> > derivation;
-};
-
-struct AlignedSentencePair {
-  vector<WordID> src;
-  vector<WordID> trg;
-  vector<LexicalAlignment> a;
-  Array2D<short> posterior;
-};
-
-struct HierarchicalWordBase {
-  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {}
-
-  void ResampleHyperparameters(MT19937* rng) {
-    r.resample_hyperparameters(rng);
-  }
-
-  inline double logp0(const vector<WordID>& s) const {
-    return s.size() * u0;
-  }
-
-  // return p0 of rule.e_
-  prob_t operator()(const TRule& rule) const {
-    prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_)));
-    return p;
-  }
-
-  void Increment(const TRule& rule) {
-    if (r.increment(rule.e_)) {
-      prob_t p; p.logeq(logp0(rule.e_));
-      base *= p;
-    }
-  }
-
-  void Decrement(const TRule& rule) {
-    if (r.decrement(rule.e_)) {
-      prob_t p; p.logeq(logp0(rule.e_));
-      base /= p;
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(r.log_crp_prob());
-    p *= base;
-    return p;
-  }
-
-  void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (\\alpha=" << r.alpha() << ')' << endl;
-    for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
-  }
-
-  prob_t base;
-  CCRP_NoTable<vector<WordID> > r;
-  const double u0;
-};
-
-struct BasicLexicalAlignment {
-  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
-                                 const unsigned words_e,
-                                 const unsigned letters_e,
-                                 vector<AlignedSentencePair>* corp) :
-      letters(lets),
-      corpus(*corp),
-      up0("fr-en.10k.translit-base.txt.gz"),
-      //up0(words_e),
-      //up0("en.chars.1gram", letters_e),
-      //up0("en.words.1gram"),
-      //up0(letters_e),
-      //up0("en.chars.2gram"),
-      tmodel(up0) {
-  }
-
-  void InstantiateRule(const WordID src,
-                       const WordID trg,
-                       TRule* rule) const {
-    static const WordID kX = TD::Convert("X") * -1;
-    rule->lhs_ = kX;
-    rule->e_ = letters[trg];
-    rule->f_ = letters[src];
-  }
-
-  void InitializeRandom() {
-    const WordID kNULL = TD::Convert("NULL");
-    cerr << "Initializing with random alignments ...\n";
-    for (unsigned i = 0; i < corpus.size(); ++i) {
-      AlignedSentencePair& asp = corpus[i];
-      asp.a.resize(asp.trg.size());
-      for (unsigned j = 0; j < asp.trg.size(); ++j) {
-        const unsigned char a_j = prng->next() * (1 + asp.src.size());
-        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-        TRule r;
-        InstantiateRule(f_a_j, asp.trg[j], &r);
-        asp.a[j].is_transliteration = false;
-        asp.a[j].src_index = a_j;
-        if (tmodel.IncrementRule(r))
-          up0.Increment(r);
-      }
-    }
-    cerr << "  LLH = " << Likelihood() << endl;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = tmodel.Likelihood();
-    p *= up0.Likelihood();
-    return p;
-  }
-
-  void ResampleHyperparemeters() {
-    cerr << "  LLH_prev = " << Likelihood() << flush;
-    tmodel.ResampleHyperparameters(&*prng);
-    up0.ResampleHyperparameters(&*prng);
-    cerr << "\tLLH_post = " << Likelihood() << endl;
-  }
-
-  void ResampleCorpus();
-
-  const vector<vector<WordID> >& letters; // spelling dictionary
-  vector<AlignedSentencePair>& corpus;
-  //PhraseConditionalUninformativeBase up0;
-  //PhraseConditionalUninformativeUnigramBase up0;
-  //UnigramWordBase up0;
-  //HierarchicalUnigramBase up0;
-  TableLookupBase up0;
-  //HierarchicalWordBase up0;
-  //PoissonUniformUninformativeBase up0;
-  //CompletelyUniformBase up0;
-  //FixedNgramBase up0;
-  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
-  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
-  //ConditionalTranslationModel<UnigramWordBase> tmodel;
-  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
-  //ConditionalTranslationModel<HierarchicalWordBase> tmodel;
-  //ConditionalTranslationModel<PoissonUniformUninformativeBase> tmodel;
-  ConditionalTranslationModel<TableLookupBase> tmodel;
-  //ConditionalTranslationModel<FixedNgramBase> tmodel;
-  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
-  static const WordID kNULL = TD::Convert("NULL");
-  for (unsigned i = 0; i < corpus.size(); ++i) {
-    AlignedSentencePair& asp = corpus[i];
-    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
-    for (unsigned j = 0; j < asp.trg.size(); ++j) {
-      TRule r;
-      unsigned char& a_j = asp.a[j].src_index;
-      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.DecrementRule(r))
-        up0.Decrement(r);
-
-      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
-        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
-        InstantiateRule(prop_f, asp.trg[j], &r);
-        ss[prop_a_j] = tmodel.RuleProbability(r);
-      }
-      a_j = prng->SelectSample(ss);
-      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.IncrementRule(r))
-        up0.Increment(r);
-    }
-  }
-  cerr << "  LLH = " << tmodel.Likelihood() << endl;
-}
-
-void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
-  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
-    if (*it >= l->size()) { l->resize(*it + 1); }
-    vector<WordID>& letters = (*l)[*it];
-    if (letters.size()) continue;   // if e and f have the same word
-
-    const string& w = TD::Convert(*it);
-    
-    size_t cur = 0;
-    while (cur < w.size()) {
-      const size_t len = UTF8Len(w[cur]);
-      letters.push_back(TD::Convert(w.substr(cur, len)));
-      if (letset) letset->insert(letters.back());
-      cur += len;
-    }
-  }
-}
-
-void Debug(const AlignedSentencePair& asp) {
-  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
-  Array2D<bool> a(asp.src.size(), asp.trg.size());
-  for (unsigned j = 0; j < asp.trg.size(); ++j)
-    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
-  cerr << a << endl;
-}
-
-void AddSample(AlignedSentencePair* asp) {
-  for (unsigned j = 0; j < asp->trg.size(); ++j)
-    asp->posterior(asp->a[j].src_index, j)++;
-}
-
-void WriteAlignments(const AlignedSentencePair& asp) {
-  bool first = true;
-  for (unsigned j = 0; j < asp.trg.size(); ++j) {
-    int src_index = -1;
-    int mc = -1;
-    for (unsigned i = 0; i <= asp.src.size(); ++i) {
-      if (asp.posterior(i, j) > mc) {
-        mc = asp.posterior(i, j);
-        src_index = i;
-      }
-    }
-
-    if (src_index) {
-      if (first) first = false; else cout << ' ';
-      cout << (src_index - 1) << '-' << j;
-    }
-  }
-  cout << endl;
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-//  MT19937& rng = *prng;
-
-  vector<vector<int> > corpuse, corpusf;
-  set<int> vocabe, vocabf;
-  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
-  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
-  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
-  assert(corpusf.size() == corpuse.size());
-
-  vector<AlignedSentencePair> corpus(corpuse.size());
-  for (unsigned i = 0; i < corpuse.size(); ++i) {
-    corpus[i].src.swap(corpusf[i]);
-    corpus[i].trg.swap(corpuse[i]);
-    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
-  }
-  corpusf.clear(); corpuse.clear();
-
-  vocabf.insert(TD::Convert("NULL"));
-  vector<vector<WordID> > letters(TD::NumWords());
-  set<WordID> letset;
-  ExtractLetters(vocabe, &letters, &letset);
-  ExtractLetters(vocabf, &letters, NULL);
-  letters[TD::Convert("NULL")].clear();
-
-  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
-  x.InitializeRandom();
-  const unsigned samples = conf["samples"].as<unsigned>();
-  for (int i = 0; i < samples; ++i) {
-    for (int j = 395; j < 397; ++j) Debug(corpus[j]);
-    cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 10 == 0) x.ResampleHyperparemeters();
-    x.ResampleCorpus();
-    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
-  }
-  for (unsigned i = 0; i < corpus.size(); ++i)
-    WriteAlignments(corpus[i]);
-  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
-  x.tmodel.Summary();
-  x.up0.Summary();
-
-  //posterior.Sample();
-
-  return 0;
-}
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 73104fe9..bf5a6497 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -10,7 +10,6 @@
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
-
 #include "tied_resampler.h"
 
 using namespace std;
@@ -18,7 +17,7 @@ using namespace std::tr1;
 
 template <typename Base>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b) {}
+  ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -32,6 +31,7 @@ struct ConditionalPYPWordModel {
   void ResampleHyperparameters(MT19937* rng) {
     for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
       it->second.resample_hyperparameters(rng);
+    btr.ResampleHyperparameters(rng);
   } 
 
   prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
@@ -72,7 +72,9 @@ struct ConditionalPYPWordModel {
     return r.size();
   }
 
+  // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
 };
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
index 0095289f..8ec0a400 100644
--- a/gi/pf/quasi_model2.h
+++ b/gi/pf/quasi_model2.h
@@ -3,44 +3,113 @@
 
 #include <vector>
 #include <cmath>
+#include <tr1/unordered_map>
+#include "boost/functional.hpp"
 #include "prob.h"
 #include "array2d.h"
 
+struct AlignmentObservation {
+  AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
+  AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) :  // argument order: src_len, trg_len, j, a_j
+      src_len(sl), trg_len(tl), j(tw), a_j(sw) {}  // unsigned arguments are narrowed to unsigned short here
+  unsigned short src_len;  // source length; per the QuasiModel2 comment it does not include NULL
+  unsigned short trg_len;  // target length
+  unsigned short j;        // target position (QuasiModel2::Increment asserts j < trg_len)
+  unsigned short a_j;      // aligned source position; 0 means NULL (QuasiModel2::Increment asserts a_j <= src_len)
+};
+
+inline size_t hash_value(const AlignmentObservation& o) {  // hook used by boost::hash<AlignmentObservation>; reads fields by value, no type punning
+  return ((size_t(o.src_len) * 1000003u + o.trg_len) * 1000003u + o.j) * 1000003u + o.a_j;
+}
+// Equality is field-wise: it must not be derived from hash_value, which may collide and whose width is platform-dependent.
+inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) {
+  return a.src_len == b.src_len && a.trg_len == b.trg_len && a.j == b.j && a.a_j == b.a_j;
+}
+
 struct QuasiModel2 {
   explicit QuasiModel2(double alpha, double pnull = 0.1) :
       alpha_(alpha),
       pnull_(pnull),
-      pnotnull_(1 - pnull),
-      z_(1000,1000) {}
+      pnotnull_(1 - pnull) {}
+
   // a_j = 0 => NULL; src_len does *not* include null
-  prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
+  prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
     if (!a_j) return pnull_;
-    std::vector<prob_t>& zv = z_(src_len, trg_len);
-    if (zv.size() == 0)
-      zv.resize(trg_len);
-    
-    prob_t& z = zv[j];
-    if (z.is_0()) z = ComputeZ(j, src_len, trg_len);
-
-    prob_t p;
-    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
-    p *= pnotnull_;
-    p /= z;
+    return pnotnull_ *
+       prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len));
+  }
+
+  void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {  // adds one to the count of this (src_len, trg_len, j, a_j) observation
+    assert(a_j <= src_len);  // a_j == 0 is NULL, so src_len itself is a valid value
+    assert(j < trg_len);
+    ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)];  // operator[] creates the entry with count 0 on first use
+  }
+
+  void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {  // removes one count; the entry is erased when it reaches 0
+    const AlignmentObservation ao(src_len, trg_len, j, a_j);
+    int &cc = obs_[ao];  // operator[] inserts a 0 entry if ao was never counted; the assert below flags that case
+    assert(cc > 0);
+    --cc;
+    if (!cc) obs_.erase(ao);
+  }
+
+  prob_t Likelihood() const {
+    return Likelihood(alpha_, pnull_.as_float());
+  }
+
+  prob_t Likelihood(double alpha, double ppnull) const {
+    const prob_t pnull(ppnull);
+    const prob_t pnotnull(1 - ppnull);
+
+    prob_t p = prob_t::One();
+    for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
+      const AlignmentObservation& ao = it->first;
+      if (ao.a_j) {
+        double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+        double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t pa(u / z);
+        pa *= pnotnull;
+        pa.poweq(it->second);
+        p *= pa;
+      } else {
+        p *= pnull.pow(it->second);
+      }
+    }
     return p;
   }
+
  private:
-  prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
-    prob_t p, z = prob_t::Zero();
-    for (int a_j = 1; a_j <= src_len; ++a_j) {
-      p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
-      z += p;
-    }
+  static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+  }
+
+  static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    double z = 0;
+    for (int a_j = 1; a_j <= src_len; ++a_j)
+      z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha);
     return z;
   }
+
+  const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {  // ComputeZ(j, src_len, trg_len, alpha_), memoized in zcache_
+    if (src_len >= zcache_.size())
+      zcache_.resize(src_len + 1);  // zcache_ is mutable, so this const method may grow it
+    if (trg_len >= zcache_[src_len].size())
+      zcache_[src_len].resize(trg_len + 1);
+    std::vector<double>& zv = zcache_[src_len][trg_len];
+    if (zv.size() == 0)
+      zv.resize(trg_len);  // one slot per target position, value-initialized to 0
+    double& z = zv[j];  // NOTE(review): unchecked index; relies on j < trg_len
+    if (!z)  // 0 marks a slot that has not been computed yet
+      z = ComputeZ(j, src_len, trg_len, alpha_);
+    return z;  // reference into zcache_: use it before the cache is resized or cleared
+  }
+
   double alpha_;
-  const prob_t pnull_;
-  const prob_t pnotnull_;
-  mutable Array2D<std::vector<prob_t> > z_;
+  prob_t pnull_;
+  prob_t pnotnull_;
+  mutable std::vector<std::vector<std::vector<double> > > zcache_;
+  typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount;
+  ObsCount obs_;
 };
 
 #endif
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
index 208fb9c7..5a262f9d 100644
--- a/gi/pf/tied_resampler.h
+++ b/gi/pf/tied_resampler.h
@@ -2,6 +2,7 @@
 #define _TIED_RESAMPLER_H_
 
 #include <set>
+#include <vector>
 #include "sampler.h"
 #include "slice_sampler.h"
 #include "m.h"
@@ -28,6 +29,10 @@ struct TiedResampler {
     crps.erase(crp);
   }
 
+  size_t size() const {  // number of CRP pointers currently held in crps
+    return crps.size();
+  }
+
   double LogLikelihood(double d, double s) const {
     if (s <= -d) return -std::numeric_limits<double>::infinity();
     double llh = Md::log_beta_density(d, d_alpha, d_beta) +
@@ -54,6 +59,7 @@ struct TiedResampler {
   };
 
   void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; }
     const DiscountResampler dr(*this);
     const AlphaResampler ar(*this);
     for (int iter = 0; iter < nloop; ++iter) {
@@ -79,4 +85,29 @@ struct TiedResampler {
   double discount, strength;
 };
 
+// Holds a fixed number of TiedResamplers ("bins"); the caller picks the bin index for each CRP it adds.
+template <class CRP>
+struct BinTiedResampler {
+  explicit BinTiedResampler(unsigned nbins) :
+      resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {}  // every bin is a copy of one TiedResampler built with (1,1,1,1); presumably d_alpha, d_beta, s_shape, s_rate (confirm against its constructor)
+
+  void Add(unsigned bin, CRP* crp) {  // NOTE(review): bin is not range-checked
+    resamplers[bin].Add(crp);
+  }
+
+  void Remove(unsigned bin, CRP* crp) {  // NOTE(review): bin is not range-checked
+    resamplers[bin].Remove(crp);
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {  // resamples each bin in index order, printing a "BIN i (n CRPs): " prefix first
+    for (unsigned i = 0; i < resamplers.size(); ++i) {
+      std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush;
+      resamplers[i].ResampleHyperparameters(rng);
+    }
+  }
+
+ private:
+  std::vector<TiedResampler<CRP> > resamplers;
+};
+
 #endif
-- 
cgit v1.2.3


From 280d5aa74b6a41f8f6deb5dd374140b7e3ab2703 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 14:10:04 -0500
Subject: do Bayesian inference on quasimodel2 hyperparameters

---
 gi/pf/align-lexonly-pyp.cc |  5 ++--
 gi/pf/pyp_lm.cc            |  2 +-
 gi/pf/pyp_tm.cc            | 11 +++++----
 gi/pf/quasi_model2.h       | 57 +++++++++++++++++++++++++++++++++++++++++++---
 gi/pf/tied_resampler.h     | 11 +++++++++
 5 files changed, 75 insertions(+), 11 deletions(-)

(limited to 'gi/pf')

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 68cb9192..6c054753 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -74,6 +74,7 @@ struct Aligner {
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
+    paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -216,9 +217,9 @@ int main(int argc, char** argv) {
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 7 == 6) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) aligner.ResampleHyperparameters();
     aligner.ResampleCorpus();
-    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+    if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 85635b8f..91029688 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -113,7 +113,7 @@ template <unsigned N> struct PYPLM {
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
       llh += it->second.log_crp_prob();
-    // TODO parametric likelihood from TiedResampler
+    llh += tr.LogLikelihood();
     return llh;
   }
 
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index bf5a6497..34ef0ba2 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -17,7 +17,7 @@ using namespace std::tr1;
 
 template <typename Base>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {}
+  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -29,8 +29,6 @@ struct ConditionalPYPWordModel {
   }
 
   void ResampleHyperparameters(MT19937* rng) {
-    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
-      it->second.resample_hyperparameters(rng);
     btr.ResampleHyperparameters(rng);
   } 
 
@@ -45,8 +43,11 @@ struct ConditionalPYPWordModel {
 
   void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
     RuleModelHash::iterator it = r.find(src);
-    if (it == r.end())
-      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+    if (it == r.end()) {
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
+      static const WordID kNULL = TD::Convert("NULL");
+      btr.Add(src == kNULL ? 0 : 1, &it->second);
+    }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
   }
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
index 8ec0a400..588c8f84 100644
--- a/gi/pf/quasi_model2.h
+++ b/gi/pf/quasi_model2.h
@@ -7,6 +7,8 @@
 #include "boost/functional.hpp"
 #include "prob.h"
 #include "array2d.h"
+#include "slice_sampler.h"
+#include "m.h"
 
 struct AlignmentObservation {
   AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
@@ -53,6 +55,37 @@ struct QuasiModel2 {
     if (!cc) obs_.erase(ao);
   }
 
+  struct PNullResampler {  // functor passed to slice_sampler1d in ResampleHyperparameters; scores a proposed p_null
+    PNullResampler(const QuasiModel2& m) : m_(m) {}
+    const QuasiModel2& m_;  // held by reference: the model must outlive this functor
+    double operator()(const double& proposed_pnull) const {
+      return log(m_.Likelihood(m_.alpha_, proposed_pnull));  // alpha is held at the model's current value
+    }
+  };
+
+  struct AlphaResampler {  // functor passed to slice_sampler1d in ResampleHyperparameters; scores a proposed alpha
+    AlphaResampler(const QuasiModel2& m) : m_(m) {}
+    const QuasiModel2& m_;  // held by reference: the model must outlive this functor
+    double operator()(const double& proposed_alpha) const {
+      return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float()));  // p_null is held at the model's current value
+    }
+  };
+
+  // Alternately slice-samples p_null and alpha for nloop sweeps, logs the result, then drops the cached normalizers.
+  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    const PNullResampler dr(*this);
+    const AlphaResampler ar(*this);
+    for (unsigned i = 0; i < nloop; ++i) {
+      const double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, 1.0, 0.0, niterations, 100*niterations);
+      pnull_ = prob_t(pnull);
+      pnotnull_ = prob_t(1 - pnull);  // Prob() multiplies by pnotnull_, so it has to track every change to pnull_
+      alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001,
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    }
+    std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" << pnull_.as_float() << ") = " << Likelihood() << std::endl;
+    zcache_.clear();  // cached Z values were computed with the previous alpha_
+  }
+
   prob_t Likelihood() const {
     return Likelihood(alpha_, pnull_.as_float());
   }
@@ -61,12 +94,17 @@ struct QuasiModel2 {
     const prob_t pnull(ppnull);
     const prob_t pnotnull(1 - ppnull);
 
-    prob_t p = prob_t::One();
+    prob_t p;
+    p.logeq(Md::log_gamma_density(alpha, 0.1, 25));  // TODO configure
+    assert(!p.is_0());
+    prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10));
+    assert(!prob_of_ppnull.is_0());
+    p *= prob_of_ppnull;
     for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
       const AlignmentObservation& ao = it->first;
       if (ao.a_j) {
-        double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
-        double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
         prob_t pa(u / z);
         pa *= pnotnull;
         pa.poweq(it->second);
@@ -79,6 +117,19 @@ struct QuasiModel2 {
   }
 
  private:
+  static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {  // prob_t counterpart of UnnormalizedProb
+    prob_t p;
+    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);  // a_j - 1 is unsigned arithmetic, so callers must pass a_j >= 1
+    return p;
+  }
+
+  static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {  // prob_t counterpart of ComputeZ
+    prob_t z = prob_t::Zero();
+    for (int a_j = 1; a_j <= src_len; ++a_j)  // sums over a_j = 1..src_len; a_j == 0 (NULL) is not included
+      z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha);
+    return z;
+  }
+
   static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
     return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
   }
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
index 5a262f9d..6f45fbce 100644
--- a/gi/pf/tied_resampler.h
+++ b/gi/pf/tied_resampler.h
@@ -42,6 +42,10 @@ struct TiedResampler {
     return llh;
   }
 
+  double LogLikelihood() const {  // LogLikelihood(d, s) evaluated at the stored discount and strength
+    return LogLikelihood(discount, strength);
+  }
+
   struct DiscountResampler {
     DiscountResampler(const TiedResampler& m) : m_(m) {}
     const TiedResampler& m_;
@@ -106,6 +110,13 @@ struct BinTiedResampler {
     }
   }
 
+  double LogLikelihood() const {  // sum of LogLikelihood() over all bins
+    double llh = 0;
+    for (unsigned i = 0; i < resamplers.size(); ++i)
+      llh += resamplers[i].LogLikelihood();
+    return llh;
+  }
+
  private:
   std::vector<TiedResampler<CRP> > resamplers;
 };
-- 
cgit v1.2.3


From a45af4a3704531a8382cd231f6445b3a33b598a3 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 16:42:12 -0500
Subject: frequency-based binning

---
 decoder/Makefile.am        |  1 -
 decoder/ff_csplit.cc       |  2 +-
 decoder/freqdict.cc        | 29 -----------------------------
 decoder/freqdict.h         | 37 ++++++++++++++++++++++++++++++++-----
 gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++-------
 gi/pf/make-freq-bins.pl    | 26 ++++++++++++++++++++++++++
 gi/pf/pyp_tm.cc            | 24 +++++++++++++++++-------
 gi/pf/pyp_tm.h             |  7 ++++---
 8 files changed, 97 insertions(+), 53 deletions(-)
 delete mode 100644 decoder/freqdict.cc
 create mode 100755 gi/pf/make-freq-bins.pl

(limited to 'gi/pf')

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
   ff_source_syntax.cc \
   ff_bleu.cc \
   ff_factory.cc \
-  freqdict.cc \
   lexalign.cc \
   lextrans.cc \
   tagger.cc \
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
   const int fl1_;
   const int fl2_;
   const int bad_;
-  FreqDict freq_dict_;
+  FreqDict<float> freq_dict_;
   set<WordID> bad_words_;
 };
 
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
-  cerr << "Reading word frequencies: " << fname << endl;
-  ReadFile rf(fname);
-  istream& ifs = *rf.stream();
-  int cc=0;
-  while (ifs) {
-    std::string word;
-    ifs >> word;
-    if (word.size() == 0) continue;
-    if (word[0] == '#') continue;
-    double count = 0;
-    ifs >> count;
-    assert(count > 0.0);  // use -log(f)
-    counts_[TD::Convert(word)]=count;
-    ++cc;
-    if (cc % 10000 == 0) { std::cerr << "."; }
-  }
-  std::cerr << "\n";
-  std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+template <typename T = float>  // T: numeric type of the per-word statistic
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID,float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}  // max_ is value-initialized (zero for arithmetic T)
+  T Max() const { return max_; }  // largest value seen by Load so far
+  void Load(const std::string& fname) {  // reads "word value" pairs from fname
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    // Loop on the extraction itself: a failed read at EOF leaves `word`
+    // untouched, so testing only the stream re-stores the last word as 0.
+    while (ifs >> word) {
+      if (word[0] == '#') continue;  // skips only this token, not the rest of the line
+      T count = 0;
+      ifs >> count;
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+  // Returns the value loaded for word, or T() when word was never loaded.
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;
+  std::map<WordID, T> counts_;
 };
 
 #endif
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
         ("input,i",po::value<string>(),"Read parallel data from")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
 };
 
 struct Aligner {
-  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+  Aligner(const vector<vector<WordID> >& lets,
+          int num_letters,
+          const po::variables_map& conf,
+          vector<AlignedSentencePair>* c) :
       corpus(*c),
-      paj_model(4, 0.08),
+      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+      infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
       model(lets, num_letters),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
 
   vector<AlignedSentencePair>& corpus;
   QuasiModel2 paj_model;
+  const bool infer_paj;
   PYPLexicalTranslation model;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
-    paj_model.ResampleHyperparameters(prng);
+    if (infer_paj) paj_model.ResampleHyperparameters(prng);  // otherwise the configured align_alpha / p_null stay fixed
   }
 
   void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
         paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood()
-         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Aligner aligner(letters, letset.size(), &corpus);
+  Aligner aligner(letters, letset.size(), conf, &corpus);
   aligner.InitializeRandom();
 
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 10 == 9) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) {
+      aligner.ResampleHyperparameters();
+      cerr << "LLH = " << aligner.Likelihood() << "    \t(Amodel=" << aligner.paj_model.Likelihood()
+           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+    }
     aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+# Reads whitespace-tokenized text and prints one "word bin" line per word type.
+my $BASE = 6;     # base of the logarithm applied to the relative frequency
+my $CUTOFF = 3;   # offset added before negation; negative bins are clamped to 0
+
+my %d;            # word -> number of occurrences
+my $num = 0;      # total number of tokens read
+while(<>){
+ chomp;
+ my @words = split ' ';  # awk-style split: leading whitespace yields no empty token
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+# Most frequent first; ties are broken by spelling so the output is reproducible.
+my @vocab = sort {$d{$b} <=> $d{$a} or $a cmp $b} keys %d;
+
+for my $w (@vocab) {
+  my $most = $d{$w};
+  # bin = -int(log_BASE(count / total) + CUTOFF); frequent words fall in bin 0
+
+  my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF);
+  if ($nl < 0) { $nl = 0; }
+  print "$w $nl\n";
+}
+
+
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 34ef0ba2..e21f0267 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -4,9 +4,6 @@
 #include <iostream>
 #include <queue>
 
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
@@ -15,9 +12,19 @@
 using namespace std;
 using namespace std::tr1;
 
-template <typename Base>
+struct FreqBinner {  // looks up a per-word bin index loaded from a FreqDict file
+  FreqBinner(const std::string& fname) { fd_.Load(fname); }  // fname holds "word bin" pairs
+  unsigned NumberOfBins() const { return fd_.Max() + 1; }  // largest loaded bin + 1 (assumes bins start at 0)
+  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }  // words not in the file get 0 (LookUp's T())
+  FreqDict<unsigned> fd_;
+};
+
+template <typename Base, class Binner = FreqBinner>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
+  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :  // bnr == NULL: no frequency binning
+      base(*b),
+      binner(bnr),
+      btr(binner ? binner->NumberOfBins() + 1u : 2u) {}  // +1: bin 0 is kept for NULL, word bins are shifted up by one
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -46,7 +53,9 @@ struct ConditionalPYPWordModel {
     if (it == r.end()) {
       it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
       static const WordID kNULL = TD::Convert("NULL");
-      btr.Add(src == kNULL ? 0 : 1, &it->second);
+      unsigned bin = (src == kNULL ? 0 : 1);
+      if (binner && bin) { bin = binner->Bin(src) + 1; }
+      btr.Add(bin, &it->second);
     }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
@@ -75,6 +84,7 @@ struct ConditionalPYPWordModel {
 
   // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  const Binner* binner;
   BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
@@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
                                              const unsigned num_letters) :
     letters(lets),
     up0(new PYPWordModel(num_letters)),
-    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
     kX(-TD::Convert("X")) {}
 
 void PYPLexicalTranslation::Summary() const {
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
index fa0fb28f..63e7c96d 100644
--- a/gi/pf/pyp_tm.h
+++ b/gi/pf/pyp_tm.h
@@ -5,10 +5,11 @@
 #include "wordid.h"
 #include "prob.h"
 #include "sampler.h"
+#include "freqdict.h"
 
-struct TRule;
+struct FreqBinner;
 struct PYPWordModel;
-template <typename T> struct ConditionalPYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
 
 struct PYPLexicalTranslation {
   explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
@@ -26,7 +27,7 @@ struct PYPLexicalTranslation {
  private:
   const std::vector<std::vector<WordID> >& letters;   // spelling dictionary
   PYPWordModel* up0;  // base distribuction (model English word)
-  ConditionalPYPWordModel<PYPWordModel>* tmodel;  // translation distributions
+  ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel;  // translation distributions
                       // (model English word | French word)
   const WordID kX;
 };
-- 
cgit v1.2.3