From ee84ab027c0be54800cac0c9bff62dd097354f6d Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Wed, 12 Oct 2011 14:57:15 +0100
Subject: model length properly, clean up

---
 gi/pf/Makefile.am      |   2 +-
 gi/pf/corpus.cc        |  57 ++++++++++++++++++++++++
 gi/pf/corpus.h         |  19 ++++++++
 gi/pf/dpnaive.cc       |  95 +++++++++++-----------------------------
 gi/pf/monotonic_pseg.h |  88 +++++++++++++++++++++++++++++++++++++
 gi/pf/pfnaive.cc       | 116 +++++--------------------------------
 6 files changed, 202 insertions(+), 175 deletions(-)
 create mode 100644 gi/pf/corpus.cc
 create mode 100644 gi/pf/corpus.h
 create mode 100644 gi/pf/monotonic_pseg.h

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index c9764ad5..42758939 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
 
 noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc
 
 itg_SOURCES = itg.cc

diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
new file mode 100644
index 00000000..a408e7cf
--- /dev/null
+++ b/gi/pf/corpus.cc
@@ -0,0 +1,57 @@
+#include "corpus.h"
+
+#include <set>
+#include <vector>
+#include <string>
+
+#include "tdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+namespace corpus {
+
+void ReadParallelCorpus(const string& filename,
+                vector<vector<WordID> >* f,
+                vector<vector<WordID> >* e,
+                set<WordID>* vocab_f,
+                set<WordID>* vocab_e) {
+  f->clear();
+  e->clear();
+  vocab_f->clear();
+  vocab_e->clear();
+  ReadFile rf(filename);
+  istream* in = rf.stream();
+  assert(*in);
+  string line;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  while(*in) {
+    getline(*in, line);
+    if (line.empty() && !*in) break;
+    e->push_back(vector<int>());
+    f->push_back(vector<int>());
+    vector<int>& le = e->back();
+    vector<int>& lf = f->back();
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    bool isf = true;
+    for (unsigned i = 0; i < tmp.size(); ++i) {
+      const int cur = tmp[i];
+      if (isf) {
+        if (kDIV == cur) { isf = false; } else {
+          lf.push_back(cur);
+          vocab_f->insert(cur);
+        }
+      } else {
+        assert(cur != kDIV);
+        le.push_back(cur);
+        vocab_e->insert(cur);
+      }
+    }
+    assert(isf == false);
+  }
+}
+
+}

diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h
new file mode 100644
index 00000000..e7febdb7
--- /dev/null
+++ b/gi/pf/corpus.h
@@ -0,0 +1,19 @@
+#ifndef _CORPUS_H_
+#define _CORPUS_H_
+
+#include <string>
+#include <vector>
+#include <set>
+#include "wordid.h"
+
+namespace corpus {
+
+void ReadParallelCorpus(const std::string& filename,
+                std::vector<std::vector<WordID> >* f,
+                std::vector<std::vector<WordID> >* e,
+                std::set<WordID>* vocab_f,
+                std::set<WordID>* vocab_e);
+
+}
+
+#endif

diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index 608f73d5..c926487b 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -7,12 +7,14 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "base_measures.h"
+#include "monotonic_pseg.h"
 #include "trule.h"
 #include "tdict.h"
 #include "filelib.h"
 #include "dict.h"
 #include "sampler.h"
 #include "ccrp_nt.h"
+#include "corpus.h"
 
 using namespace std;
 using namespace std::tr1;
@@ -52,57 +54,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-void ReadParallelCorpus(const string& filename,
-                vector<vector<int> >* f,
-                vector<vector<int> >* e,
-                set<int>* vocab_e,
-                set<int>* vocab_f) {
-  f->clear();
-  e->clear();
-  vocab_f->clear();
-  vocab_e->clear();
-  istream* in;
-  if (filename == "-")
-    in = &cin;
-  else
-    in = new ifstream(filename.c_str());
-  assert(*in);
-  string line;
-  const WordID kDIV = TD::Convert("|||");
-  vector<WordID> tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
-    e->push_back(vector<int>());
-    f->push_back(vector<int>());
-    vector<int>& le = e->back();
-    vector<int>& lf = f->back();
-    tmp.clear();
-    TD::ConvertSentence(line, &tmp);
-    bool isf = true;
-    for (unsigned i = 0; i < tmp.size(); ++i) {
-      const int cur = tmp[i];
-      if (isf) {
-        if (kDIV == cur) { isf = false; } else {
-          lf.push_back(cur);
-          vocab_f->insert(cur);
-        }
-      } else {
-        assert(cur != kDIV);
-        le.push_back(cur);
-        vocab_e->insert(cur);
-      }
-    }
-    assert(isf == false);
-  }
-  if (in != &cin) delete in;
-}
-
 shared_ptr<MT19937> prng;
 
 template <typename Base>
 struct ModelAndData {
-  explicit ModelAndData(const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+  explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+      model(m),
       rng(&*prng),
       p0(b),
       baseprob(prob_t::One()),
@@ -110,14 +67,12 @@ struct ModelAndData {
       corpusf(cf),
       vocabe(ve),
       vocabf(vf),
-      rules(1,1),
       mh_samples(),
       mh_rejects(),
       kX(-TD::Convert("X")),
       derivations(corpuse.size()) {}
 
   void ResampleHyperparameters() {
-    rules.resample_hyperparameters(&*prng);
   }
 
   void InstantiateRule(const pair<short,short>& from,
@@ -139,12 +94,10 @@ struct ModelAndData {
     TRule x;
     for (int i = 1; i < d.size(); ++i) {
       InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      //cerr << "REMOVE: " << x.AsString() << endl;
-      if (rules.decrement(x)) {
-        baseprob /= p0(x);
-        //cerr << "  (REMOVED ONLY INSTANCE)\n";
-      }
+      model.DecrementRule(x);
+      model.DecrementContinue();
     }
+    model.DecrementStop();
   }
 
   void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
@@ -161,39 +114,38 @@
     TRule x;
     for (int i = 1; i < d.size(); ++i) {
       InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      if (rules.increment(x)) {
-        baseprob *= p0(x);
-      }
+      model.IncrementRule(x);
+      model.IncrementContinue();
     }
+    model.IncrementStop();
   }
 
   prob_t Likelihood() const {
-    prob_t p;
-    p.logeq(rules.log_crp_prob());
-    return p * baseprob;
+    return model.Likelihood();
   }
 
   prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
-    prob_t p = prob_t::One();
+    prob_t p = model.StopProbability();
     if (d.size() < 2) return p;
     TRule x;
+    const prob_t p_cont = model.ContinueProbability();
     for (int i = 1; i < d.size(); ++i) {
       InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      prob_t rp; rp.logeq(rules.logprob(x, log(p0(x))));
-      p *= rp;
+      p *= p_cont;
+      p *= model.RuleProbability(x);
     }
     return p;
   }
 
   void Sample();
 
+  MonotonicParallelSegementationModel& model;
   MT19937* rng;
   const Base& p0;
   prob_t baseprob; // cached value of generating the table table labels from p0
                    // this can't be used if we go to a hierarchical prior!
   const vector<vector<int> >& corpuse, corpusf;
   const set<int>& vocabe, vocabf;
-  CCRP_NoTable<TRule> rules;
   unsigned mh_samples, mh_rejects;
   const int kX;
   vector<vector<pair<short,short> > > derivations;
@@ -201,8 +153,8 @@
 
 template <typename Base>
 void ModelAndData<Base>::Sample() {
-  unsigned MAXK = 4;
-  unsigned MAXL = 4;
+  unsigned MAXK = kMAX_SRC_PHRASE;
+  unsigned MAXL = kMAX_TRG_PHRASE;
   TRule x;
   x.lhs_ = -TD::Convert("X");
   for (int samples = 0; samples < 1000; ++samples) {
@@ -228,6 +180,8 @@ void ModelAndData<Base>::Sample() {
     boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
     boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
     a[0][0] = prob_t::One();
+    const prob_t q_stop = model.StopProbability();
+    const prob_t q_cont = model.ContinueProbability();
     for (int i = 0; i < sentf.size(); ++i) {
       for (int j = 0; j < sente.size(); ++j) {
         const prob_t src_a = a[i][j];
@@ -239,7 +193,9 @@ void ModelAndData<Base>::Sample() {
           for (int l = 1; l <= MAXL; ++l) {
             if (j + l > sente.size()) break;
            x.e_.push_back(sente[j + l - 1]);
-            trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x))));
+            const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size());
+            const prob_t& cp = stop_now ? q_stop : q_cont;
+            trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp;
             a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
           }
         }
@@ -319,7 +275,7 @@ int main(int argc, char** argv) {
 
   vector<vector<int> > corpuse, corpusf;
   set<int> vocabe, vocabf;
-  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
   cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
   cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
   cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
@@ -328,8 +284,9 @@ int main(int argc, char** argv) {
 
   Model1 m1(conf["model1"].as<string>());
   PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+  MonotonicParallelSegementationModel m(lp0);
 
-  ModelAndData<PhraseJointBase> posterior(lp0, corpuse, corpusf, vocabe, vocabf);
+  ModelAndData<PhraseJointBase> posterior(m, lp0, corpuse, corpusf, vocabe, vocabf);
   posterior.Sample();
 
   return 0;

diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
new file mode 100644
index 00000000..7e6af3fc
--- /dev/null
+++ b/gi/pf/monotonic_pseg.h
@@ -0,0 +1,88 @@
+#ifndef _MONOTONIC_PSEG_H_
+#define _MONOTONIC_PSEG_H_
+
+#include <vector>
+
+#include "prob.h"
+#include "ccrp_nt.h"
+#include "trule.h"
+#include "base_measures.h"
+
+struct MonotonicParallelSegementationModel {
+  explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) :
+    rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {}
+
+  void DecrementRule(const TRule& rule) {
+    if (rules.decrement(rule))
+      base /= rp0(rule);
+  }
+
+  void IncrementRule(const TRule& rule) {
+    if (rules.increment(rule))
+      base *= rp0(rule);
+  }
+
+  void IncrementRulesAndStops(const std::vector<const TRule*>& rules) {
+    for (int i = 0; i < rules.size(); ++i)
+      IncrementRule(*rules[i]);
+    if (rules.size()) IncrementContinue(rules.size() - 1);
+    IncrementStop();
+  }
+
+  void DecrementRulesAndStops(const std::vector<const TRule*>& rules) {
+    for (int i = 0; i < rules.size(); ++i)
+      DecrementRule(*rules[i]);
+    if (rules.size()) {
+      DecrementContinue(rules.size() - 1);
+      DecrementStop();
+    }
+  }
+
+  prob_t RuleProbability(const TRule& rule) const {
+    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
+    return p;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = base;
+    prob_t q; q.logeq(rules.log_crp_prob());
+    p *= q;
+    q.logeq(stop.log_crp_prob());
+    p *= q;
+    return p;
+  }
+
+  void IncrementStop() {
+    stop.increment(true);
+  }
+
+  void IncrementContinue(int n = 1) {
+    for (int i = 0; i < n; ++i)
+      stop.increment(false);
+  }
+
+  void DecrementStop() {
+    stop.decrement(true);
+  }
+
+  void DecrementContinue(int n = 1) {
+    for (int i = 0; i < n; ++i)
+      stop.decrement(false);
+  }
+
+  prob_t StopProbability() const {
+    return prob_t(stop.prob(true, 0.5));
+  }
+
+  prob_t ContinueProbability() const {
+    return prob_t(stop.prob(false, 0.5));
+  }
+
+  const PhraseJointBase& rp0;
+  prob_t base;
+  CCRP_NoTable<TRule> rules;
+  CCRP_NoTable<bool> stop;
+};
+
+#endif

diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index c30e7c4f..33dc08c3 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -7,6 +7,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "base_measures.h"
+#include "monotonic_pseg.h"
 #include "reachability.h"
 #include "viterbi.h"
 #include "hg.h"
@@ -17,6 +18,7 @@
 #include "sampler.h"
 #include "ccrp_nt.h"
 #include "ccrp_onetable.h"
+#include "corpus.h"
 
 using namespace std;
 using namespace tr1;
@@ -58,101 +60,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-void ReadParallelCorpus(const string& filename,
-                vector<vector<WordID> >* f,
-                vector<vector<WordID> >* e,
-                set<WordID>* vocab_f,
-                set<WordID>* vocab_e) {
-  f->clear();
-  e->clear();
-  vocab_f->clear();
-  vocab_e->clear();
-  istream* in;
-  if (filename == "-")
-    in = &cin;
-  else
-    in = new ifstream(filename.c_str());
-  assert(*in);
-  string line;
-  const WordID kDIV = TD::Convert("|||");
-  vector<WordID> tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
-    e->push_back(vector<int>());
-    f->push_back(vector<int>());
-    vector<int>& le = e->back();
-    vector<int>& lf = f->back();
-    tmp.clear();
-    TD::ConvertSentence(line, &tmp);
-    bool isf = true;
-    for (unsigned i = 0; i < tmp.size(); ++i) {
-      const int cur = tmp[i];
-      if (isf) {
-        if (kDIV == cur) { isf = false; } else {
-          lf.push_back(cur);
-          vocab_f->insert(cur);
-        }
-      } else {
-        assert(cur != kDIV);
-        le.push_back(cur);
-        vocab_e->insert(cur);
-      }
-    }
-    assert(isf == false);
-  }
-  if (in != &cin) delete in;
-}
-
-struct MyJointModel {
-  MyJointModel(PhraseJointBase& rcp0) :
-    rp0(rcp0), base(prob_t::One()), rules(1,1) {}
-
-  void DecrementRule(const TRule& rule) {
-    if (rules.decrement(rule))
-      base /= rp0(rule);
-  }
-
-  void IncrementRule(const TRule& rule) {
-    if (rules.increment(rule))
-      base *= rp0(rule);
-  }
-
-  void IncrementRules(const vector<const TRule*>& rules) {
-    for (int i = 0; i < rules.size(); ++i)
-      IncrementRule(*rules[i]);
-  }
-
-  void DecrementRules(const vector<const TRule*>& rules) {
-    for (int i = 0; i < rules.size(); ++i)
-      DecrementRule(*rules[i]);
-  }
-
-  prob_t RuleProbability(const TRule& rule) const {
-    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
-    return p;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = base;
-    prob_t q; q.logeq(rules.log_crp_prob());
-    p *= q;
-    for (unsigned l = 1; l < src_jumps.size(); ++l) {
-      if (src_jumps[l].num_customers() > 0) {
-        prob_t q;
-        q.logeq(src_jumps[l].log_crp_prob());
-        p *= q;
-      }
-    }
-    return p;
-  }
-
-  const PhraseJointBase& rp0;
-  prob_t base;
-  CCRP_NoTable<TRule> rules;
-  vector<CCRP_NoTable<int> > src_jumps;
-};
-
 struct BackwardEstimateSym {
   BackwardEstimateSym(const Model1& m1,
                       const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
@@ -264,7 +171,7 @@ int main(int argc, char** argv) {
   vector<vector<WordID> > corpuse, corpusf;
   set<WordID> vocabe, vocabf;
   cerr << "Reading corpus...\n";
-  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
   cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
   assert(corpusf.size() == corpuse.size());
@@ -273,13 +180,8 @@ int main(int argc, char** argv) {
 
   Model1 m1(conf["model1"].as<string>());
   Model1 invm1(conf["inverse_model1"].as<string>());
-#if 0
-  PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
-  MyConditionalModel m(lp0);
-#else
   PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
-  MyJointModel m(lp0);
-#endif
+  MonotonicParallelSegementationModel m(lp0);
 
   cerr << "Initializing reachability limits...\n";
   vector<Particle> ps(corpusf.size());
@@ -296,7 +198,10 @@ int main(int argc, char** argv) {
   for (int ci = 0; ci < corpusf.size(); ++ci) {
     vector<WordID>& src = corpusf[ci];
     vector<WordID>& trg = corpuse[ci];
-    m.DecrementRules(ps[ci].rules);
+    m.DecrementRulesAndStops(ps[ci].rules);
+    const prob_t q_stop = m.StopProbability();
+    const prob_t q_cont = m.ContinueProbability();
+    cerr << "P(stop)=" << q_stop << "\tP(continue)=" << q_cont << endl;
     for (int i = 0; i < ps[ci].rules.size(); ++i) { cerr << "  " << ps[ci].rules[i]->AsString() << "\n"; }
     cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
   }
--
cgit v1.2.3
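
What "model length properly" buys: the old rules-only CRP assigned no cost to the
number of phrase pairs in a derivation, so longer segmentations were never penalized.
MonotonicParallelSegementationModel adds one continue/stop decision per segment from a
shared Bernoulli CRP, so a derivation with n phrase pairs pays (n-1) continue draws plus
one final stop draw, making segment count an explicit, roughly geometric part of the
model. The following is a minimal standalone sketch of that length model, not code from
this patch: plain smoothed counts stand in for CCRP_NoTable<bool>, and the 0.5 base
probability mirrors the stop.prob(true, 0.5) call in monotonic_pseg.h.

    #include <cmath>
    #include <cstdio>

    // One Bernoulli "stop" process shared across all sentences, as in
    // monotonic_pseg.h: a derivation with n phrase pairs pays (n-1)
    // continue draws and one final stop draw.
    struct StopModel {
      int stops, continues;  // draws observed so far
      double alpha;          // concentration-style smoothing weight
      double p0_stop;        // base probability of stopping (0.5 above)
      StopModel() : stops(0), continues(0), alpha(1.0), p0_stop(0.5) {}
      double StopProb() const {
        return (stops + alpha * p0_stop) / (stops + continues + alpha);
      }
      double ContinueProb() const { return 1.0 - StopProb(); }
      // analogue of IncrementRulesAndStops for a derivation of n segments
      void Observe(int n) { continues += n - 1; stops += 1; }
      // score the length model assigns to an n-segment derivation
      double LogLengthProb(int n) const {
        return (n - 1) * std::log(ContinueProb()) + std::log(StopProb());
      }
    };

    int main() {
      StopModel m;
      const int lengths[] = {3, 5, 2, 4};  // hypothetical derivation sizes
      for (int i = 0; i < 4; ++i) m.Observe(lengths[i]);
      for (int n = 1; n <= 6; ++n)
        std::printf("n=%d  log P(length=n) = %.4f\n", n, m.LogLengthProb(n));
      return 0;
    }

Note that LogLengthProb scores every boundary with the current predictive probability
rather than updating counts draw by draw; this mirrors DerivationProposalProbability in
dpnaive.cc, which likewise reuses a single cached ContinueProbability() for all segments
of a proposal.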