Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2011-11-03 00:38:49 +0100
committer: Patrick Simianer <p@simianer.de> 2011-11-03 00:38:49 +0100
commit: bcfb20c1e1e9434ae04a30240859db6a353457d0 (patch)
tree: 4611721111f4fef5bd132ed3236b820c4e16fa9c /gi
parent: 4df1aea9e6edce1c79059d3310d55947be0a302e (diff)
parent: 6de8f58cd13813bf33af4903bf386439683c0fd6 (diff)
7 files changed, 209 insertions, 38 deletions
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index f8ddfd32..8adb37d7 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -89,6 +89,62 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
   return p;
 }
 
+prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
+                                 const vector<WordID>& vtrg,
+                                 int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;     // number of source words from start_src to the end
+  const int elen = vtrg.size() - start_trg;     // number of target words from start_trg to the end
+  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));
+  prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));
+  // Returns 0.5 * (p1 + p2). p1: source side first, then target side given it via model1.
+  prob_t p1;
+  p1.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1)
+                                                 // elen | flen          ~Pois(flen + 0.01)
+  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01));
+  p1 *= ptrglen;
+  p1 *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform
+  for (int i = 0; i < elen; ++i) {               // for each position i in E
+    const WordID trg = vtrg[i + start_trg];
+    prob_t tp = prob_t::Zero();
+    for (int j = -1; j < flen; ++j) {            // j == -1 pairs trg with WordID 0 (presumably NULL -- confirm)
+      const WordID src = j < 0 ? 0 : vsrc[j + start_src];
+      tp += kM1MIXTURE * model1(src, trg);
+      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+    }
+    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform
+    p1 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform
+  }
+  if (p1.is_0()) {                               // a zero probability is treated as fatal
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+  // p2: the mirror image -- target side first, then source side given it via invmodel1.
+  prob_t p2;
+  p2.logeq(log_poisson(elen, 1.0));               // elen                 ~Pois(1)
+                                                 // flen | elen          ~Pois(elen + 0.01)
+  prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01));
+  p2 *= psrclen;
+  p2 *= kUNIFORM_TARGET.pow(elen);                // each e in E ~Uniform
+  for (int i = 0; i < flen; ++i) {               // for each position i in F
+    const WordID src = vsrc[i + start_src];
+    prob_t tp = prob_t::Zero();
+    for (int j = -1; j < elen; ++j) {            // j == -1 pairs src with WordID 0 (presumably NULL -- confirm)
+      const WordID trg = j < 0 ? 0 : vtrg[j + start_trg];
+      tp += kM1MIXTURE * invmodel1(trg, src);
+      tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE;
+    }
+    tp *= uniform_trg_alignment;                 //     draw a_i         ~uniform
+    p2 *= tp;                                     //     draw f_i         ~InvModel1(e_a_i) / uniform
+  }
+  if (p2.is_0()) {                               // a zero probability is treated as fatal
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+  // Combine the two directions with equal weight.
+  static const prob_t kHALF(0.5);
+  return (p1 + p2) * kHALF;
+}
+
 JumpBase::JumpBase() : p(200) {
   for (unsigned src_len = 1; src_len < 200; ++src_len) {
     map<int, prob_t>& cpd = p[src_len];
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index df17aa62..7ce7e2e6 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -97,6 +97,37 @@ struct PhraseJointBase {
   const prob_t kUNIFORM_TARGET;
 };
 
+// Joint base measure over phrase pairs built from two Model1 tables; p0() is defined out of line.
+struct PhraseJointBase_BiDir {
+  explicit PhraseJointBase_BiDir(const Model1& m1,
+                                 const Model1& im1,
+                                 const double m1mixture,
+                                 const unsigned vocab_e_size, const unsigned vocab_f_size) :
+      model1(m1),
+      invmodel1(im1),
+      kM1MIXTURE(m1mixture),
+      kUNIFORM_MIXTURE(1.0 - m1mixture),
+      kUNIFORM_SOURCE(1.0 / vocab_f_size),
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+    assert(vocab_e_size > 0 && vocab_f_size > 0);  // both sizes are used as divisors above
+  }
+
+  // return the joint p0 of the phrase pair (rule.f_, rule.e_), taken from position 0 on both sides
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+  const Model1& model1;     // stored by reference: the caller's Model1 must outlive this object
+  const Model1& invmodel1;  // stored by reference as well; used for the reverse direction
+  const prob_t kM1MIXTURE;  // Model 1 mixture component
+  const prob_t kUNIFORM_MIXTURE; // uniform mixture component (1 - m1mixture)
+  const prob_t kUNIFORM_SOURCE;  // 1 / vocab_f_size
+  const prob_t kUNIFORM_TARGET;  // 1 / vocab_e_size
+};
+
 // base distribution for jump size multinomials
 // basically p(0) = 0 and then, p(1) is max, and then
 // you drop as you move to the max jump distance
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index c926487b..db1c43c7 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -31,6 +31,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
         ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
         ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in base distribution)")
         ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -58,7 +59,7 @@ shared_ptr<MT19937> prng;
 
 template <typename Base>
 struct ModelAndData {
-  explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+  explicit ModelAndData(MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
      model(m),
      rng(&*prng),
      p0(b),
@@ -139,7 +140,7 @@ struct ModelAndData {
 
   void Sample();
 
-  MonotonicParallelSegementationModel& model;
+  MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& model;
   MT19937* rng;
   const Base& p0;
   prob_t baseprob; // cached value of generating the table table labels from p0
@@ -267,6 +268,10 @@ int main(int argc, char** argv) {
     cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
     return 1;
   }
+  if (!conf.count("inverse_model1")) {
+    cerr << argv[0] << "Please use --inverse_model1 to specify inverse model 1 parameters\n";
+    return 1;
+  }
   if (conf.count("random_seed"))
     prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
   else
@@ -283,10 +288,12 @@ int main(int argc, char** argv) {
   assert(corpusf.size() == corpuse.size());
 
   Model1 m1(conf["model1"].as<string>());
-  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
-  MonotonicParallelSegementationModel m(lp0);
+  Model1 invm1(conf["inverse_model1"].as<string>());
+//  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+  PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+  MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
 
-  ModelAndData<PhraseJointBase> posterior(m, lp0, corpuse, corpusf, vocabe, vocabf);
+  ModelAndData<PhraseJointBase_BiDir> posterior(m, alp0, corpuse, corpusf, vocabe, vocabf);
   posterior.Sample();
 
   return 0;
diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
index 7e6af3fc..301aa6d8 100644
--- a/gi/pf/monotonic_pseg.h
+++ b/gi/pf/monotonic_pseg.h
@@ -8,8 +8,9 @@
 #include "trule.h"
 #include "base_measures.h"
 
+template <typename BaseMeasure>
 struct MonotonicParallelSegementationModel {
-  explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) :
+  explicit MonotonicParallelSegementationModel(BaseMeasure& rcp0) :
     rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {}
 
   void DecrementRule(const TRule& rule) {
@@ -78,7 +79,7 @@ struct MonotonicParallelSegementationModel {
     return prob_t(stop.prob(false, 0.5));
   }
 
-  const PhraseJointBase& rp0;
+  const BaseMeasure& rp0;
   prob_t base;
   CCRP_NoTable<TRule> rules;
   CCRP_NoTable<bool> stop;
diff --git a/gi/pf/pf.h b/gi/pf/pf.h
new file mode 100644
index 00000000..ede7cda8
--- /dev/null
+++ b/gi/pf/pf.h
@@ -0,0 +1,84 @@
+#ifndef _PF_H_
+#define _PF_H_
+
+#include <cassert>
+#include <vector>
+#include "sampler.h"
+#include "prob.h"
+
+template <typename ParticleType>
+struct ParticleRenormalizer {
+  // Divides every weight in *pv by the sum of all weights; an empty vector is left alone.
+  void operator()(std::vector<ParticleType>* pv) const {
+    std::vector<ParticleType>& particles = *pv;
+    if (particles.empty()) return;
+    prob_t total = prob_t::Zero();
+    for (unsigned k = 0; k < particles.size(); ++k) total += particles[k].weight;
+    assert(total > prob_t::Zero());  // cannot rescale a set whose weights sum to zero
+    for (unsigned k = 0; k < particles.size(); ++k) particles[k].weight /= total;
+  }
+};
+
+// Resamples *pv in place: N indices are drawn with rng->SelectSample over the particle
+// weights (presumably proportional to weight -- confirm), each survivor gets weight 1/N.
+template <typename ParticleType>
+struct MultinomialResampleFilter {
+  explicit MultinomialResampleFilter(MT19937* rng) : rng_(rng) {}
+  void operator()(std::vector<ParticleType>* pv) {
+    if (pv->empty()) return;
+    std::vector<ParticleType>& ps = *pv;
+    const unsigned n = ps.size();  // unsigned count avoids signed/unsigned loop comparisons
+    SampleSet<prob_t> ss;
+    for (unsigned i = 0; i < n; ++i) ss.add(ps[i].weight);
+    std::vector<ParticleType> nps; nps.reserve(n);
+    const prob_t uniform_weight(1.0 / n);
+    for (unsigned i = 0; i < n; ++i) {
+      nps.push_back(ps[rng_->SelectSample(ss)]);
+      nps.back().weight = uniform_weight;  // the copy just appended
+    }
+    nps.swap(ps);
+  }
+ private:
+  MT19937* rng_;  // not owned
+};
+
+// Systematic resampling: a single draw u_0 = rng->next() / N (next() presumably uniform on
+// [0,1) -- confirm) fixes N evenly spaced points; particle i gets one child per point in its slice.
+// NOTE(review): children keep their parent's renormalized weight, whereas
+// MultinomialResampleFilter resets every weight to 1/N -- confirm this is intended.
+template <typename ParticleType>
+struct SystematicResampleFilter {
+  explicit SystematicResampleFilter(MT19937* rng) : rng_(rng), renorm_() {}
+
+  void operator()(std::vector<ParticleType>* pv) {
+    if (pv->empty()) return;
+    renorm_(pv);  // weights must sum to one for the cumulative walk below
+    std::vector<ParticleType>& ps = *pv;
+    const unsigned n = ps.size();
+    std::vector<ParticleType> nps; nps.reserve(n);
+    const double skip = 1.0 / n;
+    double u_j = rng_->next() * skip;
+    double lower = 0, upper = 0;
+    for (unsigned i = 0; i < n; ++i) {
+      upper += ps[i].weight.as_float();
+      // one child of ps[i] for every sampling point inside [lower, upper]; the size cap
+      // keeps rounding in the cumulative sum from ever producing more than n children
+      while (nps.size() < n && u_j >= lower && u_j <= upper) {
+        nps.push_back(ps[i]);
+        u_j += skip;
+      }
+      lower = upper;
+    }
+    // rounding can also leave the last point(s) just beyond the final cumulative sum;
+    // give them to the last selected particle instead of returning a short population
+    while (nps.size() < n) nps.push_back(nps.empty() ? ps.back() : ParticleType(nps.back()));
+    assert(ps.size() == nps.size());
+    ps.swap(nps);
+  }
+
+ private:
+  MT19937* rng_;  // not owned
+  ParticleRenormalizer<ParticleType> renorm_;
+};
+
+#endif
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
index 81abd61b..aae5f798 100644
--- a/gi/pf/pfdist.cc
+++ b/gi/pf/pfdist.cc
@@ -6,6 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "pf.h"
 #include "base_measures.h"
 #include "reachability.h"
 #include "viterbi.h"
@@ -413,20 +414,6 @@ ostream& operator<<(ostream& o, const Particle& p) {
   return o;
 }
 
-void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
-  vector<Particle>& ps = *pps;
-  SampleSet<prob_t> ss;
-  for (int i = 0; i < ps.size(); ++i)
-    ss.add(ps[i].weight);
-  vector<Particle> nps; nps.reserve(ps.size());
-  const prob_t uniform_weight(1.0 / ps.size());
-  for (int i = 0; i < ps.size(); ++i) {
-    nps.push_back(ps[prng->SelectSample(ss)]);
-    nps[i].weight = uniform_weight;
-  }
-  nps.swap(ps);
-}
-
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -466,6 +453,7 @@ int main(int argc, char** argv) {
   MyJointModel m(lp0);
 #endif
 
+  MultinomialResampleFilter<Particle> filter(&rng);
   cerr << "Initializing reachability limits...\n";
   vector<Particle> ps(corpusf.size());
   vector<Reachability> reaches; reaches.reserve(corpusf.size());
@@ -500,7 +488,7 @@ int main(int argc, char** argv) {
 
         // all particles have now been extended a bit, we will reweight them now
         if (lps[0].trg_cov > 0)
-          FilterCrapParticlesAndReweight(&lps);
+          filter(&lps);
 
         // loop over all particles and extend them
         bool done_nothing = true;
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index 33dc08c3..728ec00d 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -6,6 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "pf.h"
 #include "base_measures.h"
 #include "monotonic_pseg.h"
 #include "reachability.h"
@@ -135,20 +136,6 @@ ostream& operator<<(ostream& o, const Particle& p) {
   return o;
 }
 
-void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
-  vector<Particle>& ps = *pps;
-  SampleSet<prob_t> ss;
-  for (int i = 0; i < ps.size(); ++i)
-    ss.add(ps[i].weight);
-  vector<Particle> nps; nps.reserve(ps.size());
-  const prob_t uniform_weight(1.0 / ps.size());
-  for (int i = 0; i < ps.size(); ++i) {
-    nps.push_back(ps[prng->SelectSample(ss)]);
-    nps[i].weight = uniform_weight;
-  }
-  nps.swap(ps);
-}
-
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
@@ -181,7 +168,17 @@ int main(int argc, char** argv) {
   Model1 invm1(conf["inverse_model1"].as<string>());
 
   PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
-  MonotonicParallelSegementationModel m(lp0);
+  PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+  MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
+  TRule xx("[X] ||| ms. kimura ||| MS. KIMURA ||| X=0");
+  cerr << xx << endl << lp0(xx) << " " << alp0(xx) << endl;
+  TRule xx12("[X] ||| . ||| PHARMACY . ||| X=0");
+  TRule xx21("[X] ||| pharmacy . ||| . ||| X=0");
+//  TRule xx22("[X] ||| . ||| . ||| X=0");
+  TRule xx22("[X] ||| . ||| THE . ||| X=0");
+  cerr << xx12 << "\t" << lp0(xx12) << " " << alp0(xx12) << endl;
+  cerr << xx21 << "\t" << lp0(xx21) << " " << alp0(xx21) << endl;
+  cerr << xx22 << "\t" << lp0(xx22) << " " << alp0(xx22) << endl;
 
   cerr << "Initializing reachability limits...\n";
   vector<Particle> ps(corpusf.size());
@@ -194,6 +191,8 @@ int main(int argc, char** argv) {
   cerr << "Sampling...\n"; 
   vector<Particle> tmp_p(10000);  // work space
   SampleSet<prob_t> pfss;
+  SystematicResampleFilter<Particle> filter(&rng);
+  // MultinomialResampleFilter<Particle> filter(&rng);
   for (int SS=0; SS < samples; ++SS) {
     for (int ci = 0; ci < corpusf.size(); ++ci) {
       vector<int>& src = corpusf[ci];
@@ -213,7 +212,7 @@ int main(int argc, char** argv) {
 
         // all particles have now been extended a bit, we will reweight them now
         if (lps[0].trg_cov > 0)
-          FilterCrapParticlesAndReweight(&lps);
+          filter(&lps);
 
         // loop over all particles and extend them
         bool done_nothing = true;
@@ -263,6 +262,11 @@ int main(int argc, char** argv) {
           }
         } // loop over particles (pi = 0 .. particles)
         if (done_nothing) all_complete = true;
+        prob_t wv = prob_t::Zero();
+        for (int pp = 0; pp < lps.size(); ++pp)
+          wv += lps[pp].weight;
+        for (int pp = 0; pp < lps.size(); ++pp)
+          lps[pp].weight /= wv;
       }
       pfss.clear();
       for (int i = 0; i < lps.size(); ++i)
author	Patrick Simianer <p@simianer.de>	2011-11-03 00:38:49 +0100
committer	Patrick Simianer <p@simianer.de>	2011-11-03 00:38:49 +0100
commit	bcfb20c1e1e9434ae04a30240859db6a353457d0 (patch)
tree	4611721111f4fef5bd132ed3236b820c4e16fa9c /gi
parent	4df1aea9e6edce1c79059d3310d55947be0a302e (diff)
parent	6de8f58cd13813bf33af4903bf386439683c0fd6 (diff)