check in some experimental particle filtering code, some gitignore fixes

author: Chris Dyer <cdyer@cs.cmu.edu> 2011-10-11 12:06:32 +0100
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-10-11 12:06:32 +0100
commit: 0acc92a0eecf04a2c429f6f7685bfcaa68c7ec3a (patch)
tree: f85f238a7921541702112ae0835cf638c5c7abf2 /gi/pf/itg.cc
parent: b77d23a3032f42be3705e88ae1734bae779fb9a3 (diff)
1 files changed, 224 insertions, 0 deletions
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
new file mode 100644
index 00000000..2c2a86f9
--- /dev/null
+++ b/gi/pf/itg.cc
@@ -0,0 +1,224 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "viterbi.h"
+#include "hg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+ostream& operator<<(ostream& os, const vector<WordID>& p) {
+  os << '[';
+  for (int i = 0; i < p.size(); ++i)
+    os << (i==0 ? "" : " ") << TD::Convert(p[i]);
+  return os << ']';
+}
+
+size_t hash_value(const TRule& r) {
+  size_t h = boost::hash_value(r.e_);
+  boost::hash_combine(h, -r.lhs_);
+  boost::hash_combine(h, boost::hash_value(r.f_));
+  return h;
+}
+
+bool operator==(const TRule& a, const TRule& b) {
+  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
+double log_poisson(unsigned x, const double& lambda) {
+  assert(lambda > 0.0);
+  return log(lambda) * x - lgamma(x + 1) - lambda;
+}
+
+struct Model1 {
+  explicit Model1(const string& fname) :
+      kNULL(TD::Convert("<eps>")),
+      kZERO() {
+    LoadModel1(fname);
+  }
+
+  void LoadModel1(const string& fname) {
+    cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
+    ReadFile rf(fname);
+    istream& in = *rf.stream();
+    string line;
+    unsigned lc = 0;
+    while(getline(in, line)) {
+      ++lc;
+      int cur = 0;
+      int start = 0;
+      while(cur < line.size() && line[cur] != ' ') { ++cur; }
+      assert(cur != line.size());
+      line[cur] = 0;
+      const WordID src = TD::Convert(&line[0]);
+      ++cur;
+      start = cur;
+      while(cur < line.size() && line[cur] != ' ') { ++cur; }
+      assert(cur != line.size());
+      line[cur] = 0;
+      WordID trg = TD::Convert(&line[start]);
+      const double logprob = strtod(&line[cur + 1], NULL);
+      if (src >= ttable.size()) ttable.resize(src + 1);
+      ttable[src][trg].logeq(logprob);
+    }
+    cerr << "  read " << lc << " parameters.\n";
+  }
+
+  // returns prob 0 if src or trg is not found!
+  const prob_t& operator()(WordID src, WordID trg) const {
+    if (src == 0) src = kNULL;
+    if (src < ttable.size()) {
+      const map<WordID, prob_t>& cpd = ttable[src];
+      const map<WordID, prob_t>::const_iterator it = cpd.find(trg);
+      if (it != cpd.end())
+        return it->second;
+    }
+    return kZERO;
+  }
+
+  const WordID kNULL;
+  const prob_t kZERO;
+  vector<map<WordID, prob_t> > ttable;
+};
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases")
+        ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")
+        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
+        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+void ReadParallelCorpus(const string& filename,
+                vector<vector<WordID> >* f,
+                vector<vector<WordID> >* e,
+                set<WordID>* vocab_f,
+                set<WordID>* vocab_e) {
+  f->clear();
+  e->clear();
+  vocab_f->clear();
+  vocab_e->clear();
+  istream* in;
+  if (filename == "-")
+    in = &cin;
+  else
+    in = new ifstream(filename.c_str());
+  assert(*in);
+  string line;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  while(*in) {
+    getline(*in, line);
+    if (line.empty() && !*in) break;
+    e->push_back(vector<int>());
+    f->push_back(vector<int>());
+    vector<int>& le = e->back();
+    vector<int>& lf = f->back();
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    bool isf = true;
+    for (unsigned i = 0; i < tmp.size(); ++i) {
+      const int cur = tmp[i];
+      if (isf) {
+        if (kDIV == cur) { isf = false; } else {
+          lf.push_back(cur);
+          vocab_f->insert(cur);
+        }
+      } else {
+        assert(cur != kDIV);
+        le.push_back(cur);
+        vocab_e->insert(cur);
+      }
+    }
+    assert(isf == false);
+  }
+  if (in != &cin) delete in;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
+  const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
+  const unsigned particles = conf["particles"].as<unsigned>();
+  const unsigned samples = conf["samples"].as<unsigned>();
+
+  if (!conf.count("model1")) {
+    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
+    return 1;
+  }
+  shared_ptr<MT19937> prng;
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+
+  vector<vector<WordID> > corpuse, corpusf;
+  set<WordID> vocabe, vocabf;
+  cerr << "Reading corpus...\n";
+  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+  assert(corpusf.size() == corpuse.size());
+
+  const int kLHS = -TD::Convert("X");
+  Model1 m1(conf["model1"].as<string>());
+  Model1 invm1(conf["inverse_model1"].as<string>());
+  for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {
+    cerr << '.' << flush;
+    for (int ci = 0; ci < corpusf.size(); ++ci) {
+      const vector<WordID>& src = corpusf[ci];
+      const vector<WordID>& trg = corpuse[ci];
+      for (int i = 0; i < src.size(); ++i) {
+        for (int j = 0; j < trg.size(); ++j) {
+          const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE);
+          for (int k = 0; k < eff_max_src; ++k) {
+            const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE));
+            for (int l = 0; l < eff_max_trg; ++l) {
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
author	Chris Dyer <cdyer@cs.cmu.edu>	2011-10-11 12:06:32 +0100
committer	Chris Dyer <cdyer@cs.cmu.edu>	2011-10-11 12:06:32 +0100
commit	0acc92a0eecf04a2c429f6f7685bfcaa68c7ec3a (patch)
tree	f85f238a7921541702112ae0835cf638c5c7abf2 /gi/pf/itg.cc
parent	b77d23a3032f42be3705e88ae1734bae779fb9a3 (diff)