rst sampler

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-04-16 18:20:33 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-04-16 18:20:33 -0400
commit: caf1a688db7b446642581c3f69a9aea720735a8f (patch)
tree: 4d550b6d6be2b58d26d3d703d5b09f74d3101484
parent: b0d1a4f0c9718412691f279b9092f18073cf5094 (diff)
8 files changed, 260 insertions, 72 deletions
diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am
index 2b64b43a..6e884f53 100644
--- a/rst_parser/Makefile.am
+++ b/rst_parser/Makefile.am
@@ -1,5 +1,5 @@
 bin_PROGRAMS = \
-  mst_train
+  mst_train rst_parse
 
 noinst_PROGRAMS = \
   rst_test
@@ -8,11 +8,14 @@ TESTS = rst_test
 
 noinst_LIBRARIES = librst.a
 
-librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc
+librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc
 
 mst_train_SOURCES = mst_train.cc
 mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz
 
+rst_parse_SOURCES = rst_parse.cc
+rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 rst_test_SOURCES = rst_test.cc
 rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc
new file mode 100644
index 00000000..de431ebc
--- /dev/null
+++ b/rst_parser/dep_training.cc
@@ -0,0 +1,56 @@
+#include "dep_training.h"
+
+#include <vector>
+#include <iostream>
+
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "picojson.h"
+
+using namespace std;
+
+void TrainingInstance::ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank, int size) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  string err;
+  int lc = 0;
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if ((lc-1) % size != rank) continue;
+    if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
+    if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+    size_t pos = line.rfind('\t');
+    assert(pos != string::npos);
+    picojson::value obj;
+    picojson::parse(obj, line.begin() + pos, line.end(), &err);
+    if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
+    corpus->push_back(TrainingInstance());
+    TrainingInstance& cur = corpus->back();
+    TaggedSentence& ts = cur.ts;
+    EdgeSubset& tree = cur.tree;
+    assert(obj.is<picojson::object>());
+    const picojson::object& d = obj.get<picojson::object>();
+    const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
+    for (unsigned i = 0; i < ta.size(); ++i) {
+      ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
+      ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
+    }
+    const picojson::array& da = d.find("deps")->second.get<picojson::array>();
+    for (unsigned i = 0; i < da.size(); ++i) {
+      const picojson::array& thm = da[i].get<picojson::array>();
+      // get dep type here
+      short h = thm[2].get<double>();
+      short m = thm[1].get<double>();
+      if (h < 0)
+        tree.roots.push_back(m);
+      else
+        tree.h_m_pairs.push_back(make_pair(h,m));
+    }
+    //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
+  }
+  if (flag) cerr << "\nRead " << lc << " training instances\n";
+}
+
diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h
new file mode 100644
index 00000000..73ffd298
--- /dev/null
+++ b/rst_parser/dep_training.h
@@ -0,0 +1,17 @@
+#ifndef _DEP_TRAINING_H_
+#define _DEP_TRAINING_H_
+
+#include <string>
+#include <vector>
+#include "arc_factored.h"
+#include "weights.h"
+
+struct TrainingInstance {
+  TaggedSentence ts;
+  EdgeSubset tree;
+  SparseVector<weight_t> features;
+  // reads a "Jsent" formatted dependency file
+  static void ReadTraining(const std::string& fname, std::vector<TrainingInstance>* corpus, int rank = 0, int size = 1);
+};
+
+#endif
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
index c5cab6ec..f0403d7e 100644
--- a/rst_parser/mst_train.cc
+++ b/rst_parser/mst_train.cc
@@ -10,10 +10,9 @@
 #include "stringlib.h"
 #include "filelib.h"
 #include "tdict.h"
-#include "picojson.h"
+#include "dep_training.h"
 #include "optimize.h"
 #include "weights.h"
-#include "rst.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -47,56 +46,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-struct TrainingInstance {
-  TaggedSentence ts;
-  EdgeSubset tree;
-  SparseVector<weight_t> features;
-};
-
-void ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank = 0, int size = 1) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  string err;
-  int lc = 0;
-  bool flag = false;
-  while(getline(in, line)) {
-    ++lc;
-    if ((lc-1) % size != rank) continue;
-    if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
-    if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
-    size_t pos = line.rfind('\t');
-    assert(pos != string::npos);
-    picojson::value obj;
-    picojson::parse(obj, line.begin() + pos, line.end(), &err);
-    if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
-    corpus->push_back(TrainingInstance());
-    TrainingInstance& cur = corpus->back();
-    TaggedSentence& ts = cur.ts;
-    EdgeSubset& tree = cur.tree;
-    assert(obj.is<picojson::object>());
-    const picojson::object& d = obj.get<picojson::object>();
-    const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
-    for (unsigned i = 0; i < ta.size(); ++i) {
-      ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
-      ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
-    }
-    const picojson::array& da = d.find("deps")->second.get<picojson::array>();
-    for (unsigned i = 0; i < da.size(); ++i) {
-      const picojson::array& thm = da[i].get<picojson::array>();
-      // get dep type here
-      short h = thm[2].get<double>();
-      short m = thm[1].get<double>();
-      if (h < 0)
-        tree.roots.push_back(m);
-      else
-        tree.h_m_pairs.push_back(make_pair(h,m));
-    }
-    //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
-  }
-  if (flag) cerr << "\nRead " << lc << " training instances\n";
-}
-
 void AddFeatures(double prob, const SparseVector<double>& fmap, vector<double>* g) {
   for (SparseVector<double>::const_iterator it = fmap.begin(); it != fmap.end(); ++it)
     (*g)[it->first] += it->second * prob;
@@ -131,7 +80,7 @@ int main(int argc, char** argv) {
   vector<TrainingInstance> corpus;
   vector<boost::shared_ptr<ArcFeatureFunction> > ffs;
   ffs.push_back(boost::shared_ptr<ArcFeatureFunction>(new DistancePenalty("")));
-  ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
+  TrainingInstance::ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
   vector<ArcFactoredForest> forests(corpus.size());
   SparseVector<double> empirical;
   bool flag = false;
@@ -224,9 +173,6 @@ int main(int argc, char** argv) {
     }
     if (converged) { cerr << "CONVERGED\n"; break; }
   }
-  forests[0].Reweight(weights);
-  TreeSampler ts(forests[0]);
-  EdgeSubset tt; ts.SampleRandomSpanningTree(&tt);
   return 0;
 }
 
diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc
index c4ce898e..bc91330b 100644
--- a/rst_parser/rst.cc
+++ b/rst_parser/rst.cc
@@ -3,45 +3,77 @@
 using namespace std;
 
 // David B. Wilson. Generating Random Spanning Trees More Quickly than the Cover Time.
-
+// this is an awesome algorithm
 TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) {
-  // edges are directed from modifiers to heads, to the root
+  // edges are directed from modifiers to heads, and finally to the root
+  vector<double> p;
   for (int m = 1; m <= forest.size(); ++m) {
+#if USE_ALIAS_SAMPLER
+    p.clear();
+#else
     SampleSet<double>& ss = usucc[m];
-    for (int h = 0; h <= forest.size(); ++h)
-      ss.add(forest(h-1,m-1).edge_prob.as_float());
+#endif
+    double z = 0;
+    for (int h = 0; h <= forest.size(); ++h) {
+      double u = forest(h-1,m-1).edge_prob.as_float();
+      z += u;
+#if USE_ALIAS_SAMPLER
+      p.push_back(u);
+#else
+      ss.add(u);
+#endif
+    }
+#if USE_ALIAS_SAMPLER
+    for (int i = 0; i < p.size(); ++i) { p[i] /= z; }
+    usucc[m].Init(p);
+#endif
   }
 }
 
-void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) {
-  MT19937 rng;
+void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree, MT19937* prng) {
+  MT19937& rng = *prng;
   const int r = 0;
   bool success = false;
   while (!success) {
     int roots = 0;
+    tree->h_m_pairs.clear();
+    tree->roots.clear();
     vector<int> next(forest.size() + 1, -1);
     vector<char> in_tree(forest.size() + 1, 0);
     in_tree[r] = 1;
-    for (int i = 0; i < forest.size(); ++i) {
+    //cerr << "Forest size: " << forest.size() << endl;
+    for (int i = 0; i <= forest.size(); ++i) {
+      //cerr << "Sampling starting at u=" << i << endl;
       int u = i;
       if (in_tree[u]) continue;
       while(!in_tree[u]) {
+#if USE_ALIAS_SAMPLER
+        next[u] = usucc[u].Draw(rng);
+#else
         next[u] = rng.SelectSample(usucc[u]);
+#endif
         u = next[u];
       }
       u = i;
-      cerr << (u-1);
+      //cerr << (u-1);
+      int prev = u-1;
       while(!in_tree[u]) {
         in_tree[u] = true;
         u = next[u];
-        cerr << " > " << (u-1);
-        if (u == r) { ++roots; }
+        //cerr << " > " << (u-1);
+        if (u == r) {
+          ++roots;
+          tree->roots.push_back(prev);
+        } else {
+          tree->h_m_pairs.push_back(make_pair<short,short>(u-1,prev));
+        }
+        prev = u-1;
       }
-      cerr << endl;
+      //cerr << endl;
     }
     assert(roots > 0);
     if (roots > 1) {
-      cerr << "FAILURE\n";
+      //cerr << "FAILURE\n";
     } else {
       success = true;
     }
diff --git a/rst_parser/rst.h b/rst_parser/rst.h
index a269ff9b..8bf389f7 100644
--- a/rst_parser/rst.h
+++ b/rst_parser/rst.h
@@ -4,12 +4,18 @@
 #include <vector>
 #include "sampler.h"
 #include "arc_factored.h"
+#include "alias_sampler.h"
 
 struct TreeSampler {
   explicit TreeSampler(const ArcFactoredForest& af);
-  void SampleRandomSpanningTree(EdgeSubset* tree);
+  void SampleRandomSpanningTree(EdgeSubset* tree, MT19937* rng);
   const ArcFactoredForest& forest;
+#define USE_ALIAS_SAMPLER 1
+#if USE_ALIAS_SAMPLER
+  std::vector<AliasSampler> usucc;
+#else
   std::vector<SampleSet<double> > usucc;
+#endif
 };
 
 #endif
diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc
new file mode 100644
index 00000000..9cc1359a
--- /dev/null
+++ b/rst_parser/rst_parse.cc
@@ -0,0 +1,126 @@
+#include "arc_factored.h"
+
+#include <vector>
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "timing_stats.h"
+#include "arc_ff.h"
+#include "arc_ff_factory.h"
+#include "dep_training.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "weights.h"
+#include "rst.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  string cfg_file;
+  opts.add_options()
+        ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)")
+        ("feature_function,F",po::value<vector<string> >()->composing(), "feature function (multiple permitted)")
+        ("q_weights,q",po::value<string>(), "Arc-factored weights for proposal distribution")
+        ("samples,n",po::value<unsigned>()->default_value(1000), "Number of samples");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config,c", po::value<string>(&cfg_file), "Configuration file")
+        ("help,?", "Print this help message and exit");
+
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(dconfig_options).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (cfg_file.size() > 0) {
+    ReadFile rf(cfg_file);
+    po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf);
+  }
+  if (conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  ArcFactoredForest af(5);
+  ArcFFRegistry reg;
+  reg.Register("DistancePenalty", new ArcFFFactory<DistancePenalty>);
+  vector<TrainingInstance> corpus;
+  vector<boost::shared_ptr<ArcFeatureFunction> > ffs;
+  ffs.push_back(boost::shared_ptr<ArcFeatureFunction>(new DistancePenalty("")));
+  TrainingInstance::ReadTraining(conf["training_data"].as<string>(), &corpus);
+  vector<ArcFactoredForest> forests(corpus.size());
+  SparseVector<double> empirical;
+  bool flag = false;
+  for (int i = 0; i < corpus.size(); ++i) {
+    TrainingInstance& cur = corpus[i];
+    if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; }
+    if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; }
+    for (int fi = 0; fi < ffs.size(); ++fi) {
+      ArcFeatureFunction& ff = *ffs[fi];
+      ff.PrepareForInput(cur.ts);
+      SparseVector<weight_t> efmap;
+      for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) {
+        efmap.clear();
+        ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first,
+                        cur.tree.h_m_pairs[j].second,
+                        &efmap);
+        cur.features += efmap;
+      }
+      for (int j = 0; j < cur.tree.roots.size(); ++j) {
+        efmap.clear();
+        ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap);
+        cur.features += efmap;
+      }
+    }
+    empirical += cur.features;
+    forests[i].resize(cur.ts.words.size());
+    forests[i].ExtractFeatures(cur.ts, ffs);
+  }
+  if (flag) cerr << endl;
+  vector<weight_t> weights(FD::NumFeats(), 0.0);
+  Weights::InitFromFile(conf["q_weights"].as<string>(), &weights);
+  MT19937 rng;
+  SparseVector<double> model_exp;
+  SparseVector<double> sampled_exp;
+  int samples = conf["samples"].as<unsigned>();
+  for (int i = 0; i < corpus.size(); ++i) {
+    const int num_words = corpus[i].ts.words.size();
+    forests[i].Reweight(weights);
+    forests[i].EdgeMarginals();
+    model_exp.clear();
+    for (int h = -1; h < num_words; ++h) {
+      for (int m = 0; m < num_words; ++m) {
+        if (h == m) continue;
+        const ArcFactoredForest::Edge& edge = forests[i](h,m);
+        const SparseVector<weight_t>& fmap = edge.features;
+        double prob = edge.edge_prob.as_float();
+        model_exp += fmap * prob;
+      }
+    }
+    //cerr << "TRUE EXP: " << model_exp << endl;
+
+    forests[i].Reweight(weights);
+    TreeSampler ts(forests[i]);
+    sampled_exp.clear();
+      //ostringstream os; os << "Samples_" << samples;
+      //Timer t(os.str());
+      for (int n = 0; n < samples; ++n) {
+        EdgeSubset tree;
+        ts.SampleRandomSpanningTree(&tree, &rng);
+        SparseVector<double> feats;
+        tree.ExtractFeatures(corpus[i].ts, ffs, &feats);
+        sampled_exp += feats;
+      }
+      sampled_exp /= samples;
+      cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl;
+  }
+  return 0;
+}
+
diff --git a/utils/weights.cc b/utils/weights.cc
index ac407dfb..39c18474 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -144,8 +144,10 @@ void Weights::ShowLargestFeatures(const vector<weight_t>& w) {
   vector<int> fnums(w.size());
   for (int i = 0; i < w.size(); ++i)
     fnums[i] = i;
+  int nf = FD::NumFeats();
+  if (nf > 10) nf = 10;
   vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
+  mid += nf;
   partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
   cerr << "TOP FEATURES:";
   for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-04-16 18:20:33 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-04-16 18:20:33 -0400
commit	caf1a688db7b446642581c3f69a9aea720735a8f (patch)
tree	4d550b6d6be2b58d26d3d703d5b09f74d3101484
parent	b0d1a4f0c9718412691f279b9092f18073cf5094 (diff)