From c294227a928672bf108eed81106063a194c872ca Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Thu, 12 Apr 2012 18:57:44 -0400
Subject: unigram pyp lm added

---
 utils/alias_sampler.h   |  47 ++++++++++++++
 utils/unigram_pyp_lm.cc | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 215 insertions(+)
 create mode 100644 utils/alias_sampler.h
 create mode 100644 utils/unigram_pyp_lm.cc

diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h
new file mode 100644
index 00000000..85da9944
--- /dev/null
+++ b/utils/alias_sampler.h
@@ -0,0 +1,47 @@
+#ifndef _ALIAS_SAMPLER_H_
+#define _ALIAS_SAMPLER_H_
+
+#include <vector>
+#include <limits>
+
+// R. A. Kronmal and A. V. Peterson, Jr. (1979) On the alias method for
+// generating random variables from a discrete distribution. In The American
+// Statistician, Vol. 33, No. 4. Pages 214--218.
+//
+// Intuition: a multinomial with N outcomes can be rewritten as a uniform
+// mixture of N Bernoulli distributions. The ith Bernoulli returns i with
+// probability F[i], otherwise it returns an "alias" value L[i]. The
+// constructor computes the F's and L's given an arbitrary multinomial p in
+// O(n) time and Draw returns samples in O(1) time.
+struct AliasSampler {
+  explicit AliasSampler(const std::vector<double>& p) :
+      cutoffs_(p.size()),
+      aliases_(p.size(), std::numeric_limits<unsigned>::max()) {
+    const unsigned N = p.size();
+    std::vector<unsigned> s,g;
+    for (unsigned i = 0; i < N; ++i) {
+      const double cutoff = cutoffs_[i] = N * p[i];
+      if (cutoff >= 1.0) g.push_back(i); else s.push_back(i);
+    }
+    while(!s.empty() && !g.empty()) {
+      const unsigned k = g.back();
+      const unsigned j = s.back();
+      aliases_[j] = k;
+      cutoffs_[k] -= 1.0 - cutoffs_[j];
+      s.pop_back();
+      if (cutoffs_[k] < 1.0) {
+        g.pop_back();
+        s.push_back(k);
+      }
+    }
+  }
+  template <typename Uniform01Generator>
+  unsigned Draw(Uniform01Generator& u01) const {
+    const unsigned n = u01() * cutoffs_.size();
+    if (u01() > cutoffs_[n]) return aliases_[n]; else return n;
+  }
+  std::vector<double> cutoffs_;    // F
+  std::vector<unsigned> aliases_;  // L
+};
+
+#endif
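A quick usage sketch of the sampler above: build it from a normalized multinomial and call Draw with any functor returning doubles in [0,1) (the Uniform01 wrapper below is illustrative and not part of this patch; in the repository, cdec's MT19937 is passed instead). Drawing many samples and tallying frequencies is a cheap sanity check that the F/L tables were built correctly.

    #include <cstdlib>
    #include <iostream>
    #include <vector>
    #include "alias_sampler.h"

    // Illustrative uniform [0,1) generator; any functor with a
    // double operator() in [0,1) can drive AliasSampler::Draw.
    struct Uniform01 {
      double operator()() const { return std::rand() / (RAND_MAX + 1.0); }
    };

    int main() {
      std::vector<double> p;  // must be normalized
      p.push_back(0.5); p.push_back(0.3); p.push_back(0.2);
      AliasSampler as(p);
      Uniform01 u01;
      std::vector<int> counts(p.size(), 0);
      for (int i = 0; i < 100000; ++i) ++counts[as.Draw(u01)];  // O(1) per draw
      for (unsigned k = 0; k < counts.size(); ++k)
        std::cout << "p_hat[" << k << "] = " << counts[k] / 100000.0 << std::endl;
      return 0;
    }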
diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc
new file mode 100644
index 00000000..510e8839
--- /dev/null
+++ b/utils/unigram_pyp_lm.cc
@@ -0,0 +1,168 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "corpus_tools.h"
+#include "m.h"
+#include "tdict.h"
+#include "sampler.h"
+#include "ccrp.h"
+
+// A not very memory-efficient implementation of a 1-gram LM based on PYPs
+// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
+// based on Pitman-Yor Processes. In Proc. ACL.
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+boost::shared_ptr<MT19937> prng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,n",po::value<unsigned>()->default_value(50),"Number of samples")
+        ("train,i",po::value<string>(),"Training data file")
+        ("test,T",po::value<string>(),"Test data file")
+        ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
+        ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this")
+        ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this")
+        ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("train") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// uniform base distribution (0-gram model)
+struct UniformWordModel {
+  explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {}
+  void increment() { ++draws; }
+  void decrement() { --draws; assert(draws >= 0); }
+  double prob(WordID) const { return p0; } // all words have equal prob
+  double log_likelihood() const { return draws * log(p0); }
+  const double p0;
+  int draws;
+};
+
+// represents a unigram LM
+struct UnigramLM {
+  UnigramLM(unsigned vs, double da, double db, double ss, double sr) :
+      uniform_vocab(vs),
+      crp(da, db, ss, sr, 0.8, 1.0) {}
+  void increment(WordID w, MT19937* rng) {
+    const double backoff = uniform_vocab.prob(w);
+    if (crp.increment(w, backoff, rng))
+      uniform_vocab.increment();
+  }
+  void decrement(WordID w, MT19937* rng) {
+    if (crp.decrement(w, rng))
+      uniform_vocab.decrement();
+  }
+  double prob(WordID w) const {
+    const double backoff = uniform_vocab.prob(w);
+    return crp.prob(w, backoff);
+  }
+
+  double log_likelihood() const {
+    double llh = uniform_vocab.log_likelihood();
+    llh += crp.log_crp_prob();
+    return llh;
+  }
+
+  void resample_hyperparameters(MT19937* rng) {
+    crp.resample_hyperparameters(rng);
+  }
+
+  double discount_a, discount_b, strength_s, strength_r;
+  double d, strength;
+  UniformWordModel uniform_vocab;
+  CCRP<WordID> crp;
+};
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+
+  InitCommandLine(argc, argv, &conf);
+  const unsigned samples = conf["samples"].as<unsigned>();
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  const WordID kEOS = TD::Convert("</s>");
+  cerr << "Reading corpus...\n";
+  CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe);
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+  vector<vector<WordID> > test;
+  if (conf.count("test"))
+    CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
+  else
+    test = corpuse;
+  UnigramLM lm(vocabe.size(),
+               conf["discount_prior_a"].as<double>(),
+               conf["discount_prior_b"].as<double>(),
+               conf["strength_prior_s"].as<double>(),
+               conf["strength_prior_r"].as<double>());
+  for (int SS=0; SS < samples; ++SS) {
+    for (int ci = 0; ci < corpuse.size(); ++ci) {
+      const vector<WordID>& s = corpuse[ci];
+      for (int i = 0; i <= s.size(); ++i) {
+        WordID w = (i < s.size() ? s[i] : kEOS);
+        if (SS > 0) lm.decrement(w, &rng);
+        lm.increment(w, &rng);
+      }
+      if (SS > 0) lm.decrement(kEOS, &rng);
+      lm.increment(kEOS, &rng);
+    }
+    cerr << "LLH=" << lm.log_likelihood() << endl;
+    //if (SS % 10 == 9) lm.resample_hyperparameters(&rng);
+  }
+  double llh = 0;
+  unsigned cnt = 0;
+  unsigned oovs = 0;
+  for (int ci = 0; ci < test.size(); ++ci) {
+    const vector<WordID>& s = test[ci];
+    for (int i = 0; i <= s.size(); ++i) {
+      WordID w = (i < s.size() ? s[i] : kEOS);
+      double lp = log(lm.prob(w)) / log(2);
+      if (i < s.size() && vocabe.count(w) == 0) {
+        cerr << "**OOV ";
+        ++oovs;
+        lp = 0;
+      }
+      cerr << "p(" << TD::Convert(w) << ") = " << lp << endl;
+      llh -= lp;
+      cnt++;
+    }
+  }
+  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl;
+  cerr << "        Count: " << cnt << endl;
+  cerr << "         OOVs: " << oovs << endl;
+  cerr << "Cross-entropy: " << (llh / cnt) << endl;
+  cerr << "   Perplexity: " << pow(2, llh / cnt) << endl;
+  return 0;
+}
--
cgit v1.2.3
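The quantity crp.prob(w, backoff) in the patch above is the standard Pitman-Yor posterior predictive probability (Teh, 2006). A self-contained restatement of that formula follows; the counts are assumptions about the restaurant's internal state (customers and tables), not the actual CCRP API:

    #include <cmath>

    // Standard Pitman-Yor predictive probability. Given c_w customers
    // seated for word w, t_w tables labeled w, n total customers, T total
    // tables, discount d, strength theta, and base probability p0:
    //
    //   p(w) = (c_w - d*t_w + (theta + d*T) * p0) / (theta + n)
    //
    // Illustrative restatement only, not cdec's implementation.
    double pyp_predictive(double c_w, double t_w, double n, double T,
                          double d, double theta, double p0) {
      return (c_w - d * t_w + (theta + d * T) * p0) / (theta + n);
    }

With no observations (all counts zero) this reduces to p0, the uniform base distribution above; as counts grow, the empirical distribution dominates while the discount d reserves mass for unseen events.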
From d5a2a9c3bf18c1e414f79a757c1662fe422e2f5c Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 16 Apr 2012 14:11:02 -0400
Subject: switch to log domain for matrix operations

---
 rst_parser/arc_factored.h            |  2 +-
 rst_parser/arc_factored_marginals.cc | 24 +++++++++++++-----------
 rst_parser/mst_train.cc              | 29 +++++++++++++++++++++--------
 rst_parser/rst_test.cc               | 16 +++++++++++++---
 utils/logval.h                       | 13 ++++++++++---
 5 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h
index d9a0bb24..4de38b66 100644
--- a/rst_parser/arc_factored.h
+++ b/rst_parser/arc_factored.h
@@ -56,7 +56,7 @@ class ArcFactoredForest {
 
   // Reweight edges so that edge_prob is the edge's marginals
   // optionally returns log partition
-  void EdgeMarginals(double* p_log_z = NULL);
+  void EdgeMarginals(prob_t* p_log_z = NULL);
 
   // This may not return a tree
   void PickBestParentForEachWord(EdgeSubset* st) const;

diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc
index 16360b0d..acb8102a 100644
--- a/rst_parser/arc_factored_marginals.cc
+++ b/rst_parser/arc_factored_marginals.cc
@@ -9,37 +9,39 @@ using namespace std;
 
 #if HAVE_EIGEN
 #include <Eigen/Dense>
-typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> ArcMatrix;
-typedef Eigen::Matrix<double, Eigen::Dynamic, 1> RootVector;
+typedef Eigen::Matrix<prob_t, Eigen::Dynamic, Eigen::Dynamic> ArcMatrix;
+typedef Eigen::Matrix<prob_t, Eigen::Dynamic, 1> RootVector;
 
-void ArcFactoredForest::EdgeMarginals(double *plog_z) {
+void ArcFactoredForest::EdgeMarginals(prob_t *plog_z) {
   ArcMatrix A(num_words_,num_words_);
   RootVector r(num_words_);
   for (int h = 0; h < num_words_; ++h) {
     for (int m = 0; m < num_words_; ++m) {
       if (h != m)
-        A(h,m) = edges_(h,m).edge_prob.as_float();
+        A(h,m) = edges_(h,m).edge_prob;
       else
-        A(h,m) = 0;
+        A(h,m) = prob_t::Zero();
     }
-    r(h) = root_edges_[h].edge_prob.as_float();
+    r(h) = root_edges_[h].edge_prob;
   }
 
   ArcMatrix L = -A;
   L.diagonal() = A.colwise().sum();
   L.row(0) = r;
   ArcMatrix Linv = L.inverse();
-  if (plog_z) *plog_z = log(Linv.determinant());
+  if (plog_z) *plog_z = Linv.determinant();
   RootVector rootMarginals = r.cwiseProduct(Linv.col(0));
+  static const prob_t ZERO(0);
+  static const prob_t ONE(1);
 //  ArcMatrix T = Linv;
   for (int h = 0; h < num_words_; ++h) {
     for (int m = 0; m < num_words_; ++m) {
-      const double marginal = (m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) -
-                              (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h);
-      edges_(h,m).edge_prob = prob_t(marginal);
+      const prob_t marginal = (m == 0 ? ZERO : ONE) * A(h,m) * Linv(m,m) -
+                              (h == 0 ? ZERO : ONE) * A(h,m) * Linv(m,h);
+      edges_(h,m).edge_prob = marginal;
 //      T(h,m) = marginal;
     }
-    root_edges_[h].edge_prob = prob_t(rootMarginals(h));
+    root_edges_[h].edge_prob = rootMarginals(h);
   }
 //  cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl;
 //  cerr << "M:\n" << T << endl;
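The computation above is the Matrix-Tree theorem for rooted directed spanning trees, as popularized for dependency parsing in 2007 (Koo et al.; Smith & Smith): build the graph Laplacian from arc weights, replace row 0 with the root-attachment weights, and read the partition function off det(L) and the edge marginals off L^{-1}. A plain-double Eigen sketch of the same recipe, with illustrative names and the h==0/m==0 boundary corrections elided:

    #include <iostream>
    #include <Eigen/Dense>

    // Matrix-Tree sketch: A(h,m) is the weight of arc h->m, r(m) the weight
    // of attaching m directly to the root. det(L) is the total weight Z of
    // all rooted spanning trees; marginals fall out of the inverse.
    int main() {
      const int n = 3;
      Eigen::MatrixXd A(n, n);
      A << 0.0, 0.2, 0.3,
           0.4, 0.0, 0.6,
           0.7, 0.8, 0.0;
      Eigen::VectorXd r(n);
      r << 0.5, 0.1, 0.2;
      Eigen::MatrixXd L = -A;
      for (int m = 0; m < n; ++m) L(m, m) = A.col(m).sum();  // in-degree sums
      L.row(0) = r.transpose();       // root weights replace row 0
      const double Z = L.determinant();
      const Eigen::MatrixXd Linv = L.inverse();
      // marginal probability of arc h->m (h,m > 0):
      //   A(h,m) * (Linv(m,m) - Linv(m,h))
      std::cout << "Z = " << Z << "\n";
      std::cout << "p(2->1) = " << A(2,1) * (Linv(1,1) - Linv(1,2)) << "\n";
      return 0;
    }

The motivation for this commit is numerical: doing the determinant and inverse in plain doubles underflows on long sentences, so the matrix entries become prob_t (log-domain) values instead.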
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
index b5114726..c5cab6ec 100644
--- a/rst_parser/mst_train.cc
+++ b/rst_parser/mst_train.cc
@@ -23,7 +23,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   string cfg_file;
   opts.add_options()
         ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)")
-        ("feature_function,F",po::value<vector<string> >()->composing(), "feature function")
+        ("feature_function,F",po::value<vector<string> >()->composing(), "feature function (multiple permitted)")
+        ("weights,w",po::value<string>(), "Optional starting weights")
+        ("output_every_i_iterations,I",po::value<unsigned>()->default_value(1), "Write weights every I iterations")
         ("regularization_strength,C",po::value<double>()->default_value(1.0), "Regularization strength")
         ("correction_buffers,m", po::value<int>()->default_value(10), "LBFGS correction buffers");
   po::options_description clo("Command line options");
@@ -161,9 +163,13 @@ int main(int argc, char** argv) {
   if (flag) cerr << endl;
   //cerr << "EMP: " << empirical << endl; //DE
   vector<weight_t> weights(FD::NumFeats(), 0.0);
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
   vector<weight_t> g(FD::NumFeats(), 0.0);
   cerr << "features initialized\noptimizing...\n";
   boost::shared_ptr<LBFGSOptimizer> o;
+  int every = corpus.size() / 20;
+  if (!every) ++every;
   o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as<int>()));
   int iterations = 1000;
   for (int iter = 0; iter < iterations; ++iter) {
@@ -174,11 +180,12 @@ int main(int argc, char** argv) {
     double obj = -empirical.dot(weights);
     // SparseVector<double> mfm; //DE
     for (int i = 0; i < corpus.size(); ++i) {
+      if ((i + 1) % every == 0) cerr << '.' << flush;
       const int num_words = corpus[i].ts.words.size();
       forests[i].Reweight(weights);
-      double lz;
-      forests[i].EdgeMarginals(&lz);
-      obj -= lz;
+      prob_t z;
+      forests[i].EdgeMarginals(&z);
+      obj -= log(z);
       //cerr << "  O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << "  OO= " << (-corpus[i].features.dot(weights) - lz) << endl;
       //cerr << " ZZ = " << zz << endl;
       for (int h = -1; h < num_words; ++h) {
@@ -202,14 +209,20 @@ int main(int argc, char** argv) {
       gnorm += g[i]*g[i];
     ostringstream ll;
     ll << "ITER=" << (iter+1) << "\tOBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm);
-    cerr << endl << ll.str() << endl;
+    cerr << ' ' << ll.str().substr(ll.str().find('\t')+1) << endl;
     obj += r;
     assert(obj >= 0);
     o->Optimize(obj, g, &weights);
     Weights::ShowLargestFeatures(weights);
-    string sl = ll.str();
-    Weights::WriteToFile(o->HasConverged() ? "weights.final.gz" : "weights.cur.gz", weights, true, &sl);
-    if (o->HasConverged()) { cerr << "CONVERGED\n"; break; }
+    const bool converged = o->HasConverged();
+    const char* ofname = converged ? "weights.final.gz" : "weights.cur.gz";
+    if (converged || ((iter+1) % conf["output_every_i_iterations"].as<unsigned>()) == 0) {
+      cerr << "writing..." << flush;
+      const string sl = ll.str();
+      Weights::WriteToFile(ofname, weights, true, &sl);
+      cerr << "done" << endl;
+    }
+    if (converged) { cerr << "CONVERGED\n"; break; }
   }
   forests[0].Reweight(weights);
   TreeSampler ts(forests[0]);

diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc
index 7e6fb2c1..3bb95759 100644
--- a/rst_parser/rst_test.cc
+++ b/rst_parser/rst_test.cc
@@ -2,6 +2,8 @@
 
 #include <iostream>
 
+#include <Eigen/Dense>
+
 using namespace std;
 
 int main(int argc, char** argv) {
@@ -28,11 +30,19 @@ int main(int argc, char** argv) {
   af(-1,2).edge_prob.logeq(9);
   EdgeSubset tree;
 //  af.MaximumEdgeSubset(&tree);
-  double lz;
-  af.EdgeMarginals(&lz);
-  cerr << "Z = " << lz << endl;
+  prob_t z;
+  af.EdgeMarginals(&z);
+  cerr << "Z = " << abs(z) << endl;
   af.PickBestParentForEachWord(&tree);
   cerr << tree << endl;
+  typedef Eigen::Matrix<prob_t, 2, 2> M3;
+  M3 A = M3::Zero();
+  A(0,0) = prob_t(1);
+  A(1,0) = prob_t(3);
+  A(0,1) = prob_t(2);
+  A(1,1) = prob_t(4);
+  prob_t det = A.determinant();
+  cerr << det.as_float() << endl;
   return 0;
 }

diff --git a/utils/logval.h b/utils/logval.h
index 8a59d0b1..ec1f6acd 100644
--- a/utils/logval.h
+++ b/utils/logval.h
@@ -30,8 +30,6 @@ class LogVal {
   LogVal(init_minus_1) : s_(true),v_(0) { }
   LogVal(init_1) : s_(),v_(0) { }
   LogVal(init_0) : s_(),v_(LOGVAL_LOG0) { }
-  explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
-  explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
   LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {}
   LogVal(double lnx,init_lnx) : s_(),v_(lnx) {}
   static Self exp(T lnx) { return Self(lnx,false); }
@@ -126,7 +124,7 @@ class LogVal {
   }
 
   Self operator-() const {
-    return Self(v_,-s_);
+    return Self(v_,!s_);
   }
   void negate() { s_ = !s_; }
 
@@ -193,6 +191,15 @@ T log(const LogVal<T>& o) {
   return o.v_;
 }
 
+template <class T>
+LogVal<T> abs(const LogVal<T>& o) {
+  if (o.s_) {
+    LogVal<T> res = o;
+    res.s_ = false;
+    return res;
+  } else { return o; }
+}
+
 template <class T>
 LogVal<T> pow(const LogVal<T>& b, const T& e) {
   return b.pow(e);
--
cgit v1.2.3
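Why LogVal needs a sign bit (and an abs, and the operator-() fix above): determinants mix subtraction into otherwise log-domain arithmetic, so intermediate values must be representable as (sign, log|x|) pairs. A toy restatement of signed log-space addition, not cdec's implementation:

    #include <cmath>
    #include <cstdio>

    // Toy signed log-domain number: x = (s ? -1 : +1) * exp(v).
    struct SLog {
      bool s;    // sign bit: true means negative
      double v;  // natural log of |x|
    };

    // Log-sum-exp with sign handling: when signs differ this is a
    // subtraction of magnitudes, and the larger magnitude wins the sign.
    SLog add(SLog a, SLog b) {
      if (a.v < b.v) { SLog t = a; a = b; b = t; }  // ensure |a| >= |b|
      SLog r;
      r.s = a.s;
      if (a.s == b.s)
        r.v = a.v + log1p(exp(b.v - a.v));   // |a| + |b|
      else
        r.v = a.v + log1p(-exp(b.v - a.v));  // |a| - |b|
      return r;
    }

    int main() {
      SLog a = { false, log(3.0) };  //  3
      SLog b = { true,  log(5.0) };  // -5
      SLog c = add(a, b);            // expect -2
      printf("%c%g\n", c.s ? '-' : '+', exp(c.v));
      return 0;
    }

The rst_test.cc change above exercises exactly this: the determinant of [[1,2],[3,4]] is -2, which a sign-less log representation cannot express.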
From 4b38556c88c739de82b9c298261a262ec620280e Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 16 Apr 2012 18:20:33 -0400
Subject: rst sampler

---
 rst_parser/Makefile.am     |   7 ++-
 rst_parser/dep_training.cc |  56 ++++++++++++++++++++
 rst_parser/dep_training.h  |  17 ++++++
 rst_parser/mst_train.cc    |  58 +--------------------
 rst_parser/rst.cc          |  56 +++++++++++++++-----
 rst_parser/rst.h           |   8 ++-
 rst_parser/rst_parse.cc    | 126 +++++++++++++++++++++++++++++++++++++++++++++
 utils/weights.cc           |   4 +-
 8 files changed, 260 insertions(+), 72 deletions(-)
 create mode 100644 rst_parser/dep_training.cc
 create mode 100644 rst_parser/dep_training.h
 create mode 100644 rst_parser/rst_parse.cc

diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am
index 2b64b43a..6e884f53 100644
--- a/rst_parser/Makefile.am
+++ b/rst_parser/Makefile.am
@@ -1,5 +1,5 @@
 bin_PROGRAMS = \
-  mst_train
+  mst_train rst_parse
 
 noinst_PROGRAMS = \
   rst_test
@@ -8,11 +8,14 @@ TESTS = rst_test
 
 noinst_LIBRARIES = librst.a
 
-librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc
+librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc
 
 mst_train_SOURCES = mst_train.cc
 mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz
 
+rst_parse_SOURCES = rst_parse.cc
+rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 rst_test_SOURCES = rst_test.cc
 rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz

diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc
new file mode 100644
index 00000000..de431ebc
--- /dev/null
+++ b/rst_parser/dep_training.cc
@@ -0,0 +1,56 @@
+#include "dep_training.h"
+
+#include <vector>
+#include <iostream>
+
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "picojson.h"
+
+using namespace std;
+
+void TrainingInstance::ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank, int size) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  string err;
+  int lc = 0;
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if ((lc-1) % size != rank) continue;
+    if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
+    if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+    size_t pos = line.rfind('\t');
+    assert(pos != string::npos);
+    picojson::value obj;
+    picojson::parse(obj, line.begin() + pos, line.end(), &err);
+    if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
+    corpus->push_back(TrainingInstance());
+    TrainingInstance& cur = corpus->back();
+    TaggedSentence& ts = cur.ts;
+    EdgeSubset& tree = cur.tree;
+    assert(obj.is<picojson::object>());
+    const picojson::object& d = obj.get<picojson::object>();
+    const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
+    for (unsigned i = 0; i < ta.size(); ++i) {
+      ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
+      ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
+    }
+    const picojson::array& da = d.find("deps")->second.get<picojson::array>();
+    for (unsigned i = 0; i < da.size(); ++i) {
+      const picojson::array& thm = da[i].get<picojson::array>();
+      // get dep type here
+      short h = thm[2].get<double>();
+      short m = thm[1].get<double>();
+      if (h < 0)
+        tree.roots.push_back(m);
+      else
+        tree.h_m_pairs.push_back(make_pair(h,m));
+    }
+    //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
+  }
+  if (flag) cerr << "\nRead " << lc << " training instances\n";
+}

diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h
new file mode 100644
index 00000000..73ffd298
--- /dev/null
+++ b/rst_parser/dep_training.h
@@ -0,0 +1,17 @@
+#ifndef _DEP_TRAINING_H_
+#define _DEP_TRAINING_H_
+
+#include <string>
+#include <vector>
+#include "arc_factored.h"
+#include "weights.h"
+
+struct TrainingInstance {
+  TaggedSentence ts;
+  EdgeSubset tree;
+  SparseVector<weight_t> features;
+  // reads a "Jsent" formatted dependency file
+  static void ReadTraining(const std::string& fname, std::vector<TrainingInstance>* corpus, int rank = 0, int size = 1);
+};
+
+#endif
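For orientation, each jsent line as consumed by ReadTraining above carries arbitrary text, a final tab, then a JSON object: "tokens" holds [word, POS] pairs, and each "deps" entry is read as [label, modifier, head], with head = -1 marking a root attachment (the label slot is skipped by the code, per its "get dep type here" comment). A hypothetical line consistent with that parsing, with <TAB> standing in for the tab character:

    the dog barks<TAB>{"tokens": [["the","DT"],["dog","NN"],["barks","VBZ"]], "deps": [["det",0,1],["nsubj",1,2],["root",2,-1]]}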
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
index c5cab6ec..f0403d7e 100644
--- a/rst_parser/mst_train.cc
+++ b/rst_parser/mst_train.cc
@@ -10,10 +10,9 @@
 #include "stringlib.h"
 #include "filelib.h"
 #include "tdict.h"
-#include "picojson.h"
+#include "dep_training.h"
 #include "optimize.h"
 #include "weights.h"
-#include "rst.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -47,56 +46,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-struct TrainingInstance {
-  TaggedSentence ts;
-  EdgeSubset tree;
-  SparseVector<weight_t> features;
-};
-
-void ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank = 0, int size = 1) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  string err;
-  int lc = 0;
-  bool flag = false;
-  while(getline(in, line)) {
-    ++lc;
-    if ((lc-1) % size != rank) continue;
-    if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
-    if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
-    size_t pos = line.rfind('\t');
-    assert(pos != string::npos);
-    picojson::value obj;
-    picojson::parse(obj, line.begin() + pos, line.end(), &err);
-    if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
-    corpus->push_back(TrainingInstance());
-    TrainingInstance& cur = corpus->back();
-    TaggedSentence& ts = cur.ts;
-    EdgeSubset& tree = cur.tree;
-    assert(obj.is<picojson::object>());
-    const picojson::object& d = obj.get<picojson::object>();
-    const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
-    for (unsigned i = 0; i < ta.size(); ++i) {
-      ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
-      ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
-    }
-    const picojson::array& da = d.find("deps")->second.get<picojson::array>();
-    for (unsigned i = 0; i < da.size(); ++i) {
-      const picojson::array& thm = da[i].get<picojson::array>();
-      // get dep type here
-      short h = thm[2].get<double>();
-      short m = thm[1].get<double>();
-      if (h < 0)
-        tree.roots.push_back(m);
-      else
-        tree.h_m_pairs.push_back(make_pair(h,m));
-    }
-    //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
-  }
-  if (flag) cerr << "\nRead " << lc << " training instances\n";
-}
-
 void AddFeatures(double prob, const SparseVector<double>& fmap, vector<double>* g) {
   for (SparseVector<double>::const_iterator it = fmap.begin(); it != fmap.end(); ++it)
     (*g)[it->first] += it->second * prob;
@@ -131,7 +80,7 @@ int main(int argc, char** argv) {
   vector<TrainingInstance> corpus;
   vector<boost::shared_ptr<ArcFeatureFunction> > ffs;
   ffs.push_back(boost::shared_ptr<ArcFeatureFunction>(new DistancePenalty("")));
-  ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
+  TrainingInstance::ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
   vector<ArcFactoredForest> forests(corpus.size());
   SparseVector<weight_t> empirical;
   bool flag = false;
@@ -224,9 +173,6 @@ int main(int argc, char** argv) {
     }
     if (converged) { cerr << "CONVERGED\n"; break; }
   }
-  forests[0].Reweight(weights);
-  TreeSampler ts(forests[0]);
-  EdgeSubset tt; ts.SampleRandomSpanningTree(&tt);
   return 0;
 }
diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc
index c4ce898e..bc91330b 100644
--- a/rst_parser/rst.cc
+++ b/rst_parser/rst.cc
@@ -3,45 +3,77 @@ using namespace std;
 
 // David B. Wilson. Generating Random Spanning Trees More Quickly than the
 // Cover Time.
-
+// this is an awesome algorithm
 TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) {
-  // edges are directed from modifiers to heads, to the root
+  // edges are directed from modifiers to heads, and finally to the root
+  vector<double> p;
   for (int m = 1; m <= forest.size(); ++m) {
+#if USE_ALIAS_SAMPLER
+    p.clear();
+#else
     SampleSet<double>& ss = usucc[m];
-    for (int h = 0; h <= forest.size(); ++h)
-      ss.add(forest(h-1,m-1).edge_prob.as_float());
+#endif
+    double z = 0;
+    for (int h = 0; h <= forest.size(); ++h) {
+      double u = forest(h-1,m-1).edge_prob.as_float();
+      z += u;
+#if USE_ALIAS_SAMPLER
+      p.push_back(u);
+#else
+      ss.add(u);
+#endif
+    }
+#if USE_ALIAS_SAMPLER
+    for (int i = 0; i < p.size(); ++i) { p[i] /= z; }
+    usucc[m].Init(p);
+#endif
   }
 }
 
-void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) {
-  MT19937 rng;
+void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree, MT19937* prng) {
+  MT19937& rng = *prng;
   const int r = 0;
   bool success = false;
   while (!success) {
     int roots = 0;
+    tree->h_m_pairs.clear();
+    tree->roots.clear();
     vector<int> next(forest.size() + 1, -1);
    vector<char> in_tree(forest.size() + 1, 0);
     in_tree[r] = 1;
-    for (int i = 0; i < forest.size(); ++i) {
+    //cerr << "Forest size: " << forest.size() << endl;
+    for (int i = 0; i <= forest.size(); ++i) {
+      //cerr << "Sampling starting at u=" << i << endl;
       int u = i;
       if (in_tree[u]) continue;
       while(!in_tree[u]) {
+#if USE_ALIAS_SAMPLER
+        next[u] = usucc[u].Draw(rng);
+#else
         next[u] = rng.SelectSample(usucc[u]);
+#endif
         u = next[u];
       }
       u = i;
-      cerr << (u-1);
+      //cerr << (u-1);
+      int prev = u-1;
       while(!in_tree[u]) {
         in_tree[u] = true;
         u = next[u];
-        cerr << " > " << (u-1);
-        if (u == r) { ++roots; }
+        //cerr << " > " << (u-1);
+        if (u == r) {
+          ++roots;
+          tree->roots.push_back(prev);
+        } else {
+          tree->h_m_pairs.push_back(make_pair(u-1,prev));
+        }
+        prev = u-1;
       }
-      cerr << endl;
+      //cerr << endl;
     }
     assert(roots > 0);
     if (roots > 1) {
-      cerr << "FAILURE\n";
+      //cerr << "FAILURE\n";
     } else {
       success = true;
     }

diff --git a/rst_parser/rst.h b/rst_parser/rst.h
index a269ff9b..8bf389f7 100644
--- a/rst_parser/rst.h
+++ b/rst_parser/rst.h
@@ -4,12 +4,18 @@
 #include <vector>
 #include "sampler.h"
 #include "arc_factored.h"
+#include "alias_sampler.h"
 
 struct TreeSampler {
   explicit TreeSampler(const ArcFactoredForest& af);
-  void SampleRandomSpanningTree(EdgeSubset* tree);
+  void SampleRandomSpanningTree(EdgeSubset* tree, MT19937* rng);
   const ArcFactoredForest& forest;
+#define USE_ALIAS_SAMPLER 1
+#if USE_ALIAS_SAMPLER
+  std::vector<AliasSampler> usucc;
+#else
   std::vector<SampleSet<double> > usucc;
+#endif
 };
 
 #endif
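Wilson's algorithm, for readers following the loops above: from each node not yet in the tree, take a random walk toward the tree, overwriting next[u] on every revisit (which implicitly erases any loop the walk formed), then freeze the walked path. A self-contained sketch on a small complete digraph with uniform successor choice; the parser version draws each successor from the per-modifier arc distribution (hence the AliasSampler) instead:

    #include <cstdlib>
    #include <iostream>
    #include <vector>

    // Uniform random successor distinct from u among nodes 0..n-1.
    int step(int u, int n) {
      int v = std::rand() % (n - 1);
      return v >= u ? v + 1 : v;
    }

    int main() {
      const int n = 6;  // nodes 0..5, node 0 is the root
      std::vector<int> next(n, -1);
      std::vector<bool> in_tree(n, false);
      in_tree[0] = true;
      for (int i = 0; i < n; ++i) {
        int u = i;
        while (!in_tree[u]) {  // random walk until the tree is hit;
          next[u] = step(u, n);  // overwriting next[u] erases loops
          u = next[u];
        }
        u = i;
        while (!in_tree[u]) {  // freeze the loop-erased path
          in_tree[u] = true;
          u = next[u];
        }
      }
      for (int v = 1; v < n; ++v)
        std::cout << "parent(" << v << ") = " << next[v] << "\n";
      return 0;
    }

The retry loop in the patch (success/roots) exists because the parser wants exactly one root attachment; Wilson's algorithm itself only guarantees a spanning tree of the walk graph.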
distribution") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + ArcFactoredForest af(5); + ArcFFRegistry reg; + reg.Register("DistancePenalty", new ArcFFFactory); + vector corpus; + vector > ffs; + ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + vector forests(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + for (int fi = 0; fi < ffs.size(); ++fi) { + ArcFeatureFunction& ff = *ffs[fi]; + ff.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + } + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + } + if (flag) cerr << endl; + vector weights(FD::NumFeats(), 0.0); + Weights::InitFromFile(conf["q_weights"].as(), &weights); + MT19937 rng; + SparseVector model_exp; + SparseVector sampled_exp; + int samples = conf["samples"].as(); + for (int i = 0; i < corpus.size(); ++i) { + const int num_words = corpus[i].ts.words.size(); + forests[i].Reweight(weights); + forests[i].EdgeMarginals(); + model_exp.clear(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + model_exp += fmap * prob; + } + } + //cerr << "TRUE EXP: " << model_exp << endl; + + forests[i].Reweight(weights); + TreeSampler ts(forests[i]); + sampled_exp.clear(); + //ostringstream os; os << "Samples_" << samples; + //Timer t(os.str()); + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector feats; + tree.ExtractFeatures(corpus[i].ts, ffs, &feats); + sampled_exp += feats; + } + sampled_exp /= samples; + cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl; + } + return 0; +} + diff --git a/utils/weights.cc b/utils/weights.cc index ac407dfb..39c18474 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -144,8 +144,10 @@ void Weights::ShowLargestFeatures(const vector& w) { vector fnums(w.size()); for (int i = 0; i < w.size(); ++i) fnums[i] = i; + int nf = FD::NumFeats(); + if (nf > 10) nf = 10; 
From 8aff3bd109b82b57c32a0b14a019c99c1ec35705 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 16 Apr 2012 18:37:04 -0400
Subject: improved alias sampler

---
 utils/alias_sampler.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h
index 85da9944..81541f7a 100644
--- a/utils/alias_sampler.h
+++ b/utils/alias_sampler.h
@@ -14,10 +14,13 @@
 // constructor computes the F's and L's given an arbitrary multinomial p in
 // O(n) time and Draw returns samples in O(1) time.
 struct AliasSampler {
-  explicit AliasSampler(const std::vector<double>& p) :
-      cutoffs_(p.size()),
-      aliases_(p.size(), std::numeric_limits<unsigned>::max()) {
+  AliasSampler() {}
+  explicit AliasSampler(const std::vector<double>& p) { Init(p); }
+  void Init(const std::vector<double>& p) {
     const unsigned N = p.size();
+    cutoffs_.resize(p.size());
+    aliases_.clear();
+    aliases_.resize(p.size(), std::numeric_limits<unsigned>::max());
     std::vector<unsigned> s,g;
     for (unsigned i = 0; i < N; ++i) {
       const double cutoff = cutoffs_[i] = N * p[i];
--
cgit v1.2.3
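The point of the default-constructor/Init split in this last commit is visible in rst.cc above: TreeSampler holds a std::vector of samplers sized up front (usucc(af.size() + 1)) and fills each element in place with usucc[m].Init(p), which requires the elements to be default-constructible. A minimal sketch of that pattern, with toy distributions:

    #include <vector>
    #include "alias_sampler.h"

    int main() {
      // Size the container first, then initialize each sampler in place,
      // mirroring TreeSampler::TreeSampler. Init clears and resizes its
      // tables, so re-initializing an element is also safe.
      std::vector<AliasSampler> samplers(4);   // needs the default ctor
      std::vector<double> p(3, 1.0 / 3.0);     // toy normalized multinomial
      for (unsigned m = 0; m < samplers.size(); ++m)
        samplers[m].Init(p);
      return 0;
    }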