From 34b4752a1eefc002166e95782c2c52747bb08b3a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Mar 2012 15:37:54 -0400 Subject: make c++11 compatible --- decoder/decoder.cc | 31 +++++++++++++++---------------- decoder/earley_composer.cc | 4 +--- decoder/phrasetable_fst.cc | 3 +-- 3 files changed, 17 insertions(+), 21 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 69fbaf85..d4f8f06d 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -57,7 +57,6 @@ static const double kMINUS_EPSILON = -1e-6; // don't be too strict using namespace std; using namespace std::tr1; -using boost::shared_ptr; namespace po = boost::program_options; static bool verbose_feature_functions=true; @@ -101,7 +100,7 @@ inline string str(char const* name,po::variables_map const& conf) { // print just the --long_opt names suitable for bash compgen inline void print_options(std::ostream &out,po::options_description const& opts) { - typedef std::vector< shared_ptr > Ds; + typedef std::vector< boost::shared_ptr > Ds; Ds const& ds=opts.options(); out << '"'; for (unsigned i=0;i make_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { +inline boost::shared_ptr make_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { string ff, param; SplitCommandAndParam(ffp, &ff, ¶m); cerr << pre << "feature: " << ff; if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; else cerr << " (no config parameters)\n"; - shared_ptr pf = ff_registry.Create(ff, param); + boost::shared_ptr pf = ff_registry.Create(ff, param); if (!pf) exit(1); int nbyte=pf->NumBytesContext(); if (verbose_feature_functions) @@ -135,13 +134,13 @@ inline shared_ptr make_ff(string const& ffp,bool verbose_featur } #ifdef FSA_RESCORING -inline shared_ptr make_fsa_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { +inline boost::shared_ptr make_fsa_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { string ff, param; SplitCommandAndParam(ffp, &ff, ¶m); cerr << "FSA Feature: " << ff; if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; else cerr << " (no config parameters)\n"; - shared_ptr pf = fsa_ff_registry.Create(ff, param); + boost::shared_ptr pf = fsa_ff_registry.Create(ff, param); if (!pf) exit(1); if (verbose_feature_functions) cerr<<"State is "<state_bytes()<<" bytes for "< make_fsa_ff(string const& ffp,bool verbose // passes are carried over into subsequent passes (where they may have different weights). 
struct RescoringPass { RescoringPass() : fid_summary(), density_prune(), beam_prune() {} - shared_ptr models; - shared_ptr inter_conf; + boost::shared_ptr models; + boost::shared_ptr inter_conf; vector ffs; - shared_ptr > weight_vector; + boost::shared_ptr > weight_vector; int fid_summary; // 0 == no summary feature double density_prune; // 0 == don't density prune double beam_prune; // 0 == don't beam prune @@ -293,15 +292,15 @@ struct DecoderImpl { po::variables_map& conf; OracleBleu oracle; string formalism; - shared_ptr translator; - shared_ptr > init_weights; // weights used with initial parse - vector > pffs; + boost::shared_ptr translator; + boost::shared_ptr > init_weights; // weights used with initial parse + vector > pffs; #ifdef FSA_RESCORING CFGOptions cfg_options; - vector > fsa_ffs; + vector > fsa_ffs; vector fsa_names; #endif - shared_ptr > rng; + boost::shared_ptr > rng; int sample_max_trans; bool aligner_mode; bool graphviz; @@ -310,7 +309,7 @@ struct DecoderImpl { bool kbest; bool unique_kbest; bool get_oracle_forest; - shared_ptr extract_file; + boost::shared_ptr extract_file; int combine_size; int sent_id; SparseVector acc_vec; // accumulate gradient @@ -622,7 +621,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } // set up weight vectors since later phases may reuse weights from earlier phases - shared_ptr > prev_weights = init_weights; + boost::shared_ptr > prev_weights = init_weights; for (int pass = 0; pass < rescoring_passes.size(); ++pass) { RescoringPass& rp = rescoring_passes[pass]; if (!rp.weight_vector) { diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc index b7af801a..385baf8b 100644 --- a/decoder/earley_composer.cc +++ b/decoder/earley_composer.cc @@ -16,8 +16,6 @@ #include "tdict.h" #include "hg.h" -using boost::shared_ptr; -namespace po = boost::program_options; using namespace std; using namespace std::tr1; @@ -111,7 +109,7 @@ struct Edge { const Edge* const active_parent; // back pointer, NULL for PREDICT items const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items const TargetPhraseSet* const tps; // translations - shared_ptr > features; // features from CFG rule + boost::shared_ptr > features; // features from CFG rule bool IsPassive() const { // when a rule is completed, this value will be set diff --git a/decoder/phrasetable_fst.cc b/decoder/phrasetable_fst.cc index f421e941..b3bec86b 100644 --- a/decoder/phrasetable_fst.cc +++ b/decoder/phrasetable_fst.cc @@ -9,7 +9,6 @@ #include "filelib.h" #include "tdict.h" -using boost::shared_ptr; using namespace std; TargetPhraseSet::~TargetPhraseSet() {} @@ -46,7 +45,7 @@ class TextFSTNode : public FSTNode { void ClearPassThroughTranslations(); private: vector passthroughs; - shared_ptr data; + boost::shared_ptr data; map ptr; }; -- cgit v1.2.3 From b6eede632af4fa58a6f5325ee0d059c02a898b9f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 24 Mar 2012 23:04:46 -0400 Subject: rename aligner, add support for distinguishing translation / transliteration --- decoder/aligner.cc | 4 +- decoder/ff_wordalign.cc | 1 - mteval/aer_scorer.cc | 6 +-- utils/Makefile.am | 2 +- utils/alignment_io.cc | 97 ++++++++++++++++++++++++++++++++++++++++++++++ utils/alignment_io.h | 42 ++++++++++++++++++++ utils/alignment_pharaoh.cc | 77 ------------------------------------ utils/alignment_pharaoh.h | 14 ------- utils/atools.cc | 8 ++-- 9 files changed, 149 insertions(+), 102 deletions(-) create mode 100644 utils/alignment_io.cc create mode 100644 
utils/alignment_io.h delete mode 100644 utils/alignment_pharaoh.cc delete mode 100644 utils/alignment_pharaoh.h (limited to 'decoder') diff --git a/decoder/aligner.cc b/decoder/aligner.cc index 53e059fb..232e022a 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -11,7 +11,7 @@ #include "sentence_metadata.h" #include "inside_outside.h" #include "viterbi.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" using namespace std; @@ -300,7 +300,7 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, cerr << grid << endl; } (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; - AlignmentPharaoh::SerializePharaohFormat(grid, out); + AlignmentIO::SerializePharaohFormat(grid, out); } }; diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 9e7c618e..decdf9bc 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -15,7 +15,6 @@ #include "factored_lexicon_helper.h" #include "verbose.h" -#include "alignment_pharaoh.h" #include "stringlib.h" #include "sentence_metadata.h" #include "hg.h" diff --git a/mteval/aer_scorer.cc b/mteval/aer_scorer.cc index edd4390f..ae3192d4 100644 --- a/mteval/aer_scorer.cc +++ b/mteval/aer_scorer.cc @@ -5,7 +5,7 @@ #include #include "tdict.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" using namespace std; @@ -85,7 +85,7 @@ AERScorer::AERScorer(const vector >& refs, const string& src) : s cerr << "AERScorer can only take a single reference!\n"; abort(); } - ref_ = AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); + ref_ = AlignmentIO::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); } static inline bool Safe(const Array2D& a, int i, int j) { @@ -101,7 +101,7 @@ ScoreP AERScorer::ScoreCCandidate(const vector& shyp) const { ScoreP AERScorer::ScoreCandidate(const vector& shyp) const { boost::shared_ptr > hyp = - AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(shyp)); + AlignmentIO::ReadPharaohAlignmentGrid(TD::GetString(shyp)); int m = 0; int r = 0; diff --git a/utils/Makefile.am b/utils/Makefile.am index 3ea21835..2fc6ae21 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -23,7 +23,7 @@ atools_SOURCES = atools.cc noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ - alignment_pharaoh.cc \ + alignment_io.cc \ b64tools.cc \ corpus_tools.cc \ dict.cc \ diff --git a/utils/alignment_io.cc b/utils/alignment_io.cc new file mode 100644 index 00000000..1d923f7f --- /dev/null +++ b/utils/alignment_io.cc @@ -0,0 +1,97 @@ +#include "utils/alignment_io.h" + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr > AlignmentIO::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < 
al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignmentIO::SerializePharaohFormat(const Array2D& alignment, ostream* o) { + ostream& out = *o; + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) out << ' '; else need_space = true; + out << i << '-' << j; + } + out << endl; +} + +void AlignmentIO::SerializeTypedAlignment(const Array2D& alignment, ostream* o) { + ostream& out = *o; + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) { + const AlignmentType& aij = alignment(i,j); + if (aij != kNONE) { + if (need_space) out << ' '; else need_space = true; + if (aij == kTRANSLATION) {} + else if (aij == kTRANSLITERATION) { + out << 'T' << ':'; + } else { + cerr << "\nUnexpected alignment point type: " << static_cast(aij) << endl; + abort(); + } + out << i << '-' << j; + } + } + out << endl; +} + diff --git a/utils/alignment_io.h b/utils/alignment_io.h new file mode 100644 index 00000000..36bcecd7 --- /dev/null +++ b/utils/alignment_io.h @@ -0,0 +1,42 @@ +#ifndef _ALIGNMENT_IO_H_ +#define _ALIGNMENT_IO_H_ + +#include +#include +#include +#include "array2d.h" + +struct AlignmentIO { + enum AlignmentType { kNONE = 0, kTRANSLATION = 1, kTRANSLITERATION = 2 }; + + static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); + static void SerializeTypedAlignment(const Array2D& alignment, std::ostream* out); +}; + +inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { + os << ' '; + for (int j=0; j - -using namespace std; - -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - size_t pos = al.rfind(" ||| "); - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - if(al[i] != '-') { - cerr << "BAD ALIGNMENT: " << al << endl; - abort(); - } - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); - i = 0; - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignmentPharaoh::SerializePharaohFormat(const Array2D& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < 
alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h deleted file mode 100644 index d111c8bf..00000000 --- a/utils/alignment_pharaoh.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _PHARAOH_ALIGNMENT_H_ -#define _PHARAOH_ALIGNMENT_H_ - -#include -#include -#include -#include "array2d.h" - -struct AlignmentPharaoh { - static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); - static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); -}; - -#endif diff --git a/utils/atools.cc b/utils/atools.cc index ba56dd6c..bce7822e 100644 --- a/utils/atools.cc +++ b/utils/atools.cc @@ -8,7 +8,7 @@ #include #include "filelib.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" namespace po = boost::program_options; using namespace std; @@ -348,9 +348,9 @@ int main(int argc, char **argv) { } if (line1.empty() && !*in1) break; boost::shared_ptr > out(new Array2D); - boost::shared_ptr > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); + boost::shared_ptr > a1 = AlignmentIO::ReadPharaohAlignmentGrid(line1); if (in2) { - boost::shared_ptr > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); + boost::shared_ptr > a2 = AlignmentIO::ReadPharaohAlignmentGrid(line2); cmd.Apply(*a1, *a2, out.get()); } else { Array2D dummy; @@ -358,7 +358,7 @@ int main(int argc, char **argv) { } if (cmd.Result() == 1) { - AlignmentPharaoh::SerializePharaohFormat(*out, &cout); + AlignmentIO::SerializePharaohFormat(*out, &cout); } } if (cmd.Result() == 2) -- cgit v1.2.3 From bf4a7606151301dba49265e91c289f2caab2b7ec Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 2 Apr 2012 23:48:19 -0400 Subject: fix bug in lattices with OOVs --- decoder/grammar.cc | 24 +++++++++++--------- decoder/grammar.h | 2 -- rst_parser/Makefile.am | 16 +++++++++++++ rst_parser/arc_factored.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++ rst_parser/mst_train.cc | 11 +++++++++ rst_parser/rst.cc | 2 ++ rst_parser/rst.h | 7 ++++++ 7 files changed, 107 insertions(+), 13 deletions(-) create mode 100644 rst_parser/Makefile.am create mode 100644 rst_parser/arc_factored.h create mode 100644 rst_parser/mst_train.cc create mode 100644 rst_parser/rst.cc create mode 100644 rst_parser/rst.h (limited to 'decoder') diff --git a/decoder/grammar.cc b/decoder/grammar.cc index 9e4065a6..714390f0 100644 --- a/decoder/grammar.cc +++ b/decoder/grammar.cc @@ -3,12 +3,14 @@ #include #include #include +#include #include "rule_lexer.h" #include "filelib.h" #include "tdict.h" using namespace std; +using namespace std::tr1; const vector Grammar::NO_RULES; @@ -148,24 +150,24 @@ bool GlueGrammar::HasRuleForSpan(int i, int /* j */, int /* distance */) const { return (i == 0); } -PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const unsigned int ctf_level) : - has_rule_(input.size() + 1) { +PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const unsigned int ctf_level) { + unordered_set ss; for (int i = 0; i < input.size(); ++i) { const vector& alts = input[i]; for (int k = 0; k < alts.size(); ++k) { const int j = alts[k].dist2next + i; - has_rule_[i].insert(j); const string& src = TD::Convert(alts[k].label); - TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1")); - pt->a_.push_back(AlignmentPoint(0,0)); - AddRule(pt); - RefineRule(pt, ctf_level); + if 
(ss.count(alts[k].label) == 0) { + TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1")); + pt->a_.push_back(AlignmentPoint(0,0)); + AddRule(pt); + RefineRule(pt, ctf_level); + ss.insert(alts[k].label); + } } } } -bool PassThroughGrammar::HasRuleForSpan(int i, int j, int /* distance */) const { - const set& hr = has_rule_[i]; - if (i == j) { return !hr.empty(); } - return (hr.find(j) != hr.end()); +bool PassThroughGrammar::HasRuleForSpan(int, int, int distance) const { + return (distance < 2); } diff --git a/decoder/grammar.h b/decoder/grammar.h index f5d00817..e6a15a69 100644 --- a/decoder/grammar.h +++ b/decoder/grammar.h @@ -91,8 +91,6 @@ struct GlueGrammar : public TextGrammar { struct PassThroughGrammar : public TextGrammar { PassThroughGrammar(const Lattice& input, const std::string& cat, const unsigned int ctf_level=0); virtual bool HasRuleForSpan(int i, int j, int distance) const; - private: - std::vector > has_rule_; // index by [i][j] }; void RefineRule(TRulePtr pt, const unsigned int ctf_level); diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am new file mode 100644 index 00000000..fef1c1a2 --- /dev/null +++ b/rst_parser/Makefile.am @@ -0,0 +1,16 @@ +bin_PROGRAMS = \ + mst_train + +noinst_PROGRAMS = \ + rst_test + +TESTS = rst_test + +noinst_LIBRARIES = librst.a + +librst_a_SOURCES = rst.cc + +mst_train_SOURCES = mst_train.cc +mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h new file mode 100644 index 00000000..312d7d67 --- /dev/null +++ b/rst_parser/arc_factored.h @@ -0,0 +1,58 @@ +#ifndef _ARC_FACTORED_H_ +#define _ARC_FACTORED_H_ + +#include +#include +#include "array2d.h" +#include "sparse_vector.h" + +class ArcFactoredForest { + public: + explicit ArcFactoredForest(short num_words) : + num_words_(num_words), + root_edges_(num_words), + edges_(num_words, num_words) {} + + struct Edge { + Edge() : features(), edge_prob(prob_t::Zero()) {} + SparseVector features; + prob_t edge_prob; + }; + + template + void Reweight(const V& weights) { + for (int m = 0; m < num_words_; ++m) { + for (int h = 0; h < num_words_; ++h) { + if (h != m) { + Edge& e = edges_(h, m); + e.edge_prob.logeq(e.features.dot(weights)); + } + } + if (m) { + Edge& e = root_edges_[m]; + e.edge_prob.logeq(e.features.dot(weights)); + } + } + } + + const Edge& operator()(short h, short m) const { + assert(m > 0); + assert(m <= num_words_); + assert(h >= 0); + assert(h <= num_words_); + return h ? edges_(h - 1, m - 1) : root_edges[m - 1]; + } + Edge& operator()(short h, short m) { + assert(m > 0); + assert(m <= num_words_); + assert(h >= 0); + assert(h <= num_words_); + return h ? 
edges_(h - 1, m - 1) : root_edges[m - 1]; + } + private: + unsigned num_words_; + std::vector root_edges_; + Array2D edges_; +}; + +#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc new file mode 100644 index 00000000..1bceaff5 --- /dev/null +++ b/rst_parser/mst_train.cc @@ -0,0 +1,11 @@ +#include "arc_factored.h" + +#include + +using namespace std; + +int main(int argc, char** argv) { + ArcFactoredForest af(5); + return 0; +} + diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc new file mode 100644 index 00000000..0ab3e296 --- /dev/null +++ b/rst_parser/rst.cc @@ -0,0 +1,2 @@ +#include "rst.h" + diff --git a/rst_parser/rst.h b/rst_parser/rst.h new file mode 100644 index 00000000..30a1f8a4 --- /dev/null +++ b/rst_parser/rst.h @@ -0,0 +1,7 @@ +#ifndef _RST_H_ +#define _RST_H_ + +struct RandomSpanningTree { +}; + +#endif -- cgit v1.2.3 From 6001b81eba37985d2e7dea6e6ebb488b787789a6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 3 Apr 2012 02:08:33 -0400 Subject: bayes lattice scoring --- decoder/hg_io.cc | 20 +++ decoder/hg_io.h | 1 + gi/pf/Makefile.am | 5 +- gi/pf/bayes_lattice_score.cc | 309 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 gi/pf/bayes_lattice_score.cc (limited to 'decoder') diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 9f0f50fa..d416dbf6 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -401,6 +401,26 @@ string HypergraphIO::AsPLF(const Hypergraph& hg, bool include_global_parentheses return os.str(); } +string HypergraphIO::AsPLF(const Lattice& lat, bool include_global_parentheses) { + static bool first = true; + if (first) { InitEscapes(); first = false; } + if (lat.empty()) return "()"; + ostringstream os; + if (include_global_parentheses) os << '('; + static const string EPS="*EPS*"; + for (int i = 0; i < lat.size(); ++i) { + const vector arcs = lat[i]; + os << '('; + for (int j = 0; j < arcs.size(); ++j) { + os << "('" << Escape(TD::Convert(arcs[j].label)) << "'," + << arcs[j].cost << ',' << arcs[j].dist2next << "),"; + } + os << "),"; + } + if (include_global_parentheses) os << ')'; + return os.str(); +} + namespace PLF { const string chars = "'\\"; diff --git a/decoder/hg_io.h b/decoder/hg_io.h index 44817157..4e502a0c 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -30,6 +30,7 @@ struct HypergraphIO { static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0); // return PLF string representation (undefined behavior on non-lattices) static std::string AsPLF(const Hypergraph& hg, bool include_global_parentheses = true); + static std::string AsPLF(const Lattice& lat, bool include_global_parentheses = true); static void PLFtoLattice(const std::string& plf, Lattice* pl); static std::string Escape(const std::string& s); // PLF helper }; diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index d365016b..86f8e07b 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,9 +1,12 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test bayes_lattice_score noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc hpyp_tm.cc pyp_tm.cc +bayes_lattice_score_SOURCES = bayes_lattice_score.cc +bayes_lattice_score_LDADD = 
libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + pf_test_SOURCES = pf_test.cc pf_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/bayes_lattice_score.cc b/gi/pf/bayes_lattice_score.cc new file mode 100644 index 00000000..70cb8dc2 --- /dev/null +++ b/gi/pf/bayes_lattice_score.cc @@ -0,0 +1,309 @@ +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "hg_io.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +boost::shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(Lattice()); + Lattice& le = e->back(); + LatticeTools::ConvertTextOrPLF(line, & le); + for (unsigned i = 0; i < le.size(); ++i) + for (unsigned j = 0; j < le[i].size(); ++j) + vocab_e->insert(le[i][j].label); + toks += le.size(); + } + return toks; +} + +struct BaseModel { + explicit BaseModel(unsigned tc) : + unif(1.0 / tc), p(prob_t::One()) {} + prob_t prob(const TRule& r) const { + return unif; + } + void increment(const TRule& r, MT19937* rng) { + p *= prob(r); + } + void decrement(const TRule& r, MT19937* rng) { + p /= prob(r); + } + prob_t Likelihood() const { + return p; + } + const prob_t unif; + prob_t p; +}; + +struct UnigramModel { + explicit UnigramModel(unsigned tc) : base(tc), crp(1,1,1,1), glue(1,1,1,1) {} + BaseModel base; + CCRP crp; + CCRP glue; + + prob_t Prob(const TRule& r) const { + if (r.Arity() != 0) { + return glue.prob(r, prob_t(0.5)); + } + return crp.prob(r, base.prob(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + if (r.Arity() != 0) { + glue.increment(r, 0.5, rng); + return 0; + } else { + if (crp.increment(r, base.prob(r), rng)) { + base.increment(r, rng); + return 1; + } + return 0; + } + } + + int Decrement(const TRule& r, MT19937* rng) { + if 
(r.Arity() != 0) { + glue.decrement(r, rng); + return 0; + } else { + if (crp.decrement(r, rng)) { + base.decrement(r, rng); + return -1; + } + return 0; + } + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(crp.log_crp_prob() + glue.log_crp_prob()); + p *= base.Likelihood(); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + crp.resample_hyperparameters(rng); + glue.resample_hyperparameters(rng); + cerr << " d=" << crp.discount() << ", s=" << crp.strength() << "\t STOP d=" << glue.discount() << ", s=" << glue.strength() << endl; + } +}; + +UnigramModel* plm; + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { + vector node_probs; + Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } +// for (unsigned i = 0; i < sampled_deriv->size(); ++i) { +// cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; +// } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, UnigramModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, UnigramModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +prob_t TotalProb(const Hypergraph& hg) { + return Inside(hg); +} + +void IncrementLatticePath(const Hypergraph& hg, const vector& d, Lattice* pl) { + Lattice& lat = *pl; + for (int i = 0; i < d.size(); ++i) { + const Hypergraph::Edge& edge = hg.edges_[d[i]]; + if (edge.rule_->Arity() != 0) continue; + WordID sym = edge.rule_->e_[0]; + vector& las = lat[edge.i_]; + int dist = edge.j_ - edge.i_; + assert(dist > 0); + for (int j = 0; j < las.size(); ++j) { + if (las[j].dist2next == dist && + las[j].label == sym) { + las[j].cost += 1; + } + } + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + vector grammars(2); + grammars[0].reset(new GlueGrammar("S","X")); + const unsigned samples = conf["samples"].as(); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr 
<< "E-corpus size: " << corpuse.size() << " lattices\t (" << vocabe.size() << " word types)\n"; + UnigramModel lm(vocabe.size()); + vector hgs(corpuse.size()); + vector > derivs(corpuse.size()); + for (int i = 0; i < corpuse.size(); ++i) { + grammars[1].reset(new PassThroughGrammar(corpuse[i], "X")); + ExhaustiveBottomUpParser parser("S", grammars); + bool res = parser.Parse(corpuse[i], &hgs[i]); // exhaustive parse + assert(res); + } + + double csamples = 0; + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + bool record_sample = (SS > (samples * 1 / 3) && (SS % 5 == 3)); + if (record_sample) csamples++; + for (int ci = 0; ci < corpuse.size(); ++ci) { + Lattice& lat = corpuse[ci]; + Hypergraph& hg = hgs[ci]; + vector& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.Arity() != 0) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (record_sample) IncrementLatticePath(hg, derivs[ci], &lat); + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + cerr << lm.crp << endl; + cerr << lm.glue << endl; + for (int i = 0; i < corpuse.size(); ++i) { + for (int j = 0; j < corpuse[i].size(); ++j) + for (int k = 0; k < corpuse[i][j].size(); ++k) { + corpuse[i][j][k].cost /= csamples; + corpuse[i][j][k].cost += 1e-3; + corpuse[i][j][k].cost = log(corpuse[i][j][k].cost); + } + cout << HypergraphIO::AsPLF(corpuse[i]) << endl; + } + return 0; +} + -- cgit v1.2.3