Diffstat (limited to 'extools')
-rw-r--r-- | extools/Makefile.am | 31
-rw-r--r-- | extools/README | 32
-rw-r--r-- | extools/build_lexical_translation.cc | 104
-rw-r--r-- | extools/extract.cc | 341
-rw-r--r-- | extools/extract.h | 110
-rw-r--r-- | extools/extractor.cc | 363
-rw-r--r-- | extools/filter_grammar.cc | 202
-rw-r--r-- | extools/lex_trans_tbl.h | 24
-rwxr-xr-x | extools/merge_lines.pl | 43
-rw-r--r-- | extools/mr_stripe_rule_reduce.cc | 242
-rw-r--r-- | extools/score_grammar.cc | 346
-rw-r--r-- | extools/sentence_pair.cc | 155
-rw-r--r-- | extools/sentence_pair.h | 39
-rwxr-xr-x | extools/simple-extract-context.sh | 9
-rwxr-xr-x | extools/simple-extract.sh | 11
-rw-r--r-- | extools/suffix_tree.h | 38
-rw-r--r-- | extools/test_data/README | 10
-rw-r--r-- | extools/test_data/corpus.aligned | 5
-rw-r--r-- | extools/test_data/corpus.en | 5
-rw-r--r-- | extools/test_data/corpus.fr | 5
-rw-r--r-- | extools/test_data/corpus.len_cats | 5
-rw-r--r-- | extools/test_data/fr-en.al.len | 5
-rwxr-xr-x | extools/test_data/make_len_cats.pl | 23
23 files changed, 2148 insertions, 0 deletions
diff --git a/extools/Makefile.am b/extools/Makefile.am
new file mode 100644
index 00000000..5baefa21
--- /dev/null
+++ b/extools/Makefile.am
@@ -0,0 +1,31 @@
+bin_PROGRAMS = \
+  extractor \
+  mr_stripe_rule_reduce \
+  build_lexical_translation \
+  score_grammar \
+  filter_grammar
+
+noinst_PROGRAMS =
+
+build_lexical_translation_SOURCES = build_lexical_translation.cc extract.cc sentence_pair.cc
+build_lexical_translation_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+build_lexical_translation_LDFLAGS = -all-static
+
+mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc
+mr_stripe_rule_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_stripe_rule_reduce_LDFLAGS = -all-static
+
+extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc
+extractor_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+extractor_LDFLAGS = -all-static
+
+filter_grammar_SOURCES = sentence_pair.cc extract.cc filter_grammar.cc
+filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+filter_grammar_LDFLAGS = -all-static
+
+score_grammar_SOURCES = sentence_pair.cc score_grammar.cc extract.cc
+score_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+score_grammar_LDFLAGS = -all-static
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+
diff --git a/extools/README b/extools/README
new file mode 100644
index 00000000..af91ce79
--- /dev/null
+++ b/extools/README
@@ -0,0 +1,32 @@
+
+Categories have the format i-j:CAT where i and j are the indices of the spaces
+between words in the TARGET language. For example, slash categories can be written:
+
+  the  blue  house
+  0-1:DT 1-2:JJ 2-3:NN 1-3:NBAR 0-2:NP/NN 0-3:NP
+
+
+You may multiply label each span, e.g.
+
+    NP
+    |
+   NBAR
+    |
+    NN
+    |
+   John
+  0-1:NP 0-1:NBAR 0-1:NN
+
+However, this may result in a very large number of rules being extracted.
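+
+Span labels like these are supplied to the extractor as the fourth field of
+its input, one sentence pair per line. A minimal example of the full input
+format (as documented in sentence_pair.cc) looks like:
+
+  ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP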
+
+
+****
+* Filtering and Scoring of Unscored and Unfiltered Grammars
+****
+
+Take the unfiltered grammar and a test set, and run:
+./filter_grammar <test set> < unfiltered.grammar > filter.grammar
+
+Then, to score the new filtered grammar, run:
+./score_grammar <alignment> < filtered.grammar > scored.grammar
+
diff --git a/extools/build_lexical_translation.cc b/extools/build_lexical_translation.cc
new file mode 100644
index 00000000..f609f56a
--- /dev/null
+++ b/extools/build_lexical_translation.cc
@@ -0,0 +1,104 @@
+/*
+ * Build lexical translation table from alignment file to use for lexical translation probabilities when scoring a grammar
+ *
+ * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn
+ */
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <fstream>
+#include <tr1/unordered_map>
+
+#include "sentence_pair.h"
+#include "extract.h"
+#include "fdict.h"
+#include "tdict.h"
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+using namespace std;
+using namespace std::tr1;
+
+static const size_t MAX_LINE_LENGTH = 64000000;
+
+int main(int argc, char* argv[]){
+
+  bool DEBUG = false;
+
+  map <WordID, map<WordID, int> > word_translation;
+  map <WordID, int> total_foreign;
+  map <WordID, int> total_english;
+
+  AnnotatedParallelSentence sent;
+  char* buf = new char[MAX_LINE_LENGTH];
+  while(cin)
+    {
+      cin.getline(buf, MAX_LINE_LENGTH);
+      if (buf[0] == 0) continue;
+
+      sent.ParseInputLine(buf);
+
+      map <WordID, int> foreign_aligned;
+      map <WordID, int> english_aligned;
+
+      //iterate over the alignment to compute aligned words
+
+      for(int i =0;i<sent.aligned.width();i++)
+        {
+          for (int j=0;j<sent.aligned.height();j++)
+            {
+              if (DEBUG) cout << sent.aligned(i,j) << " ";
+              if( sent.aligned(i,j))
+                {
+                  if (DEBUG) cout << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
+                  //local counts
+                  ++foreign_aligned[sent.f[i]];
+                  ++english_aligned[sent.e[j]];
+
+                  //global counts
+                  ++word_translation[sent.f[i]][sent.e[j]];
+                  ++total_foreign[sent.f[i]];
+                  ++total_english[sent.e[j]];
+                }
+            }
+          if (DEBUG) cout << endl;
+        }
+      if (DEBUG) cout << endl;
+
+      static const WordID NULL_ = TD::Convert("NULL");
+      //handle unaligned words - align them to null
+      map<WordID, int>& nullcounts = word_translation[NULL_];
+      for (int j =0; j < sent.e_len; j++)
+        {
+          if (english_aligned.count(sent.e[j])) continue;
+          ++nullcounts[sent.e[j]];
+          ++total_foreign[NULL_];
+          ++total_english[sent.e[j]];
+        }
+
+      for (int i =0; i < sent.f_len; i++)
+        {
+          if (foreign_aligned.count(sent.f[i])) continue;
+          ++word_translation[sent.f[i]][NULL_];
+          ++total_english[NULL_];
+          ++total_foreign[sent.f[i]];
+        }
+    }
+
+  for(map < WordID, map<WordID,int> >::iterator it = word_translation.begin(); it != word_translation.end(); ++it)
+    {
+      const map<WordID, int>& trans = it->second;
+      for (map<WordID,int>::const_iterator iit = trans.begin(); iit != trans.end(); ++iit) {
+        cout << TD::Convert(it->first) << "," << TD::Convert(iit->first) << "=" << iit->second << "/" << total_foreign[it->first] << endl;
+      }
+    }
+
+  return 0;
+}
diff --git a/extools/extract.cc b/extools/extract.cc
new file mode 100644
index 00000000..6ad124d2
--- /dev/null
+++ b/extools/extract.cc
@@ -0,0 +1,341 @@
+#include "extract.h"
+
+#include <queue>
+#include <vector>
+#include <utility>
+#include <tr1/unordered_map>
+#include <set>
+
+#include
<boost/functional/hash.hpp> + +#include "sentence_pair.h" +#include "tdict.h" +#include "wordid.h" +#include "array2d.h" + +using namespace std; +using namespace tr1; + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} + +Extract::RuleObserver::~RuleObserver() { + cerr << "Rules extracted: " << count << endl; +} + +void Extract::ExtractBasePhrases(const int max_base_phrase_size, + const AnnotatedParallelSentence& sentence, + vector<ParallelSpan>* phrases) { + phrases->clear(); + + vector<pair<int,int> > f_spans(sentence.f_len, pair<int,int>(sentence.e_len, 0)); + vector<pair<int,int> > e_spans(sentence.e_len, pair<int,int>(sentence.f_len, 0)); + // for each alignment point in e, precompute the minimal consistent phrases in f + // for each alignment point in f, precompute the minimal consistent phrases in e + for (int i = 0; i < sentence.f_len; ++i) { + for (int j = 0; j < sentence.e_len; ++j) { + if (sentence.aligned(i,j)) { + if (j < f_spans[i].first) f_spans[i].first = j; + f_spans[i].second = j+1; + if (i < e_spans[j].first) e_spans[j].first = i; + e_spans[j].second = i+1; + } + } + } + + for (int i1 = 0; i1 < sentence.f_len; ++i1) { + if (sentence.f_aligned[i1] == 0) continue; + int j1 = sentence.e_len; + int j2 = 0; + const int i_limit = min(sentence.f_len, i1 + max_base_phrase_size); + for (int i2 = i1 + 1; i2 <= i_limit; ++i2) { + if (sentence.f_aligned[i2-1] == 0) continue; + // cerr << "F has aligned span " << i1 << " to " << i2 << endl; + j1 = min(j1, f_spans[i2-1].first); + j2 = max(j2, f_spans[i2-1].second); + if (j1 >= j2) continue; + if (j2 - j1 > max_base_phrase_size) continue; + int condition = 0; + for (int j = j1; j < j2; ++j) { + if (e_spans[j].first < i1) { condition = 1; break; } + if (e_spans[j].second > i2) { condition = 2; break; } + } + if (condition == 1) break; + if (condition == 2) continue; + // category types added later! 
+ phrases->push_back(ParallelSpan(i1, i2, j1, j2)); + // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl; + } + } +} + +void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence, + const int max_base_phrase_size, + vector<ParallelSpan>* phrases) { + const int num_phrases = phrases->size(); + map<int, map<int, map<int, map<int, bool> > > > marker; + for (int i = 0; i < num_phrases; ++i) { + const ParallelSpan& cur = (*phrases)[i]; + marker[cur.i1][cur.i2][cur.j1][cur.j2] = true; + } + for (int i = 0; i < num_phrases; ++i) { + const ParallelSpan& cur = (*phrases)[i]; + const int i1_max = cur.i1; + const int i2_min = cur.i2; + const int j1_max = cur.j1; + const int j2_min = cur.j2; + int i1_min = i1_max; + while (i1_min > 0 && sentence.f_aligned[i1_min-1] == 0) { --i1_min; } + int j1_min = j1_max; + while (j1_min > 0 && sentence.e_aligned[j1_min-1] == 0) { --j1_min; } + int i2_max = i2_min; + while (i2_max < sentence.f_len && sentence.f_aligned[i2_max] == 0) { ++i2_max; } + int j2_max = j2_min; + while (j2_max < sentence.e_len && sentence.e_aligned[j2_max] == 0) { ++j2_max; } + for (int i1 = i1_min; i1 <= i1_max; ++i1) { + const int ilim = min(i2_max, i1 + max_base_phrase_size); + for (int i2 = max(i1+1,i2_min); i2 <= ilim; ++i2) { + for (int j1 = j1_min; j1 <= j1_max; ++j1) { + const int jlim = min(j2_max, j1 + max_base_phrase_size); + for (int j2 = max(j1+1, j2_min); j2 <= jlim; ++j2) { + bool& seen = marker[i1][i2][j1][j2]; + if (!seen) + phrases->push_back(ParallelSpan(i1,i2,j1,j2)); + seen = true; + } + } + } + } + } +} + +// this uses the TARGET span (i,j) to annotate phrases, will copy +// phrases if there is more than one annotation. +// TODO: support source annotation +void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat, + const Array2D<vector<WordID> >& types, + vector<ParallelSpan>* phrases) { + const int num_unannotated_phrases = phrases->size(); + // have to use num_unannotated_phrases since we may grow the vector + for (int i = 0; i < num_unannotated_phrases; ++i) { + ParallelSpan& phrase = (*phrases)[i]; + const vector<WordID>* pcats = &types(phrase.j1, phrase.j2); + if (pcats->empty() && default_cat != 0) { + static vector<WordID> s_default(1, default_cat); + pcats = &s_default; + } + if (pcats->empty()) { + cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-" + << phrase.j1 << "," << phrase.j2 << " has no type. 
" + "Did you forget --default_category?\n"; + } + const vector<WordID>& cats = *pcats; + phrase.cat = cats[0]; + for (int ci = 1; ci < cats.size(); ++ci) { + ParallelSpan new_phrase = phrase; + new_phrase.cat = cats[ci]; + phrases->push_back(new_phrase); + } + } +} + +// a partially complete (f-side) of a rule +struct RuleItem { + vector<ParallelSpan> f; + int i,j,syms,vars; + explicit RuleItem(int pi) : i(pi), j(pi), syms(), vars() {} + void Extend(const WordID& fword) { + f.push_back(ParallelSpan(fword)); + ++j; + ++syms; + } + void Extend(const ParallelSpan& subphrase) { + f.push_back(subphrase); + j += subphrase.i2 - subphrase.i1; + ++vars; + ++syms; + } + bool RuleFEndsInVariable() const { + if (f.size() > 0) { + return f.back().IsVariable(); + } else { return false; } + } +}; + +void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence, + const vector<ParallelSpan>& phrases, + const int max_vars, + const int max_syms, + const bool permit_adjacent_nonterminals, + const bool require_aligned_terminal, + RuleObserver* observer) { + queue<RuleItem> q; // agenda for BFS + int max_len = -1; + unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans; + vector<vector<ParallelSpan> > spans_by_start(sentence.f_len); + set<int> starts; + for (int i = 0; i < phrases.size(); ++i) { + fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]); + max_len = max(max_len, phrases[i].i2 - phrases[i].i1); + // have we already added a rule item starting at phrases[i].i1? + if (starts.insert(phrases[i].i1).second) + q.push(RuleItem(phrases[i].i1)); + spans_by_start[phrases[i].i1].push_back(phrases[i]); + } + starts.clear(); + vector<pair<int,int> > next_e(sentence.e_len); + vector<WordID> cur_rhs_f, cur_rhs_e; + vector<pair<short, short> > cur_terminal_align; + vector<int> cur_es, cur_fs; + while(!q.empty()) { + const RuleItem& rule = q.front(); + + // extend the partial rule + if (rule.j < sentence.f_len && (rule.j - rule.i) < max_len && rule.syms < max_syms) { + RuleItem ew = rule; + + // extend with a word + ew.Extend(sentence.f[ew.j]); + q.push(ew); + + // with variables + if (rule.vars < max_vars && + !spans_by_start[rule.j].empty() && + ((!rule.RuleFEndsInVariable()) || permit_adjacent_nonterminals)) { + const vector<ParallelSpan>& sub_phrases = spans_by_start[rule.j]; + for (int it = 0; it < sub_phrases.size(); ++it) { + if (sub_phrases[it].i2 - sub_phrases[it].i1 + rule.j - rule.i <= max_len) { + RuleItem ev = rule; + ev.Extend(sub_phrases[it]); + q.push(ev); + assert(ev.j <= sentence.f_len); + } + } + } + } + // determine if rule is consistent + if (rule.syms > 0 && + fspans.count(make_pair(rule.i,rule.j)) && + (!rule.RuleFEndsInVariable() || rule.syms > 1)) { + const vector<ParallelSpan>& orig_spans = fspans[make_pair(rule.i,rule.j)]; + for (int s = 0; s < orig_spans.size(); ++s) { + const ParallelSpan& orig_span = orig_spans[s]; + const WordID lhs = orig_span.cat; + for (int j = orig_span.j1; j < orig_span.j2; ++j) next_e[j].first = -1; + int nt_index_e = 0; + for (int i = 0; i < rule.f.size(); ++i) { + const ParallelSpan& cur = rule.f[i]; + if (cur.IsVariable()) + next_e[cur.j1] = pair<int,int>(cur.j2, ++nt_index_e); + } + cur_rhs_f.clear(); + cur_rhs_e.clear(); + cur_terminal_align.clear(); + cur_fs.clear(); + cur_es.clear(); + + const int elen = orig_span.j2 - orig_span.j1; + vector<int> isvar(elen, 0); + int fbias = rule.i; + bool bad_rule = false; + bool has_aligned_terminal = false; + for (int i = 0; i < rule.f.size(); 
++i) { + const ParallelSpan& cur = rule.f[i]; + cur_rhs_f.push_back(cur.cat); + if (cur.cat > 0) { // terminal + if (sentence.f_aligned[fbias + i]) has_aligned_terminal = true; + cur_fs.push_back(fbias + i); + } else { // non-terminal + int subj1 = cur.j1 - orig_span.j1; + int subj2 = cur.j2 - orig_span.j1; + if (subj1 < 0 || subj2 > elen) { bad_rule = true; break; } + for (int j = subj1; j < subj2 && !bad_rule; ++j) { + int& isvarj = isvar[j]; + isvarj = true; + } + if (bad_rule) break; + cur_fs.push_back(-1); + fbias += cur.i2 - cur.i1 - 1; + } + } + if (require_aligned_terminal && !has_aligned_terminal) bad_rule = true; + if (!bad_rule) { + for (int j = orig_span.j1; j < orig_span.j2; ++j) { + if (next_e[j].first < 0) { + cur_rhs_e.push_back(sentence.e[j]); + cur_es.push_back(j); + } else { + cur_rhs_e.push_back(1 - next_e[j].second); // next_e[j].second is NT gap index + cur_es.push_back(-1); + j = next_e[j].first - 1; + } + } + for (short i = 0; i < cur_fs.size(); ++i) + if (cur_fs[i] >= 0) + for (short j = 0; j < cur_es.size(); ++j) + if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j])) + cur_terminal_align.push_back(make_pair(i,j)); + observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align); + } + } + } + q.pop(); + } +} + +void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) { + int ptr = start; + counts.clear(); + aligns.clear(); + while (ptr < end) { + SkipWhitespace(buf, &ptr); + int vstart = ptr; + while(ptr < end && buf[ptr] != '=') ++ptr; + assert(buf[ptr] == '='); + assert(ptr > vstart); + if (buf[vstart] == 'A' && buf[vstart+1] == '=') { + ++ptr; + while (ptr < end && !IsWhitespace(buf[ptr])) { + while(ptr < end && buf[ptr] == ',') { ++ptr; } + assert(ptr < end); + vstart = ptr; + while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr > vstart) { + short a, b; + AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b); + aligns.push_back(make_pair(a,b)); + } + } + } else { + int name = FD::Convert(string(buf,vstart,ptr-vstart)); + ++ptr; + vstart = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + assert(ptr > vstart); + counts.set_value(name, strtod(buf + vstart, NULL)); + } + } +} + +ostream& operator<<(ostream& os, const RuleStatistics& s) { + bool needspace = false; + for (SparseVector<float>::const_iterator it = s.counts.begin(); it != s.counts.end(); ++it) { + if (needspace) os << ' '; else needspace = true; + os << FD::Convert(it->first) << '=' << it->second; + } + if (s.aligns.size() > 0) { + os << " A="; + needspace = false; + for (int i = 0; i < s.aligns.size(); ++i) { + if (needspace) os << ','; else needspace = true; + os << s.aligns[i].first << '-' << s.aligns[i].second; + } + } + return os; +} + diff --git a/extools/extract.h b/extools/extract.h new file mode 100644 index 00000000..f87aa6cb --- /dev/null +++ b/extools/extract.h @@ -0,0 +1,110 @@ +#ifndef _EXTRACT_H_ +#define _EXTRACT_H_ + +#include <iostream> +#include <utility> +#include <vector> +#include "array2d.h" +#include "wordid.h" +#include "sparse_vector.h" + +struct AnnotatedParallelSentence; + +// usually represents a consistent phrase, which may +// be annotated with a type (cat) +// inside the rule extractor, this class is also used to represent a word +// in a partial rule. 
+struct ParallelSpan { + // i1 = i of f side + // i2 = j of f side + // j1 = i of e side + // j2 = j of e side + short i1,i2,j1,j2; + // cat is set by AnnotatePhrasesWithCategoryTypes, otherwise it's 0 + WordID cat; // category type of span (also overloaded by RuleItem class + // to be a word ID) + ParallelSpan() : i1(-1), i2(-1), j1(-1), j2(-1), cat() {} + // used by Rule class to represent a terminal symbol: + explicit ParallelSpan(WordID w) : i1(-1), i2(-1), j1(-1), j2(-1), cat(w) {} + ParallelSpan(int pi1, int pi2, int pj1, int pj2) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat() {} + ParallelSpan(int pi1, int pi2, int pj1, int pj2, WordID c) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat(c) {} + + // ParallelSpan is used in the Rule class where it is + // overloaded to also represent terminal symbols + inline bool IsVariable() const { return i1 != -1; } +}; + +// rule extraction logic lives here. this has no data, it's just got +// static member functions. +struct Extract { + // RuleObserver's CountRule is called for each rule extracted + // implement CountRuleImpl to do things like count the rules, + // write them to a file, etc. + struct RuleObserver { + RuleObserver() : count() {} + virtual void CountRule(WordID lhs, + const std::vector<WordID>& rhs_f, + const std::vector<WordID>& rhs_e, + const std::vector<std::pair<short, short> >& fe_terminal_alignments) { + ++count; + CountRuleImpl(lhs, rhs_f, rhs_e, fe_terminal_alignments); + } + virtual ~RuleObserver(); + + protected: + virtual void CountRuleImpl(WordID lhs, + const std::vector<WordID>& rhs_f, + const std::vector<WordID>& rhs_e, + const std::vector<std::pair<short, short> >& fe_terminal_alignments) = 0; + private: + int count; + }; + + // given a set of "tight" phrases and the aligned sentence they were + // extracted from, "loosen" them + static void LoosenPhraseBounds(const AnnotatedParallelSentence& sentence, + const int max_base_phrase_size, + std::vector<ParallelSpan>* phrases); + + // extract all consistent phrase pairs, up to size max_base_phrase_size + // (on the source side). these phrases will be "tight". + static void ExtractBasePhrases(const int max_base_phrase_size, + const AnnotatedParallelSentence& sentence, + std::vector<ParallelSpan>* phrases); + + // this uses the TARGET span (i,j) to annotate phrases, will copy + // phrases if there is more than one annotation. 
+ // TODO: support source annotation + static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat, + const Array2D<std::vector<WordID> >& types, + std::vector<ParallelSpan>* phrases); + + // use the Chiang (2007) extraction logic to extract consistent subphrases + // observer->CountRule is called once for each rule extracted + static void ExtractConsistentRules(const AnnotatedParallelSentence& sentence, + const std::vector<ParallelSpan>& phrases, + const int max_vars, + const int max_syms, + const bool permit_adjacent_nonterminals, + const bool require_aligned_terminal, + RuleObserver* observer); +}; + +// represents statistics / information about a rule pair +struct RuleStatistics { + SparseVector<float> counts; + std::vector<std::pair<short,short> > aligns; + RuleStatistics() {} + RuleStatistics(int name, float val, const std::vector<std::pair<short,short> >& al) : + aligns(al) { + counts.set_value(name, val); + } + void ParseRuleStatistics(const char* buf, int start, int end); + RuleStatistics& operator+=(const RuleStatistics& rhs) { + counts += rhs.counts; + return *this; + } +}; +std::ostream& operator<<(std::ostream& os, const RuleStatistics& s); + +#endif diff --git a/extools/extractor.cc b/extools/extractor.cc new file mode 100644 index 00000000..a3791d2a --- /dev/null +++ b/extools/extractor.cc @@ -0,0 +1,363 @@ +#include <iostream> +#include <vector> +#include <utility> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> +#include <boost/lexical_cast.hpp> + +#include "sparse_vector.h" +#include "sentence_pair.h" +#include "extract.h" +#include "tdict.h" +#include "fdict.h" +#include "wordid.h" +#include "array2d.h" +#include "filelib.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static const size_t MAX_LINE_LENGTH = 100000; +WordID kBOS, kEOS, kDIVIDER, kGAP; +int kCOUNT; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input,i", po::value<string>()->default_value("-"), "Input file") + ("default_category,d", po::value<string>(), "Default span type (use X for 'Hiero')") + ("loose", "Use loose phrase extraction heuristic for base phrases") + ("base_phrase,B", "Write base phrases") + ("base_phrase_spans", "Write base sentences and phrase spans") + ("bidir,b", "Extract bidirectional rules (for computing p(f|e) in addition to p(e|f))") + ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. 
Set to 0 to disable cache.")
+        ("silent", "Write nothing to stderr except errors")
+        ("phrase_context,C", "Write base phrase contexts")
+        ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and right when writing base phrase contexts")
+        ("max_base_phrase_size,L", po::value<int>()->default_value(10), "Maximum starting phrase size")
+        ("max_syms,l", po::value<int>()->default_value(5), "Maximum number of symbols in final phrase size")
+        ("max_vars,v", po::value<int>()->default_value(2), "Maximum number of nonterminal variables in final phrase size")
+        ("permit_adjacent_nonterminals,A", "Permit adjacent nonterminals in source side of rules")
+        ("no_required_aligned_terminal,n", "Do not require an aligned terminal")
+        ("help,h", "Print this help message and exit");
+  po::options_description clo("Command line options");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  po::notify(*conf);
+
+  if (conf->count("help") || conf->count("input") == 0) {
+    cerr << "\nUsage: extractor [-options]\n";
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// TODO how to handle alignment information?
+void WriteBasePhrases(const AnnotatedParallelSentence& sentence,
+                      const vector<ParallelSpan>& phrases) {
+  vector<WordID> e,f;
+  for (int it = 0; it < phrases.size(); ++it) {
+    const ParallelSpan& phrase = phrases[it];
+    e.clear();
+    f.clear();
+    for (int i = phrase.i1; i < phrase.i2; ++i)
+      f.push_back(sentence.f[i]);
+    for (int j = phrase.j1; j < phrase.j2; ++j)
+      e.push_back(sentence.e[j]);
+    cout << TD::GetString(f) << " ||| " << TD::GetString(e) << endl;
+  }
+}
+
+void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence,
+                          const vector<ParallelSpan>& phrases) {
+  cout << TD::GetString(sentence.f) << " ||| " << TD::GetString(sentence.e) << " |||";
+  for (int it = 0; it < phrases.size(); ++it) {
+    const ParallelSpan& phrase = phrases[it];
+    cout << " " << phrase.i1 << "-" << phrase.i2
+         << "-" << phrase.j1 << "-" << phrase.j2;
+  }
+  cout << endl;
+}
+
+struct CountCombiner {
+  CountCombiner(size_t csize) : combiner_size(csize) {}
+  ~CountCombiner() {
+    if (!cache.empty()) WriteAndClearCache();
+  }
+
+  void Count(const vector<WordID>& key,
+             const vector<WordID>& val,
+             const int count_type,
+             const vector<pair<short,short> >& aligns) {
+    if (combiner_size > 0) {
+      RuleStatistics& v = cache[key][val];
+      float newcount = v.counts.add_value(count_type, 1.0f);
+      // hack for adding alignments
+      if (newcount < 7.0f && aligns.size() > v.aligns.size())
+        v.aligns = aligns;
+      if (cache.size() > combiner_size) WriteAndClearCache();
+    } else {
+      cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| ";
+      cout << RuleStatistics(count_type, 1.0f, aligns) << endl;
+    }
+  }
+
+ private:
+  void WriteAndClearCache() {
+    for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin();
+         it != cache.end(); ++it) {
+      cout << TD::GetString(it->first) << '\t';
+      const Vec2PhraseCount& vals = it->second;
+      bool needdiv = false;
+      for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) {
+        if (needdiv) cout << " ||| "; else needdiv = true;
+        cout << TD::GetString(vi->first) << " ||| " << vi->second;
+      }
+      cout << endl;
+    }
+    cache.clear();
+  }
+
+  const size_t combiner_size;
+  typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > Vec2PhraseCount;
+
unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache; +}; + +// TODO optional source context +// output <k, v> : k = phrase "document" v = context "term" +void WritePhraseContexts(const AnnotatedParallelSentence& sentence, + const vector<ParallelSpan>& phrases, + const int ctx_size, + CountCombiner* o) { + vector<WordID> context(ctx_size * 2 + 1); + context[ctx_size] = kGAP; + vector<WordID> key; + key.reserve(100); + for (int it = 0; it < phrases.size(); ++it) { + const ParallelSpan& phrase = phrases[it]; + + // TODO, support src keys as well + key.resize(phrase.j2 - phrase.j1); + for (int j = phrase.j1; j < phrase.j2; ++j) + key[j - phrase.j1] = sentence.e[j]; + + for (int i = 0; i < ctx_size; ++i) { + int epos = phrase.j1 - 1 - i; + const WordID left_ctx = (epos < 0) ? kBOS : sentence.e[epos]; + context[ctx_size - i - 1] = left_ctx; + epos = phrase.j2 + i; + const WordID right_ctx = (epos >= sentence.e_len) ? kEOS : sentence.e[epos]; + context[ctx_size + i + 1] = right_ctx; + } + o->Count(key, context, kCOUNT, vector<pair<short,short> >()); + } +} + +struct SimpleRuleWriter : public Extract::RuleObserver { + protected: + virtual void CountRuleImpl(WordID lhs, + const vector<WordID>& rhs_f, + const vector<WordID>& rhs_e, + const vector<pair<short,short> >& fe_terminal_alignments) { + cout << "[" << TD::Convert(-lhs) << "] |||"; + for (int i = 0; i < rhs_f.size(); ++i) { + if (rhs_f[i] < 0) cout << " [" << TD::Convert(-rhs_f[i]) << ']'; + else cout << ' ' << TD::Convert(rhs_f[i]); + } + cout << " |||"; + for (int i = 0; i < rhs_e.size(); ++i) { + if (rhs_e[i] <= 0) cout << " [" << (1-rhs_e[i]) << ']'; + else cout << ' ' << TD::Convert(rhs_e[i]); + } + cout << " |||"; + for (int i = 0; i < fe_terminal_alignments.size(); ++i) { + cout << ' ' << fe_terminal_alignments[i].first << '-' << fe_terminal_alignments[i].second; + } + cout << endl; + } +}; + +struct HadoopStreamingRuleObserver : public Extract::RuleObserver { + HadoopStreamingRuleObserver(CountCombiner* cc, bool bidir_flag) : + bidir(bidir_flag), + kF(TD::Convert("F")), + kE(TD::Convert("E")), + kDIVIDER(TD::Convert("|||")), + kLB("["), kRB("]"), + combiner(*cc), + kEMPTY(), + kCFE(FD::Convert("CFE")) { + for (int i=1; i < 50; ++i) + index2sym[1-i] = TD::Convert(kLB + boost::lexical_cast<string>(i) + kRB); + fmajor_key.resize(10, kF); + emajor_key.resize(10, kE); + if (bidir) + fmajor_key[2] = emajor_key[2] = kDIVIDER; + else + fmajor_key[1] = kDIVIDER; + } + + protected: + virtual void CountRuleImpl(WordID lhs, + const vector<WordID>& rhs_f, + const vector<WordID>& rhs_e, + const vector<pair<short,short> >& fe_terminal_alignments) { + if (bidir) { // extract rules in "both directions" E->F and F->E + fmajor_key.resize(3 + rhs_f.size()); + emajor_key.resize(3 + rhs_e.size()); + fmajor_val.resize(rhs_e.size()); + emajor_val.resize(rhs_f.size()); + emajor_key[1] = fmajor_key[1] = MapSym(lhs); + int nt = 1; + for (int i = 0; i < rhs_f.size(); ++i) { + const WordID id = rhs_f[i]; + if (id < 0) { + fmajor_key[3 + i] = MapSym(id, nt); + emajor_val[i] = MapSym(id, nt); + ++nt; + } else { + fmajor_key[3 + i] = id; + emajor_val[i] = id; + } + } + for (int i = 0; i < rhs_e.size(); ++i) { + WordID id = rhs_e[i]; + if (id <= 0) { + fmajor_val[i] = index2sym[id]; + emajor_key[3 + i] = index2sym[id]; + } else { + fmajor_val[i] = id; + emajor_key[3 + i] = id; + } + } + combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments); + combiner.Count(emajor_key, emajor_val, kCFE, kEMPTY); + } else { // extract 
rules only in F->E + fmajor_key.resize(2 + rhs_f.size()); + fmajor_val.resize(rhs_e.size()); + fmajor_key[0] = MapSym(lhs); + int nt = 1; + for (int i = 0; i < rhs_f.size(); ++i) { + const WordID id = rhs_f[i]; + if (id < 0) + fmajor_key[2 + i] = MapSym(id, nt++); + else + fmajor_key[2 + i] = id; + } + for (int i = 0; i < rhs_e.size(); ++i) { + const WordID id = rhs_e[i]; + if (id <= 0) + fmajor_val[i] = index2sym[id]; + else + fmajor_val[i] = id; + } + combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments); + } + } + + private: + WordID MapSym(WordID sym, int ind = 0) { + WordID& r = cat2ind2sym[sym][ind]; + if (!r) { + if (ind == 0) + r = TD::Convert(kLB + TD::Convert(-sym) + kRB); + else + r = TD::Convert(kLB + TD::Convert(-sym) + "," + boost::lexical_cast<string>(ind) + kRB); + } + return r; + } + + const bool bidir; + const WordID kF, kE, kDIVIDER; + const string kLB, kRB; + CountCombiner& combiner; + const vector<pair<short,short> > kEMPTY; + const int kCFE; + map<WordID, map<int, WordID> > cat2ind2sym; + map<int, WordID> index2sym; + vector<WordID> emajor_key, emajor_val, fmajor_key, fmajor_val; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kBOS = TD::Convert("<s>"); + kEOS = TD::Convert("</s>"); + kDIVIDER = TD::Convert("|||"); + kGAP = TD::Convert("<PHRASE>"); + kCOUNT = FD::Convert("C"); + + WordID default_cat = 0; // 0 means no default- extraction will + // fail if a phrase is extracted without a + // category + if (conf.count("default_category")) { + string sdefault_cat = conf["default_category"].as<string>(); + default_cat = -TD::Convert(sdefault_cat); + cerr << "Default category: " << sdefault_cat << endl; + } else { + cerr << "No default category (use --default_category if you want to set one)\n"; + } + ReadFile rf(conf["input"].as<string>()); + istream& in = *rf.stream(); + + char buf[MAX_LINE_LENGTH]; + AnnotatedParallelSentence sentence; + vector<ParallelSpan> phrases; + const int max_base_phrase_size = conf["max_base_phrase_size"].as<int>(); + const bool write_phrase_contexts = conf.count("phrase_context") > 0; + const bool write_base_phrases = conf.count("base_phrase") > 0; + const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0; + const bool loose_phrases = conf.count("loose") > 0; + const bool silent = conf.count("silent") > 0; + const int max_syms = conf["max_syms"].as<int>(); + const int max_vars = conf["max_vars"].as<int>(); + const int ctx_size = conf["phrase_context_size"].as<int>(); + const bool permit_adjacent_nonterminals = conf.count("permit_adjacent_nonterminals") > 0; + const bool require_aligned_terminal = conf.count("no_required_aligned_terminal") == 0; + int line = 0; + CountCombiner cc(conf["combiner_size"].as<size_t>()); + HadoopStreamingRuleObserver o(&cc, + conf.count("bidir") > 0); + //SimpleRuleWriter o; + while(in) { + ++line; + in.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + if (!silent) { + if (line % 200 == 0) cerr << '.'; + if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; + } + sentence.ParseInputLine(buf); + phrases.clear(); + Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases); + if (loose_phrases) + Extract::LoosenPhraseBounds(sentence, max_base_phrase_size, &phrases); + if (phrases.empty()) { + cerr << "WARNING no phrases extracted line: " << line << endl; + continue; + } + if (write_phrase_contexts) { + WritePhraseContexts(sentence, phrases, ctx_size, &cc); + continue; + } + if (write_base_phrases) 
{ + WriteBasePhrases(sentence, phrases); + continue; + } + if (write_base_phrase_spans) { + WriteBasePhraseSpans(sentence, phrases); + continue; + } + Extract::AnnotatePhrasesWithCategoryTypes(default_cat, sentence.span_types, &phrases); + Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o); + } + if (!silent) cerr << endl; + return 0; +} + diff --git a/extools/filter_grammar.cc b/extools/filter_grammar.cc new file mode 100644 index 00000000..290a77f5 --- /dev/null +++ b/extools/filter_grammar.cc @@ -0,0 +1,202 @@ +/* + * Build suffix tree representation of a data set for grammar filtering + * ./filter_grammar <test set> < unfiltered.grammar > filter.grammar + * + */ +#include <iostream> +#include <string> +#include <map> +#include <vector> +#include <utility> +#include <cstdlib> +#include <fstream> +#include <tr1/unordered_map> + +#include "filelib.h" +#include "sentence_pair.h" +#include "suffix_tree.h" +#include "extract.h" +#include "fdict.h" +#include "tdict.h" + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + + +using namespace std; +using namespace std::tr1; + +static const size_t MAX_LINE_LENGTH = 64000000; + +typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; + + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + inline bool IsBracket(char c){return c == '[' || c == ']';} + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} + + + +int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { + static const WordID kDIV = TD::Convert("|||"); + + int ptr = sstart; + while(ptr < end) { + while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } + int start = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; } + //look in the buffer and see if its a nonterminal marker before integerizing it to wordID-anything with [...] 
or ||| + + const WordID w = TD::Convert(string(buf, start, ptr - start)); + + if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV)) + p->push_back(-1); + else { + if (w == kDIV) return ptr; + p->push_back(w); + } + } + return ptr; +} + + + +void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { + static const WordID kDIV = TD::Convert("|||"); + counts->clear(); + int ptr = 0; + while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } + if (buf[ptr] != '\t') { + cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; + exit(1); + } + cur_key->clear(); + // key is: "[X] ||| word word word" + int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); + cur_key->push_back(kDIV); + ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); + ++ptr; + int start = ptr; + int end = ptr; + int state = 0; // 0=reading label, 1=reading count + vector<WordID> name; + while(buf[ptr] != 0) { + while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + end = ptr - 3; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (start == end) { + cerr << "Got empty token!\n LINE=" << buf << endl; + exit(1); + } + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + SkipWhitespace(buf, &ptr); + start = ptr; + } + } + } + } + end=ptr; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (end > start) { + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + } +} + + + + + + + +int main(int argc, char* argv[]){ + if (argc != 2) { + cerr << "Usage: " << argv[0] << " testset.txt < unfiltered.grammar\n"; + return 1; + } + + assert(FileExists(argv[1])); + ReadFile rfts(argv[1]); + istream& testSet = *rfts.stream(); + ofstream filter_grammar_; + bool DEBUG = false; + + + AnnotatedParallelSentence sent; + char* buf = new char[MAX_LINE_LENGTH]; + cerr << "Build suffix tree from test set in " << argv[1] << endl; + //root of the suffix tree + Node<int> root; + int line=0; + + /* process the data set to build suffix tree + */ + while(!testSet.eof()) { + ++line; + testSet.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + + //hack to read in the test set using the alignedparallelsentence methods + strcat(buf," ||| fake ||| 0-0"); + sent.ParseInputLine(buf); + + if (DEBUG)cerr << line << "||| " << buf << " -- " << sent.f_len << endl; + + //add each successive suffix to the tree + for(int i =0;i<sent.f_len;i++) + root.InsertPath(sent.f, i, sent.f_len - 1); + if(DEBUG)cerr<<endl; + + } + + cerr << "Filtering grammar..." 
<< endl; + //process the unfiltered, unscored grammar + + ID2RuleStatistics cur_counts; + vector<WordID> cur_key; + line = 0; + + while(cin) + { + ++line; + cin.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + ParseLine(buf, &cur_key, &cur_counts); + const Node<int>* curnode = &root; + for(int i=0;i<cur_key.size() - 1; i++) + { + if (DEBUG) cerr << line << " " << cur_key[i] << " ::: "; + if(cur_key[i] == -1) + { + curnode = &root; + } else if (curnode) { + curnode = root.Extend(cur_key[i]); + if (!curnode) break; + } + } + + if(curnode) + cout << buf << endl; + } + + return 0; +} diff --git a/extools/lex_trans_tbl.h b/extools/lex_trans_tbl.h new file mode 100644 index 00000000..81d6ccc9 --- /dev/null +++ b/extools/lex_trans_tbl.h @@ -0,0 +1,24 @@ +/* + * lex_trans_tbl.h + * + * Created on: May 25, 2010 + * Author: Vlad + */ + +#ifndef LEX_TRANS_TBL_H_ +#define LEX_TRANS_TBL_H_ + +#include <map> + +class LexTranslationTable +{ + public: + + std::map < std::pair<WordID,WordID>,int > word_translation; + std::map <WordID, int> total_foreign; + std::map <WordID, int> total_english; + void createTTable(const char* buf); + +}; + +#endif /* LEX_TRANS_TBL_H_ */ diff --git a/extools/merge_lines.pl b/extools/merge_lines.pl new file mode 100755 index 00000000..8711e4ce --- /dev/null +++ b/extools/merge_lines.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl -w +use strict; + +if (scalar @ARGV < 2) { + die "Usage: $0 file1.txt file2.txt ...\n\n Concatenate the nth line of each input file. All files\n must be the same length.\n\n"; +} + +my @fhs=(); +for my $file (@ARGV) { + my $fh; + open $fh, "<$file" or die "Can't read $file: $!\n"; + push @fhs, $fh; +} + +my $first = shift @fhs; + +while(my $x = <$first>) { + my $ind = 0; + chomp $x; + my @fields = ($x); + for my $fh (@fhs) { + $ind++; + $x = <$fh>; + die "ERROR: Mismatched number of lines: $ARGV[$ind]\n" unless $x; + chomp $x; + push @fields, $x; + } + print join ' ||| ', @fields; + print "\n"; +} +my $ind = 0; +for my $fh (@fhs) { + $ind++; + my $x=<$fh>; + die "ERROR: $ARGV[$ind] has extra lines!\n" if $x; +} + +exit 0; + +for my $fh (@fhs) { + close $fh; +} + diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc new file mode 100644 index 00000000..eaf1b6d7 --- /dev/null +++ b/extools/mr_stripe_rule_reduce.cc @@ -0,0 +1,242 @@ +#include <iostream> +#include <vector> +#include <utility> +#include <cstdlib> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "tdict.h" +#include "sentence_pair.h" +#include "fdict.h" +#include "extract.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static const size_t MAX_LINE_LENGTH = 64000000; + +bool use_hadoop_counters = false; + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("phrase_marginals,p", "Compute phrase marginals") + ("use_hadoop_counters,C", "Enable this if running inside Hadoop") + ("bidir,b", "Rules are tagged as being F->E or E->F, invert E rules in output") + ("help,h", "Print this help message and exit"); + po::options_description clo("Command line options"); + po::options_description 
dcmdline_options; + dcmdline_options.add(opts); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + po::notify(*conf); + + if (conf->count("help")) { + cerr << "\nUsage: mr_stripe_rule_reduce [-options]\n"; + cerr << dcmdline_options << endl; + exit(1); + } +} + +typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; + +void PlusEquals(const ID2RuleStatistics& v, ID2RuleStatistics* self) { + for (ID2RuleStatistics::const_iterator it = v.begin(); it != v.end(); ++it) { + RuleStatistics& dest = (*self)[it->first]; + dest += it->second; + // TODO - do something smarter about alignments? + if (dest.aligns.empty() && !it->second.aligns.empty()) + dest.aligns = it->second.aligns; + } +} + +int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { + static const WordID kDIV = TD::Convert("|||"); + int ptr = sstart; + while(ptr < end) { + while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } + int start = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; } + const WordID w = TD::Convert(string(buf, start, ptr - start)); + if (w == kDIV) return ptr; + p->push_back(w); + } + return ptr; +} + +void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { + static const WordID kDIV = TD::Convert("|||"); + counts->clear(); + int ptr = 0; + while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } + if (buf[ptr] != '\t') { + cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; + exit(1); + } + cur_key->clear(); + // key is: "[X] ||| word word word" + int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); + if (buf[tmpp] != '\t') { + cur_key->push_back(kDIV); + ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); + } + ++ptr; + int start = ptr; + int end = ptr; + int state = 0; // 0=reading label, 1=reading count + vector<WordID> name; + while(buf[ptr] != 0) { + while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + end = ptr - 3; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (start == end) { + cerr << "Got empty token!\n LINE=" << buf << endl; + exit(1); + } + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + SkipWhitespace(buf, &ptr); + start = ptr; + } + } + } + } + end=ptr; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (end > start) { + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + } +} + +void WriteKeyValue(const vector<WordID>& key, const ID2RuleStatistics& val) { + cout << TD::GetString(key) << '\t'; + bool needdiv = false; + for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) { + if (needdiv) cout << " ||| "; else needdiv = true; + cout << TD::GetString(it->first) << " ||| " << it->second; + } + cout << endl; + if (use_hadoop_counters) cerr << "reporter:counter:UserCounters,RuleCount," << val.size() << endl; +} + +void DoPhraseMarginals(const vector<WordID>& key, const bool bidir, ID2RuleStatistics* val) { 
+  static const WordID kF = TD::Convert("F");
+  static const WordID kE = TD::Convert("E");
+  static const int kCF = FD::Convert("CF");
+  static const int kCE = FD::Convert("CE");
+  static const int kCFE = FD::Convert("CFE");
+  assert(key.size() > 0);
+  int cur_marginal_id = kCF;
+  if (bidir) {
+    if (key[0] != kF && key[0] != kE) {
+      cerr << "DoPhraseMarginals expects keys to have the form 'F|E [NT] word word word'\n";
+      cerr << "  but got: " << TD::GetString(key) << endl;
+      exit(1);
+    }
+    if (key[0] == kE) cur_marginal_id = kCE;
+  }
+  double tot = 0;
+  for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it)
+    tot += it->second.counts.value(kCFE);
+  for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) {
+    it->second.counts.set_value(cur_marginal_id, tot);
+
+    // prevent double counting of the joint
+    if (cur_marginal_id == kCE) it->second.counts.clear_value(kCFE);
+  }
+}
+
+void WriteWithInversions(const vector<WordID>& key, const ID2RuleStatistics& val) {
+  static const WordID kE = TD::Convert("E");
+  static const WordID kDIV = TD::Convert("|||");
+  vector<WordID> new_key(key.size() - 1);
+  for (int i = 1; i < key.size(); ++i)
+    new_key[i - 1] = key[i];
+  const bool do_invert = (key[0] == kE);
+  if (!do_invert) {
+    WriteKeyValue(new_key, val);
+  } else {
+    ID2RuleStatistics inv;
+    assert(new_key.size() > 2);
+    vector<WordID> tk(new_key.size() - 2);
+    for (int i = 0; i < tk.size(); ++i)
+      tk[i] = new_key[2 + i];
+    RuleStatistics& inv_stats = inv[tk];
+    for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) {
+      inv_stats.counts = it->second.counts;
+      vector<WordID> ekey(2 + it->first.size());
+      ekey[0] = key[1];
+      ekey[1] = kDIV;
+      for (int i = 0; i < it->first.size(); ++i)
+        ekey[2+i] = it->first[i];
+      WriteKeyValue(ekey, inv);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  char* buf = new char[MAX_LINE_LENGTH];
+  ID2RuleStatistics acc, cur_counts;
+  vector<WordID> key, cur_key;
+  int line = 0;
+  use_hadoop_counters = conf.count("use_hadoop_counters") > 0;
+  const bool phrase_marginals = conf.count("phrase_marginals") > 0;
+  const bool bidir = conf.count("bidir") > 0;
+  while(cin) {
+    ++line;
+    cin.getline(buf, MAX_LINE_LENGTH);
+    if (buf[0] == 0) continue;
+    ParseLine(buf, &cur_key, &cur_counts);
+    if (cur_key != key) {
+      if (key.size() > 0) {
+        if (phrase_marginals)
+          DoPhraseMarginals(key, bidir, &acc);
+        if (bidir)
+          WriteWithInversions(key, acc);
+        else
+          WriteKeyValue(key, acc);
+        acc.clear();
+      }
+      key = cur_key;
+    }
+    PlusEquals(cur_counts, &acc);
+  }
+  if (key.size() > 0) {
+    if (phrase_marginals)
+      DoPhraseMarginals(key, bidir, &acc);
+    if (bidir)
+      WriteWithInversions(key, acc);
+    else
+      WriteKeyValue(key, acc);
+  }
+  return 0;
+}
+
diff --git a/extools/score_grammar.cc b/extools/score_grammar.cc
new file mode 100644
index 00000000..534b6295
--- /dev/null
+++ b/extools/score_grammar.cc
@@ -0,0 +1,346 @@
+/*
+ * Score a grammar in striped format
+ * ./score_grammar <alignment> < filtered.grammar > scored.grammar
+ */
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <fstream>
+#include <tr1/unordered_map>
+
+#include "sentence_pair.h"
+#include "extract.h"
+#include "fdict.h"
+#include "tdict.h"
+#include "lex_trans_tbl.h"
+#include "filelib.h"
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+using namespace std; +using namespace std::tr1; + + +static const size_t MAX_LINE_LENGTH = 64000000; + +typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; + + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + inline bool IsBracket(char c){return c == '[' || c == ']';} + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} + +int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { + static const WordID kDIV = TD::Convert("|||"); + int ptr = sstart; + while(ptr < end) { + while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } + int start = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; } + const WordID w = TD::Convert(string(buf, start, ptr - start)); + + if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV)) + p->push_back(1 * w); + else { + if (w == kDIV) return ptr; + p->push_back(w); + } + } + return ptr; +} + + +void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { + static const WordID kDIV = TD::Convert("|||"); + counts->clear(); + int ptr = 0; + while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } + if (buf[ptr] != '\t') { + cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; + exit(1); + } + cur_key->clear(); + // key is: "[X] ||| word word word" + int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); + cur_key->push_back(kDIV); + ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); + ++ptr; + int start = ptr; + int end = ptr; + int state = 0; // 0=reading label, 1=reading count + vector<WordID> name; + while(buf[ptr] != 0) { + while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + end = ptr - 3; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (start == end) { + cerr << "Got empty token!\n LINE=" << buf << endl; + exit(1); + } + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + SkipWhitespace(buf, &ptr); + start = ptr; + } + } + } + } + end=ptr; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (end > start) { + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + } +} + + + +void LexTranslationTable::createTTable(const char* buf){ + + bool DEBUG = false; + + AnnotatedParallelSentence sent; + + sent.ParseInputLine(buf); + + //iterate over the alignment to compute aligned words + + for(int i =0;i<sent.aligned.width();i++) + { + for (int j=0;j<sent.aligned.height();j++) + { + if (DEBUG) cerr << sent.aligned(i,j) << " "; + if( sent.aligned(i,j)) + { + if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]); + ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])]; + ++total_foreign[sent.f[i]]; + ++total_english[sent.e[j]]; + } + } + if (DEBUG) cerr << endl; + } + if (DEBUG) cerr << endl; + + static const WordID NULL_ = TD::Convert("NULL"); + //handle unaligned words - align them 
to null + for (int j =0; j < sent.e_len; j++) + { + if (sent.e_aligned[j]) continue; + ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])]; + ++total_foreign[NULL_]; + ++total_english[sent.e[j]]; + } + + for (int i =0; i < sent.f_len; i++) + { + if (sent.f_aligned[i]) continue; + ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)]; + ++total_english[NULL_]; + ++total_foreign[sent.f[i]]; + } + +} + + +inline float safenlog(float v) { + if (v == 1.0f) return 0.0f; + float res = -log(v); + if (res > 100.0f) res = 100.0f; + return res; +} + +int main(int argc, char** argv){ + bool DEBUG= false; + if (argc != 2) { + cerr << "Usage: " << argv[0] << " corpus.al < filtered.grammar\n"; + return 1; + } + ifstream alignment (argv[1]); + istream& unscored_grammar = cin; + ostream& scored_grammar = cout; + + //create lexical translation table + cerr << "Creating table..." << endl; + char* buf = new char[MAX_LINE_LENGTH]; + + LexTranslationTable table; + + while(!alignment.eof()) + { + alignment.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + + table.createTTable(buf); + } + + bool PRINT_TABLE=false; + if (PRINT_TABLE) + { + ofstream trans_table; + trans_table.open("lex_trans_table.out"); + for(map < pair<WordID,WordID>,int >::iterator it = table.word_translation.begin(); it != table.word_translation.end(); ++it) + { + trans_table << TD::Convert(it->first.first) << "|||" << TD::Convert(it->first.second) << "==" << it->second << "//" << table.total_foreign[it->first.first] << "//" << table.total_english[it->first.second] << endl; + } + + trans_table.close(); + } + + + //score unscored grammar + cerr <<"Scoring grammar..." << endl; + + ID2RuleStatistics acc, cur_counts; + vector<WordID> key, cur_key,temp_key; + vector< pair<short,short> > al; + vector< pair<short,short> >::iterator ita; + int line = 0; + + static const int kCF = FD::Convert("CF"); + static const int kCE = FD::Convert("CE"); + static const int kCFE = FD::Convert("CFE"); + + while(!unscored_grammar.eof()) + { + ++line; + unscored_grammar.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + ParseLine(buf, &cur_key, &cur_counts); + + //loop over all the Target side phrases that this source aligns to + for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) + { + + /*Compute phrase translation prob. 
+            Print out scores in this format:
+            Phrase translation prob P(F|E)
+            Phrase translation prob P(E|F)
+            Lexical weighting prob lex(F|E)
+            Lexical weighting prob lex(E|F)
+          */
+
+          float pEF_ = it->second.counts.value(kCFE) / it->second.counts.value(kCF);
+          float pFE_ = it->second.counts.value(kCFE) / it->second.counts.value(kCE);
+
+          map <WordID, pair<int, float> > foreign_aligned;
+          map <WordID, pair<int, float> > english_aligned;
+
+          //Loop over all the alignment points to compute lexical translation probability
+          al = it->second.aligns;
+          for(ita = al.begin(); ita != al.end(); ++ita)
+            {
+              if (DEBUG)
+                {
+                  cerr << "\nA:" << ita->first << "," << ita->second << "::";
+                  cerr << TD::Convert(cur_key[ita->first + 2]) << "-" << TD::Convert(it->first[ita->second]);
+                }
+
+              //Lookup this alignment probability in the table
+              int temp = table.word_translation[pair<WordID,WordID> (cur_key[ita->first+2],it->first[ita->second])];
+              float f2e=0, e2f=0;
+              if ( table.total_foreign[cur_key[ita->first+2]] != 0)
+                f2e = (float) temp / table.total_foreign[cur_key[ita->first+2]];
+              if ( table.total_english[it->first[ita->second]] !=0 )
+                e2f = (float) temp / table.total_english[it->first[ita->second]];
+              if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f);
+
+              //local counts to keep track of which things haven't been aligned, to later compute their null alignment
+              if (foreign_aligned.count(cur_key[ita->first+2]))
+                {
+                  foreign_aligned[ cur_key[ita->first+2] ].first++;
+                  foreign_aligned[ cur_key[ita->first+2] ].second += e2f;
+                }
+              else
+                foreign_aligned [ cur_key[ita->first+2] ] = pair<int,float> (1,e2f);
+
+              if (english_aligned.count( it->first[ ita->second] ))
+                {
+                  english_aligned[ it->first[ ita->second ]].first++;
+                  english_aligned[ it->first[ ita->second] ].second += f2e;
+                }
+              else
+                english_aligned [ it->first[ ita->second] ] = pair<int,float> (1,f2e);
+            }
+
+          float final_lex_f2e=1, final_lex_e2f=1;
+          static const WordID NULL_ = TD::Convert("NULL");
+
+          //compute lexical weight P(F|E) and include unaligned foreign words
+          for(int i=0;i<cur_key.size(); i++)
+            {
+              if (!table.total_foreign.count(cur_key[i])) continue;      //if we don't have it in the translation table, we won't know its lexical weight
+
+              if (foreign_aligned.count(cur_key[i]))
+                {
+                  pair<int, float> temp_lex_prob = foreign_aligned[cur_key[i]];
+                  final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first;
+                }
+              else //dealing with null alignment
+                {
+                  int temp_count = table.word_translation[pair<WordID,WordID> (cur_key[i],NULL_)];
+                  float temp_e2f = (float) temp_count / table.total_english[NULL_];
+                  final_lex_e2f *= temp_e2f;
+                }
+            }
+
+          //compute P(E|F) unaligned english words
+          for(int j=0; j< it->first.size(); j++)
+            {
+              if (!table.total_english.count(it->first[j])) continue;
+
+              if (english_aligned.count(it->first[j]))
+                {
+                  pair<int, float> temp_lex_prob = english_aligned[it->first[j]];
+                  final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first;
+                }
+              else //dealing with null
+                {
+                  int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,it->first[j])];
+                  float temp_f2e = (float) temp_count / table.total_foreign[NULL_];
+                  final_lex_f2e *= temp_f2e;
+                }
+            }
+
+          scored_grammar << TD::GetString(cur_key);
+          scored_grammar << " " << TD::GetString(it->first) << " |||";
+          scored_grammar << " " << safenlog(pFE_) << " " << safenlog(pEF_);
+          scored_grammar << " " << safenlog(final_lex_e2f) << " " << safenlog(final_lex_f2e) << endl;
+        }
+    }
+}
+
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
new
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
new file mode 100644
index 00000000..91286059
--- /dev/null
+++ b/extools/sentence_pair.cc
@@ -0,0 +1,155 @@
+#include "sentence_pair.h"
+
+#include <cassert>
+#include <queue>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <utility>
+#include <set>
+
+#include "tdict.h"
+#include "wordid.h"
+#include "array2d.h"
+
+using namespace std;
+
+namespace {
+  inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+  inline void SkipWhitespace(const char* buf, int* ptr) {
+    while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+  }
+}
+
+void AnnotatedParallelSentence::Reset() {
+  f.clear();
+  e.clear();
+  e_aligned.clear();
+  f_aligned.clear();
+  aligns_by_fword.clear();
+  aligned.clear();
+  span_types.clear();
+}
+
+void AnnotatedParallelSentence::AllocateForAlignment() {
+  f_len = f.size();
+  e_len = e.size();
+  aligned.resize(f_len, e_len, false);
+  f_aligned.resize(f_len, 0);
+  e_aligned.resize(e_len, 0);
+  aligns_by_fword.resize(f_len);
+  span_types.resize(e_len, e_len+1);
+}
+
+// Read an alignment point of the form X-Y where X and Y are strings
+// of digits. If permit_col is true, the right edge is determined by
+// the presence of a colon.
+int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
+                                                  const int start,
+                                                  const int end,
+                                                  const bool permit_col,
+                                                  short* a,
+                                                  short* b) {
+  if (end - start < 3) {
+    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+    abort();
+  }
+  int c = start;
+  *a = 0;
+  while (c < end && buf[c] != '-') {
+    if (buf[c] < '0' || buf[c] > '9') {
+      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+      abort();
+    }
+    (*a) *= 10;
+    (*a) += buf[c] - '0';
+    ++c;
+  }
+  ++c;
+  if (c >= end) {
+    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+    abort();
+  }
+  (*b) = 0;
+  while (c < end && (!permit_col || buf[c] != ':')) {
+    if (buf[c] < '0' || buf[c] > '9') {
+      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+      abort();
+    }
+    (*b) *= 10;
+    (*b) += buf[c] - '0';
+    ++c;
+  }
+  return c;
+}
+
+void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
+  short a, b;
+  ReadAlignmentPoint(buf, start, end, false, &a, &b);
+  assert(a < f_len);
+  assert(b < e_len);
+  aligned(a,b) = true;
+  ++f_aligned[a];
+  ++e_aligned[b];
+  aligns_by_fword[a].push_back(make_pair(a,b));
+  // cerr << a << " " << b << endl;
+}
+
+void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
+  short a, b;
+  int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1;
+  if (buf[c-1] != ':' || c >= end) {
+    cerr << "Span badly formed: " << string(buf, start, end-start) << endl;
+    abort();
+  }
+  // cerr << a << " " << b << " " << string(buf,c,end-c) << endl;
+  span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c)));
+}
+
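ReadAlignmentPoint does double duty: with permit_col=false it parses a plain alignment point like 0-2, and with permit_col=true it stops at the colon of a span label like 0-2:NP and returns that offset so ParseSpanLabel can read the category name. The following rough standalone illustration of the same digit-accumulation parse uses a hypothetical test value, outside the real class:

  // Illustration of the X-Y[:LABEL] parse performed by ReadAlignmentPoint (sketch).
  #include <cstdio>

  int main() {
    const char* span = "12-15:NP";
    short a = 0, b = 0;
    int c = 0;
    // accumulate digits of the left index up to '-'
    while (span[c] != '-') { a = a * 10 + (span[c] - '0'); ++c; }
    ++c;
    // accumulate digits of the right index, stopping at ':' (permit_col behavior)
    while (span[c] && span[c] != ':') { b = b * 10 + (span[c] - '0'); ++c; }
    printf("span (%d,%d) labeled %s\n", a, b, span + c + 1);  // span (12,15) labeled NP
    return 0;
  }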
+// INPUT FORMAT
+// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP
+void AnnotatedParallelSentence::ParseInputLine(const char* buf) {
+  Reset();
+  int ptr = 0;
+  SkipWhitespace(buf, &ptr);
+  int start = ptr;
+  int state = 0;  // 0 = French, 1 = English, 2 = Alignment, 3 = Spans
+  while (char c = buf[ptr]) {
+    if (!IsWhitespace(c)) { ++ptr; continue; } else {
+      if (ptr - start == 3 && buf[start] == '|' && buf[start+1] == '|' && buf[start+2] == '|') {
+        ++state;
+        if (state == 4) { cerr << "Too many fields (ignoring):\n  " << buf << endl; return; }
+        if (state == 2) {
+          // cerr << "FLEN=" << f->size() << " ELEN=" << e->size() << endl;
+          AllocateForAlignment();
+        }
+        SkipWhitespace(buf, &ptr);
+        start = ptr;
+        continue;
+      }
+      switch (state) {
+        case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+        case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+        case 2: ParseAlignmentPoint(buf, start, ptr); break;
+        case 3: ParseSpanLabel(buf, start, ptr); break;
+        default: cerr << "Can't happen\n"; abort();
+      }
+      SkipWhitespace(buf, &ptr);
+      start = ptr;
+    }
+  }
+  if (ptr > start) {
+    switch (state) {
+      case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+      case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+      case 2: ParseAlignmentPoint(buf, start, ptr); break;
+      case 3: ParseSpanLabel(buf, start, ptr); break;
+      default: cerr << "Can't happen\n"; abort();
+    }
+  }
+  if (state < 2) {
+    cerr << "Not enough fields: " << buf << endl;
+    abort();
+  }
+  if (e.empty() || f.empty()) {
+    cerr << "Sentences must not be empty: " << buf << endl;
+    abort();
+  }
+}
+
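Putting the pieces together, a minimal driver for the parser might look like the following sketch. It assumes the same cdec headers and libcdec linkage that the Makefile.am above uses, and borrows the example line from the INPUT FORMAT comment.

  // Minimal driver for AnnotatedParallelSentence (sketch, not part of the patch).
  #include <iostream>
  #include "sentence_pair.h"
  #include "tdict.h"

  int main() {
    AnnotatedParallelSentence sent;
    sent.ParseInputLine("ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP");
    std::cout << "f_len=" << sent.f_len << " e_len=" << sent.e_len << "\n";
    // Print every alignment point recorded in the boolean grid.
    for (int i = 0; i < sent.f_len; ++i)
      for (int j = 0; j < sent.e_len; ++j)
        if (sent.aligned(i, j))
          std::cout << TD::Convert(sent.f[i]) << " <-> " << TD::Convert(sent.e[j]) << "\n";
    return 0;  // expect: ein <-> a, haus <-> house
  }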
diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h
new file mode 100644
index 00000000..d78be359
--- /dev/null
+++ b/extools/sentence_pair.h
@@ -0,0 +1,39 @@
+#ifndef _SENTENCE_PAIR_H_
+#define _SENTENCE_PAIR_H_
+
+#include <utility>
+#include <vector>
+#include "wordid.h"
+#include "array2d.h"
+
+// Represents a parallel sentence with a word alignment and category
+// annotations over subspans (currently in terms of the target side, e).
+// Read one using ParseInputLine, then use the public member
+// variables to query things about it.
+struct AnnotatedParallelSentence {
+  // read annotated parallel sentence from string
+  void ParseInputLine(const char* buf);
+
+  std::vector<WordID> f, e;  // words in f and e
+
+  // word alignment information
+  std::vector<int> e_aligned, f_aligned;  // counts the number of times column/row x is aligned
+  Array2D<bool> aligned;
+  std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;
+
+  // span type information
+  Array2D<std::vector<WordID> > span_types;  // span_types(i,j) is the list of category
+                                             // types for a span (i,j) in the TARGET language
+
+  int f_len, e_len;
+
+  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b);
+
+ private:
+  void Reset();
+  void AllocateForAlignment();
+  void ParseAlignmentPoint(const char* buf, int start, int end);
+  void ParseSpanLabel(const char* buf, int start, int end);
+};
+
+#endif
diff --git a/extools/simple-extract-context.sh b/extools/simple-extract-context.sh
new file mode 100755
index 00000000..17487b1c
--- /dev/null
+++ b/extools/simple-extract-context.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+MYDIR=$(dirname $0)
+
+export LANG=C
+date 1>&2
+$MYDIR/extractor -i $1 -c 500000 -L 12 -C | sort -t $'\t' -k 1 | $MYDIR/mr_stripe_rule_reduce
+date 1>&2
+
diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh
new file mode 100755
index 00000000..7d9f439d
--- /dev/null
+++ b/extools/simple-extract.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+export LANG=C
+date
+./extractor -i $1 -d X -c 500000 -L 12 -b | sort -t $'\t' -k 1 | gzip > ex.output.gz
+date
+# -p = compute phrase marginals
+# -b = input contains bidirectional rules (starting with F or E)
+gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
+date
+
diff --git a/extools/suffix_tree.h b/extools/suffix_tree.h
new file mode 100644
index 00000000..d3a9c08e
--- /dev/null
+++ b/extools/suffix_tree.h
@@ -0,0 +1,38 @@
+/*
+ * suffix_tree.h
+ *
+ *  Created on: May 17, 2010
+ *      Author: Vlad
+ */
+
+#ifndef SUFFIX_TREE_H_
+#define SUFFIX_TREE_H_
+
+#include <string>
+#include <map>
+#include <vector>
+
+template <class T>
+class Node {
+ public:
+  std::map<T, Node> edge_list_;
+  int InsertPath(const std::vector<T>& p, int start, int end);
+  const Node* Extend(const T& e) const {
+    typename std::map<T, Node>::const_iterator it = edge_list_.find(e);
+    if (it == edge_list_.end()) return NULL;
+    return &it->second;
+  }
+};
+
+// static so that including this header in multiple translation units
+// does not produce duplicate-symbol link errors
+static bool DEBUG = false;
+
+template <class T>
+int Node<T>::InsertPath(const std::vector<T>& p, int start, int end) {
+  Node* currNode = this;
+  for (int i = start; i <= end; i++) {
+    currNode = &(currNode->edge_list_)[p[i]];
+  }
+  return 1;
+}
+
+#endif /* SUFFIX_TREE_H_ */
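Despite the file name, Node<T> is a simple trie over explicitly inserted token paths: InsertPath creates one edge per symbol along the path, and Extend follows a single edge, returning NULL on a miss. A small usage sketch with hypothetical phrase data:

  // Usage sketch for Node<T> (phrase trie); the tokens here are made up.
  #include <iostream>
  #include <string>
  #include <vector>
  #include "suffix_tree.h"

  int main() {
    Node<std::string> root;
    std::vector<std::string> phrase;
    phrase.push_back("the");
    phrase.push_back("blue");
    phrase.push_back("house");
    root.InsertPath(phrase, 0, phrase.size() - 1);  // insert "the blue house"

    // Walk the trie one token at a time; Extend returns NULL on a miss.
    const Node<std::string>* n = root.Extend("the");
    if (n) n = n->Extend("blue");
    std::cout << (n ? "prefix found" : "prefix missing") << std::endl;
    return 0;
  }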
diff --git a/extools/test_data/README b/extools/test_data/README
new file mode 100644
index 00000000..e368cffc
--- /dev/null
+++ b/extools/test_data/README
@@ -0,0 +1,10 @@
+The following was used to create the test data. The real inputs
+were corpus.fr, corpus.en, and corpus.aligned. The generated files
+were corpus.len_cats and fr-en.al.len.
+
+  ./make_len_cats.pl corpus.en > corpus.len_cats
+
+  ../merge_lines.pl corpus.fr corpus.en corpus.aligned corpus.len_cats > fr-en.al.len
+
diff --git a/extools/test_data/corpus.aligned b/extools/test_data/corpus.aligned
new file mode 100644
index 00000000..aa09e9ab
--- /dev/null
+++ b/extools/test_data/corpus.aligned
@@ -0,0 +1,5 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3
+0-0
+0-0 1-1
diff --git a/extools/test_data/corpus.en b/extools/test_data/corpus.en
new file mode 100644
index 00000000..2d4751bf
--- /dev/null
+++ b/extools/test_data/corpus.en
@@ -0,0 +1,5 @@
+the blue house
+the hat
+there is a hat
+cap
+the cat
diff --git a/extools/test_data/corpus.fr b/extools/test_data/corpus.fr
new file mode 100644
index 00000000..75b5e127
--- /dev/null
+++ b/extools/test_data/corpus.fr
@@ -0,0 +1,5 @@
+la maison bleue
+le chapeau
+il y a un chapeau
+chapeau
+le chat
diff --git a/extools/test_data/corpus.len_cats b/extools/test_data/corpus.len_cats
new file mode 100644
index 00000000..18d321de
--- /dev/null
+++ b/extools/test_data/corpus.len_cats
@@ -0,0 +1,5 @@
+0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
+0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+0-1:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/fr-en.al.len b/extools/test_data/fr-en.al.len
new file mode 100644
index 00000000..7ee6b85d
--- /dev/null
+++ b/extools/test_data/fr-en.al.len
@@ -0,0 +1,5 @@
+la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+le chapeau ||| the hat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
+il y a un chapeau ||| there is a hat ||| 0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+chapeau ||| cap ||| 0-0 ||| 0-1:SHORT
+le chat ||| the cat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/make_len_cats.pl b/extools/test_data/make_len_cats.pl
new file mode 100755
index 00000000..25ef75fa
--- /dev/null
+++ b/extools/test_data/make_len_cats.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $max_len = 15;
+my @cat_names = qw( NULL SHORT SHORT MID MID MID LONG LONG LONG LONG LONG VLONG VLONG VLONG VLONG VLONG );
+
+while(<>) {
+  chomp;
+  my @words = split /\s+/;
+  my $len = scalar @words;
+  my @spans;
+  for (my $i = 0; $i < $len; $i++) {
+    for (my $k = 1; $k <= $max_len; $k++) {
+      my $j = $i + $k;
+      next if ($j > $len);
+      my $cat = $cat_names[$k];
+      die unless $cat;
+      push @spans, "$i-$j:$cat";
+    }
+  }
+  print "@spans\n";
+}
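Since fr-en.al.len is exactly the annotated format that ParseInputLine consumes, the test data doubles as a smoke test for the parser. A rough sketch, under the same header and linkage assumptions as the earlier driver:

  // Smoke test: parse every line of test_data/fr-en.al.len (sketch).
  #include <fstream>
  #include <iostream>
  #include <string>
  #include "sentence_pair.h"

  int main() {
    std::ifstream in("test_data/fr-en.al.len");
    std::string line;
    int n = 0;
    AnnotatedParallelSentence sent;
    while (getline(in, line)) {
      sent.ParseInputLine(line.c_str());  // aborts on malformed input
      ++n;
    }
    std::cout << "parsed " << n << " annotated sentence pairs" << std::endl;  // expect 5
    return 0;
  }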