author    redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-22 05:12:27 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-22 05:12:27 +0000
commit    7cc92b65a3185aa242088d830e166e495674efc9 (patch)
tree      681fe5237612a4e96ce36fb9fabef00042c8ee61 /extools
parent    37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools')
-rw-r--r--  extools/Makefile.am                   |  31
-rw-r--r--  extools/README                        |  32
-rw-r--r--  extools/build_lexical_translation.cc  | 104
-rw-r--r--  extools/extract.cc                    | 341
-rw-r--r--  extools/extract.h                     | 110
-rw-r--r--  extools/extractor.cc                  | 363
-rw-r--r--  extools/filter_grammar.cc             | 202
-rw-r--r--  extools/lex_trans_tbl.h               |  24
-rwxr-xr-x  extools/merge_lines.pl                |  43
-rw-r--r--  extools/mr_stripe_rule_reduce.cc      | 242
-rw-r--r--  extools/score_grammar.cc              | 346
-rw-r--r--  extools/sentence_pair.cc              | 155
-rw-r--r--  extools/sentence_pair.h               |  39
-rwxr-xr-x  extools/simple-extract-context.sh     |   9
-rwxr-xr-x  extools/simple-extract.sh             |  11
-rw-r--r--  extools/suffix_tree.h                 |  38
-rw-r--r--  extools/test_data/README              |  10
-rw-r--r--  extools/test_data/corpus.aligned      |   5
-rw-r--r--  extools/test_data/corpus.en           |   5
-rw-r--r--  extools/test_data/corpus.fr           |   5
-rw-r--r--  extools/test_data/corpus.len_cats     |   5
-rw-r--r--  extools/test_data/fr-en.al.len        |   5
-rwxr-xr-x  extools/test_data/make_len_cats.pl    |  23
23 files changed, 2148 insertions, 0 deletions
diff --git a/extools/Makefile.am b/extools/Makefile.am
new file mode 100644
index 00000000..5baefa21
--- /dev/null
+++ b/extools/Makefile.am
@@ -0,0 +1,31 @@
+bin_PROGRAMS = \
+ extractor \
+ mr_stripe_rule_reduce \
+ build_lexical_translation \
+ score_grammar \
+ filter_grammar
+
+noinst_PROGRAMS =
+
+build_lexical_translation_SOURCES = build_lexical_translation.cc extract.cc sentence_pair.cc
+build_lexical_translation_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+build_lexical_translation_LDFLAGS = -all-static
+
+mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc
+mr_stripe_rule_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_stripe_rule_reduce_LDFLAGS = -all-static
+
+extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc
+extractor_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+extractor_LDFLAGS = -all-static
+
+filter_grammar_SOURCES = sentence_pair.cc extract.cc filter_grammar.cc
+filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+filter_grammar_LDFLAGS = -all-static
+
+score_grammar_SOURCES = sentence_pair.cc score_grammar.cc extract.cc
+score_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+score_grammar_LDFLAGS = -all-static
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+
diff --git a/extools/README b/extools/README
new file mode 100644
index 00000000..af91ce79
--- /dev/null
+++ b/extools/README
@@ -0,0 +1,32 @@
+
+Categories have the format i-j:CAT where i and j are the indices of the spaces
+between words in the TARGET language. For example, a labeling with a slash category (NP/NN) can be written:
+
+ the blue house
+ 0-1:DT 1-2:JJ 2-3:NN 1-3:NBAR 0-2:NP/NN 0-3:NP
+
+
+You may give each span multiple labels, e.g.
+
+ NP
+ |
+ NBAR
+ |
+ NN
+ |
+ John
+ 0-1:NP 0-1:NBAR 0-1:NN
+
+However, this may result in a very large number of rules being extracted.
+
+
+****
+* Filtering and Scoring of Unscored and Unfiltered Grammars
+****
+
+Take the unfiltered grammar and a test set, and run:
+./filter_grammar <test set> < unfiltered.grammar > filtered.grammar
+
+Then, to score the new filtered grammar, run:
+./score_grammar <alignment> < filtered.grammar > scored.grammar
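+
+A complete run is scripted in simple-extract.sh; as an illustrative sketch of
+the main steps (corpus path and sort settings are placeholders):
+
+ ./extractor -i corpus.fr-en.al -d X -b | sort -t $'\t' -k 1 | \
+ ./mr_stripe_rule_reduce -p -b > unfiltered.grammar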
+
diff --git a/extools/build_lexical_translation.cc b/extools/build_lexical_translation.cc
new file mode 100644
index 00000000..f609f56a
--- /dev/null
+++ b/extools/build_lexical_translation.cc
@@ -0,0 +1,104 @@
+/*
+ * Build lexical translation table from alignment file to use for lexical translation probabilities when scoring a grammar
+ *
+ * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn
+ */
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <fstream>
+#include <tr1/unordered_map>
+
+#include "sentence_pair.h"
+#include "extract.h"
+#include "fdict.h"
+#include "tdict.h"
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+using namespace std;
+using namespace std::tr1;
+
+static const size_t MAX_LINE_LENGTH = 64000000;
+
+int main(int argc, char* argv[]){
+
+ bool DEBUG = false;
+
+ map <WordID, map<WordID, int> > word_translation;
+ map <WordID, int> total_foreign;
+ map <WordID, int> total_english;
+
+ AnnotatedParallelSentence sent;
+ char* buf = new char[MAX_LINE_LENGTH];
+ while(cin)
+ {
+ cin.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+
+ sent.ParseInputLine(buf);
+
+ map <WordID, int> foreign_aligned;
+ map <WordID, int> english_aligned;
+
+ //iterate over the alignment to compute aligned words
+
+ for(int i =0;i<sent.aligned.width();i++)
+ {
+ for (int j=0;j<sent.aligned.height();j++)
+ {
+ if (DEBUG) cout << sent.aligned(i,j) << " ";
+ if( sent.aligned(i,j))
+ {
+ if (DEBUG) cout << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
+ //local counts
+ ++foreign_aligned[sent.f[i]];
+ ++english_aligned[sent.e[j]];
+
+ //global counts
+ ++word_translation[sent.f[i]][sent.e[j]];
+ ++total_foreign[sent.f[i]];
+ ++total_english[sent.e[j]];
+ }
+ }
+ if (DEBUG) cout << endl;
+ }
+ if (DEBUG) cout << endl;
+
+ static const WordID NULL_ = TD::Convert("NULL");
+ //handle unaligned words - align them to null
+ map<WordID, int>& nullcounts = word_translation[NULL_];
+ for (int j =0; j < sent.e_len; j++)
+ {
+ if (english_aligned.count(sent.e[j])) continue;
+ ++nullcounts[sent.e[j]];
+ ++total_foreign[NULL_];
+ ++total_english[sent.e[j]];
+ }
+
+ for (int i =0; i < sent.f_len; i++)
+ {
+ if (foreign_aligned.count(sent.f[i])) continue;
+ ++word_translation[sent.f[i]][NULL_];
+ ++total_english[NULL_];
+ ++total_foreign[sent.f[i]];
+ }
+
+ }
+
+ for(map < WordID, map<WordID,int> >::iterator it = word_translation.begin(); it != word_translation.end(); ++it)
+ {
+ const map<WordID, int>& trans = it->second;
+ for (map<WordID,int>::const_iterator iit = trans.begin(); iit != trans.end(); ++iit) {
+ cout << TD::Convert(it->first) << "," << TD::Convert(iit->first) << "=" << iit->second << "/" << total_foreign[it->first] << endl;
+ }
+ }
+
+
+ return 0;
+}
diff --git a/extools/extract.cc b/extools/extract.cc
new file mode 100644
index 00000000..6ad124d2
--- /dev/null
+++ b/extools/extract.cc
@@ -0,0 +1,341 @@
+#include "extract.h"
+
+#include <queue>
+#include <vector>
+#include <utility>
+#include <tr1/unordered_map>
+#include <set>
+#include <cassert>
+#include <cstdlib>
+
+#include <boost/functional/hash.hpp>
+
+#include "sentence_pair.h"
+#include "tdict.h"
+#include "wordid.h"
+#include "array2d.h"
+
+using namespace std;
+using namespace tr1;
+
+namespace {
+ inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+ inline void SkipWhitespace(const char* buf, int* ptr) {
+ while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+ }
+}
+
+Extract::RuleObserver::~RuleObserver() {
+ cerr << "Rules extracted: " << count << endl;
+}
+
+void Extract::ExtractBasePhrases(const int max_base_phrase_size,
+ const AnnotatedParallelSentence& sentence,
+ vector<ParallelSpan>* phrases) {
+ phrases->clear();
+
+ vector<pair<int,int> > f_spans(sentence.f_len, pair<int,int>(sentence.e_len, 0));
+ vector<pair<int,int> > e_spans(sentence.e_len, pair<int,int>(sentence.f_len, 0));
+ // for each alignment point in e, precompute the minimal consistent phrases in f
+ // for each alignment point in f, precompute the minimal consistent phrases in e
+ for (int i = 0; i < sentence.f_len; ++i) {
+ for (int j = 0; j < sentence.e_len; ++j) {
+ if (sentence.aligned(i,j)) {
+ if (j < f_spans[i].first) f_spans[i].first = j;
+ f_spans[i].second = j+1;
+ if (i < e_spans[j].first) e_spans[j].first = i;
+ e_spans[j].second = i+1;
+ }
+ }
+ }
+
+ for (int i1 = 0; i1 < sentence.f_len; ++i1) {
+ if (sentence.f_aligned[i1] == 0) continue;
+ int j1 = sentence.e_len;
+ int j2 = 0;
+ const int i_limit = min(sentence.f_len, i1 + max_base_phrase_size);
+ for (int i2 = i1 + 1; i2 <= i_limit; ++i2) {
+ if (sentence.f_aligned[i2-1] == 0) continue;
+ // cerr << "F has aligned span " << i1 << " to " << i2 << endl;
+ j1 = min(j1, f_spans[i2-1].first);
+ j2 = max(j2, f_spans[i2-1].second);
+ if (j1 >= j2) continue;
+ if (j2 - j1 > max_base_phrase_size) continue;
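+ // consistency check over e[j1,j2): condition 1 means some e word in the
+ // span aligns left of i1 (no larger i2 can fix that, so stop growing);
+ // condition 2 means some e word aligns at or beyond i2 (a larger i2 may
+ // still cover it, so just skip this i2)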
+ int condition = 0;
+ for (int j = j1; j < j2; ++j) {
+ if (e_spans[j].first < i1) { condition = 1; break; }
+ if (e_spans[j].second > i2) { condition = 2; break; }
+ }
+ if (condition == 1) break;
+ if (condition == 2) continue;
+ // category types added later!
+ phrases->push_back(ParallelSpan(i1, i2, j1, j2));
+ // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl;
+ }
+ }
+}
+
+void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
+ const int max_base_phrase_size,
+ vector<ParallelSpan>* phrases) {
+ const int num_phrases = phrases->size();
+ map<int, map<int, map<int, map<int, bool> > > > marker;
+ for (int i = 0; i < num_phrases; ++i) {
+ const ParallelSpan& cur = (*phrases)[i];
+ marker[cur.i1][cur.i2][cur.j1][cur.j2] = true;
+ }
+ for (int i = 0; i < num_phrases; ++i) {
+ const ParallelSpan& cur = (*phrases)[i];
+ const int i1_max = cur.i1;
+ const int i2_min = cur.i2;
+ const int j1_max = cur.j1;
+ const int j2_min = cur.j2;
+ int i1_min = i1_max;
+ while (i1_min > 0 && sentence.f_aligned[i1_min-1] == 0) { --i1_min; }
+ int j1_min = j1_max;
+ while (j1_min > 0 && sentence.e_aligned[j1_min-1] == 0) { --j1_min; }
+ int i2_max = i2_min;
+ while (i2_max < sentence.f_len && sentence.f_aligned[i2_max] == 0) { ++i2_max; }
+ int j2_max = j2_min;
+ while (j2_max < sentence.e_len && sentence.e_aligned[j2_max] == 0) { ++j2_max; }
+ for (int i1 = i1_min; i1 <= i1_max; ++i1) {
+ const int ilim = min(i2_max, i1 + max_base_phrase_size);
+ for (int i2 = max(i1+1,i2_min); i2 <= ilim; ++i2) {
+ for (int j1 = j1_min; j1 <= j1_max; ++j1) {
+ const int jlim = min(j2_max, j1 + max_base_phrase_size);
+ for (int j2 = max(j1+1, j2_min); j2 <= jlim; ++j2) {
+ bool& seen = marker[i1][i2][j1][j2];
+ if (!seen)
+ phrases->push_back(ParallelSpan(i1,i2,j1,j2));
+ seen = true;
+ }
+ }
+ }
+ }
+ }
+}
+
+// this uses the TARGET span (i,j) to annotate phrases, will copy
+// phrases if there is more than one annotation.
+// TODO: support source annotation
+void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
+ const Array2D<vector<WordID> >& types,
+ vector<ParallelSpan>* phrases) {
+ const int num_unannotated_phrases = phrases->size();
+ // have to use num_unannotated_phrases since we may grow the vector
+ for (int i = 0; i < num_unannotated_phrases; ++i) {
+ ParallelSpan& phrase = (*phrases)[i];
+ const vector<WordID>* pcats = &types(phrase.j1, phrase.j2);
+ if (pcats->empty() && default_cat != 0) {
+ static vector<WordID> s_default(1, default_cat);
+ pcats = &s_default;
+ }
+ if (pcats->empty()) {
+ cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-"
+ << phrase.j1 << "," << phrase.j2 << " has no type. "
+ "Did you forget --default_category?\n";
+ abort(); // reading cats[0] below would be undefined on an empty vector
+ }
+ const vector<WordID>& cats = *pcats;
+ phrase.cat = cats[0];
+ for (int ci = 1; ci < cats.size(); ++ci) {
+ ParallelSpan new_phrase = phrase;
+ new_phrase.cat = cats[ci];
+ phrases->push_back(new_phrase);
+ }
+ }
+}
+
+// a partially completed f-side of a rule
+struct RuleItem {
+ vector<ParallelSpan> f;
+ int i,j,syms,vars;
+ explicit RuleItem(int pi) : i(pi), j(pi), syms(), vars() {}
+ void Extend(const WordID& fword) {
+ f.push_back(ParallelSpan(fword));
+ ++j;
+ ++syms;
+ }
+ void Extend(const ParallelSpan& subphrase) {
+ f.push_back(subphrase);
+ j += subphrase.i2 - subphrase.i1;
+ ++vars;
+ ++syms;
+ }
+ bool RuleFEndsInVariable() const {
+ if (f.size() > 0) {
+ return f.back().IsVariable();
+ } else { return false; }
+ }
+};
+
+void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
+ const vector<ParallelSpan>& phrases,
+ const int max_vars,
+ const int max_syms,
+ const bool permit_adjacent_nonterminals,
+ const bool require_aligned_terminal,
+ RuleObserver* observer) {
+ queue<RuleItem> q; // agenda for BFS
+ int max_len = -1;
+ unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans;
+ vector<vector<ParallelSpan> > spans_by_start(sentence.f_len);
+ set<int> starts;
+ for (int i = 0; i < phrases.size(); ++i) {
+ fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]);
+ max_len = max(max_len, phrases[i].i2 - phrases[i].i1);
+ // have we already added a rule item starting at phrases[i].i1?
+ if (starts.insert(phrases[i].i1).second)
+ q.push(RuleItem(phrases[i].i1));
+ spans_by_start[phrases[i].i1].push_back(phrases[i]);
+ }
+ starts.clear();
+ vector<pair<int,int> > next_e(sentence.e_len);
+ vector<WordID> cur_rhs_f, cur_rhs_e;
+ vector<pair<short, short> > cur_terminal_align;
+ vector<int> cur_es, cur_fs;
+ while(!q.empty()) {
+ const RuleItem& rule = q.front();
+
+ // extend the partial rule
+ if (rule.j < sentence.f_len && (rule.j - rule.i) < max_len && rule.syms < max_syms) {
+ RuleItem ew = rule;
+
+ // extend with a word
+ ew.Extend(sentence.f[ew.j]);
+ q.push(ew);
+
+ // with variables
+ if (rule.vars < max_vars &&
+ !spans_by_start[rule.j].empty() &&
+ ((!rule.RuleFEndsInVariable()) || permit_adjacent_nonterminals)) {
+ const vector<ParallelSpan>& sub_phrases = spans_by_start[rule.j];
+ for (int it = 0; it < sub_phrases.size(); ++it) {
+ if (sub_phrases[it].i2 - sub_phrases[it].i1 + rule.j - rule.i <= max_len) {
+ RuleItem ev = rule;
+ ev.Extend(sub_phrases[it]);
+ q.push(ev);
+ assert(ev.j <= sentence.f_len);
+ }
+ }
+ }
+ }
+ // determine if rule is consistent
+ if (rule.syms > 0 &&
+ fspans.count(make_pair(rule.i,rule.j)) &&
+ (!rule.RuleFEndsInVariable() || rule.syms > 1)) {
+ const vector<ParallelSpan>& orig_spans = fspans[make_pair(rule.i,rule.j)];
+ for (int s = 0; s < orig_spans.size(); ++s) {
+ const ParallelSpan& orig_span = orig_spans[s];
+ const WordID lhs = orig_span.cat;
+ for (int j = orig_span.j1; j < orig_span.j2; ++j) next_e[j].first = -1;
+ int nt_index_e = 0;
+ for (int i = 0; i < rule.f.size(); ++i) {
+ const ParallelSpan& cur = rule.f[i];
+ if (cur.IsVariable())
+ next_e[cur.j1] = pair<int,int>(cur.j2, ++nt_index_e);
+ }
+ cur_rhs_f.clear();
+ cur_rhs_e.clear();
+ cur_terminal_align.clear();
+ cur_fs.clear();
+ cur_es.clear();
+
+ const int elen = orig_span.j2 - orig_span.j1;
+ vector<int> isvar(elen, 0);
+ int fbias = rule.i;
+ bool bad_rule = false;
+ bool has_aligned_terminal = false;
+ for (int i = 0; i < rule.f.size(); ++i) {
+ const ParallelSpan& cur = rule.f[i];
+ cur_rhs_f.push_back(cur.cat);
+ if (cur.cat > 0) { // terminal
+ if (sentence.f_aligned[fbias + i]) has_aligned_terminal = true;
+ cur_fs.push_back(fbias + i);
+ } else { // non-terminal
+ int subj1 = cur.j1 - orig_span.j1;
+ int subj2 = cur.j2 - orig_span.j1;
+ if (subj1 < 0 || subj2 > elen) { bad_rule = true; break; }
+ for (int j = subj1; j < subj2 && !bad_rule; ++j) {
+ int& isvarj = isvar[j];
+ isvarj = true;
+ }
+ if (bad_rule) break;
+ cur_fs.push_back(-1);
+ fbias += cur.i2 - cur.i1 - 1;
+ }
+ }
+ if (require_aligned_terminal && !has_aligned_terminal) bad_rule = true;
+ if (!bad_rule) {
+ for (int j = orig_span.j1; j < orig_span.j2; ++j) {
+ if (next_e[j].first < 0) {
+ cur_rhs_e.push_back(sentence.e[j]);
+ cur_es.push_back(j);
+ } else {
+ cur_rhs_e.push_back(1 - next_e[j].second); // next_e[j].second is NT gap index
+ cur_es.push_back(-1);
+ j = next_e[j].first - 1;
+ }
+ }
+ for (short i = 0; i < cur_fs.size(); ++i)
+ if (cur_fs[i] >= 0)
+ for (short j = 0; j < cur_es.size(); ++j)
+ if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
+ cur_terminal_align.push_back(make_pair(i,j));
+ observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
+ }
+ }
+ }
+ q.pop();
+ }
+}
+
+void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) {
+ int ptr = start;
+ counts.clear();
+ aligns.clear();
+ while (ptr < end) {
+ SkipWhitespace(buf, &ptr);
+ int vstart = ptr;
+ while(ptr < end && buf[ptr] != '=') ++ptr;
+ assert(buf[ptr] == '=');
+ assert(ptr > vstart);
+ if (buf[vstart] == 'A' && buf[vstart+1] == '=') {
+ ++ptr;
+ while (ptr < end && !IsWhitespace(buf[ptr])) {
+ while(ptr < end && buf[ptr] == ',') { ++ptr; }
+ assert(ptr < end);
+ vstart = ptr;
+ while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; }
+ if (ptr > vstart) {
+ short a, b;
+ AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b);
+ aligns.push_back(make_pair(a,b));
+ }
+ }
+ } else {
+ int name = FD::Convert(string(buf,vstart,ptr-vstart));
+ ++ptr;
+ vstart = ptr;
+ while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
+ assert(ptr > vstart);
+ counts.set_value(name, strtod(buf + vstart, NULL));
+ }
+ }
+}
+
+ostream& operator<<(ostream& os, const RuleStatistics& s) {
+ bool needspace = false;
+ for (SparseVector<float>::const_iterator it = s.counts.begin(); it != s.counts.end(); ++it) {
+ if (needspace) os << ' '; else needspace = true;
+ os << FD::Convert(it->first) << '=' << it->second;
+ }
+ if (s.aligns.size() > 0) {
+ os << " A=";
+ needspace = false;
+ for (int i = 0; i < s.aligns.size(); ++i) {
+ if (needspace) os << ','; else needspace = true;
+ os << s.aligns[i].first << '-' << s.aligns[i].second;
+ }
+ }
+ return os;
+}
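+
+// the serialized form written above (and read back by ParseRuleStatistics)
+// is a list of feature=value pairs followed by optional alignments, e.g.:
+// CFE=2 CF=10 A=0-0,1-2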
+
diff --git a/extools/extract.h b/extools/extract.h
new file mode 100644
index 00000000..f87aa6cb
--- /dev/null
+++ b/extools/extract.h
@@ -0,0 +1,110 @@
+#ifndef _EXTRACT_H_
+#define _EXTRACT_H_
+
+#include <iostream>
+#include <utility>
+#include <vector>
+#include "array2d.h"
+#include "wordid.h"
+#include "sparse_vector.h"
+
+struct AnnotatedParallelSentence;
+
+// usually represents a consistent phrase, which may
+// be annotated with a type (cat)
+// inside the rule extractor, this class is also used to represent a word
+// in a partial rule.
+struct ParallelSpan {
+ // i1 = i of f side
+ // i2 = j of f side
+ // j1 = i of e side
+ // j2 = j of e side
+ short i1,i2,j1,j2;
+ // cat is set by AnnotatePhrasesWithCategoryTypes, otherwise it's 0
+ WordID cat; // category type of span (also overloaded by RuleItem class
+ // to be a word ID)
+ ParallelSpan() : i1(-1), i2(-1), j1(-1), j2(-1), cat() {}
+ // used by Rule class to represent a terminal symbol:
+ explicit ParallelSpan(WordID w) : i1(-1), i2(-1), j1(-1), j2(-1), cat(w) {}
+ ParallelSpan(int pi1, int pi2, int pj1, int pj2) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat() {}
+ ParallelSpan(int pi1, int pi2, int pj1, int pj2, WordID c) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat(c) {}
+
+ // ParallelSpan is used in the Rule class where it is
+ // overloaded to also represent terminal symbols
+ inline bool IsVariable() const { return i1 != -1; }
+};
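+
+// e.g. ParallelSpan(0, 2, 1, 3) is the phrase pair f[0,2) / e[1,3), while
+// ParallelSpan(w) for a word id w is the terminal w inside a partial rule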
+
+// rule extraction logic lives here. this has no data, it's just got
+// static member functions.
+struct Extract {
+ // RuleObserver's CountRule is called for each rule extracted
+ // implement CountRuleImpl to do things like count the rules,
+ // write them to a file, etc.
+ struct RuleObserver {
+ RuleObserver() : count() {}
+ virtual void CountRule(WordID lhs,
+ const std::vector<WordID>& rhs_f,
+ const std::vector<WordID>& rhs_e,
+ const std::vector<std::pair<short, short> >& fe_terminal_alignments) {
+ ++count;
+ CountRuleImpl(lhs, rhs_f, rhs_e, fe_terminal_alignments);
+ }
+ virtual ~RuleObserver();
+
+ protected:
+ virtual void CountRuleImpl(WordID lhs,
+ const std::vector<WordID>& rhs_f,
+ const std::vector<WordID>& rhs_e,
+ const std::vector<std::pair<short, short> >& fe_terminal_alignments) = 0;
+ private:
+ int count;
+ };
+
+ // given a set of "tight" phrases and the aligned sentence they were
+ // extracted from, "loosen" them
+ static void LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
+ const int max_base_phrase_size,
+ std::vector<ParallelSpan>* phrases);
+
+ // extract all consistent phrase pairs, up to size max_base_phrase_size
+ // (on the source side). these phrases will be "tight".
+ static void ExtractBasePhrases(const int max_base_phrase_size,
+ const AnnotatedParallelSentence& sentence,
+ std::vector<ParallelSpan>* phrases);
+
+ // this uses the TARGET span (i,j) to annotate phrases, will copy
+ // phrases if there is more than one annotation.
+ // TODO: support source annotation
+ static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
+ const Array2D<std::vector<WordID> >& types,
+ std::vector<ParallelSpan>* phrases);
+
+ // use the Chiang (2007) extraction logic to extract consistent subphrases
+ // observer->CountRule is called once for each rule extracted
+ static void ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
+ const std::vector<ParallelSpan>& phrases,
+ const int max_vars,
+ const int max_syms,
+ const bool permit_adjacent_nonterminals,
+ const bool require_aligned_terminal,
+ RuleObserver* observer);
+};
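+
+// A minimal observer sketch (illustrative only, not part of this checkin):
+// only CountRuleImpl must be implemented; the base-class CountRule wrapper
+// maintains the running count reported when the observer is destroyed.
+//
+// struct ArityObserver : public Extract::RuleObserver {
+//  protected:
+//   virtual void CountRuleImpl(WordID lhs,
+//                              const std::vector<WordID>& rhs_f,
+//                              const std::vector<WordID>& rhs_e,
+//                              const std::vector<std::pair<short, short> >& fe_terminal_alignments) {
+//     std::cerr << "[" << TD::Convert(-lhs) << "] rule with "
+//               << rhs_f.size() << " source symbols\n";
+//   }
+// };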
+
+// represents statistics / information about a rule pair
+struct RuleStatistics {
+ SparseVector<float> counts;
+ std::vector<std::pair<short,short> > aligns;
+ RuleStatistics() {}
+ RuleStatistics(int name, float val, const std::vector<std::pair<short,short> >& al) :
+ aligns(al) {
+ counts.set_value(name, val);
+ }
+ void ParseRuleStatistics(const char* buf, int start, int end);
+ RuleStatistics& operator+=(const RuleStatistics& rhs) {
+ counts += rhs.counts;
+ return *this;
+ }
+};
+std::ostream& operator<<(std::ostream& os, const RuleStatistics& s);
+
+#endif
diff --git a/extools/extractor.cc b/extools/extractor.cc
new file mode 100644
index 00000000..a3791d2a
--- /dev/null
+++ b/extools/extractor.cc
@@ -0,0 +1,363 @@
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <map>
+#include <tr1/unordered_map>
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "sparse_vector.h"
+#include "sentence_pair.h"
+#include "extract.h"
+#include "tdict.h"
+#include "fdict.h"
+#include "wordid.h"
+#include "array2d.h"
+#include "filelib.h"
+
+using namespace std;
+using namespace std::tr1;
+namespace po = boost::program_options;
+
+static const size_t MAX_LINE_LENGTH = 100000;
+WordID kBOS, kEOS, kDIVIDER, kGAP;
+int kCOUNT;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("input,i", po::value<string>()->default_value("-"), "Input file")
+ ("default_category,d", po::value<string>(), "Default span type (use X for 'Hiero')")
+ ("loose", "Use loose phrase extraction heuristic for base phrases")
+ ("base_phrase,B", "Write base phrases")
+ ("base_phrase_spans", "Write base sentences and phrase spans")
+ ("bidir,b", "Extract bidirectional rules (for computing p(f|e) in addition to p(e|f))")
+ ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 0 to disable cache.")
+ ("silent", "Write nothing to stderr except errors")
+ ("phrase_context,C", "Write base phrase contexts")
+ ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts")
+ ("max_base_phrase_size,L", po::value<int>()->default_value(10), "Maximum starting phrase size")
+ ("max_syms,l", po::value<int>()->default_value(5), "Maximum number of symbols in final phrase size")
+ ("max_vars,v", po::value<int>()->default_value(2), "Maximum number of nonterminal variables in final phrase size")
+ ("permit_adjacent_nonterminals,A", "Permit adjacent nonterminals in source side of rules")
+ ("no_required_aligned_terminal,n", "Do not require an aligned terminal")
+ ("help,h", "Print this help message and exit");
+ po::options_description clo("Command line options");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ po::notify(*conf);
+
+ if (conf->count("help") || conf->count("input") == 0) {
+ cerr << "\nUsage: extractor [-options]\n";
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+// TODO how to handle alignment information?
+void WriteBasePhrases(const AnnotatedParallelSentence& sentence,
+ const vector<ParallelSpan>& phrases) {
+ vector<WordID> e,f;
+ for (int it = 0; it < phrases.size(); ++it) {
+ const ParallelSpan& phrase = phrases[it];
+ e.clear();
+ f.clear();
+ for (int i = phrase.i1; i < phrase.i2; ++i)
+ f.push_back(sentence.f[i]);
+ for (int j = phrase.j1; j < phrase.j2; ++j)
+ e.push_back(sentence.e[j]);
+ cout << TD::GetString(f) << " ||| " << TD::GetString(e) << endl;
+ }
+}
+
+void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence,
+ const vector<ParallelSpan>& phrases) {
+ cout << TD::GetString(sentence.f) << " ||| " << TD::GetString(sentence.e) << " |||";
+ for (int it = 0; it < phrases.size(); ++it) {
+ const ParallelSpan& phrase = phrases[it];
+ cout << " " << phrase.i1 << "-" << phrase.i2
+ << "-" << phrase.j1 << "-" << phrase.j2;
+ }
+ cout << endl;
+}
+
+struct CountCombiner {
+ CountCombiner(size_t csize) : combiner_size(csize) {}
+ ~CountCombiner() {
+ if (!cache.empty()) WriteAndClearCache();
+ }
+
+ void Count(const vector<WordID>& key,
+ const vector<WordID>& val,
+ const int count_type,
+ const vector<pair<short,short> >& aligns) {
+ if (combiner_size > 0) {
+ RuleStatistics& v = cache[key][val];
+ float newcount = v.counts.add_value(count_type, 1.0f);
+ // hack for adding alignments: keep the largest alignment set seen among a pair's first few counts
+ if (newcount < 7.0f && aligns.size() > v.aligns.size())
+ v.aligns = aligns;
+ if (cache.size() > combiner_size) WriteAndClearCache();
+ } else {
+ cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| ";
+ cout << RuleStatistics(count_type, 1.0f, aligns) << endl;
+ }
+ }
+
+ private:
+ void WriteAndClearCache() {
+ for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin();
+ it != cache.end(); ++it) {
+ cout << TD::GetString(it->first) << '\t';
+ const Vec2PhraseCount& vals = it->second;
+ bool needdiv = false;
+ for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) {
+ if (needdiv) cout << " ||| "; else needdiv = true;
+ cout << TD::GetString(vi->first) << " ||| " << vi->second;
+ }
+ cout << endl;
+ }
+ cache.clear();
+ }
+
+ const size_t combiner_size;
+ typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > Vec2PhraseCount;
+ unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache;
+};
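+
+// when the cache is flushed, each key becomes one "stripe" line; with --bidir
+// an F-major stripe looks like (illustrative, <TAB> marks the separator):
+// F [X] ||| le chat<TAB>the cat ||| CFE=3 A=0-0,1-1 ||| a cat ||| CFE=1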
+
+// TODO optional source context
+// output <k, v> : k = phrase "document" v = context "term"
+void WritePhraseContexts(const AnnotatedParallelSentence& sentence,
+ const vector<ParallelSpan>& phrases,
+ const int ctx_size,
+ CountCombiner* o) {
+ vector<WordID> context(ctx_size * 2 + 1);
+ context[ctx_size] = kGAP;
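+ // layout: ctx_size left-context words, the <PHRASE> gap symbol in the
+ // middle, then ctx_size right-context words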
+ vector<WordID> key;
+ key.reserve(100);
+ for (int it = 0; it < phrases.size(); ++it) {
+ const ParallelSpan& phrase = phrases[it];
+
+ // TODO, support src keys as well
+ key.resize(phrase.j2 - phrase.j1);
+ for (int j = phrase.j1; j < phrase.j2; ++j)
+ key[j - phrase.j1] = sentence.e[j];
+
+ for (int i = 0; i < ctx_size; ++i) {
+ int epos = phrase.j1 - 1 - i;
+ const WordID left_ctx = (epos < 0) ? kBOS : sentence.e[epos];
+ context[ctx_size - i - 1] = left_ctx;
+ epos = phrase.j2 + i;
+ const WordID right_ctx = (epos >= sentence.e_len) ? kEOS : sentence.e[epos];
+ context[ctx_size + i + 1] = right_ctx;
+ }
+ o->Count(key, context, kCOUNT, vector<pair<short,short> >());
+ }
+}
+
+struct SimpleRuleWriter : public Extract::RuleObserver {
+ protected:
+ virtual void CountRuleImpl(WordID lhs,
+ const vector<WordID>& rhs_f,
+ const vector<WordID>& rhs_e,
+ const vector<pair<short,short> >& fe_terminal_alignments) {
+ cout << "[" << TD::Convert(-lhs) << "] |||";
+ for (int i = 0; i < rhs_f.size(); ++i) {
+ if (rhs_f[i] < 0) cout << " [" << TD::Convert(-rhs_f[i]) << ']';
+ else cout << ' ' << TD::Convert(rhs_f[i]);
+ }
+ cout << " |||";
+ for (int i = 0; i < rhs_e.size(); ++i) {
+ if (rhs_e[i] <= 0) cout << " [" << (1-rhs_e[i]) << ']';
+ else cout << ' ' << TD::Convert(rhs_e[i]);
+ }
+ cout << " |||";
+ for (int i = 0; i < fe_terminal_alignments.size(); ++i) {
+ cout << ' ' << fe_terminal_alignments[i].first << '-' << fe_terminal_alignments[i].second;
+ }
+ cout << endl;
+ }
+};
+
+struct HadoopStreamingRuleObserver : public Extract::RuleObserver {
+ HadoopStreamingRuleObserver(CountCombiner* cc, bool bidir_flag) :
+ bidir(bidir_flag),
+ kF(TD::Convert("F")),
+ kE(TD::Convert("E")),
+ kDIVIDER(TD::Convert("|||")),
+ kLB("["), kRB("]"),
+ combiner(*cc),
+ kEMPTY(),
+ kCFE(FD::Convert("CFE")) {
+ for (int i=1; i < 50; ++i)
+ index2sym[1-i] = TD::Convert(kLB + boost::lexical_cast<string>(i) + kRB);
+ fmajor_key.resize(10, kF);
+ emajor_key.resize(10, kE);
+ if (bidir)
+ fmajor_key[2] = emajor_key[2] = kDIVIDER;
+ else
+ fmajor_key[1] = kDIVIDER;
+ }
+
+ protected:
+ virtual void CountRuleImpl(WordID lhs,
+ const vector<WordID>& rhs_f,
+ const vector<WordID>& rhs_e,
+ const vector<pair<short,short> >& fe_terminal_alignments) {
+ if (bidir) { // extract rules in "both directions" E->F and F->E
+ fmajor_key.resize(3 + rhs_f.size());
+ emajor_key.resize(3 + rhs_e.size());
+ fmajor_val.resize(rhs_e.size());
+ emajor_val.resize(rhs_f.size());
+ emajor_key[1] = fmajor_key[1] = MapSym(lhs);
+ int nt = 1;
+ for (int i = 0; i < rhs_f.size(); ++i) {
+ const WordID id = rhs_f[i];
+ if (id < 0) {
+ fmajor_key[3 + i] = MapSym(id, nt);
+ emajor_val[i] = MapSym(id, nt);
+ ++nt;
+ } else {
+ fmajor_key[3 + i] = id;
+ emajor_val[i] = id;
+ }
+ }
+ for (int i = 0; i < rhs_e.size(); ++i) {
+ WordID id = rhs_e[i];
+ if (id <= 0) {
+ fmajor_val[i] = index2sym[id];
+ emajor_key[3 + i] = index2sym[id];
+ } else {
+ fmajor_val[i] = id;
+ emajor_key[3 + i] = id;
+ }
+ }
+ combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments);
+ combiner.Count(emajor_key, emajor_val, kCFE, kEMPTY);
+ } else { // extract rules only in F->E
+ fmajor_key.resize(2 + rhs_f.size());
+ fmajor_val.resize(rhs_e.size());
+ fmajor_key[0] = MapSym(lhs);
+ int nt = 1;
+ for (int i = 0; i < rhs_f.size(); ++i) {
+ const WordID id = rhs_f[i];
+ if (id < 0)
+ fmajor_key[2 + i] = MapSym(id, nt++);
+ else
+ fmajor_key[2 + i] = id;
+ }
+ for (int i = 0; i < rhs_e.size(); ++i) {
+ const WordID id = rhs_e[i];
+ if (id <= 0)
+ fmajor_val[i] = index2sym[id];
+ else
+ fmajor_val[i] = id;
+ }
+ combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments);
+ }
+ }
+
+ private:
+ WordID MapSym(WordID sym, int ind = 0) {
+ WordID& r = cat2ind2sym[sym][ind];
+ if (!r) {
+ if (ind == 0)
+ r = TD::Convert(kLB + TD::Convert(-sym) + kRB);
+ else
+ r = TD::Convert(kLB + TD::Convert(-sym) + "," + boost::lexical_cast<string>(ind) + kRB);
+ }
+ return r;
+ }
+
+ const bool bidir;
+ const WordID kF, kE, kDIVIDER;
+ const string kLB, kRB;
+ CountCombiner& combiner;
+ const vector<pair<short,short> > kEMPTY;
+ const int kCFE;
+ map<WordID, map<int, WordID> > cat2ind2sym;
+ map<int, WordID> index2sym;
+ vector<WordID> emajor_key, emajor_val, fmajor_key, fmajor_val;
+};
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ kBOS = TD::Convert("<s>");
+ kEOS = TD::Convert("</s>");
+ kDIVIDER = TD::Convert("|||");
+ kGAP = TD::Convert("<PHRASE>");
+ kCOUNT = FD::Convert("C");
+
+ WordID default_cat = 0; // 0 means no default; extraction will
+ // abort if a phrase is extracted without a
+ // category
+ if (conf.count("default_category")) {
+ string sdefault_cat = conf["default_category"].as<string>();
+ default_cat = -TD::Convert(sdefault_cat);
+ cerr << "Default category: " << sdefault_cat << endl;
+ } else {
+ cerr << "No default category (use --default_category if you want to set one)\n";
+ }
+ ReadFile rf(conf["input"].as<string>());
+ istream& in = *rf.stream();
+
+ char buf[MAX_LINE_LENGTH];
+ AnnotatedParallelSentence sentence;
+ vector<ParallelSpan> phrases;
+ const int max_base_phrase_size = conf["max_base_phrase_size"].as<int>();
+ const bool write_phrase_contexts = conf.count("phrase_context") > 0;
+ const bool write_base_phrases = conf.count("base_phrase") > 0;
+ const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0;
+ const bool loose_phrases = conf.count("loose") > 0;
+ const bool silent = conf.count("silent") > 0;
+ const int max_syms = conf["max_syms"].as<int>();
+ const int max_vars = conf["max_vars"].as<int>();
+ const int ctx_size = conf["phrase_context_size"].as<int>();
+ const bool permit_adjacent_nonterminals = conf.count("permit_adjacent_nonterminals") > 0;
+ const bool require_aligned_terminal = conf.count("no_required_aligned_terminal") == 0;
+ int line = 0;
+ CountCombiner cc(conf["combiner_size"].as<size_t>());
+ HadoopStreamingRuleObserver o(&cc,
+ conf.count("bidir") > 0);
+ //SimpleRuleWriter o;
+ while(in) {
+ ++line;
+ in.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+ if (!silent) {
+ if (line % 200 == 0) cerr << '.';
+ if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush;
+ }
+ sentence.ParseInputLine(buf);
+ phrases.clear();
+ Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases);
+ if (loose_phrases)
+ Extract::LoosenPhraseBounds(sentence, max_base_phrase_size, &phrases);
+ if (phrases.empty()) {
+ cerr << "WARNING no phrases extracted line: " << line << endl;
+ continue;
+ }
+ if (write_phrase_contexts) {
+ WritePhraseContexts(sentence, phrases, ctx_size, &cc);
+ continue;
+ }
+ if (write_base_phrases) {
+ WriteBasePhrases(sentence, phrases);
+ continue;
+ }
+ if (write_base_phrase_spans) {
+ WriteBasePhraseSpans(sentence, phrases);
+ continue;
+ }
+ Extract::AnnotatePhrasesWithCategoryTypes(default_cat, sentence.span_types, &phrases);
+ Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o);
+ }
+ if (!silent) cerr << endl;
+ return 0;
+}
+
diff --git a/extools/filter_grammar.cc b/extools/filter_grammar.cc
new file mode 100644
index 00000000..290a77f5
--- /dev/null
+++ b/extools/filter_grammar.cc
@@ -0,0 +1,202 @@
+/*
+ * Build suffix tree representation of a data set for grammar filtering
+ * ./filter_grammar <test set> < unfiltered.grammar > filtered.grammar
+ *
+ */
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <fstream>
+#include <tr1/unordered_map>
+
+#include "filelib.h"
+#include "sentence_pair.h"
+#include "suffix_tree.h"
+#include "extract.h"
+#include "fdict.h"
+#include "tdict.h"
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+
+using namespace std;
+using namespace std::tr1;
+
+static const size_t MAX_LINE_LENGTH = 64000000;
+
+typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics;
+
+
+namespace {
+ inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+ inline bool IsBracket(char c){return c == '[' || c == ']';}
+ inline void SkipWhitespace(const char* buf, int* ptr) {
+ while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+ }
+}
+
+
+
+int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) {
+ static const WordID kDIV = TD::Convert("|||");
+
+ int ptr = sstart;
+ while(ptr < end) {
+ while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; }
+ int start = ptr;
+ while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
+ if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; }
+ //before integerizing a token to a WordID, check whether it is a
+ //nonterminal marker (anything bracketed as [...]) or a ||| divider;
+ //both are stored as -1 so the filter later treats them as reset points
+
+ const WordID w = TD::Convert(string(buf, start, ptr - start));
+
+ if ((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or (w == kDIV))
+ p->push_back(-1);
+ else
+ p->push_back(w);
+ }
+ return ptr;
+}
+
+
+
+void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
+ static const WordID kDIV = TD::Convert("|||");
+ counts->clear();
+ int ptr = 0;
+ while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; }
+ if (buf[ptr] != '\t') {
+ cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl;
+ exit(1);
+ }
+ cur_key->clear();
+ // key is: "[X] ||| word word word"
+ int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
+ cur_key->push_back(kDIV);
+ ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+ ++ptr;
+ int start = ptr;
+ int end = ptr;
+ int state = 0; // 0=reading label, 1=reading count
+ vector<WordID> name;
+ while(buf[ptr] != 0) {
+ while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; }
+ if (buf[ptr] == '|') {
+ ++ptr;
+ if (buf[ptr] == '|') {
+ ++ptr;
+ if (buf[ptr] == '|') {
+ ++ptr;
+ end = ptr - 3;
+ while (end > start && IsWhitespace(buf[end-1])) { --end; }
+ if (start == end) {
+ cerr << "Got empty token!\n LINE=" << buf << endl;
+ exit(1);
+ }
+ switch (state) {
+ case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
+ case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ SkipWhitespace(buf, &ptr);
+ start = ptr;
+ }
+ }
+ }
+ }
+ end=ptr;
+ while (end > start && IsWhitespace(buf[end-1])) { --end; }
+ if (end > start) {
+ switch (state) {
+ case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
+ case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ }
+}
+
+
+
+
+
+
+
+int main(int argc, char* argv[]){
+ if (argc != 2) {
+ cerr << "Usage: " << argv[0] << " testset.txt < unfiltered.grammar\n";
+ return 1;
+ }
+
+ assert(FileExists(argv[1]));
+ ReadFile rfts(argv[1]);
+ istream& testSet = *rfts.stream();
+ ofstream filter_grammar_;
+ bool DEBUG = false;
+
+
+ AnnotatedParallelSentence sent;
+ char* buf = new char[MAX_LINE_LENGTH];
+ cerr << "Build suffix tree from test set in " << argv[1] << endl;
+ //root of the suffix tree
+ Node<int> root;
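+ // Node (from suffix_tree.h) is assumed to provide InsertPath(seq, start,
+ // end) for adding a suffix and Extend(word) for following a child edge,
+ // returning NULL when no such edge exists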
+ int line=0;
+
+ // process the test set to build the suffix tree
+ while(!testSet.eof()) {
+ ++line;
+ testSet.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+
+ //hack to read in the test set using the AnnotatedParallelSentence parser
+ strcat(buf," ||| fake ||| 0-0");
+ sent.ParseInputLine(buf);
+
+ if (DEBUG)cerr << line << "||| " << buf << " -- " << sent.f_len << endl;
+
+ //add each successive suffix to the tree
+ for(int i =0;i<sent.f_len;i++)
+ root.InsertPath(sent.f, i, sent.f_len - 1);
+ if(DEBUG)cerr<<endl;
+
+ }
+
+ cerr << "Filtering grammar..." << endl;
+ //process the unfiltered, unscored grammar
+
+ ID2RuleStatistics cur_counts;
+ vector<WordID> cur_key;
+ line = 0;
+
+ while(cin)
+ {
+ ++line;
+ cin.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+ ParseLine(buf, &cur_key, &cur_counts);
+ const Node<int>* curnode = &root;
+ for(int i=0;i<cur_key.size() - 1; i++)
+ {
+ if (DEBUG) cerr << line << " " << cur_key[i] << " ::: ";
+ if(cur_key[i] == -1)
+ {
+ curnode = &root;
+ } else if (curnode) {
+ curnode = curnode->Extend(cur_key[i]); // continue the walk from the current node, not the root
+ if (!curnode) break;
+ }
+ }
+
+ if(curnode)
+ cout << buf << endl;
+ }
+
+ return 0;
+}
diff --git a/extools/lex_trans_tbl.h b/extools/lex_trans_tbl.h
new file mode 100644
index 00000000..81d6ccc9
--- /dev/null
+++ b/extools/lex_trans_tbl.h
@@ -0,0 +1,24 @@
+/*
+ * lex_trans_tbl.h
+ *
+ * Created on: May 25, 2010
+ * Author: Vlad
+ */
+
+#ifndef LEX_TRANS_TBL_H_
+#define LEX_TRANS_TBL_H_
+
+#include <map>
+
+#include "wordid.h" // WordID is used in the maps below
+
+class LexTranslationTable
+{
+ public:
+
+ std::map < std::pair<WordID,WordID>,int > word_translation;
+ std::map <WordID, int> total_foreign;
+ std::map <WordID, int> total_english;
+ void createTTable(const char* buf);
+
+};
+
+#endif /* LEX_TRANS_TBL_H_ */
diff --git a/extools/merge_lines.pl b/extools/merge_lines.pl
new file mode 100755
index 00000000..8711e4ce
--- /dev/null
+++ b/extools/merge_lines.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+use strict;
+
+if (scalar @ARGV < 2) {
+ die "Usage: $0 file1.txt file2.txt ...\n\n Concatenate the nth line of each input file. All files\n must be the same length.\n\n";
+}
+
+my @fhs=();
+for my $file (@ARGV) {
+ my $fh;
+ open $fh, "<$file" or die "Can't read $file: $!\n";
+ push @fhs, $fh;
+}
+
+my $first = shift @fhs;
+
+while(defined(my $x = <$first>)) {
+ my $ind = 0;
+ chomp $x;
+ my @fields = ($x);
+ for my $fh (@fhs) {
+ $ind++;
+ $x = <$fh>;
+ die "ERROR: Mismatched number of lines: $ARGV[$ind]\n" unless $x;
+ chomp $x;
+ push @fields, $x;
+ }
+ print join ' ||| ', @fields;
+ print "\n";
+}
+my $ind = 0;
+for my $fh (@fhs) {
+ $ind++;
+ my $x=<$fh>;
+ die "ERROR: $ARGV[$ind] has extra lines!\n" if $x;
+}
+
+for my $fh (@fhs) {
+ close $fh;
+}
+
+exit 0;
+
diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc
new file mode 100644
index 00000000..eaf1b6d7
--- /dev/null
+++ b/extools/mr_stripe_rule_reduce.cc
@@ -0,0 +1,242 @@
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <cassert>
+#include <tr1/unordered_map>
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "tdict.h"
+#include "sentence_pair.h"
+#include "fdict.h"
+#include "extract.h"
+
+using namespace std;
+using namespace std::tr1;
+namespace po = boost::program_options;
+
+static const size_t MAX_LINE_LENGTH = 64000000;
+
+bool use_hadoop_counters = false;
+
+namespace {
+ inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+ inline void SkipWhitespace(const char* buf, int* ptr) {
+ while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+ }
+}
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("phrase_marginals,p", "Compute phrase marginals")
+ ("use_hadoop_counters,C", "Enable this if running inside Hadoop")
+ ("bidir,b", "Rules are tagged as being F->E or E->F, invert E rules in output")
+ ("help,h", "Print this help message and exit");
+ po::options_description clo("Command line options");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ po::notify(*conf);
+
+ if (conf->count("help")) {
+ cerr << "\nUsage: mr_stripe_rule_reduce [-options]\n";
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics;
+
+void PlusEquals(const ID2RuleStatistics& v, ID2RuleStatistics* self) {
+ for (ID2RuleStatistics::const_iterator it = v.begin(); it != v.end(); ++it) {
+ RuleStatistics& dest = (*self)[it->first];
+ dest += it->second;
+ // TODO - do something smarter about alignments?
+ if (dest.aligns.empty() && !it->second.aligns.empty())
+ dest.aligns = it->second.aligns;
+ }
+}
+
+int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) {
+ static const WordID kDIV = TD::Convert("|||");
+ int ptr = sstart;
+ while(ptr < end) {
+ while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; }
+ int start = ptr;
+ while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
+ if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; }
+ const WordID w = TD::Convert(string(buf, start, ptr - start));
+ if (w == kDIV) return ptr;
+ p->push_back(w);
+ }
+ return ptr;
+}
+
+void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
+ static const WordID kDIV = TD::Convert("|||");
+ counts->clear();
+ int ptr = 0;
+ while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; }
+ if (buf[ptr] != '\t') {
+ cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl;
+ exit(1);
+ }
+ cur_key->clear();
+ // key is: "[X] ||| word word word"
+ int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
+ if (buf[tmpp] != '\t') {
+ cur_key->push_back(kDIV);
+ ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+ }
+ ++ptr;
+ int start = ptr;
+ int end = ptr;
+ int state = 0; // 0=reading label, 1=reading count
+ vector<WordID> name;
+ while(buf[ptr] != 0) {
+ while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; }
+ if (buf[ptr] == '|') {
+ ++ptr;
+ if (buf[ptr] == '|') {
+ ++ptr;
+ if (buf[ptr] == '|') {
+ ++ptr;
+ end = ptr - 3;
+ while (end > start && IsWhitespace(buf[end-1])) { --end; }
+ if (start == end) {
+ cerr << "Got empty token!\n LINE=" << buf << endl;
+ exit(1);
+ }
+ switch (state) {
+ case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
+ case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ SkipWhitespace(buf, &ptr);
+ start = ptr;
+ }
+ }
+ }
+ }
+ end=ptr;
+ while (end > start && IsWhitespace(buf[end-1])) { --end; }
+ if (end > start) {
+ switch (state) {
+ case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
+ case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ }
+}
+
+void WriteKeyValue(const vector<WordID>& key, const ID2RuleStatistics& val) {
+ cout << TD::GetString(key) << '\t';
+ bool needdiv = false;
+ for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) {
+ if (needdiv) cout << " ||| "; else needdiv = true;
+ cout << TD::GetString(it->first) << " ||| " << it->second;
+ }
+ cout << endl;
+ if (use_hadoop_counters) cerr << "reporter:counter:UserCounters,RuleCount," << val.size() << endl;
+}
+
+void DoPhraseMarginals(const vector<WordID>& key, const bool bidir, ID2RuleStatistics* val) {
+ static const WordID kF = TD::Convert("F");
+ static const WordID kE = TD::Convert("E");
+ static const int kCF = FD::Convert("CF");
+ static const int kCE = FD::Convert("CE");
+ static const int kCFE = FD::Convert("CFE");
+ assert(key.size() > 0);
+ int cur_marginal_id = kCF;
+ if (bidir) {
+ if (key[0] != kF && key[0] != kE) {
+ cerr << "DoPhraseMarginals expects keys to have the from 'F|E [NT] word word word'\n";
+ cerr << " but got: " << TD::GetString(key) << endl;
+ exit(1);
+ }
+ if (key[0] == kE) cur_marginal_id = kCE;
+ }
+ double tot = 0;
+ for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it)
+ tot += it->second.counts.value(kCFE);
+ for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) {
+ it->second.counts.set_value(cur_marginal_id, tot);
+
+ // prevent double counting of the joint
+ if (cur_marginal_id == kCE) it->second.counts.clear_value(kCFE);
+ }
+}
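+
+// e.g. a stripe whose targets carry CFE counts {2,3} gets CF=5 on every
+// entry (or CE=5 for an E-major key, whose CFE is then cleared so the joint
+// count is not doubled when both directions are merged)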
+
+void WriteWithInversions(const vector<WordID>& key, const ID2RuleStatistics& val) {
+ static const WordID kE = TD::Convert("E");
+ static const WordID kDIV = TD::Convert("|||");
+ vector<WordID> new_key(key.size() - 1);
+ for (int i = 1; i < key.size(); ++i)
+ new_key[i - 1] = key[i];
+ const bool do_invert = (key[0] == kE);
+ if (!do_invert) {
+ WriteKeyValue(new_key, val);
+ } else {
+ ID2RuleStatistics inv;
+ assert(new_key.size() > 2);
+ vector<WordID> tk(new_key.size() - 2);
+ for (int i = 0; i < tk.size(); ++i)
+ tk[i] = new_key[2 + i];
+ RuleStatistics& inv_stats = inv[tk];
+ for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) {
+ inv_stats.counts = it->second.counts;
+ vector<WordID> ekey(2 + it->first.size());
+ ekey[0] = key[1];
+ ekey[1] = kDIV;
+ for (int i = 0; i < it->first.size(); ++i)
+ ekey[2+i] = it->first[i];
+ WriteKeyValue(ekey, inv);
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+
+ char* buf = new char[MAX_LINE_LENGTH];
+ ID2RuleStatistics acc, cur_counts;
+ vector<WordID> key, cur_key;
+ int line = 0;
+ use_hadoop_counters = conf.count("use_hadoop_counters") > 0;
+ const bool phrase_marginals = conf.count("phrase_marginals") > 0;
+ const bool bidir = conf.count("bidir") > 0;
+ while(cin) {
+ ++line;
+ cin.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+ ParseLine(buf, &cur_key, &cur_counts);
+ if (cur_key != key) {
+ if (key.size() > 0) {
+ if (phrase_marginals)
+ DoPhraseMarginals(key, bidir, &acc);
+ if (bidir)
+ WriteWithInversions(key, acc);
+ else
+ WriteKeyValue(key, acc);
+ acc.clear();
+ }
+ key = cur_key;
+ }
+ PlusEquals(cur_counts, &acc);
+ }
+ if (key.size() > 0) {
+ if (phrase_marginals)
+ DoPhraseMarginals(key, bidir, &acc);
+ if (bidir)
+ WriteWithInversions(key, acc);
+ else
+ WriteKeyValue(key, acc);
+ }
+ return 0;
+}
+
diff --git a/extools/score_grammar.cc b/extools/score_grammar.cc
new file mode 100644
index 00000000..534b6295
--- /dev/null
+++ b/extools/score_grammar.cc
@@ -0,0 +1,346 @@
+/*
+ * Score a grammar in striped format
+ * ./score_grammar <alignment> < filtered.grammar > scored.grammar
+ */
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <cmath>
+#include <cstdio>
+#include <fstream>
+#include <tr1/unordered_map>
+
+#include "sentence_pair.h"
+#include "extract.h"
+#include "fdict.h"
+#include "tdict.h"
+#include "lex_trans_tbl.h"
+#include "filelib.h"
+
+#include <boost/functional/hash.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+using namespace std;
+using namespace std::tr1;
+
+
+static const size_t MAX_LINE_LENGTH = 64000000;
+
+typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics;
+
+
+namespace {
+ inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+ inline bool IsBracket(char c){return c == '[' || c == ']';}
+ inline void SkipWhitespace(const char* buf, int* ptr) {
+ while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+ }
+}
+
+int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) {
+ static const WordID kDIV = TD::Convert("|||");
+ int ptr = sstart;
+ while(ptr < end) {
+ while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; }
+ int start = ptr;
+ while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
+ if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; }
+ const WordID w = TD::Convert(string(buf, start, ptr - start));
+
+ // nonterminal markers ([...]) and ||| dividers are kept verbatim in the
+ // phrase; the key's trailing divider then doubles as a field separator
+ // when the scored rule is written out
+ p->push_back(w);
+ }
+ return ptr;
+}
+
+
+void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
+ static const WordID kDIV = TD::Convert("|||");
+ counts->clear();
+ int ptr = 0;
+ while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; }
+ if (buf[ptr] != '\t') {
+ cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl;
+ exit(1);
+ }
+ cur_key->clear();
+ // key is: "[X] ||| word word word"
+ int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
+ cur_key->push_back(kDIV);
+ ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+ ++ptr;
+ int start = ptr;
+ int end = ptr;
+ int state = 0; // 0=reading label, 1=reading count
+ vector<WordID> name;
+ while(buf[ptr] != 0) {
+ while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; }
+ if (buf[ptr] == '|') {
+ ++ptr;
+ if (buf[ptr] == '|') {
+ ++ptr;
+ if (buf[ptr] == '|') {
+ ++ptr;
+ end = ptr - 3;
+ while (end > start && IsWhitespace(buf[end-1])) { --end; }
+ if (start == end) {
+ cerr << "Got empty token!\n LINE=" << buf << endl;
+ exit(1);
+ }
+ switch (state) {
+ case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
+ case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ SkipWhitespace(buf, &ptr);
+ start = ptr;
+ }
+ }
+ }
+ }
+ end=ptr;
+ while (end > start && IsWhitespace(buf[end-1])) { --end; }
+ if (end > start) {
+ switch (state) {
+ case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
+ case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ }
+}
+
+
+
+void LexTranslationTable::createTTable(const char* buf){
+
+ bool DEBUG = false;
+
+ AnnotatedParallelSentence sent;
+
+ sent.ParseInputLine(buf);
+
+ //iterate over the alignment to compute aligned words
+
+ for(int i =0;i<sent.aligned.width();i++)
+ {
+ for (int j=0;j<sent.aligned.height();j++)
+ {
+ if (DEBUG) cerr << sent.aligned(i,j) << " ";
+ if( sent.aligned(i,j))
+ {
+ if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
+ ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])];
+ ++total_foreign[sent.f[i]];
+ ++total_english[sent.e[j]];
+ }
+ }
+ if (DEBUG) cerr << endl;
+ }
+ if (DEBUG) cerr << endl;
+
+ static const WordID NULL_ = TD::Convert("NULL");
+ //handle unaligned words - align them to null
+ for (int j =0; j < sent.e_len; j++)
+ {
+ if (sent.e_aligned[j]) continue;
+ ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])];
+ ++total_foreign[NULL_];
+ ++total_english[sent.e[j]];
+ }
+
+ for (int i =0; i < sent.f_len; i++)
+ {
+ if (sent.f_aligned[i]) continue;
+ ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)];
+ ++total_english[NULL_];
+ ++total_foreign[sent.f[i]];
+ }
+
+}
+
+
+inline float safenlog(float v) {
+ if (v == 1.0f) return 0.0f;
+ float res = -log(v);
+ if (res > 100.0f) res = 100.0f;
+ return res;
+}
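+// e.g. safenlog(0.5f) returns about 0.693; v == 0 gives inf, which is
+// clamped to 100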
+
+int main(int argc, char** argv){
+ bool DEBUG= false;
+ if (argc != 2) {
+ cerr << "Usage: " << argv[0] << " corpus.al < filtered.grammar\n";
+ return 1;
+ }
+ ifstream alignment (argv[1]);
+ istream& unscored_grammar = cin;
+ ostream& scored_grammar = cout;
+
+ //create lexical translation table
+ cerr << "Creating table..." << endl;
+ char* buf = new char[MAX_LINE_LENGTH];
+
+ LexTranslationTable table;
+
+ while(!alignment.eof())
+ {
+ alignment.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+
+ table.createTTable(buf);
+ }
+
+ bool PRINT_TABLE=false;
+ if (PRINT_TABLE)
+ {
+ ofstream trans_table;
+ trans_table.open("lex_trans_table.out");
+ for(map < pair<WordID,WordID>,int >::iterator it = table.word_translation.begin(); it != table.word_translation.end(); ++it)
+ {
+ trans_table << TD::Convert(it->first.first) << "|||" << TD::Convert(it->first.second) << "==" << it->second << "//" << table.total_foreign[it->first.first] << "//" << table.total_english[it->first.second] << endl;
+ }
+
+ trans_table.close();
+ }
+
+
+ //score unscored grammar
+ cerr <<"Scoring grammar..." << endl;
+
+ ID2RuleStatistics acc, cur_counts;
+ vector<WordID> key, cur_key,temp_key;
+ vector< pair<short,short> > al;
+ vector< pair<short,short> >::iterator ita;
+ int line = 0;
+
+ static const int kCF = FD::Convert("CF");
+ static const int kCE = FD::Convert("CE");
+ static const int kCFE = FD::Convert("CFE");
+
+ while(!unscored_grammar.eof())
+ {
+ ++line;
+ unscored_grammar.getline(buf, MAX_LINE_LENGTH);
+ if (buf[0] == 0) continue;
+ ParseLine(buf, &cur_key, &cur_counts);
+
+ //loop over all the Target side phrases that this source aligns to
+ for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it)
+ {
+
+ /*Compute phrase translation prob.
+ Print out scores in this format:
+ Phrase translation prob P(F|E)
+ Phrase translation prob P(E|F)
+ Lexical weighting prob lex(F|E)
+ Lexical weighting prob lex(E|F)
+ */
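+ // each pass through this loop emits one scored grammar line, e.g.
+ // (illustrative): [X] ||| le chat ||| the cat ||| 0.51 0.29 0.36 0.22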
+
+ float pEF_ = it->second.counts.value(kCFE) / it->second.counts.value(kCF);
+ float pFE_ = it->second.counts.value(kCFE) / it->second.counts.value(kCE);
+
+ map <WordID, pair<int, float> > foreign_aligned;
+ map <WordID, pair<int, float> > english_aligned;
+
+ //Loop over all the alignment points to compute lexical translation probability
+ al = it->second.aligns;
+ for(ita = al.begin(); ita != al.end(); ++ita)
+ {
+
+ if (DEBUG)
+ {
+ cerr << "\nA:" << ita->first << "," << ita->second << "::";
+ cerr << TD::Convert(cur_key[ita->first + 2]) << "-" << TD::Convert(it->first[ita->second]);
+ }
+
+
+ //Lookup this alignment probability in the table
+ int temp = table.word_translation[pair<WordID,WordID> (cur_key[ita->first+2],it->first[ita->second])];
+ float f2e=0, e2f=0;
+ if ( table.total_foreign[cur_key[ita->first+2]] != 0)
+ f2e = (float) temp / table.total_foreign[cur_key[ita->first+2]];
+ if ( table.total_english[it->first[ita->second]] !=0 )
+ e2f = (float) temp / table.total_english[it->first[ita->second]];
+ if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f);
+
+
+ //local counts to keep track of which things haven't been aligned, to later compute their null alignment
+ if (foreign_aligned.count(cur_key[ita->first+2]))
+ {
+ foreign_aligned[ cur_key[ita->first+2] ].first++;
+ foreign_aligned[ cur_key[ita->first+2] ].second += e2f;
+ }
+ else
+ foreign_aligned [ cur_key[ita->first+2] ] = pair<int,float> (1,e2f);
+
+ if (english_aligned.count( it->first[ ita->second] ))
+ {
+ english_aligned[ it->first[ ita->second ]].first++;
+ english_aligned[ it->first[ ita->second] ].second += f2e;
+ }
+ else
+ english_aligned [ it->first[ ita->second] ] = pair<int,float> (1,f2e);
+
+ }
+
+ float final_lex_f2e=1, final_lex_e2f=1;
+ static const WordID NULL_ = TD::Convert("NULL");
+
+ //compute lexical weight P(F|E) and include unaligned foreign words
+	  for (unsigned i = 0; i < cur_key.size(); ++i)
+	    {
+	      if (!table.total_foreign.count(cur_key[i])) continue; //if we don't have it in the translation table, we won't know its lexical weight
+
+ if (foreign_aligned.count(cur_key[i]))
+ {
+ pair<int, float> temp_lex_prob = foreign_aligned[cur_key[i]];
+ final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first;
+ }
+ else //dealing with null alignment
+ {
+ int temp_count = table.word_translation[pair<WordID,WordID> (cur_key[i],NULL_)];
+ float temp_e2f = (float) temp_count / table.total_english[NULL_];
+ final_lex_e2f *= temp_e2f;
+ }
+
+ }
+
+ //compute P(E|F) unaligned english words
+	  for (unsigned j = 0; j < it->first.size(); ++j)
+ {
+ if (!table.total_english.count(it->first[j])) continue;
+
+ if (english_aligned.count(it->first[j]))
+ {
+ pair<int, float> temp_lex_prob = english_aligned[it->first[j]];
+ final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first;
+ }
+ else //dealing with null
+ {
+ int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,it->first[j])];
+ float temp_f2e = (float) temp_count / table.total_foreign[NULL_];
+ final_lex_f2e *= temp_f2e;
+ }
+ }
+
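+	  // emit the rule with its four scores: -log P(F|E), -log P(E|F), -log lex(F|E), -log lex(E|F)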
+ scored_grammar << TD::GetString(cur_key);
+ scored_grammar << " " << TD::GetString(it->first) << " |||";
+ scored_grammar << " " << safenlog(pFE_) << " " << safenlog(pEF_);
+ scored_grammar << " " << safenlog(final_lex_e2f) << " " << safenlog(final_lex_f2e) << endl;
+ }
+ }
+}
+
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
new file mode 100644
index 00000000..91286059
--- /dev/null
+++ b/extools/sentence_pair.cc
@@ -0,0 +1,155 @@
+#include "sentence_pair.h"
+
+#include <queue>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <utility>
+#include <set>
+
+#include "tdict.h"
+#include "wordid.h"
+#include "array2d.h"
+
+using namespace std;
+
+namespace {
+ inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+ inline void SkipWhitespace(const char* buf, int* ptr) {
+ while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+ }
+}
+
+void AnnotatedParallelSentence::Reset() {
+ f.clear();
+ e.clear();
+ e_aligned.clear();
+ f_aligned.clear();
+ aligns_by_fword.clear();
+ aligned.clear();
+ span_types.clear();
+}
+
+void AnnotatedParallelSentence::AllocateForAlignment() {
+ f_len = f.size();
+ e_len = e.size();
+ aligned.resize(f_len, e_len, false);
+ f_aligned.resize(f_len, 0);
+ e_aligned.resize(e_len, 0);
+ aligns_by_fword.resize(f_len);
+ span_types.resize(e_len, e_len+1);
+}
+
+// read an alignment point of the form X-Y where X and Y are strings
+// of digits. if permit_col is true, the right edge will be determined
+// by the presence of a colon
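+// returns the index of the first character past the second number
+// (the ':' when permit_col is true)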
+int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
+ const int start,
+ const int end,
+ const bool permit_col,
+ short* a,
+ short* b) {
+ if (end - start < 3) {
+ cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl; abort();
+ }
+ int c = start;
+ *a = 0;
+ while(c < end && buf[c] != '-') {
+ if (buf[c] < '0' || buf[c] > '9') {
+ cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+ abort();
+ }
+ (*a) *= 10;
+ (*a) += buf[c] - '0';
+ ++c;
+ }
+ ++c;
+ if (c >= end) {
+ cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl; abort();
+ }
+ (*b) = 0;
+  while(c < end && (!permit_col || buf[c] != ':')) {
+ if (buf[c] < '0' || buf[c] > '9') {
+ cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+ abort();
+ }
+ (*b) *= 10;
+ (*b) += buf[c] - '0';
+ ++c;
+ }
+ return c;
+}
+
+void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
+ short a, b;
+ ReadAlignmentPoint(buf, start, end, false, &a, &b);
+ assert(a < f_len);
+ assert(b < e_len);
+ aligned(a,b) = true;
+ ++f_aligned[a];
+ ++e_aligned[b];
+ aligns_by_fword[a].push_back(make_pair(a,b));
+ // cerr << a << " " << b << endl;
+}
+
+void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
+ short a,b;
+ int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1;
+ if (buf[c-1] != ':' || c >= end) {
+ cerr << "Span badly formed: " << string(buf, start, end-start) << endl; abort();
+ }
+ // cerr << a << " " << b << " " << string(buf,c,end-c) << endl;
+ span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c)));
+}
+
+// INPUT FORMAT
+// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP
+void AnnotatedParallelSentence::ParseInputLine(const char* buf) {
+ Reset();
+ int ptr = 0;
+ SkipWhitespace(buf, &ptr);
+ int start = ptr;
+ int state = 0; // 0 = French, 1 = English, 2 = Alignment, 3 = Spans
+ while(char c = buf[ptr]) {
+ if (!IsWhitespace(c)) { ++ptr; continue; } else {
+ if (ptr - start == 3 && buf[start] == '|' && buf[start+1] == '|' && buf[start+2] == '|') {
+ ++state;
+ if (state == 4) { cerr << "Too many fields (ignoring):\n " << buf << endl; return; }
+ if (state == 2) {
+ // cerr << "FLEN=" << f->size() << " ELEN=" << e->size() << endl;
+ AllocateForAlignment();
+ }
+ SkipWhitespace(buf, &ptr);
+ start = ptr;
+ continue;
+ }
+ switch (state) {
+ case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+ case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+ case 2: ParseAlignmentPoint(buf, start, ptr); break;
+ case 3: ParseSpanLabel(buf, start, ptr); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ SkipWhitespace(buf, &ptr);
+ start = ptr;
+ }
+ }
+ if (ptr > start) {
+ switch (state) {
+ case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+ case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+ case 2: ParseAlignmentPoint(buf, start, ptr); break;
+ case 3: ParseSpanLabel(buf, start, ptr); break;
+ default: cerr << "Can't happen\n"; abort();
+ }
+ }
+ if (state < 2) {
+ cerr << "Not enough fields: " << buf << endl;
+ abort();
+ }
+  if (e.empty() || f.empty()) {
+    cerr << "Sentences must not be empty: " << buf << endl;
+    abort();
+  }
+}
+
diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h
new file mode 100644
index 00000000..d78be359
--- /dev/null
+++ b/extools/sentence_pair.h
@@ -0,0 +1,39 @@
+#ifndef _SENTENCE_PAIR_H_
+#define _SENTENCE_PAIR_H_
+
+#include <utility>
+#include <vector>
+#include "wordid.h"
+#include "array2d.h"
+
+// represents a parallel sentence with a word alignment and category
+// annotations over subspans (currently in terms of e)
+// you should read one using ParseInputLine and then use the public
+// member variables to query things about it
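+//
+// a minimal usage sketch:
+//   AnnotatedParallelSentence sent;
+//   sent.ParseInputLine("ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP");
+//   // now sent.f_len == 2, sent.e_len == 2, and sent.aligned(0,0) is true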
+struct AnnotatedParallelSentence {
+ // read annotated parallel sentence from string
+ void ParseInputLine(const char* buf);
+
+ std::vector<WordID> f, e; // words in f and e
+
+ // word alignment information
+ std::vector<int> e_aligned, f_aligned; // counts the number of times column/row x is aligned
+ Array2D<bool> aligned;
+ std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;
+
+ // span type information
+ Array2D<std::vector<WordID> > span_types; // span_types(i,j) is the list of category
+ // types for a span (i,j) in the TARGET language.
+
+ int f_len, e_len;
+
+ static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b);
+
+ private:
+ void Reset();
+ void AllocateForAlignment();
+ void ParseAlignmentPoint(const char* buf, int start, int end);
+ void ParseSpanLabel(const char* buf, int start, int end);
+};
+
+#endif
diff --git a/extools/simple-extract-context.sh b/extools/simple-extract-context.sh
new file mode 100755
index 00000000..17487b1c
--- /dev/null
+++ b/extools/simple-extract-context.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+MYDIR=$(dirname "$0")
+
+export LANG=C
+date 1>&2
+"$MYDIR"/extractor -i "$1" -c 500000 -L 12 -C | sort -t $'\t' -k 1 | "$MYDIR"/mr_stripe_rule_reduce
+date 1>&2
+
diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh
new file mode 100755
index 00000000..7d9f439d
--- /dev/null
+++ b/extools/simple-extract.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+export LANG=C
+date
+./extractor -i "$1" -d X -c 500000 -L 12 -b | sort -t $'\t' -k 1 | gzip > ex.output.gz
+date
+# -p = compute phrase marginals
+# -b = bidirectional rules (starting with F or E) were extracted
+gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
+date
+
diff --git a/extools/suffix_tree.h b/extools/suffix_tree.h
new file mode 100644
index 00000000..d3a9c08e
--- /dev/null
+++ b/extools/suffix_tree.h
@@ -0,0 +1,38 @@
+/*
+ * suffix_tree.h
+ *
+ * Created on: May 17, 2010
+ * Author: Vlad
+ */
+
+#ifndef SUFFIX_TREE_H_
+#define SUFFIX_TREE_H_
+
+#include <string>
+#include <map>
+#include <vector>
+
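+// a simple trie over sequences of T: InsertPath inserts the subsequence
+// p[start..end], creating intermediate nodes as needed; Extend follows the
+// edge labeled e, or returns NULL if no such edge exists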
+template <class T>
+class Node {
+ public:
+ std::map<T, Node> edge_list_;
+ int InsertPath(const std::vector<T>& p, int start, int end);
+ const Node* Extend(const T& e) const {
+ typename std::map<T, Node>::const_iterator it = edge_list_.find(e);
+ if (it == edge_list_.end()) return NULL;
+ return &it->second;
+ }
+};
+
+static bool DEBUG = false;  // static: this header may be included in several translation units
+
+template <class T>
+int Node<T>::InsertPath(const std::vector<T>& p, int start, int end){
+ Node* currNode = this;
+  for (int i = start; i <= end; ++i) {
+ currNode = &(currNode->edge_list_)[p[i]];
+ }
+ return 1;
+}
+
+#endif /* SUFFIX_TREE_H_ */
diff --git a/extools/test_data/README b/extools/test_data/README
new file mode 100644
index 00000000..e368cffc
--- /dev/null
+++ b/extools/test_data/README
@@ -0,0 +1,10 @@
+The following was used to create the test data. The real inputs
+were corpus.fr, corpus.en, and corpus.aligned. The generated files
+were corpus.len_cats and fr-en.al.len.
+
+
+ ./make_len_cats.pl corpus.en > corpus.len_cats
+
+ ../merge_lines.pl corpus.fr corpus.en corpus.aligned corpus.len_cats > fr-en.al.len
+
+
diff --git a/extools/test_data/corpus.aligned b/extools/test_data/corpus.aligned
new file mode 100644
index 00000000..aa09e9ab
--- /dev/null
+++ b/extools/test_data/corpus.aligned
@@ -0,0 +1,5 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3
+0-0
+0-0 1-1
diff --git a/extools/test_data/corpus.en b/extools/test_data/corpus.en
new file mode 100644
index 00000000..2d4751bf
--- /dev/null
+++ b/extools/test_data/corpus.en
@@ -0,0 +1,5 @@
+the blue house
+the hat
+there is a hat
+cap
+the cat
diff --git a/extools/test_data/corpus.fr b/extools/test_data/corpus.fr
new file mode 100644
index 00000000..75b5e127
--- /dev/null
+++ b/extools/test_data/corpus.fr
@@ -0,0 +1,5 @@
+la maison bleue
+le chapeau
+il y a un chapeau
+chapeau
+le chat
diff --git a/extools/test_data/corpus.len_cats b/extools/test_data/corpus.len_cats
new file mode 100644
index 00000000..18d321de
--- /dev/null
+++ b/extools/test_data/corpus.len_cats
@@ -0,0 +1,5 @@
+0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
+0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+0-1:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/fr-en.al.len b/extools/test_data/fr-en.al.len
new file mode 100644
index 00000000..7ee6b85d
--- /dev/null
+++ b/extools/test_data/fr-en.al.len
@@ -0,0 +1,5 @@
+la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+le chapeau ||| the hat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
+il y a un chapeau ||| there is a hat ||| 0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+chapeau ||| cap ||| 0-0 ||| 0-1:SHORT
+le chat ||| the cat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/make_len_cats.pl b/extools/test_data/make_len_cats.pl
new file mode 100755
index 00000000..25ef75fa
--- /dev/null
+++ b/extools/test_data/make_len_cats.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $max_len = 15;
+my @cat_names = qw( NULL SHORT SHORT MID MID MID LONG LONG LONG LONG LONG VLONG VLONG VLONG VLONG VLONG );
+
+while(<>) {
+ chomp;
+ my @words = split /\s+/;
+ my $len = scalar @words;
+ my @spans;
+  for (my $i = 0; $i < $len; $i++) {
+ for (my $k = 1; $k <= $max_len; $k++) {
+ my $j = $i + $k;
+ next if ($j > $len);
+ my $cat = $cat_names[$k];
+ die unless $cat;
+ push @spans, "$i-$j:$cat";
+ }
+ }
+ print "@spans\n";
+}
+