author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000
commit | 0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree | 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /extools
parent | 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools')
-rw-r--r-- | extools/Makefile.am | 31
-rw-r--r-- | extools/README | 32
-rw-r--r-- | extools/build_lexical_translation.cc | 104
-rw-r--r-- | extools/extract.cc | 341
-rw-r--r-- | extools/extract.h | 110
-rw-r--r-- | extools/extractor.cc | 363
-rw-r--r-- | extools/filter_grammar.cc | 202
-rw-r--r-- | extools/lex_trans_tbl.h | 24
-rwxr-xr-x | extools/merge_lines.pl | 43
-rw-r--r-- | extools/mr_stripe_rule_reduce.cc | 242
-rw-r--r-- | extools/score_grammar.cc | 346
-rw-r--r-- | extools/sentence_pair.cc | 155
-rw-r--r-- | extools/sentence_pair.h | 39
-rwxr-xr-x | extools/simple-extract-context.sh | 9
-rwxr-xr-x | extools/simple-extract.sh | 11
-rw-r--r-- | extools/suffix_tree.h | 38
-rw-r--r-- | extools/test_data/README | 10
-rw-r--r-- | extools/test_data/corpus.aligned | 5
-rw-r--r-- | extools/test_data/corpus.en | 5
-rw-r--r-- | extools/test_data/corpus.fr | 5
-rw-r--r-- | extools/test_data/corpus.len_cats | 5
-rw-r--r-- | extools/test_data/fr-en.al.len | 5
-rwxr-xr-x | extools/test_data/make_len_cats.pl | 23
23 files changed, 2148 insertions, 0 deletions
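The five binaries built below form a map/combine/reduce pipeline: extractor emits rule-count stripes, mr_stripe_rule_reduce merges them and computes marginals, and filter_grammar/score_grammar prepare a grammar for a test set. A plausible end-to-end invocation, assembled from the usage strings and command-line options visible in this commit (the shipped simple-extract*.sh scripts are listed in the diffstat but not shown here, and the intermediate sort is an assumption, since the reducer only merges consecutive occurrences of a key):

./extractor -i corpus.aligned -d X -b | LC_ALL=C sort | ./mr_stripe_rule_reduce -p -b > unfiltered.grammar
./filter_grammar test.txt < unfiltered.grammar > filtered.grammar
./score_grammar corpus.aligned < filtered.grammar > scored.grammar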
diff --git a/extools/Makefile.am b/extools/Makefile.am new file mode 100644 index 00000000..5baefa21 --- /dev/null +++ b/extools/Makefile.am @@ -0,0 +1,31 @@ +bin_PROGRAMS = \ + extractor \ + mr_stripe_rule_reduce \ + build_lexical_translation \ + score_grammar \ + filter_grammar + +noinst_PROGRAMS = + +build_lexical_translation_SOURCES = build_lexical_translation.cc extract.cc sentence_pair.cc +build_lexical_translation_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +build_lexical_translation_LDFLAGS = -all-static + +mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc +mr_stripe_rule_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_stripe_rule_reduce_LDFLAGS = -all-static + +extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc +extractor_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +extractor_LDFLAGS = -all-static + +filter_grammar_SOURCES = sentence_pair.cc extract.cc filter_grammar.cc +filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +filter_grammar_LDFLAGS = -all-static + +score_grammar_SOURCES = sentence_pair.cc score_grammar.cc extract.cc +score_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +score_grammar_LDFLAGS = -all-static + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder + diff --git a/extools/README b/extools/README new file mode 100644 index 00000000..af91ce79 --- /dev/null +++ b/extools/README @@ -0,0 +1,32 @@ + +Categories have the format i-j:CAT where i and j are the indices of the spaces +between words in the TARGET language. For example, slash categories can be written: + + the blue house + 0-1:DT 1-2:JJ 2-3:NN 1-3:NBAR 0-2:NP/NN 0-3:NP + + +You may give each span multiple labels, e.g. + + NP + | + NBAR + | + NN + | + John + 0-1:NN 0-1:NBAR 0-1:NP + +However, this may result in a very large number of rules being extracted.
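(Multiple labels multiply the extracted grammar: AnnotatePhrasesWithCategoryTypes in extract.cc emits one copy of a phrase per label on its target span, so in the example above every rule built over the "John" span comes out three times, once each with NN, NBAR, and NP as its category.)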
+ + +**** +* Filtering and Scoring of Unscored and Unfiltered Grammars +**** + +Take the unfiltered grammar and a test set, and run: +./filter_grammar <test set> < unfiltered.grammar > filtered.grammar + +Then, to score the new filtered grammar, run: +./score_grammar <alignment> < filtered.grammar > scored.grammar + diff --git a/extools/build_lexical_translation.cc b/extools/build_lexical_translation.cc new file mode 100644 index 00000000..f609f56a --- /dev/null +++ b/extools/build_lexical_translation.cc @@ -0,0 +1,104 @@ +/* + * Build lexical translation table from alignment file to use for lexical translation probabilities when scoring a grammar + * + * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn + */ +#include <iostream> +#include <string> +#include <map> +#include <vector> +#include <utility> +#include <cstdlib> +#include <fstream> +#include <tr1/unordered_map> + +#include "sentence_pair.h" +#include "extract.h" +#include "fdict.h" +#include "tdict.h" + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +using namespace std; +using namespace std::tr1; + +static const size_t MAX_LINE_LENGTH = 64000000; + +int main(int argc, char* argv[]){ + + bool DEBUG = false; + + map <WordID, map<WordID, int> > word_translation; + map <WordID, int> total_foreign; + map <WordID, int> total_english; + + AnnotatedParallelSentence sent; + char* buf = new char[MAX_LINE_LENGTH]; + while(cin) + { + cin.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + + sent.ParseInputLine(buf); + + map <WordID, int> foreign_aligned; + map <WordID, int> english_aligned; + + //iterate over the alignment to compute aligned words + + for(int i =0;i<sent.aligned.width();i++) + { + for (int j=0;j<sent.aligned.height();j++) + { + if (DEBUG) cout << sent.aligned(i,j) << " "; + if( sent.aligned(i,j)) + { + if (DEBUG) cout << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]); + //local counts + ++foreign_aligned[sent.f[i]]; + ++english_aligned[sent.e[j]]; + + //global counts + ++word_translation[sent.f[i]][sent.e[j]]; + ++total_foreign[sent.f[i]]; + ++total_english[sent.e[j]]; + } + } + if (DEBUG) cout << endl; + } + if (DEBUG) cout << endl; + + static const WordID NULL_ = TD::Convert("NULL"); + //handle unaligned words - align them to null + map<WordID, int>& nullcounts = word_translation[NULL_]; + for (int j =0; j < sent.e_len; j++) + { + if (english_aligned.count(sent.e[j])) continue; + ++nullcounts[sent.e[j]]; + ++total_foreign[NULL_]; + ++total_english[sent.e[j]]; + } + + for (int i =0; i < sent.f_len; i++) + { + if (foreign_aligned.count(sent.f[i])) continue; + ++word_translation[sent.f[i]][NULL_]; + ++total_english[NULL_]; + ++total_foreign[sent.f[i]]; + } + + } + + for(map < WordID, map<WordID,int> >::iterator it = word_translation.begin(); it != word_translation.end(); ++it) + { + const map<WordID, int>& trans = it->second; + for (map<WordID,int>::const_iterator iit = trans.begin(); iit != trans.end(); ++iit) { + cout << TD::Convert(it->first) << "," << TD::Convert(iit->first) << "=" << iit->second << "/" << total_foreign[it->first] << endl; + } + } + + + return 0; +} diff --git a/extools/extract.cc b/extools/extract.cc new file mode 100644 index 00000000..6ad124d2 --- /dev/null +++ b/extools/extract.cc @@ -0,0 +1,341 @@ +#include "extract.h" + +#include <queue> +#include <vector> +#include <utility> +#include <tr1/unordered_map> +#include <set> + +#include
<boost/functional/hash.hpp> + +#include "sentence_pair.h" +#include "tdict.h" +#include "wordid.h" +#include "array2d.h" + +using namespace std; +using namespace tr1; + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} + +Extract::RuleObserver::~RuleObserver() { + cerr << "Rules extracted: " << count << endl; +} + +void Extract::ExtractBasePhrases(const int max_base_phrase_size, + const AnnotatedParallelSentence& sentence, + vector<ParallelSpan>* phrases) { + phrases->clear(); + + vector<pair<int,int> > f_spans(sentence.f_len, pair<int,int>(sentence.e_len, 0)); + vector<pair<int,int> > e_spans(sentence.e_len, pair<int,int>(sentence.f_len, 0)); + // for each alignment point in e, precompute the minimal consistent phrases in f + // for each alignment point in f, precompute the minimal consistent phrases in e + for (int i = 0; i < sentence.f_len; ++i) { + for (int j = 0; j < sentence.e_len; ++j) { + if (sentence.aligned(i,j)) { + if (j < f_spans[i].first) f_spans[i].first = j; + f_spans[i].second = j+1; + if (i < e_spans[j].first) e_spans[j].first = i; + e_spans[j].second = i+1; + } + } + } + + for (int i1 = 0; i1 < sentence.f_len; ++i1) { + if (sentence.f_aligned[i1] == 0) continue; + int j1 = sentence.e_len; + int j2 = 0; + const int i_limit = min(sentence.f_len, i1 + max_base_phrase_size); + for (int i2 = i1 + 1; i2 <= i_limit; ++i2) { + if (sentence.f_aligned[i2-1] == 0) continue; + // cerr << "F has aligned span " << i1 << " to " << i2 << endl; + j1 = min(j1, f_spans[i2-1].first); + j2 = max(j2, f_spans[i2-1].second); + if (j1 >= j2) continue; + if (j2 - j1 > max_base_phrase_size) continue; + int condition = 0; + for (int j = j1; j < j2; ++j) { + if (e_spans[j].first < i1) { condition = 1; break; } + if (e_spans[j].second > i2) { condition = 2; break; } + } + if (condition == 1) break; + if (condition == 2) continue; + // category types added later! 
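// A note on the consistency test above: f_spans[i] and e_spans[j] hold, for
// each word, the minimal [min,max) span it aligns to on the opposite side.
// A candidate f-span [i1,i2) with projection [j1,j2) survives only if no e
// word inside [j1,j2) aligns outside [i1,i2). Condition 1 (some e word
// reaches left of i1) can never be fixed by growing i2, hence the break;
// condition 2 (it reaches right of i2) may vanish for a larger i2, hence
// only this candidate is skipped.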
+ phrases->push_back(ParallelSpan(i1, i2, j1, j2)); + // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl; + } + } +} + +void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence, + const int max_base_phrase_size, + vector<ParallelSpan>* phrases) { + const int num_phrases = phrases->size(); + map<int, map<int, map<int, map<int, bool> > > > marker; + for (int i = 0; i < num_phrases; ++i) { + const ParallelSpan& cur = (*phrases)[i]; + marker[cur.i1][cur.i2][cur.j1][cur.j2] = true; + } + for (int i = 0; i < num_phrases; ++i) { + const ParallelSpan& cur = (*phrases)[i]; + const int i1_max = cur.i1; + const int i2_min = cur.i2; + const int j1_max = cur.j1; + const int j2_min = cur.j2; + int i1_min = i1_max; + while (i1_min > 0 && sentence.f_aligned[i1_min-1] == 0) { --i1_min; } + int j1_min = j1_max; + while (j1_min > 0 && sentence.e_aligned[j1_min-1] == 0) { --j1_min; } + int i2_max = i2_min; + while (i2_max < sentence.f_len && sentence.f_aligned[i2_max] == 0) { ++i2_max; } + int j2_max = j2_min; + while (j2_max < sentence.e_len && sentence.e_aligned[j2_max] == 0) { ++j2_max; } + for (int i1 = i1_min; i1 <= i1_max; ++i1) { + const int ilim = min(i2_max, i1 + max_base_phrase_size); + for (int i2 = max(i1+1,i2_min); i2 <= ilim; ++i2) { + for (int j1 = j1_min; j1 <= j1_max; ++j1) { + const int jlim = min(j2_max, j1 + max_base_phrase_size); + for (int j2 = max(j1+1, j2_min); j2 <= jlim; ++j2) { + bool& seen = marker[i1][i2][j1][j2]; + if (!seen) + phrases->push_back(ParallelSpan(i1,i2,j1,j2)); + seen = true; + } + } + } + } + } +} + +// this uses the TARGET span (i,j) to annotate phrases, will copy +// phrases if there is more than one annotation. +// TODO: support source annotation +void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat, + const Array2D<vector<WordID> >& types, + vector<ParallelSpan>* phrases) { + const int num_unannotated_phrases = phrases->size(); + // have to use num_unannotated_phrases since we may grow the vector + for (int i = 0; i < num_unannotated_phrases; ++i) { + ParallelSpan& phrase = (*phrases)[i]; + const vector<WordID>* pcats = &types(phrase.j1, phrase.j2); + if (pcats->empty() && default_cat != 0) { + static vector<WordID> s_default(1, default_cat); + pcats = &s_default; + } + if (pcats->empty()) { + cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-" + << phrase.j1 << "," << phrase.j2 << " has no type. 
" + "Did you forget --default_category?\n"; + } + const vector<WordID>& cats = *pcats; + phrase.cat = cats[0]; + for (int ci = 1; ci < cats.size(); ++ci) { + ParallelSpan new_phrase = phrase; + new_phrase.cat = cats[ci]; + phrases->push_back(new_phrase); + } + } +} + +// a partially complete (f-side) of a rule +struct RuleItem { + vector<ParallelSpan> f; + int i,j,syms,vars; + explicit RuleItem(int pi) : i(pi), j(pi), syms(), vars() {} + void Extend(const WordID& fword) { + f.push_back(ParallelSpan(fword)); + ++j; + ++syms; + } + void Extend(const ParallelSpan& subphrase) { + f.push_back(subphrase); + j += subphrase.i2 - subphrase.i1; + ++vars; + ++syms; + } + bool RuleFEndsInVariable() const { + if (f.size() > 0) { + return f.back().IsVariable(); + } else { return false; } + } +}; + +void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence, + const vector<ParallelSpan>& phrases, + const int max_vars, + const int max_syms, + const bool permit_adjacent_nonterminals, + const bool require_aligned_terminal, + RuleObserver* observer) { + queue<RuleItem> q; // agenda for BFS + int max_len = -1; + unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans; + vector<vector<ParallelSpan> > spans_by_start(sentence.f_len); + set<int> starts; + for (int i = 0; i < phrases.size(); ++i) { + fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]); + max_len = max(max_len, phrases[i].i2 - phrases[i].i1); + // have we already added a rule item starting at phrases[i].i1? + if (starts.insert(phrases[i].i1).second) + q.push(RuleItem(phrases[i].i1)); + spans_by_start[phrases[i].i1].push_back(phrases[i]); + } + starts.clear(); + vector<pair<int,int> > next_e(sentence.e_len); + vector<WordID> cur_rhs_f, cur_rhs_e; + vector<pair<short, short> > cur_terminal_align; + vector<int> cur_es, cur_fs; + while(!q.empty()) { + const RuleItem& rule = q.front(); + + // extend the partial rule + if (rule.j < sentence.f_len && (rule.j - rule.i) < max_len && rule.syms < max_syms) { + RuleItem ew = rule; + + // extend with a word + ew.Extend(sentence.f[ew.j]); + q.push(ew); + + // with variables + if (rule.vars < max_vars && + !spans_by_start[rule.j].empty() && + ((!rule.RuleFEndsInVariable()) || permit_adjacent_nonterminals)) { + const vector<ParallelSpan>& sub_phrases = spans_by_start[rule.j]; + for (int it = 0; it < sub_phrases.size(); ++it) { + if (sub_phrases[it].i2 - sub_phrases[it].i1 + rule.j - rule.i <= max_len) { + RuleItem ev = rule; + ev.Extend(sub_phrases[it]); + q.push(ev); + assert(ev.j <= sentence.f_len); + } + } + } + } + // determine if rule is consistent + if (rule.syms > 0 && + fspans.count(make_pair(rule.i,rule.j)) && + (!rule.RuleFEndsInVariable() || rule.syms > 1)) { + const vector<ParallelSpan>& orig_spans = fspans[make_pair(rule.i,rule.j)]; + for (int s = 0; s < orig_spans.size(); ++s) { + const ParallelSpan& orig_span = orig_spans[s]; + const WordID lhs = orig_span.cat; + for (int j = orig_span.j1; j < orig_span.j2; ++j) next_e[j].first = -1; + int nt_index_e = 0; + for (int i = 0; i < rule.f.size(); ++i) { + const ParallelSpan& cur = rule.f[i]; + if (cur.IsVariable()) + next_e[cur.j1] = pair<int,int>(cur.j2, ++nt_index_e); + } + cur_rhs_f.clear(); + cur_rhs_e.clear(); + cur_terminal_align.clear(); + cur_fs.clear(); + cur_es.clear(); + + const int elen = orig_span.j2 - orig_span.j1; + vector<int> isvar(elen, 0); + int fbias = rule.i; + bool bad_rule = false; + bool has_aligned_terminal = false; + for (int i = 0; i < rule.f.size(); 
++i) { + const ParallelSpan& cur = rule.f[i]; + cur_rhs_f.push_back(cur.cat); + if (cur.cat > 0) { // terminal + if (sentence.f_aligned[fbias + i]) has_aligned_terminal = true; + cur_fs.push_back(fbias + i); + } else { // non-terminal + int subj1 = cur.j1 - orig_span.j1; + int subj2 = cur.j2 - orig_span.j1; + if (subj1 < 0 || subj2 > elen) { bad_rule = true; break; } + for (int j = subj1; j < subj2 && !bad_rule; ++j) { + int& isvarj = isvar[j]; + isvarj = true; + } + if (bad_rule) break; + cur_fs.push_back(-1); + fbias += cur.i2 - cur.i1 - 1; + } + } + if (require_aligned_terminal && !has_aligned_terminal) bad_rule = true; + if (!bad_rule) { + for (int j = orig_span.j1; j < orig_span.j2; ++j) { + if (next_e[j].first < 0) { + cur_rhs_e.push_back(sentence.e[j]); + cur_es.push_back(j); + } else { + cur_rhs_e.push_back(1 - next_e[j].second); // next_e[j].second is NT gap index + cur_es.push_back(-1); + j = next_e[j].first - 1; + } + } + for (short i = 0; i < cur_fs.size(); ++i) + if (cur_fs[i] >= 0) + for (short j = 0; j < cur_es.size(); ++j) + if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j])) + cur_terminal_align.push_back(make_pair(i,j)); + observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align); + } + } + } + q.pop(); + } +} + +void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) { + int ptr = start; + counts.clear(); + aligns.clear(); + while (ptr < end) { + SkipWhitespace(buf, &ptr); + int vstart = ptr; + while(ptr < end && buf[ptr] != '=') ++ptr; + assert(buf[ptr] == '='); + assert(ptr > vstart); + if (buf[vstart] == 'A' && buf[vstart+1] == '=') { + ++ptr; + while (ptr < end && !IsWhitespace(buf[ptr])) { + while(ptr < end && buf[ptr] == ',') { ++ptr; } + assert(ptr < end); + vstart = ptr; + while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr > vstart) { + short a, b; + AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b); + aligns.push_back(make_pair(a,b)); + } + } + } else { + int name = FD::Convert(string(buf,vstart,ptr-vstart)); + ++ptr; + vstart = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + assert(ptr > vstart); + counts.set_value(name, strtod(buf + vstart, NULL)); + } + } +} + +ostream& operator<<(ostream& os, const RuleStatistics& s) { + bool needspace = false; + for (SparseVector<float>::const_iterator it = s.counts.begin(); it != s.counts.end(); ++it) { + if (needspace) os << ' '; else needspace = true; + os << FD::Convert(it->first) << '=' << it->second; + } + if (s.aligns.size() > 0) { + os << " A="; + needspace = false; + for (int i = 0; i < s.aligns.size(); ++i) { + if (needspace) os << ','; else needspace = true; + os << s.aligns[i].first << '-' << s.aligns[i].second; + } + } + return os; +} + diff --git a/extools/extract.h b/extools/extract.h new file mode 100644 index 00000000..f87aa6cb --- /dev/null +++ b/extools/extract.h @@ -0,0 +1,110 @@ +#ifndef _EXTRACT_H_ +#define _EXTRACT_H_ + +#include <iostream> +#include <utility> +#include <vector> +#include "array2d.h" +#include "wordid.h" +#include "sparse_vector.h" + +struct AnnotatedParallelSentence; + +// usually represents a consistent phrase, which may +// be annotated with a type (cat) +// inside the rule extractor, this class is also used to represent a word +// in a partial rule. 
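// Encoding sketch: the one-argument constructor below leaves i1 == -1 and
// stores a (positive) word id in cat, while a genuine subspan has i1 != -1
// and carries a negative category id in cat (categories are negated TD ids,
// cf. the -TD::Convert(...) calls in sentence_pair.cc and extractor.cc);
// IsVariable() keys off exactly this distinction.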
+struct ParallelSpan { + // i1 = i of f side + // i2 = j of f side + // j1 = i of e side + // j2 = j of e side + short i1,i2,j1,j2; + // cat is set by AnnotatePhrasesWithCategoryTypes, otherwise it's 0 + WordID cat; // category type of span (also overloaded by RuleItem class + // to be a word ID) + ParallelSpan() : i1(-1), i2(-1), j1(-1), j2(-1), cat() {} + // used by Rule class to represent a terminal symbol: + explicit ParallelSpan(WordID w) : i1(-1), i2(-1), j1(-1), j2(-1), cat(w) {} + ParallelSpan(int pi1, int pi2, int pj1, int pj2) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat() {} + ParallelSpan(int pi1, int pi2, int pj1, int pj2, WordID c) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat(c) {} + + // ParallelSpan is used in the Rule class where it is + // overloaded to also represent terminal symbols + inline bool IsVariable() const { return i1 != -1; } +}; + +// rule extraction logic lives here. this has no data, it's just got +// static member functions. +struct Extract { + // RuleObserver's CountRule is called for each rule extracted + // implement CountRuleImpl to do things like count the rules, + // write them to a file, etc. + struct RuleObserver { + RuleObserver() : count() {} + virtual void CountRule(WordID lhs, + const std::vector<WordID>& rhs_f, + const std::vector<WordID>& rhs_e, + const std::vector<std::pair<short, short> >& fe_terminal_alignments) { + ++count; + CountRuleImpl(lhs, rhs_f, rhs_e, fe_terminal_alignments); + } + virtual ~RuleObserver(); + + protected: + virtual void CountRuleImpl(WordID lhs, + const std::vector<WordID>& rhs_f, + const std::vector<WordID>& rhs_e, + const std::vector<std::pair<short, short> >& fe_terminal_alignments) = 0; + private: + int count; + }; + + // given a set of "tight" phrases and the aligned sentence they were + // extracted from, "loosen" them + static void LoosenPhraseBounds(const AnnotatedParallelSentence& sentence, + const int max_base_phrase_size, + std::vector<ParallelSpan>* phrases); + + // extract all consistent phrase pairs, up to size max_base_phrase_size + // (on the source side). these phrases will be "tight". + static void ExtractBasePhrases(const int max_base_phrase_size, + const AnnotatedParallelSentence& sentence, + std::vector<ParallelSpan>* phrases); + + // this uses the TARGET span (i,j) to annotate phrases, will copy + // phrases if there is more than one annotation. 
+ // TODO: support source annotation + static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat, + const Array2D<std::vector<WordID> >& types, + std::vector<ParallelSpan>* phrases); + + // use the Chiang (2007) extraction logic to extract consistent subphrases + // observer->CountRule is called once for each rule extracted + static void ExtractConsistentRules(const AnnotatedParallelSentence& sentence, + const std::vector<ParallelSpan>& phrases, + const int max_vars, + const int max_syms, + const bool permit_adjacent_nonterminals, + const bool require_aligned_terminal, + RuleObserver* observer); +}; + +// represents statistics / information about a rule pair +struct RuleStatistics { + SparseVector<float> counts; + std::vector<std::pair<short,short> > aligns; + RuleStatistics() {} + RuleStatistics(int name, float val, const std::vector<std::pair<short,short> >& al) : + aligns(al) { + counts.set_value(name, val); + } + void ParseRuleStatistics(const char* buf, int start, int end); + RuleStatistics& operator+=(const RuleStatistics& rhs) { + counts += rhs.counts; + return *this; + } +}; +std::ostream& operator<<(std::ostream& os, const RuleStatistics& s); + +#endif diff --git a/extools/extractor.cc b/extools/extractor.cc new file mode 100644 index 00000000..a3791d2a --- /dev/null +++ b/extools/extractor.cc @@ -0,0 +1,363 @@ +#include <iostream> +#include <vector> +#include <utility> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> +#include <boost/lexical_cast.hpp> + +#include "sparse_vector.h" +#include "sentence_pair.h" +#include "extract.h" +#include "tdict.h" +#include "fdict.h" +#include "wordid.h" +#include "array2d.h" +#include "filelib.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static const size_t MAX_LINE_LENGTH = 100000; +WordID kBOS, kEOS, kDIVIDER, kGAP; +int kCOUNT; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input,i", po::value<string>()->default_value("-"), "Input file") + ("default_category,d", po::value<string>(), "Default span type (use X for 'Hiero')") + ("loose", "Use loose phrase extraction heuristic for base phrases") + ("base_phrase,B", "Write base phrases") + ("base_phrase_spans", "Write base sentences and phrase spans") + ("bidir,b", "Extract bidirectional rules (for computing p(f|e) in addition to p(e|f))") + ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. 
Set to 0 to disable cache.") + ("silent", "Write nothing to stderr except errors") + ("phrase_context,C", "Write base phrase contexts") + ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and right when writing base phrase contexts") + ("max_base_phrase_size,L", po::value<int>()->default_value(10), "Maximum starting phrase size") + ("max_syms,l", po::value<int>()->default_value(5), "Maximum number of symbols in final phrase size") + ("max_vars,v", po::value<int>()->default_value(2), "Maximum number of nonterminal variables in final phrase size") + ("permit_adjacent_nonterminals,A", "Permit adjacent nonterminals in source side of rules") + ("no_required_aligned_terminal,n", "Do not require an aligned terminal") + ("help,h", "Print this help message and exit"); + po::options_description clo("Command line options"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + po::notify(*conf); + + if (conf->count("help") || conf->count("input") == 0) { + cerr << "\nUsage: extractor [-options]\n"; + cerr << dcmdline_options << endl; + exit(1); + } +} + +// TODO how to handle alignment information? +void WriteBasePhrases(const AnnotatedParallelSentence& sentence, + const vector<ParallelSpan>& phrases) { + vector<WordID> e,f; + for (int it = 0; it < phrases.size(); ++it) { + const ParallelSpan& phrase = phrases[it]; + e.clear(); + f.clear(); + for (int i = phrase.i1; i < phrase.i2; ++i) + f.push_back(sentence.f[i]); + for (int j = phrase.j1; j < phrase.j2; ++j) + e.push_back(sentence.e[j]); + cout << TD::GetString(f) << " ||| " << TD::GetString(e) << endl; + } +} + +void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence, + const vector<ParallelSpan>& phrases) { + cout << TD::GetString(sentence.f) << " ||| " << TD::GetString(sentence.e) << " |||"; + for (int it = 0; it < phrases.size(); ++it) { + const ParallelSpan& phrase = phrases[it]; + cout << " " << phrase.i1 << "-" << phrase.i2 + << "-" << phrase.j1 << "-" << phrase.j2; + } + cout << endl; +} + +struct CountCombiner { + CountCombiner(size_t csize) : combiner_size(csize) {} + ~CountCombiner() { + if (!cache.empty()) WriteAndClearCache(); + } + + void Count(const vector<WordID>& key, + const vector<WordID>& val, + const int count_type, + const vector<pair<short,short> >& aligns) { + if (combiner_size > 0) { + RuleStatistics& v = cache[key][val]; + float newcount = v.counts.add_value(count_type, 1.0f); + // hack for adding alignments + if (newcount < 7.0f && aligns.size() > v.aligns.size()) + v.aligns = aligns; + if (cache.size() > combiner_size) WriteAndClearCache(); + } else { + cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| "; + cout << RuleStatistics(count_type, 1.0f, aligns) << endl; + } + } + + private: + void WriteAndClearCache() { + for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin(); + it != cache.end(); ++it) { + cout << TD::GetString(it->first) << '\t'; + const Vec2PhraseCount& vals = it->second; + bool needdiv = false; + for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) { + if (needdiv) cout << " ||| "; else needdiv = true; + cout << TD::GetString(vi->first) << " ||| " << vi->second; + } + cout << endl; + } + cache.clear(); + } + + const size_t combiner_size; + typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > Vec2PhraseCount; +
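// Stripe layout sketch: WriteAndClearCache prints one line per key, with the
// key's values and their statistics joined by |||, i.e.
//   key \t val1 ||| stats1 ||| val2 ||| stats2 ...
// so a single cached rule might come out as (hypothetical counts)
//   [X] ||| un homme \t a man ||| CFE=1 A=0-0,1-1
// which is the format the ParseLine readers in the downstream tools expect.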
unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache; +}; + +// TODO optional source context +// output <k, v> : k = phrase "document" v = context "term" +void WritePhraseContexts(const AnnotatedParallelSentence& sentence, + const vector<ParallelSpan>& phrases, + const int ctx_size, + CountCombiner* o) { + vector<WordID> context(ctx_size * 2 + 1); + context[ctx_size] = kGAP; + vector<WordID> key; + key.reserve(100); + for (int it = 0; it < phrases.size(); ++it) { + const ParallelSpan& phrase = phrases[it]; + + // TODO, support src keys as well + key.resize(phrase.j2 - phrase.j1); + for (int j = phrase.j1; j < phrase.j2; ++j) + key[j - phrase.j1] = sentence.e[j]; + + for (int i = 0; i < ctx_size; ++i) { + int epos = phrase.j1 - 1 - i; + const WordID left_ctx = (epos < 0) ? kBOS : sentence.e[epos]; + context[ctx_size - i - 1] = left_ctx; + epos = phrase.j2 + i; + const WordID right_ctx = (epos >= sentence.e_len) ? kEOS : sentence.e[epos]; + context[ctx_size + i + 1] = right_ctx; + } + o->Count(key, context, kCOUNT, vector<pair<short,short> >()); + } +} + +struct SimpleRuleWriter : public Extract::RuleObserver { + protected: + virtual void CountRuleImpl(WordID lhs, + const vector<WordID>& rhs_f, + const vector<WordID>& rhs_e, + const vector<pair<short,short> >& fe_terminal_alignments) { + cout << "[" << TD::Convert(-lhs) << "] |||"; + for (int i = 0; i < rhs_f.size(); ++i) { + if (rhs_f[i] < 0) cout << " [" << TD::Convert(-rhs_f[i]) << ']'; + else cout << ' ' << TD::Convert(rhs_f[i]); + } + cout << " |||"; + for (int i = 0; i < rhs_e.size(); ++i) { + if (rhs_e[i] <= 0) cout << " [" << (1-rhs_e[i]) << ']'; + else cout << ' ' << TD::Convert(rhs_e[i]); + } + cout << " |||"; + for (int i = 0; i < fe_terminal_alignments.size(); ++i) { + cout << ' ' << fe_terminal_alignments[i].first << '-' << fe_terminal_alignments[i].second; + } + cout << endl; + } +}; + +struct HadoopStreamingRuleObserver : public Extract::RuleObserver { + HadoopStreamingRuleObserver(CountCombiner* cc, bool bidir_flag) : + bidir(bidir_flag), + kF(TD::Convert("F")), + kE(TD::Convert("E")), + kDIVIDER(TD::Convert("|||")), + kLB("["), kRB("]"), + combiner(*cc), + kEMPTY(), + kCFE(FD::Convert("CFE")) { + for (int i=1; i < 50; ++i) + index2sym[1-i] = TD::Convert(kLB + boost::lexical_cast<string>(i) + kRB); + fmajor_key.resize(10, kF); + emajor_key.resize(10, kE); + if (bidir) + fmajor_key[2] = emajor_key[2] = kDIVIDER; + else + fmajor_key[1] = kDIVIDER; + } + + protected: + virtual void CountRuleImpl(WordID lhs, + const vector<WordID>& rhs_f, + const vector<WordID>& rhs_e, + const vector<pair<short,short> >& fe_terminal_alignments) { + if (bidir) { // extract rules in "both directions" E->F and F->E + fmajor_key.resize(3 + rhs_f.size()); + emajor_key.resize(3 + rhs_e.size()); + fmajor_val.resize(rhs_e.size()); + emajor_val.resize(rhs_f.size()); + emajor_key[1] = fmajor_key[1] = MapSym(lhs); + int nt = 1; + for (int i = 0; i < rhs_f.size(); ++i) { + const WordID id = rhs_f[i]; + if (id < 0) { + fmajor_key[3 + i] = MapSym(id, nt); + emajor_val[i] = MapSym(id, nt); + ++nt; + } else { + fmajor_key[3 + i] = id; + emajor_val[i] = id; + } + } + for (int i = 0; i < rhs_e.size(); ++i) { + WordID id = rhs_e[i]; + if (id <= 0) { + fmajor_val[i] = index2sym[id]; + emajor_key[3 + i] = index2sym[id]; + } else { + fmajor_val[i] = id; + emajor_key[3 + i] = id; + } + } + combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments); + combiner.Count(emajor_key, emajor_val, kCFE, kEMPTY); + } else { // extract 
rules only in F->E + fmajor_key.resize(2 + rhs_f.size()); + fmajor_val.resize(rhs_e.size()); + fmajor_key[0] = MapSym(lhs); + int nt = 1; + for (int i = 0; i < rhs_f.size(); ++i) { + const WordID id = rhs_f[i]; + if (id < 0) + fmajor_key[2 + i] = MapSym(id, nt++); + else + fmajor_key[2 + i] = id; + } + for (int i = 0; i < rhs_e.size(); ++i) { + const WordID id = rhs_e[i]; + if (id <= 0) + fmajor_val[i] = index2sym[id]; + else + fmajor_val[i] = id; + } + combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments); + } + } + + private: + WordID MapSym(WordID sym, int ind = 0) { + WordID& r = cat2ind2sym[sym][ind]; + if (!r) { + if (ind == 0) + r = TD::Convert(kLB + TD::Convert(-sym) + kRB); + else + r = TD::Convert(kLB + TD::Convert(-sym) + "," + boost::lexical_cast<string>(ind) + kRB); + } + return r; + } + + const bool bidir; + const WordID kF, kE, kDIVIDER; + const string kLB, kRB; + CountCombiner& combiner; + const vector<pair<short,short> > kEMPTY; + const int kCFE; + map<WordID, map<int, WordID> > cat2ind2sym; + map<int, WordID> index2sym; + vector<WordID> emajor_key, emajor_val, fmajor_key, fmajor_val; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kBOS = TD::Convert("<s>"); + kEOS = TD::Convert("</s>"); + kDIVIDER = TD::Convert("|||"); + kGAP = TD::Convert("<PHRASE>"); + kCOUNT = FD::Convert("C"); + + WordID default_cat = 0; // 0 means no default- extraction will + // fail if a phrase is extracted without a + // category + if (conf.count("default_category")) { + string sdefault_cat = conf["default_category"].as<string>(); + default_cat = -TD::Convert(sdefault_cat); + cerr << "Default category: " << sdefault_cat << endl; + } else { + cerr << "No default category (use --default_category if you want to set one)\n"; + } + ReadFile rf(conf["input"].as<string>()); + istream& in = *rf.stream(); + + char buf[MAX_LINE_LENGTH]; + AnnotatedParallelSentence sentence; + vector<ParallelSpan> phrases; + const int max_base_phrase_size = conf["max_base_phrase_size"].as<int>(); + const bool write_phrase_contexts = conf.count("phrase_context") > 0; + const bool write_base_phrases = conf.count("base_phrase") > 0; + const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0; + const bool loose_phrases = conf.count("loose") > 0; + const bool silent = conf.count("silent") > 0; + const int max_syms = conf["max_syms"].as<int>(); + const int max_vars = conf["max_vars"].as<int>(); + const int ctx_size = conf["phrase_context_size"].as<int>(); + const bool permit_adjacent_nonterminals = conf.count("permit_adjacent_nonterminals") > 0; + const bool require_aligned_terminal = conf.count("no_required_aligned_terminal") == 0; + int line = 0; + CountCombiner cc(conf["combiner_size"].as<size_t>()); + HadoopStreamingRuleObserver o(&cc, + conf.count("bidir") > 0); + //SimpleRuleWriter o; + while(in) { + ++line; + in.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + if (!silent) { + if (line % 200 == 0) cerr << '.'; + if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; + } + sentence.ParseInputLine(buf); + phrases.clear(); + Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases); + if (loose_phrases) + Extract::LoosenPhraseBounds(sentence, max_base_phrase_size, &phrases); + if (phrases.empty()) { + cerr << "WARNING no phrases extracted line: " << line << endl; + continue; + } + if (write_phrase_contexts) { + WritePhraseContexts(sentence, phrases, ctx_size, &cc); + continue; + } + if (write_base_phrases) 
{ + WriteBasePhrases(sentence, phrases); + continue; + } + if (write_base_phrase_spans) { + WriteBasePhraseSpans(sentence, phrases); + continue; + } + Extract::AnnotatePhrasesWithCategoryTypes(default_cat, sentence.span_types, &phrases); + Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o); + } + if (!silent) cerr << endl; + return 0; +} + diff --git a/extools/filter_grammar.cc b/extools/filter_grammar.cc new file mode 100644 index 00000000..290a77f5 --- /dev/null +++ b/extools/filter_grammar.cc @@ -0,0 +1,202 @@ +/* + * Build suffix tree representation of a data set for grammar filtering + * ./filter_grammar <test set> < unfiltered.grammar > filtered.grammar + * + */ +#include <iostream> +#include <string> +#include <map> +#include <vector> +#include <utility> +#include <cstdlib> +#include <fstream> +#include <tr1/unordered_map> + +#include "filelib.h" +#include "sentence_pair.h" +#include "suffix_tree.h" +#include "extract.h" +#include "fdict.h" +#include "tdict.h" + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + + +using namespace std; +using namespace std::tr1; + +static const size_t MAX_LINE_LENGTH = 64000000; + +typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; + + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + inline bool IsBracket(char c){return c == '[' || c == ']';} + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} + + + +int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { + static const WordID kDIV = TD::Convert("|||"); + + int ptr = sstart; + while(ptr < end) { + while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } + int start = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; } + //look in the buffer and see if it's a nonterminal marker before integerizing it to wordID-anything with [...]
or ||| + + const WordID w = TD::Convert(string(buf, start, ptr - start)); + + if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV)) + p->push_back(-1); + else { + if (w == kDIV) return ptr; + p->push_back(w); + } + } + return ptr; +} + + + +void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { + static const WordID kDIV = TD::Convert("|||"); + counts->clear(); + int ptr = 0; + while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } + if (buf[ptr] != '\t') { + cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; + exit(1); + } + cur_key->clear(); + // key is: "[X] ||| word word word" + int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); + cur_key->push_back(kDIV); + ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); + ++ptr; + int start = ptr; + int end = ptr; + int state = 0; // 0=reading label, 1=reading count + vector<WordID> name; + while(buf[ptr] != 0) { + while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + end = ptr - 3; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (start == end) { + cerr << "Got empty token!\n LINE=" << buf << endl; + exit(1); + } + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + SkipWhitespace(buf, &ptr); + start = ptr; + } + } + } + } + end=ptr; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (end > start) { + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + } +} + + + + + + + +int main(int argc, char* argv[]){ + if (argc != 2) { + cerr << "Usage: " << argv[0] << " testset.txt < unfiltered.grammar\n"; + return 1; + } + + assert(FileExists(argv[1])); + ReadFile rfts(argv[1]); + istream& testSet = *rfts.stream(); + ofstream filter_grammar_; + bool DEBUG = false; + + + AnnotatedParallelSentence sent; + char* buf = new char[MAX_LINE_LENGTH]; + cerr << "Build suffix tree from test set in " << argv[1] << endl; + //root of the suffix tree + Node<int> root; + int line=0; + + /* process the data set to build suffix tree + */ + while(!testSet.eof()) { + ++line; + testSet.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + + //hack to read in the test set using the alignedparallelsentence methods + strcat(buf," ||| fake ||| 0-0"); + sent.ParseInputLine(buf); + + if (DEBUG)cerr << line << "||| " << buf << " -- " << sent.f_len << endl; + + //add each successive suffix to the tree + for(int i =0;i<sent.f_len;i++) + root.InsertPath(sent.f, i, sent.f_len - 1); + if(DEBUG)cerr<<endl; + + } + + cerr << "Filtering grammar..." 
<< endl; + //process the unfiltered, unscored grammar + + ID2RuleStatistics cur_counts; + vector<WordID> cur_key; + line = 0; + + while(cin) + { + ++line; + cin.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + ParseLine(buf, &cur_key, &cur_counts); + const Node<int>* curnode = &root; + for(int i=0;i<cur_key.size() - 1; i++) + { + if (DEBUG) cerr << line << " " << cur_key[i] << " ::: "; + if(cur_key[i] == -1) + { + curnode = &root; + } else if (curnode) { + curnode = curnode->Extend(cur_key[i]); + if (!curnode) break; + } + } + + if(curnode) + cout << buf << endl; + } + + return 0; +} diff --git a/extools/lex_trans_tbl.h b/extools/lex_trans_tbl.h new file mode 100644 index 00000000..81d6ccc9 --- /dev/null +++ b/extools/lex_trans_tbl.h @@ -0,0 +1,24 @@ +/* + * lex_trans_tbl.h + * + * Created on: May 25, 2010 + * Author: Vlad + */ + +#ifndef LEX_TRANS_TBL_H_ +#define LEX_TRANS_TBL_H_ + +#include <map> + +class LexTranslationTable +{ + public: + + std::map < std::pair<WordID,WordID>,int > word_translation; + std::map <WordID, int> total_foreign; + std::map <WordID, int> total_english; + void createTTable(const char* buf); + +}; + +#endif /* LEX_TRANS_TBL_H_ */ diff --git a/extools/merge_lines.pl b/extools/merge_lines.pl new file mode 100755 index 00000000..8711e4ce --- /dev/null +++ b/extools/merge_lines.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl -w +use strict; + +if (scalar @ARGV < 2) { + die "Usage: $0 file1.txt file2.txt ...\n\n Concatenate the nth line of each input file. All files\n must be the same length.\n\n"; +} + +my @fhs=(); +for my $file (@ARGV) { + my $fh; + open $fh, "<$file" or die "Can't read $file: $!\n"; + push @fhs, $fh; +} + +my $first = shift @fhs; + +while(my $x = <$first>) { + my $ind = 0; + chomp $x; + my @fields = ($x); + for my $fh (@fhs) { + $ind++; + $x = <$fh>; + die "ERROR: Mismatched number of lines: $ARGV[$ind]\n" unless $x; + chomp $x; + push @fields, $x; + } + print join ' ||| ', @fields; + print "\n"; +} +my $ind = 0; +for my $fh (@fhs) { + $ind++; + my $x=<$fh>; + die "ERROR: $ARGV[$ind] has extra lines!\n" if $x; +} + +for my $fh (@fhs) { + close $fh; +} + +exit 0; + diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc new file mode 100644 index 00000000..eaf1b6d7 --- /dev/null +++ b/extools/mr_stripe_rule_reduce.cc @@ -0,0 +1,242 @@ +#include <iostream> +#include <vector> +#include <utility> +#include <cstdlib> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "tdict.h" +#include "sentence_pair.h" +#include "fdict.h" +#include "extract.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static const size_t MAX_LINE_LENGTH = 64000000; + +bool use_hadoop_counters = false; + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("phrase_marginals,p", "Compute phrase marginals") + ("use_hadoop_counters,C", "Enable this if running inside Hadoop") + ("bidir,b", "Rules are tagged as being F->E or E->F, invert E rules in output") + ("help,h", "Print this help message and exit"); + po::options_description clo("Command line options"); + po::options_description
dcmdline_options; + dcmdline_options.add(opts); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + po::notify(*conf); + + if (conf->count("help")) { + cerr << "\nUsage: mr_stripe_rule_reduce [-options]\n"; + cerr << dcmdline_options << endl; + exit(1); + } +} + +typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; + +void PlusEquals(const ID2RuleStatistics& v, ID2RuleStatistics* self) { + for (ID2RuleStatistics::const_iterator it = v.begin(); it != v.end(); ++it) { + RuleStatistics& dest = (*self)[it->first]; + dest += it->second; + // TODO - do something smarter about alignments? + if (dest.aligns.empty() && !it->second.aligns.empty()) + dest.aligns = it->second.aligns; + } +} + +int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { + static const WordID kDIV = TD::Convert("|||"); + int ptr = sstart; + while(ptr < end) { + while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } + int start = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; } + const WordID w = TD::Convert(string(buf, start, ptr - start)); + if (w == kDIV) return ptr; + p->push_back(w); + } + return ptr; +} + +void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { + static const WordID kDIV = TD::Convert("|||"); + counts->clear(); + int ptr = 0; + while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } + if (buf[ptr] != '\t') { + cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; + exit(1); + } + cur_key->clear(); + // key is: "[X] ||| word word word" + int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); + if (buf[tmpp] != '\t') { + cur_key->push_back(kDIV); + ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); + } + ++ptr; + int start = ptr; + int end = ptr; + int state = 0; // 0=reading label, 1=reading count + vector<WordID> name; + while(buf[ptr] != 0) { + while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + end = ptr - 3; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (start == end) { + cerr << "Got empty token!\n LINE=" << buf << endl; + exit(1); + } + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + SkipWhitespace(buf, &ptr); + start = ptr; + } + } + } + } + end=ptr; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (end > start) { + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + } +} + +void WriteKeyValue(const vector<WordID>& key, const ID2RuleStatistics& val) { + cout << TD::GetString(key) << '\t'; + bool needdiv = false; + for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) { + if (needdiv) cout << " ||| "; else needdiv = true; + cout << TD::GetString(it->first) << " ||| " << it->second; + } + cout << endl; + if (use_hadoop_counters) cerr << "reporter:counter:UserCounters,RuleCount," << val.size() << endl; +} + +void DoPhraseMarginals(const vector<WordID>& key, const bool bidir, ID2RuleStatistics* val) { 
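// Marginal sketch: within one stripe (one key), the marginal written here is
// the sum of the joint CFE counts over all values, so score_grammar can later
// form P(e|f) = CFE/CF. With hypothetical counts CFE=3 and CFE=1 under one
// F key, both records get CF=4, i.e. probabilities 0.75 and 0.25. For E-keyed
// stripes in bidir mode the sum is stored as CE instead, and CFE is cleared
// on the E side so the joint is not counted twice when both directions are
// combined.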
+ static const WordID kF = TD::Convert("F"); + static const WordID kE = TD::Convert("E"); + static const int kCF = FD::Convert("CF"); + static const int kCE = FD::Convert("CE"); + static const int kCFE = FD::Convert("CFE"); + assert(key.size() > 0); + int cur_marginal_id = kCF; + if (bidir) { + if (key[0] != kF && key[0] != kE) { + cerr << "DoPhraseMarginals expects keys to have the form 'F|E [NT] word word word'\n"; + cerr << " but got: " << TD::GetString(key) << endl; + exit(1); + } + if (key[0] == kE) cur_marginal_id = kCE; + } + double tot = 0; + for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) + tot += it->second.counts.value(kCFE); + for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) { + it->second.counts.set_value(cur_marginal_id, tot); + + // prevent double counting of the joint + if (cur_marginal_id == kCE) it->second.counts.clear_value(kCFE); + } +} + +void WriteWithInversions(const vector<WordID>& key, const ID2RuleStatistics& val) { + static const WordID kE = TD::Convert("E"); + static const WordID kDIV = TD::Convert("|||"); + vector<WordID> new_key(key.size() - 1); + for (int i = 1; i < key.size(); ++i) + new_key[i - 1] = key[i]; + const bool do_invert = (key[0] == kE); + if (!do_invert) { + WriteKeyValue(new_key, val); + } else { + ID2RuleStatistics inv; + assert(new_key.size() > 2); + vector<WordID> tk(new_key.size() - 2); + for (int i = 0; i < tk.size(); ++i) + tk[i] = new_key[2 + i]; + RuleStatistics& inv_stats = inv[tk]; + for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) { + inv_stats.counts = it->second.counts; + vector<WordID> ekey(2 + it->first.size()); + ekey[0] = key[1]; + ekey[1] = kDIV; + for (int i = 0; i < it->first.size(); ++i) + ekey[2+i] = it->first[i]; + WriteKeyValue(ekey, inv); + } + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + char* buf = new char[MAX_LINE_LENGTH]; + ID2RuleStatistics acc, cur_counts; + vector<WordID> key, cur_key; + int line = 0; + use_hadoop_counters = conf.count("use_hadoop_counters") > 0; + const bool phrase_marginals = conf.count("phrase_marginals") > 0; + const bool bidir = conf.count("bidir") > 0; + while(cin) { + ++line; + cin.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + ParseLine(buf, &cur_key, &cur_counts); + if (cur_key != key) { + if (key.size() > 0) { + if (phrase_marginals) + DoPhraseMarginals(key, bidir, &acc); + if (bidir) + WriteWithInversions(key, acc); + else + WriteKeyValue(key, acc); + acc.clear(); + } + key = cur_key; + } + PlusEquals(cur_counts, &acc); + } + if (key.size() > 0) { + if (phrase_marginals) + DoPhraseMarginals(key, bidir, &acc); + if (bidir) + WriteWithInversions(key, acc); + else + WriteKeyValue(key, acc); + } + return 0; +} + diff --git a/extools/score_grammar.cc b/extools/score_grammar.cc new file mode 100644 index 00000000..534b6295 --- /dev/null +++ b/extools/score_grammar.cc @@ -0,0 +1,346 @@ +/* + * Score a grammar in striped format + * ./score_grammar <alignment> < filtered.grammar > scored.grammar + */ +#include <iostream> +#include <string> +#include <map> +#include <vector> +#include <utility> +#include <cstdlib> +#include <fstream> +#include <tr1/unordered_map> + +#include "sentence_pair.h" +#include "extract.h" +#include "fdict.h" +#include "tdict.h" +#include "lex_trans_tbl.h" +#include "filelib.h" + +#include <boost/functional/hash.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> +
+using namespace std; +using namespace std::tr1; + + +static const size_t MAX_LINE_LENGTH = 64000000; + +typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; + + +namespace { + inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } + inline bool IsBracket(char c){return c == '[' || c == ']';} + inline void SkipWhitespace(const char* buf, int* ptr) { + while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } + } +} + +int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { + static const WordID kDIV = TD::Convert("|||"); + int ptr = sstart; + while(ptr < end) { + while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } + int start = ptr; + while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } + if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; } + const WordID w = TD::Convert(string(buf, start, ptr - start)); + + if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV)) + p->push_back(1 * w); + else { + if (w == kDIV) return ptr; + p->push_back(w); + } + } + return ptr; +} + + +void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { + static const WordID kDIV = TD::Convert("|||"); + counts->clear(); + int ptr = 0; + while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } + if (buf[ptr] != '\t') { + cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; + exit(1); + } + cur_key->clear(); + // key is: "[X] ||| word word word" + int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); + cur_key->push_back(kDIV); + ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); + ++ptr; + int start = ptr; + int end = ptr; + int state = 0; // 0=reading label, 1=reading count + vector<WordID> name; + while(buf[ptr] != 0) { + while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + if (buf[ptr] == '|') { + ++ptr; + end = ptr - 3; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (start == end) { + cerr << "Got empty token!\n LINE=" << buf << endl; + exit(1); + } + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + SkipWhitespace(buf, &ptr); + start = ptr; + } + } + } + } + end=ptr; + while (end > start && IsWhitespace(buf[end-1])) { --end; } + if (end > start) { + switch (state) { + case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; + case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; + default: cerr << "Can't happen\n"; abort(); + } + } +} + + + +void LexTranslationTable::createTTable(const char* buf){ + + bool DEBUG = false; + + AnnotatedParallelSentence sent; + + sent.ParseInputLine(buf); + + //iterate over the alignment to compute aligned words + + for(int i =0;i<sent.aligned.width();i++) + { + for (int j=0;j<sent.aligned.height();j++) + { + if (DEBUG) cerr << sent.aligned(i,j) << " "; + if( sent.aligned(i,j)) + { + if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]); + ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])]; + ++total_foreign[sent.f[i]]; + ++total_english[sent.e[j]]; + } + } + if (DEBUG) cerr << endl; + } + if (DEBUG) cerr << endl; + + static const WordID NULL_ = TD::Convert("NULL"); + //handle unaligned words - align them 
to null + for (int j =0; j < sent.e_len; j++) + { + if (sent.e_aligned[j]) continue; + ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])]; + ++total_foreign[NULL_]; + ++total_english[sent.e[j]]; + } + + for (int i =0; i < sent.f_len; i++) + { + if (sent.f_aligned[i]) continue; + ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)]; + ++total_english[NULL_]; + ++total_foreign[sent.f[i]]; + } + +} + + +inline float safenlog(float v) { + if (v == 1.0f) return 0.0f; + float res = -log(v); + if (res > 100.0f) res = 100.0f; + return res; +} + +int main(int argc, char** argv){ + bool DEBUG= false; + if (argc != 2) { + cerr << "Usage: " << argv[0] << " corpus.al < filtered.grammar\n"; + return 1; + } + ifstream alignment (argv[1]); + istream& unscored_grammar = cin; + ostream& scored_grammar = cout; + + //create lexical translation table + cerr << "Creating table..." << endl; + char* buf = new char[MAX_LINE_LENGTH]; + + LexTranslationTable table; + + while(!alignment.eof()) + { + alignment.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + + table.createTTable(buf); + } + + bool PRINT_TABLE=false; + if (PRINT_TABLE) + { + ofstream trans_table; + trans_table.open("lex_trans_table.out"); + for(map < pair<WordID,WordID>,int >::iterator it = table.word_translation.begin(); it != table.word_translation.end(); ++it) + { + trans_table << TD::Convert(it->first.first) << "|||" << TD::Convert(it->first.second) << "==" << it->second << "//" << table.total_foreign[it->first.first] << "//" << table.total_english[it->first.second] << endl; + } + + trans_table.close(); + } + + + //score unscored grammar + cerr <<"Scoring grammar..." << endl; + + ID2RuleStatistics acc, cur_counts; + vector<WordID> key, cur_key,temp_key; + vector< pair<short,short> > al; + vector< pair<short,short> >::iterator ita; + int line = 0; + + static const int kCF = FD::Convert("CF"); + static const int kCE = FD::Convert("CE"); + static const int kCFE = FD::Convert("CFE"); + + while(!unscored_grammar.eof()) + { + ++line; + unscored_grammar.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; + ParseLine(buf, &cur_key, &cur_counts); + + //loop over all the Target side phrases that this source aligns to + for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) + { + + /*Compute phrase translation prob. 
+ Print out scores in this format: + Phrase translation prob P(F|E) + Phrase translation prob P(E|F) + Lexical weighting prob lex(F|E) + Lexical weighting prob lex(E|F) + */ + + float pEF_ = it->second.counts.value(kCFE) / it->second.counts.value(kCF); + float pFE_ = it->second.counts.value(kCFE) / it->second.counts.value(kCE); + + map <WordID, pair<int, float> > foreign_aligned; + map <WordID, pair<int, float> > english_aligned; + + //Loop over all the alignment points to compute lexical translation probability + al = it->second.aligns; + for(ita = al.begin(); ita != al.end(); ++ita) + { + + if (DEBUG) + { + cerr << "\nA:" << ita->first << "," << ita->second << "::"; + cerr << TD::Convert(cur_key[ita->first + 2]) << "-" << TD::Convert(it->first[ita->second]); + } + + + //Lookup this alignment probability in the table + int temp = table.word_translation[pair<WordID,WordID> (cur_key[ita->first+2],it->first[ita->second])]; + float f2e=0, e2f=0; + if ( table.total_foreign[cur_key[ita->first+2]] != 0) + f2e = (float) temp / table.total_foreign[cur_key[ita->first+2]]; + if ( table.total_english[it->first[ita->second]] !=0 ) + e2f = (float) temp / table.total_english[it->first[ita->second]]; + if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f); + + + //local counts to keep track of which things haven't been aligned, to later compute their null alignment + if (foreign_aligned.count(cur_key[ita->first+2])) + { + foreign_aligned[ cur_key[ita->first+2] ].first++; + foreign_aligned[ cur_key[ita->first+2] ].second += e2f; + } + else + foreign_aligned [ cur_key[ita->first+2] ] = pair<int,float> (1,e2f); + + + + if (english_aligned.count( it->first[ ita->second] )) + { + english_aligned[ it->first[ ita->second ]].first++; + english_aligned[ it->first[ ita->second] ].second += f2e; + } + else + english_aligned [ it->first[ ita->second] ] = pair<int,float> (1,f2e); + + + + + } + + float final_lex_f2e=1, final_lex_e2f=1; + static const WordID NULL_ = TD::Convert("NULL"); + + //compute lexical weight P(F|E) and include unaligned foreign words + for(int i=0;i<cur_key.size(); i++) + { + + if (!table.total_foreign.count(cur_key[i])) continue; //if we don't have it in the translation table, we won't know its lexical weight + + if (foreign_aligned.count(cur_key[i])) + { + pair<int, float> temp_lex_prob = foreign_aligned[cur_key[i]]; + final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first; + } + else //dealing with null alignment + { + int temp_count = table.word_translation[pair<WordID,WordID> (cur_key[i],NULL_)]; + float temp_e2f = (float) temp_count / table.total_english[NULL_]; + final_lex_e2f *= temp_e2f; + } + + } + + //compute lexical weight P(E|F) and include unaligned english words + for(int j=0; j< it->first.size(); j++) + { + if (!table.total_english.count(it->first[j])) continue; + + if (english_aligned.count(it->first[j])) + { + pair<int, float> temp_lex_prob = english_aligned[it->first[j]]; + final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first; + } + else //dealing with null + { + int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,it->first[j])]; + float temp_f2e = (float) temp_count / table.total_foreign[NULL_]; + final_lex_f2e *= temp_f2e; + } + } + + + scored_grammar << TD::GetString(cur_key); + scored_grammar << " " << TD::GetString(it->first) << " |||"; + scored_grammar << " " << safenlog(pFE_) << " " << safenlog(pEF_); + scored_grammar << " " << safenlog(final_lex_e2f) << " " << safenlog(final_lex_f2e) << endl; + } + } +} + diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc new
new file mode 100644
index 00000000..91286059
--- /dev/null
+++ b/extools/sentence_pair.cc
@@ -0,0 +1,155 @@
+#include "sentence_pair.h"
+
+#include <queue>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <utility>
+#include <set>
+
+#include "tdict.h"
+#include "wordid.h"
+#include "array2d.h"
+
+using namespace std;
+
+namespace {
+  inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+  inline void SkipWhitespace(const char* buf, int* ptr) {
+    while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
+  }
+}
+
+void AnnotatedParallelSentence::Reset() {
+  f.clear();
+  e.clear();
+  e_aligned.clear();
+  f_aligned.clear();
+  aligns_by_fword.clear();
+  aligned.clear();
+  span_types.clear();
+}
+
+void AnnotatedParallelSentence::AllocateForAlignment() {
+  f_len = f.size();
+  e_len = e.size();
+  aligned.resize(f_len, e_len, false);
+  f_aligned.resize(f_len, 0);
+  e_aligned.resize(e_len, 0);
+  aligns_by_fword.resize(f_len);
+  span_types.resize(e_len, e_len+1);
+}
+
+// Read an alignment point of the form X-Y, where X and Y are strings
+// of digits. If permit_col is true, Y may also be terminated by a
+// colon (as in a span label like 0-1:NP). Returns the index at which
+// scanning stopped.
+int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
+                                                  const int start,
+                                                  const int end,
+                                                  const bool permit_col,
+                                                  short* a,
+                                                  short* b) {
+  if (end - start < 3) {
+    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl; abort();
+  }
+  int c = start;
+  *a = 0;
+  while (c < end && buf[c] != '-') {
+    if (buf[c] < '0' || buf[c] > '9') {
+      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+      abort();
+    }
+    (*a) *= 10;
+    (*a) += buf[c] - '0';
+    ++c;
+  }
+  ++c;
+  if (c >= end) {
+    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl; abort();
+  }
+  (*b) = 0;
+  while (c < end && (!permit_col || buf[c] != ':')) {
+    if (buf[c] < '0' || buf[c] > '9') {
+      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl;
+      abort();
+    }
+    (*b) *= 10;
+    (*b) += buf[c] - '0';
+    ++c;
+  }
+  return c;
+}
+
+void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
+  short a, b;
+  ReadAlignmentPoint(buf, start, end, false, &a, &b);
+  assert(a < f_len);
+  assert(b < e_len);
+  aligned(a,b) = true;
+  ++f_aligned[a];
+  ++e_aligned[b];
+  aligns_by_fword[a].push_back(make_pair(a,b));
+  // cerr << a << " " << b << endl;
+}
+
+void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
+  short a, b;
+  int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1;
+  if (buf[c-1] != ':' || c >= end) {
+    cerr << "Span badly formed: " << string(buf, start, end-start) << endl; abort();
+  }
+  // cerr << a << " " << b << " " << string(buf,c,end-c) << endl;
+  span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c)));
+}
+
+// INPUT FORMAT
+// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP
+void AnnotatedParallelSentence::ParseInputLine(const char* buf) {
+  Reset();
+  int ptr = 0;
+  SkipWhitespace(buf, &ptr);
+  int start = ptr;
+  int state = 0;  // 0 = French, 1 = English, 2 = Alignment, 3 = Spans
+  while (char c = buf[ptr]) {
+    if (!IsWhitespace(c)) { ++ptr; continue; } else {
+      if (ptr - start == 3 && buf[start] == '|' && buf[start+1] == '|' && buf[start+2] == '|') {
+        ++state;
+        if (state == 4) { cerr << "Too many fields (ignoring):\n  " << buf << endl; return; }
+        if (state == 2) {
+          // cerr << "FLEN=" << f->size() << " ELEN=" << e->size() << endl;
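+          // Both sentence fields have been read at this point (state has just
+          // advanced past the second |||), so the alignment grid and the span
+          // chart can be sized from f.size() and e.size().
+          AllocateForAlignment();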
+        }
+        SkipWhitespace(buf, &ptr);
+        start = ptr;
+        continue;
+      }
+      switch (state) {
+        case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+        case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+        case 2: ParseAlignmentPoint(buf, start, ptr); break;
+        case 3: ParseSpanLabel(buf, start, ptr); break;
+        default: cerr << "Can't happen\n"; abort();
+      }
+      SkipWhitespace(buf, &ptr);
+      start = ptr;
+    }
+  }
+  // handle the final token on the line, which has no trailing whitespace
+  if (ptr > start) {
+    switch (state) {
+      case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+      case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
+      case 2: ParseAlignmentPoint(buf, start, ptr); break;
+      case 3: ParseSpanLabel(buf, start, ptr); break;
+      default: cerr << "Can't happen\n"; abort();
+    }
+  }
+  if (state < 2) {
+    cerr << "Not enough fields: " << buf << endl;
+    abort();
+  }
+  if (e.empty() || f.empty()) {
+    cerr << "Sentences must not be empty: " << buf << endl;
+  }
+}
+
diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h
new file mode 100644
index 00000000..d78be359
--- /dev/null
+++ b/extools/sentence_pair.h
@@ -0,0 +1,39 @@
+#ifndef _SENTENCE_PAIR_H_
+#define _SENTENCE_PAIR_H_
+
+#include <utility>
+#include <vector>
+#include "wordid.h"
+#include "array2d.h"
+
+// Represents a parallel sentence with a word alignment and category
+// annotations over subspans (spans are currently indexed on the TARGET
+// side, e). Read one using ParseInputLine, then query the public
+// member variables.
+struct AnnotatedParallelSentence {
+  // read an annotated parallel sentence from a string
+  void ParseInputLine(const char* buf);
+
+  std::vector<WordID> f, e;  // words in f and e
+
+  // word alignment information
+  std::vector<int> e_aligned, f_aligned;  // counts how many times column/row x is aligned
+  Array2D<bool> aligned;
+  std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;
+
+  // span type information
+  Array2D<std::vector<WordID> > span_types;  // span_types(i,j) is the list of category
+                                             // types for the span (i,j) in the TARGET language.
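+  // Labels are stored negated (see ParseSpanLabel): given the test_data line
+  //   la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| ... 0-3:MID ...
+  // span_types(0,3) will hold -TD::Convert("MID").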
+
+  int f_len, e_len;
+
+  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b);
+
+ private:
+  void Reset();
+  void AllocateForAlignment();
+  void ParseAlignmentPoint(const char* buf, int start, int end);
+  void ParseSpanLabel(const char* buf, int start, int end);
+};
+
+#endif
diff --git a/extools/simple-extract-context.sh b/extools/simple-extract-context.sh
new file mode 100755
index 00000000..17487b1c
--- /dev/null
+++ b/extools/simple-extract-context.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+MYDIR=$(dirname "$0")
+
+export LANG=C
+date 1>&2
+$MYDIR/extractor -i "$1" -c 500000 -L 12 -C | sort -t $'\t' -k 1 | $MYDIR/mr_stripe_rule_reduce
+date 1>&2
+
diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh
new file mode 100755
index 00000000..7d9f439d
--- /dev/null
+++ b/extools/simple-extract.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+export LANG=C
+date
+./extractor -i "$1" -d X -c 500000 -L 12 -b | sort -t $'\t' -k 1 | gzip > ex.output.gz
+date
+# -p = compute phrase marginals
+# -b = the input contains bidirectional rules (stripes starting with F or E)
+gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
+date
+
diff --git a/extools/suffix_tree.h b/extools/suffix_tree.h
new file mode 100644
index 00000000..d3a9c08e
--- /dev/null
+++ b/extools/suffix_tree.h
@@ -0,0 +1,38 @@
+/*
+ * suffix_tree.h
+ *
+ *  Created on: May 17, 2010
+ *      Author: Vlad
+ */
+
+#ifndef SUFFIX_TREE_H_
+#define SUFFIX_TREE_H_
+
+#include <string>
+#include <map>
+#include <vector>
+
+template <class T>
+class Node {
+ public:
+  std::map<T, Node> edge_list_;
+  int InsertPath(const std::vector<T>& p, int start, int end);
+  // follow the edge labeled e, or return NULL if there is none
+  const Node* Extend(const T& e) const {
+    typename std::map<T, Node>::const_iterator it = edge_list_.find(e);
+    if (it == edge_list_.end()) return NULL;
+    return &it->second;
+  }
+};
+
+// static, so that including this header in more than one translation
+// unit does not produce duplicate-symbol link errors
+static bool DEBUG = false;
+
+template <class T>
+int Node<T>::InsertPath(const std::vector<T>& p, int start, int end) {
+  // walk (creating nodes as needed) the chain for p[start..end], inclusive
+  Node* currNode = this;
+  for (int i = start; i <= end; i++) {
+    currNode = &(currNode->edge_list_)[p[i]];
+  }
+  return 1;
+}
+
+#endif /* SUFFIX_TREE_H_ */
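A minimal sketch of how the Node trie above can be used; the word IDs here are
made up for illustration:

    #include <cstdio>
    #include <vector>
    #include "suffix_tree.h"

    int main() {
      Node<int> root;
      std::vector<int> phrase;
      phrase.push_back(12); phrase.push_back(7); phrase.push_back(93);
      root.InsertPath(phrase, 0, 2);          // inclusive range: inserts 12 -> 7 -> 93
      const Node<int>* n = root.Extend(12);   // follow the first symbol
      if (n) n = n->Extend(7);                // then the second
      printf("prefix 12 7 is %s\n", n ? "present" : "absent");
      return 0;
    }

Since Extend returns NULL when no edge matches, a chain of Extend calls tests
whether a phrase prefix has been inserted.

diff --git a/extools/test_data/README b/extools/test_data/README
new file mode 100644
index 00000000..e368cffc
--- /dev/null
+++ b/extools/test_data/README
@@ -0,0 +1,10 @@
+The following was used to create the test data. The real inputs
+were corpus.fr, corpus.en, and corpus.aligned. The generated files
+were corpus.len_cats and fr-en.al.len.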
+
+
+  ./make_len_cats.pl corpus.en > corpus.len_cats
+
+  ../merge_lines.pl corpus.fr corpus.en corpus.aligned corpus.len_cats > fr-en.al.len
+
+
diff --git a/extools/test_data/corpus.aligned b/extools/test_data/corpus.aligned
new file mode 100644
index 00000000..aa09e9ab
--- /dev/null
+++ b/extools/test_data/corpus.aligned
@@ -0,0 +1,5 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3
+0-0
+0-0 1-1
diff --git a/extools/test_data/corpus.en b/extools/test_data/corpus.en
new file mode 100644
index 00000000..2d4751bf
--- /dev/null
+++ b/extools/test_data/corpus.en
@@ -0,0 +1,5 @@
+the blue house
+the hat
+there is a hat
+cap
+the cat
diff --git a/extools/test_data/corpus.fr b/extools/test_data/corpus.fr
new file mode 100644
index 00000000..75b5e127
--- /dev/null
+++ b/extools/test_data/corpus.fr
@@ -0,0 +1,5 @@
+la maison bleue
+le chapeau
+il y a un chapeau
+chapeau
+le chat
diff --git a/extools/test_data/corpus.len_cats b/extools/test_data/corpus.len_cats
new file mode 100644
index 00000000..18d321de
--- /dev/null
+++ b/extools/test_data/corpus.len_cats
@@ -0,0 +1,5 @@
+0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
+0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+0-1:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/fr-en.al.len b/extools/test_data/fr-en.al.len
new file mode 100644
index 00000000..7ee6b85d
--- /dev/null
+++ b/extools/test_data/fr-en.al.len
@@ -0,0 +1,5 @@
+la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+le chapeau ||| the hat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
+il y a un chapeau ||| there is a hat ||| 0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+chapeau ||| cap ||| 0-0 ||| 0-1:SHORT
+le chat ||| the cat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/make_len_cats.pl b/extools/test_data/make_len_cats.pl
new file mode 100755
index 00000000..25ef75fa
--- /dev/null
+++ b/extools/test_data/make_len_cats.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $max_len = 15;
+my @cat_names = qw( NULL SHORT SHORT MID MID MID LONG LONG LONG LONG LONG VLONG VLONG VLONG VLONG VLONG );
+
+while (<>) {
+  chomp;
+  my @words = split /\s+/;
+  my $len = scalar @words;
+  my @spans;
+  for (my $i = 0; $i < $len; $i++) {
+    for (my $k = 1; $k <= $max_len; $k++) {
+      my $j = $i + $k;
+      next if ($j > $len);
+      my $cat = $cat_names[$k];
+      die unless $cat;
+      push @spans, "$i-$j:$cat";
+    }
+  }
+  print "@spans\n";
+}
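As a quick sanity check of make_len_cats.pl: a span of length k is labeled
cat_names[k], so lengths 1-2 map to SHORT, 3-5 to MID, 6-10 to LONG, and 11-15
to VLONG. On the three-word line "the blue house" the script therefore emits
exactly the first line of corpus.len_cats above:

    0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT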