author     Chris Dyer <cdyer@cs.cmu.edu>  2012-08-12 23:33:21 -0400
committer  Chris Dyer <cdyer@cs.cmu.edu>  2012-08-12 23:33:21 -0400
commit     b6e70b420ed993ee73f71058d04b382147896068 (patch)
tree       f46854c2cf037df209ca5dc9064a67c35db25acd /extools
parent     8294cf0992815ffd791a5ef9e74f32fea6efeb9a (diff)
use new union api
Diffstat (limited to 'extools')
-rw-r--r--   extools/Makefile.am                   30
-rw-r--r--   extools/README                        32
-rwxr-xr-x   extools/coarsen_grammar.pl           133
-rw-r--r--   extools/extract.cc                   336
-rw-r--r--   extools/extract.h                     94
-rw-r--r--   extools/extractor.cc                 439
-rw-r--r--   extools/extractor_monolingual.cc     256
-rw-r--r--   extools/featurize_grammar.cc         716
-rw-r--r--   extools/filter_grammar.cc            135
-rw-r--r--   extools/lex_trans_tbl.h               25
-rwxr-xr-x   extools/merge_lines.pl                43
-rw-r--r--   extools/mr_stripe_rule_reduce.cc     172
-rw-r--r--   extools/score_grammar.cc             352
-rw-r--r--   extools/sentence_pair.cc             198
-rw-r--r--   extools/sentence_pair.h               43
-rw-r--r--   extools/sg_lexer.l                   294
-rwxr-xr-x   extools/simple-extract-context.sh      9
-rwxr-xr-x   extools/simple-extract.sh             11
-rw-r--r--   extools/striped_grammar.cc            67
-rw-r--r--   extools/striped_grammar.h             56
-rw-r--r--   extools/suffix_tree.h                 46
-rw-r--r--   extools/test_data/README              10
-rw-r--r--   extools/test_data/corpus.aligned       5
-rw-r--r--   extools/test_data/corpus.en            5
-rw-r--r--   extools/test_data/corpus.fr            5
-rw-r--r--   extools/test_data/corpus.len_cats      5
-rw-r--r--   extools/test_data/fr-en.al.len         5
-rwxr-xr-x   extools/test_data/make_len_cats.pl    23
28 files changed, 0 insertions, 3545 deletions
diff --git a/extools/Makefile.am b/extools/Makefile.am
deleted file mode 100644
index ee363264..00000000
--- a/extools/Makefile.am
+++ /dev/null
@@ -1,30 +0,0 @@
-bin_PROGRAMS = \
- extractor \
- mr_stripe_rule_reduce \
- filter_grammar \
- featurize_grammar \
- extractor_monolingual
-
-noinst_PROGRAMS =
-
-sg_lexer.cc: sg_lexer.l
- $(LEX) -s -CF -8 -o$@ $<
-
-filter_grammar_SOURCES = filter_grammar.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc
-filter_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz
-#filter_grammar_LDFLAGS = -all-static
-
-featurize_grammar_SOURCES = featurize_grammar.cc extract.cc sentence_pair.cc sg_lexer.cc striped_grammar.cc
-featurize_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc
-mr_stripe_rule_reduce_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc striped_grammar.cc
-extractor_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-extractor_monolingual_SOURCES = extractor_monolingual.cc
-extractor_monolingual_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils
-
diff --git a/extools/README b/extools/README
deleted file mode 100644
index af91ce79..00000000
--- a/extools/README
+++ /dev/null
@@ -1,32 +0,0 @@
-
-Categories have the format i-j:CAT where i and j are the indices of the spaces
-between words in the TARGET language. For example, slash categories can be written:
-
- the blue house
- 0-1:DT 1-2:JJ 2-3:NN 1-3:NBAR 0-2:NP/NN 0-3:NP
-
-
-You may label each span multiple times, e.g.
-
- NP
- |
- NBAR
- |
- NN
- |
- John
- 0-1:NP 0-1:NBAR 0-1:NN
-
-However, this may result in a very large number of rules being extracted.
-
-
-****
-* Filtering and Scoring of Unscored and Unfiltered Grammars
-****
-
-Take the unfiltered grammar, and a test set, and run:
-./filter_grammar <test set> < unfiltered.grammar > filtered.grammar
-
-Then, to score the new filtered grammar, run:
-./score_grammar <alignment> < filtered.grammar > scored.grammar
-
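As a quick illustration of the convention above: the indices in i-j:CAT count the gaps between target words, so a span i-j covers the words between gap i and gap j. A minimal C++ sketch (illustrative only, not part of extools):

    // Span 0-2 over "the blue house" covers "the blue" (the NP/NN
    // example from the README). Gap 0 precedes the first word.
    #include <iostream>

    int main() {
      const char* target[] = {"the", "blue", "house"};
      const int i = 0, j = 2;            // the 0-2:NP/NN span
      for (int k = i; k < j; ++k)
        std::cout << target[k] << ' ';   // prints: the blue
      std::cout << '\n';
      return 0;
    }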
diff --git a/extools/coarsen_grammar.pl b/extools/coarsen_grammar.pl
deleted file mode 100755
index f2dd6689..00000000
--- a/extools/coarsen_grammar.pl
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/perl
-
-# dumb grammar coarsener that maps every nonterminal to X (except S).
-
-use strict;
-
-unless (@ARGV > 1){
- die "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
-}
-my $weight_file = shift @ARGV;
-
-$ENV{"LC_ALL"} = "C";
-local(*GRAMMAR, *OUT_GRAMMAR, *WEIGHTS);
-
-my %weights;
-unless (open(WEIGHTS, $weight_file)) {die "Could not open weight file $weight_file\n" }
-while (<WEIGHTS>){
- if (/(.+) (.+)$/){
- $weights{$1} = $2;
- }
-}
-close(WEIGHTS);
-unless (keys(%weights)){
- die "Could not find any PhraseModel features in weight file (perhaps you specified the wrong file?)\n\n".
- "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
-}
-
-sub cleanup_and_die;
-$SIG{INT} = "cleanup_and_die";
-$SIG{TERM} = "cleanup_and_die";
-$SIG{HUP} = "cleanup_and_die";
-
-open(OUT_GRAMMAR, ">grammar.tmp");
-while (my $grammar_file = shift @ARGV){
- unless (open(GRAMMAR, $grammar_file)) {die "Could not open grammar file $grammar_file\n"}
- while (<GRAMMAR>){
- if (/^((.*\|{3}){3})(.*)$/){
- my $rule = $1;
- my $rest = $3;
- my $coarse_rule = $rule;
- $coarse_rule =~ s/\[X[^\],]*/[X/g;
- print OUT_GRAMMAR "$coarse_rule $rule $rest\n";
- } else {
- die "Unrecognized rule format: $_\n";
- }
- }
- close(GRAMMAR);
-}
-close(OUT_GRAMMAR);
-
-`sort grammar.tmp > grammar.tmp.sorted`;
-sub dump_rules;
-sub compute_score;
-unless (open(GRAMMAR, "grammar.tmp.sorted")){ die "Something went wrong; could not open intermediate file grammar.tmp.sorted\n"};
-my $prev_coarse_rule = "";
-my $best_features = "";
-my $best_score = 0;
-my @rules = ();
-while (<GRAMMAR>){
- if (/^\s*((\S.*\|{3}\s*){3})((\S.*\|{3}\s*){3})(.*)$/){
- my $coarse_rule = $1;
- my $fine_rule = $3;
- my $features = $5; # This code does not correctly handle rules with other info (e.g. alignments)
- if ($coarse_rule eq $prev_coarse_rule){
- my $score = compute_score($features, %weights);
- if ($score > $best_score){
- $best_score = $score;
- $best_features = $features;
- }
- } else {
- dump_rules($prev_coarse_rule, $best_features, @rules);
- $prev_coarse_rule = $coarse_rule;
- $best_features = $features;
- $best_score = compute_score($features, %weights);
- @rules = ();
- }
- push(@rules, "$fine_rule$features\n");
- } else {
- die "Something went wrong during grammar projection: $_\n";
- }
-}
-dump_rules($prev_coarse_rule, $best_features, @rules);
-close(GRAMMAR);
-cleanup();
-
-sub compute_score {
- my($features, %weights) = @_;
- my $score = 0;
- if ($features =~ s/^\s*(\S.*\S)\s*$/$1/) {
- my @features = split(/\s+/, $features);
- my $pm=0;
- for my $feature (@features) {
- my $feature_name;
- my $feature_val;
- if ($feature =~ /(.*)=(.*)/){
- $feature_name = $1;
- $feature_val= $2;
- } else {
- $feature_name = "PhraseModel_" . $pm;
- $feature_val= $feature;
- }
- $pm++;
- if ($weights{$feature_name}){
- $score += $weights{$feature_name} * $feature_val;
- }
- }
- } else {
- die "Unexpected feature value format: $features\n";
- }
- return $score;
-}
-
-sub dump_rules {
- my($coarse_rule, $coarse_rule_scores, @fine_rules) = @_;
- unless($coarse_rule){ return; }
- print "$coarse_rule $coarse_rule_scores\n";
- for my $rule (@fine_rules){
- print "\t$rule";
- }
-}
-
-sub cleanup_and_die {
- cleanup();
- die "\n";
-}
-
-sub cleanup {
- `rm -rf grammar.tmp grammar.tmp.sorted`;
-}
-
-
-
-
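For reference, the best-scoring fine rule above is chosen by a sparse dot product between each rule's features and the weight file, with bare (unnamed) feature values assigned the positional names PhraseModel_0, PhraseModel_1, ... A C++ sketch of the same scoring logic, assuming the feature field has already been tokenized; ComputeScore and all names here are illustrative, not extools API:

    #include <cstdlib>
    #include <map>
    #include <sstream>
    #include <string>
    #include <vector>

    double ComputeScore(const std::vector<std::string>& features,
                        const std::map<std::string, double>& weights) {
      double score = 0.0;
      int pm = 0;  // positional index for unnamed features
      for (size_t i = 0; i < features.size(); ++i, ++pm) {
        std::string name, value;
        const std::string::size_type eq = features[i].find('=');
        if (eq != std::string::npos) {   // explicit "Name=value"
          name = features[i].substr(0, eq);
          value = features[i].substr(eq + 1);
        } else {                         // bare value -> PhraseModel_<pm>
          std::ostringstream os;
          os << "PhraseModel_" << pm;
          name = os.str();
          value = features[i];
        }
        std::map<std::string, double>::const_iterator w = weights.find(name);
        if (w != weights.end())
          score += w->second * std::atof(value.c_str());
      }
      return score;
    }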
diff --git a/extools/extract.cc b/extools/extract.cc
deleted file mode 100644
index 49542fed..00000000
--- a/extools/extract.cc
+++ /dev/null
@@ -1,336 +0,0 @@
-#include "extract.h"
-
-#include <queue>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-#include <set>
-#include <cstdlib>  // for abort()
-#include <boost/tuple/tuple_comparison.hpp>
-
-#include <boost/functional/hash.hpp>
-#include <boost/tuple/tuple.hpp>
-
-#include "sentence_pair.h"
-#include "tdict.h"
-#include "wordid.h"
-#include "array2d.h"
-
-using namespace std;
-using namespace boost;
-using std::tr1::unordered_map;
-using boost::tuple;
-
-namespace {
- inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
-
- inline void SkipWhitespace(const char* buf, int* ptr) {
- while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
- }
-}
-
-Extract::RuleObserver::~RuleObserver() {
- cerr << "Rules extracted: " << count << endl;
-}
-
-void Extract::ExtractBasePhrases(const int max_base_phrase_size,
- const AnnotatedParallelSentence& sentence,
- vector<ParallelSpan>* phrases) {
- phrases->clear();
-
- vector<pair<int,int> > f_spans(sentence.f_len, pair<int,int>(sentence.e_len, 0));
- vector<pair<int,int> > e_spans(sentence.e_len, pair<int,int>(sentence.f_len, 0));
- // for each alignment point in e, precompute the minimal consistent phrases in f
- // for each alignment point in f, precompute the minimal consistent phrases in e
- for (int i = 0; i < sentence.f_len; ++i) {
- for (int j = 0; j < sentence.e_len; ++j) {
- if (sentence.aligned(i,j)) {
- if (j < f_spans[i].first) f_spans[i].first = j;
- f_spans[i].second = j+1;
- if (i < e_spans[j].first) e_spans[j].first = i;
- e_spans[j].second = i+1;
- }
- }
- }
-
- for (int i1 = 0; i1 < sentence.f_len; ++i1) {
- if (sentence.f_aligned[i1] == 0) continue;
- int j1 = sentence.e_len;
- int j2 = 0;
- const int i_limit = min(sentence.f_len, i1 + max_base_phrase_size);
- for (int i2 = i1 + 1; i2 <= i_limit; ++i2) {
- if (sentence.f_aligned[i2-1] == 0) continue;
- // cerr << "F has aligned span " << i1 << " to " << i2 << endl;
- j1 = min(j1, f_spans[i2-1].first);
- j2 = max(j2, f_spans[i2-1].second);
- if (j1 >= j2) continue;
- if (j2 - j1 > max_base_phrase_size) continue;
- int condition = 0;
- for (int j = j1; j < j2; ++j) {
- if (e_spans[j].first < i1) { condition = 1; break; }
- if (e_spans[j].second > i2) { condition = 2; break; }
- }
- if (condition == 1) break;
- if (condition == 2) continue;
- // category types added later!
- phrases->push_back(ParallelSpan(i1, i2, j1, j2));
- // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl;
- }
- }
-}
-
-void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
- const int max_base_phrase_size,
- vector<ParallelSpan>* phrases) {
- const int num_phrases = phrases->size();
- map<int, map<int, map<int, map<int, bool> > > > marker;
- for (int i = 0; i < num_phrases; ++i) {
- const ParallelSpan& cur = (*phrases)[i];
- marker[cur.i1][cur.i2][cur.j1][cur.j2] = true;
- }
- for (int i = 0; i < num_phrases; ++i) {
- const ParallelSpan& cur = (*phrases)[i];
- const int i1_max = cur.i1;
- const int i2_min = cur.i2;
- const int j1_max = cur.j1;
- const int j2_min = cur.j2;
- int i1_min = i1_max;
- while (i1_min > 0 && sentence.f_aligned[i1_min-1] == 0) { --i1_min; }
- int j1_min = j1_max;
- while (j1_min > 0 && sentence.e_aligned[j1_min-1] == 0) { --j1_min; }
- int i2_max = i2_min;
- while (i2_max < sentence.f_len && sentence.f_aligned[i2_max] == 0) { ++i2_max; }
- int j2_max = j2_min;
- while (j2_max < sentence.e_len && sentence.e_aligned[j2_max] == 0) { ++j2_max; }
- for (int i1 = i1_min; i1 <= i1_max; ++i1) {
- const int ilim = min(i2_max, i1 + max_base_phrase_size);
- for (int i2 = max(i1+1,i2_min); i2 <= ilim; ++i2) {
- for (int j1 = j1_min; j1 <= j1_max; ++j1) {
- const int jlim = std::min(j2_max, j1 + max_base_phrase_size);
- for (int j2 = std::max(j1+1, j2_min); j2 <= jlim; ++j2) {
- bool& seen = marker[i1][i2][j1][j2];
- if (!seen)
- phrases->push_back(ParallelSpan(i1,i2,j1,j2));
- seen = true;
- }
- }
- }
- }
- }
-}
-
-template <typename K, typename V>
-void
-lookup_and_append(const map<K, V> &dict, const K &key, V &output)
-{
- typename map<K, V>::const_iterator found = dict.find(key);
- if (found != dict.end())
- copy(found->second.begin(), found->second.end(), back_inserter(output));
-}
-
-// this uses the TARGET span (i,j) to annotate phrases, will copy
-// phrases if there is more than one annotation.
-// TODO: support source annotation
-void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
- const map< boost::tuple<short,short,short,short>, vector<WordID> > &types,
- vector<ParallelSpan>* phrases) {
- const int num_unannotated_phrases = phrases->size();
- // have to use num_unannotated_phrases since we may grow the vector
- for (int i = 0; i < num_unannotated_phrases; ++i) {
- ParallelSpan& phrase = (*phrases)[i];
- vector<WordID> cats;
- lookup_and_append(types, boost::make_tuple(phrase.i1, phrase.i2, phrase.j1, phrase.j2), cats);
- lookup_and_append(types, boost::make_tuple((short)-1, (short)-1, phrase.j1, phrase.j2), cats);
- lookup_and_append(types, boost::make_tuple(phrase.i1, phrase.i2, (short)-1, (short)-1), cats);
- if (cats.empty() && default_cat != 0) {
- cats = vector<WordID>(1, default_cat);
- }
-    if (cats.empty()) {
-      cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-"
-           << phrase.j1 << "," << phrase.j2 << " has no type. "
-              "Did you forget --default_category?\n";
-      abort();  // phrase.cat = cats[0] below would read past the end
-    }
- phrase.cat = cats[0];
- for (int ci = 1; ci < cats.size(); ++ci) {
- ParallelSpan new_phrase = phrase;
- new_phrase.cat = cats[ci];
- phrases->push_back(new_phrase);
- }
- }
-}
-
-// the partially complete f-side of a rule
-struct RuleItem {
- vector<ParallelSpan> f;
- int i,j,syms,vars;
- explicit RuleItem(int pi) : i(pi), j(pi), syms(), vars() {}
- void Extend(const WordID& fword) {
- f.push_back(ParallelSpan(fword));
- ++j;
- ++syms;
- }
- void Extend(const ParallelSpan& subphrase) {
- f.push_back(subphrase);
- j += subphrase.i2 - subphrase.i1;
- ++vars;
- ++syms;
- }
- bool RuleFEndsInVariable() const {
- if (f.size() > 0) {
- return f.back().IsVariable();
- } else { return false; }
- }
-};
-
-void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
- const vector<ParallelSpan>& phrases,
- const int max_vars,
- const int max_syms,
- const bool permit_adjacent_nonterminals,
- const bool require_aligned_terminal,
- RuleObserver* observer,
- vector<WordID>* all_cats) {
- const char bkoff_mrkr = '_';
- queue<RuleItem> q; // agenda for BFS
- int max_len = -1;
- unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans;
- vector<vector<ParallelSpan> > spans_by_start(sentence.f_len);
- set<int> starts;
- WordID bkoff;
- for (int i = 0; i < phrases.size(); ++i) {
- fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]);
- max_len = max(max_len, phrases[i].i2 - phrases[i].i1);
- // have we already added a rule item starting at phrases[i].i1?
- if (starts.insert(phrases[i].i1).second)
- q.push(RuleItem(phrases[i].i1));
- spans_by_start[phrases[i].i1].push_back(phrases[i]);
- }
- starts.clear();
- vector<pair<int,int> > next_e(sentence.e_len);
- vector<WordID> cur_rhs_f, cur_rhs_e;
- vector<pair<short, short> > cur_terminal_align;
- vector<int> cur_es, cur_fs;
- while(!q.empty()) {
- const RuleItem& rule = q.front();
-
- // extend the partial rule
- if (rule.j < sentence.f_len && (rule.j - rule.i) < max_len && rule.syms < max_syms) {
- RuleItem ew = rule;
-
- // extend with a word
- ew.Extend(sentence.f[ew.j]);
- q.push(ew);
-
- // with variables
- if (rule.vars < max_vars &&
- !spans_by_start[rule.j].empty() &&
- ((!rule.RuleFEndsInVariable()) || permit_adjacent_nonterminals)) {
- const vector<ParallelSpan>& sub_phrases = spans_by_start[rule.j];
- for (int it = 0; it < sub_phrases.size(); ++it) {
- if (sub_phrases[it].i2 - sub_phrases[it].i1 + rule.j - rule.i <= max_len) {
- RuleItem ev = rule;
- ev.Extend(sub_phrases[it]);
- q.push(ev);
- assert(ev.j <= sentence.f_len);
- }
- }
- }
- }
- // determine if rule is consistent
- if (rule.syms > 0 &&
- fspans.count(make_pair(rule.i,rule.j)) &&
- (!rule.RuleFEndsInVariable() || rule.syms > 1)) {
- const vector<ParallelSpan>& orig_spans = fspans[make_pair(rule.i,rule.j)];
- for (int s = 0; s < orig_spans.size(); ++s) {
- const ParallelSpan& orig_span = orig_spans[s];
- const WordID lhs = orig_span.cat;
- for (int j = orig_span.j1; j < orig_span.j2; ++j) next_e[j].first = -1;
- int nt_index_e = 0;
- for (int i = 0; i < rule.f.size(); ++i) {
- const ParallelSpan& cur = rule.f[i];
- if (cur.IsVariable())
- next_e[cur.j1] = pair<int,int>(cur.j2, ++nt_index_e);
- }
- cur_rhs_f.clear();
- cur_rhs_e.clear();
- cur_terminal_align.clear();
- cur_fs.clear();
- cur_es.clear();
-
- const int elen = orig_span.j2 - orig_span.j1;
- vector<int> isvar(elen, 0);
- int fbias = rule.i;
- bool bad_rule = false;
- bool has_aligned_terminal = false;
- for (int i = 0; i < rule.f.size(); ++i) {
- const ParallelSpan& cur = rule.f[i];
- cur_rhs_f.push_back(cur.cat);
- if (cur.cat > 0) { // terminal
- if (sentence.f_aligned[fbias + i]) has_aligned_terminal = true;
- cur_fs.push_back(fbias + i);
- } else { // non-terminal
- int subj1 = cur.j1 - orig_span.j1;
- int subj2 = cur.j2 - orig_span.j1;
- if (subj1 < 0 || subj2 > elen) { bad_rule = true; break; }
- for (int j = subj1; j < subj2 && !bad_rule; ++j) {
- int& isvarj = isvar[j];
- isvarj = true;
- }
- if (bad_rule) break;
- cur_fs.push_back(-1);
- fbias += cur.i2 - cur.i1 - 1;
- }
- }
- if (require_aligned_terminal && !has_aligned_terminal) bad_rule = true;
- if (!bad_rule) {
- for (int j = orig_span.j1; j < orig_span.j2; ++j) {
- if (next_e[j].first < 0) {
- cur_rhs_e.push_back(sentence.e[j]);
- cur_es.push_back(j);
- } else {
- cur_rhs_e.push_back(1 - next_e[j].second); // next_e[j].second is NT gap index
- cur_es.push_back(-1);
- j = next_e[j].first - 1;
- }
- }
- for (short i = 0; i < cur_fs.size(); ++i)
- if (cur_fs[i] >= 0)
- for (short j = 0; j < cur_es.size(); ++j)
- if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
- cur_terminal_align.push_back(make_pair(i,j));
- //observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
-
- if(!all_cats->empty()) {
- //produce the backoff grammar if the category wordIDs are available
- for (int i = 0; i < cur_rhs_f.size(); ++i) {
- if(cur_rhs_f[i] < 0) {
- //cerr << cur_rhs_f[i] << ": (cats,f) |" << TD::Convert(-cur_rhs_f[i]) << endl;
- string nonterm = TD::Convert(-cur_rhs_f[i]);
- nonterm+=bkoff_mrkr;
- bkoff = -TD::Convert(nonterm);
- cur_rhs_f[i]=bkoff;
- /*vector<WordID> rhs_f_bkoff;
- vector<WordID> rhs_e_bkoff;
- vector<pair<short,short> > bkoff_align;
- bkoff_align.clear();
- bkoff_align.push_back(make_pair(0,0));
-
- for (int cat = 0; cat < all_cats->size(); ++cat) {
- rhs_f_bkoff.clear();
- rhs_e_bkoff.clear();
- rhs_f_bkoff.push_back(-(*all_cats)[cat]);
- rhs_e_bkoff.push_back(0);
- observer->CountRule(bkoff,rhs_f_bkoff,rhs_e_bkoff,bkoff_align);
-
- }*/
- }
- }
-
- }
- observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
- }
- }
- }
- q.pop();
- }
-}
-
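The base-phrase loop above implements the standard phrase-pair consistency test: project an f-span onto the e side through the alignment, then discard the pair if any covered e-word aligns back outside the f-span (realized above via the precomputed minimal spans and the condition-1/condition-2 checks). A self-contained sketch on a hard-coded alignment grid; every value here is made up for illustration:

    #include <iostream>

    // Toy alignment: f0-e0, f1-e1, f1-e2; f2 is unaligned.
    int main() {
      const int F = 3, E = 3;
      const bool a[F][E] = {{true,  false, false},
                            {false, true,  true },
                            {false, false, false}};
      const int i1 = 1, i2 = 2;          // candidate f-span [i1,i2)
      int j1 = E, j2 = 0;                // project the span onto e
      for (int i = i1; i < i2; ++i)
        for (int j = 0; j < E; ++j)
          if (a[i][j]) { if (j < j1) j1 = j; if (j + 1 > j2) j2 = j + 1; }
      bool consistent = j1 < j2;         // needs at least one alignment point
      for (int j = j1; j < j2 && consistent; ++j)
        for (int i = 0; i < F; ++i)
          if (a[i][j] && (i < i1 || i >= i2))
            consistent = false;          // an alignment link escapes the box
      std::cout << "f-span [" << i1 << ',' << i2 << ") -> e-span ["
                << j1 << ',' << j2 << "): "
                << (consistent ? "consistent" : "inconsistent") << '\n';
      return 0;  // prints: f-span [1,2) -> e-span [1,3): consistent
    }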
diff --git a/extools/extract.h b/extools/extract.h
deleted file mode 100644
index e9ea5e65..00000000
--- a/extools/extract.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef _EXTRACT_H_
-#define _EXTRACT_H_
-
-#include <iostream>
-#include <utility>
-#include <vector>
-#include <boost/tuple/tuple.hpp>
-#include "array2d.h"
-#include "wordid.h"
-#include "sparse_vector.h"
-
-struct AnnotatedParallelSentence;
-
-// Usually represents a consistent phrase, which may
-// be annotated with a type (cat).
-// Inside the rule extractor, this class is also used to represent a word
-// in a partial rule.
-struct ParallelSpan {
- // i1 = i of f side
- // i2 = j of f side
- // j1 = i of e side
- // j2 = j of e side
- short i1,i2,j1,j2;
- // cat is set by AnnotatePhrasesWithCategoryTypes, otherwise it's 0
- WordID cat; // category type of span (also overloaded by RuleItem class
- // to be a word ID)
- ParallelSpan() : i1(-1), i2(-1), j1(-1), j2(-1), cat() {}
- // used by Rule class to represent a terminal symbol:
- explicit ParallelSpan(WordID w) : i1(-1), i2(-1), j1(-1), j2(-1), cat(w) {}
- ParallelSpan(int pi1, int pi2, int pj1, int pj2) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat() {}
- ParallelSpan(int pi1, int pi2, int pj1, int pj2, WordID c) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat(c) {}
-
- // ParallelSpan is used in the Rule class where it is
- // overloaded to also represent terminal symbols
- inline bool IsVariable() const { return i1 != -1; }
-};
-
-// Rule extraction logic lives here. This struct has no data; it just
-// has static member functions.
-struct Extract {
-  // RuleObserver's CountRule is called for each rule extracted.
-  // Implement CountRuleImpl to do things like count the rules,
- // write them to a file, etc.
- struct RuleObserver {
- RuleObserver() : count() {}
- virtual void CountRule(WordID lhs,
- const std::vector<WordID>& rhs_f,
- const std::vector<WordID>& rhs_e,
- const std::vector<std::pair<short, short> >& fe_terminal_alignments) {
- ++count;
- CountRuleImpl(lhs, rhs_f, rhs_e, fe_terminal_alignments);
- }
- virtual ~RuleObserver();
-
- protected:
- virtual void CountRuleImpl(WordID lhs,
- const std::vector<WordID>& rhs_f,
- const std::vector<WordID>& rhs_e,
- const std::vector<std::pair<short, short> >& fe_terminal_alignments) = 0;
- private:
- int count;
- };
-
- // given a set of "tight" phrases and the aligned sentence they were
- // extracted from, "loosen" them
- static void LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
- const int max_base_phrase_size,
- std::vector<ParallelSpan>* phrases);
-
- // extract all consistent phrase pairs, up to size max_base_phrase_size
- // (on the source side). these phrases will be "tight".
- static void ExtractBasePhrases(const int max_base_phrase_size,
- const AnnotatedParallelSentence& sentence,
- std::vector<ParallelSpan>* phrases);
-
- // this uses the TARGET span (i,j) to annotate phrases, will copy
- // phrases if there is more than one annotation.
- static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
- const std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > &types,
- std::vector<ParallelSpan>* phrases);
-
- // use the Chiang (2007) extraction logic to extract consistent subphrases
- // observer->CountRule is called once for each rule extracted
- static void ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
- const std::vector<ParallelSpan>& phrases,
- const int max_vars,
- const int max_syms,
- const bool permit_adjacent_nonterminals,
- const bool require_aligned_terminal,
- RuleObserver* observer,
- std::vector<WordID>* all_cats);
-};
-
-#endif
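A minimal RuleObserver against the interface above shows the intended extension point: subclasses override only CountRuleImpl, while the public CountRule maintains the extraction count. Apart from the extract.h API itself, everything below (class name, output) is illustrative; SimpleRuleWriter in extractor.cc is the codebase's own implementation:

    #include <iostream>
    #include <utility>
    #include <vector>
    #include "extract.h"

    // Prints the arity (number of nonterminals) of each extracted rule.
    struct ArityPrinter : public Extract::RuleObserver {
     protected:
      virtual void CountRuleImpl(WordID lhs,
                                 const std::vector<WordID>& rhs_f,
                                 const std::vector<WordID>& rhs_e,
                                 const std::vector<std::pair<short, short> >& fe_terminal_alignments) {
        (void) lhs; (void) rhs_e; (void) fe_terminal_alignments;
        int arity = 0;
        for (size_t i = 0; i < rhs_f.size(); ++i)
          if (rhs_f[i] < 0) ++arity;  // f-side nonterminals have negative IDs
        std::cerr << "rule with " << arity << " nonterminal(s)\n";
      }
    };

An instance would be passed as the observer argument to Extract::ExtractConsistentRules, which calls CountRule once per extracted rule.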
diff --git a/extools/extractor.cc b/extools/extractor.cc
deleted file mode 100644
index 1e4154ef..00000000
--- a/extools/extractor.cc
+++ /dev/null
@@ -1,439 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/lexical_cast.hpp>
-
-#include "sparse_vector.h"
-#include "sentence_pair.h"
-#include "extract.h"
-#include "tdict.h"
-#include "fdict.h"
-#include "wordid.h"
-#include "array2d.h"
-#include "filelib.h"
-#include "striped_grammar.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static const size_t MAX_LINE_LENGTH = 100000;
-WordID kBOS, kEOS, kDIVIDER, kGAP, kSPLIT;
-int kCOUNT;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i", po::value<string>()->default_value("-"), "Input file")
- ("default_category,d", po::value<string>(), "Default span type (use X for 'Hiero')")
- ("x_cdyer_pos,x", "Extract monolingual POS contexts (cdyer experimental)")
- ("loose", "Use loose phrase extraction heuristic for base phrases")
- ("base_phrase,B", "Write base phrases")
- ("base_phrase_spans", "Write base sentences and phrase spans")
- ("phrase_language", po::value<string>()->default_value("target"), "Extract phrase strings in source, target or both languages")
- ("context_language", po::value<string>()->default_value("target"), "Extract context strings in source, target or both languages")
- ("bidir,b", "Extract bidirectional rules (for computing p(f|e) in addition to p(e|f))")
- ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.")
- ("silent", "Write nothing to stderr except errors")
- ("phrase_context,C", "Write base phrase contexts")
- ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and right when writing base phrase contexts")
- ("max_base_phrase_size,L", po::value<int>()->default_value(10), "Maximum starting phrase size")
- ("max_syms,l", po::value<int>()->default_value(5), "Maximum number of symbols in the final phrase")
- ("max_vars,v", po::value<int>()->default_value(2), "Maximum number of nonterminal variables in the final phrase")
- ("permit_adjacent_nonterminals,A", "Permit adjacent nonterminals in source side of rules")
- ("no_required_aligned_terminal,n", "Do not require an aligned terminal")
- ("topics,t", po::value<int>()->default_value(50), "Number of categories assigned during clustering")
- ("backoff,g","Produce a backoff grammar")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("input") == 0) {
- cerr << "\nUsage: extractor [-options]\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-// TODO how to handle alignment information?
-void WriteBasePhrases(const AnnotatedParallelSentence& sentence,
- const vector<ParallelSpan>& phrases) {
- vector<WordID> e,f;
- for (int it = 0; it < phrases.size(); ++it) {
- const ParallelSpan& phrase = phrases[it];
- e.clear();
- f.clear();
- for (int i = phrase.i1; i < phrase.i2; ++i)
- f.push_back(sentence.f[i]);
- for (int j = phrase.j1; j < phrase.j2; ++j)
- e.push_back(sentence.e[j]);
- cout << TD::GetString(f) << " ||| " << TD::GetString(e) << endl;
- }
-}
-
-void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence,
- const vector<ParallelSpan>& phrases) {
- cout << TD::GetString(sentence.f) << " ||| " << TD::GetString(sentence.e) << " |||";
- for (int it = 0; it < phrases.size(); ++it) {
- const ParallelSpan& phrase = phrases[it];
- cout << " " << phrase.i1 << "-" << phrase.i2
- << "-" << phrase.j1 << "-" << phrase.j2;
- }
- cout << endl;
-}
-
-struct CountCombiner {
- CountCombiner(const size_t& csize) : combiner_size(csize) {
- if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; }
- }
- ~CountCombiner() {
- if (!cache.empty()) WriteAndClearCache();
- }
-
- void Count(const vector<WordID>& key,
- const vector<WordID>& val,
- const int count_type,
- const vector<pair<short,short> >& aligns) {
- if (combiner_size != 1) {
- RuleStatistics& v = cache[key][val];
- float newcount = v.counts.add_value(count_type, 1.0f);
- // hack for adding alignments
- if (newcount < 7.0f && aligns.size() > v.aligns.size())
- v.aligns = aligns;
- if (combiner_size > 1 && cache.size() > combiner_size)
- WriteAndClearCache();
- } else {
- cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| ";
- cout << RuleStatistics(count_type, 1.0f, aligns) << endl;
- }
- }
-
- private:
- void WriteAndClearCache() {
- for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin();
- it != cache.end(); ++it) {
- cout << TD::GetString(it->first) << '\t';
- const Vec2PhraseCount& vals = it->second;
- bool needdiv = false;
- for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) {
- if (needdiv) cout << " ||| "; else needdiv = true;
- cout << TD::GetString(vi->first) << " ||| " << vi->second;
- }
- cout << endl;
- }
- cache.clear();
- }
-
- const size_t combiner_size;
- typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > Vec2PhraseCount;
- unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache;
-};
-
-// TODO optional source context
-// output <k, v> : k = phrase "document" v = context "term"
-void WritePhraseContexts(const AnnotatedParallelSentence& sentence,
- const vector<ParallelSpan>& phrases,
- const int ctx_size,
- bool phrase_s, bool phrase_t,
- bool context_s, bool context_t,
- CountCombiner* o) {
- vector<WordID> context, context_f;
- if (context_t)
- {
- context.resize(ctx_size * 2 + 1);
- context[ctx_size] = kGAP;
- }
- if (context_s)
- {
- context_f.resize(ctx_size * 2 + 1);
- context_f[ctx_size] = kGAP;
- }
- vector<WordID> key, key_f;
- if (phrase_t) key.reserve(100);
- if (phrase_s) key_f.reserve(100);
-
- for (int it = 0; it < phrases.size(); ++it) {
- const ParallelSpan& phrase = phrases[it];
-
- key.clear();
- for (int j = phrase.j1; j < phrase.j2 && phrase_t; ++j)
- key.push_back(sentence.e[j]);
-
- if (context_t)
- {
- context.resize(ctx_size * 2 + 1);
- for (int i = 0; i < ctx_size && context_t; ++i) {
- int epos = phrase.j1 - 1 - i;
- const WordID left_ctx = (epos < 0) ? kBOS : sentence.e[epos];
- context[ctx_size - i - 1] = left_ctx;
- epos = phrase.j2 + i;
- const WordID right_ctx = (epos >= sentence.e_len) ? kEOS : sentence.e[epos];
- context[ctx_size + i + 1] = right_ctx;
- }
- }
- else
- context.clear();
-
- if (phrase_s)
- {
- key_f.clear();
- for (int i = phrase.i1; i < phrase.i2; ++i)
- key_f.push_back(sentence.f[i]);
- if (phrase_t) key.push_back(kSPLIT);
- copy(key_f.begin(), key_f.end(), back_inserter(key));
- }
-
- if (context_s)
- {
- for (int i = 0; i < ctx_size; ++i) {
- int fpos = phrase.i1 - 1 - i;
- const WordID left_ctx = (fpos < 0) ? kBOS : sentence.f[fpos];
- context_f[ctx_size - i - 1] = left_ctx;
- fpos = phrase.i2 + i;
- const WordID right_ctx = (fpos >= sentence.f_len) ? kEOS : sentence.f[fpos];
- context_f[ctx_size + i + 1] = right_ctx;
- }
- if (context_t) context.push_back(kSPLIT);
- copy(context_f.begin(), context_f.end(), back_inserter(context));
- }
-
- o->Count(key, context, kCOUNT, vector<pair<short,short> >());
- }
-}
-
-struct SimpleRuleWriter : public Extract::RuleObserver {
- protected:
- virtual void CountRuleImpl(WordID lhs,
- const vector<WordID>& rhs_f,
- const vector<WordID>& rhs_e,
- const vector<pair<short,short> >& fe_terminal_alignments) {
- cout << "[" << TD::Convert(-lhs) << "] |||";
- for (int i = 0; i < rhs_f.size(); ++i) {
- if (rhs_f[i] < 0) cout << " [" << TD::Convert(-rhs_f[i]) << ']';
- else cout << ' ' << TD::Convert(rhs_f[i]);
- }
- cout << " |||";
- for (int i = 0; i < rhs_e.size(); ++i) {
- if (rhs_e[i] <= 0) cout << " [" << (1-rhs_e[i]) << ']';
- else cout << ' ' << TD::Convert(rhs_e[i]);
- }
- cout << " |||";
- for (int i = 0; i < fe_terminal_alignments.size(); ++i) {
- cout << ' ' << fe_terminal_alignments[i].first << '-' << fe_terminal_alignments[i].second;
- }
- cout << endl;
- }
-};
-
-struct HadoopStreamingRuleObserver : public Extract::RuleObserver {
- HadoopStreamingRuleObserver(CountCombiner* cc, bool bidir_flag) :
- bidir(bidir_flag),
- kF(TD::Convert("F")),
- kE(TD::Convert("E")),
- kDIVIDER(TD::Convert("|||")),
- kLB("["), kRB("]"),
- combiner(*cc),
- kEMPTY(),
- kCFE(FD::Convert("CFE")) {
- for (int i=1; i < 50; ++i)
- index2sym[1-i] = TD::Convert(kLB + boost::lexical_cast<string>(i) + kRB);
- fmajor_key.resize(10, kF);
- emajor_key.resize(10, kE);
- if (bidir)
- fmajor_key[2] = emajor_key[2] = kDIVIDER;
- else
- fmajor_key[1] = kDIVIDER;
- }
-
- protected:
- virtual void CountRuleImpl(WordID lhs,
- const vector<WordID>& rhs_f,
- const vector<WordID>& rhs_e,
- const vector<pair<short,short> >& fe_terminal_alignments) {
- if (bidir) { // extract rules in "both directions" E->F and F->E
- fmajor_key.resize(3 + rhs_f.size());
- emajor_key.resize(3 + rhs_e.size());
- fmajor_val.resize(rhs_e.size());
- emajor_val.resize(rhs_f.size());
- emajor_key[1] = fmajor_key[1] = MapSym(lhs);
- int nt = 1;
- for (int i = 0; i < rhs_f.size(); ++i) {
- const WordID id = rhs_f[i];
- if (id < 0) {
- fmajor_key[3 + i] = MapSym(id, nt);
- emajor_val[i] = MapSym(id, nt);
- ++nt;
- } else {
- fmajor_key[3 + i] = id;
- emajor_val[i] = id;
- }
- }
- for (int i = 0; i < rhs_e.size(); ++i) {
- WordID id = rhs_e[i];
- if (id <= 0) {
- fmajor_val[i] = index2sym[id];
- emajor_key[3 + i] = index2sym[id];
- } else {
- fmajor_val[i] = id;
- emajor_key[3 + i] = id;
- }
- }
- combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments);
- combiner.Count(emajor_key, emajor_val, kCFE, kEMPTY);
- } else { // extract rules only in F->E
- fmajor_key.resize(2 + rhs_f.size());
- fmajor_val.resize(rhs_e.size());
- fmajor_key[0] = MapSym(lhs);
- int nt = 1;
- for (int i = 0; i < rhs_f.size(); ++i) {
- const WordID id = rhs_f[i];
- if (id < 0)
- fmajor_key[2 + i] = MapSym(id, nt++);
- else
- fmajor_key[2 + i] = id;
- }
- for (int i = 0; i < rhs_e.size(); ++i) {
- const WordID id = rhs_e[i];
- if (id <= 0)
- fmajor_val[i] = index2sym[id];
- else
- fmajor_val[i] = id;
- }
- combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments);
- }
- }
-
- private:
- WordID MapSym(WordID sym, int ind = 0) {
- WordID& r = cat2ind2sym[sym][ind];
- if (!r) {
- if (ind == 0)
- r = TD::Convert(kLB + TD::Convert(-sym) + kRB);
- else
- r = TD::Convert(kLB + TD::Convert(-sym) + "," + boost::lexical_cast<string>(ind) + kRB);
- }
- return r;
- }
-
- const bool bidir;
- const WordID kF, kE, kDIVIDER;
- const string kLB, kRB;
- CountCombiner& combiner;
- const vector<pair<short,short> > kEMPTY;
- const int kCFE;
- map<WordID, map<int, WordID> > cat2ind2sym;
- map<int, WordID> index2sym;
- vector<WordID> emajor_key, emajor_val, fmajor_key, fmajor_val;
-};
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kBOS = TD::Convert("<s>");
- kEOS = TD::Convert("</s>");
- kDIVIDER = TD::Convert("|||");
- kGAP = TD::Convert("<PHRASE>");
- kCOUNT = FD::Convert("C");
- kSPLIT = TD::Convert("<SPLIT>");
-
-  WordID default_cat = 0; // 0 means no default; extraction will
- // fail if a phrase is extracted without a
- // category
- const bool backoff = (conf.count("backoff") ? true : false);
- if (conf.count("default_category")) {
- string sdefault_cat = conf["default_category"].as<string>();
- default_cat = -TD::Convert(sdefault_cat);
- cerr << "Default category: " << sdefault_cat << endl;
- }
- ReadFile rf(conf["input"].as<string>());
- istream& in = *rf.stream();
-
- char buf[MAX_LINE_LENGTH];
- AnnotatedParallelSentence sentence;
- vector<ParallelSpan> phrases;
- vector<WordID> all_cats;
- int max_base_phrase_size = conf["max_base_phrase_size"].as<int>();
- bool write_phrase_contexts = conf.count("phrase_context") > 0;
- const bool write_base_phrases = conf.count("base_phrase") > 0;
- const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0;
- const bool loose_phrases = conf.count("loose") > 0;
- const bool silent = conf.count("silent") > 0;
- const int max_syms = conf["max_syms"].as<int>();
- const int max_vars = conf["max_vars"].as<int>();
- const int ctx_size = conf["phrase_context_size"].as<int>();
- const int num_categories = conf["topics"].as<int>();
- const bool permit_adjacent_nonterminals = conf.count("permit_adjacent_nonterminals") > 0;
- const bool require_aligned_terminal = conf.count("no_required_aligned_terminal") == 0;
- const string ps = conf["phrase_language"].as<string>();
- const bool phrase_s = ps == "source" || ps == "both";
- const bool phrase_t = ps == "target" || ps == "both";
- const string cs = conf["context_language"].as<string>();
- const bool context_s = cs == "source" || cs == "both";
- const bool context_t = cs == "target" || cs == "both";
- const bool x_cdyer_pos = conf.count("x_cdyer_pos");
- int line = 0;
- CountCombiner cc(conf["combiner_size"].as<size_t>());
- HadoopStreamingRuleObserver o(&cc,
- conf.count("bidir") > 0);
-
- assert(phrase_s || phrase_t);
- assert(context_s || context_t);
-
- if(backoff) {
- for (int i=0;i < num_categories;++i)
- all_cats.push_back(TD::Convert("X"+boost::lexical_cast<string>(i)));
- }
-
- //SimpleRuleWriter o;
- while(in) {
- ++line;
- in.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
- //cerr << "line #" << line << " = " << buf << endl;
- if (!silent) {
- if (line % 200 == 0) cerr << '.';
- if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush;
- }
- sentence.ParseInputLine(buf);
- if (x_cdyer_pos) {
- sentence.e = sentence.f;
- sentence.AllocateForAlignment();
- for (int i = 0; i < sentence.e.size(); ++i) sentence.Align(i,i);
- max_base_phrase_size = 1;
- write_phrase_contexts = true;
- }
- phrases.clear();
- Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases);
- if (loose_phrases)
- Extract::LoosenPhraseBounds(sentence, max_base_phrase_size, &phrases);
- if (phrases.empty()) {
-      cerr << "WARNING: no phrases extracted in line " << line << endl;
- continue;
- }
- if (write_phrase_contexts) {
- WritePhraseContexts(sentence, phrases, ctx_size, phrase_s, phrase_t, context_s, context_t, &cc);
- continue;
- }
- if (write_base_phrases) {
- WriteBasePhrases(sentence, phrases);
- continue;
- }
- if (write_base_phrase_spans) {
- WriteBasePhraseSpans(sentence, phrases);
- continue;
- }
- Extract::AnnotatePhrasesWithCategoryTypes(default_cat, sentence.span_types, &phrases);
- Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o, &all_cats);
- }
- if (!silent) cerr << endl;
- return 0;
-}
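CountCombiner above behaves like a local combiner in the MapReduce sense: counts accumulate in a two-level hash (key, then value, then RuleStatistics) and are written out in the striped format (key TAB value ||| stats ||| value ||| stats ...) whenever the number of distinct keys exceeds combiner_size. A reduced sketch of that cache-and-flush policy, with plain ints standing in for RuleStatistics (a simplification, not the real type):

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <string>

    struct MiniCombiner {
      typedef std::map<std::string, int> ValCount;
      typedef std::map<std::string, ValCount> Cache;

      explicit MiniCombiner(size_t limit) : limit_(limit) {}
      ~MiniCombiner() { Flush(); }  // like CountCombiner, flush on destruction

      void Count(const std::string& key, const std::string& val, int c) {
        cache_[key][val] += c;
        if (limit_ > 1 && cache_.size() > limit_) Flush();
      }

      void Flush() {  // one striped line per key
        for (Cache::const_iterator it = cache_.begin(); it != cache_.end(); ++it) {
          std::cout << it->first << '\t';
          bool needdiv = false;
          for (ValCount::const_iterator vi = it->second.begin();
               vi != it->second.end(); ++vi) {
            if (needdiv) std::cout << " ||| "; else needdiv = true;
            std::cout << vi->first << " ||| C=" << vi->second;
          }
          std::cout << '\n';
        }
        cache_.clear();
      }

      const size_t limit_;
      Cache cache_;
    };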
diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc
deleted file mode 100644
index 049ebc85..00000000
--- a/extools/extractor_monolingual.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/lexical_cast.hpp>
-
-#include "tdict.h"
-#include "fdict.h"
-#include "wordid.h"
-#include "filelib.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static const size_t MAX_LINE_LENGTH = 100000;
-WordID kBOS, kEOS, kDIVIDER, kGAP;
-int kCOUNT;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i", po::value<string>()->default_value("-"), "Input file")
- ("phrases,p", po::value<string>(), "File containing phrases of interest")
- ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and right when writing base phrase contexts")
- ("combiner_size,c", po::value<size_t>()->default_value(30000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.")
- ("prune", po::value<size_t>()->default_value(0), "Prune items with count less than threshold; applies each time the cache is dumped.")
- ("silent", "Write nothing to stderr except errors")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("input") != 1 || conf->count("phrases") != 1) {
- cerr << "\nUsage: extractor_monolingual [-options]\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct TrieNode
-{
- TrieNode(int l) : finish(false), length(l) {};
- ~TrieNode()
- {
- for (unordered_map<int, TrieNode*>::iterator
- it = next.begin(); it != next.end(); ++it)
- delete it->second;
- next.clear();
- }
-
- TrieNode *follow(int token)
- {
- unordered_map<int, TrieNode*>::iterator
- found = next.find(token);
- if (found != next.end())
- return found->second;
- else
- return 0;
- }
-
- void insert(const vector<int> &tokens)
- {
- insert(tokens.begin(), tokens.end());
- }
-
- void insert(vector<int>::const_iterator begin, vector<int>::const_iterator end)
- {
- if (begin == end)
- finish = true;
- else
- {
- int token = *begin;
- unordered_map<int, TrieNode*>::iterator
- nit = next.find(token);
- if (nit == next.end())
- nit = next.insert(make_pair(token, new TrieNode(length+1))).first;
- ++begin;
- nit->second->insert(begin, end);
- }
- }
-
- bool finish;
- int length;
- unordered_map<int, TrieNode*> next;
-};
-
-struct CountCombiner {
- CountCombiner(const size_t& csize, const size_t& prune) : combiner_size(csize), threshold(prune) {
- if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; }
- }
- ~CountCombiner() {
- if (!cache.empty()) WriteAndClearCache();
- }
-
- void Count(const vector<WordID>& key,
- const vector<WordID>& val,
- const int count_type)
- {
- if (combiner_size != 1) {
- cache[key][val] += count_type;
- if (combiner_size > 1 && cache.size() > combiner_size)
- WriteAndClearCache();
- } else {
- cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| C=" << count_type << "\n";
- }
- }
-
- private:
- void WriteAndClearCache() {
- for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin();
- it != cache.end(); ++it) {
- const Vec2PhraseCount& vals = it->second;
- bool first = true;
- for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi)
- {
- if (threshold > 1 && combiner_size != 1 && vi->second < threshold)
- continue;
-
- if (!first) cout << " ||| ";
- else
- {
- cout << TD::GetString(it->first) << '\t';
- first = false;
- }
- cout << TD::GetString(vi->first) << " ||| C=" << vi->second;
- }
- if (!first)
- cout << '\n';
- }
- cout << flush;
- cache.clear();
- }
-
- const size_t combiner_size, threshold;
- typedef unordered_map<vector<WordID>, int, boost::hash<vector<WordID> > > Vec2PhraseCount;
- unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache;
-};
-
-void WriteContext(const vector<int>& sentence, int start, int end, int ctx_size, CountCombiner &combiner)
-{
- vector<WordID> phrase, context;
- for (int i = start; i < end; ++i)
- phrase.push_back(sentence[i]);
-
- for (int i = ctx_size; i > 0; --i)
- context.push_back(sentence[start-i]);
- context.push_back(kGAP);
- for (int i = 0; i < ctx_size; ++i)
- context.push_back(sentence[end+i]);
-
- combiner.Count(phrase, context, 1);
-}
-
-inline bool IsWhitespace(char c) {
- return c == ' ' || c == '\t';
-}
-
-inline void SkipWhitespace(const char* buf, int* ptr) {
- while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
-}
-
-vector<int> ReadSentence(const char *buf, int padding)
-{
- int ptr = 0;
- SkipWhitespace(buf, &ptr);
- int start = ptr;
- vector<int> sentence;
- for (int i = 0; i < padding; ++i)
- sentence.push_back(kBOS);
-
-  while (char c = buf[ptr])
-  {
-    if (!IsWhitespace(c))
-      ++ptr;
-    else {
-      sentence.push_back(TD::Convert(string(buf, start, ptr-start)));
-      SkipWhitespace(buf, &ptr);
-      start = ptr;
-    }
-  }
-  if (ptr > start)  // keep the final token when the line has no trailing whitespace
-    sentence.push_back(TD::Convert(string(buf, start, ptr-start)));
- for (int i = 0; i < padding; ++i)
- sentence.push_back(kEOS);
-
- return sentence;
-}
-
-int main(int argc, char** argv)
-{
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kBOS = TD::Convert("<s>");
- kEOS = TD::Convert("</s>");
- kDIVIDER = TD::Convert("|||");
- kGAP = TD::Convert("<PHRASE>");
- kCOUNT = FD::Convert("C");
-
- bool silent = conf.count("silent") > 0;
- const int ctx_size = conf["phrase_context_size"].as<int>();
- CountCombiner cc(conf["combiner_size"].as<size_t>(), conf["prune"].as<size_t>());
-
- char buf[MAX_LINE_LENGTH];
- TrieNode phrase_trie(0);
- ReadFile rpf(conf["phrases"].as<string>());
- istream& pin = *rpf.stream();
- while (pin) {
- pin.getline(buf, MAX_LINE_LENGTH);
- phrase_trie.insert(ReadSentence(buf, 0));
- }
-
- ReadFile rif(conf["input"].as<string>());
- istream &iin = *rif.stream();
- int line = 0;
- while (iin) {
- ++line;
- iin.getline(buf, MAX_LINE_LENGTH);
- //cout << "line: " << line << " '" << buf << "'" << endl;
- if (buf[0] == 0) continue;
- if (!silent) {
- if (line % 200 == 0) cerr << '.';
- if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush;
- }
-
- vector<int> sentence = ReadSentence(buf, ctx_size);
- //cout << "sentence: " << TD::GetString(sentence) << endl;
- vector<TrieNode*> tries;
- for (int i = ctx_size; i < (int)sentence.size() - ctx_size; ++i)
- {
- //cout << "i: " << i << " token: " << TD::Convert(sentence[i]) << " tries: " << tries.size() << endl;
- vector<TrieNode*> tries_prime;
- tries.push_back(&phrase_trie);
- for (vector<TrieNode*>::iterator tit = tries.begin(); tit != tries.end(); ++tit)
- {
- TrieNode* next = (*tit)->follow(sentence[i]);
- if (next != 0)
- {
- //cout << "\tfollowed edge: " << next->finish << endl;
- if (next->finish)
- WriteContext(sentence, i + 1 - next->length, i + 1, ctx_size, cc);
- tries_prime.push_back(next);
- }
- }
- swap(tries, tries_prime);
- }
- //cout << "/sentence" << endl;
- }
- if (!silent) cerr << endl;
- return 0;
-}
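The matching loop above keeps a set of active trie nodes: every position starts a fresh match at the root, each active node is advanced by the current token, and a context is emitted whenever an advanced node ends a phrase, with node->length recovering the start position. A condensed sketch of the same idea over strings instead of WordIDs (illustrative only, not the extools TrieNode):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    struct Trie {
      bool finish;   // a phrase ends at this node
      int length;    // depth, used to recover the match start
      std::map<std::string, Trie*> next;
      Trie() : finish(false), length(0) {}
      ~Trie() {
        for (std::map<std::string, Trie*>::iterator it = next.begin();
             it != next.end(); ++it)
          delete it->second;
      }
      void Insert(const std::vector<std::string>& phrase) {
        Trie* node = this;
        for (size_t i = 0; i < phrase.size(); ++i) {
          Trie*& child = node->next[phrase[i]];
          if (!child) { child = new Trie; child->length = node->length + 1; }
          node = child;
        }
        node->finish = true;
      }
    };

    int main() {
      Trie root;
      std::vector<std::string> phrase;
      phrase.push_back("blue");
      phrase.push_back("house");
      root.Insert(phrase);

      const char* toks[] = {"the", "blue", "house", "burned"};
      std::vector<Trie*> active;
      for (int i = 0; i < 4; ++i) {
        active.push_back(&root);  // a new match may start at every position
        std::vector<Trie*> survivors;
        for (size_t t = 0; t < active.size(); ++t) {
          std::map<std::string, Trie*>::iterator nit = active[t]->next.find(toks[i]);
          if (nit == active[t]->next.end()) continue;  // this match dies
          Trie* node = nit->second;
          if (node->finish)  // a phrase of node->length tokens ends at i
            std::cout << "match at [" << i + 1 - node->length << ',' << i + 1 << ")\n";
          survivors.push_back(node);
        }
        active.swap(survivors);
      }
      return 0;  // prints: match at [1,3)
    }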
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc
deleted file mode 100644
index 78175202..00000000
--- a/extools/featurize_grammar.cc
+++ /dev/null
@@ -1,716 +0,0 @@
-/*
- * Featurize a grammar in striped format
- */
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <map>
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <tr1/unordered_map>
-
-#include "lex_trans_tbl.h"
-#include "sparse_vector.h"
-#include "sentence_pair.h"
-#include "extract.h"
-#include "fdict.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "striped_grammar.h"
-
-#include <boost/tuple/tuple.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-using namespace std;
-using namespace std::tr1;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-static string aligned_corpus;
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-// Data structures for indexing and counting rules
-//typedef boost::tuple< WordID, vector<WordID>, vector<WordID> > RuleTuple;
-struct RuleTuple {
- RuleTuple(const WordID& lhs, const vector<WordID>& s, const vector<WordID>& t)
- : m_lhs(lhs), m_source(s), m_target(t) {
- hash_value();
- m_dirty = false;
- }
-
- size_t hash_value() const {
-// if (m_dirty) {
- size_t hash = 0;
- boost::hash_combine(hash, m_lhs);
- boost::hash_combine(hash, m_source);
- boost::hash_combine(hash, m_target);
-// }
-// m_dirty = false;
- return hash;
- }
-
- bool operator==(RuleTuple const& b) const
- { return m_lhs == b.m_lhs && m_source == b.m_source && m_target == b.m_target; }
-
- WordID& lhs() { m_dirty=true; return m_lhs; }
- vector<WordID>& source() { m_dirty=true; return m_source; }
- vector<WordID>& target() { m_dirty=true; return m_target; }
- const WordID& lhs() const { return m_lhs; }
- const vector<WordID>& source() const { return m_source; }
- const vector<WordID>& target() const { return m_target; }
-
-// mutable size_t m_hash;
-private:
- WordID m_lhs;
- vector<WordID> m_source, m_target;
- mutable bool m_dirty;
-};
-std::size_t hash_value(RuleTuple const& b) { return b.hash_value(); }
-bool operator<(RuleTuple const& l, RuleTuple const& r) {
- if (l.lhs() < r.lhs()) return true;
- else if (l.lhs() == r.lhs()) {
- if (l.source() < r.source()) return true;
- else if (l.source() == r.source()) {
- if (l.target() < r.target()) return true;
- }
- }
- return false;
-}
-
-ostream& operator<<(ostream& o, RuleTuple const& r) {
- o << "(" << r.lhs() << "-->" << "<";
- for (vector<WordID>::const_iterator it=r.source().begin(); it!=r.source().end(); ++it)
- o << TD::Convert(*it) << " ";
- o << "|||";
- for (vector<WordID>::const_iterator it=r.target().begin(); it!=r.target().end(); ++it)
- o << " " << TD::Convert(*it);
- o << ">)";
- return o;
-}
-
-template <typename Key>
-struct FreqCount {
- //typedef unordered_map<Key, int, boost::hash<Key> > Counts;
- typedef map<Key, int> Counts;
- Counts counts;
-
- int inc(const Key& r, int c=1) {
- pair<typename Counts::iterator,bool> itb
- = counts.insert(make_pair(r,c));
- if (!itb.second)
- itb.first->second += c;
- return itb.first->second;
- }
-
-  int inc_if_exists(const Key& r, int c=1) {
-    typename Counts::iterator it = counts.find(r);
-    if (it == counts.end()) return 0;  // don't dereference end() for unseen keys
-    it->second += c;
-    return it->second;
-  }
-
- int count(const Key& r) const {
- typename Counts::const_iterator it = counts.find(r);
- if (it == counts.end()) return 0;
- return it->second;
- }
-
- int operator()(const Key& r) const { return count(r); }
-};
-typedef FreqCount<RuleTuple> RuleFreqCount;
-
-class FeatureExtractor;
-class FERegistry;
-struct FEFactoryBase {
- virtual ~FEFactoryBase() {}
- virtual boost::shared_ptr<FeatureExtractor> Create() const = 0;
-};
-
-
-class FERegistry {
- friend class FEFactoryBase;
- public:
- FERegistry() {}
- boost::shared_ptr<FeatureExtractor> Create(const std::string& ffname) const {
- map<string, boost::shared_ptr<FEFactoryBase> >::const_iterator it = reg_.find(ffname);
- boost::shared_ptr<FeatureExtractor> res;
- if (it == reg_.end()) {
- cerr << "I don't know how to create feature " << ffname << endl;
- } else {
- res = it->second->Create();
- }
- return res;
- }
- void DisplayList(ostream* out) const {
- bool first = true;
- for (map<string, boost::shared_ptr<FEFactoryBase> >::const_iterator it = reg_.begin();
- it != reg_.end(); ++it) {
- if (first) {first=false;} else {*out << ' ';}
- *out << it->first;
- }
- }
-
- void Register(const std::string& ffname, FEFactoryBase* factory) {
- if (reg_.find(ffname) != reg_.end()) {
- cerr << "Duplicate registration of FeatureExtractor with name " << ffname << "!\n";
- exit(1);
- }
- reg_[ffname].reset(factory);
- }
-
- private:
- std::map<std::string, boost::shared_ptr<FEFactoryBase> > reg_;
-};
-
-template<class FE>
-class FEFactory : public FEFactoryBase {
- boost::shared_ptr<FeatureExtractor> Create() const {
- return boost::shared_ptr<FeatureExtractor>(new FE);
- }
-};
-
-void InitCommandLine(const FERegistry& r, int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- ostringstream feats;
- feats << "[multiple] Features to extract (";
- r.DisplayList(&feats);
- feats << ")";
- opts.add_options()
- ("filtered_grammar,g", po::value<string>(), "Grammar to add features to")
- ("list_features,L", "List extractable features")
- ("feature,f", po::value<vector<string> >()->composing(), feats.str().c_str())
- ("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("aligned_corpus")==0 || conf->count("feature") == 0) {
- cerr << "\nUsage: featurize_grammar -g FILTERED-GRAMMAR.gz -c ALIGNED_CORPUS.fr-en-al -f Feat1 -f Feat2 ... < UNFILTERED-GRAMMAR\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-static const bool DEBUG = false;
-
-void LexTranslationTable::createTTable(const char* buf){
- AnnotatedParallelSentence sent;
- sent.ParseInputLine(buf);
-
- //iterate over the alignment to compute aligned words
-
- for(int i =0;i<sent.aligned.width();i++)
- {
- for (int j=0;j<sent.aligned.height();j++)
- {
- if (DEBUG) cerr << sent.aligned(i,j) << " ";
- if( sent.aligned(i,j))
- {
- if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
- ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])];
- ++total_foreign[sent.f[i]];
- ++total_english[sent.e[j]];
- }
- }
- if (DEBUG) cerr << endl;
- }
- if (DEBUG) cerr << endl;
-
- const WordID NULL_ = TD::Convert("NULL");
- //handle unaligned words - align them to null
- for (int j =0; j < sent.e_len; j++) {
- if (sent.e_aligned[j]) continue;
- ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])];
- ++total_foreign[NULL_];
- ++total_english[sent.e[j]];
- }
-
- for (int i =0; i < sent.f_len; i++) {
- if (sent.f_aligned[i]) continue;
- ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)];
- ++total_english[NULL_];
- ++total_foreign[sent.f[i]];
- }
-}
-
-inline float safenlog(float v) {
- if (v == 1.0f) return 0.0f;
- float res = -log(v);
- if (res > 100.0f) res = 100.0f;
- return res;
-}
-
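-// Despite the name, this tests whether f is approximately 1, i.e.
-// whether log(f) is approximately zero; LogRuleCount uses it below
-// to flag singleton rule counts.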
-static bool IsZero(float f) { return (f > 0.999 && f < 1.001); }
-
-struct FeatureExtractor {
- // create any keys necessary
- virtual void ObserveFilteredRule(const WordID /* lhs */,
- const vector<WordID>& /* src */,
- const vector<WordID>& /* trg */) {}
-
- // compute statistics over keys, the same lhs-src-trg tuple may be seen
- // more than once
- virtual void ObserveUnfilteredRule(const WordID /* lhs */,
- const vector<WordID>& /* src */,
- const vector<WordID>& /* trg */,
- const RuleStatistics& /* info */) {}
-
- // compute features, a unique lhs-src-trg tuple will be seen exactly once
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const = 0;
-
- virtual ~FeatureExtractor() {}
-};
-
-struct LogRuleCount : public FeatureExtractor {
- LogRuleCount() :
- fid_(FD::Convert("LogRuleCount")),
- sfid_(FD::Convert("SingletonRule")),
- kCFE(FD::Convert("CFE")) {}
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- (void) lhs; (void) src; (void) trg;
- //result->set_value(fid_, log(info.counts.get(kCFE)));
- result->set_value(fid_, log(info.counts.get(kCFE)));
- if (IsZero(info.counts.get(kCFE)))
- result->set_value(sfid_, 1);
- }
- const int fid_;
- const int sfid_;
- const int kCFE;
-};
-
-struct RulePenalty : public FeatureExtractor {
- RulePenalty() : fid_(FD::Convert("RulePenalty")) {}
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const
- { result->set_value(fid_, 1); }
-
- const int fid_;
-};
-
-// The negative log of the conditional rule probs
-// ignoring the identities of the non-terminals.
-// i.e. the prob Hiero would assign.
-// Also extracts Labelled features.
-struct XFeatures: public FeatureExtractor {
- XFeatures() :
- fid_xfe(FD::Convert("XFE")),
- fid_xef(FD::Convert("XEF")),
- fid_labelledfe(FD::Convert("LabelledFE")),
- fid_labelledef(FD::Convert("LabelledEF")),
- fid_xesingleton(FD::Convert("XE_Singleton")),
- fid_xfsingleton(FD::Convert("XF_Singleton")),
- kCFE(FD::Convert("CFE")) {}
- virtual void ObserveFilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- rule_counts.inc(r, 0);
- source_counts.inc(r.source(), 0);
- target_counts.inc(r.target(), 0);
- }
-
- virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- const int count = info.counts.get(kCFE);
- assert(count > 0);
- rule_counts.inc_if_exists(r, count);
- source_counts.inc_if_exists(r.source(), count);
- target_counts.inc_if_exists(r.target(), count);
- }
-
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- double l_r_freq = log(rule_counts(r));
-
- const int t_c = target_counts(r.target());
- assert(t_c > 0);
- result->set_value(fid_xfe, log(t_c) - l_r_freq);
- result->set_value(fid_labelledfe, log(t_c) - log(info.counts.get(kCFE)));
-// if (t_c == 1)
-// result->set_value(fid_xesingleton, 1.0);
-
- const int s_c = source_counts(r.source());
- assert(s_c > 0);
- result->set_value(fid_xef, log(s_c) - l_r_freq);
- result->set_value(fid_labelledef, log(s_c) - log(info.counts.get(kCFE)));
-// if (s_c == 1)
-// result->set_value(fid_xfsingleton, 1.0);
- }
-
- void map_rule(RuleTuple& r) const {
- vector<WordID> indexes; int i=0;
- for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it) {
- if (*it <= 0)
- indexes.push_back(*it);
- }
- for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it) {
- if (*it <= 0)
- *it = indexes.at(i++);
- }
- }
-
- const int fid_xfe, fid_xef;
- const int fid_labelledfe, fid_labelledef;
- const int fid_xesingleton, fid_xfsingleton;
- const int kCFE;
- RuleFreqCount rule_counts;
- FreqCount< vector<WordID> > source_counts, target_counts;
-};
-
-
-struct LabelledRuleConditionals: public FeatureExtractor {
- LabelledRuleConditionals() :
- fid_fe(FD::Convert("LabelledFE")),
- fid_ef(FD::Convert("LabelledEF")),
- kCFE(FD::Convert("CFE")) {}
- virtual void ObserveFilteredRule(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg) {
- RuleTuple r(lhs, src, trg);
- rule_counts.inc(r, 0);
- source_counts.inc(r.source(), 0);
-
- target_counts.inc(r.target(), 0);
- }
-
- virtual void ObserveUnfilteredRule(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info) {
- RuleTuple r(lhs, src, trg);
- rule_counts.inc_if_exists(r, info.counts.get(kCFE));
- source_counts.inc_if_exists(r.source(), info.counts.get(kCFE));
-
- target_counts.inc_if_exists(r.target(), info.counts.get(kCFE));
- }
-
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const {
- RuleTuple r(lhs, src, trg);
- double l_r_freq = log(rule_counts(r));
- result->set_value(fid_fe, log(target_counts(r.target())) - l_r_freq);
- result->set_value(fid_ef, log(source_counts(r.source())) - l_r_freq);
- }
-
- const int fid_fe, fid_ef;
- const int kCFE;
- RuleFreqCount rule_counts;
- FreqCount< vector<WordID> > source_counts, target_counts;
-};
-
-struct LHSProb: public FeatureExtractor {
- LHSProb() : fid_(FD::Convert("LHSProb")), kCFE(FD::Convert("CFE")), total_count(0) {}
-
- virtual void ObserveUnfilteredRule(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& info) {
- int count = info.counts.get(kCFE);
- total_count += count;
- lhs_counts.inc(lhs, count);
- }
-
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const {
- double lhs_log_prob = log(total_count) - log(lhs_counts(lhs));
- result->set_value(fid_, lhs_log_prob);
- }
-
- const int fid_;
- const int kCFE;
- int total_count;
- FreqCount<WordID> lhs_counts;
-};
-
-// Proper rule generative probability: -log p( s,t | lhs)
-struct GenerativeProb: public FeatureExtractor {
- GenerativeProb() :
- fid_(FD::Convert("GenerativeProb")),
- kCFE(FD::Convert("CFE")) {}
-
- virtual void ObserveUnfilteredRule(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& info)
- { lhs_counts.inc(lhs, info.counts.get(kCFE)); }
-
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- double log_prob = log(lhs_counts(lhs)) - log(info.counts.get(kCFE));
- result->set_value(fid_, log_prob);
- }
-
- const int fid_;
- const int kCFE;
- FreqCount<WordID> lhs_counts;
-};
-
-// strip the nonterminal labels from the rules before estimating the conditional prob
-struct LabellingShape: public FeatureExtractor {
- LabellingShape() : fid_(FD::Convert("LabellingShape")), kCFE(FD::Convert("CFE")) {}
-
- virtual void ObserveFilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- rule_counts.inc(r, 0);
- source_counts.inc(r.source(), 0);
- }
-
- virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- rule_counts.inc_if_exists(r, info.counts.get(kCFE));
- source_counts.inc_if_exists(r.source(), info.counts.get(kCFE));
- }
-
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- double l_r_freq = log(rule_counts(r));
- result->set_value(fid_, log(source_counts(r.source())) - l_r_freq);
- }
-
-  // Replace all nonterminals (IDs <= 0) with a generic -1; terminals are kept
- void map_rule(RuleTuple& r) const {
- for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it)
- if (*it <= 0) *it = -1;
- for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it)
- if (*it <= 0) *it = -1;
- }
-
- const int fid_, kCFE;
- RuleFreqCount rule_counts;
- FreqCount< vector<WordID> > source_counts;
-};
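A standalone sketch of the masking done by map_rule above (the IDs are hypothetical): nonterminal labels and index slots, encoded as IDs <= 0, collapse to a generic -1 while terminals survive, so counts are pooled across all labellings of the same rule skeleton.

#include <cstdio>
#include <vector>

// Mirrors LabellingShape::map_rule above: IDs <= 0 (nonterminals)
// become -1; positive IDs (terminals) are left alone.
static void MapRule(std::vector<int>* rhs) {
  for (size_t i = 0; i < rhs->size(); ++i)
    if ((*rhs)[i] <= 0) (*rhs)[i] = -1;
}

int main() {
  // hypothetical RHS: [NP](-42) der(7) [VP](-43) hut(9)
  std::vector<int> rhs;
  rhs.push_back(-42); rhs.push_back(7); rhs.push_back(-43); rhs.push_back(9);
  MapRule(&rhs);
  for (size_t i = 0; i < rhs.size(); ++i) std::printf("%d ", rhs[i]);
  std::printf("\n");  // prints: -1 7 -1 9
  return 0;
}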
-
-
-// this extracts the lexical translation prob features
-// in BOTH directions.
-struct LexProbExtractor : public FeatureExtractor {
- LexProbExtractor() :
- e2f_(FD::Convert("LexE2F")), f2e_(FD::Convert("LexF2E")) {
- ReadFile rf(aligned_corpus);
- //create lexical translation table
- cerr << "Computing lexical translation probabilities from " << aligned_corpus << "..." << endl;
- char* buf = new char[MAX_LINE_LENGTH];
- istream& alignment = *rf.stream();
- while(alignment) {
- alignment.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
- table.createTTable(buf);
- }
- delete[] buf;
- }
-
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- map <WordID, pair<int, float> > foreign_aligned;
- map <WordID, pair<int, float> > english_aligned;
-
- //Loop over all the alignment points to compute lexical translation probability
- const vector< pair<short,short> >& al = info.aligns;
- vector< pair<short,short> >::const_iterator ita;
- for (ita = al.begin(); ita != al.end(); ++ita) {
- if (DEBUG) {
- cerr << "\nA:" << ita->first << "," << ita->second << "::";
- cerr << TD::Convert(src[ita->first]) << "-" << TD::Convert(trg[ita->second]);
- }
-
- //Lookup this alignment probability in the table
- int temp = table.word_translation[pair<WordID,WordID> (src[ita->first],trg[ita->second])];
- float f2e=0, e2f=0;
- if ( table.total_foreign[src[ita->first]] != 0)
- f2e = (float) temp / table.total_foreign[src[ita->first]];
- if ( table.total_english[trg[ita->second]] !=0 )
- e2f = (float) temp / table.total_english[trg[ita->second]];
- if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f);
-
- //local counts to keep track of which things haven't been aligned, to later compute their null alignment
- if (foreign_aligned.count(src[ita->first])) {
- foreign_aligned[ src[ita->first] ].first++;
- foreign_aligned[ src[ita->first] ].second += e2f;
- } else {
- foreign_aligned[ src[ita->first] ] = pair<int,float> (1,e2f);
- }
-
- if (english_aligned.count( trg[ ita->second] )) {
- english_aligned[ trg[ ita->second] ].first++;
- english_aligned[ trg[ ita->second] ].second += f2e;
- } else {
- english_aligned[ trg[ ita->second] ] = pair<int,float> (1,f2e);
- }
- }
-
- float final_lex_f2e=1, final_lex_e2f=1;
- static const WordID NULL_ = TD::Convert("NULL");
-
- //compute lexical weight P(F|E) and include unaligned foreign words
- for(int i=0;i<src.size(); i++) {
-      if (!table.total_foreign.count(src[i])) continue; //if we don't have it in the translation table, we won't know its lexical weight
-
- if (foreign_aligned.count(src[i]))
- {
- pair<int, float> temp_lex_prob = foreign_aligned[src[i]];
- final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first;
- }
- else //dealing with null alignment
- {
- int temp_count = table.word_translation[pair<WordID,WordID> (src[i],NULL_)];
- float temp_e2f = (float) temp_count / table.total_english[NULL_];
- final_lex_e2f *= temp_e2f;
- }
-
- }
-
- //compute P(E|F) unaligned english words
- for(int j=0; j< trg.size(); j++) {
- if (!table.total_english.count(trg[j])) continue;
-
- if (english_aligned.count(trg[j]))
- {
- pair<int, float> temp_lex_prob = english_aligned[trg[j]];
- final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first;
- }
- else //dealing with null
- {
- int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,trg[j])];
- float temp_f2e = (float) temp_count / table.total_foreign[NULL_];
- final_lex_f2e *= temp_f2e;
- }
- }
- result->set_value(e2f_, safenlog(final_lex_e2f));
- result->set_value(f2e_, safenlog(final_lex_f2e));
- }
- const int e2f_, f2e_;
- mutable LexTranslationTable table;
-};
-
-struct Featurizer {
- Featurizer(const vector<boost::shared_ptr<FeatureExtractor> >& ex) : extractors(ex) {
- }
- void Callback1(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) {
- for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) {
- for (int i = 0; i < extractors.size(); ++i)
- extractors[i]->ObserveFilteredRule(lhs, src, it->first);
- }
- }
- void Callback2(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) {
- for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) {
- for (int i = 0; i < extractors.size(); ++i)
- extractors[i]->ObserveUnfilteredRule(lhs, src, it->first, it->second);
- }
- }
- void Callback3(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) {
- for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) {
- SparseVector<float> feats;
- for (int i = 0; i < extractors.size(); ++i)
- extractors[i]->ExtractFeatures(lhs, src, it->first, it->second, &feats);
- cout << '[' << TD::Convert(-lhs) << "] ||| ";
- WriteNamed(src, &cout);
- cout << " ||| ";
- WriteAnonymous(it->first, &cout);
- cout << " ||| ";
- print(cout,feats,"=");
- cout << endl;
- }
- }
- private:
- vector<boost::shared_ptr<FeatureExtractor> > extractors;
-};
-
-void cb1(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) {
- static_cast<Featurizer*>(extra)->Callback1(lhs, src_rhs, rules);
-}
-
-void cb2(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) {
- static_cast<Featurizer*>(extra)->Callback2(lhs, src_rhs, rules);
-}
-
-void cb3(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) {
- static_cast<Featurizer*>(extra)->Callback3(lhs, src_rhs, rules);
-}
-
-int main(int argc, char** argv){
- FERegistry reg;
- reg.Register("LogRuleCount", new FEFactory<LogRuleCount>);
- reg.Register("LexProb", new FEFactory<LexProbExtractor>);
- reg.Register("XFeatures", new FEFactory<XFeatures>);
- reg.Register("LabelledRuleConditionals", new FEFactory<LabelledRuleConditionals>);
- reg.Register("RulePenalty", new FEFactory<RulePenalty>);
- reg.Register("LHSProb", new FEFactory<LHSProb>);
- reg.Register("LabellingShape", new FEFactory<LabellingShape>);
- reg.Register("GenerativeProb", new FEFactory<GenerativeProb>);
- po::variables_map conf;
- InitCommandLine(reg, argc, argv, &conf);
- aligned_corpus = conf["aligned_corpus"].as<string>(); // GLOBAL VAR
- ReadFile fg1(conf["filtered_grammar"].as<string>());
-
- vector<string> feats = conf["feature"].as<vector<string> >();
- vector<boost::shared_ptr<FeatureExtractor> > extractors(feats.size());
- for (int i = 0; i < feats.size(); ++i)
- extractors[i] = reg.Create(feats[i]);
- Featurizer fizer(extractors);
-
- cerr << "Reading filtered grammar to detect keys..." << endl;
- StripedGrammarLexer::ReadStripedGrammar(fg1.stream(), cb1, &fizer);
-
- cerr << "Reading unfiltered grammar..." << endl;
- StripedGrammarLexer::ReadStripedGrammar(&cin, cb2, &fizer);
-
- ReadFile fg2(conf["filtered_grammar"].as<string>());
- cerr << "Reading filtered grammar and adding features..." << endl;
- StripedGrammarLexer::ReadStripedGrammar(fg2.stream(), cb3, &fizer);
-
- return 0;
-}
-
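featurize_grammar makes three passes: the filtered grammar registers keys (Callback1), the unfiltered grammar accumulates counts for those keys (Callback2), and the filtered grammar is re-read to emit features (Callback3). New extractors follow the FeatureExtractor interface; a hedged sketch of one more (it would have to be compiled into this file, and the name SrcTerminalCount is invented here):

// Sketch only: assumes the FeatureExtractor base, FD, SparseVector, and
// RuleStatistics from this file and the cdec utils. Counts source terminals.
struct SrcTerminalCount : public FeatureExtractor {
  SrcTerminalCount() : fid_(FD::Convert("SrcTerminalCount")) {}
  virtual void ExtractFeatures(const WordID /*lhs*/,
                               const vector<WordID>& src,
                               const vector<WordID>& /*trg*/,
                               const RuleStatistics& /*info*/,
                               SparseVector<float>* result) const {
    int terms = 0;
    for (int i = 0; i < (int)src.size(); ++i)
      if (src[i] > 0) ++terms;  // positive IDs are terminals
    result->set_value(fid_, terms);
  }
  const int fid_;
};
// and in main: reg.Register("SrcTerminalCount", new FEFactory<SrcTerminalCount>);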
diff --git a/extools/filter_grammar.cc b/extools/filter_grammar.cc
deleted file mode 100644
index cafcc923..00000000
--- a/extools/filter_grammar.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Filter a grammar in striped format
- */
-#include <iostream>
-#include <string>
-#include <map>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-
-#include "suffix_tree.h"
-#include "sparse_vector.h"
-#include "sentence_pair.h"
-#include "extract.h"
-#include "fdict.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "striped_grammar.h"
-
-#include <boost/shared_ptr.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("test_set,t", po::value<string>(), "Filter for this test set")
- ("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("test_set")==0) {
- cerr << "\nUsage: filter_grammar -t TEST-SET.fr [-options] < grammar\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct SourceFilter {
- // return true to keep the rule, otherwise false
- virtual bool Matches(const vector<WordID>& key) const = 0;
- virtual ~SourceFilter() {}
-};
-
-struct DumbSuffixTreeFilter : SourceFilter {
- DumbSuffixTreeFilter(const string& corpus) {
- cerr << "Build suffix tree from test set in " << corpus << endl;
- assert(FileExists(corpus));
- ReadFile rfts(corpus);
- istream& testSet = *rfts.stream();
- char* buf = new char[MAX_LINE_LENGTH];
- AnnotatedParallelSentence sent;
-
- /* process the data set to build suffix tree
- */
- while(!testSet.eof()) {
- testSet.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
-
- //hack to read in the test set using AnnotatedParallelSentence
- strcat(buf," ||| fake ||| 0-0");
- sent.ParseInputLine(buf);
-
- //add each successive suffix to the tree
- for(int i=0; i<sent.f_len; i++)
- root.InsertPath(sent.f, i, sent.f_len - 1);
- }
- delete[] buf;
- }
- virtual bool Matches(const vector<WordID>& src_rhs) const {
- const Node<int>* curnode = &root;
- for(int i=0; i < src_rhs.size(); i++) {
- if (src_rhs[i] <= 0) {
- curnode = &root;
- } else if (curnode) {
- curnode = curnode->Extend(src_rhs[i]);
- if (!curnode) return false;
- }
- }
- return true;
- }
- Node<int> root;
-};
-
-boost::shared_ptr<SourceFilter> filter;
-multimap<float, ID2RuleStatistics::const_iterator> options;
-int kCOUNT;
-int max_options;
-
-void cb(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void*) {
- options.clear();
- if (!filter || filter->Matches(src_rhs)) {
- for (ID2RuleStatistics::const_iterator it = rules.begin(); it != rules.end(); ++it) {
- options.insert(make_pair(-it->second.counts.get(kCOUNT), it));
- }
- int ocount = 0;
- cout << '[' << TD::Convert(-lhs) << ']' << " ||| ";
- WriteNamed(src_rhs, &cout);
- cout << '\t';
- bool first = true;
- for (multimap<float,ID2RuleStatistics::const_iterator>::iterator it = options.begin(); it != options.end(); ++it) {
- if (first) { first = false; } else { cout << " ||| "; }
- WriteAnonymous(it->second->first, &cout);
- cout << " ||| " << it->second->second;
- ++ocount;
- if (ocount == max_options) break;
- }
- cout << endl;
- }
-}
-
-int main(int argc, char** argv){
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- max_options = conf["top_e_given_f"].as<size_t>();;
- kCOUNT = FD::Convert("CFE");
- istream& unscored_grammar = cin;
- cerr << "Loading test set " << conf["test_set"].as<string>() << "...\n";
- filter.reset(new DumbSuffixTreeFilter(conf["test_set"].as<string>()));
- cerr << "Filtering...\n";
- StripedGrammarLexer::ReadStripedGrammar(&unscored_grammar, cb, NULL);
-}
-
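The filter keeps a rule if every contiguous run of source terminals occurs in some test sentence; nonterminals (IDs <= 0) simply restart the trie walk, which is why Matches above resets curnode to &root. A standalone toy version of that logic:

#include <cstdio>
#include <map>
#include <vector>

// Standalone toy of DumbSuffixTreeFilter::Matches above:
// terminals are positive ints, nonterminals are <= 0 and reset the walk.
struct TrieNode { std::map<int, TrieNode> kids; };

static bool Matches(const TrieNode& root, const std::vector<int>& src_rhs) {
  const TrieNode* cur = &root;
  for (size_t i = 0; i < src_rhs.size(); ++i) {
    if (src_rhs[i] <= 0) { cur = &root; continue; }  // NT: restart
    std::map<int, TrieNode>::const_iterator it = cur->kids.find(src_rhs[i]);
    if (it == cur->kids.end()) return false;
    cur = &it->second;
  }
  return true;
}

int main() {
  TrieNode root;
  int sent[] = {7, 8, 9};           // one toy "sentence"
  for (int i = 0; i < 3; ++i) {     // insert every suffix
    TrieNode* cur = &root;
    for (int j = i; j < 3; ++j) cur = &cur->kids[sent[j]];
  }
  std::vector<int> r1; r1.push_back(8); r1.push_back(9);
  std::vector<int> r2; r2.push_back(8); r2.push_back(-1); r2.push_back(7);
  std::printf("%d %d\n", Matches(root, r1), Matches(root, r2));  // 1 1
  return 0;
}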
diff --git a/extools/lex_trans_tbl.h b/extools/lex_trans_tbl.h
deleted file mode 100644
index 161b4a0d..00000000
--- a/extools/lex_trans_tbl.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * lex_trans_tbl.h
- *
- * Created on: May 25, 2010
- * Author: Vlad
- */
-
-#ifndef LEX_TRANS_TBL_H_
-#define LEX_TRANS_TBL_H_
-
-#include "wordid.h"
-#include <map>
-
-class LexTranslationTable
-{
- public:
-
- std::map < std::pair<WordID,WordID>,int > word_translation;
- std::map <WordID, int> total_foreign;
- std::map <WordID, int> total_english;
- void createTTable(const char* buf);
-
-};
-
-#endif /* LEX_TRANS_TBL_H_ */
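The table only stores integer cooccurrence and marginal counts; the consumers (score_grammar.cc and featurize_grammar's LexProbExtractor) turn them into the two directed probabilities. A standalone sketch with toy word IDs and counts:

#include <cstdio>
#include <map>
#include <utility>

// Mirrors how the LexTranslationTable counts are used: f2e = c(f,e)/c(f)
// and e2f = c(f,e)/c(e). Word IDs here are toy integers.
int main() {
  std::map<std::pair<int,int>, int> word_translation;
  std::map<int, int> total_foreign, total_english;
  word_translation[std::make_pair(1, 2)] = 3;  // f=1 aligned to e=2, 3 times
  total_foreign[1] = 4;                        // f=1 seen 4 times
  total_english[2] = 6;                        // e=2 seen 6 times
  const int c = word_translation[std::make_pair(1, 2)];
  const float f2e = (float)c / total_foreign[1];  // p(e|f) = 0.75
  const float e2f = (float)c / total_english[2];  // p(f|e) = 0.5
  std::printf("f2e=%g e2f=%g\n", f2e, e2f);
  return 0;
}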
diff --git a/extools/merge_lines.pl b/extools/merge_lines.pl
deleted file mode 100755
index 8711e4ce..00000000
--- a/extools/merge_lines.pl
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-if (scalar @ARGV < 2) {
- die "Usage: $0 file1.txt file2.txt ...\n\n Concatenate the nth line of each input file. All files\n must be the same length.\n\n";
-}
-
-my @fhs=();
-for my $file (@ARGV) {
- my $fh;
- open $fh, "<$file" or die "Can't read $file: $!\n";
- push @fhs, $fh;
-}
-
-my $first = shift @fhs;
-
-while(my $x = <$first>) {
- my $ind = 0;
- chomp $x;
- my @fields = ($x);
- for my $fh (@fhs) {
- $ind++;
- $x = <$fh>;
- die "ERROR: Mismatched number of lines: $ARGV[$ind]\n" unless $x;
- chomp $x;
- push @fields, $x;
- }
- print join ' ||| ', @fields;
- print "\n";
-}
-my $ind = 0;
-for my $fh (@fhs) {
- $ind++;
- my $x=<$fh>;
- die "ERROR: $ARGV[$ind] has extra lines!\n" if $x;
-}
-
-for my $fh (@fhs) {
-  close $fh;
-}
-
-exit 0;
-
diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc
deleted file mode 100644
index c9b2eb2a..00000000
--- a/extools/mr_stripe_rule_reduce.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <tr1/unordered_map>
-
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "striped_grammar.h"
-#include "tdict.h"
-#include "sentence_pair.h"
-#include "fdict.h"
-#include "extract.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-bool use_hadoop_counters = false;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("phrase_marginals,p", "Compute phrase marginals")
- ("use_hadoop_counters,C", "Enable this if running inside Hadoop")
- ("bidir,b", "Rules are tagged as being F->E or E->F, invert E rules in output")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << "\nUsage: mr_stripe_rule_reduce [-options]\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void PlusEquals(const ID2RuleStatistics& v, ID2RuleStatistics* self) {
- for (ID2RuleStatistics::const_iterator it = v.begin(); it != v.end(); ++it) {
- RuleStatistics& dest = (*self)[it->first];
- dest += it->second;
- // TODO - do something smarter about alignments?
- if (dest.aligns.empty() && !it->second.aligns.empty())
- dest.aligns = it->second.aligns;
- }
-}
-
-void WriteKeyValue(const vector<WordID>& key, const ID2RuleStatistics& val) {
- cout << TD::GetString(key) << '\t';
- bool needdiv = false;
- for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) {
- if (needdiv) cout << " ||| "; else needdiv = true;
- cout << TD::GetString(it->first) << " ||| " << it->second;
- }
- cout << endl;
- if (use_hadoop_counters) cerr << "reporter:counter:UserCounters,RuleCount," << val.size() << endl;
-}
-
-void DoPhraseMarginals(const vector<WordID>& key, const bool bidir, ID2RuleStatistics* val) {
- static const WordID kF = TD::Convert("F");
- static const WordID kE = TD::Convert("E");
- static const int kCF = FD::Convert("CF");
- static const int kCE = FD::Convert("CE");
- static const int kCFE = FD::Convert("CFE");
- assert(key.size() > 0);
- int cur_marginal_id = kCF;
- if (bidir) {
- if (key[0] != kF && key[0] != kE) {
- cerr << "DoPhraseMarginals expects keys to have the from 'F|E [NT] word word word'\n";
- cerr << " but got: " << TD::GetString(key) << endl;
- exit(1);
- }
- if (key[0] == kE) cur_marginal_id = kCE;
- }
- double tot = 0;
- for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it)
- tot += it->second.counts.get(kCFE);
- for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) {
- it->second.counts.set_value(cur_marginal_id, tot);
-
- // prevent double counting of the joint
- if (cur_marginal_id == kCE) it->second.counts.erase(kCFE);
- }
-}
-
-void WriteWithInversions(const vector<WordID>& key, const ID2RuleStatistics& val) {
- static const WordID kE = TD::Convert("E");
- static const WordID kDIV = TD::Convert("|||");
- vector<WordID> new_key(key.size() - 1);
- for (int i = 1; i < key.size(); ++i)
- new_key[i - 1] = key[i];
- const bool do_invert = (key[0] == kE);
- if (!do_invert) {
- WriteKeyValue(new_key, val);
- } else {
- ID2RuleStatistics inv;
- assert(new_key.size() > 2);
- vector<WordID> tk(new_key.size() - 2);
- for (int i = 0; i < tk.size(); ++i)
- tk[i] = new_key[2 + i];
- RuleStatistics& inv_stats = inv[tk];
- for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) {
- inv_stats.counts = it->second.counts;
- vector<WordID> ekey(2 + it->first.size());
- ekey[0] = key[1];
- ekey[1] = kDIV;
- for (int i = 0; i < it->first.size(); ++i)
- ekey[2+i] = it->first[i];
- WriteKeyValue(ekey, inv);
- }
- }
-}
-
-struct Reducer {
- Reducer(bool phrase_marginals, bool bidir) : pm_(phrase_marginals), bidir_(bidir) {}
-
- void ProcessLine(const vector<WordID>& key, const ID2RuleStatistics& rules) {
- if (cur_key_ != key) {
- if (cur_key_.size() > 0) Emit();
- acc_.clear();
- cur_key_ = key;
- }
- PlusEquals(rules, &acc_);
- }
-
- ~Reducer() {
- Emit();
- }
-
- void Emit() {
- if (pm_)
- DoPhraseMarginals(cur_key_, bidir_, &acc_);
- if (bidir_)
- WriteWithInversions(cur_key_, acc_);
- else
- WriteKeyValue(cur_key_, acc_);
- }
-
- const bool pm_;
- const bool bidir_;
- vector<WordID> cur_key_;
- ID2RuleStatistics acc_;
-};
-
-void cb(const vector<WordID>& key, const ID2RuleStatistics& contexts, void* red) {
- static_cast<Reducer*>(red)->ProcessLine(key, contexts);
-}
-
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- use_hadoop_counters = conf.count("use_hadoop_counters") > 0;
- const bool phrase_marginals = conf.count("phrase_marginals") > 0;
- const bool bidir = conf.count("bidir") > 0;
- Reducer reducer(phrase_marginals, bidir);
- StripedGrammarLexer::ReadContexts(&cin, cb, &reducer);
- return 0;
-}
-
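The reducer relies on the input being sorted by key (the driver scripts pipe through sort -t $'\t' -k 1), so identical keys arrive on adjacent lines and PlusEquals only ever merges into a single accumulator. A standalone sketch of that merge step, with a plain map standing in for ID2RuleStatistics:

#include <cstdio>
#include <map>
#include <string>

// Simplified stand-in for the PlusEquals above: each stripe maps a
// target phrase to a count; merging two stripes sums the counts.
typedef std::map<std::string, double> Stripe;

static void PlusEquals(const Stripe& v, Stripe* self) {
  for (Stripe::const_iterator it = v.begin(); it != v.end(); ++it)
    (*self)[it->first] += it->second;
}

int main() {
  Stripe acc, in1, in2;
  in1["the house"] = 2;
  in2["the house"] = 1; in2["a house"] = 1;
  PlusEquals(in1, &acc);
  PlusEquals(in2, &acc);  // same key seen again on an adjacent sorted line
  for (Stripe::const_iterator it = acc.begin(); it != acc.end(); ++it)
    std::printf("%s ||| %g\n", it->first.c_str(), it->second);
  return 0;
}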
diff --git a/extools/score_grammar.cc b/extools/score_grammar.cc
deleted file mode 100644
index 0945e018..00000000
--- a/extools/score_grammar.cc
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * Score a grammar in striped format
- * ./score_grammar <alignment> < filtered.grammar > scored.grammar
- */
-#include <iostream>
-#include <string>
-#include <map>
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <fstream>
-#include <tr1/unordered_map>
-
-#include "sentence_pair.h"
-#include "extract.h"
-#include "fdict.h"
-#include "tdict.h"
-#include "lex_trans_tbl.h"
-#include "filelib.h"
-
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-using namespace std;
-using namespace std::tr1;
-
-
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics;
-
-
-namespace {
- inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
- inline bool IsBracket(char c){return c == '[' || c == ']';}
- inline void SkipWhitespace(const char* buf, int* ptr) {
- while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
- }
-}
-
-int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) {
- static const WordID kDIV = TD::Convert("|||");
- int ptr = sstart;
- while(ptr < end) {
- while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; }
- int start = ptr;
- while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
- if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; }
- const WordID w = TD::Convert(string(buf, start, ptr - start));
-
-    // bracketed NTs and the ||| divider are both kept in the key,
-    // so this loop never returns early at a divider
-    p->push_back(w);
- }
- return ptr;
-}
-
-
-void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
- static const WordID kDIV = TD::Convert("|||");
- counts->clear();
- int ptr = 0;
- while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; }
- if (buf[ptr] != '\t') {
- cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl;
- exit(1);
- }
- cur_key->clear();
- // key is: "[X] ||| word word word"
- int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
- cur_key->push_back(kDIV);
- ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
- ++ptr;
- int start = ptr;
- int end = ptr;
- int state = 0; // 0=reading label, 1=reading count
- vector<WordID> name;
- while(buf[ptr] != 0) {
- while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; }
- if (buf[ptr] == '|') {
- ++ptr;
- if (buf[ptr] == '|') {
- ++ptr;
- if (buf[ptr] == '|') {
- ++ptr;
- end = ptr - 3;
- while (end > start && IsWhitespace(buf[end-1])) { --end; }
- if (start == end) {
- cerr << "Got empty token!\n LINE=" << buf << endl;
- exit(1);
- }
- switch (state) {
- case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
- case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
- default: cerr << "Can't happen\n"; abort();
- }
- SkipWhitespace(buf, &ptr);
- start = ptr;
- }
- }
- }
- }
- end=ptr;
- while (end > start && IsWhitespace(buf[end-1])) { --end; }
- if (end > start) {
- switch (state) {
- case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
- case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
- default: cerr << "Can't happen\n"; abort();
- }
- }
-}
-
-
-
-void LexTranslationTable::createTTable(const char* buf){
-
- bool DEBUG = false;
-
- AnnotatedParallelSentence sent;
-
- sent.ParseInputLine(buf);
-
- //iterate over the alignment to compute aligned words
-
- for(int i =0;i<sent.aligned.width();i++)
- {
- for (int j=0;j<sent.aligned.height();j++)
- {
- if (DEBUG) cerr << sent.aligned(i,j) << " ";
- if( sent.aligned(i,j))
- {
- if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
- ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])];
- ++total_foreign[sent.f[i]];
- ++total_english[sent.e[j]];
- }
- }
- if (DEBUG) cerr << endl;
- }
- if (DEBUG) cerr << endl;
-
- static const WordID NULL_ = TD::Convert("NULL");
- //handle unaligned words - align them to null
- for (int j =0; j < sent.e_len; j++)
- {
- if (sent.e_aligned[j]) continue;
- ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])];
- ++total_foreign[NULL_];
- ++total_english[sent.e[j]];
- }
-
- for (int i =0; i < sent.f_len; i++)
- {
- if (sent.f_aligned[i]) continue;
- ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)];
- ++total_english[NULL_];
- ++total_foreign[sent.f[i]];
- }
-
-}
-
-
-inline float safenlog(float v) {
- if (v == 1.0f) return 0.0f;
- float res = -log(v);
- if (res > 100.0f) res = 100.0f;
- return res;
-}
-
-int main(int argc, char** argv){
- bool DEBUG= false;
- if (argc != 2) {
- cerr << "Usage: " << argv[0] << " corpus.al < filtered.grammar\n";
- return 1;
- }
- ifstream alignment (argv[1]);
- istream& unscored_grammar = cin;
- ostream& scored_grammar = cout;
-
- //create lexical translation table
- cerr << "Creating table..." << endl;
- char* buf = new char[MAX_LINE_LENGTH];
-
- LexTranslationTable table;
-
- while(!alignment.eof())
- {
- alignment.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
-
- table.createTTable(buf);
- }
-
- bool PRINT_TABLE=false;
- if (PRINT_TABLE)
- {
- ofstream trans_table;
- trans_table.open("lex_trans_table.out");
- for(map < pair<WordID,WordID>,int >::iterator it = table.word_translation.begin(); it != table.word_translation.end(); ++it)
- {
- trans_table << TD::Convert(it->first.first) << "|||" << TD::Convert(it->first.second) << "==" << it->second << "//" << table.total_foreign[it->first.first] << "//" << table.total_english[it->first.second] << endl;
- }
-
- trans_table.close();
- }
-
-
- //score unscored grammar
- cerr <<"Scoring grammar..." << endl;
-
-  ID2RuleStatistics cur_counts;
-  vector<WordID> cur_key;
- vector< pair<short,short> > al;
- vector< pair<short,short> >::iterator ita;
- int line = 0;
-
- static const int kCF = FD::Convert("CF");
- static const int kCE = FD::Convert("CE");
- static const int kCFE = FD::Convert("CFE");
-
- while(!unscored_grammar.eof())
- {
- ++line;
- unscored_grammar.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
- ParseLine(buf, &cur_key, &cur_counts);
-
- //loop over all the Target side phrases that this source aligns to
- for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it)
- {
-
- /*Compute phrase translation prob.
- Print out scores in this format:
-        Phrase translation prob P(F|E)
- Phrase translation prob P(E|F)
- Lexical weighting prob lex(F|E)
- Lexical weighting prob lex(E|F)
- */
-
- float pEF_ = it->second.counts.value(kCFE) / it->second.counts.value(kCF);
- float pFE_ = it->second.counts.value(kCFE) / it->second.counts.value(kCE);
-
- map <WordID, pair<int, float> > foreign_aligned;
- map <WordID, pair<int, float> > english_aligned;
-
- //Loop over all the alignment points to compute lexical translation probability
- al = it->second.aligns;
- for(ita = al.begin(); ita != al.end(); ++ita)
- {
-
- if (DEBUG)
- {
- cerr << "\nA:" << ita->first << "," << ita->second << "::";
- cerr << TD::Convert(cur_key[ita->first + 2]) << "-" << TD::Convert(it->first[ita->second]);
- }
-
-
- //Lookup this alignment probability in the table
- int temp = table.word_translation[pair<WordID,WordID> (cur_key[ita->first+2],it->first[ita->second])];
- float f2e=0, e2f=0;
- if ( table.total_foreign[cur_key[ita->first+2]] != 0)
- f2e = (float) temp / table.total_foreign[cur_key[ita->first+2]];
- if ( table.total_english[it->first[ita->second]] !=0 )
- e2f = (float) temp / table.total_english[it->first[ita->second]];
- if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f);
-
-
- //local counts to keep track of which things haven't been aligned, to later compute their null alignment
- if (foreign_aligned.count(cur_key[ita->first+2]))
- {
- foreign_aligned[ cur_key[ita->first+2] ].first++;
- foreign_aligned[ cur_key[ita->first+2] ].second += e2f;
- }
- else
- foreign_aligned [ cur_key[ita->first+2] ] = pair<int,float> (1,e2f);
-
-
-
- if (english_aligned.count( it->first[ ita->second] ))
- {
- english_aligned[ it->first[ ita->second ]].first++;
- english_aligned[ it->first[ ita->second] ].second += f2e;
- }
- else
- english_aligned [ it->first[ ita->second] ] = pair<int,float> (1,f2e);
-
-
-
-
- }
-
- float final_lex_f2e=1, final_lex_e2f=1;
- static const WordID NULL_ = TD::Convert("NULL");
-
- //compute lexical weight P(F|E) and include unaligned foreign words
- for(int i=0;i<cur_key.size(); i++)
- {
-
-        if (!table.total_foreign.count(cur_key[i])) continue; //if we don't have it in the translation table, we won't know its lexical weight
-
- if (foreign_aligned.count(cur_key[i]))
- {
- pair<int, float> temp_lex_prob = foreign_aligned[cur_key[i]];
- final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first;
- }
- else //dealing with null alignment
- {
- int temp_count = table.word_translation[pair<WordID,WordID> (cur_key[i],NULL_)];
- float temp_e2f = (float) temp_count / table.total_english[NULL_];
- final_lex_e2f *= temp_e2f;
- }
-
- }
-
- //compute P(E|F) unaligned english words
- for(int j=0; j< it->first.size(); j++)
- {
- if (!table.total_english.count(it->first[j])) continue;
-
- if (english_aligned.count(it->first[j]))
- {
- pair<int, float> temp_lex_prob = english_aligned[it->first[j]];
- final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first;
- }
- else //dealing with null
- {
- int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,it->first[j])];
- float temp_f2e = (float) temp_count / table.total_foreign[NULL_];
- final_lex_f2e *= temp_f2e;
- }
- }
-
-
- scored_grammar << TD::GetString(cur_key);
- string lhs = TD::Convert(cur_key[0]);
- scored_grammar << " " << TD::GetString(it->first) << " |||";
- if(lhs.find('_')!=string::npos) {
- scored_grammar << " Bkoff=" << safenlog(3.0f);
- } else {
- scored_grammar << " FGivenE=" << safenlog(pFE_) << " EGivenF=" << safenlog(pEF_);
- scored_grammar << " LexE2F=" << safenlog(final_lex_e2f) << " LexF2E=" << safenlog(final_lex_f2e);
- }
- scored_grammar << endl;
- }
- }
-}
-
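All four scores (FGivenE, EGivenF, LexE2F, LexF2E) go through safenlog, which clamps the negative log at 100 so that tiny products from the per-word lexical weights stay finite. A standalone check of its behavior:

#include <cmath>
#include <cstdio>

// Copy of the safenlog above: negative log, clamped at 100 so that
// near-zero probabilities don't produce unbounded feature values.
inline float safenlog(float v) {
  if (v == 1.0f) return 0.0f;
  float res = -std::log(v);
  if (res > 100.0f) res = 100.0f;
  return res;
}

int main() {
  std::printf("%g %g %g\n", safenlog(1.0f), safenlog(0.5f), safenlog(1e-60f));
  // 0, ~0.693, 100 (1e-60f underflows to 0 and the clamp kicks in)
  return 0;
}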
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
deleted file mode 100644
index 7d60715a..00000000
--- a/extools/sentence_pair.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-#include "sentence_pair.h"
-
-#include <queue>
-#include <iostream>
-#include <string>
-#include <vector>
-#include <utility>
-#include <set>
-#include <boost/tuple/tuple_comparison.hpp>
-
-#include "tdict.h"
-#include "wordid.h"
-#include "array2d.h"
-
-using namespace std;
-using namespace boost;
-
-namespace {
- inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
-
- inline void SkipWhitespace(const char* buf, int* ptr) {
- while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
- }
-}
-
-void AnnotatedParallelSentence::Reset() {
- f.clear();
- e.clear();
- e_aligned.clear();
- f_aligned.clear();
- aligns_by_fword.clear();
- aligned.clear();
- span_types.clear();
-}
-
-void AnnotatedParallelSentence::AllocateForAlignment() {
- f_len = f.size();
- e_len = e.size();
- aligned.resize(f_len, e_len, false);
- f_aligned.resize(f_len, 0);
- e_aligned.resize(e_len, 0);
- aligns_by_fword.resize(f_len);
-}
-
-// read an alignment point of the form X-Y where X and Y are strings
-// of digits; if c and d are non-NULL, a span pair X-Y-Z-W is read instead.
-// if permit_col is true, the right edge is determined
-// by the presence of a colon
-int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
- const int start,
- const int end,
- const bool permit_col,
- short* a, short* b, short* c, short* d) {
- if (end - start < 3) {
- cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- int ch = start;
- *a = 0;
- while(ch < end && buf[ch] != '-') {
- if (buf[ch] < '0' || buf[ch] > '9') {
- cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- (*a) *= 10;
- (*a) += buf[ch] - '0';
- ++ch;
- }
- ++ch;
- if (ch >= end) {
- cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- (*b) = 0;
-  while ((ch < end) && ((c == 0 && (!permit_col || buf[ch] != ':')) || (c != 0 && buf[ch] != '-'))) {
- if ((buf[ch] < '0') || (buf[ch] > '9')) {
- cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl << buf[ch] << endl;
- exit(1);
- }
- (*b) *= 10;
- (*b) += buf[ch] - '0';
- ++ch;
- }
- if (c != 0)
- {
- ++ch;
- if (ch >= end) {
- cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- (*c) = 0;
- while(ch < end && buf[ch] != '-') {
- if (buf[ch] < '0' || buf[ch] > '9') {
- cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- (*c) *= 10;
- (*c) += buf[ch] - '0';
- ++ch;
- }
- ++ch;
- if (ch >= end) {
- cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- (*d) = 0;
-    while(ch < end && (!permit_col || buf[ch] != ':')) {
- if (buf[ch] < '0' || buf[ch] > '9') {
- cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- (*d) *= 10;
- (*d) += buf[ch] - '0';
- ++ch;
- }
- }
- return ch;
-}
-
-void AnnotatedParallelSentence::Align(const short a, const short b) {
- aligned(a,b) = true;
- ++f_aligned[a];
- ++e_aligned[b];
- aligns_by_fword[a].push_back(make_pair(a,b));
- // cerr << a << " " << b << endl;
-}
-
-void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
- short a, b;
- ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0);
- if (a >= f_len || b >= e_len) {
- cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;
- exit(1);
- }
- Align(a,b);
-}
-
-void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
- short a,b,c,d;
- int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1;
- if (buf[ch-1] != ':' || ch >= end) {
- cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl;
- exit(1);
- }
- if (a >= f_len || b > f_len) {
- cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;
- exit(1);
- }
- if (c >= e_len || d > e_len) {
- cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;
- exit(1);
- }
- // cerr << a << " " << b << " " << string(buf,c,end-c) << endl;
- span_types[boost::make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch)));
-}
-
-// INPUT FORMAT
-// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP
-void AnnotatedParallelSentence::ParseInputLine(const char* buf) {
- Reset();
- int ptr = 0;
- SkipWhitespace(buf, &ptr);
- int start = ptr;
- int state = 0; // 0 = French, 1 = English, 2 = Alignment, 3 = Spans
- while(char c = buf[ptr]) {
- if (!IsWhitespace(c)) { ++ptr; continue; } else {
- if (ptr - start == 3 && buf[start] == '|' && buf[start+1] == '|' && buf[start+2] == '|') {
- ++state;
- if (state == 4) { cerr << "Too many fields (ignoring):\n " << buf << endl; return; }
- if (state == 2) {
- // cerr << "FLEN=" << f->size() << " ELEN=" << e->size() << endl;
- AllocateForAlignment();
- }
- SkipWhitespace(buf, &ptr);
- start = ptr;
- continue;
- }
- switch (state) {
- case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
- case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
- case 2: ParseAlignmentPoint(buf, start, ptr); break;
- case 3: ParseSpanLabel(buf, start, ptr); break;
- default: cerr << "Can't happen\n"; abort();
- }
- SkipWhitespace(buf, &ptr);
- start = ptr;
- }
- }
- if (ptr > start) {
- switch (state) {
- case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break;
- case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break;
- case 2: ParseAlignmentPoint(buf, start, ptr); break;
- case 3: ParseSpanLabel(buf, start, ptr); break;
- default: cerr << "Can't happen\n"; abort();
- }
- }
-}
-
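A sketch of driving the parser directly (assumes sentence_pair.h and the cdec utils are on the include path and linked; the span-label field is optional, so a three-field line is fine):

#include <iostream>
#include "sentence_pair.h"

int main() {
  AnnotatedParallelSentence sent;
  // f ||| e ||| alignment, as in the INPUT FORMAT comment above
  sent.ParseInputLine("ein haus ||| a house ||| 0-0 1-1");
  std::cout << "f_len=" << sent.f_len << " e_len=" << sent.e_len
            << " aligned(1,1)=" << sent.aligned(1,1) << std::endl;  // 2 2 1
  return 0;
}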
diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h
deleted file mode 100644
index a05275e7..00000000
--- a/extools/sentence_pair.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _SENTENCE_PAIR_H_
-#define _SENTENCE_PAIR_H_
-
-#include <map>
-#include <utility>
-#include <vector>
-#include <boost/tuple/tuple.hpp>
-#include "wordid.h"
-#include "array2d.h"
-
-// represents a parallel sentence with a word alignment and category
-// annotations over subspans (currently in terms of f)
-// you should read one using ParseInputLine and then use the public
-// member variables to query things about it
-struct AnnotatedParallelSentence {
- // read annotated parallel sentence from string
- void ParseInputLine(const char* buf);
-
- std::vector<WordID> f, e; // words in f and e
-
- // word alignment information
- std::vector<int> e_aligned, f_aligned; // counts the number of times column/row x is aligned
- Array2D<bool> aligned;
- std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;
-
- // span type information
- std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > span_types;
-  // span_types(i,j,k,l) is the list of categories labeling span (i,j) in the source and (k,l) in the target language.
-
- int f_len, e_len;
-
- void Align(const short a, const short b);
- void AllocateForAlignment();
-
- static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d);
-
- private:
- void Reset();
- void ParseAlignmentPoint(const char* buf, int start, int end);
- void ParseSpanLabel(const char* buf, int start, int end);
-};
-
-#endif
diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l
deleted file mode 100644
index c85cdea7..00000000
--- a/extools/sg_lexer.l
+++ /dev/null
@@ -1,294 +0,0 @@
-%{
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <cstring>
-#include <cassert>
-#include "tdict.h"
-#include "fdict.h"
-#include "striped_grammar.h"
-
-int lex_line = 0;
-int read_contexts = 0;
-std::istream* sglex_stream = NULL;
-StripedGrammarLexer::GrammarCallback grammar_callback = NULL;
-StripedGrammarLexer::ContextCallback context_callback = NULL;
-void* grammar_callback_extra = NULL;
-void* context_callback_extra = NULL;
-
-#undef YY_INPUT
-#define YY_INPUT(buf, result, max_size) (result = sglex_stream->read(buf, max_size).gcount())
-
-#define YY_SKIP_YYWRAP 1
-int num_rules = 0;
-int yywrap() { return 1; }
-bool fl = true;
-#define MAX_TOKEN_SIZE 255
-std::string sglex_tmp_token(MAX_TOKEN_SIZE, '\0');
-
-#define MAX_RULE_SIZE 48
-WordID sglex_src_rhs[MAX_RULE_SIZE];
-WordID sglex_trg_rhs[MAX_RULE_SIZE];
-int sglex_src_rhs_size;
-int sglex_trg_rhs_size;
-WordID sglex_lhs;
-int sglex_src_arity;
-int sglex_trg_arity;
-
-#define MAX_FEATS 100
-int sglex_feat_ids[MAX_FEATS];
-double sglex_feat_vals[MAX_FEATS];
-int sglex_num_feats;
-
-#define MAX_ARITY 20
-int sglex_nt_sanity[MAX_ARITY];
-int sglex_src_nts[MAX_ARITY];
-float sglex_nt_size_means[MAX_ARITY];
-float sglex_nt_size_vars[MAX_ARITY];
-
-std::vector<WordID> cur_src_rhs;
-std::vector<WordID> cur_trg_rhs;
-ID2RuleStatistics cur_options;
-RuleStatistics* cur_stats = NULL;
-int sglex_cur_fid = 0;
-
-static void sanity_check_trg_index(int index) {
- if (index > sglex_src_arity) {
- std::cerr << "Target index " << index << " exceeds source arity " << sglex_src_arity << std::endl;
- abort();
- }
- int& flag = sglex_nt_sanity[index - 1];
- if (flag) {
- std::cerr << "Target index " << index << " used multiple times!" << std::endl;
- abort();
- }
- flag = 1;
-}
-
-static void sglex_reset() {
- sglex_src_arity = 0;
- sglex_trg_arity = 0;
- sglex_num_feats = 0;
- sglex_src_rhs_size = 0;
- sglex_trg_rhs_size = 0;
-}
-
-%}
-
-REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf
-NT [^\t \[\],]+
-ALIGN [0-9]+-[0-9]+
-
-%x LHS_END SRC TRG FEATS FEATVAL ALIGNS
-%%
-
-<INITIAL>[ ] ;
-<INITIAL>[\t] {
- if (read_contexts) {
- cur_options.clear();
- BEGIN(TRG);
- } else {
- std::cerr << "Unexpected tab while reading striped grammar\n";
- exit(1);
- }
- }
-
-<INITIAL>\[{NT}\] {
- if (read_contexts) {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- } else {
- sglex_tmp_token.assign(yytext + 1, yyleng - 2);
- sglex_lhs = -TD::Convert(sglex_tmp_token);
- // std::cerr << sglex_tmp_token << "\n";
- BEGIN(LHS_END);
- }
- }
-
-<INITIAL>[^ \t]+ {
- if (read_contexts) {
- // std::cerr << "Context: " << yytext << std::endl;
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- } else {
- std::cerr << "Unexpected input: " << yytext << " when NT expected\n";
- exit(1);
- }
- }
-
-<SRC>\[{NT}\] {
- sglex_tmp_token.assign(yytext + 1, yyleng - 2);
- sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token);
- ++sglex_src_arity;
- ++sglex_src_rhs_size;
- }
-
-<LHS_END>[ ] { ; }
-<LHS_END>\|\|\| {
- sglex_reset();
- BEGIN(SRC);
- }
-
-<LHS_END>. {
- std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl;
- exit(1);
- }
-
-
-<SRC>\[{NT},[1-9][0-9]?\] {
- int index = yytext[yyleng - 2] - '0';
- if (yytext[yyleng - 3] == ',') {
- sglex_tmp_token.assign(yytext + 1, yyleng - 4);
- } else {
- sglex_tmp_token.assign(yytext + 1, yyleng - 5);
- index += 10 * (yytext[yyleng - 3] - '0');
- }
- if ((sglex_src_arity+1) != index) {
- std::cerr << "Src indices must go in order: expected " << sglex_src_arity << " but got " << index << std::endl;
- abort();
- }
- sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- ++sglex_src_arity;
- }
-
-<SRC>[^ \t]+ {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- }
-<SRC>[ ] { ; }
-<SRC>\t {
- //std::cerr << "LHS=" << TD::Convert(-sglex_lhs) << " ";
- //std::cerr << " src_size: " << sglex_src_rhs_size << std::endl;
- //std::cerr << " src_arity: " << sglex_src_arity << std::endl;
- cur_options.clear();
- memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int));
- sglex_trg_rhs_size = 0;
- BEGIN(TRG);
- }
-
-<TRG>\[[1-9][0-9]?\] {
- if (read_contexts) {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_trg_rhs_size;
- } else {
- int index = yytext[yyleng - 2] - '0';
- if (yyleng == 4) {
- index += 10 * (yytext[yyleng - 3] - '0');
- }
- ++sglex_trg_arity;
- sanity_check_trg_index(index);
- sglex_trg_rhs[sglex_trg_rhs_size] = 1 - index;
- ++sglex_trg_rhs_size;
- }
-}
-
-<TRG>\|\|\| {
- //std::cerr << " trg_size: " << sglex_trg_rhs_size << std::endl;
- //std::cerr << " trg_arity: " << sglex_trg_arity << std::endl;
- assert(sglex_trg_rhs_size > 0);
- cur_trg_rhs.resize(sglex_trg_rhs_size);
- for (int i = 0; i < sglex_trg_rhs_size; ++i)
- cur_trg_rhs[i] = sglex_trg_rhs[i];
- cur_stats = &cur_options[cur_trg_rhs];
- BEGIN(FEATS);
- }
-
-<TRG>[^ ]+ {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token);
-
- ++sglex_trg_rhs_size;
- }
-<TRG>[ ]+ { ; }
-
-<FEATS>\n {
- assert(sglex_src_rhs_size > 0);
- cur_src_rhs.resize(sglex_src_rhs_size);
- for (int i = 0; i < sglex_src_rhs_size; ++i)
- cur_src_rhs[i] = sglex_src_rhs[i];
- if (read_contexts) {
- context_callback(cur_src_rhs, cur_options, context_callback_extra);
- } else {
- assert(sglex_lhs < 0);
- grammar_callback(sglex_lhs, cur_src_rhs, cur_options, grammar_callback_extra);
- }
- cur_options.clear();
- sglex_reset();
- BEGIN(INITIAL);
- }
-<FEATS>[ ]+ { ; }
-<FEATS>\|\|\| {
- memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int));
- sglex_trg_rhs_size = 0;
- BEGIN(TRG);
- }
-<FEATS>[A-Z][A-Z_0-9]*= {
- // std::cerr << "FV: " << yytext << std::endl;
- sglex_tmp_token.assign(yytext, yyleng - 1);
- sglex_cur_fid = FD::Convert(sglex_tmp_token);
- static const int Afid = FD::Convert("A");
- if (sglex_cur_fid == Afid) {
- BEGIN(ALIGNS);
- } else {
- BEGIN(FEATVAL);
- }
- }
-<FEATVAL>{REAL} {
- // std::cerr << "Feature val input: " << yytext << std::endl;
- cur_stats->counts.add_value(sglex_cur_fid, strtod(yytext, NULL));
- BEGIN(FEATS);
- }
-<FEATVAL>. {
- std::cerr << "Feature val unexpected input: " << yytext << std::endl;
- exit(1);
- }
-<FEATS>. {
- std::cerr << "Features unexpected input: " << yytext << std::endl;
- exit(1);
- }
-<ALIGNS>{ALIGN}(,{ALIGN})* {
- assert(cur_stats->aligns.empty());
- int i = 0;
- while(i < yyleng) {
- short a = 0;
- short b = 0;
- while (yytext[i] != '-') { a *= 10; a += yytext[i] - '0'; ++i; }
- ++i;
-      while (i < yyleng && yytext[i] != ',') { b *= 10; b += yytext[i] - '0'; ++i; }
- ++i;
- cur_stats->aligns.push_back(std::make_pair(a,b));
- }
- BEGIN(FEATS);
- }
-<ALIGNS>. {
- std::cerr << "Aligns unexpected input: " << yytext << std::endl;
- exit(1);
- }
-%%
-
-#include "filelib.h"
-
-void StripedGrammarLexer::ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra) {
- read_contexts = 0;
- lex_line = 1;
- sglex_stream = in;
- grammar_callback_extra = extra;
- grammar_callback = func;
- yylex();
-}
-
-void StripedGrammarLexer::ReadContexts(std::istream* in, ContextCallback func, void* extra) {
- read_contexts = 1;
- lex_line = 1;
- sglex_stream = in;
- context_callback_extra = extra;
- context_callback = func;
- yylex();
-}
-
-
diff --git a/extools/simple-extract-context.sh b/extools/simple-extract-context.sh
deleted file mode 100755
index 17487b1c..00000000
--- a/extools/simple-extract-context.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-MYDIR=$(dirname $0)
-
-export LANG=C
-date 1>&2
-$MYDIR/extractor -i $1 -c 500000 -L 12 -C | sort -t $'\t' -k 1 | $MYDIR/mr_stripe_rule_reduce
-date 1>&2
-
diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh
deleted file mode 100755
index ec5c5276..00000000
--- a/extools/simple-extract.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-export LANG=C
-date
-./extractor -i $1 -d X -c 500000 -L 12 -b | sort -t $'\t' -k 1 | gzip > ex.output.gz
-date
-# -p = compute phrase marginals
-# -b = bidirectional rules (starting with F or E) were extracted
-zcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
-date
-
diff --git a/extools/striped_grammar.cc b/extools/striped_grammar.cc
deleted file mode 100644
index 785f4bbe..00000000
--- a/extools/striped_grammar.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "striped_grammar.h"
-
-#include <iostream>
-
-#include "sentence_pair.h"
-
-using namespace std;
-
-namespace {
- inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
-
- inline void SkipWhitespace(const char* buf, int* ptr) {
- while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
- }
-}
-
-void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) {
- int ptr = start;
- counts.clear();
- aligns.clear();
- while (ptr < end) {
- SkipWhitespace(buf, &ptr);
- int vstart = ptr;
- while(ptr < end && buf[ptr] != '=') ++ptr;
- assert(buf[ptr] == '=');
- assert(ptr > vstart);
- if (buf[vstart] == 'A' && buf[vstart+1] == '=') {
- ++ptr;
- while (ptr < end && !IsWhitespace(buf[ptr])) {
- while(ptr < end && buf[ptr] == ',') { ++ptr; }
- assert(ptr < end);
- vstart = ptr;
- while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; }
- if (ptr > vstart) {
- short a, b;
- AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b, 0, 0);
- aligns.push_back(make_pair(a,b));
- }
- }
- } else {
- int name = FD::Convert(string(buf,vstart,ptr-vstart));
- ++ptr;
- vstart = ptr;
- while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
- assert(ptr > vstart);
- counts.set_value(name, strtod(buf + vstart, NULL));
- }
- }
-}
-
-ostream& operator<<(ostream& os, const RuleStatistics& s) {
- bool needspace = false;
- for (SparseVector<float>::const_iterator it = s.counts.begin(); it != s.counts.end(); ++it) {
- if (needspace) os << ' '; else needspace = true;
- os << FD::Convert(it->first) << '=' << it->second;
- }
- if (s.aligns.size() > 0) {
- os << " A=";
- needspace = false;
- for (int i = 0; i < s.aligns.size(); ++i) {
- if (needspace) os << ','; else needspace = true;
- os << s.aligns[i].first << '-' << s.aligns[i].second;
- }
- }
- return os;
-}
-
diff --git a/extools/striped_grammar.h b/extools/striped_grammar.h
deleted file mode 100644
index bf3aec7d..00000000
--- a/extools/striped_grammar.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _STRIPED_GRAMMAR_H_
-#define _STRIPED_GRAMMAR_H_
-
-#include <iostream>
-#include <boost/functional/hash.hpp>
-#include <vector>
-#include <tr1/unordered_map>
-#include "sparse_vector.h"
-#include "wordid.h"
-#include "tdict.h"
-
-// represents statistics / information about a rule pair
-struct RuleStatistics {
- SparseVector<float> counts;
- std::vector<std::pair<short,short> > aligns;
- RuleStatistics() {}
- RuleStatistics(int name, float val, const std::vector<std::pair<short,short> >& al) :
- aligns(al) {
- counts.set_value(name, val);
- }
- void ParseRuleStatistics(const char* buf, int start, int end);
- RuleStatistics& operator+=(const RuleStatistics& rhs) {
- counts += rhs.counts;
- return *this;
- }
-};
-std::ostream& operator<<(std::ostream& os, const RuleStatistics& s);
-
-inline void WriteNamed(const std::vector<WordID>& v, std::ostream* os) {
- bool first = true;
- for (int i = 0; i < v.size(); ++i) {
- if (first) { first = false; } else { (*os) << ' '; }
- if (v[i] < 0) { (*os) << '[' << TD::Convert(-v[i]) << ']'; }
- else (*os) << TD::Convert(v[i]);
- }
-}
-
-inline void WriteAnonymous(const std::vector<WordID>& v, std::ostream* os) {
- bool first = true;
- for (int i = 0; i < v.size(); ++i) {
- if (first) { first = false; } else { (*os) << ' '; }
- if (v[i] <= 0) { (*os) << '[' << (1-v[i]) << ']'; }
- else (*os) << TD::Convert(v[i]);
- }
-}
-
-typedef std::tr1::unordered_map<std::vector<WordID>, RuleStatistics, boost::hash<std::vector<WordID> > > ID2RuleStatistics;
-
-struct StripedGrammarLexer {
- typedef void (*GrammarCallback)(WordID lhs, const std::vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void *extra);
- static void ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra);
- typedef void (*ContextCallback)(const std::vector<WordID>& phrase, const ID2RuleStatistics& rules, void *extra);
- static void ReadContexts(std::istream* in, ContextCallback func, void* extra);
-};
-
-#endif
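Each striped line is "key TAB trg ||| stats ||| trg ||| stats ...", and the lexer hands one whole stripe to a C-style callback at a time. A sketch of a minimal consumer (assumes the extools sources and cdec utils are built alongside it):

// Sketch: count the rules in a striped grammar read from stdin
// using the callback API declared above.
#include <iostream>
#include "striped_grammar.h"

static void CountRule(WordID /*lhs*/, const std::vector<WordID>& /*src_rhs*/,
                      const ID2RuleStatistics& rules, void* extra) {
  *static_cast<size_t*>(extra) += rules.size();
}

int main() {
  size_t n = 0;
  StripedGrammarLexer::ReadStripedGrammar(&std::cin, CountRule, &n);
  std::cerr << "rules: " << n << std::endl;
  return 0;
}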
diff --git a/extools/suffix_tree.h b/extools/suffix_tree.h
deleted file mode 100644
index f62f53f4..00000000
--- a/extools/suffix_tree.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * suffix_tree.h
- *
- * Created on: May 17, 2010
- * Author: Vlad
-
-NOTE (graehl): this seems to be a (forward) trie of the suffixes (of sentences).
-so O(m*n^2) for m sentences of length n.
-
-For a real suffix tree (linear size/time), see:
-http://en.wikipedia.org/wiki/Suffix_tree
-http://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
-
- */
-
-#ifndef SUFFIX_TREE_H_
-#define SUFFIX_TREE_H_
-
-#include <string>
-#include <map>
-#include <vector>
-
-template <class T>
-class Node {
- public:
- std::map<T, Node> edge_list_;
- int InsertPath(const std::vector<T>& p, int start, int end);
- const Node* Extend(const T& e) const {
- typename std::map<T, Node>::const_iterator it = edge_list_.find(e);
- if (it == edge_list_.end()) return NULL;
- return &it->second;
- }
-};
-
-static bool DEBUG = false;
-
-template <class T>
-int Node<T>::InsertPath(const std::vector<T>& p, int start, int end){
- Node* currNode = this;
- for(int i=start;i<= end; i++ ) {
- currNode = &(currNode->edge_list_)[p[i]];
- }
- return 1;
-}
-
-#endif /* SUFFIX_TREE_H_ */
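As the note above says, this is a forward trie over sentence suffixes rather than a true linear-size suffix tree, which is all filter_grammar needs. A toy usage sketch (assumes this header):

#include <cstdio>
#include <vector>
#include "suffix_tree.h"

int main() {
  Node<int> root;
  std::vector<int> sent;
  sent.push_back(7); sent.push_back(8); sent.push_back(9);
  // insert every suffix, as DumbSuffixTreeFilter does
  for (int i = 0; i < (int)sent.size(); ++i)
    root.InsertPath(sent, i, (int)sent.size() - 1);
  const Node<int>* n = root.Extend(8);
  std::printf("8 reachable: %s\n", n ? "yes" : "no");          // yes
  if (n) std::printf("8->9 reachable: %s\n", n->Extend(9) ? "yes" : "no");  // yes
  return 0;
}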
diff --git a/extools/test_data/README b/extools/test_data/README
deleted file mode 100644
index e368cffc..00000000
--- a/extools/test_data/README
+++ /dev/null
@@ -1,10 +0,0 @@
-The following was used to create the test data. The real inputs
-were corpus.fr, corpus.en, and corpus.aligned. The generated files
-were corpus.len_cats and fr-en.al.len.
-
-
- ./make_len_cats.pl corpus.en > corpus.len_cats
-
- ../merge_lines.pl corpus.fr corpus.en corpus.aligned corpus.len_cats > fr-en.al.len
-
-
diff --git a/extools/test_data/corpus.aligned b/extools/test_data/corpus.aligned
deleted file mode 100644
index aa09e9ab..00000000
--- a/extools/test_data/corpus.aligned
+++ /dev/null
@@ -1,5 +0,0 @@
-0-0 1-2 2-1
-0-0 1-1
-0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3
-0-0
-0-0 1-1
diff --git a/extools/test_data/corpus.en b/extools/test_data/corpus.en
deleted file mode 100644
index 2d4751bf..00000000
--- a/extools/test_data/corpus.en
+++ /dev/null
@@ -1,5 +0,0 @@
-the blue house
-the hat
-there is a hat
-cap
-the cat
diff --git a/extools/test_data/corpus.fr b/extools/test_data/corpus.fr
deleted file mode 100644
index 75b5e127..00000000
--- a/extools/test_data/corpus.fr
+++ /dev/null
@@ -1,5 +0,0 @@
-la maison bleue
-le chapeau
-il y a un chapeau
-chapeau
-le chat
diff --git a/extools/test_data/corpus.len_cats b/extools/test_data/corpus.len_cats
deleted file mode 100644
index 18d321de..00000000
--- a/extools/test_data/corpus.len_cats
+++ /dev/null
@@ -1,5 +0,0 @@
-0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
-0-1:SHORT 0-2:SHORT 1-2:SHORT
-0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
-0-1:SHORT
-0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/fr-en.al.len b/extools/test_data/fr-en.al.len
deleted file mode 100644
index 7ee6b85d..00000000
--- a/extools/test_data/fr-en.al.len
+++ /dev/null
@@ -1,5 +0,0 @@
-la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
-le chapeau ||| the hat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
-il y a un chapeau ||| there is a hat ||| 0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
-chapeau ||| cap ||| 0-0 ||| 0-1:SHORT
-le chat ||| the cat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/make_len_cats.pl b/extools/test_data/make_len_cats.pl
deleted file mode 100755
index 25ef75fa..00000000
--- a/extools/test_data/make_len_cats.pl
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $max_len = 15;
-my @cat_names = qw( NULL SHORT SHORT MID MID MID LONG LONG LONG LONG LONG VLONG VLONG VLONG VLONG VLONG );
-
-while(<>) {
- chomp;
- my @words = split /\s+/;
- my $len = scalar @words;
- my @spans;
- for (my $i =0; $i < $len; $i++) {
- for (my $k = 1; $k <= $max_len; $k++) {
- my $j = $i + $k;
- next if ($j > $len);
- my $cat = $cat_names[$k];
- die unless $cat;
- push @spans, "$i-$j:$cat";
- }
- }
- print "@spans\n";
-}
-