author     Chris Dyer <cdyer@cs.cmu.edu>    2012-08-12 23:33:21 -0400
committer  Chris Dyer <cdyer@cs.cmu.edu>    2012-08-12 23:33:21 -0400
commit     da176941c1f481f14e93bd7d055cc29cac0ea8c8 (patch)
tree       c7ec8c0f75b386e6ca6d37da830e5a2e369b1cca
parent     4760209baa483403db3bcb9bf1a32ae87a7b576d (diff)
use new union api
33 files changed, 75 insertions, 3550 deletions
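The call-site change is mechanical: the old `Hypergraph::Union` member function is replaced by a free function in the `HG` namespace that takes the destination as an explicit output parameter. A minimal sketch of the new calling convention, using a stub `Hypergraph` so the fragment stands alone (cdec's real class lives in decoder/hg.h):

    #include <vector>

    // Stand-in for cdec's Hypergraph (decoder/hg.h); real fields elided.
    struct Hypergraph { std::vector<int> nodes_, edges_; };

    namespace HG {
      // New-style API: input is const, destination is an explicit out-parameter.
      inline void Union(const Hypergraph& in, Hypergraph* out) {
        (void)in; (void)out;  // real body is in decoder/hg_union.cc (below)
      }
    }

    int main() {
      Hypergraph forest, new_hg;
      // old (removed): new_hg.Union(forest);
      HG::Union(forest, &new_hg);  // new union API
    }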
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 0a792549..4a98a4f1 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -44,6 +44,7 @@ libcdec_a_SOURCES = \
   hg_remove_eps.cc \
   decoder.cc \
   hg_intersect.cc \
+  hg_union.cc \
   hg_sampler.cc \
   factored_lexicon_helper.cc \
   viterbi.cc \
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index a6f7b1ce..a69a6d05 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -24,6 +24,7 @@
 #include "hg.h"
 #include "sentence_metadata.h"
 #include "hg_intersect.h"
+#include "hg_union.h"
 #include "oracle_bleu.h"
 #include "apply_models.h"
 
@@ -980,7 +981,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg);
       if (!succeeded) abort();
     }
-    new_hg.Union(forest);
+    HG::Union(forest, &new_hg);
     bool succeeded = writer.Write(new_hg, false);
     if (!succeeded) abort();
   } else {
@@ -1067,7 +1068,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg);
       if (!succeeded) abort();
     }
-    new_hg.Union(forest);
+    HG::Union(forest, &new_hg);
     bool succeeded = writer.Write(new_hg, false);
     if (!succeeded) abort();
   } else {
diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc
index 92ed98b2..c47af850 100644
--- a/decoder/hg_test.cc
+++ b/decoder/hg_test.cc
@@ -6,6 +6,7 @@
 
 #include "json_parse.h"
 #include "hg_intersect.h"
+#include "hg_union.h"
 #include "viterbi.h"
 #include "kbest.h"
 #include "inside_outside.h"
@@ -52,7 +53,7 @@ BOOST_AUTO_TEST_CASE(Union) {
   int l2 = ViterbiPathLength(hg2);
   cerr << c1 << "\t" << TD::GetString(t1) << endl;
   cerr << c2 << "\t" << TD::GetString(t2) << endl;
-  hg1.Union(hg2);
+  HG::Union(hg2, &hg1);
   hg1.Reweight(wts);
   c3 = ViterbiESentence(hg1, &t3);
   int l3 = ViterbiPathLength(hg1);
@@ -121,8 +122,8 @@ BOOST_AUTO_TEST_CASE(InsideScore) {
   vector<prob_t> post;
   inside = hg.ComputeBestPathThroughEdges(&post);
   BOOST_CHECK_CLOSE(-0.3, log(inside), 1e-4);  // computed by hand
-  BOOST_CHECK_EQUAL(post.size(), 4);
-  for (int i = 0; i < 4; ++i) {
+  BOOST_CHECK_EQUAL(post.size(), 5);
+  for (int i = 0; i < 5; ++i) {
     cerr << "edge post: " << log(post[i]) << '\t' << hg.edges_[i].rule_->AsString() << endl;
   }
 }
diff --git a/decoder/hg_union.cc b/decoder/hg_union.cc
new file mode 100644
index 00000000..37082976
--- /dev/null
+++ b/decoder/hg_union.cc
@@ -0,0 +1,58 @@
+#include "hg_union.h"
+
+#include "hg.h"
+
+using namespace std;
+
+namespace HG {
+
+void Union(const Hypergraph& in, Hypergraph* out) {
+  if (&in == out) return;
+  if (out->nodes_.empty()) {
+    out->nodes_ = in.nodes_;
+    out->edges_ = in.edges_; return;
+  }
+  unsigned noff = out->nodes_.size();
+  unsigned eoff = out->edges_.size();
+  int ogoal = in.nodes_.size() - 1;
+  int cgoal = noff - 1;
+  // keep a single goal node, so add nodes.size - 1
+  out->nodes_.resize(out->nodes_.size() + ogoal);
+  // add all edges
+  out->edges_.resize(out->edges_.size() + in.edges_.size());
+
+  for (int i = 0; i < ogoal; ++i) {
+    const Hypergraph::Node& on = in.nodes_[i];
+    Hypergraph::Node& cn = out->nodes_[i + noff];
+    cn.id_ = i + noff;
+    cn.in_edges_.resize(on.in_edges_.size());
+    for (unsigned j = 0; j < on.in_edges_.size(); ++j)
+      cn.in_edges_[j] = on.in_edges_[j] + eoff;
+
+    cn.out_edges_.resize(on.out_edges_.size());
+    for (unsigned j = 0; j < on.out_edges_.size(); ++j)
+      cn.out_edges_[j] = on.out_edges_[j] + eoff;
+  }
+
+  for (unsigned i = 0; i < in.edges_.size(); ++i) {
+    const Hypergraph::Edge& oe = in.edges_[i];
+    Hypergraph::Edge& ce = out->edges_[i + eoff];
+    ce.id_ = i + eoff;
+    ce.rule_ = oe.rule_;
+    ce.feature_values_ = oe.feature_values_;
+    if (oe.head_node_ == ogoal) {
+      ce.head_node_ = cgoal;
+      out->nodes_[cgoal].in_edges_.push_back(ce.id_);
+    } else {
+      ce.head_node_ = oe.head_node_ + noff;
+    }
+    ce.tail_nodes_.resize(oe.tail_nodes_.size());
+    for (unsigned j = 0; j < oe.tail_nodes_.size(); ++j)
+      ce.tail_nodes_[j] = oe.tail_nodes_[j] + noff;
+  }
+
+  out->TopologicallySortNodesAndEdges(cgoal);
+}
+
+}
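The core of the new implementation is an index-offset merge: the nodes and edges of `in` are appended to `out` with their ids shifted by `noff`/`eoff`, and `in`'s goal node is folded into `out`'s existing goal rather than copied, so the result keeps a single goal. A self-contained sketch of the same bookkeeping on a toy digraph, not cdec's types (`Graph` and `GraphUnion` are illustrative names):

    #include <iostream>
    #include <utility>
    #include <vector>

    // Toy graph: edge = (head, tail) node ids; the last node is the goal.
    struct Graph {
      int num_nodes;
      std::vector<std::pair<int, int> > edges;
    };

    // Append `in` to `out`, offsetting ids and sharing one goal node,
    // mirroring the offset logic in HG::Union above (illustration only).
    void GraphUnion(const Graph& in, Graph* out) {
      const int noff = out->num_nodes;       // node id offset
      const int in_goal = in.num_nodes - 1;  // in's goal is not copied...
      const int out_goal = noff - 1;         // ...it maps onto out's goal
      out->num_nodes += in.num_nodes - 1;
      for (size_t i = 0; i < in.edges.size(); ++i) {
        int head = in.edges[i].first;
        int tail = in.edges[i].second;
        head = (head == in_goal) ? out_goal : head + noff;
        tail = (tail == in_goal) ? out_goal : tail + noff;
        out->edges.push_back(std::make_pair(head, tail));
      }
    }

    int main() {
      Graph a = {3, {{2, 0}, {2, 1}}};  // nodes 0,1 feed goal node 2
      Graph b = {2, {{1, 0}}};          // node 0 feeds goal node 1
      GraphUnion(b, &a);
      for (size_t i = 0; i < a.edges.size(); ++i)
        std::cout << a.edges[i].second << " -> " << a.edges[i].first << "\n";
      // b's node 0 becomes node 3; b's goal is merged into a's goal (node 2).
    }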
diff --git a/decoder/hg_union.h b/decoder/hg_union.h
new file mode 100644
index 00000000..34624246
--- /dev/null
+++ b/decoder/hg_union.h
@@ -0,0 +1,9 @@
+#ifndef _HG_UNION_H_
+#define _HG_UNION_H_
+
+class Hypergraph;
+namespace HG {
+  void Union(const Hypergraph& in, Hypergraph* out);
+};
+
+#endif
diff --git a/extools/Makefile.am b/extools/Makefile.am
deleted file mode 100644
index ee363264..00000000
--- a/extools/Makefile.am
+++ /dev/null
@@ -1,30 +0,0 @@
-bin_PROGRAMS = \
-  extractor \
-  mr_stripe_rule_reduce \
-  filter_grammar \
-  featurize_grammar \
-  extractor_monolingual
-
-noinst_PROGRAMS =
-
-sg_lexer.cc: sg_lexer.l
-	$(LEX) -s -CF -8 -o$@ $<
-
-filter_grammar_SOURCES = filter_grammar.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc
-filter_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz
-#filter_grammar_LDFLAGS = -all-static
-
-featurize_grammar_SOURCES = featurize_grammar.cc extract.cc sentence_pair.cc sg_lexer.cc striped_grammar.cc
-featurize_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc
-mr_stripe_rule_reduce_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc striped_grammar.cc
-extractor_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-extractor_monolingual_SOURCES = extractor_monolingual.cc
-extractor_monolingual_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils
-
diff --git a/extools/README b/extools/README
deleted file mode 100644
index af91ce79..00000000
--- a/extools/README
+++ /dev/null
@@ -1,32 +0,0 @@
-
-Categories have the format i-j:CAT where i and j are the indices of the spaces
-between words in the TARGET language. For example, slash categories can be written:
-
-  the    blue    house
- 0-1:DT 1-2:JJ 2-3:NN 1-3:NBAR 0-2:NP/NN 0-3:NP
-
-
-You may multiply label each span, e.g.
-
-  NP
-  |
- NBAR
-  |
-  NN
-  |
- John
- 0-1:NP 0-1:NBAR 0-1:NP
-
-However, this may result in a very large number of rules being extracted.
-
-
-****
-* Filtering and Scoring of Unscored and Unfiltered Grammars
-****
-
-Take the unfiltered grammar, and a test set, and run:
-./filter_grammar <test set> < unfiltered.grammar > filter.grammar
-
-Then, to score the new filtered grammar, run:
-./score_grammar <alignment> < filtered.grammar > scored.grammar
-
diff --git a/extools/coarsen_grammar.pl b/extools/coarsen_grammar.pl
deleted file mode 100755
index f2dd6689..00000000
--- a/extools/coarsen_grammar.pl
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/perl
-
-# dumb grammar coarsener that maps every nonterminal to X (except S).
-
-use strict;
-
-unless (@ARGV > 1){
-  die "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
-}
-my $weight_file = shift @ARGV;
-
-$ENV{"LC_ALL"} = "C";
-local(*GRAMMAR, *OUT_GRAMMAR, *WEIGHTS);
-
-my %weights;
-unless (open(WEIGHTS, $weight_file)) {die "Could not open weight file $weight_file\n" }
-while (<WEIGHTS>){
-  if (/(.+) (.+)$/){
-    $weights{$1} = $2;
-  }
-}
-close(WEIGHTS);
-unless (keys(%weights)){
-  die "Could not find any PhraseModel features in weight file (perhaps you specified the wrong file?)\n\n".
-      "Usage: $0 <weight file> <grammar file> [<grammar file> ... <grammar file>] \n";
-}
-
-sub cleanup_and_die;
-$SIG{INT} = "cleanup_and_die";
-$SIG{TERM} = "cleanup_and_die";
-$SIG{HUP} = "cleanup_and_die";
-
-open(OUT_GRAMMAR, ">grammar.tmp");
-while (my $grammar_file = shift @ARGV){
-  unless (open(GRAMMAR, $grammar_file)) {die "Could not open grammar file $grammar_file\n"}
-  while (<GRAMMAR>){
-    if (/^((.*\|{3}){3})(.*)$/){
-      my $rule = $1;
-      my $rest = $3;
-      my $coarse_rule = $rule;
-      $coarse_rule =~ s/\[X[^\],]*/[X/g;
-      print OUT_GRAMMAR "$coarse_rule $rule $rest\n";
-    } else {
-      die "Unrecognized rule format: $_\n";
-    }
-  }
-  close(GRAMMAR);
-}
-close(OUT_GRAMMAR);
-
-`sort grammar.tmp > grammar.tmp.sorted`;
-sub dump_rules;
-sub compute_score;
-unless (open(GRAMMAR, "grammar.tmp.sorted")){ die "Something went wrong; could not open intermediate file grammar.tmp.sorted\n"};
-my $prev_coarse_rule = "";
-my $best_features = "";
-my $best_score = 0;
-my @rules = ();
-while (<GRAMMAR>){
-  if (/^\s*((\S.*\|{3}\s*){3})((\S.*\|{3}\s*){3})(.*)$/){
-    my $coarse_rule = $1;
-    my $fine_rule = $3;
-    my $features = $5;  # This code does not correctly handle rules with other info (e.g. alignments)
-    if ($coarse_rule eq $prev_coarse_rule){
-      my $score = compute_score($features, %weights);
-      if ($score > $best_score){
-        $best_score = $score;
-        $best_features = $features;
-      }
-    } else {
-      dump_rules($prev_coarse_rule, $best_features, @rules);
-      $prev_coarse_rule = $coarse_rule;
-      $best_features = $features;
-      $best_score = compute_score($features, %weights);
-      @rules = ();
-    }
-    push(@rules, "$fine_rule$features\n");
-  } else {
-    die "Something went wrong during grammar projection: $_\n";
-  }
-}
-dump_rules($prev_coarse_rule, $best_features, @rules);
-close(GRAMMAR);
-cleanup();
-
-sub compute_score {
-  my($features, %weights) = @_;
-  my $score = 0;
-  if ($features =~ s/^\s*(\S.*\S)\s*$/$1/) {
-    my @features = split(/\s+/, $features);
-    my $pm=0;
-    for my $feature (@features) {
-      my $feature_name;
-      my $feature_val;
-      if ($feature =~ /(.*)=(.*)/){
-        $feature_name = $1;
-        $feature_val= $2;
-      } else {
-        $feature_name = "PhraseModel_" . $pm;
-        $feature_val= $feature;
-      }
-      $pm++;
-      if ($weights{$feature_name}){
-        $score += $weights{$feature_name} * $feature_val;
-      }
-    }
-  } else {
-    die "Unexpected feature value format: $features\n";
-  }
-  return $score;
-}
-
-sub dump_rules {
-  my($coarse_rule, $coarse_rule_scores, @fine_rules) = @_;
-  unless($coarse_rule){ return; }
-  print "$coarse_rule $coarse_rule_scores\n";
-  for my $rule (@fine_rules){
-    print "\t$rule";
-  }
-}
-
-sub cleanup_and_die {
-  cleanup();
-  die "\n";
-}
-
-sub cleanup {
-  `rm -rf grammar.tmp grammar.tmp.sorted`;
-}
-
-
-
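The script's compute_score is a sparse dot product between a rule's feature string and the weight vector, with unnamed features falling back to positional names PhraseModel_0, PhraseModel_1, and so on. A hedged C++ restatement of that scoring rule (ComputeScore is an illustrative name; parsing conventions are taken from the Perl above):

    #include <cstdlib>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    // score = sum_i w[name_i] * value_i; "name=value" tokens keep their
    // name, bare values get positional PhraseModel_<k> names.
    double ComputeScore(const std::string& features,
                        const std::map<std::string, double>& weights) {
      std::istringstream in(features);
      std::string tok;
      double score = 0.0;
      int pm = 0;
      while (in >> tok) {
        std::string name;
        double val;
        const size_t eq = tok.find('=');
        if (eq != std::string::npos) {
          name = tok.substr(0, eq);
          val = atof(tok.substr(eq + 1).c_str());
        } else {
          std::ostringstream n; n << "PhraseModel_" << pm;
          name = n.str();
          val = atof(tok.c_str());
        }
        ++pm;  // like the Perl, the position counter always advances
        std::map<std::string, double>::const_iterator it = weights.find(name);
        if (it != weights.end()) score += it->second * val;
      }
      return score;
    }

    int main() {
      std::map<std::string, double> w;
      w["PhraseModel_0"] = 0.5;
      w["PhraseModel_1"] = -1.0;
      std::cout << ComputeScore("0.2 0.4", w) << "\n";  // 0.5*0.2 - 1.0*0.4 = -0.3
    }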
diff --git a/extools/extract.cc b/extools/extract.cc
deleted file mode 100644
index 49542fed..00000000
--- a/extools/extract.cc
+++ /dev/null
@@ -1,336 +0,0 @@
-#include "extract.h"
-
-#include <queue>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-#include <set>
-#include <boost/tuple/tuple_comparison.hpp>
-
-#include <boost/functional/hash.hpp>
-#include <boost/tuple/tuple.hpp>
-
-#include "sentence_pair.h"
-#include "tdict.h"
-#include "wordid.h"
-#include "array2d.h"
-
-using namespace std;
-using namespace boost;
-using std::tr1::unordered_map;
-using boost::tuple;
-
-namespace {
-  inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
-
-  inline void SkipWhitespace(const char* buf, int* ptr) {
-    while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
-  }
-}
-
-Extract::RuleObserver::~RuleObserver() {
-  cerr << "Rules extracted: " << count << endl;
-}
-
-void Extract::ExtractBasePhrases(const int max_base_phrase_size,
-                                 const AnnotatedParallelSentence& sentence,
-                                 vector<ParallelSpan>* phrases) {
-  phrases->clear();
-
-  vector<pair<int,int> > f_spans(sentence.f_len, pair<int,int>(sentence.e_len, 0));
-  vector<pair<int,int> > e_spans(sentence.e_len, pair<int,int>(sentence.f_len, 0));
-  // for each alignment point in e, precompute the minimal consistent phrases in f
-  // for each alignment point in f, precompute the minimal consistent phrases in e
-  for (int i = 0; i < sentence.f_len; ++i) {
-    for (int j = 0; j < sentence.e_len; ++j) {
-      if (sentence.aligned(i,j)) {
-        if (j < f_spans[i].first) f_spans[i].first = j;
-        f_spans[i].second = j+1;
-        if (i < e_spans[j].first) e_spans[j].first = i;
-        e_spans[j].second = i+1;
-      }
-    }
-  }
-
-  for (int i1 = 0; i1 < sentence.f_len; ++i1) {
-    if (sentence.f_aligned[i1] == 0) continue;
-    int j1 = sentence.e_len;
-    int j2 = 0;
-    const int i_limit = min(sentence.f_len, i1 + max_base_phrase_size);
-    for (int i2 = i1 + 1; i2 <= i_limit; ++i2) {
-      if (sentence.f_aligned[i2-1] == 0) continue;
-      // cerr << "F has aligned span " << i1 << " to " << i2 << endl;
-      j1 = min(j1, f_spans[i2-1].first);
-      j2 = max(j2, f_spans[i2-1].second);
-      if (j1 >= j2) continue;
-      if (j2 - j1 > max_base_phrase_size) continue;
-      int condition = 0;
-      for (int j = j1; j < j2; ++j) {
-        if (e_spans[j].first < i1) { condition = 1; break; }
-        if (e_spans[j].second > i2) { condition = 2; break; }
-      }
-      if (condition == 1) break;
-      if (condition == 2) continue;
-      // category types added later!
-      phrases->push_back(ParallelSpan(i1, i2, j1, j2));
-      // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl;
-    }
-  }
-}
-
-void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
-                                 const int max_base_phrase_size,
-                                 vector<ParallelSpan>* phrases) {
-  const int num_phrases = phrases->size();
-  map<int, map<int, map<int, map<int, bool> > > > marker;
-  for (int i = 0; i < num_phrases; ++i) {
-    const ParallelSpan& cur = (*phrases)[i];
-    marker[cur.i1][cur.i2][cur.j1][cur.j2] = true;
-  }
-  for (int i = 0; i < num_phrases; ++i) {
-    const ParallelSpan& cur = (*phrases)[i];
-    const int i1_max = cur.i1;
-    const int i2_min = cur.i2;
-    const int j1_max = cur.j1;
-    const int j2_min = cur.j2;
-    int i1_min = i1_max;
-    while (i1_min > 0 && sentence.f_aligned[i1_min-1] == 0) { --i1_min; }
-    int j1_min = j1_max;
-    while (j1_min > 0 && sentence.e_aligned[j1_min-1] == 0) { --j1_min; }
-    int i2_max = i2_min;
-    while (i2_max < sentence.f_len && sentence.f_aligned[i2_max] == 0) { ++i2_max; }
-    int j2_max = j2_min;
-    while (j2_max < sentence.e_len && sentence.e_aligned[j2_max] == 0) { ++j2_max; }
-    for (int i1 = i1_min; i1 <= i1_max; ++i1) {
-      const int ilim = min(i2_max, i1 + max_base_phrase_size);
-      for (int i2 = max(i1+1,i2_min); i2 <= ilim; ++i2) {
-        for (int j1 = j1_min; j1 <= j1_max; ++j1) {
-          const int jlim = std::min(j2_max, j1 + max_base_phrase_size);
-          for (int j2 = std::max(j1+1, j2_min); j2 <= jlim; ++j2) {
-            bool& seen = marker[i1][i2][j1][j2];
-            if (!seen)
-              phrases->push_back(ParallelSpan(i1,i2,j1,j2));
-            seen = true;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename K, typename V>
-void
-lookup_and_append(const map<K, V> &dict, const K &key, V &output)
-{
-  typename map<K, V>::const_iterator found = dict.find(key);
-  if (found != dict.end())
-    copy(found->second.begin(), found->second.end(), back_inserter(output));
-}
-
-// this uses the TARGET span (i,j) to annotate phrases, will copy
-// phrases if there is more than one annotation.
-// TODO: support source annotation
-void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
-    const map< boost::tuple<short,short,short,short>, vector<WordID> > &types,
-    vector<ParallelSpan>* phrases) {
-  const int num_unannotated_phrases = phrases->size();
-  // have to use num_unannotated_phrases since we may grow the vector
-  for (int i = 0; i < num_unannotated_phrases; ++i) {
-    ParallelSpan& phrase = (*phrases)[i];
-    vector<WordID> cats;
-    lookup_and_append(types, boost::make_tuple(phrase.i1, phrase.i2, phrase.j1, phrase.j2), cats);
-    lookup_and_append(types, boost::make_tuple((short)-1, (short)-1, phrase.j1, phrase.j2), cats);
-    lookup_and_append(types, boost::make_tuple(phrase.i1, phrase.i2, (short)-1, (short)-1), cats);
-    if (cats.empty() && default_cat != 0) {
-      cats = vector<WordID>(1, default_cat);
-    }
-    if (cats.empty()) {
-      cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-"
-           << phrase.j1 << "," << phrase.j2 << " has no type. "
-              "Did you forget --default_category?\n";
-    }
-    phrase.cat = cats[0];
-    for (int ci = 1; ci < cats.size(); ++ci) {
-      ParallelSpan new_phrase = phrase;
-      new_phrase.cat = cats[ci];
-      phrases->push_back(new_phrase);
-    }
-  }
-}
-
-// a partially complete (f-side) of a rule
-struct RuleItem {
-  vector<ParallelSpan> f;
-  int i,j,syms,vars;
-  explicit RuleItem(int pi) : i(pi), j(pi), syms(), vars() {}
-  void Extend(const WordID& fword) {
-    f.push_back(ParallelSpan(fword));
-    ++j;
-    ++syms;
-  }
-  void Extend(const ParallelSpan& subphrase) {
-    f.push_back(subphrase);
-    j += subphrase.i2 - subphrase.i1;
-    ++vars;
-    ++syms;
-  }
-  bool RuleFEndsInVariable() const {
-    if (f.size() > 0) {
-      return f.back().IsVariable();
-    } else { return false; }
-  }
-};
-
-void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
-                          const vector<ParallelSpan>& phrases,
-                          const int max_vars,
-                          const int max_syms,
-                          const bool permit_adjacent_nonterminals,
-                          const bool require_aligned_terminal,
-                          RuleObserver* observer,
-                          vector<WordID>* all_cats) {
-  const char bkoff_mrkr = '_';
-  queue<RuleItem> q;  // agenda for BFS
-  int max_len = -1;
-  unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans;
-  vector<vector<ParallelSpan> > spans_by_start(sentence.f_len);
-  set<int> starts;
-  WordID bkoff;
-  for (int i = 0; i < phrases.size(); ++i) {
-    fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]);
-    max_len = max(max_len, phrases[i].i2 - phrases[i].i1);
-    // have we already added a rule item starting at phrases[i].i1?
-    if (starts.insert(phrases[i].i1).second)
-      q.push(RuleItem(phrases[i].i1));
-    spans_by_start[phrases[i].i1].push_back(phrases[i]);
-  }
-  starts.clear();
-  vector<pair<int,int> > next_e(sentence.e_len);
-  vector<WordID> cur_rhs_f, cur_rhs_e;
-  vector<pair<short, short> > cur_terminal_align;
-  vector<int> cur_es, cur_fs;
-  while(!q.empty()) {
-    const RuleItem& rule = q.front();
-
-    // extend the partial rule
-    if (rule.j < sentence.f_len && (rule.j - rule.i) < max_len && rule.syms < max_syms) {
-      RuleItem ew = rule;
-
-      // extend with a word
-      ew.Extend(sentence.f[ew.j]);
-      q.push(ew);
-
-      // with variables
-      if (rule.vars < max_vars &&
-          !spans_by_start[rule.j].empty() &&
-          ((!rule.RuleFEndsInVariable()) || permit_adjacent_nonterminals)) {
-        const vector<ParallelSpan>& sub_phrases = spans_by_start[rule.j];
-        for (int it = 0; it < sub_phrases.size(); ++it) {
-          if (sub_phrases[it].i2 - sub_phrases[it].i1 + rule.j - rule.i <= max_len) {
-            RuleItem ev = rule;
-            ev.Extend(sub_phrases[it]);
-            q.push(ev);
-            assert(ev.j <= sentence.f_len);
-          }
-        }
-      }
-    }
-    // determine if rule is consistent
-    if (rule.syms > 0 &&
-        fspans.count(make_pair(rule.i,rule.j)) &&
-        (!rule.RuleFEndsInVariable() || rule.syms > 1)) {
-      const vector<ParallelSpan>& orig_spans = fspans[make_pair(rule.i,rule.j)];
-      for (int s = 0; s < orig_spans.size(); ++s) {
-        const ParallelSpan& orig_span = orig_spans[s];
-        const WordID lhs = orig_span.cat;
-        for (int j = orig_span.j1; j < orig_span.j2; ++j) next_e[j].first = -1;
-        int nt_index_e = 0;
-        for (int i = 0; i < rule.f.size(); ++i) {
-          const ParallelSpan& cur = rule.f[i];
-          if (cur.IsVariable())
-            next_e[cur.j1] = pair<int,int>(cur.j2, ++nt_index_e);
-        }
-        cur_rhs_f.clear();
-        cur_rhs_e.clear();
-        cur_terminal_align.clear();
-        cur_fs.clear();
-        cur_es.clear();
-
-        const int elen = orig_span.j2 - orig_span.j1;
-        vector<int> isvar(elen, 0);
-        int fbias = rule.i;
-        bool bad_rule = false;
-        bool has_aligned_terminal = false;
-        for (int i = 0; i < rule.f.size(); ++i) {
-          const ParallelSpan& cur = rule.f[i];
-          cur_rhs_f.push_back(cur.cat);
-          if (cur.cat > 0) {  // terminal
-            if (sentence.f_aligned[fbias + i]) has_aligned_terminal = true;
-            cur_fs.push_back(fbias + i);
-          } else {            // non-terminal
-            int subj1 = cur.j1 - orig_span.j1;
-            int subj2 = cur.j2 - orig_span.j1;
-            if (subj1 < 0 || subj2 > elen) { bad_rule = true; break; }
-            for (int j = subj1; j < subj2 && !bad_rule; ++j) {
-              int& isvarj = isvar[j];
-              isvarj = true;
-            }
-            if (bad_rule) break;
-            cur_fs.push_back(-1);
-            fbias += cur.i2 - cur.i1 - 1;
-          }
-        }
-        if (require_aligned_terminal && !has_aligned_terminal) bad_rule = true;
-        if (!bad_rule) {
-          for (int j = orig_span.j1; j < orig_span.j2; ++j) {
-            if (next_e[j].first < 0) {
-              cur_rhs_e.push_back(sentence.e[j]);
-              cur_es.push_back(j);
-            } else {
-              cur_rhs_e.push_back(1 - next_e[j].second);  // next_e[j].second is NT gap index
-              cur_es.push_back(-1);
-              j = next_e[j].first - 1;
-            }
-          }
-          for (short i = 0; i < cur_fs.size(); ++i)
-            if (cur_fs[i] >= 0)
-              for (short j = 0; j < cur_es.size(); ++j)
-                if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
-                  cur_terminal_align.push_back(make_pair(i,j));
-          //observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
-
-          if(!all_cats->empty()) {
-            //produce the backoff grammar if the category wordIDs are available
-            for (int i = 0; i < cur_rhs_f.size(); ++i) {
-              if(cur_rhs_f[i] < 0) {
-                //cerr << cur_rhs_f[i] << ": (cats,f) |" << TD::Convert(-cur_rhs_f[i]) << endl;
-                string nonterm = TD::Convert(-cur_rhs_f[i]);
-                nonterm+=bkoff_mrkr;
-                bkoff = -TD::Convert(nonterm);
-                cur_rhs_f[i]=bkoff;
-                /*vector<WordID> rhs_f_bkoff;
-                vector<WordID> rhs_e_bkoff;
-                vector<pair<short,short> > bkoff_align;
-                bkoff_align.clear();
-                bkoff_align.push_back(make_pair(0,0));
-
-                for (int cat = 0; cat < all_cats->size(); ++cat) {
-                  rhs_f_bkoff.clear();
-                  rhs_e_bkoff.clear();
-                  rhs_f_bkoff.push_back(-(*all_cats)[cat]);
-                  rhs_e_bkoff.push_back(0);
-                  observer->CountRule(bkoff,rhs_f_bkoff,rhs_e_bkoff,bkoff_align);
-
-                }*/
-              }
-            }
-
-          }
-          observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
-        }
-      }
-    }
-    q.pop();
-  }
-}
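The condition-1/condition-2 test in ExtractBasePhrases is the standard phrase-pair consistency check: a span pair is kept only when no alignment link crosses the box it defines. A compact restatement of that criterion over a raw alignment matrix (a simplified sketch with an illustrative name, Consistent; the deleted code reaches the same answer incrementally via the precomputed f_spans/e_spans):

    #include <iostream>
    #include <vector>

    // True iff ([i1,i2) x [j1,j2)) is consistent: every alignment link that
    // touches the source span stays inside the target span, and vice versa,
    // and at least one link falls inside the box.
    bool Consistent(const std::vector<std::vector<bool> >& a,  // a[i][j]: f_i-e_j
                    int i1, int i2, int j1, int j2) {
      bool any = false;
      for (size_t i = 0; i < a.size(); ++i)
        for (size_t j = 0; j < a[i].size(); ++j) {
          if (!a[i][j]) continue;
          const bool in_f = (int)i >= i1 && (int)i < i2;
          const bool in_e = (int)j >= j1 && (int)j < j2;
          if (in_f != in_e) return false;  // link crosses the box boundary
          if (in_f && in_e) any = true;
        }
      return any;
    }

    int main() {
      std::vector<std::vector<bool> > a(2, std::vector<bool>(2, false));
      a[0][0] = a[1][1] = true;  // diagonal alignment
      std::cout << Consistent(a, 0, 1, 0, 1) << " "   // 1: f_0/e_0 is a box
                << Consistent(a, 0, 1, 0, 2) << "\n"; // 0: e_1 links outside
    }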
diff --git a/extools/extract.h b/extools/extract.h
deleted file mode 100644
index e9ea5e65..00000000
--- a/extools/extract.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef _EXTRACT_H_
-#define _EXTRACT_H_
-
-#include <iostream>
-#include <utility>
-#include <vector>
-#include <boost/tuple/tuple.hpp>
-#include "array2d.h"
-#include "wordid.h"
-#include "sparse_vector.h"
-
-struct AnnotatedParallelSentence;
-
-// usually represents a consistent phrase, which may
-// be annotated with a type (cat)
-// inside the rule extractor, this class is also used to represent a word
-// in a partial rule.
-struct ParallelSpan {
-  // i1 = i of f side
-  // i2 = j of f side
-  // j1 = i of e side
-  // j2 = j of e side
-  short i1,i2,j1,j2;
-  // cat is set by AnnotatePhrasesWithCategoryTypes, otherwise it's 0
-  WordID cat;  // category type of span (also overloaded by RuleItem class
-               // to be a word ID)
-  ParallelSpan() : i1(-1), i2(-1), j1(-1), j2(-1), cat() {}
-  // used by Rule class to represent a terminal symbol:
-  explicit ParallelSpan(WordID w) : i1(-1), i2(-1), j1(-1), j2(-1), cat(w) {}
-  ParallelSpan(int pi1, int pi2, int pj1, int pj2) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat() {}
-  ParallelSpan(int pi1, int pi2, int pj1, int pj2, WordID c) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat(c) {}
-
-  // ParallelSpan is used in the Rule class where it is
-  // overloaded to also represent terminal symbols
-  inline bool IsVariable() const { return i1 != -1; }
-};
-
-// rule extraction logic lives here. this has no data, it's just got
-// static member functions.
-struct Extract {
-  // RuleObserver's CountRule is called for each rule extracted
-  // implement CountRuleImpl to do things like count the rules,
-  // write them to a file, etc.
-  struct RuleObserver {
-    RuleObserver() : count() {}
-    virtual void CountRule(WordID lhs,
-                           const std::vector<WordID>& rhs_f,
-                           const std::vector<WordID>& rhs_e,
-                           const std::vector<std::pair<short, short> >& fe_terminal_alignments) {
-      ++count;
-      CountRuleImpl(lhs, rhs_f, rhs_e, fe_terminal_alignments);
-    }
-    virtual ~RuleObserver();
-
-   protected:
-    virtual void CountRuleImpl(WordID lhs,
-                               const std::vector<WordID>& rhs_f,
-                               const std::vector<WordID>& rhs_e,
-                               const std::vector<std::pair<short, short> >& fe_terminal_alignments) = 0;
-   private:
-    int count;
-  };
-
-  // given a set of "tight" phrases and the aligned sentence they were
-  // extracted from, "loosen" them
-  static void LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
-                                 const int max_base_phrase_size,
-                                 std::vector<ParallelSpan>* phrases);
-
-  // extract all consistent phrase pairs, up to size max_base_phrase_size
-  // (on the source side). these phrases will be "tight".
-  static void ExtractBasePhrases(const int max_base_phrase_size,
-                                 const AnnotatedParallelSentence& sentence,
-                                 std::vector<ParallelSpan>* phrases);
-
-  // this uses the TARGET span (i,j) to annotate phrases, will copy
-  // phrases if there is more than one annotation.
-  static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
-      const std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > &types,
-      std::vector<ParallelSpan>* phrases);
-
-  // use the Chiang (2007) extraction logic to extract consistent subphrases
-  // observer->CountRule is called once for each rule extracted
-  static void ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
-                          const std::vector<ParallelSpan>& phrases,
-                          const int max_vars,
-                          const int max_syms,
-                          const bool permit_adjacent_nonterminals,
-                          const bool require_aligned_terminal,
-                          RuleObserver* observer,
-                          std::vector<WordID>* all_cats);
-};
-
-#endif
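The RuleObserver hook in this header follows the template-method pattern: the public CountRule does shared bookkeeping, then defers to the pure-virtual CountRuleImpl that subclasses supply. A minimal self-contained mirror of the hook (stub int ids in place of WordID; PrintingObserver is an illustrative name):

    #include <iostream>
    #include <vector>

    // Shared counting lives in the base; per-rule behavior is supplied by
    // overriding the protected Impl hook, as in Extract::RuleObserver.
    struct RuleObserver {
      RuleObserver() : count(0) {}
      virtual ~RuleObserver() { std::cerr << "Rules extracted: " << count << "\n"; }
      void CountRule(int lhs, const std::vector<int>& rhs_f,
                     const std::vector<int>& rhs_e) {
        ++count;
        CountRuleImpl(lhs, rhs_f, rhs_e);
      }
     protected:
      virtual void CountRuleImpl(int lhs, const std::vector<int>& rhs_f,
                                 const std::vector<int>& rhs_e) = 0;
     private:
      int count;
    };

    struct PrintingObserver : public RuleObserver {
     protected:
      virtual void CountRuleImpl(int lhs, const std::vector<int>& f,
                                 const std::vector<int>& e) {
        std::cout << "[" << lhs << "] ||| " << f.size() << " ||| " << e.size() << "\n";
      }
    };

    int main() {
      PrintingObserver o;
      o.CountRule(1, std::vector<int>(2, 7), std::vector<int>(3, 9));
    }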
Set to 0 for no limit.") - ("silent", "Write nothing to stderr except errors") - ("phrase_context,C", "Write base phrase contexts") - ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts") - ("max_base_phrase_size,L", po::value<int>()->default_value(10), "Maximum starting phrase size") - ("max_syms,l", po::value<int>()->default_value(5), "Maximum number of symbols in final phrase size") - ("max_vars,v", po::value<int>()->default_value(2), "Maximum number of nonterminal variables in final phrase size") - ("permit_adjacent_nonterminals,A", "Permit adjacent nonterminals in source side of rules") - ("no_required_aligned_terminal,n", "Do not require an aligned terminal") - ("topics,t", po::value<int>()->default_value(50), "Number of categories assigned during clustering") - ("backoff,g","Produce a backoff grammar") - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help") || conf->count("input") == 0) { - cerr << "\nUsage: extractor [-options]\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -// TODO how to handle alignment information? -void WriteBasePhrases(const AnnotatedParallelSentence& sentence, - const vector<ParallelSpan>& phrases) { - vector<WordID> e,f; - for (int it = 0; it < phrases.size(); ++it) { - const ParallelSpan& phrase = phrases[it]; - e.clear(); - f.clear(); - for (int i = phrase.i1; i < phrase.i2; ++i) - f.push_back(sentence.f[i]); - for (int j = phrase.j1; j < phrase.j2; ++j) - e.push_back(sentence.e[j]); - cout << TD::GetString(f) << " ||| " << TD::GetString(e) << endl; - } -} - -void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence, - const vector<ParallelSpan>& phrases) { - cout << TD::GetString(sentence.f) << " ||| " << TD::GetString(sentence.e) << " |||"; - for (int it = 0; it < phrases.size(); ++it) { - const ParallelSpan& phrase = phrases[it]; - cout << " " << phrase.i1 << "-" << phrase.i2 - << "-" << phrase.j1 << "-" << phrase.j2; - } - cout << endl; -} - -struct CountCombiner { - CountCombiner(const size_t& csize) : combiner_size(csize) { - if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; } - } - ~CountCombiner() { - if (!cache.empty()) WriteAndClearCache(); - } - - void Count(const vector<WordID>& key, - const vector<WordID>& val, - const int count_type, - const vector<pair<short,short> >& aligns) { - if (combiner_size != 1) { - RuleStatistics& v = cache[key][val]; - float newcount = v.counts.add_value(count_type, 1.0f); - // hack for adding alignments - if (newcount < 7.0f && aligns.size() > v.aligns.size()) - v.aligns = aligns; - if (combiner_size > 1 && cache.size() > combiner_size) - WriteAndClearCache(); - } else { - cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| "; - cout << RuleStatistics(count_type, 1.0f, aligns) << endl; - } - } - - private: - void WriteAndClearCache() { - for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin(); - it != cache.end(); ++it) { - cout << TD::GetString(it->first) << '\t'; - const Vec2PhraseCount& vals = it->second; - bool needdiv = false; - for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) { - if (needdiv) cout << " ||| "; else needdiv = true; - cout << 
TD::GetString(vi->first) << " ||| " << vi->second; - } - cout << endl; - } - cache.clear(); - } - - const size_t combiner_size; - typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > Vec2PhraseCount; - unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache; -}; - -// TODO optional source context -// output <k, v> : k = phrase "document" v = context "term" -void WritePhraseContexts(const AnnotatedParallelSentence& sentence, - const vector<ParallelSpan>& phrases, - const int ctx_size, - bool phrase_s, bool phrase_t, - bool context_s, bool context_t, - CountCombiner* o) { - vector<WordID> context, context_f; - if (context_t) - { - context.resize(ctx_size * 2 + 1); - context[ctx_size] = kGAP; - } - if (context_s) - { - context_f.resize(ctx_size * 2 + 1); - context_f[ctx_size] = kGAP; - } - vector<WordID> key, key_f; - if (phrase_t) key.reserve(100); - if (phrase_s) key_f.reserve(100); - - for (int it = 0; it < phrases.size(); ++it) { - const ParallelSpan& phrase = phrases[it]; - - key.clear(); - for (int j = phrase.j1; j < phrase.j2 && phrase_t; ++j) - key.push_back(sentence.e[j]); - - if (context_t) - { - context.resize(ctx_size * 2 + 1); - for (int i = 0; i < ctx_size && context_t; ++i) { - int epos = phrase.j1 - 1 - i; - const WordID left_ctx = (epos < 0) ? kBOS : sentence.e[epos]; - context[ctx_size - i - 1] = left_ctx; - epos = phrase.j2 + i; - const WordID right_ctx = (epos >= sentence.e_len) ? kEOS : sentence.e[epos]; - context[ctx_size + i + 1] = right_ctx; - } - } - else - context.clear(); - - if (phrase_s) - { - key_f.clear(); - for (int i = phrase.i1; i < phrase.i2; ++i) - key_f.push_back(sentence.f[i]); - if (phrase_t) key.push_back(kSPLIT); - copy(key_f.begin(), key_f.end(), back_inserter(key)); - } - - if (context_s) - { - for (int i = 0; i < ctx_size; ++i) { - int fpos = phrase.i1 - 1 - i; - const WordID left_ctx = (fpos < 0) ? kBOS : sentence.f[fpos]; - context_f[ctx_size - i - 1] = left_ctx; - fpos = phrase.i2 + i; - const WordID right_ctx = (fpos >= sentence.f_len) ? 
kEOS : sentence.f[fpos]; - context_f[ctx_size + i + 1] = right_ctx; - } - if (context_t) context.push_back(kSPLIT); - copy(context_f.begin(), context_f.end(), back_inserter(context)); - } - - o->Count(key, context, kCOUNT, vector<pair<short,short> >()); - } -} - -struct SimpleRuleWriter : public Extract::RuleObserver { - protected: - virtual void CountRuleImpl(WordID lhs, - const vector<WordID>& rhs_f, - const vector<WordID>& rhs_e, - const vector<pair<short,short> >& fe_terminal_alignments) { - cout << "[" << TD::Convert(-lhs) << "] |||"; - for (int i = 0; i < rhs_f.size(); ++i) { - if (rhs_f[i] < 0) cout << " [" << TD::Convert(-rhs_f[i]) << ']'; - else cout << ' ' << TD::Convert(rhs_f[i]); - } - cout << " |||"; - for (int i = 0; i < rhs_e.size(); ++i) { - if (rhs_e[i] <= 0) cout << " [" << (1-rhs_e[i]) << ']'; - else cout << ' ' << TD::Convert(rhs_e[i]); - } - cout << " |||"; - for (int i = 0; i < fe_terminal_alignments.size(); ++i) { - cout << ' ' << fe_terminal_alignments[i].first << '-' << fe_terminal_alignments[i].second; - } - cout << endl; - } -}; - -struct HadoopStreamingRuleObserver : public Extract::RuleObserver { - HadoopStreamingRuleObserver(CountCombiner* cc, bool bidir_flag) : - bidir(bidir_flag), - kF(TD::Convert("F")), - kE(TD::Convert("E")), - kDIVIDER(TD::Convert("|||")), - kLB("["), kRB("]"), - combiner(*cc), - kEMPTY(), - kCFE(FD::Convert("CFE")) { - for (int i=1; i < 50; ++i) - index2sym[1-i] = TD::Convert(kLB + boost::lexical_cast<string>(i) + kRB); - fmajor_key.resize(10, kF); - emajor_key.resize(10, kE); - if (bidir) - fmajor_key[2] = emajor_key[2] = kDIVIDER; - else - fmajor_key[1] = kDIVIDER; - } - - protected: - virtual void CountRuleImpl(WordID lhs, - const vector<WordID>& rhs_f, - const vector<WordID>& rhs_e, - const vector<pair<short,short> >& fe_terminal_alignments) { - if (bidir) { // extract rules in "both directions" E->F and F->E - fmajor_key.resize(3 + rhs_f.size()); - emajor_key.resize(3 + rhs_e.size()); - fmajor_val.resize(rhs_e.size()); - emajor_val.resize(rhs_f.size()); - emajor_key[1] = fmajor_key[1] = MapSym(lhs); - int nt = 1; - for (int i = 0; i < rhs_f.size(); ++i) { - const WordID id = rhs_f[i]; - if (id < 0) { - fmajor_key[3 + i] = MapSym(id, nt); - emajor_val[i] = MapSym(id, nt); - ++nt; - } else { - fmajor_key[3 + i] = id; - emajor_val[i] = id; - } - } - for (int i = 0; i < rhs_e.size(); ++i) { - WordID id = rhs_e[i]; - if (id <= 0) { - fmajor_val[i] = index2sym[id]; - emajor_key[3 + i] = index2sym[id]; - } else { - fmajor_val[i] = id; - emajor_key[3 + i] = id; - } - } - combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments); - combiner.Count(emajor_key, emajor_val, kCFE, kEMPTY); - } else { // extract rules only in F->E - fmajor_key.resize(2 + rhs_f.size()); - fmajor_val.resize(rhs_e.size()); - fmajor_key[0] = MapSym(lhs); - int nt = 1; - for (int i = 0; i < rhs_f.size(); ++i) { - const WordID id = rhs_f[i]; - if (id < 0) - fmajor_key[2 + i] = MapSym(id, nt++); - else - fmajor_key[2 + i] = id; - } - for (int i = 0; i < rhs_e.size(); ++i) { - const WordID id = rhs_e[i]; - if (id <= 0) - fmajor_val[i] = index2sym[id]; - else - fmajor_val[i] = id; - } - combiner.Count(fmajor_key, fmajor_val, kCFE, fe_terminal_alignments); - } - } - - private: - WordID MapSym(WordID sym, int ind = 0) { - WordID& r = cat2ind2sym[sym][ind]; - if (!r) { - if (ind == 0) - r = TD::Convert(kLB + TD::Convert(-sym) + kRB); - else - r = TD::Convert(kLB + TD::Convert(-sym) + "," + boost::lexical_cast<string>(ind) + kRB); - } - return r; - } - - const bool 
bidir; - const WordID kF, kE, kDIVIDER; - const string kLB, kRB; - CountCombiner& combiner; - const vector<pair<short,short> > kEMPTY; - const int kCFE; - map<WordID, map<int, WordID> > cat2ind2sym; - map<int, WordID> index2sym; - vector<WordID> emajor_key, emajor_val, fmajor_key, fmajor_val; -}; - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - kBOS = TD::Convert("<s>"); - kEOS = TD::Convert("</s>"); - kDIVIDER = TD::Convert("|||"); - kGAP = TD::Convert("<PHRASE>"); - kCOUNT = FD::Convert("C"); - kSPLIT = TD::Convert("<SPLIT>"); - - WordID default_cat = 0; // 0 means no default- extraction will - // fail if a phrase is extracted without a - // category - const bool backoff = (conf.count("backoff") ? true : false); - if (conf.count("default_category")) { - string sdefault_cat = conf["default_category"].as<string>(); - default_cat = -TD::Convert(sdefault_cat); - cerr << "Default category: " << sdefault_cat << endl; - } - ReadFile rf(conf["input"].as<string>()); - istream& in = *rf.stream(); - - char buf[MAX_LINE_LENGTH]; - AnnotatedParallelSentence sentence; - vector<ParallelSpan> phrases; - vector<WordID> all_cats; - int max_base_phrase_size = conf["max_base_phrase_size"].as<int>(); - bool write_phrase_contexts = conf.count("phrase_context") > 0; - const bool write_base_phrases = conf.count("base_phrase") > 0; - const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0; - const bool loose_phrases = conf.count("loose") > 0; - const bool silent = conf.count("silent") > 0; - const int max_syms = conf["max_syms"].as<int>(); - const int max_vars = conf["max_vars"].as<int>(); - const int ctx_size = conf["phrase_context_size"].as<int>(); - const int num_categories = conf["topics"].as<int>(); - const bool permit_adjacent_nonterminals = conf.count("permit_adjacent_nonterminals") > 0; - const bool require_aligned_terminal = conf.count("no_required_aligned_terminal") == 0; - const string ps = conf["phrase_language"].as<string>(); - const bool phrase_s = ps == "source" || ps == "both"; - const bool phrase_t = ps == "target" || ps == "both"; - const string cs = conf["context_language"].as<string>(); - const bool context_s = cs == "source" || cs == "both"; - const bool context_t = cs == "target" || cs == "both"; - const bool x_cdyer_pos = conf.count("x_cdyer_pos"); - int line = 0; - CountCombiner cc(conf["combiner_size"].as<size_t>()); - HadoopStreamingRuleObserver o(&cc, - conf.count("bidir") > 0); - - assert(phrase_s || phrase_t); - assert(context_s || context_t); - - if(backoff) { - for (int i=0;i < num_categories;++i) - all_cats.push_back(TD::Convert("X"+boost::lexical_cast<string>(i))); - } - - //SimpleRuleWriter o; - while(in) { - ++line; - in.getline(buf, MAX_LINE_LENGTH); - if (buf[0] == 0) continue; - //cerr << "line #" << line << " = " << buf << endl; - if (!silent) { - if (line % 200 == 0) cerr << '.'; - if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; - } - sentence.ParseInputLine(buf); - if (x_cdyer_pos) { - sentence.e = sentence.f; - sentence.AllocateForAlignment(); - for (int i = 0; i < sentence.e.size(); ++i) sentence.Align(i,i); - max_base_phrase_size = 1; - write_phrase_contexts = true; - } - phrases.clear(); - Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases); - if (loose_phrases) - Extract::LoosenPhraseBounds(sentence, max_base_phrase_size, &phrases); - if (phrases.empty()) { - cerr << "WARNING no phrases extracted line: " << line << endl; - continue; - } - if (write_phrase_contexts) 
{ - WritePhraseContexts(sentence, phrases, ctx_size, phrase_s, phrase_t, context_s, context_t, &cc); - continue; - } - if (write_base_phrases) { - WriteBasePhrases(sentence, phrases); - continue; - } - if (write_base_phrase_spans) { - WriteBasePhraseSpans(sentence, phrases); - continue; - } - Extract::AnnotatePhrasesWithCategoryTypes(default_cat, sentence.span_types, &phrases); - Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o, &all_cats); - } - if (!silent) cerr << endl; - return 0; -} diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc deleted file mode 100644 index 049ebc85..00000000 --- a/extools/extractor_monolingual.cc +++ /dev/null @@ -1,256 +0,0 @@ -#include <iostream> -#include <vector> -#include <utility> -#include <tr1/unordered_map> - -#include <boost/functional/hash.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <boost/lexical_cast.hpp> - -#include "tdict.h" -#include "fdict.h" -#include "wordid.h" -#include "filelib.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static const size_t MAX_LINE_LENGTH = 100000; -WordID kBOS, kEOS, kDIVIDER, kGAP; -int kCOUNT; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i", po::value<string>()->default_value("-"), "Input file") - ("phrases,p", po::value<string>(), "File contatining phrases of interest") - ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts") - ("combiner_size,c", po::value<size_t>()->default_value(30000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. 
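CountCombiner in the file above is a bounded in-memory combiner in the MapReduce sense: counts accumulate in a hash map, and once the map holds more than combiner_size distinct keys it is flushed to stdout for a downstream reducer to re-aggregate. A stripped-down sketch of that flush-on-threshold policy (string keys instead of WordID vectors; Combiner is an illustrative name):

    #include <iostream>
    #include <map>
    #include <string>

    // Accumulate counts; spill everything once the table exceeds `limit`
    // distinct keys. A reducer downstream merges the partial sums.
    struct Combiner {
      explicit Combiner(size_t limit) : limit_(limit) {}
      ~Combiner() { Flush(); }
      void Count(const std::string& key, int c) {
        cache_[key] += c;
        if (limit_ > 1 && cache_.size() > limit_) Flush();
      }
      void Flush() {
        for (std::map<std::string, int>::const_iterator it = cache_.begin();
             it != cache_.end(); ++it)
          std::cout << it->first << "\tC=" << it->second << "\n";
        cache_.clear();
      }
     private:
      const size_t limit_;
      std::map<std::string, int> cache_;
    };

    int main() {
      Combiner c(2);
      c.Count("der ||| the", 1);
      c.Count("der ||| the", 1);    // combined in memory: emitted once, C=2
      c.Count("haus ||| house", 1);
      c.Count("katze ||| cat", 1);  // third distinct key exceeds limit -> spill
    }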
Set to 0 for no limit.") - ("prune", po::value<size_t>()->default_value(0), "Prune items with count less than threshold; applies each time the cache is dumped.") - ("silent", "Write nothing to stderr except errors") - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help") || conf->count("input") != 1 || conf->count("phrases") != 1) { - cerr << "\nUsage: extractor_monolingual [-options]\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct TrieNode -{ - TrieNode(int l) : finish(false), length(l) {}; - ~TrieNode() - { - for (unordered_map<int, TrieNode*>::iterator - it = next.begin(); it != next.end(); ++it) - delete it->second; - next.clear(); - } - - TrieNode *follow(int token) - { - unordered_map<int, TrieNode*>::iterator - found = next.find(token); - if (found != next.end()) - return found->second; - else - return 0; - } - - void insert(const vector<int> &tokens) - { - insert(tokens.begin(), tokens.end()); - } - - void insert(vector<int>::const_iterator begin, vector<int>::const_iterator end) - { - if (begin == end) - finish = true; - else - { - int token = *begin; - unordered_map<int, TrieNode*>::iterator - nit = next.find(token); - if (nit == next.end()) - nit = next.insert(make_pair(token, new TrieNode(length+1))).first; - ++begin; - nit->second->insert(begin, end); - } - } - - bool finish; - int length; - unordered_map<int, TrieNode*> next; -}; - -struct CountCombiner { - CountCombiner(const size_t& csize, const size_t& prune) : combiner_size(csize), threshold(prune) { - if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; } - } - ~CountCombiner() { - if (!cache.empty()) WriteAndClearCache(); - } - - void Count(const vector<WordID>& key, - const vector<WordID>& val, - const int count_type) - { - if (combiner_size != 1) { - cache[key][val] += count_type; - if (combiner_size > 1 && cache.size() > combiner_size) - WriteAndClearCache(); - } else { - cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| C=" << count_type << "\n"; - } - } - - private: - void WriteAndClearCache() { - for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin(); - it != cache.end(); ++it) { - const Vec2PhraseCount& vals = it->second; - bool first = true; - for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) - { - if (threshold > 1 && combiner_size != 1 && vi->second < threshold) - continue; - - if (!first) cout << " ||| "; - else - { - cout << TD::GetString(it->first) << '\t'; - first = false; - } - cout << TD::GetString(vi->first) << " ||| C=" << vi->second; - } - if (!first) - cout << '\n'; - } - cout << flush; - cache.clear(); - } - - const size_t combiner_size, threshold; - typedef unordered_map<vector<WordID>, int, boost::hash<vector<WordID> > > Vec2PhraseCount; - unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache; -}; - -void WriteContext(const vector<int>& sentence, int start, int end, int ctx_size, CountCombiner &combiner) -{ - vector<WordID> phrase, context; - for (int i = start; i < end; ++i) - phrase.push_back(sentence[i]); - - for (int i = ctx_size; i > 0; --i) - context.push_back(sentence[start-i]); - context.push_back(kGAP); - for (int i = 0; i < ctx_size; ++i) - context.push_back(sentence[end+i]); - - 
combiner.Count(phrase, context, 1); -} - -inline bool IsWhitespace(char c) { - return c == ' ' || c == '\t'; -} - -inline void SkipWhitespace(const char* buf, int* ptr) { - while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } -} - -vector<int> ReadSentence(const char *buf, int padding) -{ - int ptr = 0; - SkipWhitespace(buf, &ptr); - int start = ptr; - vector<int> sentence; - for (int i = 0; i < padding; ++i) - sentence.push_back(kBOS); - - while (char c = buf[ptr]) - { - if (!IsWhitespace(c)) - ++ptr; - else { - sentence.push_back(TD::Convert(string(buf, start, ptr-start))); - SkipWhitespace(buf, &ptr); - start = ptr; - } - } - for (int i = 0; i < padding; ++i) - sentence.push_back(kEOS); - - return sentence; -} - -int main(int argc, char** argv) -{ - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - kBOS = TD::Convert("<s>"); - kEOS = TD::Convert("</s>"); - kDIVIDER = TD::Convert("|||"); - kGAP = TD::Convert("<PHRASE>"); - kCOUNT = FD::Convert("C"); - - bool silent = conf.count("silent") > 0; - const int ctx_size = conf["phrase_context_size"].as<int>(); - CountCombiner cc(conf["combiner_size"].as<size_t>(), conf["prune"].as<size_t>()); - - char buf[MAX_LINE_LENGTH]; - TrieNode phrase_trie(0); - ReadFile rpf(conf["phrases"].as<string>()); - istream& pin = *rpf.stream(); - while (pin) { - pin.getline(buf, MAX_LINE_LENGTH); - phrase_trie.insert(ReadSentence(buf, 0)); - } - - ReadFile rif(conf["input"].as<string>()); - istream &iin = *rif.stream(); - int line = 0; - while (iin) { - ++line; - iin.getline(buf, MAX_LINE_LENGTH); - //cout << "line: " << line << " '" << buf << "'" << endl; - if (buf[0] == 0) continue; - if (!silent) { - if (line % 200 == 0) cerr << '.'; - if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; - } - - vector<int> sentence = ReadSentence(buf, ctx_size); - //cout << "sentence: " << TD::GetString(sentence) << endl; - vector<TrieNode*> tries; - for (int i = ctx_size; i < (int)sentence.size() - ctx_size; ++i) - { - //cout << "i: " << i << " token: " << TD::Convert(sentence[i]) << " tries: " << tries.size() << endl; - vector<TrieNode*> tries_prime; - tries.push_back(&phrase_trie); - for (vector<TrieNode*>::iterator tit = tries.begin(); tit != tries.end(); ++tit) - { - TrieNode* next = (*tit)->follow(sentence[i]); - if (next != 0) - { - //cout << "\tfollowed edge: " << next->finish << endl; - if (next->finish) - WriteContext(sentence, i + 1 - next->length, i + 1, ctx_size, cc); - tries_prime.push_back(next); - } - } - swap(tries, tries_prime); - } - //cout << "/sentence" << endl; - } - if (!silent) cerr << endl; - return 0; -} diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc deleted file mode 100644 index 78175202..00000000 --- a/extools/featurize_grammar.cc +++ /dev/null @@ -1,716 +0,0 @@ -/* - * Featurize a grammar in striped format - */ -#include <iostream> -#include <sstream> -#include <string> -#include <map> -#include <vector> -#include <utility> -#include <cstdlib> -#include <tr1/unordered_map> - -#include "lex_trans_tbl.h" -#include "sparse_vector.h" -#include "sentence_pair.h" -#include "extract.h" -#include "fdict.h" -#include "tdict.h" -#include "filelib.h" -#include "striped_grammar.h" - -#include <boost/tuple/tuple.hpp> -#include <boost/shared_ptr.hpp> -#include <boost/functional/hash.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -using namespace std; -using namespace std::tr1; -using boost::shared_ptr; -namespace po = boost::program_options; - -static 
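The main loop above advances every live trie cursor by one token per position, so all phrases ending at position i are reported in a single pass over the sentence. A self-contained sketch of the same multi-cursor trie scan over strings (simplified: no context windows, std::map instead of unordered_map; Trie is an illustrative name):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Token trie; `depth` lets a match report how long the phrase is.
    struct Trie {
      bool finish;
      int depth;
      std::map<std::string, Trie> next;
      Trie() : finish(false), depth(0) {}
      void Insert(const std::vector<std::string>& toks) {
        Trie* n = this;
        for (size_t k = 0; k < toks.size(); ++k) {
          Trie& child = n->next[toks[k]];
          child.depth = n->depth + 1;
          n = &child;
        }
        n->finish = true;
      }
    };

    int main() {
      Trie trie;
      trie.Insert(std::vector<std::string>(1, "blue"));
      std::vector<std::string> p2; p2.push_back("blue"); p2.push_back("house");
      trie.Insert(p2);

      const char* words[] = {"the", "blue", "house"};
      std::vector<Trie*> live;
      for (int i = 0; i < 3; ++i) {
        live.push_back(&trie);  // a phrase may start at every position
        std::vector<Trie*> survivors;
        for (size_t c = 0; c < live.size(); ++c) {
          std::map<std::string, Trie>::iterator it = live[c]->next.find(words[i]);
          if (it == live[c]->next.end()) continue;
          if (it->second.finish)  // phrase spans [i+1-depth, i+1)
            std::cout << "match ending at " << i << " len " << it->second.depth << "\n";
          survivors.push_back(&it->second);
        }
        live.swap(survivors);
      }
    }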
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc
deleted file mode 100644
index 78175202..00000000
--- a/extools/featurize_grammar.cc
+++ /dev/null
@@ -1,716 +0,0 @@
-/*
- * Featurize a grammar in striped format
- */
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <map>
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <tr1/unordered_map>
-
-#include "lex_trans_tbl.h"
-#include "sparse_vector.h"
-#include "sentence_pair.h"
-#include "extract.h"
-#include "fdict.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "striped_grammar.h"
-
-#include <boost/tuple/tuple.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-using namespace std;
-using namespace std::tr1;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-static string aligned_corpus;
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-// Data structures for indexing and counting rules
-//typedef boost::tuple< WordID, vector<WordID>, vector<WordID> > RuleTuple;
-struct RuleTuple {
-  RuleTuple(const WordID& lhs, const vector<WordID>& s, const vector<WordID>& t)
-    : m_lhs(lhs), m_source(s), m_target(t) {
-    hash_value();
-    m_dirty = false;
-  }
-
-  size_t hash_value() const {
-//    if (m_dirty) {
-      size_t hash = 0;
-      boost::hash_combine(hash, m_lhs);
-      boost::hash_combine(hash, m_source);
-      boost::hash_combine(hash, m_target);
-//    }
-//    m_dirty = false;
-    return hash;
-  }
-
-  bool operator==(RuleTuple const& b) const
-  { return m_lhs == b.m_lhs && m_source == b.m_source && m_target == b.m_target; }
-
-  WordID& lhs() { m_dirty=true; return m_lhs; }
-  vector<WordID>& source() { m_dirty=true; return m_source; }
-  vector<WordID>& target() { m_dirty=true; return m_target; }
-  const WordID& lhs() const { return m_lhs; }
-  const vector<WordID>& source() const { return m_source; }
-  const vector<WordID>& target() const { return m_target; }
-
-//  mutable size_t m_hash;
-private:
-  WordID m_lhs;
-  vector<WordID> m_source, m_target;
-  mutable bool m_dirty;
-};
-std::size_t hash_value(RuleTuple const& b) { return b.hash_value(); }
-bool operator<(RuleTuple const& l, RuleTuple const& r) {
-  if (l.lhs() < r.lhs()) return true;
-  else if (l.lhs() == r.lhs()) {
-    if (l.source() < r.source()) return true;
-    else if (l.source() == r.source()) {
-      if (l.target() < r.target()) return true;
-    }
-  }
-  return false;
-}
-
-ostream& operator<<(ostream& o, RuleTuple const& r) {
-  o << "(" << r.lhs() << "-->" << "<";
-  for (vector<WordID>::const_iterator it=r.source().begin(); it!=r.source().end(); ++it)
-    o << TD::Convert(*it) << " ";
-  o << "|||";
-  for (vector<WordID>::const_iterator it=r.target().begin(); it!=r.target().end(); ++it)
-    o << " " << TD::Convert(*it);
-  o << ">)";
-  return o;
-}
-
-template <typename Key>
-struct FreqCount {
-  //typedef unordered_map<Key, int, boost::hash<Key> > Counts;
-  typedef map<Key, int> Counts;
-  Counts counts;
-
-  int inc(const Key& r, int c=1) {
-    pair<typename Counts::iterator,bool> itb
-      = counts.insert(make_pair(r,c));
-    if (!itb.second)
-      itb.first->second += c;
-    return itb.first->second;
-  }
-
-  int inc_if_exists(const Key& r, int c=1) {
-    typename Counts::iterator it = counts.find(r);
-    if (it != counts.end())
-      it->second += c;
-    return it->second;
-  }
-
-  int count(const Key& r) const {
-    typename Counts::const_iterator it = counts.find(r);
-    if (it == counts.end()) return 0;
-    return it->second;
-  }
-
-  int operator()(const Key& r) const { return count(r); }
-};
-typedef FreqCount<RuleTuple> RuleFreqCount;
-
-class FeatureExtractor;
-class FERegistry;
-struct FEFactoryBase {
-  virtual ~FEFactoryBase() {}
-  virtual boost::shared_ptr<FeatureExtractor> Create() const = 0;
-};
-
-class FERegistry {
-  friend class FEFactoryBase;
- public:
-  FERegistry() {}
-  boost::shared_ptr<FeatureExtractor> Create(const std::string& ffname) const {
-    map<string, boost::shared_ptr<FEFactoryBase> >::const_iterator it = reg_.find(ffname);
-    boost::shared_ptr<FeatureExtractor> res;
-    if (it == reg_.end()) {
-      cerr << "I don't know how to create feature " << ffname << endl;
-    } else {
-      res = it->second->Create();
-    }
-    return res;
-  }
-  void DisplayList(ostream* out) const {
-    bool first = true;
-    for (map<string, boost::shared_ptr<FEFactoryBase> >::const_iterator it = reg_.begin();
-         it != reg_.end(); ++it) {
-      if (first) {first=false;} else {*out << ' ';}
-      *out << it->first;
-    }
-  }
-
-  void Register(const std::string& ffname, FEFactoryBase* factory) {
-    if (reg_.find(ffname) != reg_.end()) {
-      cerr << "Duplicate registration of FeatureExtractor with name " << ffname << "!\n";
-      exit(1);
-    }
-    reg_[ffname].reset(factory);
-  }
-
- private:
-  std::map<std::string, boost::shared_ptr<FEFactoryBase> > reg_;
-};
-
-template<class FE>
-class FEFactory : public FEFactoryBase {
-  boost::shared_ptr<FeatureExtractor> Create() const {
-    return boost::shared_ptr<FeatureExtractor>(new FE);
-  }
-};
-
-void InitCommandLine(const FERegistry& r, int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  ostringstream feats;
-  feats << "[multiple] Features to extract (";
-  r.DisplayList(&feats);
-  feats << ")";
-  opts.add_options()
-        ("filtered_grammar,g", po::value<string>(), "Grammar to add features to")
-        ("list_features,L", "List extractable features")
-        ("feature,f", po::value<vector<string> >()->composing(), feats.str().c_str())
-        ("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)")
-        ("help,h", "Print this help message and exit");
-  po::options_description clo("Command line options");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  po::notify(*conf);
-
-  if (conf->count("help") || conf->count("aligned_corpus")==0 || conf->count("feature") == 0) {
-    cerr << "\nUsage: featurize_grammar -g FILTERED-GRAMMAR.gz -c ALIGNED_CORPUS.fr-en-al -f Feat1 -f Feat2 ... < UNFILTERED-GRAMMAR\n";
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-static const bool DEBUG = false;
-
-void LexTranslationTable::createTTable(const char* buf){
-  AnnotatedParallelSentence sent;
-  sent.ParseInputLine(buf);
-
-  //iterate over the alignment to compute aligned words
-
-  for(int i =0;i<sent.aligned.width();i++)
-  {
-    for (int j=0;j<sent.aligned.height();j++)
-    {
-      if (DEBUG) cerr << sent.aligned(i,j) << " ";
-      if( sent.aligned(i,j))
-      {
-        if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
-        ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])];
-        ++total_foreign[sent.f[i]];
-        ++total_english[sent.e[j]];
-      }
-    }
-    if (DEBUG) cerr << endl;
-  }
-  if (DEBUG) cerr << endl;
-
-  const WordID NULL_ = TD::Convert("NULL");
-  //handle unaligned words - align them to null
-  for (int j =0; j < sent.e_len; j++) {
-    if (sent.e_aligned[j]) continue;
-    ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])];
-    ++total_foreign[NULL_];
-    ++total_english[sent.e[j]];
-  }
-
-  for (int i =0; i < sent.f_len; i++) {
-    if (sent.f_aligned[i]) continue;
-    ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)];
-    ++total_english[NULL_];
-    ++total_foreign[sent.f[i]];
-  }
-}
-
-inline float safenlog(float v) {
-  if (v == 1.0f) return 0.0f;
-  float res = -log(v);
-  if (res > 100.0f) res = 100.0f;
-  return res;
-}
-
-static bool IsZero(float f) { return (f > 0.999 && f < 1.001); }
-
-struct FeatureExtractor {
-  // create any keys necessary
-  virtual void ObserveFilteredRule(const WordID /* lhs */,
-                                   const vector<WordID>& /* src */,
-                                   const vector<WordID>& /* trg */) {}
-
-  // compute statistics over keys, the same lhs-src-trg tuple may be seen
-  // more than once
-  virtual void ObserveUnfilteredRule(const WordID /* lhs */,
-                                     const vector<WordID>& /* src */,
-                                     const vector<WordID>& /* trg */,
-                                     const RuleStatistics& /* info */) {}
-
-  // compute features, a unique lhs-src-trg tuple will be seen exactly once
-  virtual void ExtractFeatures(const WordID lhs,
-                               const vector<WordID>& src,
-                               const vector<WordID>& trg,
-                               const RuleStatistics& info,
-                               SparseVector<float>* result) const = 0;
-
-  virtual ~FeatureExtractor() {}
-};
-
-struct LogRuleCount : public FeatureExtractor {
-  LogRuleCount() :
-    fid_(FD::Convert("LogRuleCount")),
-    sfid_(FD::Convert("SingletonRule")),
-    kCFE(FD::Convert("CFE")) {}
-  virtual void ExtractFeatures(const WordID lhs,
-                               const vector<WordID>& src,
-                               const vector<WordID>& trg,
-                               const RuleStatistics& info,
-                               SparseVector<float>* result) const {
-    (void) lhs; (void) src; (void) trg;
-    //result->set_value(fid_, log(info.counts.get(kCFE)));
-    result->set_value(fid_, log(info.counts.get(kCFE)));
-    if (IsZero(info.counts.get(kCFE)))
-      result->set_value(sfid_, 1);
-  }
-  const int fid_;
-  const int sfid_;
-  const int kCFE;
-};
-
-struct RulePenalty : public FeatureExtractor {
-  RulePenalty() : fid_(FD::Convert("RulePenalty")) {}
-  virtual void ExtractFeatures(const WordID /*lhs*/,
-                               const vector<WordID>& /*src*/,
-                               const vector<WordID>& /*trg*/,
-                               const RuleStatistics& /*info*/,
-                               SparseVector<float>* result) const
-  { result->set_value(fid_, 1); }
-
-  const int fid_;
-};
-
-// The negative log of the condition rule probs
-// ignoring the identities of the non-terminals.
-// i.e. the prob Hiero would assign.
-// Also extracts Labelled features.
-struct XFeatures: public FeatureExtractor {
-  XFeatures() :
-    fid_xfe(FD::Convert("XFE")),
-    fid_xef(FD::Convert("XEF")),
-    fid_labelledfe(FD::Convert("LabelledFE")),
-    fid_labelledef(FD::Convert("LabelledEF")),
-    fid_xesingleton(FD::Convert("XE_Singleton")),
-    fid_xfsingleton(FD::Convert("XF_Singleton")),
-    kCFE(FD::Convert("CFE")) {}
-  virtual void ObserveFilteredRule(const WordID /*lhs*/,
-                                   const vector<WordID>& src,
-                                   const vector<WordID>& trg) {
-    RuleTuple r(-1, src, trg);
-    map_rule(r);
-    rule_counts.inc(r, 0);
-    source_counts.inc(r.source(), 0);
-    target_counts.inc(r.target(), 0);
-  }
-
-  virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
-                                     const vector<WordID>& src,
-                                     const vector<WordID>& trg,
-                                     const RuleStatistics& info) {
-    RuleTuple r(-1, src, trg);
-    map_rule(r);
-    const int count = info.counts.get(kCFE);
-    assert(count > 0);
-    rule_counts.inc_if_exists(r, count);
-    source_counts.inc_if_exists(r.source(), count);
-    target_counts.inc_if_exists(r.target(), count);
-  }
-
-  virtual void ExtractFeatures(const WordID /*lhs*/,
-                               const vector<WordID>& src,
-                               const vector<WordID>& trg,
-                               const RuleStatistics& info,
-                               SparseVector<float>* result) const {
-    RuleTuple r(-1, src, trg);
-    map_rule(r);
-    double l_r_freq = log(rule_counts(r));
-
-    const int t_c = target_counts(r.target());
-    assert(t_c > 0);
-    result->set_value(fid_xfe, log(t_c) - l_r_freq);
-    result->set_value(fid_labelledfe, log(t_c) - log(info.counts.get(kCFE)));
-//    if (t_c == 1)
-//      result->set_value(fid_xesingleton, 1.0);
-
-    const int s_c = source_counts(r.source());
-    assert(s_c > 0);
-    result->set_value(fid_xef, log(s_c) - l_r_freq);
-    result->set_value(fid_labelledef, log(s_c) - log(info.counts.get(kCFE)));
-//    if (s_c == 1)
-//      result->set_value(fid_xfsingleton, 1.0);
-  }
-
-  void map_rule(RuleTuple& r) const {
-    vector<WordID> indexes; int i=0;
-    for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it) {
-      if (*it <= 0)
-        indexes.push_back(*it);
-    }
-    for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it) {
-      if (*it <= 0)
-        *it = indexes.at(i++);
-    }
-  }
-
-  const int fid_xfe, fid_xef;
-  const int fid_labelledfe, fid_labelledef;
-  const int fid_xesingleton, fid_xfsingleton;
-  const int kCFE;
-  RuleFreqCount rule_counts;
-  FreqCount< vector<WordID> > source_counts, target_counts;
-};
-
-struct LabelledRuleConditionals: public FeatureExtractor {
-  LabelledRuleConditionals() :
-    fid_fe(FD::Convert("LabelledFE")),
-    fid_ef(FD::Convert("LabelledEF")),
-    kCFE(FD::Convert("CFE")) {}
-  virtual void ObserveFilteredRule(const WordID lhs,
-                                   const vector<WordID>& src,
-                                   const vector<WordID>& trg) {
-    RuleTuple r(lhs, src, trg);
-    rule_counts.inc(r, 0);
-    source_counts.inc(r.source(), 0);
-
-    target_counts.inc(r.target(), 0);
-  }
-
-  virtual void ObserveUnfilteredRule(const WordID lhs,
-                                     const vector<WordID>& src,
-                                     const vector<WordID>& trg,
-                                     const RuleStatistics& info) {
-    RuleTuple r(lhs, src, trg);
-    rule_counts.inc_if_exists(r, info.counts.get(kCFE));
-    source_counts.inc_if_exists(r.source(), info.counts.get(kCFE));
-
-    target_counts.inc_if_exists(r.target(), info.counts.get(kCFE));
-  }
-
-  virtual void ExtractFeatures(const WordID lhs,
-                               const vector<WordID>& src,
-                               const vector<WordID>& trg,
-                               const RuleStatistics& /*info*/,
-                               SparseVector<float>* result) const {
-    RuleTuple r(lhs, src, trg);
-    double l_r_freq = log(rule_counts(r));
-    result->set_value(fid_fe, log(target_counts(r.target())) - l_r_freq);
-    result->set_value(fid_ef, log(source_counts(r.source())) - l_r_freq);
-  }
-
-  const int fid_fe, fid_ef;
-  const int kCFE;
-  RuleFreqCount rule_counts;
-  FreqCount< vector<WordID> > source_counts, target_counts;
-};
-
-struct LHSProb: public FeatureExtractor {
-  LHSProb() : fid_(FD::Convert("LHSProb")), kCFE(FD::Convert("CFE")), total_count(0) {}
-
-  virtual void ObserveUnfilteredRule(const WordID lhs,
-                                     const vector<WordID>& /*src*/,
-                                     const vector<WordID>& /*trg*/,
-                                     const RuleStatistics& info) {
-    int count = info.counts.get(kCFE);
-    total_count += count;
-    lhs_counts.inc(lhs, count);
-  }
-
-  virtual void ExtractFeatures(const WordID lhs,
-                               const vector<WordID>& /*src*/,
-                               const vector<WordID>& /*trg*/,
-                               const RuleStatistics& /*info*/,
-                               SparseVector<float>* result) const {
-    double lhs_log_prob = log(total_count) - log(lhs_counts(lhs));
-    result->set_value(fid_, lhs_log_prob);
-  }
-
-  const int fid_;
-  const int kCFE;
-  int total_count;
-  FreqCount<WordID> lhs_counts;
-};
-
-// Proper rule generative probability: p( s,t | lhs)
-struct GenerativeProb: public FeatureExtractor {
-  GenerativeProb() :
-    fid_(FD::Convert("GenerativeProb")),
-    kCFE(FD::Convert("CFE")) {}
-
-  virtual void ObserveUnfilteredRule(const WordID lhs,
-                                     const vector<WordID>& /*src*/,
-                                     const vector<WordID>& /*trg*/,
-                                     const RuleStatistics& info)
-  { lhs_counts.inc(lhs, info.counts.get(kCFE)); }
-
-  virtual void ExtractFeatures(const WordID lhs,
-                               const vector<WordID>& /*src*/,
-                               const vector<WordID>& /*trg*/,
-                               const RuleStatistics& info,
-                               SparseVector<float>* result) const {
-    double log_prob = log(lhs_counts(lhs)) - log(info.counts.get(kCFE));
-    result->set_value(fid_, log_prob);
-  }
-
-  const int fid_;
-  const int kCFE;
-  FreqCount<WordID> lhs_counts;
-};
-
-// remove terminals from the rules before estimating the conditional prob
-struct LabellingShape: public FeatureExtractor {
-  LabellingShape() : fid_(FD::Convert("LabellingShape")), kCFE(FD::Convert("CFE")) {}
-
-  virtual void ObserveFilteredRule(const WordID /*lhs*/,
-                                   const vector<WordID>& src,
-                                   const vector<WordID>& trg) {
-    RuleTuple r(-1, src, trg);
-    map_rule(r);
-    rule_counts.inc(r, 0);
-    source_counts.inc(r.source(), 0);
-  }
-
-  virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
-                                     const vector<WordID>& src,
-                                     const vector<WordID>& trg,
-                                     const RuleStatistics& info) {
-    RuleTuple r(-1, src, trg);
-    map_rule(r);
-    rule_counts.inc_if_exists(r, info.counts.get(kCFE));
-    source_counts.inc_if_exists(r.source(), info.counts.get(kCFE));
-  }
-
-  virtual void ExtractFeatures(const WordID /*lhs*/,
-                               const vector<WordID>& src,
-                               const vector<WordID>& trg,
-                               const RuleStatistics& /*info*/,
-                               SparseVector<float>* result) const {
-    RuleTuple r(-1, src, trg);
-    map_rule(r);
-    double l_r_freq = log(rule_counts(r));
-    result->set_value(fid_, log(source_counts(r.source())) - l_r_freq);
-  }
-
-  // Replace all terminals with generic -1
-  void map_rule(RuleTuple& r) const {
-    for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it)
-      if (*it <= 0) *it = -1;
-    for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it)
-      if (*it <= 0) *it = -1;
-  }
-
-  const int fid_, kCFE;
-  RuleFreqCount rule_counts;
-  FreqCount< vector<WordID> > source_counts;
-};
-
-// this extracts the lexical translation prob features
-// in BOTH directions.
-struct LexProbExtractor : public FeatureExtractor {
-  LexProbExtractor() :
-      e2f_(FD::Convert("LexE2F")), f2e_(FD::Convert("LexF2E")) {
-    ReadFile rf(aligned_corpus);
-    //create lexical translation table
-    cerr << "Computing lexical translation probabilities from " << aligned_corpus << "..." << endl;
-    char* buf = new char[MAX_LINE_LENGTH];
-    istream& alignment = *rf.stream();
-    while(alignment) {
-      alignment.getline(buf, MAX_LINE_LENGTH);
-      if (buf[0] == 0) continue;
-      table.createTTable(buf);
-    }
-    delete[] buf;
-  }
-
-  virtual void ExtractFeatures(const WordID /*lhs*/,
-                               const vector<WordID>& src,
-                               const vector<WordID>& trg,
-                               const RuleStatistics& info,
-                               SparseVector<float>* result) const {
-    map <WordID, pair<int, float> > foreign_aligned;
-    map <WordID, pair<int, float> > english_aligned;
-
-    //Loop over all the alignment points to compute lexical translation probability
-    const vector< pair<short,short> >& al = info.aligns;
-    vector< pair<short,short> >::const_iterator ita;
-    for (ita = al.begin(); ita != al.end(); ++ita) {
-      if (DEBUG) {
-        cerr << "\nA:" << ita->first << "," << ita->second << "::";
-        cerr << TD::Convert(src[ita->first]) << "-" << TD::Convert(trg[ita->second]);
-      }
-
-      //Lookup this alignment probability in the table
-      int temp = table.word_translation[pair<WordID,WordID> (src[ita->first],trg[ita->second])];
-      float f2e=0, e2f=0;
-      if ( table.total_foreign[src[ita->first]] != 0)
-        f2e = (float) temp / table.total_foreign[src[ita->first]];
-      if ( table.total_english[trg[ita->second]] !=0 )
-        e2f = (float) temp / table.total_english[trg[ita->second]];
-      if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f);
-
-      //local counts to keep track of which things haven't been aligned, to later compute their null alignment
-      if (foreign_aligned.count(src[ita->first])) {
-        foreign_aligned[ src[ita->first] ].first++;
-        foreign_aligned[ src[ita->first] ].second += e2f;
-      } else {
-        foreign_aligned[ src[ita->first] ] = pair<int,float> (1,e2f);
-      }
-
-      if (english_aligned.count( trg[ ita->second] )) {
-        english_aligned[ trg[ ita->second] ].first++;
-        english_aligned[ trg[ ita->second] ].second += f2e;
-      } else {
-        english_aligned[ trg[ ita->second] ] = pair<int,float> (1,f2e);
-      }
-    }
-
-    float final_lex_f2e=1, final_lex_e2f=1;
-    static const WordID NULL_ = TD::Convert("NULL");
-
-    //compute lexical
weight P(F|E) and include unaligned foreign words - for(int i=0;i<src.size(); i++) { - if (!table.total_foreign.count(src[i])) continue; //if we dont have it in the translation table, we won't know its lexical weight - - if (foreign_aligned.count(src[i])) - { - pair<int, float> temp_lex_prob = foreign_aligned[src[i]]; - final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first; - } - else //dealing with null alignment - { - int temp_count = table.word_translation[pair<WordID,WordID> (src[i],NULL_)]; - float temp_e2f = (float) temp_count / table.total_english[NULL_]; - final_lex_e2f *= temp_e2f; - } - - } - - //compute P(E|F) unaligned english words - for(int j=0; j< trg.size(); j++) { - if (!table.total_english.count(trg[j])) continue; - - if (english_aligned.count(trg[j])) - { - pair<int, float> temp_lex_prob = english_aligned[trg[j]]; - final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first; - } - else //dealing with null - { - int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,trg[j])]; - float temp_f2e = (float) temp_count / table.total_foreign[NULL_]; - final_lex_f2e *= temp_f2e; - } - } - result->set_value(e2f_, safenlog(final_lex_e2f)); - result->set_value(f2e_, safenlog(final_lex_f2e)); - } - const int e2f_, f2e_; - mutable LexTranslationTable table; -}; - -struct Featurizer { - Featurizer(const vector<boost::shared_ptr<FeatureExtractor> >& ex) : extractors(ex) { - } - void Callback1(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) { - for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) { - for (int i = 0; i < extractors.size(); ++i) - extractors[i]->ObserveFilteredRule(lhs, src, it->first); - } - } - void Callback2(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) { - for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) { - for (int i = 0; i < extractors.size(); ++i) - extractors[i]->ObserveUnfilteredRule(lhs, src, it->first, it->second); - } - } - void Callback3(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) { - for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) { - SparseVector<float> feats; - for (int i = 0; i < extractors.size(); ++i) - extractors[i]->ExtractFeatures(lhs, src, it->first, it->second, &feats); - cout << '[' << TD::Convert(-lhs) << "] ||| "; - WriteNamed(src, &cout); - cout << " ||| "; - WriteAnonymous(it->first, &cout); - cout << " ||| "; - print(cout,feats,"="); - cout << endl; - } - } - private: - vector<boost::shared_ptr<FeatureExtractor> > extractors; -}; - -void cb1(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) { - static_cast<Featurizer*>(extra)->Callback1(lhs, src_rhs, rules); -} - -void cb2(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) { - static_cast<Featurizer*>(extra)->Callback2(lhs, src_rhs, rules); -} - -void cb3(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) { - static_cast<Featurizer*>(extra)->Callback3(lhs, src_rhs, rules); -} - -int main(int argc, char** argv){ - FERegistry reg; - reg.Register("LogRuleCount", new FEFactory<LogRuleCount>); - reg.Register("LexProb", new FEFactory<LexProbExtractor>); - reg.Register("XFeatures", new FEFactory<XFeatures>); - reg.Register("LabelledRuleConditionals", new FEFactory<LabelledRuleConditionals>); - reg.Register("RulePenalty", new FEFactory<RulePenalty>); - reg.Register("LHSProb", new FEFactory<LHSProb>); - 
reg.Register("LabellingShape", new FEFactory<LabellingShape>); - reg.Register("GenerativeProb", new FEFactory<GenerativeProb>); - po::variables_map conf; - InitCommandLine(reg, argc, argv, &conf); - aligned_corpus = conf["aligned_corpus"].as<string>(); // GLOBAL VAR - ReadFile fg1(conf["filtered_grammar"].as<string>()); - - vector<string> feats = conf["feature"].as<vector<string> >(); - vector<boost::shared_ptr<FeatureExtractor> > extractors(feats.size()); - for (int i = 0; i < feats.size(); ++i) - extractors[i] = reg.Create(feats[i]); - Featurizer fizer(extractors); - - cerr << "Reading filtered grammar to detect keys..." << endl; - StripedGrammarLexer::ReadStripedGrammar(fg1.stream(), cb1, &fizer); - - cerr << "Reading unfiltered grammar..." << endl; - StripedGrammarLexer::ReadStripedGrammar(&cin, cb2, &fizer); - - ReadFile fg2(conf["filtered_grammar"].as<string>()); - cerr << "Reading filtered grammar and adding features..." << endl; - StripedGrammarLexer::ReadStripedGrammar(fg2.stream(), cb3, &fizer); - - return 0; -} - diff --git a/extools/filter_grammar.cc b/extools/filter_grammar.cc deleted file mode 100644 index cafcc923..00000000 --- a/extools/filter_grammar.cc +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Filter a grammar in striped format - */ -#include <iostream> -#include <string> -#include <map> -#include <vector> -#include <utility> -#include <tr1/unordered_map> - -#include "suffix_tree.h" -#include "sparse_vector.h" -#include "sentence_pair.h" -#include "extract.h" -#include "fdict.h" -#include "tdict.h" -#include "filelib.h" -#include "striped_grammar.h" - -#include <boost/shared_ptr.hpp> -#include <boost/functional/hash.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static const size_t MAX_LINE_LENGTH = 64000000; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("test_set,t", po::value<string>(), "Filter for this test set") - ("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 
0 for all") - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help") || conf->count("test_set")==0) { - cerr << "\nUsage: filter_grammar -t TEST-SET.fr [-options] < grammar\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct SourceFilter { - // return true to keep the rule, otherwise false - virtual bool Matches(const vector<WordID>& key) const = 0; - virtual ~SourceFilter() {} -}; - -struct DumbSuffixTreeFilter : SourceFilter { - DumbSuffixTreeFilter(const string& corpus) { - cerr << "Build suffix tree from test set in " << corpus << endl; - assert(FileExists(corpus)); - ReadFile rfts(corpus); - istream& testSet = *rfts.stream(); - char* buf = new char[MAX_LINE_LENGTH]; - AnnotatedParallelSentence sent; - - /* process the data set to build suffix tree - */ - while(!testSet.eof()) { - testSet.getline(buf, MAX_LINE_LENGTH); - if (buf[0] == 0) continue; - - //hack to read in the test set using AnnotatedParallelSentence - strcat(buf," ||| fake ||| 0-0"); - sent.ParseInputLine(buf); - - //add each successive suffix to the tree - for(int i=0; i<sent.f_len; i++) - root.InsertPath(sent.f, i, sent.f_len - 1); - } - delete[] buf; - } - virtual bool Matches(const vector<WordID>& src_rhs) const { - const Node<int>* curnode = &root; - for(int i=0; i < src_rhs.size(); i++) { - if (src_rhs[i] <= 0) { - curnode = &root; - } else if (curnode) { - curnode = curnode->Extend(src_rhs[i]); - if (!curnode) return false; - } - } - return true; - } - Node<int> root; -}; - -boost::shared_ptr<SourceFilter> filter; -multimap<float, ID2RuleStatistics::const_iterator> options; -int kCOUNT; -int max_options; - -void cb(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void*) { - options.clear(); - if (!filter || filter->Matches(src_rhs)) { - for (ID2RuleStatistics::const_iterator it = rules.begin(); it != rules.end(); ++it) { - options.insert(make_pair(-it->second.counts.get(kCOUNT), it)); - } - int ocount = 0; - cout << '[' << TD::Convert(-lhs) << ']' << " ||| "; - WriteNamed(src_rhs, &cout); - cout << '\t'; - bool first = true; - for (multimap<float,ID2RuleStatistics::const_iterator>::iterator it = options.begin(); it != options.end(); ++it) { - if (first) { first = false; } else { cout << " ||| "; } - WriteAnonymous(it->second->first, &cout); - cout << " ||| " << it->second->second; - ++ocount; - if (ocount == max_options) break; - } - cout << endl; - } -} - -int main(int argc, char** argv){ - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - max_options = conf["top_e_given_f"].as<size_t>();; - kCOUNT = FD::Convert("CFE"); - istream& unscored_grammar = cin; - cerr << "Loading test set " << conf["test_set"].as<string>() << "...\n"; - filter.reset(new DumbSuffixTreeFilter(conf["test_set"].as<string>())); - cerr << "Filtering...\n"; - StripedGrammarLexer::ReadStripedGrammar(&unscored_grammar, cb, NULL); -} - diff --git a/extools/lex_trans_tbl.h b/extools/lex_trans_tbl.h deleted file mode 100644 index 161b4a0d..00000000 --- a/extools/lex_trans_tbl.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * lex_trans_tbl.h - * - * Created on: May 25, 2010 - * Author: Vlad - */ - -#ifndef LEX_TRANS_TBL_H_ -#define LEX_TRANS_TBL_H_ - -#include "wordid.h" -#include <map> - -class LexTranslationTable -{ - public: - - std::map < 
std::pair<WordID,WordID>,int > word_translation; - std::map <WordID, int> total_foreign; - std::map <WordID, int> total_english; - void createTTable(const char* buf); - -}; - -#endif /* LEX_TRANS_TBL_H_ */ diff --git a/extools/merge_lines.pl b/extools/merge_lines.pl deleted file mode 100755 index 8711e4ce..00000000 --- a/extools/merge_lines.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -if (scalar @ARGV < 2) { - die "Usage: $0 file1.txt file2.txt ...\n\n Concatenate the nth line of each input file. All files\n must be the same length.\n\n"; -} - -my @fhs=(); -for my $file (@ARGV) { - my $fh; - open $fh, "<$file" or die "Can't read $file: $!\n"; - push @fhs, $fh; -} - -my $first = shift @fhs; - -while(my $x = <$first>) { - my $ind = 0; - chomp $x; - my @fields = ($x); - for my $fh (@fhs) { - $ind++; - $x = <$fh>; - die "ERROR: Mismatched number of lines: $ARGV[$ind]\n" unless $x; - chomp $x; - push @fields, $x; - } - print join ' ||| ', @fields; - print "\n"; -} -my $ind = 0; -for my $fh (@fhs) { - $ind++; - my $x=<$fh>; - die "ERROR: $ARGV[$ind] has extra lines!\n" if $x; -} - -exit 0; - -for my $fh (@fhs) { - close $fh; -} - diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc deleted file mode 100644 index c9b2eb2a..00000000 --- a/extools/mr_stripe_rule_reduce.cc +++ /dev/null @@ -1,172 +0,0 @@ -#include <iostream> -#include <vector> -#include <utility> -#include <cstdlib> -#include <tr1/unordered_map> - -#include <boost/functional/hash.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "striped_grammar.h" -#include "tdict.h" -#include "sentence_pair.h" -#include "fdict.h" -#include "extract.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static const size_t MAX_LINE_LENGTH = 64000000; - -bool use_hadoop_counters = false; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("phrase_marginals,p", "Compute phrase marginals") - ("use_hadoop_counters,C", "Enable this if running inside Hadoop") - ("bidir,b", "Rules are tagged as being F->E or E->F, invert E rules in output") - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help")) { - cerr << "\nUsage: mr_stripe_rule_reduce [-options]\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -void PlusEquals(const ID2RuleStatistics& v, ID2RuleStatistics* self) { - for (ID2RuleStatistics::const_iterator it = v.begin(); it != v.end(); ++it) { - RuleStatistics& dest = (*self)[it->first]; - dest += it->second; - // TODO - do something smarter about alignments? 
- if (dest.aligns.empty() && !it->second.aligns.empty()) - dest.aligns = it->second.aligns; - } -} - -void WriteKeyValue(const vector<WordID>& key, const ID2RuleStatistics& val) { - cout << TD::GetString(key) << '\t'; - bool needdiv = false; - for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) { - if (needdiv) cout << " ||| "; else needdiv = true; - cout << TD::GetString(it->first) << " ||| " << it->second; - } - cout << endl; - if (use_hadoop_counters) cerr << "reporter:counter:UserCounters,RuleCount," << val.size() << endl; -} - -void DoPhraseMarginals(const vector<WordID>& key, const bool bidir, ID2RuleStatistics* val) { - static const WordID kF = TD::Convert("F"); - static const WordID kE = TD::Convert("E"); - static const int kCF = FD::Convert("CF"); - static const int kCE = FD::Convert("CE"); - static const int kCFE = FD::Convert("CFE"); - assert(key.size() > 0); - int cur_marginal_id = kCF; - if (bidir) { - if (key[0] != kF && key[0] != kE) { - cerr << "DoPhraseMarginals expects keys to have the form 'F|E [NT] word word word'\n"; - cerr << " but got: " << TD::GetString(key) << endl; - exit(1); - } - if (key[0] == kE) cur_marginal_id = kCE; - } - double tot = 0; - for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) - tot += it->second.counts.get(kCFE); - for (ID2RuleStatistics::iterator it = val->begin(); it != val->end(); ++it) { - it->second.counts.set_value(cur_marginal_id, tot); - - // prevent double counting of the joint - if (cur_marginal_id == kCE) it->second.counts.erase(kCFE); - } -} - -void WriteWithInversions(const vector<WordID>& key, const ID2RuleStatistics& val) { - static const WordID kE = TD::Convert("E"); - static const WordID kDIV = TD::Convert("|||"); - vector<WordID> new_key(key.size() - 1); - for (int i = 1; i < key.size(); ++i) - new_key[i - 1] = key[i]; - const bool do_invert = (key[0] == kE); - if (!do_invert) { - WriteKeyValue(new_key, val); - } else { - ID2RuleStatistics inv; - assert(new_key.size() > 2); - vector<WordID> tk(new_key.size() - 2); - for (int i = 0; i < tk.size(); ++i) - tk[i] = new_key[2 + i]; - RuleStatistics& inv_stats = inv[tk]; - for (ID2RuleStatistics::const_iterator it = val.begin(); it != val.end(); ++it) { - inv_stats.counts = it->second.counts; - vector<WordID> ekey(2 + it->first.size()); - ekey[0] = key[1]; - ekey[1] = kDIV; - for (int i = 0; i < it->first.size(); ++i) - ekey[2+i] = it->first[i]; - WriteKeyValue(ekey, inv); - } - } -} - -struct Reducer { - Reducer(bool phrase_marginals, bool bidir) : pm_(phrase_marginals), bidir_(bidir) {} - - void ProcessLine(const vector<WordID>& key, const ID2RuleStatistics& rules) { - if (cur_key_ != key) { - if (cur_key_.size() > 0) Emit(); - acc_.clear(); - cur_key_ = key; - } - PlusEquals(rules, &acc_); - } - - ~Reducer() { - Emit(); - } - - void Emit() { - if (pm_) - DoPhraseMarginals(cur_key_, bidir_, &acc_); - if (bidir_) - WriteWithInversions(cur_key_, acc_); - else - WriteKeyValue(cur_key_, acc_); - } - - const bool pm_; - const bool bidir_; - vector<WordID> cur_key_; - ID2RuleStatistics acc_; -}; - -void cb(const vector<WordID>& key, const ID2RuleStatistics& contexts, void* red) { - static_cast<Reducer*>(red)->ProcessLine(key, contexts); -} - - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - char* buf = new char[MAX_LINE_LENGTH]; - vector<WordID> key, cur_key; - int line = 0; - use_hadoop_counters = conf.count("use_hadoop_counters") > 0; - const bool phrase_marginals = 
conf.count("phrase_marginals") > 0; - const bool bidir = conf.count("bidir") > 0; - Reducer reducer(phrase_marginals, bidir); - StripedGrammarLexer::ReadContexts(&cin, cb, &reducer); - return 0; -} - diff --git a/extools/score_grammar.cc b/extools/score_grammar.cc deleted file mode 100644 index 0945e018..00000000 --- a/extools/score_grammar.cc +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Score a grammar in striped format - * ./score_grammar <alignment> < filtered.grammar > scored.grammar - */ -#include <iostream> -#include <string> -#include <map> -#include <vector> -#include <utility> -#include <cstdlib> -#include <fstream> -#include <tr1/unordered_map> - -#include "sentence_pair.h" -#include "extract.h" -#include "fdict.h" -#include "tdict.h" -#include "lex_trans_tbl.h" -#include "filelib.h" - -#include <boost/functional/hash.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -using namespace std; -using namespace std::tr1; - - -static const size_t MAX_LINE_LENGTH = 64000000; - -typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics; - - -namespace { - inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } - inline bool IsBracket(char c){return c == '[' || c == ']';} - inline void SkipWhitespace(const char* buf, int* ptr) { - while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } - } -} - -int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) { - static const WordID kDIV = TD::Convert("|||"); - int ptr = sstart; - while(ptr < end) { - while(ptr < end && IsWhitespace(buf[ptr])) { ++ptr; } - int start = ptr; - while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } - if (ptr == start) {cerr << "Warning! 
empty token.\n"; return ptr; } - const WordID w = TD::Convert(string(buf, start, ptr - start)); - - if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV)) - p->push_back(1 * w); - else { - if (w == kDIV) return ptr; - p->push_back(w); - } - } - return ptr; -} - - -void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) { - static const WordID kDIV = TD::Convert("|||"); - counts->clear(); - int ptr = 0; - while(buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; } - if (buf[ptr] != '\t') { - cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl; - exit(1); - } - cur_key->clear(); - // key is: "[X] ||| word word word" - int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key); - cur_key->push_back(kDIV); - ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key); - ++ptr; - int start = ptr; - int end = ptr; - int state = 0; // 0=reading label, 1=reading count - vector<WordID> name; - while(buf[ptr] != 0) { - while(buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; } - if (buf[ptr] == '|') { - ++ptr; - if (buf[ptr] == '|') { - ++ptr; - if (buf[ptr] == '|') { - ++ptr; - end = ptr - 3; - while (end > start && IsWhitespace(buf[end-1])) { --end; } - if (start == end) { - cerr << "Got empty token!\n LINE=" << buf << endl; - exit(1); - } - switch (state) { - case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; - case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; - default: cerr << "Can't happen\n"; abort(); - } - SkipWhitespace(buf, &ptr); - start = ptr; - } - } - } - } - end=ptr; - while (end > start && IsWhitespace(buf[end-1])) { --end; } - if (end > start) { - switch (state) { - case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break; - case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break; - default: cerr << "Can't happen\n"; abort(); - } - } -} - - - -void LexTranslationTable::createTTable(const char* buf){ - - bool DEBUG = false; - - AnnotatedParallelSentence sent; - - sent.ParseInputLine(buf); - - //iterate over the alignment to compute aligned words - - for(int i =0;i<sent.aligned.width();i++) - { - for (int j=0;j<sent.aligned.height();j++) - { - if (DEBUG) cerr << sent.aligned(i,j) << " "; - if( sent.aligned(i,j)) - { - if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]); - ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])]; - ++total_foreign[sent.f[i]]; - ++total_english[sent.e[j]]; - } - } - if (DEBUG) cerr << endl; - } - if (DEBUG) cerr << endl; - - static const WordID NULL_ = TD::Convert("NULL"); - //handle unaligned words - align them to null - for (int j =0; j < sent.e_len; j++) - { - if (sent.e_aligned[j]) continue; - ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])]; - ++total_foreign[NULL_]; - ++total_english[sent.e[j]]; - } - - for (int i =0; i < sent.f_len; i++) - { - if (sent.f_aligned[i]) continue; - ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)]; - ++total_english[NULL_]; - ++total_foreign[sent.f[i]]; - } - -} - - -inline float safenlog(float v) { - if (v == 1.0f) return 0.0f; - float res = -log(v); - if (res > 100.0f) res = 100.0f; - return res; -} - -int main(int argc, char** argv){ - bool DEBUG= false; - if (argc != 2) { - cerr << "Usage: " << argv[0] << " corpus.al < filtered.grammar\n"; - return 1; - } - ifstream alignment (argv[1]); - istream& unscored_grammar = cin; - ostream& scored_grammar = cout; - - //create lexical translation 
table - cerr << "Creating table..." << endl; - char* buf = new char[MAX_LINE_LENGTH]; - - LexTranslationTable table; - - while(!alignment.eof()) - { - alignment.getline(buf, MAX_LINE_LENGTH); - if (buf[0] == 0) continue; - - table.createTTable(buf); - } - - bool PRINT_TABLE=false; - if (PRINT_TABLE) - { - ofstream trans_table; - trans_table.open("lex_trans_table.out"); - for(map < pair<WordID,WordID>,int >::iterator it = table.word_translation.begin(); it != table.word_translation.end(); ++it) - { - trans_table << TD::Convert(it->first.first) << "|||" << TD::Convert(it->first.second) << "==" << it->second << "//" << table.total_foreign[it->first.first] << "//" << table.total_english[it->first.second] << endl; - } - - trans_table.close(); - } - - - //score unscored grammar - cerr <<"Scoring grammar..." << endl; - - ID2RuleStatistics acc, cur_counts; - vector<WordID> key, cur_key,temp_key; - vector< pair<short,short> > al; - vector< pair<short,short> >::iterator ita; - int line = 0; - - static const int kCF = FD::Convert("CF"); - static const int kCE = FD::Convert("CE"); - static const int kCFE = FD::Convert("CFE"); - - while(!unscored_grammar.eof()) - { - ++line; - unscored_grammar.getline(buf, MAX_LINE_LENGTH); - if (buf[0] == 0) continue; - ParseLine(buf, &cur_key, &cur_counts); - - //loop over all the Target side phrases that this source aligns to - for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) - { - - /*Compute phrase translation prob. - Print out scores in this format: - Phrase translation prob P(F|E) - Phrase translation prob P(E|F) - Lexical weighting prob lex(F|E) - Lexical weighting prob lex(E|F) - */ - - float pEF_ = it->second.counts.value(kCFE) / it->second.counts.value(kCF); - float pFE_ = it->second.counts.value(kCFE) / it->second.counts.value(kCE); - - map <WordID, pair<int, float> > foreign_aligned; - map <WordID, pair<int, float> > english_aligned; - - //Loop over all the alignment points to compute lexical translation probability - al = it->second.aligns; - for(ita = al.begin(); ita != al.end(); ++ita) - { - - if (DEBUG) - { - cerr << "\nA:" << ita->first << "," << ita->second << "::"; - cerr << TD::Convert(cur_key[ita->first + 2]) << "-" << TD::Convert(it->first[ita->second]); - } - - - //Lookup this alignment probability in the table - int temp = table.word_translation[pair<WordID,WordID> (cur_key[ita->first+2],it->first[ita->second])]; - float f2e=0, e2f=0; - if ( table.total_foreign[cur_key[ita->first+2]] != 0) - f2e = (float) temp / table.total_foreign[cur_key[ita->first+2]]; - if ( table.total_english[it->first[ita->second]] !=0 ) - e2f = (float) temp / table.total_english[it->first[ita->second]]; - if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f); - - - //local counts to keep track of which things haven't been aligned, to later compute their null alignment - if (foreign_aligned.count(cur_key[ita->first+2])) - { - foreign_aligned[ cur_key[ita->first+2] ].first++; - foreign_aligned[ cur_key[ita->first+2] ].second += e2f; - } - else - foreign_aligned [ cur_key[ita->first+2] ] = pair<int,float> (1,e2f); - - - - if (english_aligned.count( it->first[ ita->second] )) - { - english_aligned[ it->first[ ita->second ]].first++; - english_aligned[ it->first[ ita->second] ].second += f2e; - } - else - english_aligned [ it->first[ ita->second] ] = pair<int,float> (1,f2e); - - - - - } - - float final_lex_f2e=1, final_lex_e2f=1; - static const WordID NULL_ = TD::Convert("NULL"); - - //compute lexical weight P(F|E) and include 
unaligned foreign words - for(int i=0;i<cur_key.size(); i++) - { - - if (!table.total_foreign.count(cur_key[i])) continue; //if we dont have it in the translation table, we won't know its lexical weight - - if (foreign_aligned.count(cur_key[i])) - { - pair<int, float> temp_lex_prob = foreign_aligned[cur_key[i]]; - final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first; - } - else //dealing with null alignment - { - int temp_count = table.word_translation[pair<WordID,WordID> (cur_key[i],NULL_)]; - float temp_e2f = (float) temp_count / table.total_english[NULL_]; - final_lex_e2f *= temp_e2f; - } - - } - - //compute P(E|F) unaligned english words - for(int j=0; j< it->first.size(); j++) - { - if (!table.total_english.count(it->first[j])) continue; - - if (english_aligned.count(it->first[j])) - { - pair<int, float> temp_lex_prob = english_aligned[it->first[j]]; - final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first; - } - else //dealing with null - { - int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,it->first[j])]; - float temp_f2e = (float) temp_count / table.total_foreign[NULL_]; - final_lex_f2e *= temp_f2e; - } - } - - - scored_grammar << TD::GetString(cur_key); - string lhs = TD::Convert(cur_key[0]); - scored_grammar << " " << TD::GetString(it->first) << " |||"; - if(lhs.find('_')!=string::npos) { - scored_grammar << " Bkoff=" << safenlog(3.0f); - } else { - scored_grammar << " FGivenE=" << safenlog(pFE_) << " EGivenF=" << safenlog(pEF_); - scored_grammar << " LexE2F=" << safenlog(final_lex_e2f) << " LexF2E=" << safenlog(final_lex_f2e); - } - scored_grammar << endl; - } - } -} - diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc deleted file mode 100644 index 7d60715a..00000000 --- a/extools/sentence_pair.cc +++ /dev/null @@ -1,198 +0,0 @@ -#include "sentence_pair.h" - -#include <queue> -#include <iostream> -#include <string> -#include <vector> -#include <utility> -#include <set> -#include <boost/tuple/tuple_comparison.hpp> - -#include "tdict.h" -#include "wordid.h" -#include "array2d.h" - -using namespace std; -using namespace boost; - -namespace { - inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } - - inline void SkipWhitespace(const char* buf, int* ptr) { - while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } - } -} - -void AnnotatedParallelSentence::Reset() { - f.clear(); - e.clear(); - e_aligned.clear(); - f_aligned.clear(); - aligns_by_fword.clear(); - aligned.clear(); - span_types.clear(); -} - -void AnnotatedParallelSentence::AllocateForAlignment() { - f_len = f.size(); - e_len = e.size(); - aligned.resize(f_len, e_len, false); - f_aligned.resize(f_len, 0); - e_aligned.resize(e_len, 0); - aligns_by_fword.resize(f_len); -} - -// read an alignment point of the form X-Y where X and Y are strings -// of digits. 
if permit_col is true, the right edge will be determined -// by the presence of a colon -int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf, - const int start, - const int end, - const bool permit_col, - short* a, short* b, short* c, short* d) { - if (end - start < 3) { - cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - int ch = start; - *a = 0; - while(ch < end && buf[ch] != '-') { - if (buf[ch] < '0' || buf[ch] > '9') { - cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*a) *= 10; - (*a) += buf[ch] - '0'; - ++ch; - } - ++ch; - if (ch >= end) { - cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*b) = 0; - while((ch < end) && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) { - if ((buf[ch] < '0') || (buf[ch] > '9')) { - cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl << buf[ch] << endl; - exit(1); - } - (*b) *= 10; - (*b) += buf[ch] - '0'; - ++ch; - } - if (c != 0) - { - ++ch; - if (ch >= end) { - cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*c) = 0; - while(ch < end && buf[ch] != '-') { - if (buf[ch] < '0' || buf[ch] > '9') { - cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*c) *= 10; - (*c) += buf[ch] - '0'; - ++ch; - } - ++ch; - if (ch >= end) { - cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*d) = 0; - while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) { - if (buf[ch] < '0' || buf[ch] > '9') { - cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*d) *= 10; - (*d) += buf[ch] - '0'; - ++ch; - } - } - return ch; -} - -void AnnotatedParallelSentence::Align(const short a, const short b) { - aligned(a,b) = true; - ++f_aligned[a]; - ++e_aligned[b]; - aligns_by_fword[a].push_back(make_pair(a,b)); - // cerr << a << " " << b << endl; -} - -void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) { - short a, b; - ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0); - if (a >= f_len || b >= e_len) { - cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl; - exit(1); - } - Align(a,b); -} - -void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) { - short a,b,c,d; - int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1; - if (buf[ch-1] != ':' || ch >= end) { - cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - if (a >= f_len || b > f_len) { - cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl; - exit(1); - } - if (c >= e_len || d > e_len) { - cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. 
INPUT=\n" << buf << endl; - exit(1); - } - // cerr << a << " " << b << " " << string(buf,c,end-c) << endl; - span_types[boost::make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch))); -} - -// INPUT FORMAT -// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP -void AnnotatedParallelSentence::ParseInputLine(const char* buf) { - Reset(); - int ptr = 0; - SkipWhitespace(buf, &ptr); - int start = ptr; - int state = 0; // 0 = French, 1 = English, 2 = Alignment, 3 = Spans - while(char c = buf[ptr]) { - if (!IsWhitespace(c)) { ++ptr; continue; } else { - if (ptr - start == 3 && buf[start] == '|' && buf[start+1] == '|' && buf[start+2] == '|') { - ++state; - if (state == 4) { cerr << "Too many fields (ignoring):\n " << buf << endl; return; } - if (state == 2) { - // cerr << "FLEN=" << f->size() << " ELEN=" << e->size() << endl; - AllocateForAlignment(); - } - SkipWhitespace(buf, &ptr); - start = ptr; - continue; - } - switch (state) { - case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 2: ParseAlignmentPoint(buf, start, ptr); break; - case 3: ParseSpanLabel(buf, start, ptr); break; - default: cerr << "Can't happen\n"; abort(); - } - SkipWhitespace(buf, &ptr); - start = ptr; - } - } - if (ptr > start) { - switch (state) { - case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 2: ParseAlignmentPoint(buf, start, ptr); break; - case 3: ParseSpanLabel(buf, start, ptr); break; - default: cerr << "Can't happen\n"; abort(); - } - } -} - diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h deleted file mode 100644 index a05275e7..00000000 --- a/extools/sentence_pair.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef _SENTENCE_PAIR_H_ -#define _SENTENCE_PAIR_H_ - -#include <map> -#include <utility> -#include <vector> -#include <boost/tuple/tuple.hpp> -#include "wordid.h" -#include "array2d.h" - -// represents a parallel sentence with a word alignment and category -// annotations over subspans (currently in terms of f) -// you should read one using ParseInputLine and then use the public -// member variables to query things about it -struct AnnotatedParallelSentence { - // read annotated parallel sentence from string - void ParseInputLine(const char* buf); - - std::vector<WordID> f, e; // words in f and e - - // word alignment information - std::vector<int> e_aligned, f_aligned; // counts the number of times column/row x is aligned - Array2D<bool> aligned; - std::vector<std::vector<std::pair<short, short> > > aligns_by_fword; - - // span type information - std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > span_types; - // span_types(i,j,k,l) is the list of category span (i,j) in source and (k,l) in the target language. 
- - int f_len, e_len; - - void Align(const short a, const short b); - void AllocateForAlignment(); - - static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d); - - private: - void Reset(); - void ParseAlignmentPoint(const char* buf, int start, int end); - void ParseSpanLabel(const char* buf, int start, int end); -}; - -#endif diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l deleted file mode 100644 index c85cdea7..00000000 --- a/extools/sg_lexer.l +++ /dev/null @@ -1,294 +0,0 @@ -%{ -#include <string> -#include <iostream> -#include <sstream> -#include <cstring> -#include <cassert> -#include "tdict.h" -#include "fdict.h" -#include "striped_grammar.h" - -int lex_line = 0; -int read_contexts = 0; -std::istream* sglex_stream = NULL; -StripedGrammarLexer::GrammarCallback grammar_callback = NULL; -StripedGrammarLexer::ContextCallback context_callback = NULL; -void* grammar_callback_extra = NULL; -void* context_callback_extra = NULL; - -#undef YY_INPUT -#define YY_INPUT(buf, result, max_size) (result = sglex_stream->read(buf, max_size).gcount()) - -#define YY_SKIP_YYWRAP 1 -int num_rules = 0; -int yywrap() { return 1; } -bool fl = true; -#define MAX_TOKEN_SIZE 255 -std::string sglex_tmp_token(MAX_TOKEN_SIZE, '\0'); - -#define MAX_RULE_SIZE 48 -WordID sglex_src_rhs[MAX_RULE_SIZE]; -WordID sglex_trg_rhs[MAX_RULE_SIZE]; -int sglex_src_rhs_size; -int sglex_trg_rhs_size; -WordID sglex_lhs; -int sglex_src_arity; -int sglex_trg_arity; - -#define MAX_FEATS 100 -int sglex_feat_ids[MAX_FEATS]; -double sglex_feat_vals[MAX_FEATS]; -int sglex_num_feats; - -#define MAX_ARITY 20 -int sglex_nt_sanity[MAX_ARITY]; -int sglex_src_nts[MAX_ARITY]; -float sglex_nt_size_means[MAX_ARITY]; -float sglex_nt_size_vars[MAX_ARITY]; - -std::vector<WordID> cur_src_rhs; -std::vector<WordID> cur_trg_rhs; -ID2RuleStatistics cur_options; -RuleStatistics* cur_stats = NULL; -int sglex_cur_fid = 0; - -static void sanity_check_trg_index(int index) { - if (index > sglex_src_arity) { - std::cerr << "Target index " << index << " exceeds source arity " << sglex_src_arity << std::endl; - abort(); - } - int& flag = sglex_nt_sanity[index - 1]; - if (flag) { - std::cerr << "Target index " << index << " used multiple times!" 
<< std::endl; - abort(); - } - flag = 1; -} - -static void sglex_reset() { - sglex_src_arity = 0; - sglex_trg_arity = 0; - sglex_num_feats = 0; - sglex_src_rhs_size = 0; - sglex_trg_rhs_size = 0; -} - -%} - -REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf -NT [^\t \[\],]+ -ALIGN [0-9]+-[0-9]+ - -%x LHS_END SRC TRG FEATS FEATVAL ALIGNS -%% - -<INITIAL>[ ] ; -<INITIAL>[\t] { - if (read_contexts) { - cur_options.clear(); - BEGIN(TRG); - } else { - std::cerr << "Unexpected tab while reading striped grammar\n"; - exit(1); - } - } - -<INITIAL>\[{NT}\] { - if (read_contexts) { - sglex_tmp_token.assign(yytext, yyleng); - sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token); - ++sglex_src_rhs_size; - } else { - sglex_tmp_token.assign(yytext + 1, yyleng - 2); - sglex_lhs = -TD::Convert(sglex_tmp_token); - // std::cerr << sglex_tmp_token << "\n"; - BEGIN(LHS_END); - } - } - -<INITIAL>[^ \t]+ { - if (read_contexts) { - // std::cerr << "Context: " << yytext << std::endl; - sglex_tmp_token.assign(yytext, yyleng); - sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token); - ++sglex_src_rhs_size; - } else { - std::cerr << "Unexpected input: " << yytext << " when NT expected\n"; - exit(1); - } - } - -<SRC>\[{NT}\] { - sglex_tmp_token.assign(yytext + 1, yyleng - 2); - sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token); - ++sglex_src_arity; - ++sglex_src_rhs_size; - } - -<LHS_END>[ ] { ; } -<LHS_END>\|\|\| { - sglex_reset(); - BEGIN(SRC); - } - -<LHS_END>. { - std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl; - exit(1); - } - - -<SRC>\[{NT},[1-9][0-9]?\] { - int index = yytext[yyleng - 2] - '0'; - if (yytext[yyleng - 3] == ',') { - sglex_tmp_token.assign(yytext + 1, yyleng - 4); - } else { - sglex_tmp_token.assign(yytext + 1, yyleng - 5); - index += 10 * (yytext[yyleng - 3] - '0'); - } - if ((sglex_src_arity+1) != index) { - std::cerr << "Src indices must go in order: expected " << sglex_src_arity << " but got " << index << std::endl; - abort(); - } - sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token); - ++sglex_src_rhs_size; - ++sglex_src_arity; - } - -<SRC>[^ \t]+ { - sglex_tmp_token.assign(yytext, yyleng); - sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token); - ++sglex_src_rhs_size; - } -<SRC>[ ] { ; } -<SRC>\t { - //std::cerr << "LHS=" << TD::Convert(-sglex_lhs) << " "; - //std::cerr << " src_size: " << sglex_src_rhs_size << std::endl; - //std::cerr << " src_arity: " << sglex_src_arity << std::endl; - cur_options.clear(); - memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int)); - sglex_trg_rhs_size = 0; - BEGIN(TRG); - } - -<TRG>\[[1-9][0-9]?\] { - if (read_contexts) { - sglex_tmp_token.assign(yytext, yyleng); - sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token); - ++sglex_trg_rhs_size; - } else { - int index = yytext[yyleng - 2] - '0'; - if (yyleng == 4) { - index += 10 * (yytext[yyleng - 3] - '0'); - } - ++sglex_trg_arity; - sanity_check_trg_index(index); - sglex_trg_rhs[sglex_trg_rhs_size] = 1 - index; - ++sglex_trg_rhs_size; - } -} - -<TRG>\|\|\| { - //std::cerr << " trg_size: " << sglex_trg_rhs_size << std::endl; - //std::cerr << " trg_arity: " << sglex_trg_arity << std::endl; - assert(sglex_trg_rhs_size > 0); - cur_trg_rhs.resize(sglex_trg_rhs_size); - for (int i = 0; i < sglex_trg_rhs_size; ++i) - cur_trg_rhs[i] = sglex_trg_rhs[i]; - cur_stats = &cur_options[cur_trg_rhs]; - BEGIN(FEATS); - } - 
-<TRG>[^ ]+ { - sglex_tmp_token.assign(yytext, yyleng); - sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token); - - ++sglex_trg_rhs_size; - } -<TRG>[ ]+ { ; } - -<FEATS>\n { - assert(sglex_src_rhs_size > 0); - cur_src_rhs.resize(sglex_src_rhs_size); - for (int i = 0; i < sglex_src_rhs_size; ++i) - cur_src_rhs[i] = sglex_src_rhs[i]; - if (read_contexts) { - context_callback(cur_src_rhs, cur_options, context_callback_extra); - } else { - assert(sglex_lhs < 0); - grammar_callback(sglex_lhs, cur_src_rhs, cur_options, grammar_callback_extra); - } - cur_options.clear(); - sglex_reset(); - BEGIN(INITIAL); - } -<FEATS>[ ]+ { ; } -<FEATS>\|\|\| { - memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int)); - sglex_trg_rhs_size = 0; - BEGIN(TRG); - } -<FEATS>[A-Z][A-Z_0-9]*= { - // std::cerr << "FV: " << yytext << std::endl; - sglex_tmp_token.assign(yytext, yyleng - 1); - sglex_cur_fid = FD::Convert(sglex_tmp_token); - static const int Afid = FD::Convert("A"); - if (sglex_cur_fid == Afid) { - BEGIN(ALIGNS); - } else { - BEGIN(FEATVAL); - } - } -<FEATVAL>{REAL} { - // std::cerr << "Feature val input: " << yytext << std::endl; - cur_stats->counts.add_value(sglex_cur_fid, strtod(yytext, NULL)); - BEGIN(FEATS); - } -<FEATVAL>. { - std::cerr << "Feature val unexpected input: " << yytext << std::endl; - exit(1); - } -<FEATS>. { - std::cerr << "Features unexpected input: " << yytext << std::endl; - exit(1); - } -<ALIGNS>{ALIGN}(,{ALIGN})* { - assert(cur_stats->aligns.empty()); - int i = 0; - while(i < yyleng) { - short a = 0; - short b = 0; - while (yytext[i] != '-') { a *= 10; a += yytext[i] - '0'; ++i; } - ++i; - while (yytext[i] != ',' && i < yyleng) { b *= 10; b += yytext[i] - '0'; ++i; } - ++i; - cur_stats->aligns.push_back(std::make_pair(a,b)); - } - BEGIN(FEATS); - } -<ALIGNS>. 
{ - std::cerr << "Aligns unexpected input: " << yytext << std::endl; - exit(1); - } -%% - -#include "filelib.h" - -void StripedGrammarLexer::ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra) { - read_contexts = 0; - lex_line = 1; - sglex_stream = in; - grammar_callback_extra = extra; - grammar_callback = func; - yylex(); -} - -void StripedGrammarLexer::ReadContexts(std::istream* in, ContextCallback func, void* extra) { - read_contexts = 1; - lex_line = 1; - sglex_stream = in; - context_callback_extra = extra; - context_callback = func; - yylex(); -} - - diff --git a/extools/simple-extract-context.sh b/extools/simple-extract-context.sh deleted file mode 100755 index 17487b1c..00000000 --- a/extools/simple-extract-context.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -MYDIR=$(dirname $0) - -export LANG=C -date 1>&2 -$MYDIR/extractor -i $1 -c 500000 -L 12 -C | sort -t $'\t' -k 1 | $MYDIR/mr_stripe_rule_reduce -date 1>&2 - diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh deleted file mode 100755 index ec5c5276..00000000 --- a/extools/simple-extract.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -export LANG=C -date -./extractor -i $1 -d X -c 500000 -L 12 -b | sort -t $'\t' -k 1 | gzip > ex.output.gz -date -# -p = compute phrase marginals -# -b = bidirectional rules (starting with F or E) were extracted -zcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz -date - diff --git a/extools/striped_grammar.cc b/extools/striped_grammar.cc deleted file mode 100644 index 785f4bbe..00000000 --- a/extools/striped_grammar.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include "striped_grammar.h" - -#include <iostream> - -#include "sentence_pair.h" - -using namespace std; - -namespace { - inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } - - inline void SkipWhitespace(const char* buf, int* ptr) { - while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } - } -} - -void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) { - int ptr = start; - counts.clear(); - aligns.clear(); - while (ptr < end) { - SkipWhitespace(buf, &ptr); - int vstart = ptr; - while(ptr < end && buf[ptr] != '=') ++ptr; - assert(buf[ptr] == '='); - assert(ptr > vstart); - if (buf[vstart] == 'A' && buf[vstart+1] == '=') { - ++ptr; - while (ptr < end && !IsWhitespace(buf[ptr])) { - while(ptr < end && buf[ptr] == ',') { ++ptr; } - assert(ptr < end); - vstart = ptr; - while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; } - if (ptr > vstart) { - short a, b; - AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b, 0, 0); - aligns.push_back(make_pair(a,b)); - } - } - } else { - int name = FD::Convert(string(buf,vstart,ptr-vstart)); - ++ptr; - vstart = ptr; - while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; } - assert(ptr > vstart); - counts.set_value(name, strtod(buf + vstart, NULL)); - } - } -} - -ostream& operator<<(ostream& os, const RuleStatistics& s) { - bool needspace = false; - for (SparseVector<float>::const_iterator it = s.counts.begin(); it != s.counts.end(); ++it) { - if (needspace) os << ' '; else needspace = true; - os << FD::Convert(it->first) << '=' << it->second; - } - if (s.aligns.size() > 0) { - os << " A="; - needspace = false; - for (int i = 0; i < s.aligns.size(); ++i) { - if (needspace) os << ','; else needspace = true; - os << s.aligns[i].first << '-' << s.aligns[i].second; - } - } - return os; -} - diff --git 
a/extools/striped_grammar.h b/extools/striped_grammar.h deleted file mode 100644 index bf3aec7d..00000000 --- a/extools/striped_grammar.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef _STRIPED_GRAMMAR_H_ -#define _STRIPED_GRAMMAR_H_ - -#include <iostream> -#include <boost/functional/hash.hpp> -#include <vector> -#include <tr1/unordered_map> -#include "sparse_vector.h" -#include "wordid.h" -#include "tdict.h" - -// represents statistics / information about a rule pair -struct RuleStatistics { - SparseVector<float> counts; - std::vector<std::pair<short,short> > aligns; - RuleStatistics() {} - RuleStatistics(int name, float val, const std::vector<std::pair<short,short> >& al) : - aligns(al) { - counts.set_value(name, val); - } - void ParseRuleStatistics(const char* buf, int start, int end); - RuleStatistics& operator+=(const RuleStatistics& rhs) { - counts += rhs.counts; - return *this; - } -}; -std::ostream& operator<<(std::ostream& os, const RuleStatistics& s); - -inline void WriteNamed(const std::vector<WordID>& v, std::ostream* os) { - bool first = true; - for (int i = 0; i < v.size(); ++i) { - if (first) { first = false; } else { (*os) << ' '; } - if (v[i] < 0) { (*os) << '[' << TD::Convert(-v[i]) << ']'; } - else (*os) << TD::Convert(v[i]); - } -} - -inline void WriteAnonymous(const std::vector<WordID>& v, std::ostream* os) { - bool first = true; - for (int i = 0; i < v.size(); ++i) { - if (first) { first = false; } else { (*os) << ' '; } - if (v[i] <= 0) { (*os) << '[' << (1-v[i]) << ']'; } - else (*os) << TD::Convert(v[i]); - } -} - -typedef std::tr1::unordered_map<std::vector<WordID>, RuleStatistics, boost::hash<std::vector<WordID> > > ID2RuleStatistics; - -struct StripedGrammarLexer { - typedef void (*GrammarCallback)(WordID lhs, const std::vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void *extra); - static void ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra); - typedef void (*ContextCallback)(const std::vector<WordID>& phrase, const ID2RuleStatistics& rules, void *extra); - static void ReadContexts(std::istream* in, ContextCallback func, void* extra); -}; - -#endif diff --git a/extools/suffix_tree.h b/extools/suffix_tree.h deleted file mode 100644 index f62f53f4..00000000 --- a/extools/suffix_tree.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * suffix_tree.h - * - * Created on: May 17, 2010 - * Author: Vlad - -NOTE (graehl): this seems to be a (forward) trie of the suffixes (of sentences). -so O(m*n^2) for m sentences of length n. 
- -For a real suffix tree (linear size/time), see: -http://en.wikipedia.org/wiki/Suffix_tree -http://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf - - */ - -#ifndef SUFFIX_TREE_H_ -#define SUFFIX_TREE_H_ - -#include <string> -#include <map> -#include <vector> - -template <class T> -class Node { - public: - std::map<T, Node> edge_list_; - int InsertPath(const std::vector<T>& p, int start, int end); - const Node* Extend(const T& e) const { - typename std::map<T, Node>::const_iterator it = edge_list_.find(e); - if (it == edge_list_.end()) return NULL; - return &it->second; - } -}; - -bool DEBUG = false; - -template <class T> -int Node<T>::InsertPath(const std::vector<T>& p, int start, int end){ - Node* currNode = this; - for(int i=start;i<= end; i++ ) { - currNode = &(currNode->edge_list_)[p[i]]; - } - return 1; -} - -#endif /* SUFFIX_TRIE_H_ */ diff --git a/extools/test_data/README b/extools/test_data/README deleted file mode 100644 index e368cffc..00000000 --- a/extools/test_data/README +++ /dev/null @@ -1,10 +0,0 @@ -The following was used to create the test data. The real inputs -were corpus.fr, corpus.en, and corpus.aligned. The generated files -were corpus.len_cats and fr-en.al.len. - - - ./make_len_cats.pl corpus.en > corpus.len_cats - - ../merge_lines.pl corpus.fr corpus.en corpus.aligned corpus.len_cats > fr-en.al.len - - diff --git a/extools/test_data/corpus.aligned b/extools/test_data/corpus.aligned deleted file mode 100644 index aa09e9ab..00000000 --- a/extools/test_data/corpus.aligned +++ /dev/null @@ -1,5 +0,0 @@ -0-0 1-2 2-1 -0-0 1-1 -0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 -0-0 -0-0 1-1 diff --git a/extools/test_data/corpus.en b/extools/test_data/corpus.en deleted file mode 100644 index 2d4751bf..00000000 --- a/extools/test_data/corpus.en +++ /dev/null @@ -1,5 +0,0 @@ -the blue house -the hat -there is a hat -cap -the cat diff --git a/extools/test_data/corpus.fr b/extools/test_data/corpus.fr deleted file mode 100644 index 75b5e127..00000000 --- a/extools/test_data/corpus.fr +++ /dev/null @@ -1,5 +0,0 @@ -la maison bleue -le chapeau -il y a un chapeau -chapeau -le chat diff --git a/extools/test_data/corpus.len_cats b/extools/test_data/corpus.len_cats deleted file mode 100644 index 18d321de..00000000 --- a/extools/test_data/corpus.len_cats +++ /dev/null @@ -1,5 +0,0 @@ -0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT -0-1:SHORT 0-2:SHORT 1-2:SHORT -0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT -0-1:SHORT -0-1:SHORT 0-2:SHORT 1-2:SHORT diff --git a/extools/test_data/fr-en.al.len b/extools/test_data/fr-en.al.len deleted file mode 100644 index 7ee6b85d..00000000 --- a/extools/test_data/fr-en.al.len +++ /dev/null @@ -1,5 +0,0 @@ -la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT -le chapeau ||| the hat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT -il y a un chapeau ||| there is a hat ||| 0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT -chapeau ||| cap ||| 0-0 ||| 0-1:SHORT -le chat ||| the cat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT diff --git a/extools/test_data/make_len_cats.pl b/extools/test_data/make_len_cats.pl deleted file mode 100755 index 25ef75fa..00000000 --- a/extools/test_data/make_len_cats.pl +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $max_len = 15; -my @cat_names = qw( NULL SHORT SHORT MID MID MID LONG LONG LONG LONG LONG VLONG VLONG VLONG 
VLONG VLONG ); - -while(<>) { - chomp; - my @words = split /\s+/; - my $len = scalar @words; - my @spans; - for (my $i =0; $i < $len; $i++) { - for (my $k = 1; $k <= $max_len; $k++) { - my $j = $i + $k; - next if ($j > $len); - my $cat = $cat_names[$k]; - die unless $cat; - push @spans, "$i-$j:$cat"; - } - } - print "@spans\n"; -} -
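The tools removed in this commit all exchange grammars in the striped format: one source side per line, a tab, then alternating target sides and statistics fields separated by ||| markers, where each statistics field is a list of NAME=value pairs plus an optional A= alignment list (see RuleStatistics::ParseRuleStatistics in the deleted striped_grammar.cc). Below is a minimal stand-alone parser for one such statistics field, assuming only the standard library; Stats and ParseStats are hypothetical names, and the real code interns feature names into a SparseVector via FD::Convert rather than keeping strings.

    // Sketch of a parser for one "FEAT=val ... A=i-j,k-l" statistics field
    // of the striped grammar format used by the deleted extools programs.
    #include <cstdlib>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    struct Stats {
      std::map<std::string, double> counts;     // e.g. {"CFE": 2}
      std::vector<std::pair<int,int> > aligns;  // e.g. {(0,0),(1,1)}
    };

    Stats ParseStats(const std::string& field) {
      Stats s;
      std::istringstream in(field);
      std::string tok;
      while (in >> tok) {                        // tokens look like NAME=...
        std::string::size_type eq = tok.find('=');
        if (eq == std::string::npos) continue;   // skip malformed tokens
        const std::string name = tok.substr(0, eq);
        const std::string val = tok.substr(eq + 1);
        if (name == "A") {                       // comma-separated i-j pairs
          std::istringstream pts(val);
          std::string pt;
          while (std::getline(pts, pt, ',')) {
            std::string::size_type dash = pt.find('-');
            s.aligns.push_back(std::make_pair(
                atoi(pt.substr(0, dash).c_str()),
                atoi(pt.substr(dash + 1).c_str())));
          }
        } else {                                 // a named count, e.g. CFE=2
          s.counts[name] = strtod(val.c_str(), NULL);
        }
      }
      return s;
    }

    int main() {
      Stats s = ParseStats("CFE=2 A=0-0,1-1");
      std::cout << s.counts["CFE"] << ' ' << s.aligns.size() << std::endl;  // prints: 2 2
      return 0;
    }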
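The deleted filter_grammar.cc keeps a rule only if every contiguous run of terminals in its source side occurs somewhere in the test corpus, checked against the forward trie of sentence suffixes described in graehl's note in suffix_tree.h. The following is a condensed sketch of that structure and the matching walk, assuming the original convention that terminal WordIDs are positive and nonterminals are <= 0.

    // Trie over all suffixes of the test sentences; a rule survives if each
    // terminal run of its source side can be matched starting at some suffix.
    #include <cstddef>
    #include <map>
    #include <vector>

    typedef int WordID;

    struct TrieNode {
      std::map<WordID, TrieNode> edges;
      const TrieNode* Extend(WordID w) const {
        std::map<WordID, TrieNode>::const_iterator it = edges.find(w);
        return it == edges.end() ? NULL : &it->second;
      }
    };

    struct SuffixFilter {
      TrieNode root;
      // analogue of the original's root.InsertPath(sent.f, i, sent.f_len - 1) loop
      void AddSentence(const std::vector<WordID>& f) {
        for (size_t i = 0; i < f.size(); ++i) {
          TrieNode* cur = &root;
          for (size_t j = i; j < f.size(); ++j) cur = &cur->edges[f[j]];
        }
      }
      bool Matches(const std::vector<WordID>& src) const {
        const TrieNode* cur = &root;
        for (size_t i = 0; i < src.size(); ++i) {
          if (src[i] <= 0) cur = &root;          // nonterminal gap: restart
          else {
            cur = cur->Extend(src[i]);
            if (!cur) return false;              // terminal run unseen in test set
          }
        }
        return true;
      }
    };

As in the original, the trie stores every suffix explicitly, so it is quadratic in sentence length; a linear-size suffix tree (see the links in the deleted header) would answer the same membership queries.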
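The scoring arithmetic shared by the deleted score_grammar.cc and the XFeatures/LabelledRuleConditionals extractors is plain relative frequency over three counts: the joint count CFE attached to each rule, and the marginals CF and CE that mr_stripe_rule_reduce's DoPhraseMarginals writes back by summing CFE over a stripe. score_grammar computes pEF = CFE / CF and pFE = CFE / CE and emits them through safenlog, while the extractors produce the same quantities directly in log space (e.g. LabelledFE = log(target count) - log(CFE)). As a worked example with made-up counts: if a rule pair was extracted CFE = 2 times, its source side occurs CF = 4 times, and its target side CE = 2 times, the scored grammar line gets EGivenF = -log(2/4) = 0.693 and FGivenE = -log(2/2) = 0.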
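Both the deleted score_grammar.cc loop and the LexProbExtractor feature implement the same lexical weighting recipe: each source word contributes the average of its word-translation probabilities over the target words it aligns to, and a word with no alignment points is scored against the NULL token instead. Here is a condensed sketch of the e2f direction under the same count tables; WordTable and LexWeightE2F are hypothetical stand-ins for LexTranslationTable and the inline loops of the originals.

    // Condensed sketch of the lex(F|E)-style weight: average c(f,e)/c(e)
    // over each source word's alignment points, falling back to the NULL
    // column for unaligned words, as in the deleted score_grammar.cc.
    #include <iostream>
    #include <map>
    #include <utility>
    #include <vector>

    typedef int WordID;
    static const WordID kNULL = 0;  // stand-in for TD::Convert("NULL")

    struct WordTable {               // mirrors LexTranslationTable's maps
      std::map<std::pair<WordID,WordID>, int> word_translation;  // c(f,e)
      std::map<WordID,int> total_foreign;   // sum over e of c(f,e)
      std::map<WordID,int> total_english;   // sum over f of c(f,e)
    };

    // aligned_to[i] lists the target words aligned to source word src[i].
    double LexWeightE2F(const std::vector<WordID>& src,
                        const std::vector<std::vector<WordID> >& aligned_to,
                        WordTable& t) {
      double w = 1.0;
      for (size_t i = 0; i < src.size(); ++i) {
        if (t.total_foreign.find(src[i]) == t.total_foreign.end())
          continue;  // unknown word: no lexical evidence, as in the original
        if (!aligned_to[i].empty()) {
          double sum = 0;
          for (size_t j = 0; j < aligned_to[i].size(); ++j) {
            WordID e = aligned_to[i][j];
            int joint = t.word_translation[std::make_pair(src[i], e)];
            if (t.total_english[e] != 0)
              sum += static_cast<double>(joint) / t.total_english[e];
          }
          w *= sum / aligned_to[i].size();  // average over alignment points
        } else {
          // unaligned source word: score against NULL (no zero guard,
          // matching the original code's behavior)
          int joint = t.word_translation[std::make_pair(src[i], kNULL)];
          w *= static_cast<double>(joint) / t.total_english[kNULL];
        }
      }
      return w;  // the originals store safenlog(w) as the LexE2F feature
    }

    int main() {
      WordTable t;
      t.word_translation[std::make_pair(1, 7)] = 2;  // c(f=1, e=7) = 2
      t.total_foreign[1] = 2;
      t.total_english[7] = 4;
      std::vector<WordID> src(1, 1);
      std::vector<std::vector<WordID> > al(1, std::vector<WordID>(1, 7));
      std::cout << LexWeightE2F(src, al, t) << std::endl;  // 2/4 = 0.5
      return 0;
    }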