summaryrefslogtreecommitdiff
path: root/extools/featurize_grammar.cc
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
commit6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /extools/featurize_grammar.cc
parentb510da2e562c695c90d565eb295c749569c59be8 (diff)
parentc615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
merge upstream/master
Diffstat (limited to 'extools/featurize_grammar.cc')
-rw-r--r--extools/featurize_grammar.cc716
1 files changed, 0 insertions, 716 deletions
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc
deleted file mode 100644
index 78175202..00000000
--- a/extools/featurize_grammar.cc
+++ /dev/null
@@ -1,716 +0,0 @@
-/*
- * Featurize a grammar in striped format
- */
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <map>
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <tr1/unordered_map>
-
-#include "lex_trans_tbl.h"
-#include "sparse_vector.h"
-#include "sentence_pair.h"
-#include "extract.h"
-#include "fdict.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "striped_grammar.h"
-
-#include <boost/tuple/tuple.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-using namespace std;
-using namespace std::tr1;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-static string aligned_corpus;
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-// Data structures for indexing and counting rules
-//typedef boost::tuple< WordID, vector<WordID>, vector<WordID> > RuleTuple;
-struct RuleTuple {
- RuleTuple(const WordID& lhs, const vector<WordID>& s, const vector<WordID>& t)
- : m_lhs(lhs), m_source(s), m_target(t) {
- hash_value();
- m_dirty = false;
- }
-
- size_t hash_value() const {
-// if (m_dirty) {
- size_t hash = 0;
- boost::hash_combine(hash, m_lhs);
- boost::hash_combine(hash, m_source);
- boost::hash_combine(hash, m_target);
-// }
-// m_dirty = false;
- return hash;
- }
-
- bool operator==(RuleTuple const& b) const
- { return m_lhs == b.m_lhs && m_source == b.m_source && m_target == b.m_target; }
-
- WordID& lhs() { m_dirty=true; return m_lhs; }
- vector<WordID>& source() { m_dirty=true; return m_source; }
- vector<WordID>& target() { m_dirty=true; return m_target; }
- const WordID& lhs() const { return m_lhs; }
- const vector<WordID>& source() const { return m_source; }
- const vector<WordID>& target() const { return m_target; }
-
-// mutable size_t m_hash;
-private:
- WordID m_lhs;
- vector<WordID> m_source, m_target;
- mutable bool m_dirty;
-};
-std::size_t hash_value(RuleTuple const& b) { return b.hash_value(); }
-bool operator<(RuleTuple const& l, RuleTuple const& r) {
- if (l.lhs() < r.lhs()) return true;
- else if (l.lhs() == r.lhs()) {
- if (l.source() < r.source()) return true;
- else if (l.source() == r.source()) {
- if (l.target() < r.target()) return true;
- }
- }
- return false;
-}
-
-ostream& operator<<(ostream& o, RuleTuple const& r) {
- o << "(" << r.lhs() << "-->" << "<";
- for (vector<WordID>::const_iterator it=r.source().begin(); it!=r.source().end(); ++it)
- o << TD::Convert(*it) << " ";
- o << "|||";
- for (vector<WordID>::const_iterator it=r.target().begin(); it!=r.target().end(); ++it)
- o << " " << TD::Convert(*it);
- o << ">)";
- return o;
-}
-
-template <typename Key>
-struct FreqCount {
- //typedef unordered_map<Key, int, boost::hash<Key> > Counts;
- typedef map<Key, int> Counts;
- Counts counts;
-
- int inc(const Key& r, int c=1) {
- pair<typename Counts::iterator,bool> itb
- = counts.insert(make_pair(r,c));
- if (!itb.second)
- itb.first->second += c;
- return itb.first->second;
- }
-
- int inc_if_exists(const Key& r, int c=1) {
- typename Counts::iterator it = counts.find(r);
- if (it != counts.end())
- it->second += c;
- return it->second;
- }
-
- int count(const Key& r) const {
- typename Counts::const_iterator it = counts.find(r);
- if (it == counts.end()) return 0;
- return it->second;
- }
-
- int operator()(const Key& r) const { return count(r); }
-};
-typedef FreqCount<RuleTuple> RuleFreqCount;
-
-class FeatureExtractor;
-class FERegistry;
-struct FEFactoryBase {
- virtual ~FEFactoryBase() {}
- virtual boost::shared_ptr<FeatureExtractor> Create() const = 0;
-};
-
-
-class FERegistry {
- friend class FEFactoryBase;
- public:
- FERegistry() {}
- boost::shared_ptr<FeatureExtractor> Create(const std::string& ffname) const {
- map<string, boost::shared_ptr<FEFactoryBase> >::const_iterator it = reg_.find(ffname);
- boost::shared_ptr<FeatureExtractor> res;
- if (it == reg_.end()) {
- cerr << "I don't know how to create feature " << ffname << endl;
- } else {
- res = it->second->Create();
- }
- return res;
- }
- void DisplayList(ostream* out) const {
- bool first = true;
- for (map<string, boost::shared_ptr<FEFactoryBase> >::const_iterator it = reg_.begin();
- it != reg_.end(); ++it) {
- if (first) {first=false;} else {*out << ' ';}
- *out << it->first;
- }
- }
-
- void Register(const std::string& ffname, FEFactoryBase* factory) {
- if (reg_.find(ffname) != reg_.end()) {
- cerr << "Duplicate registration of FeatureExtractor with name " << ffname << "!\n";
- exit(1);
- }
- reg_[ffname].reset(factory);
- }
-
- private:
- std::map<std::string, boost::shared_ptr<FEFactoryBase> > reg_;
-};
-
-template<class FE>
-class FEFactory : public FEFactoryBase {
- boost::shared_ptr<FeatureExtractor> Create() const {
- return boost::shared_ptr<FeatureExtractor>(new FE);
- }
-};
-
-void InitCommandLine(const FERegistry& r, int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- ostringstream feats;
- feats << "[multiple] Features to extract (";
- r.DisplayList(&feats);
- feats << ")";
- opts.add_options()
- ("filtered_grammar,g", po::value<string>(), "Grammar to add features to")
- ("list_features,L", "List extractable features")
- ("feature,f", po::value<vector<string> >()->composing(), feats.str().c_str())
- ("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("aligned_corpus")==0 || conf->count("feature") == 0) {
- cerr << "\nUsage: featurize_grammar -g FILTERED-GRAMMAR.gz -c ALIGNED_CORPUS.fr-en-al -f Feat1 -f Feat2 ... < UNFILTERED-GRAMMAR\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-static const bool DEBUG = false;
-
-void LexTranslationTable::createTTable(const char* buf){
- AnnotatedParallelSentence sent;
- sent.ParseInputLine(buf);
-
- //iterate over the alignment to compute aligned words
-
- for(int i =0;i<sent.aligned.width();i++)
- {
- for (int j=0;j<sent.aligned.height();j++)
- {
- if (DEBUG) cerr << sent.aligned(i,j) << " ";
- if( sent.aligned(i,j))
- {
- if (DEBUG) cerr << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
- ++word_translation[pair<WordID,WordID> (sent.f[i], sent.e[j])];
- ++total_foreign[sent.f[i]];
- ++total_english[sent.e[j]];
- }
- }
- if (DEBUG) cerr << endl;
- }
- if (DEBUG) cerr << endl;
-
- const WordID NULL_ = TD::Convert("NULL");
- //handle unaligned words - align them to null
- for (int j =0; j < sent.e_len; j++) {
- if (sent.e_aligned[j]) continue;
- ++word_translation[pair<WordID,WordID> (NULL_, sent.e[j])];
- ++total_foreign[NULL_];
- ++total_english[sent.e[j]];
- }
-
- for (int i =0; i < sent.f_len; i++) {
- if (sent.f_aligned[i]) continue;
- ++word_translation[pair<WordID,WordID> (sent.f[i], NULL_)];
- ++total_english[NULL_];
- ++total_foreign[sent.f[i]];
- }
-}
-
-inline float safenlog(float v) {
- if (v == 1.0f) return 0.0f;
- float res = -log(v);
- if (res > 100.0f) res = 100.0f;
- return res;
-}
-
-static bool IsZero(float f) { return (f > 0.999 && f < 1.001); }
-
-struct FeatureExtractor {
- // create any keys necessary
- virtual void ObserveFilteredRule(const WordID /* lhs */,
- const vector<WordID>& /* src */,
- const vector<WordID>& /* trg */) {}
-
- // compute statistics over keys, the same lhs-src-trg tuple may be seen
- // more than once
- virtual void ObserveUnfilteredRule(const WordID /* lhs */,
- const vector<WordID>& /* src */,
- const vector<WordID>& /* trg */,
- const RuleStatistics& /* info */) {}
-
- // compute features, a unique lhs-src-trg tuple will be seen exactly once
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const = 0;
-
- virtual ~FeatureExtractor() {}
-};
-
-struct LogRuleCount : public FeatureExtractor {
- LogRuleCount() :
- fid_(FD::Convert("LogRuleCount")),
- sfid_(FD::Convert("SingletonRule")),
- kCFE(FD::Convert("CFE")) {}
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- (void) lhs; (void) src; (void) trg;
- //result->set_value(fid_, log(info.counts.get(kCFE)));
- result->set_value(fid_, log(info.counts.get(kCFE)));
- if (IsZero(info.counts.get(kCFE)))
- result->set_value(sfid_, 1);
- }
- const int fid_;
- const int sfid_;
- const int kCFE;
-};
-
-struct RulePenalty : public FeatureExtractor {
- RulePenalty() : fid_(FD::Convert("RulePenalty")) {}
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const
- { result->set_value(fid_, 1); }
-
- const int fid_;
-};
-
-// The negative log of the condition rule probs
-// ignoring the identities of the non-terminals.
-// i.e. the prob Hiero would assign.
-// Also extracts Labelled features.
-struct XFeatures: public FeatureExtractor {
- XFeatures() :
- fid_xfe(FD::Convert("XFE")),
- fid_xef(FD::Convert("XEF")),
- fid_labelledfe(FD::Convert("LabelledFE")),
- fid_labelledef(FD::Convert("LabelledEF")),
- fid_xesingleton(FD::Convert("XE_Singleton")),
- fid_xfsingleton(FD::Convert("XF_Singleton")),
- kCFE(FD::Convert("CFE")) {}
- virtual void ObserveFilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- rule_counts.inc(r, 0);
- source_counts.inc(r.source(), 0);
- target_counts.inc(r.target(), 0);
- }
-
- virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- const int count = info.counts.get(kCFE);
- assert(count > 0);
- rule_counts.inc_if_exists(r, count);
- source_counts.inc_if_exists(r.source(), count);
- target_counts.inc_if_exists(r.target(), count);
- }
-
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- double l_r_freq = log(rule_counts(r));
-
- const int t_c = target_counts(r.target());
- assert(t_c > 0);
- result->set_value(fid_xfe, log(t_c) - l_r_freq);
- result->set_value(fid_labelledfe, log(t_c) - log(info.counts.get(kCFE)));
-// if (t_c == 1)
-// result->set_value(fid_xesingleton, 1.0);
-
- const int s_c = source_counts(r.source());
- assert(s_c > 0);
- result->set_value(fid_xef, log(s_c) - l_r_freq);
- result->set_value(fid_labelledef, log(s_c) - log(info.counts.get(kCFE)));
-// if (s_c == 1)
-// result->set_value(fid_xfsingleton, 1.0);
- }
-
- void map_rule(RuleTuple& r) const {
- vector<WordID> indexes; int i=0;
- for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it) {
- if (*it <= 0)
- indexes.push_back(*it);
- }
- for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it) {
- if (*it <= 0)
- *it = indexes.at(i++);
- }
- }
-
- const int fid_xfe, fid_xef;
- const int fid_labelledfe, fid_labelledef;
- const int fid_xesingleton, fid_xfsingleton;
- const int kCFE;
- RuleFreqCount rule_counts;
- FreqCount< vector<WordID> > source_counts, target_counts;
-};
-
-
-struct LabelledRuleConditionals: public FeatureExtractor {
- LabelledRuleConditionals() :
- fid_fe(FD::Convert("LabelledFE")),
- fid_ef(FD::Convert("LabelledEF")),
- kCFE(FD::Convert("CFE")) {}
- virtual void ObserveFilteredRule(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg) {
- RuleTuple r(lhs, src, trg);
- rule_counts.inc(r, 0);
- source_counts.inc(r.source(), 0);
-
- target_counts.inc(r.target(), 0);
- }
-
- virtual void ObserveUnfilteredRule(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info) {
- RuleTuple r(lhs, src, trg);
- rule_counts.inc_if_exists(r, info.counts.get(kCFE));
- source_counts.inc_if_exists(r.source(), info.counts.get(kCFE));
-
- target_counts.inc_if_exists(r.target(), info.counts.get(kCFE));
- }
-
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const {
- RuleTuple r(lhs, src, trg);
- double l_r_freq = log(rule_counts(r));
- result->set_value(fid_fe, log(target_counts(r.target())) - l_r_freq);
- result->set_value(fid_ef, log(source_counts(r.source())) - l_r_freq);
- }
-
- const int fid_fe, fid_ef;
- const int kCFE;
- RuleFreqCount rule_counts;
- FreqCount< vector<WordID> > source_counts, target_counts;
-};
-
-struct LHSProb: public FeatureExtractor {
- LHSProb() : fid_(FD::Convert("LHSProb")), kCFE(FD::Convert("CFE")), total_count(0) {}
-
- virtual void ObserveUnfilteredRule(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& info) {
- int count = info.counts.get(kCFE);
- total_count += count;
- lhs_counts.inc(lhs, count);
- }
-
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const {
- double lhs_log_prob = log(total_count) - log(lhs_counts(lhs));
- result->set_value(fid_, lhs_log_prob);
- }
-
- const int fid_;
- const int kCFE;
- int total_count;
- FreqCount<WordID> lhs_counts;
-};
-
-// Proper rule generative probability: p( s,t | lhs)
-struct GenerativeProb: public FeatureExtractor {
- GenerativeProb() :
- fid_(FD::Convert("GenerativeProb")),
- kCFE(FD::Convert("CFE")) {}
-
- virtual void ObserveUnfilteredRule(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& info)
- { lhs_counts.inc(lhs, info.counts.get(kCFE)); }
-
- virtual void ExtractFeatures(const WordID lhs,
- const vector<WordID>& /*src*/,
- const vector<WordID>& /*trg*/,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- double log_prob = log(lhs_counts(lhs)) - log(info.counts.get(kCFE));
- result->set_value(fid_, log_prob);
- }
-
- const int fid_;
- const int kCFE;
- FreqCount<WordID> lhs_counts;
-};
-
-// remove terminals from the rules before estimating the conditional prob
-struct LabellingShape: public FeatureExtractor {
- LabellingShape() : fid_(FD::Convert("LabellingShape")), kCFE(FD::Convert("CFE")) {}
-
- virtual void ObserveFilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- rule_counts.inc(r, 0);
- source_counts.inc(r.source(), 0);
- }
-
- virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info) {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- rule_counts.inc_if_exists(r, info.counts.get(kCFE));
- source_counts.inc_if_exists(r.source(), info.counts.get(kCFE));
- }
-
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& /*info*/,
- SparseVector<float>* result) const {
- RuleTuple r(-1, src, trg);
- map_rule(r);
- double l_r_freq = log(rule_counts(r));
- result->set_value(fid_, log(source_counts(r.source())) - l_r_freq);
- }
-
- // Replace all terminals with generic -1
- void map_rule(RuleTuple& r) const {
- for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it)
- if (*it <= 0) *it = -1;
- for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it)
- if (*it <= 0) *it = -1;
- }
-
- const int fid_, kCFE;
- RuleFreqCount rule_counts;
- FreqCount< vector<WordID> > source_counts;
-};
-
-
-// this extracts the lexical translation prob features
-// in BOTH directions.
-struct LexProbExtractor : public FeatureExtractor {
- LexProbExtractor() :
- e2f_(FD::Convert("LexE2F")), f2e_(FD::Convert("LexF2E")) {
- ReadFile rf(aligned_corpus);
- //create lexical translation table
- cerr << "Computing lexical translation probabilities from " << aligned_corpus << "..." << endl;
- char* buf = new char[MAX_LINE_LENGTH];
- istream& alignment = *rf.stream();
- while(alignment) {
- alignment.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
- table.createTTable(buf);
- }
- delete[] buf;
- }
-
- virtual void ExtractFeatures(const WordID /*lhs*/,
- const vector<WordID>& src,
- const vector<WordID>& trg,
- const RuleStatistics& info,
- SparseVector<float>* result) const {
- map <WordID, pair<int, float> > foreign_aligned;
- map <WordID, pair<int, float> > english_aligned;
-
- //Loop over all the alignment points to compute lexical translation probability
- const vector< pair<short,short> >& al = info.aligns;
- vector< pair<short,short> >::const_iterator ita;
- for (ita = al.begin(); ita != al.end(); ++ita) {
- if (DEBUG) {
- cerr << "\nA:" << ita->first << "," << ita->second << "::";
- cerr << TD::Convert(src[ita->first]) << "-" << TD::Convert(trg[ita->second]);
- }
-
- //Lookup this alignment probability in the table
- int temp = table.word_translation[pair<WordID,WordID> (src[ita->first],trg[ita->second])];
- float f2e=0, e2f=0;
- if ( table.total_foreign[src[ita->first]] != 0)
- f2e = (float) temp / table.total_foreign[src[ita->first]];
- if ( table.total_english[trg[ita->second]] !=0 )
- e2f = (float) temp / table.total_english[trg[ita->second]];
- if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f);
-
- //local counts to keep track of which things haven't been aligned, to later compute their null alignment
- if (foreign_aligned.count(src[ita->first])) {
- foreign_aligned[ src[ita->first] ].first++;
- foreign_aligned[ src[ita->first] ].second += e2f;
- } else {
- foreign_aligned[ src[ita->first] ] = pair<int,float> (1,e2f);
- }
-
- if (english_aligned.count( trg[ ita->second] )) {
- english_aligned[ trg[ ita->second] ].first++;
- english_aligned[ trg[ ita->second] ].second += f2e;
- } else {
- english_aligned[ trg[ ita->second] ] = pair<int,float> (1,f2e);
- }
- }
-
- float final_lex_f2e=1, final_lex_e2f=1;
- static const WordID NULL_ = TD::Convert("NULL");
-
- //compute lexical weight P(F|E) and include unaligned foreign words
- for(int i=0;i<src.size(); i++) {
- if (!table.total_foreign.count(src[i])) continue; //if we dont have it in the translation table, we won't know its lexical weight
-
- if (foreign_aligned.count(src[i]))
- {
- pair<int, float> temp_lex_prob = foreign_aligned[src[i]];
- final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first;
- }
- else //dealing with null alignment
- {
- int temp_count = table.word_translation[pair<WordID,WordID> (src[i],NULL_)];
- float temp_e2f = (float) temp_count / table.total_english[NULL_];
- final_lex_e2f *= temp_e2f;
- }
-
- }
-
- //compute P(E|F) unaligned english words
- for(int j=0; j< trg.size(); j++) {
- if (!table.total_english.count(trg[j])) continue;
-
- if (english_aligned.count(trg[j]))
- {
- pair<int, float> temp_lex_prob = english_aligned[trg[j]];
- final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first;
- }
- else //dealing with null
- {
- int temp_count = table.word_translation[pair<WordID,WordID> (NULL_,trg[j])];
- float temp_f2e = (float) temp_count / table.total_foreign[NULL_];
- final_lex_f2e *= temp_f2e;
- }
- }
- result->set_value(e2f_, safenlog(final_lex_e2f));
- result->set_value(f2e_, safenlog(final_lex_f2e));
- }
- const int e2f_, f2e_;
- mutable LexTranslationTable table;
-};
-
-struct Featurizer {
- Featurizer(const vector<boost::shared_ptr<FeatureExtractor> >& ex) : extractors(ex) {
- }
- void Callback1(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) {
- for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) {
- for (int i = 0; i < extractors.size(); ++i)
- extractors[i]->ObserveFilteredRule(lhs, src, it->first);
- }
- }
- void Callback2(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) {
- for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) {
- for (int i = 0; i < extractors.size(); ++i)
- extractors[i]->ObserveUnfilteredRule(lhs, src, it->first, it->second);
- }
- }
- void Callback3(WordID lhs, const vector<WordID>& src, const ID2RuleStatistics& trgs) {
- for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) {
- SparseVector<float> feats;
- for (int i = 0; i < extractors.size(); ++i)
- extractors[i]->ExtractFeatures(lhs, src, it->first, it->second, &feats);
- cout << '[' << TD::Convert(-lhs) << "] ||| ";
- WriteNamed(src, &cout);
- cout << " ||| ";
- WriteAnonymous(it->first, &cout);
- cout << " ||| ";
- print(cout,feats,"=");
- cout << endl;
- }
- }
- private:
- vector<boost::shared_ptr<FeatureExtractor> > extractors;
-};
-
-void cb1(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) {
- static_cast<Featurizer*>(extra)->Callback1(lhs, src_rhs, rules);
-}
-
-void cb2(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) {
- static_cast<Featurizer*>(extra)->Callback2(lhs, src_rhs, rules);
-}
-
-void cb3(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void* extra) {
- static_cast<Featurizer*>(extra)->Callback3(lhs, src_rhs, rules);
-}
-
-int main(int argc, char** argv){
- FERegistry reg;
- reg.Register("LogRuleCount", new FEFactory<LogRuleCount>);
- reg.Register("LexProb", new FEFactory<LexProbExtractor>);
- reg.Register("XFeatures", new FEFactory<XFeatures>);
- reg.Register("LabelledRuleConditionals", new FEFactory<LabelledRuleConditionals>);
- reg.Register("RulePenalty", new FEFactory<RulePenalty>);
- reg.Register("LHSProb", new FEFactory<LHSProb>);
- reg.Register("LabellingShape", new FEFactory<LabellingShape>);
- reg.Register("GenerativeProb", new FEFactory<GenerativeProb>);
- po::variables_map conf;
- InitCommandLine(reg, argc, argv, &conf);
- aligned_corpus = conf["aligned_corpus"].as<string>(); // GLOBAL VAR
- ReadFile fg1(conf["filtered_grammar"].as<string>());
-
- vector<string> feats = conf["feature"].as<vector<string> >();
- vector<boost::shared_ptr<FeatureExtractor> > extractors(feats.size());
- for (int i = 0; i < feats.size(); ++i)
- extractors[i] = reg.Create(feats[i]);
- Featurizer fizer(extractors);
-
- cerr << "Reading filtered grammar to detect keys..." << endl;
- StripedGrammarLexer::ReadStripedGrammar(fg1.stream(), cb1, &fizer);
-
- cerr << "Reading unfiltered grammar..." << endl;
- StripedGrammarLexer::ReadStripedGrammar(&cin, cb2, &fizer);
-
- ReadFile fg2(conf["filtered_grammar"].as<string>());
- cerr << "Reading filtered grammar and adding features..." << endl;
- StripedGrammarLexer::ReadStripedGrammar(fg2.stream(), cb3, &fizer);
-
- return 0;
-}
-