From da176941c1f481f14e93bd7d055cc29cac0ea8c8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 12 Aug 2012 23:33:21 -0400 Subject: use new union api --- extools/featurize_grammar.cc | 716 ------------------------------------------- 1 file changed, 716 deletions(-) delete mode 100644 extools/featurize_grammar.cc (limited to 'extools/featurize_grammar.cc') diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc deleted file mode 100644 index 78175202..00000000 --- a/extools/featurize_grammar.cc +++ /dev/null @@ -1,716 +0,0 @@ -/* - * Featurize a grammar in striped format - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lex_trans_tbl.h" -#include "sparse_vector.h" -#include "sentence_pair.h" -#include "extract.h" -#include "fdict.h" -#include "tdict.h" -#include "filelib.h" -#include "striped_grammar.h" - -#include -#include -#include -#include -#include - -using namespace std; -using namespace std::tr1; -using boost::shared_ptr; -namespace po = boost::program_options; - -static string aligned_corpus; -static const size_t MAX_LINE_LENGTH = 64000000; - -// Data structures for indexing and counting rules -//typedef boost::tuple< WordID, vector, vector > RuleTuple; -struct RuleTuple { - RuleTuple(const WordID& lhs, const vector& s, const vector& t) - : m_lhs(lhs), m_source(s), m_target(t) { - hash_value(); - m_dirty = false; - } - - size_t hash_value() const { -// if (m_dirty) { - size_t hash = 0; - boost::hash_combine(hash, m_lhs); - boost::hash_combine(hash, m_source); - boost::hash_combine(hash, m_target); -// } -// m_dirty = false; - return hash; - } - - bool operator==(RuleTuple const& b) const - { return m_lhs == b.m_lhs && m_source == b.m_source && m_target == b.m_target; } - - WordID& lhs() { m_dirty=true; return m_lhs; } - vector& source() { m_dirty=true; return m_source; } - vector& target() { m_dirty=true; return m_target; } - const WordID& lhs() const { return m_lhs; } - const vector& source() const { return m_source; } - const vector& target() const { return m_target; } - -// mutable size_t m_hash; -private: - WordID m_lhs; - vector m_source, m_target; - mutable bool m_dirty; -}; -std::size_t hash_value(RuleTuple const& b) { return b.hash_value(); } -bool operator<(RuleTuple const& l, RuleTuple const& r) { - if (l.lhs() < r.lhs()) return true; - else if (l.lhs() == r.lhs()) { - if (l.source() < r.source()) return true; - else if (l.source() == r.source()) { - if (l.target() < r.target()) return true; - } - } - return false; -} - -ostream& operator<<(ostream& o, RuleTuple const& r) { - o << "(" << r.lhs() << "-->" << "<"; - for (vector::const_iterator it=r.source().begin(); it!=r.source().end(); ++it) - o << TD::Convert(*it) << " "; - o << "|||"; - for (vector::const_iterator it=r.target().begin(); it!=r.target().end(); ++it) - o << " " << TD::Convert(*it); - o << ">)"; - return o; -} - -template -struct FreqCount { - //typedef unordered_map > Counts; - typedef map Counts; - Counts counts; - - int inc(const Key& r, int c=1) { - pair itb - = counts.insert(make_pair(r,c)); - if (!itb.second) - itb.first->second += c; - return itb.first->second; - } - - int inc_if_exists(const Key& r, int c=1) { - typename Counts::iterator it = counts.find(r); - if (it != counts.end()) - it->second += c; - return it->second; - } - - int count(const Key& r) const { - typename Counts::const_iterator it = counts.find(r); - if (it == counts.end()) return 0; - return it->second; - } - - int operator()(const Key& r) const { return count(r); } -}; -typedef FreqCount RuleFreqCount; - -class FeatureExtractor; -class FERegistry; -struct FEFactoryBase { - virtual ~FEFactoryBase() {} - virtual boost::shared_ptr Create() const = 0; -}; - - -class FERegistry { - friend class FEFactoryBase; - public: - FERegistry() {} - boost::shared_ptr Create(const std::string& ffname) const { - map >::const_iterator it = reg_.find(ffname); - boost::shared_ptr res; - if (it == reg_.end()) { - cerr << "I don't know how to create feature " << ffname << endl; - } else { - res = it->second->Create(); - } - return res; - } - void DisplayList(ostream* out) const { - bool first = true; - for (map >::const_iterator it = reg_.begin(); - it != reg_.end(); ++it) { - if (first) {first=false;} else {*out << ' ';} - *out << it->first; - } - } - - void Register(const std::string& ffname, FEFactoryBase* factory) { - if (reg_.find(ffname) != reg_.end()) { - cerr << "Duplicate registration of FeatureExtractor with name " << ffname << "!\n"; - exit(1); - } - reg_[ffname].reset(factory); - } - - private: - std::map > reg_; -}; - -template -class FEFactory : public FEFactoryBase { - boost::shared_ptr Create() const { - return boost::shared_ptr(new FE); - } -}; - -void InitCommandLine(const FERegistry& r, int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - ostringstream feats; - feats << "[multiple] Features to extract ("; - r.DisplayList(&feats); - feats << ")"; - opts.add_options() - ("filtered_grammar,g", po::value(), "Grammar to add features to") - ("list_features,L", "List extractable features") - ("feature,f", po::value >()->composing(), feats.str().c_str()) - ("aligned_corpus,c", po::value(), "Aligned corpus (single line format)") - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help") || conf->count("aligned_corpus")==0 || conf->count("feature") == 0) { - cerr << "\nUsage: featurize_grammar -g FILTERED-GRAMMAR.gz -c ALIGNED_CORPUS.fr-en-al -f Feat1 -f Feat2 ... < UNFILTERED-GRAMMAR\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -static const bool DEBUG = false; - -void LexTranslationTable::createTTable(const char* buf){ - AnnotatedParallelSentence sent; - sent.ParseInputLine(buf); - - //iterate over the alignment to compute aligned words - - for(int i =0;i (sent.f[i], sent.e[j])]; - ++total_foreign[sent.f[i]]; - ++total_english[sent.e[j]]; - } - } - if (DEBUG) cerr << endl; - } - if (DEBUG) cerr << endl; - - const WordID NULL_ = TD::Convert("NULL"); - //handle unaligned words - align them to null - for (int j =0; j < sent.e_len; j++) { - if (sent.e_aligned[j]) continue; - ++word_translation[pair (NULL_, sent.e[j])]; - ++total_foreign[NULL_]; - ++total_english[sent.e[j]]; - } - - for (int i =0; i < sent.f_len; i++) { - if (sent.f_aligned[i]) continue; - ++word_translation[pair (sent.f[i], NULL_)]; - ++total_english[NULL_]; - ++total_foreign[sent.f[i]]; - } -} - -inline float safenlog(float v) { - if (v == 1.0f) return 0.0f; - float res = -log(v); - if (res > 100.0f) res = 100.0f; - return res; -} - -static bool IsZero(float f) { return (f > 0.999 && f < 1.001); } - -struct FeatureExtractor { - // create any keys necessary - virtual void ObserveFilteredRule(const WordID /* lhs */, - const vector& /* src */, - const vector& /* trg */) {} - - // compute statistics over keys, the same lhs-src-trg tuple may be seen - // more than once - virtual void ObserveUnfilteredRule(const WordID /* lhs */, - const vector& /* src */, - const vector& /* trg */, - const RuleStatistics& /* info */) {} - - // compute features, a unique lhs-src-trg tuple will be seen exactly once - virtual void ExtractFeatures(const WordID lhs, - const vector& src, - const vector& trg, - const RuleStatistics& info, - SparseVector* result) const = 0; - - virtual ~FeatureExtractor() {} -}; - -struct LogRuleCount : public FeatureExtractor { - LogRuleCount() : - fid_(FD::Convert("LogRuleCount")), - sfid_(FD::Convert("SingletonRule")), - kCFE(FD::Convert("CFE")) {} - virtual void ExtractFeatures(const WordID lhs, - const vector& src, - const vector& trg, - const RuleStatistics& info, - SparseVector* result) const { - (void) lhs; (void) src; (void) trg; - //result->set_value(fid_, log(info.counts.get(kCFE))); - result->set_value(fid_, log(info.counts.get(kCFE))); - if (IsZero(info.counts.get(kCFE))) - result->set_value(sfid_, 1); - } - const int fid_; - const int sfid_; - const int kCFE; -}; - -struct RulePenalty : public FeatureExtractor { - RulePenalty() : fid_(FD::Convert("RulePenalty")) {} - virtual void ExtractFeatures(const WordID /*lhs*/, - const vector& /*src*/, - const vector& /*trg*/, - const RuleStatistics& /*info*/, - SparseVector* result) const - { result->set_value(fid_, 1); } - - const int fid_; -}; - -// The negative log of the condition rule probs -// ignoring the identities of the non-terminals. -// i.e. the prob Hiero would assign. -// Also extracts Labelled features. -struct XFeatures: public FeatureExtractor { - XFeatures() : - fid_xfe(FD::Convert("XFE")), - fid_xef(FD::Convert("XEF")), - fid_labelledfe(FD::Convert("LabelledFE")), - fid_labelledef(FD::Convert("LabelledEF")), - fid_xesingleton(FD::Convert("XE_Singleton")), - fid_xfsingleton(FD::Convert("XF_Singleton")), - kCFE(FD::Convert("CFE")) {} - virtual void ObserveFilteredRule(const WordID /*lhs*/, - const vector& src, - const vector& trg) { - RuleTuple r(-1, src, trg); - map_rule(r); - rule_counts.inc(r, 0); - source_counts.inc(r.source(), 0); - target_counts.inc(r.target(), 0); - } - - virtual void ObserveUnfilteredRule(const WordID /*lhs*/, - const vector& src, - const vector& trg, - const RuleStatistics& info) { - RuleTuple r(-1, src, trg); - map_rule(r); - const int count = info.counts.get(kCFE); - assert(count > 0); - rule_counts.inc_if_exists(r, count); - source_counts.inc_if_exists(r.source(), count); - target_counts.inc_if_exists(r.target(), count); - } - - virtual void ExtractFeatures(const WordID /*lhs*/, - const vector& src, - const vector& trg, - const RuleStatistics& info, - SparseVector* result) const { - RuleTuple r(-1, src, trg); - map_rule(r); - double l_r_freq = log(rule_counts(r)); - - const int t_c = target_counts(r.target()); - assert(t_c > 0); - result->set_value(fid_xfe, log(t_c) - l_r_freq); - result->set_value(fid_labelledfe, log(t_c) - log(info.counts.get(kCFE))); -// if (t_c == 1) -// result->set_value(fid_xesingleton, 1.0); - - const int s_c = source_counts(r.source()); - assert(s_c > 0); - result->set_value(fid_xef, log(s_c) - l_r_freq); - result->set_value(fid_labelledef, log(s_c) - log(info.counts.get(kCFE))); -// if (s_c == 1) -// result->set_value(fid_xfsingleton, 1.0); - } - - void map_rule(RuleTuple& r) const { - vector indexes; int i=0; - for (vector::iterator it = r.target().begin(); it != r.target().end(); ++it) { - if (*it <= 0) - indexes.push_back(*it); - } - for (vector::iterator it = r.source().begin(); it != r.source().end(); ++it) { - if (*it <= 0) - *it = indexes.at(i++); - } - } - - const int fid_xfe, fid_xef; - const int fid_labelledfe, fid_labelledef; - const int fid_xesingleton, fid_xfsingleton; - const int kCFE; - RuleFreqCount rule_counts; - FreqCount< vector > source_counts, target_counts; -}; - - -struct LabelledRuleConditionals: public FeatureExtractor { - LabelledRuleConditionals() : - fid_fe(FD::Convert("LabelledFE")), - fid_ef(FD::Convert("LabelledEF")), - kCFE(FD::Convert("CFE")) {} - virtual void ObserveFilteredRule(const WordID lhs, - const vector& src, - const vector& trg) { - RuleTuple r(lhs, src, trg); - rule_counts.inc(r, 0); - source_counts.inc(r.source(), 0); - - target_counts.inc(r.target(), 0); - } - - virtual void ObserveUnfilteredRule(const WordID lhs, - const vector& src, - const vector& trg, - const RuleStatistics& info) { - RuleTuple r(lhs, src, trg); - rule_counts.inc_if_exists(r, info.counts.get(kCFE)); - source_counts.inc_if_exists(r.source(), info.counts.get(kCFE)); - - target_counts.inc_if_exists(r.target(), info.counts.get(kCFE)); - } - - virtual void ExtractFeatures(const WordID lhs, - const vector& src, - const vector& trg, - const RuleStatistics& /*info*/, - SparseVector* result) const { - RuleTuple r(lhs, src, trg); - double l_r_freq = log(rule_counts(r)); - result->set_value(fid_fe, log(target_counts(r.target())) - l_r_freq); - result->set_value(fid_ef, log(source_counts(r.source())) - l_r_freq); - } - - const int fid_fe, fid_ef; - const int kCFE; - RuleFreqCount rule_counts; - FreqCount< vector > source_counts, target_counts; -}; - -struct LHSProb: public FeatureExtractor { - LHSProb() : fid_(FD::Convert("LHSProb")), kCFE(FD::Convert("CFE")), total_count(0) {} - - virtual void ObserveUnfilteredRule(const WordID lhs, - const vector& /*src*/, - const vector& /*trg*/, - const RuleStatistics& info) { - int count = info.counts.get(kCFE); - total_count += count; - lhs_counts.inc(lhs, count); - } - - virtual void ExtractFeatures(const WordID lhs, - const vector& /*src*/, - const vector& /*trg*/, - const RuleStatistics& /*info*/, - SparseVector* result) const { - double lhs_log_prob = log(total_count) - log(lhs_counts(lhs)); - result->set_value(fid_, lhs_log_prob); - } - - const int fid_; - const int kCFE; - int total_count; - FreqCount lhs_counts; -}; - -// Proper rule generative probability: p( s,t | lhs) -struct GenerativeProb: public FeatureExtractor { - GenerativeProb() : - fid_(FD::Convert("GenerativeProb")), - kCFE(FD::Convert("CFE")) {} - - virtual void ObserveUnfilteredRule(const WordID lhs, - const vector& /*src*/, - const vector& /*trg*/, - const RuleStatistics& info) - { lhs_counts.inc(lhs, info.counts.get(kCFE)); } - - virtual void ExtractFeatures(const WordID lhs, - const vector& /*src*/, - const vector& /*trg*/, - const RuleStatistics& info, - SparseVector* result) const { - double log_prob = log(lhs_counts(lhs)) - log(info.counts.get(kCFE)); - result->set_value(fid_, log_prob); - } - - const int fid_; - const int kCFE; - FreqCount lhs_counts; -}; - -// remove terminals from the rules before estimating the conditional prob -struct LabellingShape: public FeatureExtractor { - LabellingShape() : fid_(FD::Convert("LabellingShape")), kCFE(FD::Convert("CFE")) {} - - virtual void ObserveFilteredRule(const WordID /*lhs*/, - const vector& src, - const vector& trg) { - RuleTuple r(-1, src, trg); - map_rule(r); - rule_counts.inc(r, 0); - source_counts.inc(r.source(), 0); - } - - virtual void ObserveUnfilteredRule(const WordID /*lhs*/, - const vector& src, - const vector& trg, - const RuleStatistics& info) { - RuleTuple r(-1, src, trg); - map_rule(r); - rule_counts.inc_if_exists(r, info.counts.get(kCFE)); - source_counts.inc_if_exists(r.source(), info.counts.get(kCFE)); - } - - virtual void ExtractFeatures(const WordID /*lhs*/, - const vector& src, - const vector& trg, - const RuleStatistics& /*info*/, - SparseVector* result) const { - RuleTuple r(-1, src, trg); - map_rule(r); - double l_r_freq = log(rule_counts(r)); - result->set_value(fid_, log(source_counts(r.source())) - l_r_freq); - } - - // Replace all terminals with generic -1 - void map_rule(RuleTuple& r) const { - for (vector::iterator it = r.target().begin(); it != r.target().end(); ++it) - if (*it <= 0) *it = -1; - for (vector::iterator it = r.source().begin(); it != r.source().end(); ++it) - if (*it <= 0) *it = -1; - } - - const int fid_, kCFE; - RuleFreqCount rule_counts; - FreqCount< vector > source_counts; -}; - - -// this extracts the lexical translation prob features -// in BOTH directions. -struct LexProbExtractor : public FeatureExtractor { - LexProbExtractor() : - e2f_(FD::Convert("LexE2F")), f2e_(FD::Convert("LexF2E")) { - ReadFile rf(aligned_corpus); - //create lexical translation table - cerr << "Computing lexical translation probabilities from " << aligned_corpus << "..." << endl; - char* buf = new char[MAX_LINE_LENGTH]; - istream& alignment = *rf.stream(); - while(alignment) { - alignment.getline(buf, MAX_LINE_LENGTH); - if (buf[0] == 0) continue; - table.createTTable(buf); - } - delete[] buf; - } - - virtual void ExtractFeatures(const WordID /*lhs*/, - const vector& src, - const vector& trg, - const RuleStatistics& info, - SparseVector* result) const { - map > foreign_aligned; - map > english_aligned; - - //Loop over all the alignment points to compute lexical translation probability - const vector< pair >& al = info.aligns; - vector< pair >::const_iterator ita; - for (ita = al.begin(); ita != al.end(); ++ita) { - if (DEBUG) { - cerr << "\nA:" << ita->first << "," << ita->second << "::"; - cerr << TD::Convert(src[ita->first]) << "-" << TD::Convert(trg[ita->second]); - } - - //Lookup this alignment probability in the table - int temp = table.word_translation[pair (src[ita->first],trg[ita->second])]; - float f2e=0, e2f=0; - if ( table.total_foreign[src[ita->first]] != 0) - f2e = (float) temp / table.total_foreign[src[ita->first]]; - if ( table.total_english[trg[ita->second]] !=0 ) - e2f = (float) temp / table.total_english[trg[ita->second]]; - if (DEBUG) printf (" %d %E %E\n", temp, f2e, e2f); - - //local counts to keep track of which things haven't been aligned, to later compute their null alignment - if (foreign_aligned.count(src[ita->first])) { - foreign_aligned[ src[ita->first] ].first++; - foreign_aligned[ src[ita->first] ].second += e2f; - } else { - foreign_aligned[ src[ita->first] ] = pair (1,e2f); - } - - if (english_aligned.count( trg[ ita->second] )) { - english_aligned[ trg[ ita->second] ].first++; - english_aligned[ trg[ ita->second] ].second += f2e; - } else { - english_aligned[ trg[ ita->second] ] = pair (1,f2e); - } - } - - float final_lex_f2e=1, final_lex_e2f=1; - static const WordID NULL_ = TD::Convert("NULL"); - - //compute lexical weight P(F|E) and include unaligned foreign words - for(int i=0;i temp_lex_prob = foreign_aligned[src[i]]; - final_lex_e2f *= temp_lex_prob.second / temp_lex_prob.first; - } - else //dealing with null alignment - { - int temp_count = table.word_translation[pair (src[i],NULL_)]; - float temp_e2f = (float) temp_count / table.total_english[NULL_]; - final_lex_e2f *= temp_e2f; - } - - } - - //compute P(E|F) unaligned english words - for(int j=0; j< trg.size(); j++) { - if (!table.total_english.count(trg[j])) continue; - - if (english_aligned.count(trg[j])) - { - pair temp_lex_prob = english_aligned[trg[j]]; - final_lex_f2e *= temp_lex_prob.second / temp_lex_prob.first; - } - else //dealing with null - { - int temp_count = table.word_translation[pair (NULL_,trg[j])]; - float temp_f2e = (float) temp_count / table.total_foreign[NULL_]; - final_lex_f2e *= temp_f2e; - } - } - result->set_value(e2f_, safenlog(final_lex_e2f)); - result->set_value(f2e_, safenlog(final_lex_f2e)); - } - const int e2f_, f2e_; - mutable LexTranslationTable table; -}; - -struct Featurizer { - Featurizer(const vector >& ex) : extractors(ex) { - } - void Callback1(WordID lhs, const vector& src, const ID2RuleStatistics& trgs) { - for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) { - for (int i = 0; i < extractors.size(); ++i) - extractors[i]->ObserveFilteredRule(lhs, src, it->first); - } - } - void Callback2(WordID lhs, const vector& src, const ID2RuleStatistics& trgs) { - for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) { - for (int i = 0; i < extractors.size(); ++i) - extractors[i]->ObserveUnfilteredRule(lhs, src, it->first, it->second); - } - } - void Callback3(WordID lhs, const vector& src, const ID2RuleStatistics& trgs) { - for (ID2RuleStatistics::const_iterator it = trgs.begin(); it != trgs.end(); ++it) { - SparseVector feats; - for (int i = 0; i < extractors.size(); ++i) - extractors[i]->ExtractFeatures(lhs, src, it->first, it->second, &feats); - cout << '[' << TD::Convert(-lhs) << "] ||| "; - WriteNamed(src, &cout); - cout << " ||| "; - WriteAnonymous(it->first, &cout); - cout << " ||| "; - print(cout,feats,"="); - cout << endl; - } - } - private: - vector > extractors; -}; - -void cb1(WordID lhs, const vector& src_rhs, const ID2RuleStatistics& rules, void* extra) { - static_cast(extra)->Callback1(lhs, src_rhs, rules); -} - -void cb2(WordID lhs, const vector& src_rhs, const ID2RuleStatistics& rules, void* extra) { - static_cast(extra)->Callback2(lhs, src_rhs, rules); -} - -void cb3(WordID lhs, const vector& src_rhs, const ID2RuleStatistics& rules, void* extra) { - static_cast(extra)->Callback3(lhs, src_rhs, rules); -} - -int main(int argc, char** argv){ - FERegistry reg; - reg.Register("LogRuleCount", new FEFactory); - reg.Register("LexProb", new FEFactory); - reg.Register("XFeatures", new FEFactory); - reg.Register("LabelledRuleConditionals", new FEFactory); - reg.Register("RulePenalty", new FEFactory); - reg.Register("LHSProb", new FEFactory); - reg.Register("LabellingShape", new FEFactory); - reg.Register("GenerativeProb", new FEFactory); - po::variables_map conf; - InitCommandLine(reg, argc, argv, &conf); - aligned_corpus = conf["aligned_corpus"].as(); // GLOBAL VAR - ReadFile fg1(conf["filtered_grammar"].as()); - - vector feats = conf["feature"].as >(); - vector > extractors(feats.size()); - for (int i = 0; i < feats.size(); ++i) - extractors[i] = reg.Create(feats[i]); - Featurizer fizer(extractors); - - cerr << "Reading filtered grammar to detect keys..." << endl; - StripedGrammarLexer::ReadStripedGrammar(fg1.stream(), cb1, &fizer); - - cerr << "Reading unfiltered grammar..." << endl; - StripedGrammarLexer::ReadStripedGrammar(&cin, cb2, &fizer); - - ReadFile fg2(conf["filtered_grammar"].as()); - cerr << "Reading filtered grammar and adding features..." << endl; - StripedGrammarLexer::ReadStripedGrammar(fg2.stream(), cb3, &fizer); - - return 0; -} - -- cgit v1.2.3