diff options
Diffstat (limited to 'extools/filter_score_grammar.cc')
-rw-r--r-- | extools/filter_score_grammar.cc | 57 |
1 files changed, 45 insertions, 12 deletions
diff --git a/extools/filter_score_grammar.cc b/extools/filter_score_grammar.cc index f34b240d..fe9a2a07 100644 --- a/extools/filter_score_grammar.cc +++ b/extools/filter_score_grammar.cc @@ -37,7 +37,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("test_set,t", po::value<string>(), "Filter for this test set (not specified = no filtering)") ("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all") - ("hiero_features", "Use 'Hiero' features") + ("backoff_features", "Extract backoff X-features, assumes E, F, EF counts") // ("feature,f", po::value<vector<string> >()->composing(), "List of features to compute") ("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)") ("help,h", "Print this help message and exit"); @@ -247,36 +247,66 @@ struct FeatureExtractor { const string extractor_name; }; +static bool IsZero(float f) { return (f > 0.999 && f < 1.001); } + struct LogRuleCount : public FeatureExtractor { LogRuleCount() : FeatureExtractor("LogRuleCount"), - fid_(FD::Convert("LogRuleCount")), kCFE(FD::Convert("CFE")) {} + fid_(FD::Convert("LogRuleCount")), + sfid_(FD::Convert("SingletonRule")), + kCFE(FD::Convert("CFE")) {} virtual void ExtractFeatures(const vector<WordID>& lhs_src, const vector<WordID>& trg, const RuleStatistics& info, SparseVector<float>* result) const { (void) lhs_src; (void) trg; result->set_value(fid_, log(info.counts.value(kCFE))); + if (IsZero(info.counts.value(kCFE))) + result->set_value(sfid_, 1); } const int fid_; + const int sfid_; const int kCFE; }; -struct SingletonRule : public FeatureExtractor { - SingletonRule() : - FeatureExtractor("SingletonRule"), - fid_(FD::Convert("SingletonRule")), kCFE(FD::Convert("CFE")) {} +struct LogECount : public FeatureExtractor { + LogECount() : + FeatureExtractor("LogECount"), + sfid_(FD::Convert("SingletonE")), + fid_(FD::Convert("LogECount")), kCE(FD::Convert("CE")) {} virtual void ExtractFeatures(const vector<WordID>& lhs_src, const vector<WordID>& trg, const RuleStatistics& info, SparseVector<float>* result) const { (void) lhs_src; (void) trg; - if (info.counts.value(kCFE) > 0.999 && info.counts.value(kCFE) < 1.001) { - result->set_value(fid_, 1.0); - } + assert(info.counts.value(kCE) > 0); + result->set_value(fid_, log(info.counts.value(kCE))); + if (IsZero(info.counts.value(kCE))) + result->set_value(sfid_, 1); } + const int sfid_; const int fid_; - const int kCFE; + const int kCE; +}; + +struct LogFCount : public FeatureExtractor { + LogFCount() : + FeatureExtractor("LogFCount"), + sfid_(FD::Convert("SingletonF")), + fid_(FD::Convert("LogFCount")), kCF(FD::Convert("CF")) {} + virtual void ExtractFeatures(const vector<WordID>& lhs_src, + const vector<WordID>& trg, + const RuleStatistics& info, + SparseVector<float>* result) const { + (void) lhs_src; (void) trg; + assert(info.counts.value(kCF) > 0); + result->set_value(fid_, log(info.counts.value(kCF))); + if (IsZero(info.counts.value(kCF))) + result->set_value(sfid_, 1); + } + const int sfid_; + const int fid_; + const int kCF; }; struct EGivenFExtractor : public FeatureExtractor { @@ -437,13 +467,16 @@ int main(int argc, char** argv){ // TODO make this list configurable vector<boost::shared_ptr<FeatureExtractor> > extractors; - if (conf.count("hiero_features")) { + if (conf.count("backoff_features")) { + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogRuleCount)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogECount)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogFCount)); extractors.push_back(boost::shared_ptr<FeatureExtractor>(new EGivenFExtractor)); extractors.push_back(boost::shared_ptr<FeatureExtractor>(new FGivenEExtractor)); extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>()))); } else { extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogRuleCount)); - extractors.push_back(boost::shared_ptr<FeatureExtractor>(new SingletonRule)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogFCount)); extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>()))); } |