diff options
-rw-r--r-- | extools/filter_score_grammar.cc | 50 |
1 files changed, 45 insertions, 5 deletions
diff --git a/extools/filter_score_grammar.cc b/extools/filter_score_grammar.cc index e1fd714b..f34b240d 100644 --- a/extools/filter_score_grammar.cc +++ b/extools/filter_score_grammar.cc @@ -37,6 +37,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("test_set,t", po::value<string>(), "Filter for this test set (not specified = no filtering)") ("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all") + ("hiero_features", "Use 'Hiero' features") +// ("feature,f", po::value<vector<string> >()->composing(), "List of features to compute") ("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)") ("help,h", "Print this help message and exit"); po::options_description clo("Command line options"); @@ -245,6 +247,38 @@ struct FeatureExtractor { const string extractor_name; }; +struct LogRuleCount : public FeatureExtractor { + LogRuleCount() : + FeatureExtractor("LogRuleCount"), + fid_(FD::Convert("LogRuleCount")), kCFE(FD::Convert("CFE")) {} + virtual void ExtractFeatures(const vector<WordID>& lhs_src, + const vector<WordID>& trg, + const RuleStatistics& info, + SparseVector<float>* result) const { + (void) lhs_src; (void) trg; + result->set_value(fid_, log(info.counts.value(kCFE))); + } + const int fid_; + const int kCFE; +}; + +struct SingletonRule : public FeatureExtractor { + SingletonRule() : + FeatureExtractor("SingletonRule"), + fid_(FD::Convert("SingletonRule")), kCFE(FD::Convert("CFE")) {} + virtual void ExtractFeatures(const vector<WordID>& lhs_src, + const vector<WordID>& trg, + const RuleStatistics& info, + SparseVector<float>* result) const { + (void) lhs_src; (void) trg; + if (info.counts.value(kCFE) > 0.999 && info.counts.value(kCFE) < 1.001) { + result->set_value(fid_, 1.0); + } + } + const int fid_; + const int kCFE; +}; + struct EGivenFExtractor : public FeatureExtractor { EGivenFExtractor() : FeatureExtractor("EGivenF"), @@ -403,9 +437,15 @@ int main(int argc, char** argv){ // TODO make this list configurable vector<boost::shared_ptr<FeatureExtractor> > extractors; - extractors.push_back(boost::shared_ptr<FeatureExtractor>(new EGivenFExtractor)); - extractors.push_back(boost::shared_ptr<FeatureExtractor>(new FGivenEExtractor)); - extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>()))); + if (conf.count("hiero_features")) { + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new EGivenFExtractor)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new FGivenEExtractor)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>()))); + } else { + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogRuleCount)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new SingletonRule)); + extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>()))); + } //score unscored grammar cerr <<"Scoring grammar..." << endl; @@ -415,7 +455,7 @@ int main(int argc, char** argv){ vector<WordID> key, cur_key,temp_key; int line = 0; - const int kEGivenF = FD::Convert("EGivenF"); + const int kLogRuleCount = FD::Convert("LogRuleCount"); multimap<float, string> options; while(!unscored_grammar.eof()) { @@ -436,7 +476,7 @@ int main(int argc, char** argv){ os << TD::GetString(cur_key) << ' ' << TD::GetString(it->first) << " ||| "; feats.Write(false, &os); - options.insert(make_pair(feats.value(kEGivenF), os.str())); + options.insert(make_pair(-feats.value(kLogRuleCount), os.str())); } int ocount = 0; for (multimap<float,string>::iterator it = options.begin(); it != options.end(); ++it) { |