From e43fe3326d456d4339297453e593072792a34930 Mon Sep 17 00:00:00 2001 From: redpony Date: Mon, 5 Jul 2010 19:08:20 +0000 Subject: use better features git-svn-id: https://ws10smt.googlecode.com/svn/trunk@135 ec762483-ff6d-05da-a07a-a48fb63a330f --- extools/filter_score_grammar.cc | 50 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) (limited to 'extools') diff --git a/extools/filter_score_grammar.cc b/extools/filter_score_grammar.cc index e1fd714b..f34b240d 100644 --- a/extools/filter_score_grammar.cc +++ b/extools/filter_score_grammar.cc @@ -37,6 +37,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("test_set,t", po::value(), "Filter for this test set (not specified = no filtering)") ("top_e_given_f,n", po::value()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all") + ("hiero_features", "Use 'Hiero' features") +// ("feature,f", po::value >()->composing(), "List of features to compute") ("aligned_corpus,c", po::value(), "Aligned corpus (single line format)") ("help,h", "Print this help message and exit"); po::options_description clo("Command line options"); @@ -245,6 +247,38 @@ struct FeatureExtractor { const string extractor_name; }; +struct LogRuleCount : public FeatureExtractor { + LogRuleCount() : + FeatureExtractor("LogRuleCount"), + fid_(FD::Convert("LogRuleCount")), kCFE(FD::Convert("CFE")) {} + virtual void ExtractFeatures(const vector& lhs_src, + const vector& trg, + const RuleStatistics& info, + SparseVector* result) const { + (void) lhs_src; (void) trg; + result->set_value(fid_, log(info.counts.value(kCFE))); + } + const int fid_; + const int kCFE; +}; + +struct SingletonRule : public FeatureExtractor { + SingletonRule() : + FeatureExtractor("SingletonRule"), + fid_(FD::Convert("SingletonRule")), kCFE(FD::Convert("CFE")) {} + virtual void ExtractFeatures(const vector& lhs_src, + const vector& trg, + const RuleStatistics& info, + SparseVector* result) const { + (void) lhs_src; (void) trg; + if (info.counts.value(kCFE) > 0.999 && info.counts.value(kCFE) < 1.001) { + result->set_value(fid_, 1.0); + } + } + const int fid_; + const int kCFE; +}; + struct EGivenFExtractor : public FeatureExtractor { EGivenFExtractor() : FeatureExtractor("EGivenF"), @@ -403,9 +437,15 @@ int main(int argc, char** argv){ // TODO make this list configurable vector > extractors; - extractors.push_back(boost::shared_ptr(new EGivenFExtractor)); - extractors.push_back(boost::shared_ptr(new FGivenEExtractor)); - extractors.push_back(boost::shared_ptr(new LexProbExtractor(conf["aligned_corpus"].as()))); + if (conf.count("hiero_features")) { + extractors.push_back(boost::shared_ptr(new EGivenFExtractor)); + extractors.push_back(boost::shared_ptr(new FGivenEExtractor)); + extractors.push_back(boost::shared_ptr(new LexProbExtractor(conf["aligned_corpus"].as()))); + } else { + extractors.push_back(boost::shared_ptr(new LogRuleCount)); + extractors.push_back(boost::shared_ptr(new SingletonRule)); + extractors.push_back(boost::shared_ptr(new LexProbExtractor(conf["aligned_corpus"].as()))); + } //score unscored grammar cerr <<"Scoring grammar..." << endl; @@ -415,7 +455,7 @@ int main(int argc, char** argv){ vector key, cur_key,temp_key; int line = 0; - const int kEGivenF = FD::Convert("EGivenF"); + const int kLogRuleCount = FD::Convert("LogRuleCount"); multimap options; while(!unscored_grammar.eof()) { @@ -436,7 +476,7 @@ int main(int argc, char** argv){ os << TD::GetString(cur_key) << ' ' << TD::GetString(it->first) << " ||| "; feats.Write(false, &os); - options.insert(make_pair(feats.value(kEGivenF), os.str())); + options.insert(make_pair(-feats.value(kLogRuleCount), os.str())); } int ocount = 0; for (multimap::iterator it = options.begin(); it != options.end(); ++it) { -- cgit v1.2.3