summary refs log tree commit diff
path: root/extools/filter_score_grammar.cc
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-06 17:45:09 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-06 17:45:09 +0000
commit c04ba5eed5049569d327dfb6162d91bec0a3aec8 (patch)
tree 7ce8bc507e600468c95429c3bcd16151fc4ac636 /extools/filter_score_grammar.cc
parent 6a015a6dea93394af3ed9f26d78d265b50b3ba9f (diff)
featurizer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@154 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/filter_score_grammar.cc')
-rw-r--r-- extools/filter_score_grammar.cc 57
1 files changed, 45 insertions, 12 deletions
diff --git a/extools/filter_score_grammar.cc b/extools/filter_score_grammar.cc
index f34b240d..fe9a2a07 100644
--- a/extools/filter_score_grammar.cc
+++ b/extools/filter_score_grammar.cc
@@ -37,7 +37,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("test_set,t", po::value<string>(), "Filter for this test set (not specified = no filtering)")
("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all")
- ("hiero_features", "Use 'Hiero' features")
+ ("backoff_features", "Extract backoff X-features, assumes E, F, EF counts")
// ("feature,f", po::value<vector<string> >()->composing(), "List of features to compute")
("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)")
("help,h", "Print this help message and exit");
@@ -247,36 +247,66 @@ struct FeatureExtractor {
const string extractor_name;
};
+static bool IsZero(float f) { return (f > 0.999 && f < 1.001); }
+
struct LogRuleCount : public FeatureExtractor {
LogRuleCount() :
FeatureExtractor("LogRuleCount"),
- fid_(FD::Convert("LogRuleCount")), kCFE(FD::Convert("CFE")) {}
+ fid_(FD::Convert("LogRuleCount")),
+ sfid_(FD::Convert("SingletonRule")),
+ kCFE(FD::Convert("CFE")) {}
virtual void ExtractFeatures(const vector<WordID>& lhs_src,
const vector<WordID>& trg,
const RuleStatistics& info,
SparseVector<float>* result) const {
(void) lhs_src; (void) trg;
result->set_value(fid_, log(info.counts.value(kCFE)));
+ if (IsZero(info.counts.value(kCFE)))
+ result->set_value(sfid_, 1);
}
const int fid_;
+ const int sfid_;
const int kCFE;
};
-struct SingletonRule : public FeatureExtractor {
- SingletonRule() :
- FeatureExtractor("SingletonRule"),
- fid_(FD::Convert("SingletonRule")), kCFE(FD::Convert("CFE")) {}
+struct LogECount : public FeatureExtractor {
+ LogECount() :
+ FeatureExtractor("LogECount"),
+ sfid_(FD::Convert("SingletonE")),
+ fid_(FD::Convert("LogECount")), kCE(FD::Convert("CE")) {}
virtual void ExtractFeatures(const vector<WordID>& lhs_src,
const vector<WordID>& trg,
const RuleStatistics& info,
SparseVector<float>* result) const {
(void) lhs_src; (void) trg;
- if (info.counts.value(kCFE) > 0.999 && info.counts.value(kCFE) < 1.001) {
- result->set_value(fid_, 1.0);
- }
+ assert(info.counts.value(kCE) > 0);
+ result->set_value(fid_, log(info.counts.value(kCE)));
+ if (IsZero(info.counts.value(kCE)))
+ result->set_value(sfid_, 1);
}
+ const int sfid_;
const int fid_;
- const int kCFE;
+ const int kCE;
+};
+
+struct LogFCount : public FeatureExtractor {
+ LogFCount() :
+ FeatureExtractor("LogFCount"),
+ sfid_(FD::Convert("SingletonF")),
+ fid_(FD::Convert("LogFCount")), kCF(FD::Convert("CF")) {}
+ virtual void ExtractFeatures(const vector<WordID>& lhs_src,
+ const vector<WordID>& trg,
+ const RuleStatistics& info,
+ SparseVector<float>* result) const {
+ (void) lhs_src; (void) trg;
+ assert(info.counts.value(kCF) > 0);
+ result->set_value(fid_, log(info.counts.value(kCF)));
+ if (IsZero(info.counts.value(kCF)))
+ result->set_value(sfid_, 1);
+ }
+ const int sfid_;
+ const int fid_;
+ const int kCF;
};
struct EGivenFExtractor : public FeatureExtractor {
@@ -437,13 +467,16 @@ int main(int argc, char** argv){
// TODO make this list configurable
vector<boost::shared_ptr<FeatureExtractor> > extractors;
- if (conf.count("hiero_features")) {
+ if (conf.count("backoff_features")) {
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogRuleCount));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogECount));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogFCount));
extractors.push_back(boost::shared_ptr<FeatureExtractor>(new EGivenFExtractor));
extractors.push_back(boost::shared_ptr<FeatureExtractor>(new FGivenEExtractor));
extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>())));
} else {
extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogRuleCount));
- extractors.push_back(boost::shared_ptr<FeatureExtractor>(new SingletonRule));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogFCount));
extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>())));
}