summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--extools/filter_score_grammar.cc50
1 files changed, 45 insertions, 5 deletions
diff --git a/extools/filter_score_grammar.cc b/extools/filter_score_grammar.cc
index e1fd714b..f34b240d 100644
--- a/extools/filter_score_grammar.cc
+++ b/extools/filter_score_grammar.cc
@@ -37,6 +37,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("test_set,t", po::value<string>(), "Filter for this test set (not specified = no filtering)")
("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all")
+ ("hiero_features", "Use 'Hiero' features")
+// ("feature,f", po::value<vector<string> >()->composing(), "List of features to compute")
("aligned_corpus,c", po::value<string>(), "Aligned corpus (single line format)")
("help,h", "Print this help message and exit");
po::options_description clo("Command line options");
@@ -245,6 +247,38 @@ struct FeatureExtractor {
const string extractor_name;
};
+struct LogRuleCount : public FeatureExtractor {
+ LogRuleCount() :
+ FeatureExtractor("LogRuleCount"),
+ fid_(FD::Convert("LogRuleCount")), kCFE(FD::Convert("CFE")) {}
+ virtual void ExtractFeatures(const vector<WordID>& lhs_src,
+ const vector<WordID>& trg,
+ const RuleStatistics& info,
+ SparseVector<float>* result) const {
+ (void) lhs_src; (void) trg;
+ result->set_value(fid_, log(info.counts.value(kCFE)));
+ }
+ const int fid_;
+ const int kCFE;
+};
+
+struct SingletonRule : public FeatureExtractor {
+ SingletonRule() :
+ FeatureExtractor("SingletonRule"),
+ fid_(FD::Convert("SingletonRule")), kCFE(FD::Convert("CFE")) {}
+ virtual void ExtractFeatures(const vector<WordID>& lhs_src,
+ const vector<WordID>& trg,
+ const RuleStatistics& info,
+ SparseVector<float>* result) const {
+ (void) lhs_src; (void) trg;
+ if (info.counts.value(kCFE) > 0.999 && info.counts.value(kCFE) < 1.001) {
+ result->set_value(fid_, 1.0);
+ }
+ }
+ const int fid_;
+ const int kCFE;
+};
+
struct EGivenFExtractor : public FeatureExtractor {
EGivenFExtractor() :
FeatureExtractor("EGivenF"),
@@ -403,9 +437,15 @@ int main(int argc, char** argv){
// TODO make this list configurable
vector<boost::shared_ptr<FeatureExtractor> > extractors;
- extractors.push_back(boost::shared_ptr<FeatureExtractor>(new EGivenFExtractor));
- extractors.push_back(boost::shared_ptr<FeatureExtractor>(new FGivenEExtractor));
- extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>())));
+ if (conf.count("hiero_features")) {
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new EGivenFExtractor));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new FGivenEExtractor));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>())));
+ } else {
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LogRuleCount));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new SingletonRule));
+ extractors.push_back(boost::shared_ptr<FeatureExtractor>(new LexProbExtractor(conf["aligned_corpus"].as<string>())));
+ }
//score unscored grammar
cerr <<"Scoring grammar..." << endl;
@@ -415,7 +455,7 @@ int main(int argc, char** argv){
vector<WordID> key, cur_key,temp_key;
int line = 0;
- const int kEGivenF = FD::Convert("EGivenF");
+ const int kLogRuleCount = FD::Convert("LogRuleCount");
multimap<float, string> options;
while(!unscored_grammar.eof())
{
@@ -436,7 +476,7 @@ int main(int argc, char** argv){
os << TD::GetString(cur_key)
<< ' ' << TD::GetString(it->first) << " ||| ";
feats.Write(false, &os);
- options.insert(make_pair(feats.value(kEGivenF), os.str()));
+ options.insert(make_pair(-feats.value(kLogRuleCount), os.str()));
}
int ocount = 0;
for (multimap<float,string>::iterator it = options.begin(); it != options.end(); ++it) {