diff options
-rw-r--r-- | extools/featurize_grammar.cc | 108 | ||||
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 9 |
2 files changed, 75 insertions, 42 deletions
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc index 771948ce..0d054626 100644 --- a/extools/featurize_grammar.cc +++ b/extools/featurize_grammar.cc @@ -390,8 +390,10 @@ struct LogRuleCount : public FeatureExtractor { // i.e. the prob Hiero would assign. struct XFeatures: public FeatureExtractor { XFeatures() : - fid_fe(FD::Convert("XFE")), - fid_ef(FD::Convert("XEF")), + fid_xfe(FD::Convert("XFE")), + fid_xef(FD::Convert("XEF")), + fid_labelledfe(FD::Convert("LabelledFE")), + fid_labelledef(FD::Convert("LabelledEF")), kCFE(FD::Convert("CFE")) {} virtual void ObserveFilteredRule(const WordID /*lhs*/, const vector<WordID>& src, @@ -399,7 +401,11 @@ struct XFeatures: public FeatureExtractor { RuleTuple r(-1, src, trg); map_rule(r); rule_counts.inc(r, 0); + + normalise_string(r.source()); source_counts.inc(r.source(), 0); + + normalise_string(r.target()); target_counts.inc(r.target(), 0); } @@ -413,7 +419,11 @@ struct XFeatures: public FeatureExtractor { // cerr << " ObserveUnfilteredRule() in:" << r << " " << hash_value(r) << endl; map_rule(r); rule_counts.inc_if_exists(r, info.counts.value(kCFE)); + + normalise_string(r.source()); source_counts.inc_if_exists(r.source(), info.counts.value(kCFE)); + + normalise_string(r.target()); target_counts.inc_if_exists(r.target(), info.counts.value(kCFE)); // cerr << " ObserveUnfilteredRule() inc: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " to " << rule_counts(r) << endl; } @@ -421,15 +431,22 @@ struct XFeatures: public FeatureExtractor { virtual void ExtractFeatures(const WordID /*lhs*/, const vector<WordID>& src, const vector<WordID>& trg, - const RuleStatistics& /*info*/, + const RuleStatistics& info, SparseVector<float>* result) const { RuleTuple r(-1, src, trg); map_rule(r); - //result->set_value(fid_fe, log(target_counts(r.target())) - log(rule_counts(r))); - //result->set_value(fid_ef, log(source_counts(r.source())) - log(rule_counts(r))); - result->set_value(fid_ef, target_counts(r.target())); - result->set_value(fid_fe, rule_counts(r)); - //result->set_value(fid_fe, (source_counts(r.source()))); + //result->set_value(fid_fe, rule_counts(r)); + double l_r_freq = log(rule_counts(r)); + + normalise_string(r.target()); + result->set_value(fid_xfe, log(target_counts(r.target())) - l_r_freq); + result->set_value(fid_labelledfe, log(target_counts(r.target())) - log(info.counts.value(kCFE))); + //result->set_value(fid_labelledfe, target_counts(r.target())); + + normalise_string(r.source()); + result->set_value(fid_xef, log(source_counts(r.source())) - l_r_freq); + result->set_value(fid_labelledef, log(source_counts(r.source())) - log(info.counts.value(kCFE))); + //result->set_value(fid_labelledef, source_counts(r.source())); } void map_rule(RuleTuple& r) const { @@ -444,7 +461,14 @@ struct XFeatures: public FeatureExtractor { } } - const int fid_fe, fid_ef; + void normalise_string(vector<WordID>& r) const { + vector<WordID> indexes; + for (vector<WordID>::iterator it = r.begin(); it != r.end(); ++it) + if (validate_non_terminal(TD::Convert(*it))) *it = -1; + } + + const int fid_xfe, fid_xef; + const int fid_labelledfe, fid_labelledef; const int kCFE; RuleFreqCount rule_counts; FreqCount< vector<WordID> > source_counts, target_counts; @@ -452,69 +476,71 @@ struct XFeatures: public FeatureExtractor { struct LabelledRuleConditionals: public FeatureExtractor { LabelledRuleConditionals() : + fid_count(FD::Convert("TCount")), + fid_e(FD::Convert("TCountE")), + fid_f(FD::Convert("TCountF")), fid_fe(FD::Convert("TLabelledFE")), fid_ef(FD::Convert("TLabelledEF")), kCFE(FD::Convert("CFE")) {} - virtual void ObserveFilteredRule(const WordID /*lhs*/, + virtual void ObserveFilteredRule(const WordID lhs, const vector<WordID>& src, const vector<WordID>& trg) { - RuleTuple r(-1, src, trg); + RuleTuple r(lhs, src, trg); rule_counts.inc(r, 0); - cerr << " ObservefilteredRule() inc: " << r << " " << hash_value(r) << endl; -// map_rule(r); + //cerr << " ObservefilteredRule() inc: " << r << " " << hash_value(r) << endl; + normalise_string(r.source()); source_counts.inc(r.source(), 0); + + normalise_string(r.target()); target_counts.inc(r.target(), 0); } // compute statistics over keys, the same lhs-src-trg tuple may be seen // more than once - virtual void ObserveUnfilteredRule(const WordID /*lhs*/, + virtual void ObserveUnfilteredRule(const WordID lhs, const vector<WordID>& src, const vector<WordID>& trg, const RuleStatistics& info) { - RuleTuple r(-1, src, trg); - //cerr << " ObserveUnfilteredRule() in:" << r << " " << hash_value(r) << endl; + RuleTuple r(lhs, src, trg); rule_counts.inc_if_exists(r, info.counts.value(kCFE)); - cerr << " ObserveUnfilteredRule() inc_if_exists: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " to " << rule_counts(r) << endl; -// map_rule(r); + //cerr << " ObserveUnfilteredRule() inc_if_exists: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " to " << rule_counts(r) << endl; + normalise_string(r.source()); source_counts.inc_if_exists(r.source(), info.counts.value(kCFE)); + + normalise_string(r.target()); target_counts.inc_if_exists(r.target(), info.counts.value(kCFE)); } - virtual void ExtractFeatures(const WordID /*lhs*/, + virtual void ExtractFeatures(const WordID lhs, const vector<WordID>& src, const vector<WordID>& trg, - const RuleStatistics& info, + const RuleStatistics& /*info*/, SparseVector<float>* result) const { - RuleTuple r(-1, src, trg); - //cerr << " ExtractFeatures() in:" << " " << r.m_hash << endl; - int r_freq = rule_counts(r); - cerr << " ExtractFeatures() count: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " | " << rule_counts(r) << endl; - assert(r_freq == info.counts.value(kCFE)); + RuleTuple r(lhs, src, trg); + double l_r_freq = log(rule_counts(r)); + //result->set_value(fid_count, rule_counts(r)); + //cerr << " ExtractFeatures() count: " << r << " " << info.counts.value(kCFE) << " | " << rule_counts(r) << endl; + //assert(l_r_freq == log(info.counts.value(kCFE))); //cerr << " ExtractFeatures() after:" << " " << r.hash << endl; //cerr << " ExtractFeatures() in:" << r << " " << r_freq << " " << hash_value(r) << endl; //cerr << " ExtractFeatures() in:" << r << " " << r_freq << endl; -// map_rule(r); - //result->set_value(fid_fe, log(target_counts(r.target())) - log(r_freq)); - //result->set_value(fid_ef, log(source_counts(r.source())) - log(r_freq)); - result->set_value(fid_ef, target_counts(r.target())); - result->set_value(fid_fe, r_freq); - //result->set_value(fid_fe, (source_counts(r.source()))); + normalise_string(r.target()); + result->set_value(fid_fe, log(target_counts(r.target())) - l_r_freq); + normalise_string(r.source()); + result->set_value(fid_ef, log(source_counts(r.source())) - l_r_freq); + + //result->set_value(fid_e, target_counts(r.target())); + //result->set_value(fid_f, source_counts(r.source())); } - void map_rule(RuleTuple& r) const { - vector<WordID> indexes; int i=0; - for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it) { - if (validate_non_terminal(TD::Convert(*it))) - indexes.push_back(*it); - } - for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it) { - if (validate_non_terminal(TD::Convert(*it))) - *it = indexes.at(i++); - } + void normalise_string(vector<WordID>& r) const { + vector<WordID> indexes; + for (vector<WordID>::iterator it = r.begin(); it != r.end(); ++it) + if (validate_non_terminal(TD::Convert(*it))) *it = -1; } const int fid_fe, fid_ef; + const int fid_count, fid_e, fid_f; const int kCFE; RuleFreqCount rule_counts; FreqCount< vector<WordID> > source_counts, target_counts; diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 8414308d..619e2699 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -10,6 +10,9 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR # the key in this map is the extractor name, the value is a list of the extracted features my $feat_map = { "LogRuleCount" => [ "LogRuleCount", "SingletonRule" ] , +# "XFeatures" => [ "XFE","XEF" ] , + "XFeatures" => [ "XFE","XEF","LabelledEF","LabelledFE"] , + "LabelledRuleConditionals" => [ "LabelledFE","LabelledEF" ] , "LexProb" => [ "LexE2F", "LexF2E" ] , }; @@ -24,11 +27,15 @@ my %init_weights = qw( LogECount -0.129853 LogFCount -0.194037 LogRuleCount 0.256706 + XFE -0.256706 + XEF -0.256706 + LabelledFE -0.256706 + LabelledEF -0.256706 PassThrough -0.9304905 SingletonE -3.04161 SingletonF 0.0714027 SingletonRule -0.889377 - WordPenalty -7.99495 + WordPenalty -1.99495 ); |