Diffstat (limited to 'extools')
-rw-r--r--  extools/featurize_grammar.cc  108
1 file changed, 67 insertions, 41 deletions
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc
index 771948ce..0d054626 100644
--- a/extools/featurize_grammar.cc
+++ b/extools/featurize_grammar.cc
@@ -390,8 +390,10 @@ struct LogRuleCount : public FeatureExtractor {
// i.e. the prob Hiero would assign.
struct XFeatures: public FeatureExtractor {
XFeatures() :
- fid_fe(FD::Convert("XFE")),
- fid_ef(FD::Convert("XEF")),
+ fid_xfe(FD::Convert("XFE")),
+ fid_xef(FD::Convert("XEF")),
+ fid_labelledfe(FD::Convert("LabelledFE")),
+ fid_labelledef(FD::Convert("LabelledEF")),
kCFE(FD::Convert("CFE")) {}
virtual void ObserveFilteredRule(const WordID /*lhs*/,
const vector<WordID>& src,
@@ -399,7 +401,11 @@ struct XFeatures: public FeatureExtractor {
RuleTuple r(-1, src, trg);
map_rule(r);
rule_counts.inc(r, 0);
+
+ normalise_string(r.source());
source_counts.inc(r.source(), 0);
+
+ normalise_string(r.target());
target_counts.inc(r.target(), 0);
}
@@ -413,7 +419,11 @@ struct XFeatures: public FeatureExtractor {
// cerr << " ObserveUnfilteredRule() in:" << r << " " << hash_value(r) << endl;
map_rule(r);
rule_counts.inc_if_exists(r, info.counts.value(kCFE));
+
+ normalise_string(r.source());
source_counts.inc_if_exists(r.source(), info.counts.value(kCFE));
+
+ normalise_string(r.target());
target_counts.inc_if_exists(r.target(), info.counts.value(kCFE));
// cerr << " ObserveUnfilteredRule() inc: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " to " << rule_counts(r) << endl;
}
@@ -421,15 +431,22 @@ struct XFeatures: public FeatureExtractor {
virtual void ExtractFeatures(const WordID /*lhs*/,
const vector<WordID>& src,
const vector<WordID>& trg,
- const RuleStatistics& /*info*/,
+ const RuleStatistics& info,
SparseVector<float>* result) const {
RuleTuple r(-1, src, trg);
map_rule(r);
- //result->set_value(fid_fe, log(target_counts(r.target())) - log(rule_counts(r)));
- //result->set_value(fid_ef, log(source_counts(r.source())) - log(rule_counts(r)));
- result->set_value(fid_ef, target_counts(r.target()));
- result->set_value(fid_fe, rule_counts(r));
- //result->set_value(fid_fe, (source_counts(r.source())));
+ //result->set_value(fid_fe, rule_counts(r));
+ double l_r_freq = log(rule_counts(r));
+
+ normalise_string(r.target());
+ result->set_value(fid_xfe, log(target_counts(r.target())) - l_r_freq);
+ result->set_value(fid_labelledfe, log(target_counts(r.target())) - log(info.counts.value(kCFE)));
+ //result->set_value(fid_labelledfe, target_counts(r.target()));
+
+ normalise_string(r.source());
+ result->set_value(fid_xef, log(source_counts(r.source())) - l_r_freq);
+ result->set_value(fid_labelledef, log(source_counts(r.source())) - log(info.counts.value(kCFE)));
+ //result->set_value(fid_labelledef, source_counts(r.source()));
}
void map_rule(RuleTuple& r) const {
@@ -444,7 +461,14 @@ struct XFeatures: public FeatureExtractor {
}
}
- const int fid_fe, fid_ef;
+ void normalise_string(vector<WordID>& r) const {
+ vector<WordID> indexes;
+ for (vector<WordID>::iterator it = r.begin(); it != r.end(); ++it)
+ if (validate_non_terminal(TD::Convert(*it))) *it = -1;
+ }
+
+ const int fid_xfe, fid_xef;
+ const int fid_labelledfe, fid_labelledef;
const int kCFE;
RuleFreqCount rule_counts;
FreqCount< vector<WordID> > source_counts, target_counts;
@@ -452,69 +476,71 @@ struct XFeatures: public FeatureExtractor {
struct LabelledRuleConditionals: public FeatureExtractor {
LabelledRuleConditionals() :
+ fid_count(FD::Convert("TCount")),
+ fid_e(FD::Convert("TCountE")),
+ fid_f(FD::Convert("TCountF")),
fid_fe(FD::Convert("TLabelledFE")),
fid_ef(FD::Convert("TLabelledEF")),
kCFE(FD::Convert("CFE")) {}
- virtual void ObserveFilteredRule(const WordID /*lhs*/,
+ virtual void ObserveFilteredRule(const WordID lhs,
const vector<WordID>& src,
const vector<WordID>& trg) {
- RuleTuple r(-1, src, trg);
+ RuleTuple r(lhs, src, trg);
rule_counts.inc(r, 0);
- cerr << " ObservefilteredRule() inc: " << r << " " << hash_value(r) << endl;
-// map_rule(r);
+ //cerr << " ObservefilteredRule() inc: " << r << " " << hash_value(r) << endl;
+ normalise_string(r.source());
source_counts.inc(r.source(), 0);
+
+ normalise_string(r.target());
target_counts.inc(r.target(), 0);
}
// compute statistics over keys, the same lhs-src-trg tuple may be seen
// more than once
- virtual void ObserveUnfilteredRule(const WordID /*lhs*/,
+ virtual void ObserveUnfilteredRule(const WordID lhs,
const vector<WordID>& src,
const vector<WordID>& trg,
const RuleStatistics& info) {
- RuleTuple r(-1, src, trg);
- //cerr << " ObserveUnfilteredRule() in:" << r << " " << hash_value(r) << endl;
+ RuleTuple r(lhs, src, trg);
rule_counts.inc_if_exists(r, info.counts.value(kCFE));
- cerr << " ObserveUnfilteredRule() inc_if_exists: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " to " << rule_counts(r) << endl;
-// map_rule(r);
+ //cerr << " ObserveUnfilteredRule() inc_if_exists: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " to " << rule_counts(r) << endl;
+ normalise_string(r.source());
source_counts.inc_if_exists(r.source(), info.counts.value(kCFE));
+
+ normalise_string(r.target());
target_counts.inc_if_exists(r.target(), info.counts.value(kCFE));
}
- virtual void ExtractFeatures(const WordID /*lhs*/,
+ virtual void ExtractFeatures(const WordID lhs,
const vector<WordID>& src,
const vector<WordID>& trg,
- const RuleStatistics& info,
+ const RuleStatistics& /*info*/,
SparseVector<float>* result) const {
- RuleTuple r(-1, src, trg);
- //cerr << " ExtractFeatures() in:" << " " << r.m_hash << endl;
- int r_freq = rule_counts(r);
- cerr << " ExtractFeatures() count: " << r << " " << hash_value(r) << " " << info.counts.value(kCFE) << " | " << rule_counts(r) << endl;
- assert(r_freq == info.counts.value(kCFE));
+ RuleTuple r(lhs, src, trg);
+ double l_r_freq = log(rule_counts(r));
+ //result->set_value(fid_count, rule_counts(r));
+ //cerr << " ExtractFeatures() count: " << r << " " << info.counts.value(kCFE) << " | " << rule_counts(r) << endl;
+ //assert(l_r_freq == log(info.counts.value(kCFE)));
//cerr << " ExtractFeatures() after:" << " " << r.hash << endl;
//cerr << " ExtractFeatures() in:" << r << " " << r_freq << " " << hash_value(r) << endl;
//cerr << " ExtractFeatures() in:" << r << " " << r_freq << endl;
-// map_rule(r);
- //result->set_value(fid_fe, log(target_counts(r.target())) - log(r_freq));
- //result->set_value(fid_ef, log(source_counts(r.source())) - log(r_freq));
- result->set_value(fid_ef, target_counts(r.target()));
- result->set_value(fid_fe, r_freq);
- //result->set_value(fid_fe, (source_counts(r.source())));
+ normalise_string(r.target());
+ result->set_value(fid_fe, log(target_counts(r.target())) - l_r_freq);
+ normalise_string(r.source());
+ result->set_value(fid_ef, log(source_counts(r.source())) - l_r_freq);
+
+ //result->set_value(fid_e, target_counts(r.target()));
+ //result->set_value(fid_f, source_counts(r.source()));
}
- void map_rule(RuleTuple& r) const {
- vector<WordID> indexes; int i=0;
- for (vector<WordID>::iterator it = r.target().begin(); it != r.target().end(); ++it) {
- if (validate_non_terminal(TD::Convert(*it)))
- indexes.push_back(*it);
- }
- for (vector<WordID>::iterator it = r.source().begin(); it != r.source().end(); ++it) {
- if (validate_non_terminal(TD::Convert(*it)))
- *it = indexes.at(i++);
- }
+ void normalise_string(vector<WordID>& r) const {
+ vector<WordID> indexes;
+ for (vector<WordID>::iterator it = r.begin(); it != r.end(); ++it)
+ if (validate_non_terminal(TD::Convert(*it))) *it = -1;
}
const int fid_fe, fid_ef;
+ const int fid_count, fid_e, fid_f;
const int kCFE;
RuleFreqCount rule_counts;
FreqCount< vector<WordID> > source_counts, target_counts;