diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-09-13 09:45:01 +0100 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-09-13 09:45:01 +0100 |
commit | b09ca8a5e6f5e8c1840e51a93c9f8e6b8c4bcc33 (patch) | |
tree | 158b29704922133b71dddb2bcfaf51b44b6bb8f3 | |
parent | af28b860c3f5d5b7c58feb16620853512c8454ad (diff) |
add one more source syntax feature
-rw-r--r-- | decoder/ff_source_syntax.cc | 11 |
1 files changed, 10 insertions, 1 deletions
diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index 99acbd87..5b7c16f6 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -13,6 +13,13 @@ using namespace std; // source trees must be represented in Penn Treebank format, e.g. // (S (NP John) (VP (V left))) +// log transform to make long spans cluster together +// but preserve differences +inline int SpanSizeTransform(unsigned span_size) { + if (!span_size) return 0; + return static_cast<int>(log(span_size+1) / log(1.39)) - 1; +} + struct SourceSyntaxFeaturesImpl { SourceSyntaxFeaturesImpl() {} @@ -87,8 +94,10 @@ struct SourceSyntaxFeaturesImpl { int& fid_ef = fids_ef(i,j)[&rule]; if (fid_ef <= 0) { ostringstream os; + ostringstream os2; os << "SYN:" << TD::Convert(lhs); - fid_cat = FD::Convert(os.str()); + os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); + fid_cat = FD::Convert(os2.str()); os << ':'; unsigned ntc = 0; for (unsigned k = 0; k < rule.f_.size(); ++k) { |