summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2011-09-13 09:45:01 +0100
committer Chris Dyer <cdyer@cs.cmu.edu> 2011-09-13 09:45:01 +0100
commit b09ca8a5e6f5e8c1840e51a93c9f8e6b8c4bcc33 (patch)
tree 158b29704922133b71dddb2bcfaf51b44b6bb8f3
parent af28b860c3f5d5b7c58feb16620853512c8454ad (diff)
add one more source syntax feature
-rw-r--r-- decoder/ff_source_syntax.cc 11
1 files changed, 10 insertions, 1 deletions
diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc
index 99acbd87..5b7c16f6 100644
--- a/decoder/ff_source_syntax.cc
+++ b/decoder/ff_source_syntax.cc
@@ -13,6 +13,13 @@ using namespace std;
// source trees must be represented in Penn Treebank format, e.g.
// (S (NP John) (VP (V left)))
+// Log-transform a span size so that long spans cluster together while differences are
+// still preserved: 0 maps to 0, otherwise truncated log base 1.39 of (span_size+1), minus 1.
+inline int SpanSizeTransform(unsigned span_size) {
+ if (!span_size) return 0;  // an empty span is mapped to 0 without taking a logarithm
+ return static_cast<int>(log(span_size+1) / log(1.39)) - 1;  // log(x)/log(1.39) is log base 1.39 of x; the cast truncates toward zero
+}
+
struct SourceSyntaxFeaturesImpl {
SourceSyntaxFeaturesImpl() {}
@@ -87,8 +94,10 @@ struct SourceSyntaxFeaturesImpl {
int& fid_ef = fids_ef(i,j)[&rule];
if (fid_ef <= 0) {
ostringstream os;
+ ostringstream os2;
os << "SYN:" << TD::Convert(lhs);
- fid_cat = FD::Convert(os.str());
+ os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i);
+ fid_cat = FD::Convert(os2.str());
os << ':';
unsigned ntc = 0;
for (unsigned k = 0; k < rule.f_.size(); ++k) {