summary | refs | log | tree | commit | diff
path: root/decoder
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2011-09-13 09:45:01 +0100
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-09-13 09:45:01 +0100
commit: c41704e876930311539f0cfb5f5125f3401d08ae (patch)
tree: 4fdb0a7576fdfd723d1294957e3546ba6b3a1503 /decoder
parent: d63a32c5d57b2ab5c016c5988db4bf5b374dcba4 (diff)
add one more source syntax feature
Diffstat (limited to 'decoder')
-rw-r--r-- decoder/ff_source_syntax.cc 11
1 files changed, 10 insertions, 1 deletions
diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc
index 99acbd87..5b7c16f6 100644
--- a/decoder/ff_source_syntax.cc
+++ b/decoder/ff_source_syntax.cc
@@ -13,6 +13,13 @@ using namespace std;
// source trees must be represented in Penn Treebank format, e.g.
// (S (NP John) (VP (V left)))
+// log transform to make long spans cluster together
+// but preserve differences
+inline int SpanSizeTransform(unsigned span_size) {  // maps a span length to a logarithmic (base 1.39) bucket index
+ if (!span_size) return 0;  // guard: without it a zero-length span would give log(1)/log(1.39) - 1 == -1
+ return static_cast<int>(log(span_size+1) / log(1.39)) - 1;  // truncated log base 1.39 of (span_size+1), minus 1; e.g. 1->1, 2->2, 3->3, 4->3
+}
+
struct SourceSyntaxFeaturesImpl {
SourceSyntaxFeaturesImpl() {}
@@ -87,8 +94,10 @@ struct SourceSyntaxFeaturesImpl {
int& fid_ef = fids_ef(i,j)[&rule];
if (fid_ef <= 0) {
ostringstream os;
+ ostringstream os2;
os << "SYN:" << TD::Convert(lhs);
- fid_cat = FD::Convert(os.str());
+ os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i);
+ fid_cat = FD::Convert(os2.str());
os << ':';
unsigned ntc = 0;
for (unsigned k = 0; k < rule.f_.size(); ++k) {