From b09ca8a5e6f5e8c1840e51a93c9f8e6b8c4bcc33 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 13 Sep 2011 09:45:01 +0100
Subject: add one more source syntax feature

---
 decoder/ff_source_syntax.cc | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc
index 99acbd87..5b7c16f6 100644
--- a/decoder/ff_source_syntax.cc
+++ b/decoder/ff_source_syntax.cc
@@ -13,6 +13,13 @@ using namespace std;
 // source trees must be represented in Penn Treebank format, e.g.
 // (S (NP John) (VP (V left)))
 
+// log transform to make long spans cluster together
+// but preserve differences
+inline int SpanSizeTransform(unsigned span_size) {
+  if (!span_size) return 0;
+  return static_cast<int>(log(span_size+1) / log(1.39)) - 1;
+}
+
 struct SourceSyntaxFeaturesImpl {
   SourceSyntaxFeaturesImpl() {}
 
@@ -87,8 +94,10 @@ struct SourceSyntaxFeaturesImpl {
         int& fid_ef = fids_ef(i,j)[&rule];
         if (fid_ef <= 0) {
           ostringstream os;
+          ostringstream os2;
           os << "SYN:" << TD::Convert(lhs);
-          fid_cat = FD::Convert(os.str());
+          os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i);
+          fid_cat = FD::Convert(os2.str());
           os << ':';
           unsigned ntc = 0;
           for (unsigned k = 0; k < rule.f_.size(); ++k) {
-- 
cgit v1.2.3