From 06718177056fe5262262e00d98dc89f67cefb193 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 21 Apr 2012 13:33:33 -0400 Subject: full feature set --- rst_parser/arc_ff.cc | 80 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index b2a87a7d..c4e5aa17 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -3,6 +3,7 @@ #include #include +#include "stringlib.h" #include "tdict.h" #include "fdict.h" #include "sentence_metadata.h" @@ -66,15 +67,30 @@ struct ArcFFImpl { v->set_value(FD::Convert(os.str()), 1); } + static void AddConjoin(const SparseVector& v, const string& feat, SparseVector* pf) { + for (SparseVector::const_iterator it = v.begin(); it != v.end(); ++it) + pf->set_value(FD::Convert(FD::Convert(it->first) + "_" + feat), it->second); + } + + static inline string Fixup(const string& str) { + string res = LowercaseString(str); + if (res.size() < 6) return res; + return res.substr(0, 5) + "*"; + } + + static inline string Suffix(const string& str) { + if (str.size() < 4) return ""; else return str.substr(str.size() - 3); + } + void EdgeFeatures(const TaggedSentence& sent, short h, short m, SparseVector* features) const { const bool is_root = (h == -1); + const string head_word = (is_root ? kROOT : Fixup(TD::Convert(sent.words[h]))); int num_words = sent.words.size(); - const string& head_word = (is_root ? kROOT : TD::Convert(sent.words[h])); const string& head_pos = (is_root ? kROOT : TD::Convert(sent.pos[h])); - const string& mod_word = TD::Convert(sent.words[m]); + const string mod_word = Fixup(TD::Convert(sent.words[m])); const string& mod_pos = TD::Convert(sent.pos[m]); const string& mod_pos_L = (m > 0 ? TD::Convert(sent.pos[m-1]) : kLEFT_POS); const string& mod_pos_R = (m < sent.pos.size() - 1 ? TD::Convert(sent.pos[m]) : kRIGHT_POS); @@ -82,42 +98,41 @@ struct ArcFFImpl { const string dir = (bdir ? "MLeft" : "MRight"); int v = m - h; if (v < 0) { - v= -1 - int(log(-v) / log(2)); + v= -1 - int(log(-v) / log(1.6)); } else { - v= int(log(v) / log(2)); + v= int(log(v) / log(1.6)) + 1; } ostringstream os; if (v < 0) os << "LenL" << -v; else os << "LenR" << v; const string lenstr = os.str(); + Fire(features, dir); + Fire(features, lenstr); + // dir, lenstr if (is_root) { - Fire(features, "ROOT", mod_pos); + Fire(features, "wROOT", mod_word); + Fire(features, "pROOT", mod_pos); + Fire(features, "wpROOT", mod_word, mod_pos); Fire(features, "DROOT", mod_pos, lenstr); Fire(features, "LROOT", mod_pos_L); Fire(features, "RROOT", mod_pos_R); Fire(features, "LROOT", mod_pos_L, mod_pos); Fire(features, "RROOT", mod_pos_R, mod_pos); Fire(features, "LDist", m); - Fire(features, "RDist", m - num_words); + Fire(features, "RDist", num_words - m); } else { // not root const string& head_pos_L = (h > 0 ? TD::Convert(sent.pos[h-1]) : kLEFT_POS); const string& head_pos_R = (h < sent.pos.size() - 1 ? TD::Convert(sent.pos[h]) : kRIGHT_POS); - Fire(features, lenstr); - Fire(features, "H", head_pos); - Fire(features, "M", mod_pos); - Fire(features, "HM", head_pos, mod_pos); - Fire(features, "HM", head_pos, mod_pos, dir); - Fire(features, "HM", head_pos, mod_pos, lenstr); - Fire(features, "LexH", head_word); - Fire(features, "LexM", mod_word); - Fire(features, "LexHM", head_word, mod_word); - Fire(features, "LexHM", head_word, mod_word, dir); - Fire(features, "LexHM", head_word, mod_word, lenstr); + SparseVector fv; + SparseVector* f = &fv; + Fire(f, "H", head_pos); + Fire(f, "M", mod_pos); + Fire(f, "HM", head_pos, mod_pos); + // surrounders - Fire(features, "posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); - Fire(features, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); - Fire(features, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); - Fire(features, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); - Fire(features, "lexRL", head_word, head_pos_L, mod_pos_L); + Fire(f, "posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); + Fire(f, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); + Fire(f, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); + Fire(f, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); // between features int left = min(h,m); @@ -126,11 +141,28 @@ struct ArcFFImpl { if (bdir) --right; else ++left; for (map >::const_iterator it = pcs.begin(); it != pcs.end(); ++it) { if (it->second[left] != it->second[right]) { - Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos); - Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos, dir); + Fire(f, "BT", head_pos, TD::Convert(it->first), mod_pos); } } } + + Fire(f, "wH", head_word); + Fire(f, "wM", mod_word); + Fire(f, "wpH", head_word, head_pos); + Fire(f, "wpM", mod_word, mod_pos); + Fire(f, "pHwM", head_pos, mod_word); + Fire(f, "wHpM", head_word, mod_pos); + + Fire(f, "wHM", head_word, mod_word); + Fire(f, "pHMwH", head_pos, mod_pos, head_word); + Fire(f, "pHMwM", head_pos, mod_pos, mod_word); + Fire(f, "wHMpH", head_word, mod_word, head_pos); + Fire(f, "wHMpM", head_word, mod_word, mod_pos); + Fire(f, "wHMpHM", head_word, mod_word, head_pos, mod_pos); + + AddConjoin(fv, dir, features); + AddConjoin(fv, lenstr, features); + (*features) += fv; } } }; -- cgit v1.2.3