diff options
Diffstat (limited to 'rst_parser')
| -rw-r--r-- | rst_parser/arc_ff.cc | 80 | 
1 files changed, 56 insertions, 24 deletions
| diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index b2a87a7d..c4e5aa17 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -3,6 +3,7 @@  #include <iostream>  #include <sstream> +#include "stringlib.h"  #include "tdict.h"  #include "fdict.h"  #include "sentence_metadata.h" @@ -66,15 +67,30 @@ struct ArcFFImpl {      v->set_value(FD::Convert(os.str()), 1);    } +  static void AddConjoin(const SparseVector<double>& v, const string& feat, SparseVector<double>* pf) { +    for (SparseVector<double>::const_iterator it = v.begin(); it != v.end(); ++it) +      pf->set_value(FD::Convert(FD::Convert(it->first) + "_" + feat), it->second); +  } + +  static inline string Fixup(const string& str) { +    string res = LowercaseString(str); +    if (res.size() < 6) return res; +    return res.substr(0, 5) + "*"; +  } + +  static inline string Suffix(const string& str) { +    if (str.size() < 4) return ""; else return str.substr(str.size() - 3); +  } +    void EdgeFeatures(const TaggedSentence& sent,                      short h,                      short m,                      SparseVector<weight_t>* features) const {      const bool is_root = (h == -1); +    const string head_word = (is_root ? kROOT : Fixup(TD::Convert(sent.words[h])));      int num_words = sent.words.size(); -    const string& head_word = (is_root ? kROOT : TD::Convert(sent.words[h]));      const string& head_pos = (is_root ? kROOT : TD::Convert(sent.pos[h])); -    const string& mod_word = TD::Convert(sent.words[m]); +    const string mod_word = Fixup(TD::Convert(sent.words[m]));      const string& mod_pos = TD::Convert(sent.pos[m]);      const string& mod_pos_L = (m > 0 ? TD::Convert(sent.pos[m-1]) : kLEFT_POS);      const string& mod_pos_R = (m < sent.pos.size() - 1 ? TD::Convert(sent.pos[m]) : kRIGHT_POS); @@ -82,42 +98,41 @@ struct ArcFFImpl {      const string dir = (bdir ? "MLeft" : "MRight");      int v = m - h;      if (v < 0) { -      v= -1 - int(log(-v) / log(2)); +      v= -1 - int(log(-v) / log(1.6));      } else { -      v= int(log(v) / log(2)); +      v= int(log(v) / log(1.6)) + 1;      }      ostringstream os;      if (v < 0) os << "LenL" << -v; else os << "LenR" << v;      const string lenstr = os.str(); +    Fire(features, dir); +    Fire(features, lenstr); +    // dir, lenstr      if (is_root) { -      Fire(features, "ROOT", mod_pos); +      Fire(features, "wROOT", mod_word); +      Fire(features, "pROOT", mod_pos); +      Fire(features, "wpROOT", mod_word, mod_pos);        Fire(features, "DROOT", mod_pos, lenstr);        Fire(features, "LROOT", mod_pos_L);        Fire(features, "RROOT", mod_pos_R);        Fire(features, "LROOT", mod_pos_L, mod_pos);        Fire(features, "RROOT", mod_pos_R, mod_pos);        Fire(features, "LDist", m); -      Fire(features, "RDist", m - num_words); +      Fire(features, "RDist", num_words - m);      } else { // not root        const string& head_pos_L = (h > 0 ? TD::Convert(sent.pos[h-1]) : kLEFT_POS);        const string& head_pos_R = (h < sent.pos.size() - 1 ? TD::Convert(sent.pos[h]) : kRIGHT_POS); -      Fire(features, lenstr); -      Fire(features, "H", head_pos); -      Fire(features, "M", mod_pos); -      Fire(features, "HM", head_pos, mod_pos); -      Fire(features, "HM", head_pos, mod_pos, dir); -      Fire(features, "HM", head_pos, mod_pos, lenstr); -      Fire(features, "LexH", head_word); -      Fire(features, "LexM", mod_word); -      Fire(features, "LexHM", head_word, mod_word); -      Fire(features, "LexHM", head_word, mod_word, dir); -      Fire(features, "LexHM", head_word, mod_word, lenstr); +      SparseVector<double> fv; +      SparseVector<double>* f = &fv; +      Fire(f, "H", head_pos); +      Fire(f, "M", mod_pos); +      Fire(f, "HM", head_pos, mod_pos); +        // surrounders -      Fire(features, "posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); -      Fire(features, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); -      Fire(features, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); -      Fire(features, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); -      Fire(features, "lexRL", head_word, head_pos_L, mod_pos_L); +      Fire(f, "posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); +      Fire(f, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); +      Fire(f, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); +      Fire(f, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L);        // between features        int left = min(h,m); @@ -126,11 +141,28 @@ struct ArcFFImpl {          if (bdir) --right; else ++left;          for (map<WordID, vector<int> >::const_iterator it = pcs.begin(); it != pcs.end(); ++it) {            if (it->second[left] != it->second[right]) { -            Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos); -            Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos, dir); +            Fire(f, "BT", head_pos, TD::Convert(it->first), mod_pos);            }          }        } + +      Fire(f, "wH", head_word); +      Fire(f, "wM", mod_word); +      Fire(f, "wpH", head_word, head_pos); +      Fire(f, "wpM", mod_word, mod_pos); +      Fire(f, "pHwM", head_pos, mod_word); +      Fire(f, "wHpM", head_word, mod_pos); + +      Fire(f, "wHM", head_word, mod_word); +      Fire(f, "pHMwH", head_pos, mod_pos, head_word); +      Fire(f, "pHMwM", head_pos, mod_pos, mod_word); +      Fire(f, "wHMpH", head_word, mod_word, head_pos); +      Fire(f, "wHMpM", head_word, mod_word, mod_pos); +      Fire(f, "wHMpHM", head_word, mod_word, head_pos, mod_pos); + +      AddConjoin(fv, dir, features); +      AddConjoin(fv, lenstr, features); +      (*features) += fv;      }    }  }; | 
