diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-13 18:00:22 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-13 18:00:22 -0400 |
commit | 73284fa32176da9f45953055c12951cb69395a02 (patch) | |
tree | cb91b410c6b1dc070841754429749cfe3b19bc52 | |
parent | 9b469ea153e5ae63f4524a71caf3c4518e5f775d (diff) |
escape bad feature names
-rw-r--r-- | decoder/ff_ngrams.cc | 23 |
1 files changed, 20 insertions, 3 deletions
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index d52667cd..04dd1906 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -46,6 +46,17 @@ struct State { }; } +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + class NgramDetectorImpl { // returns the number of unscored words at the left edge of a span @@ -114,11 +125,17 @@ class NgramDetectorImpl { int& fid = ft->fids[curword]; ++n; if (!fid) { - const char* code="_UBT456789"; + const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.) ostringstream os; os << code[n] << ':'; - for (int i = n-1; i >= 0; --i) - os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]); + for (int i = n-1; i >= 0; --i) { + os << (i != n-1 ? "_" : ""); + const string& tok = TD::Convert(buf[i]); + if (tok.find('=') == string::npos) + os << tok; + else + os << Escape(tok); + } fid = FD::Convert(os.str()); } feats->set_value(fid, 1); |