diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-13 18:00:22 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-13 18:00:22 -0400 |
commit | 816bee82abc909335d4f3a300cff99afa4dd1da5 (patch) | |
tree | f04d4b1aaa77885d3cf5ee479ab370e43df8d71c /decoder/ff_ngrams.cc | |
parent | 34fdc73e613bbc30d59d7bd36c5db31a94a7ac68 (diff) |
escape bad feature names
Diffstat (limited to 'decoder/ff_ngrams.cc')
-rw-r--r-- | decoder/ff_ngrams.cc | 23 |
1 files changed, 20 insertions, 3 deletions
```diff
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index d52667cd..04dd1906 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -46,6 +46,17 @@ struct State {
 };
 }
 
+namespace {
+  string Escape(const string& x) {
+    string y = x;
+    for (int i = 0; i < y.size(); ++i) {
+      if (y[i] == '=') y[i]='_';
+      if (y[i] == ';') y[i]='_';
+    }
+    return y;
+  }
+}
+
 class NgramDetectorImpl {
 
   // returns the number of unscored words at the left edge of a span
@@ -114,11 +125,17 @@ class NgramDetectorImpl {
       int& fid = ft->fids[curword];
       ++n;
       if (!fid) {
-        const char* code="_UBT456789";
+        const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.)
         ostringstream os;
         os << code[n] << ':';
-        for (int i = n-1; i >= 0; --i)
-          os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]);
+        for (int i = n-1; i >= 0; --i) {
+          os << (i != n-1 ? "_" : "");
+          const string& tok = TD::Convert(buf[i]);
+          if (tok.find('=') == string::npos)
+            os << tok;
+          else
+            os << Escape(tok);
+        }
         fid = FD::Convert(os.str());
       }
       feats->set_value(fid, 1);
```