From 816bee82abc909335d4f3a300cff99afa4dd1da5 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 13 Jul 2011 18:00:22 -0400 Subject: escape bad feature names --- decoder/ff_ngrams.cc | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index d52667cd..04dd1906 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -46,6 +46,17 @@ struct State { }; } +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + class NgramDetectorImpl { // returns the number of unscored words at the left edge of a span @@ -114,11 +125,17 @@ class NgramDetectorImpl { int& fid = ft->fids[curword]; ++n; if (!fid) { - const char* code="_UBT456789"; + const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.) ostringstream os; os << code[n] << ':'; - for (int i = n-1; i >= 0; --i) - os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]); + for (int i = n-1; i >= 0; --i) { + os << (i != n-1 ? "_" : ""); + const string& tok = TD::Convert(buf[i]); + if (tok.find('=') == string::npos) + os << tok; + else + os << Escape(tok); + } fid = FD::Convert(os.str()); } feats->set_value(fid, 1); -- cgit v1.2.3