summaryrefslogtreecommitdiff
path: root/decoder/ff_ngrams.cc
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2011-07-13 18:00:22 -0400
committer Chris Dyer <cdyer@cs.cmu.edu> 2011-07-13 18:00:22 -0400
commit 816bee82abc909335d4f3a300cff99afa4dd1da5 (patch)
tree f04d4b1aaa77885d3cf5ee479ab370e43df8d71c /decoder/ff_ngrams.cc
parent 34fdc73e613bbc30d59d7bd36c5db31a94a7ac68 (diff)
escape bad feature names
Diffstat (limited to 'decoder/ff_ngrams.cc')
-rw-r--r-- decoder/ff_ngrams.cc 23
1 files changed, 20 insertions, 3 deletions
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index d52667cd..04dd1906 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -46,6 +46,17 @@ struct State {
};
}
+namespace {  // unnamed namespace: Escape is visible only within this translation unit
+ string Escape(const string& x) {  // returns a copy of x with every '=' and ';' replaced by '_'
+ string y = x;  // edit a copy; the caller's string is left untouched
+ for (int i = 0; i < y.size(); ++i) {  // NOTE(review): signed int compared against unsigned size()
+ if (y[i] == '=') y[i]='_';  // presumably '=' is reserved in serialized feature names -- confirm
+ if (y[i] == ';') y[i]='_';  // presumably ';' is reserved as well -- confirm
+ }
+ return y;  // same length as x; all other characters are unchanged
+ }
+}  // namespace
+
class NgramDetectorImpl {
// returns the number of unscored words at the left edge of a span
@@ -114,11 +125,17 @@ class NgramDetectorImpl {
int& fid = ft->fids[curword];
++n;
if (!fid) {
- const char* code="_UBT456789";
+ const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.)
ostringstream os;
os << code[n] << ':';
- for (int i = n-1; i >= 0; --i)
- os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]);
+ for (int i = n-1; i >= 0; --i) {
+ os << (i != n-1 ? "_" : "");
+ const string& tok = TD::Convert(buf[i]);
+ if (tok.find('=') == string::npos)
+ os << tok;
+ else
+ os << Escape(tok);
+ }
fid = FD::Convert(os.str());
}
feats->set_value(fid, 1);