diff options
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/ff_ngrams.cc | 23 | ||||
-rw-r--r-- | decoder/ff_spans.cc | 39 |
2 files changed, 45 insertions, 17 deletions
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index d52667cd..04dd1906 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -46,6 +46,17 @@ struct State { }; } +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + class NgramDetectorImpl { // returns the number of unscored words at the left edge of a span @@ -114,11 +125,17 @@ class NgramDetectorImpl { int& fid = ft->fids[curword]; ++n; if (!fid) { - const char* code="_UBT456789"; + const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.) ostringstream os; os << code[n] << ':'; - for (int i = n-1; i >= 0; --i) - os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]); + for (int i = n-1; i >= 0; --i) { + os << (i != n-1 ? "_" : ""); + const string& tok = TD::Convert(buf[i]); + if (tok.find('=') == string::npos) + os << tok; + else + os << Escape(tok); + } fid = FD::Convert(os.str()); } feats->set_value(fid, 1); diff --git a/decoder/ff_spans.cc b/decoder/ff_spans.cc index e1da088d..bc23974d 100644 --- a/decoder/ff_spans.cc +++ b/decoder/ff_spans.cc @@ -13,6 +13,17 @@ using namespace std; +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + // log transform to make long spans cluster together // but preserve differences int SpanSizeTransform(unsigned span_size) { @@ -140,19 +151,19 @@ void SpanFeatures::PrepareForInput(const SentenceMetadata& smeta) { word = MapIfNecessary(word); ostringstream sfid; sfid << "ES:" << TD::Convert(word); - end_span_ids_[i] = FD::Convert(sfid.str()); + end_span_ids_[i] = FD::Convert(Escape(sfid.str())); ostringstream esbiid; esbiid << "EBI:" << TD::Convert(bword) << "_" << TD::Convert(word); - end_bigram_ids_[i] = FD::Convert(esbiid.str()); + end_bigram_ids_[i] = FD::Convert(Escape(esbiid.str())); ostringstream bsbiid; bsbiid << "BBI:" << TD::Convert(bword) << "_" << TD::Convert(word); - beg_bigram_ids_[i] = FD::Convert(bsbiid.str()); + beg_bigram_ids_[i] = FD::Convert(Escape(bsbiid.str())); ostringstream bfid; bfid << "BS:" << TD::Convert(bword); - beg_span_ids_[i] = FD::Convert(bfid.str()); + beg_span_ids_[i] = FD::Convert(Escape(bfid.str())); if (use_collapsed_features_) { - end_span_vals_[i] = feat2val_[sfid.str()] + feat2val_[esbiid.str()]; - beg_span_vals_[i] = feat2val_[bfid.str()] + feat2val_[bsbiid.str()]; + end_span_vals_[i] = feat2val_[Escape(sfid.str())] + feat2val_[Escape(esbiid.str())]; + beg_span_vals_[i] = feat2val_[Escape(bfid.str())] + feat2val_[Escape(bsbiid.str())]; } } for (int i = 0; i <= lattice.size(); ++i) { @@ -167,16 +178,16 @@ void SpanFeatures::PrepareForInput(const SentenceMetadata& smeta) { word = MapIfNecessary(word); ostringstream pf; pf << "S:" << TD::Convert(bword) << "_" << TD::Convert(word); - span_feats_(i,j).first = FD::Convert(pf.str()); - span_feats_(i,j).second = FD::Convert("S_" + pf.str()); + span_feats_(i,j).first = FD::Convert(Escape(pf.str())); + span_feats_(i,j).second = FD::Convert(Escape("S_" + pf.str())); ostringstream lf; const unsigned span_size = (i < j ? j - i : i - j); lf << "LS:" << SpanSizeTransform(span_size) << "_" << TD::Convert(bword) << "_" << TD::Convert(word); - len_span_feats_(i,j).first = FD::Convert(lf.str()); - len_span_feats_(i,j).second = FD::Convert("S_" + lf.str()); + len_span_feats_(i,j).first = FD::Convert(Escape(lf.str())); + len_span_feats_(i,j).second = FD::Convert(Escape("S_" + lf.str())); if (use_collapsed_features_) { - span_vals_(i,j).first = feat2val_[pf.str()] + feat2val_[lf.str()]; - span_vals_(i,j).second = feat2val_["S_" + pf.str()] + feat2val_["S_" + lf.str()]; + span_vals_(i,j).first = feat2val_[Escape(pf.str())] + feat2val_[Escape(lf.str())]; + span_vals_(i,j).second = feat2val_[Escape("S_" + pf.str())] + feat2val_[Escape("S_" + lf.str())]; } } } @@ -209,14 +220,14 @@ void RuleNgramFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, const string& cur = TD::Convert(w); ostringstream os; os << "RB:" << prev << '_' << cur; - const int fid = FD::Convert(os.str()); + const int fid = FD::Convert(Escape(os.str())); if (fid <= 0) return; f.add_value(fid, 1.0); prev = cur; } ostringstream os; os << "RB:" << prev << '_' << "</r>"; - f.set_value(FD::Convert(os.str()), 1.0); + f.set_value(FD::Convert(Escape(os.str())), 1.0); } (*features) += it->second; } |