summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2011-07-06 23:32:53 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2011-07-06 23:32:53 -0400
commit75b814cb246052746134f32c723cf6d278b148df (patch)
tree7b2d8c08f7b90e835553b1acca09a9cf0a6b348e
parent3b004be48979da652cc64e7a01e685190eb79498 (diff)
better handling of ngram features
-rw-r--r--decoder/ff_ngrams.cc49
1 files changed, 27 insertions, 22 deletions
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index 54b394ae..d52667cd 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -103,27 +103,29 @@ class NgramDetectorImpl {
SetFlag(flag, HAS_FULL_CONTEXT, state);
}
- void FireFeatures(const State<5>& state, const WordID cur, SparseVector<double>* feats) {
- assert(order_ == 2);
- if (cur >= unimap_.size())
- unimap_.resize(cur + 10, 0);
- int& uf = unimap_[cur];
- if (!uf) {
- ostringstream os;
- os << "U:" << TD::Convert(cur);
- uf = FD::Convert(os.str());
- }
- feats->set_value(uf, 1.0);
- if (state.state[0]) {
- if (state.state[0] >= bimap_.size())
- bimap_.resize(state.state[0] + 10);
- int& bf = bimap_[state.state[0]][cur];
- if (!bf) {
+ void FireFeatures(const State<5>& state, WordID cur, SparseVector<double>* feats) {
+ FidTree* ft = &fidroot_;
+ int n = 0;
+ WordID buf[10];
+ int ci = order_ - 1;
+ WordID curword = cur;
+ while(curword) {
+ buf[n] = curword;
+ int& fid = ft->fids[curword];
+ ++n;
+ if (!fid) {
+ const char* code="_UBT456789";
ostringstream os;
- os << "B:" << TD::Convert(state[0]) << '_' << TD::Convert(cur);
- bf = FD::Convert(os.str());
+ os << code[n] << ':';
+ for (int i = n-1; i >= 0; --i)
+ os << (i != n-1 ? "_" : "") << TD::Convert(buf[i]);
+ fid = FD::Convert(os.str());
}
- feats->set_value(bf, 1.0);
+ feats->set_value(fid, 1);
+ ft = &ft->levels[curword];
+ --ci;
+ if (ci < 0) break;
+ curword = state[ci];
}
}
@@ -248,7 +250,7 @@ class NgramDetectorImpl {
explicit NgramDetectorImpl(bool explicit_markers) :
kCDEC_UNK(TD::Convert("<unk>")) ,
add_sos_eos_(!explicit_markers) {
- order_ = 2;
+ order_ = 3;
state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID);
unscored_size_offset_ = (order_ - 1) * sizeof(WordID);
is_complete_offset_ = unscored_size_offset_ + 1;
@@ -288,8 +290,11 @@ class NgramDetectorImpl {
char* dummy_state_;
vector<const void*> dummy_ants_;
TRulePtr dummy_rule_;
- mutable std::vector<int> unimap_; // [left][right]
- mutable std::vector<std::map<WordID, int> > bimap_; // [left][right]
+ struct FidTree {
+ map<WordID, int> fids;
+ map<WordID, FidTree> levels;
+ };
+ mutable FidTree fidroot_;
};
NgramDetector::NgramDetector(const string& param) {