From 39e6cc773c7e723ac2a23be51b3c15ee3c9a70d5 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Wed, 20 Mar 2013 12:24:01 -0400
Subject: n-gram word class features

---
 decoder/ff_ngrams.cc | 68 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 5 deletions(-)

diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index 9c13fdbb..d337b28b 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -60,7 +60,7 @@ namespace {
   }
 }
 
-static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator) {
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file) {
   vector<string> const& argv=SplitOnWhitespace(in);
   *explicit_markers = false;
   *order = 3;
@@ -103,6 +103,10 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order,
         LMSPEC_NEXTARG;
         prefixes[5] = *i;
         break;
+      case 'c':
+        LMSPEC_NEXTARG;
+        *cluster_file = *i;
+        break;
       case 'S':
         LMSPEC_NEXTARG;
         target_separator = *i;
@@ -124,6 +128,7 @@ usage:
 
        << "NgramFeatures Usage: \n"
        << " feature_function=NgramFeatures filename.lm [-x] [-o <order>] \n"
+       << " [-c <cluster-file>]\n"
        << " [-U <unigram-prefix>] [-B <bigram-prefix>][-T <trigram-prefix>]\n"
        << " [-4 <4-gram-prefix>] [-5 <5-gram-prefix>] [-S <separator>]\n\n"
 
@@ -203,6 +208,12 @@ class NgramDetectorImpl {
     SetFlag(flag, HAS_FULL_CONTEXT, state);
   }
 
+  WordID MapToClusterIfNecessary(WordID w) const {
+    if (cluster_map.size() == 0) return w;
+    if (w >= cluster_map.size()) return kCDEC_UNK;
+    return cluster_map[w];
+  }
+
   void FireFeatures(const State<5>& state, WordID cur, SparseVector<double>* feats) {
     FidTree* ft = &fidroot_;
     int n = 0;
@@ -285,7 +296,7 @@ class NgramDetectorImpl {
           context_complete = true;
         }
       } else { // handle terminal
-        const WordID cur_word = e[j];
+        const WordID cur_word = MapToClusterIfNecessary(e[j]);
         SparseVector<double> p;
         if (cur_word == kSOS_) {
           state = BeginSentenceState();
@@ -348,9 +359,52 @@ class NgramDetectorImpl {
     }
   }
 
+  void ReadClusterFile(const string& clusters) {
+    ReadFile rf(clusters);
+    istream& in = *rf.stream();
+    string line;
+    int lc = 0;
+    string cluster;
+    string word;
+    while(getline(in, line)) {
+      ++lc;
+      if (line.size() == 0) continue;
+      if (line[0] == '#') continue;
+      unsigned cend = 1;
+      while((line[cend] != ' ' && line[cend] != '\t') && cend < line.size()) {
+        ++cend;
+      }
+      if (cend == line.size()) {
+        cerr << "Line " << lc << " in " << clusters << " malformed: " << line << endl;
+        abort();
+      }
+      unsigned wbeg = cend + 1;
+      while((line[wbeg] == ' ' || line[wbeg] == '\t') && wbeg < line.size()) {
+        ++wbeg;
+      }
+      if (wbeg == line.size()) {
+        cerr << "Line " << lc << " in " << clusters << " malformed: " << line << endl;
+        abort();
+      }
+      unsigned wend = wbeg + 1;
+      while((line[wend] != ' ' && line[wend] != '\t') && wend < line.size()) {
+        ++wend;
+      }
+      const WordID clusterid = TD::Convert(line.substr(0, cend));
+      const WordID wordid = TD::Convert(line.substr(wbeg, wend - wbeg));
+      if (wordid >= cluster_map.size())
+        cluster_map.resize(wordid + 10, kCDEC_UNK);
+      cluster_map[wordid] = clusterid;
+    }
+    cluster_map[kSOS_] = kSOS_;
+    cluster_map[kEOS_] = kEOS_;
+  }
+
+  vector<WordID> cluster_map;
+
  public:
   explicit NgramDetectorImpl(bool explicit_markers, unsigned order,
-      vector<string>& prefixes, string& target_separator) :
+      vector<string>& prefixes, string& target_separator, const string& clusters) :
       kCDEC_UNK(TD::Convert("<unk>")) ,
       add_sos_eos_(!explicit_markers) {
     order_ = order;
@@ -369,6 +423,9 @@ class NgramDetectorImpl {
     dummy_rule_.reset(new TRule("[DUMMY] ||| [BOS] [DUMMY] ||| [1] [2] ||| X=0"));
     kSOS_ = TD::Convert("<s>");
     kEOS_ = TD::Convert("</s>");
+
+    if (clusters.size())
+      ReadClusterFile(clusters);
   }
 
   ~NgramDetectorImpl() {
@@ -409,9 +466,10 @@ NgramDetector::NgramDetector(const string& param) {
   vector<string> prefixes;
   bool explicit_markers = false;
   unsigned order = 3;
-  ParseArgs(param, &explicit_markers, &order, prefixes, target_separator);
+  string clusters;
+  ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters);
   pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes,
-                                 target_separator);
+                                 target_separator, clusters);
   SetStateSize(pimpl_->ReserveStateSize());
 }
 
--
cgit v1.2.3