From 6872cde637b0ee6d308358e5618292da7efba512 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 8 Mar 2011 15:17:28 -0500 Subject: support for class based LMs (without emission probs) --- decoder/ff_klm.cc | 96 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 14 deletions(-) diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index 854653c3..446485f6 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -1,7 +1,11 @@ #include "ff_klm.h" #include +#include +#include + +#include "filelib.h" #include "stringlib.h" #include "hg.h" #include "tdict.h" @@ -15,10 +19,11 @@ static const unsigned char MASK = 7; // -x : rules include and // -n NAME : feature id is NAME -bool ParseLMArgs(string const& in, string* filename, bool* explicit_markers, string* featname) { +bool ParseLMArgs(string const& in, string* filename, string* mapfile, bool* explicit_markers, string* featname) { vector const& argv=SplitOnWhitespace(in); *explicit_markers = true; *featname="LanguageModel"; + *mapfile = ""; #define LMSPEC_NEXTARG if (i==argv.end()) { \ cerr << "Missing argument for "<<*last<<". "; goto usage; \ } else { ++i; } @@ -31,6 +36,9 @@ bool ParseLMArgs(string const& in, string* filename, bool* explicit_markers, str case 'x': *explicit_markers = true; break; + case 'm': + LMSPEC_NEXTARG; *mapfile=*i; + break; case 'n': LMSPEC_NEXTARG; *featname=*i; break; @@ -182,7 +190,9 @@ class KLanguageModelImpl { context_complete = true; } } else { // handle terminal - const lm::WordIndex cur_word = MapWord(e[j]); + const WordID cdec_word_or_class = ClassifyWordIfNecessary(e[j]); // in future, + // maybe handle emission + const lm::WordIndex cur_word = MapWord(cdec_word_or_class); // map to LM's id double p = 0; const bool is_oov = (cur_word == 0); if (cur_word == kSOS_) { @@ -248,22 +258,38 @@ class KLanguageModelImpl { } } + // if this is not a class-based LM, returns w untransformed, + // otherwise returns a word class mapping of w, + // returns TD::Convert("") if there is no mapping for w + WordID ClassifyWordIfNecessary(WordID w) const { + if (word2class_map_.empty()) return w; + if (w >= word2class_map_.size()) + return kCDEC_UNK; + else + return word2class_map_[w]; + } + + // converts to cdec word id's to KenLM's id space, OOVs and end up at 0 lm::WordIndex MapWord(WordID w) const { - if (w >= map_.size()) + if (w >= cdec2klm_map_.size()) return 0; else - return map_[w]; + return cdec2klm_map_[w]; } public: - KLanguageModelImpl(const string& filename, bool explicit_markers) : + KLanguageModelImpl(const string& filename, const string& mapfile, bool explicit_markers) : + kCDEC_UNK(TD::Convert("")) , add_sos_eos_(!explicit_markers) { - lm::ngram::Config conf; - VMapper vm(&map_); - conf.enumerate_vocab = &vm; - ngram_ = new Model(filename.c_str(), conf); + if (true) { + boost::scoped_ptr vm; + vm.reset(new VMapper(&cdec2klm_map_)); + lm::ngram::Config conf; + conf.enumerate_vocab = vm.get(); + ngram_ = new Model(filename.c_str(), conf); + } order_ = ngram_->Order(); - cerr << "Loaded " << order_ << "-gram KLM from " << filename << " (MapSize=" << map_.size() << ")\n"; + cerr << "Loaded " << order_ << "-gram KLM from " << filename << " (MapSize=" << cdec2klm_map_.size() << ")\n"; state_size_ = ngram_->StateSize() + 2 + (order_ - 1) * sizeof(lm::WordIndex); unscored_size_offset_ = ngram_->StateSize(); is_complete_offset_ = unscored_size_offset_ + 1; @@ -278,6 +304,46 @@ class KLanguageModelImpl { assert(kSOS_ > 0); kEOS_ = MapWord(TD::Convert("")); assert(kEOS_ > 0); + assert(MapWord(kCDEC_UNK) == 0); // KenLM invariant + + // handle class-based LMs (unambiguous word->class mapping reqd.) + if (mapfile.size()) + LoadWordClasses(mapfile); + } + + void LoadWordClasses(const string& file) { + ReadFile rf(file); + istream& in = *rf.stream(); + string line; + vector dummy; + int lc = 0; + cerr << " Loading word classes from " << file << " ...\n"; + AddWordToClassMapping_(TD::Convert(""), TD::Convert("")); + AddWordToClassMapping_(TD::Convert(""), TD::Convert("")); + while(in) { + getline(in, line); + if (!in) continue; + dummy.clear(); + TD::ConvertSentence(line, &dummy); + ++lc; + if (dummy.size() != 2) { + cerr << " Format error in " << file << ", line " << lc << ": " << line << endl; + abort(); + } + AddWordToClassMapping_(dummy[0], dummy[1]); + } + } + + void AddWordToClassMapping_(WordID word, WordID cls) { + if (word2class_map_.size() <= word) { + word2class_map_.resize((word + 10) * 1.1, kCDEC_UNK); + assert(word2class_map_.size() > word); + } + if(word2class_map_[word] == kCDEC_UNK) { + cerr << "Multiple classes for symbol " << TD::Convert(word) << endl; + abort(); + } + word2class_map_[word] = cls; } ~KLanguageModelImpl() { @@ -288,6 +354,7 @@ class KLanguageModelImpl { int ReserveStateSize() const { return state_size_; } private: + const WordID kCDEC_UNK; lm::WordIndex kSOS_; // - requires special handling. lm::WordIndex kEOS_; // Model* ngram_; @@ -304,18 +371,19 @@ class KLanguageModelImpl { int unscored_words_offset_; char* dummy_state_; vector dummy_ants_; - vector map_; + vector cdec2klm_map_; + vector word2class_map_; // if this is a class-based LM, this is the word->class mapping TRulePtr dummy_rule_; }; template KLanguageModel::KLanguageModel(const string& param) { - string filename, featname; + string filename, mapfile, featname; bool explicit_markers; - if (!ParseLMArgs(param, &filename, &explicit_markers, &featname)) { + if (!ParseLMArgs(param, &filename, &mapfile, &explicit_markers, &featname)) { abort(); } - pimpl_ = new KLanguageModelImpl(filename, explicit_markers); + pimpl_ = new KLanguageModelImpl(filename, mapfile, explicit_markers); fid_ = FD::Convert(featname); oov_fid_ = FD::Convert(featname+"_OOV"); SetStateSize(pimpl_->ReserveStateSize()); -- cgit v1.2.3