From d3e2ec203a5cf550320caa8023ac3dd103b0be7d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Oct 2014 00:42:37 -0400 Subject: new kenlm --- klm/lm/wrappers/nplm.cc | 90 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 klm/lm/wrappers/nplm.cc (limited to 'klm/lm/wrappers/nplm.cc') diff --git a/klm/lm/wrappers/nplm.cc b/klm/lm/wrappers/nplm.cc new file mode 100644 index 00000000..70622bd2 --- /dev/null +++ b/klm/lm/wrappers/nplm.cc @@ -0,0 +1,90 @@ +#include "lm/wrappers/nplm.hh" +#include "util/exception.hh" +#include "util/file.hh" + +#include + +#include + +#include "neuralLM.h" + +namespace lm { +namespace np { + +Vocabulary::Vocabulary(const nplm::vocabulary &vocab) + : base::Vocabulary(vocab.lookup_word(""), vocab.lookup_word(""), vocab.lookup_word("")), + vocab_(vocab), null_word_(vocab.lookup_word("")) {} + +Vocabulary::~Vocabulary() {} + +WordIndex Vocabulary::Index(const std::string &str) const { + return vocab_.lookup_word(str); +} + +bool Model::Recognize(const std::string &name) { + try { + util::scoped_fd file(util::OpenReadOrThrow(name.c_str())); + char magic_check[16]; + util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check)); + const char nnlm_magic[] = "\\config\nversion "; + return !memcmp(magic_check, nnlm_magic, 16); + } catch (const util::Exception &) { + return false; + } +} + +Model::Model(const std::string &file, std::size_t cache) + : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) { + UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile."); + // log10 compatible with backoff models. + base_instance_->set_log_base(10.0); + State begin_sentence, null_context; + std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("")); + null_word_ = base_instance_->lookup_word(""); + std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_); + + Init(begin_sentence, null_context, vocab_, base_instance_->get_order()); +} + +Model::~Model() {} + +FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const { + nplm::neuralLM *lm = backend_.get(); + if (!lm) { + lm = new nplm::neuralLM(*base_instance_); + backend_.reset(lm); + lm->set_cache(cache_size_); + } + // State is in natural word order. + FullScoreReturn ret; + for (int i = 0; i < lm->get_order() - 1; ++i) { + lm->staging_ngram()(i) = from.words[i]; + } + lm->staging_ngram()(lm->get_order() - 1) = new_word; + ret.prob = lm->lookup_from_staging(); + // Always say full order. + ret.ngram_length = lm->get_order(); + // Shift everything down by one. + memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2)); + out_state.words[lm->get_order() - 2] = new_word; + // Fill in trailing words with zeros so state comparison works. + memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order())); + return ret; +} + +// TODO: optimize with direct call? +FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const { + // State is in natural word order. The API here specifies reverse order. + std::size_t state_length = std::min(Order() - 1, context_rend - context_rbegin); + State state; + // Pad with null words. + for (lm::WordIndex *i = state.words; i < state.words + Order() - 1 - state_length; ++i) { + *i = null_word_; + } + // Put new words at the end. + std::reverse_copy(context_rbegin, context_rbegin + state_length, state.words + Order() - 1 - state_length); + return FullScore(state, new_word, out_state); +} + +} // namespace np +} // namespace lm -- cgit v1.2.3