From 783c57b2d3312738ddcf992ac55ff750afe7cb47 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 27 Jan 2014 17:42:19 -0800 Subject: KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591 --- klm/lm/model.cc | 84 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 32 deletions(-) (limited to 'klm/lm/model.cc') diff --git a/klm/lm/model.cc b/klm/lm/model.cc index a26654a6..a5a16bf8 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -34,23 +34,17 @@ template void GenericModel(start - static_cast(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast(base)) << " but Size says they should take " << goal_size); } -template GenericModel::GenericModel(const char *file, const Config &config) { - LoadLM(file, config, *this); - - // g++ prints warnings unless these are fully initialized. - State begin_sentence = State(); - begin_sentence.length = 1; - begin_sentence.words[0] = vocab_.BeginSentence(); - typename Search::Node ignored_node; - bool ignored_independent_left; - uint64_t ignored_extend_left; - begin_sentence.backoff[0] = search_.LookupUnigram(begin_sentence.words[0], ignored_node, ignored_independent_left, ignored_extend_left).Backoff(); - State null_context = State(); - null_context.length = 0; - P::Init(begin_sentence, null_context, vocab_, search_.Order()); +namespace { +void ComplainAboutARPA(const Config &config, ModelType model_type) { + if (config.write_mmap || !config.messages) return; + if (config.arpa_complain == Config::ALL) { + *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; + } else if (config.arpa_complain == Config::EXPENSIVE && + (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { + *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; + } } -namespace { void CheckCounts(const std::vector &counts) { UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); if (sizeof(uint64_t) > sizeof(std::size_t)) { @@ -59,18 +53,45 @@ void CheckCounts(const std::vector &counts) { } } } + } // namespace -template void GenericModel::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { - CheckCounts(params.counts); - SetupMemory(start, params.counts, config); - vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); - search_.LoadedBinary(); +template GenericModel::GenericModel(const char *file, const Config &init_config) : backing_(init_config) { + util::scoped_fd fd(util::OpenReadOrThrow(file)); + if (IsBinaryFormat(fd.get())) { + Parameters parameters; + int fd_shallow = fd.release(); + backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters); + CheckCounts(parameters.counts); + + Config new_config(init_config); + new_config.probing_multiplier = parameters.fixed.probing_multiplier; + Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config); + UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary."); + + SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config); + vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset()); + } else { + ComplainAboutARPA(init_config, kModelType); + InitializeFromARPA(fd.release(), file, init_config); + } + + // g++ prints warnings unless these are fully initialized. + State begin_sentence = State(); + begin_sentence.length = 1; + begin_sentence.words[0] = vocab_.BeginSentence(); + typename Search::Node ignored_node; + bool ignored_independent_left; + uint64_t ignored_extend_left; + begin_sentence.backoff[0] = search_.LookupUnigram(begin_sentence.words[0], ignored_node, ignored_independent_left, ignored_extend_left).Backoff(); + State null_context = State(); + null_context.length = 0; + P::Init(begin_sentence, null_context, vocab_, search_.Order()); } -template void GenericModel::InitializeFromARPA(const char *file, const Config &config) { - // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any. - util::FilePiece f(backing_.file.release(), file, config.ProgressMessages()); +template void GenericModel::InitializeFromARPA(int fd, const char *file, const Config &config) { + // Backing file is the ARPA. + util::FilePiece f(fd, file, config.ProgressMessages()); try { std::vector counts; // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. @@ -81,13 +102,17 @@ template void GenericModel(search_rebase), counts, config); } else { vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); @@ -99,18 +124,13 @@ template void GenericModel void GenericModel::UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { - util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); - Search::UpdateConfigFromBinary(fd, counts, config); -} - template FullScoreReturn GenericModel::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { -- cgit v1.2.3