From d04c0ca2d9df0e147239b18e90650ca8bd51d594 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 18 Jan 2011 15:55:40 -0500 Subject: new version of klm --- klm/lm/search_hashed.cc | 64 +++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 28 deletions(-) (limited to 'klm/lm/search_hashed.cc') diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc index 9cb662a6..9200aeb6 100644 --- a/klm/lm/search_hashed.cc +++ b/klm/lm/search_hashed.cc @@ -1,5 +1,6 @@ #include "lm/search_hashed.hh" +#include "lm/blank.hh" #include "lm/lm_exception.hh" #include "lm/read_arpa.hh" #include "lm/vocab.hh" @@ -13,34 +14,30 @@ namespace ngram { namespace { -/* All of the entropy is in low order bits and boost::hash does poorly with - * these. Odd numbers near 2^64 chosen by mashing on the keyboard. There is a - * stable point: 0. But 0 is which won't be queried here anyway. - */ -inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { - uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast(next) * 17894857484156487943ULL); - return ret; -} - -uint64_t ChainedWordHash(const WordIndex *word, const WordIndex *word_end) { - if (word == word_end) return 0; - uint64_t current = static_cast(*word); - for (++word; word != word_end; ++word) { - current = CombineWordHash(current, *word); - } - return current; -} - -template void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, Store &store) { +template void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, std::vector &middle, Store &store) { + ReadNGramHeader(f, n); + ProbBackoff blank; + blank.prob = kBlankProb; + blank.backoff = kBlankBackoff; // vocab ids of words in reverse order WordIndex vocab_ids[n]; + uint64_t keys[n - 1]; typename Store::Packing::Value value; + typename Middle::ConstIterator found; for (size_t i = 0; i < count; ++i) { ReadNGram(f, n, vocab, vocab_ids, value); - uint64_t key = ChainedWordHash(vocab_ids, vocab_ids + n); - store.Insert(Store::Packing::Make(key, value)); + keys[0] = detail::CombineWordHash(static_cast(*vocab_ids), vocab_ids[1]); + for (unsigned int h = 1; h < n - 1; ++h) { + keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]); + } + store.Insert(Store::Packing::Make(keys[n-2], value)); + // Go back and insert blanks. + for (int lower = n - 3; lower >= 0; --lower) { + if (middle[lower].Find(keys[lower], found)) break; + middle[lower].Insert(Middle::Packing::Make(keys[lower], blank)); + } } store.FinishedInserting(); @@ -49,17 +46,28 @@ template void ReadNGrams(util::FilePiece &f, const unsi } // namespace namespace detail { -template template void TemplateHashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &/*config*/, Voc &vocab) { +template template void TemplateHashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &config, Voc &vocab, Backing &backing) { + // TODO: fix sorted. + SetupMemory(GrowForSearch(config, HASH_PROBING, counts, Size(counts, config), backing), counts, config); + Read1Grams(f, counts[0], vocab, unigram.Raw()); - // Read the n-grams. - for (unsigned int n = 2; n < counts.size(); ++n) { - ReadNGrams(f, n, counts[n-1], vocab, middle[n-2]); + + try { + for (unsigned int n = 2; n < counts.size(); ++n) { + ReadNGrams(f, n, counts[n-1], vocab, middle, middle[n-2]); + } + ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle, longest); + } catch (util::ProbingSizeException &e) { + UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. Increase probing_multiplier (-p to build_binary) to add more blank spaces. "); } - ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, longest); } -template void TemplateHashedSearch::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector &counts, const Config &, ProbingVocabulary &vocab); -template void TemplateHashedSearch::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector &counts, const Config &, SortedVocabulary &vocab); +template void TemplateHashedSearch::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector &counts, const Config &, ProbingVocabulary &vocab, Backing &backing); +template void TemplateHashedSearch::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector &counts, const Config &, SortedVocabulary &vocab, Backing &backing); + +SortedHashedSearch::SortedHashedSearch() { + UTIL_THROW(util::Exception, "Sorted is broken at the moment, sorry"); +} } // namespace detail } // namespace ngram -- cgit v1.2.3