From 3106cf8eca76df8b46d139b8f5ce5002200d660d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 24 Oct 2011 18:17:24 +0100 Subject: KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative. --- BUILDING | 2 +- decoder/ff_csplit.cc | 2 +- decoder/ff_klm.cc | 2 +- klm/lm/config.hh | 6 ++- klm/lm/enumerate_vocab.hh | 2 - klm/lm/left.hh | 8 +-- klm/lm/model.cc | 4 +- klm/lm/read_arpa.cc | 2 + klm/lm/search_hashed.cc | 16 +++--- klm/lm/search_trie.cc | 2 +- klm/lm/sri.cc | 108 ---------------------------------------- klm/lm/sri.hh | 102 ------------------------------------- klm/lm/vocab.hh | 2 +- klm/util/mmap.cc | 2 +- klm/util/murmur_hash.cc | 15 ++++-- klm/util/tokenize_piece.hh | 75 ++++++++++++++++++++++++++++ klm/util/tokenize_piece_test.cc | 94 ++++++++++++++++++++++++++++++++++ training/augment_grammar.cc | 2 +- training/test_ngram.cc | 2 +- 19 files changed, 208 insertions(+), 240 deletions(-) delete mode 100644 klm/lm/sri.cc delete mode 100644 klm/lm/sri.hh create mode 100644 klm/util/tokenize_piece_test.cc diff --git a/BUILDING b/BUILDING index b7535d70..c7b954c7 100644 --- a/BUILDING +++ b/BUILDING @@ -33,7 +33,7 @@ Instructions for building If you're building on cygwin, their libtool is buggy; this make command works for now: - make LIBS+="-loolm -ldstruct -lmisc -lz -lboost_program_options" \ + make LIBS+="-lz -lboost_program_options" \ CFLAGS+="-Wno-sign-compare" 5) Test diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index dee6f4f9..3991d38f 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -155,7 +155,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl( } namespace { -struct CSVMapper : public lm::ngram::EnumerateVocab { +struct CSVMapper : public lm::EnumerateVocab { CSVMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index ed6f731e..a4b26f7c 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -70,7 +70,7 @@ string KLanguageModel::usage(bool /*param*/,bool /*verbose*/) { namespace { -struct VMapper : public lm::ngram::EnumerateVocab { +struct VMapper : public lm::EnumerateVocab { VMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); diff --git a/klm/lm/config.hh b/klm/lm/config.hh index 227b8512..8564661b 100644 --- a/klm/lm/config.hh +++ b/klm/lm/config.hh @@ -8,10 +8,12 @@ /* Configuration for ngram model. Separate header to reduce pollution. */ -namespace lm { namespace ngram { - +namespace lm { + class EnumerateVocab; +namespace ngram { + struct Config { // EFFECTIVE FOR BOTH ARPA AND BINARY READS diff --git a/klm/lm/enumerate_vocab.hh b/klm/lm/enumerate_vocab.hh index e734316b..27263621 100644 --- a/klm/lm/enumerate_vocab.hh +++ b/klm/lm/enumerate_vocab.hh @@ -5,7 +5,6 @@ #include "util/string_piece.hh" namespace lm { -namespace ngram { /* If you need the actual strings in the vocabulary, inherit from this class * and implement Add. 
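 * For example, a minimal subclass could look like the sketch below (VocabPrinter
 * is a hypothetical name, shown only to illustrate the interface; the Add
 * signature is the one this header declares):
 *
 *   class VocabPrinter : public lm::EnumerateVocab {
 *    public:
 *     void Add(lm::WordIndex index, const StringPiece &str) {
 *       std::cout << index << '\t' << str << '\n';
 *     }
 *   };
 *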
Then put a pointer in Config.enumerate_vocab; it does @@ -23,7 +22,6 @@ class EnumerateVocab { EnumerateVocab() {} }; -} // namespace ngram } // namespace lm #endif // LM_ENUMERATE_VOCAB__ diff --git a/klm/lm/left.hh b/klm/lm/left.hh index bb3f5539..15464c82 100644 --- a/klm/lm/left.hh +++ b/klm/lm/left.hh @@ -176,16 +176,18 @@ template class RuleScore { float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1]; float *back = backoffs, *back2 = backoffs2; unsigned char next_use; - FullScoreReturn ret; - ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use)); + + // First word + ProcessRet(model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use)); if (!next_use) { left_done_ = true; out_.right = in.right; return; } + // Words after the first, so extending a bigram to begin with unsigned char extend_length = 2; for (const uint64_t *i = in.left.pointers + 1; i < in.left.pointers + in.left.length; ++i, ++extend_length) { - ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use)); + ProcessRet(model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use)); if (!next_use) { left_done_ = true; out_.right = in.right; diff --git a/klm/lm/model.cc b/klm/lm/model.cc index 25f1ab7c..e4c1ec1d 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -91,8 +91,8 @@ template void GenericModel FullScoreReturn GenericModel::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); - if (ret.ngram_length - 1 < in_state.length) { - ret.prob = std::accumulate(in_state.backoff + ret.ngram_length - 1, in_state.backoff + in_state.length, ret.prob); + for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { + ret.prob += *i; } return ret; } diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index 455bc4ba..dce73f77 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -38,6 +38,8 @@ void ReadARPACounts(util::FilePiece &in, std::vector &number) { } if (static_cast(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic) UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?"); + UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?"); + UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. 
Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst."); UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\."); } while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc index 334adf12..247832b0 100644 --- a/klm/lm/search_hashed.cc +++ b/klm/lm/search_hashed.cc @@ -87,14 +87,14 @@ template void ReadNGrams( ReadNGramHeader(f, n); // vocab ids of words in reverse order - WordIndex vocab_ids[n]; - uint64_t keys[n - 1]; + std::vector vocab_ids(n); + std::vector keys(n-1); typename Store::Packing::Value value; typename Middle::MutableIterator found; for (size_t i = 0; i < count; ++i) { - ReadNGram(f, n, vocab, vocab_ids, value, warn); + ReadNGram(f, n, vocab, &*vocab_ids.begin(), value, warn); - keys[0] = detail::CombineWordHash(static_cast(*vocab_ids), vocab_ids[1]); + keys[0] = detail::CombineWordHash(static_cast(vocab_ids.front()), vocab_ids[1]); for (unsigned int h = 1; h < n - 1; ++h) { keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]); } @@ -106,9 +106,9 @@ template void ReadNGrams( util::FloatEnc fix_prob; for (lower = n - 3; ; --lower) { if (lower == -1) { - fix_prob.f = unigrams[vocab_ids[0]].prob; + fix_prob.f = unigrams[vocab_ids.front()].prob; fix_prob.i &= ~util::kSignBit; - unigrams[vocab_ids[0]].prob = fix_prob.f; + unigrams[vocab_ids.front()].prob = fix_prob.f; break; } if (middle[lower].UnsafeMutableFind(keys[lower], found)) { @@ -120,8 +120,8 @@ template void ReadNGrams( break; } } - if (lower != static_cast(n) - 3) FixSRI(lower, fix_prob.f, n, keys, vocab_ids, unigrams, middle); - activate(vocab_ids, n); + if (lower != static_cast(n) - 3) FixSRI(lower, fix_prob.f, n, &*keys.begin(), &*vocab_ids.begin(), unigrams, middle); + activate(&*vocab_ids.begin(), n); } store.FinishedInserting(); diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 5d8c70db..e3cf9547 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -493,7 +493,7 @@ template void BuildTrie(const std::string &file_pre util::scoped_FILE unigram_file; { std::string name(file_prefix + "unigrams"); - unigram_file.reset(OpenOrThrow(name.c_str(), "r")); + unigram_file.reset(OpenOrThrow(name.c_str(), "r+")); util::RemoveOrThrow(name.c_str()); } sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); diff --git a/klm/lm/sri.cc b/klm/lm/sri.cc deleted file mode 100644 index 825f699b..00000000 --- a/klm/lm/sri.cc +++ /dev/null @@ -1,108 +0,0 @@ -#include "lm/lm_exception.hh" -#include "lm/sri.hh" - -#include -#include - -#include - -namespace lm { -namespace sri { - -Vocabulary::Vocabulary() : sri_(new Vocab) {} - -Vocabulary::~Vocabulary() {} - -WordIndex Vocabulary::Index(const char *str) const { - WordIndex ret = sri_->getIndex(str); - // NGram wants the index of Vocab_Unknown for unknown words, but for some reason SRI returns Vocab_None here :-(. 
- if (ret == Vocab_None) { - return not_found_; - } else { - return ret; - } -} - -const char *Vocabulary::Word(WordIndex index) const { - return sri_->getWord(index); -} - -void Vocabulary::FinishedLoading() { - SetSpecial( - sri_->ssIndex(), - sri_->seIndex(), - sri_->unkIndex()); -} - -namespace { -Ngram *MakeSRIModel(const char *file_name, unsigned int ngram_length, Vocab &sri_vocab) { - sri_vocab.unkIsWord() = true; - std::auto_ptr ret(new Ngram(sri_vocab, ngram_length)); - File file(file_name, "r"); - errno = 0; - if (!ret->read(file)) { - UTIL_THROW(FormatLoadException, "reading file " << file_name << " with SRI failed."); - } - return ret.release(); -} -} // namespace - -Model::Model(const char *file_name, unsigned int ngram_length) : sri_(MakeSRIModel(file_name, ngram_length, *vocab_.sri_)) { - if (!sri_->setorder()) { - UTIL_THROW(FormatLoadException, "Can't have an SRI model with order 0."); - } - vocab_.FinishedLoading(); - State begin_state = State(); - begin_state.valid_length_ = 1; - if (kMaxOrder > 1) { - begin_state.history_[0] = vocab_.BeginSentence(); - if (kMaxOrder > 2) begin_state.history_[1] = Vocab_None; - } - State null_state = State(); - null_state.valid_length_ = 0; - if (kMaxOrder > 1) null_state.history_[0] = Vocab_None; - Init(begin_state, null_state, vocab_, sri_->setorder()); - not_found_ = vocab_.NotFound(); -} - -Model::~Model() {} - -namespace { - -/* Argh SRI's wordProb knows the ngram length but doesn't return it. One more - * reason you should use my model. */ -// TODO(stolcke): fix SRILM so I don't have to do this. -unsigned int MatchedLength(Ngram &model, const WordIndex new_word, const SRIVocabIndex *const_history) { - unsigned int out_length = 0; - // This gets the length of context used, which is ngram_length - 1 unless new_word is OOV in which case it is 0. - model.contextID(new_word, const_history, out_length); - return out_length + 1; -} - -} // namespace - -FullScoreReturn Model::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { - // If you get a compiler in this function, change SRIVocabIndex in sri.hh to match the one found in SRI's Vocab.h. - const SRIVocabIndex *const_history; - SRIVocabIndex local_history[Order()]; - if (in_state.valid_length_ < kMaxOrder - 1) { - const_history = in_state.history_; - } else { - std::copy(in_state.history_, in_state.history_ + in_state.valid_length_, local_history); - local_history[in_state.valid_length_] = Vocab_None; - const_history = local_history; - } - FullScoreReturn ret; - ret.ngram_length = MatchedLength(*sri_, new_word, const_history); - out_state.history_[0] = new_word; - out_state.valid_length_ = std::min(ret.ngram_length, Order() - 1); - std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1); - if (out_state.valid_length_ < kMaxOrder - 1) { - out_state.history_[out_state.valid_length_] = Vocab_None; - } - ret.prob = sri_->wordProb(new_word, const_history); - return ret; -} - -} // namespace sri -} // namespace lm diff --git a/klm/lm/sri.hh b/klm/lm/sri.hh deleted file mode 100644 index b57e9b73..00000000 --- a/klm/lm/sri.hh +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef LM_SRI__ -#define LM_SRI__ - -#include "lm/facade.hh" -#include "util/murmur_hash.hh" - -#include -#include -#include - -class Ngram; -class Vocab; - -/* The ngram length reported uses some random API I found and may be wrong. - * - * See ngram, which should return equivalent results. 
- */ - -namespace lm { -namespace sri { - -static const unsigned int kMaxOrder = 6; - -/* This should match VocabIndex found in SRI's Vocab.h - * The reason I define this here independently is that SRI's headers - * pollute and increase compile time. - * It's difficult to extract this from their header and anyway would - * break packaging. - * If these differ there will be a compiler error in ActuallyCall. - */ -typedef unsigned int SRIVocabIndex; - -class State { - public: - // You shouldn't need to touch these, but they're public so State will be a POD. - // If valid_length_ < kMaxOrder - 1 then history_[valid_length_] == Vocab_None. - SRIVocabIndex history_[kMaxOrder - 1]; - unsigned char valid_length_; -}; - -inline bool operator==(const State &left, const State &right) { - if (left.valid_length_ != right.valid_length_) { - return false; - } - for (const SRIVocabIndex *l = left.history_, *r = right.history_; - l != left.history_ + left.valid_length_; - ++l, ++r) { - if (*l != *r) return false; - } - return true; -} - -inline size_t hash_value(const State &state) { - return util::MurmurHashNative(&state.history_, sizeof(SRIVocabIndex) * state.valid_length_); -} - -class Vocabulary : public base::Vocabulary { - public: - Vocabulary(); - - ~Vocabulary(); - - WordIndex Index(const StringPiece &str) const { - std::string temp(str.data(), str.length()); - return Index(temp.c_str()); - } - WordIndex Index(const std::string &str) const { - return Index(str.c_str()); - } - WordIndex Index(const char *str) const; - - const char *Word(WordIndex index) const; - - private: - friend class Model; - void FinishedLoading(); - - // The parent class isn't copyable so auto_ptr is the same as scoped_ptr - // but without the boost dependence. - mutable std::auto_ptr sri_; -}; - -class Model : public base::ModelFacade { - public: - Model(const char *file_name, unsigned int ngram_length); - - ~Model(); - - FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; - - private: - Vocabulary vocab_; - - mutable std::auto_ptr sri_; - - WordIndex not_found_; -}; - -} // namespace sri -} // namespace lm - -#endif // LM_SRI__ diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh index 41e97052..4cf68196 100644 --- a/klm/lm/vocab.hh +++ b/klm/lm/vocab.hh @@ -15,10 +15,10 @@ namespace lm { class ProbBackoff; +class EnumerateVocab; namespace ngram { class Config; -class EnumerateVocab; namespace detail { uint64_t HashForVocab(const char *str, std::size_t len); diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 5ce7adc9..279bafa8 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -15,7 +15,7 @@ namespace util { scoped_mmap::~scoped_mmap() { if (data_ != (void*)-1) { - // Thanks Denis Filimonov for pointing on NFS likes msync first. + // Thanks Denis Filimonov for pointing out NFS likes msync first. if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) { std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl; abort(); diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index fec47fd9..ef5783fe 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -117,13 +117,18 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) return h; } +// Trick to test for 64-bit architecture at compile time. 
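+// The primary template handles 8-byte builds with MurmurHash64A; the <4>
+// specialization below routes 4-byte builds to MurmurHash64B. The selector is
+// presumably a pointer-size constant such as sizeof(void*), so the choice is
+// made by the compiler instead of by the old runtime sizeof check.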
+namespace { +template uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) { + return MurmurHash64A(key, len, seed); +} +template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) { + return MurmurHash64B(key, len, seed); +} +} // namespace uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) { - if (sizeof(int) == 4) { - return MurmurHash64B(key, len, seed); - } else { - return MurmurHash64A(key, len, seed); - } + return MurmurHashNativeBackend(key, len, seed); } } // namespace util diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index ee1c7ab2..413bda0b 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -5,6 +5,9 @@ #include +#include +#include + /* Usage: * * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { @@ -64,6 +67,78 @@ template class PieceIterator : public boost::iterator_facade class TokenIter : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { + public: + TokenIter() {} + + TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + increment(); + } + + bool operator!() const { + return current_.data() == 0; + } + operator bool() const { + return current_.data() != 0; + } + + static TokenIter end() { + return TokenIter(); + } + + private: + friend class boost::iterator_core_access; + + void increment() { + do { + StringPiece found(finder_.Find(after_)); + current_ = StringPiece(after_.data(), found.data() - after_.data()); + if (found.data() == after_.data() + after_.size()) { + after_ = StringPiece(NULL, 0); + } else { + after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); + } + } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. 
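+    // At this point current_ spans the token just found and after_ the text
+    // still to scan; both carry NULL data() once the input is exhausted,
+    // which is what operator bool and equal() below rely on.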
+ } + + bool equal(const TokenIter &other) const { + return after_.data() == other.after_.data(); + } + + const StringPiece &dereference() const { + return current_; + } + + StringPiece current_; + StringPiece after_; + + Find finder_; +}; + } // namespace util #endif // UTIL_TOKENIZE_PIECE__ diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc new file mode 100644 index 00000000..e07ebcf5 --- /dev/null +++ b/klm/util/tokenize_piece_test.cc @@ -0,0 +1,94 @@ +#include "util/tokenize_piece.hh" +#include "util/string_piece.hh" + +#define BOOST_TEST_MODULE TokenIteratorTest +#include + +#include + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(simple) { + PieceIterator<' '> it("single spaced words."); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("single"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("words."), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(null_delimiter) { + const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; + PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("first"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("second"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("third"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(null_entries) { + const char str[] = "\0split\0\0 \0me\0 "; + PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); + BOOST_REQUIRE(it); + const char first[] = "\0split\0\0"; + BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); + ++it; + BOOST_REQUIRE(it); + const char second[] = "\0me\0"; + BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); + ++it; + BOOST_CHECK(!it); +} + +/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { + const char str[] = "nodelimit at all"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(str), *it); + ++it; + BOOST_CHECK(!it); +} +BOOST_AUTO_TEST_CASE(pipe_pipe_two) { + const char str[] = "|||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(remove_empty) { + const char str[] = "|||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_CHECK(!it); +}*/ + +BOOST_AUTO_TEST_CASE(remove_empty_keep) { + const char str[] = " |||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(" "), *it); + ++it; + BOOST_CHECK(!it); +} + +} // namespace +} // namespace util diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc index e89a92d5..1e5af9a1 100644 --- a/training/augment_grammar.cc +++ b/training/augment_grammar.cc @@ -18,7 +18,7 @@ using namespace std; vector word_map; lm::ngram::ProbingModel* ngram; -struct VMapper : public lm::ngram::EnumerateVocab { +struct VMapper : public lm::EnumerateVocab { VMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); diff --git a/training/test_ngram.cc b/training/test_ngram.cc index c481b564..4597cc01 100644 --- a/training/test_ngram.cc +++ b/training/test_ngram.cc @@ -12,7 +12,7 @@ namespace po = 
boost::program_options; using namespace std; lm::ngram::ProbingModel* ngram; -struct GetVocab : public lm::ngram::EnumerateVocab { +struct GetVocab : public lm::EnumerateVocab { GetVocab(vector* out) : out_(out) { } void Add(lm::WordIndex index, const StringPiece &str) { out_->push_back(index); -- cgit v1.2.3 From b036e03e9db226fde7e6b0e69d86bdb5741f8006 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 3 Nov 2011 19:53:53 +0000 Subject: Bugfix trie building --- klm/lm/search_trie.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index e3cf9547..633bcdf4 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -24,10 +24,8 @@ #include #include #include +#include "util/portability.hh" -#include -#include -#include namespace lm { namespace ngram { @@ -271,7 +269,7 @@ template class WriteEntries { contexts_(contexts), unigrams_(unigrams), middle_(middle), - longest_(longest), + longest_(longest), bigram_pack_((order == 2) ? static_cast(longest_) : static_cast(*middle_)), order_(order), sri_(sri) {} @@ -334,6 +332,7 @@ template class BlankManager { void Visit(const WordIndex *to, unsigned char length, float prob) { basis_[length - 1] = prob; + // Try to match everything except the last word, which is expected to be different. unsigned char overlap = std::min(length - 1, been_length_); const WordIndex *cur; WordIndex *pre; @@ -350,14 +349,15 @@ template class BlankManager { UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} + assert(*lower_basis != kBadProb); unsigned char based_on = lower_basis - basis_ + 1; for (; cur != to + length - 1; ++blank, ++cur, ++pre) { - assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. basis_[blank - 1] = kBadProb; } + *pre = *cur; been_length_ = length; } -- cgit v1.2.3 From 635a8d31de50b5514cb471cb79bbe2cd3f23b0b5 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 3 Nov 2011 19:58:37 +0000 Subject: Oops introduced some of Hieu's windows stuff --- klm/lm/search_trie.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 633bcdf4..4bd3f4ee 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -24,8 +24,10 @@ #include #include #include -#include "util/portability.hh" +#include +#include +#include namespace lm { namespace ngram { @@ -269,7 +271,7 @@ template class WriteEntries { contexts_(contexts), unigrams_(unigrams), middle_(middle), - longest_(longest), + longest_(longest), bigram_pack_((order == 2) ? static_cast(longest_) : static_cast(*middle_)), order_(order), sri_(sri) {} @@ -332,7 +334,6 @@ template class BlankManager { void Visit(const WordIndex *to, unsigned char length, float prob) { basis_[length - 1] = prob; - // Try to match everything except the last word, which is expected to be different. 
unsigned char overlap = std::min(length - 1, been_length_); const WordIndex *cur; WordIndex *pre; @@ -349,9 +350,9 @@ template class BlankManager { UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} - assert(*lower_basis != kBadProb); unsigned char based_on = lower_basis - basis_ + 1; for (; cur != to + length - 1; ++blank, ++cur, ++pre) { + assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. -- cgit v1.2.3 From bdd7fe7b513ade0b979fc050766e375044e84e86 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 3 Nov 2011 20:08:43 +0000 Subject: Mostly minor changes like a missing header guard and bad documentation --- klm/lm/bhiksha.hh | 5 +++++ klm/lm/build_binary.cc | 2 +- klm/lm/left.hh | 39 ++++++++++++++++++++++++--------------- klm/lm/vocab.cc | 1 + klm/lm/vocab.hh | 1 - klm/util/probing_hash_table.hh | 4 ++-- 6 files changed, 33 insertions(+), 19 deletions(-) diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh index bc705959..3df43dda 100644 --- a/klm/lm/bhiksha.hh +++ b/klm/lm/bhiksha.hh @@ -10,6 +10,9 @@ * Currently only used for next pointers. */ +#ifndef LM_BHIKSHA__ +#define LM_BHIKSHA__ + #include #include @@ -108,3 +111,5 @@ class ArrayBhiksha { } // namespace trie } // namespace ngram } // namespace lm + +#endif // LM_BHIKSHA__ diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index b7aee4de..fdb62a71 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -15,7 +15,7 @@ namespace ngram { namespace { void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-c bits] [type] input.arpa [output.mmap]\n\n" + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for if the ARPA file does not have one.\n" " Default is -100. 
The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have and .\n" diff --git a/klm/lm/left.hh b/klm/lm/left.hh index 15464c82..41f71f84 100644 --- a/klm/lm/left.hh +++ b/klm/lm/left.hh @@ -175,24 +175,14 @@ template class RuleScore { float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1]; float *back = backoffs, *back2 = backoffs2; - unsigned char next_use; + unsigned char next_use = out_.right.length; // First word - ProcessRet(model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use)); - if (!next_use) { - left_done_ = true; - out_.right = in.right; - return; - } + if (ExtendLeft(in, next_use, 1, out_.right.backoff, back)) return; + // Words after the first, so extending a bigram to begin with - unsigned char extend_length = 2; - for (const uint64_t *i = in.left.pointers + 1; i < in.left.pointers + in.left.length; ++i, ++extend_length) { - ProcessRet(model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use)); - if (!next_use) { - left_done_ = true; - out_.right = in.right; - return; - } + for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) { + if (ExtendLeft(in, next_use, extend_length, back, back2)) return; std::swap(back, back2); } @@ -228,6 +218,25 @@ template class RuleScore { } private: + bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) { + ProcessRet(model_.ExtendLeft( + out_.right.words, out_.right.words + next_use, // Words to extend into + back_in, // Backoffs to use + in.left.pointers[extend_length - 1], extend_length, // Words to be extended + back_out, // Backoffs for the next score + next_use)); // Length of n-gram to use in next scoring. + if (next_use != out_.right.length) { + left_done_ = true; + if (!next_use) { + out_.right = in.right; + // Early exit. + return true; + } + } + // Continue scoring. + return false; + } + void ProcessRet(const FullScoreReturn &ret) { prob_ += ret.prob; if (left_done_) return; diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc index 03b0767a..ffec41ca 100644 --- a/klm/lm/vocab.cc +++ b/klm/lm/vocab.cc @@ -135,6 +135,7 @@ void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { end_ = begin_ + *(reinterpret_cast(begin_) - 1); ReadWords(fd, to); SetSpecial(Index(""), Index(""), 0); + bound_ = end_ - begin_ + 1; } namespace { diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh index 4cf68196..3c3414fb 100644 --- a/klm/lm/vocab.hh +++ b/klm/lm/vocab.hh @@ -66,7 +66,6 @@ class SortedVocabulary : public base::Vocabulary { static size_t Size(std::size_t entries, const Config &config); // Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary. - // While this number is correct, ProbingVocabulary::Bound might not be correct in some cases. WordIndex Bound() const { return bound_; } // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. 
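The one-line LoadedBinary fix in vocab.cc above is easy to miss: bound_ was never set on the binary-load path, so Bound() under-reported the vocabulary size. Since <unk> is id 0 and the sorted words occupy ids [1, end_ - begin_], the exclusive bound is end_ - begin_ + 1. A hypothetical caller that depends on this (illustrative only, not part of the patch):

    // Size a per-word table; ids are guaranteed to lie in [0, vocab.Bound()).
    std::vector<float> per_word(vocab.Bound(), 0.0f);
    per_word[vocab.Index("the")] = 1.0f;  // safe for any in-vocabulary word
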
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 2ec342a6..8122d69c 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -61,14 +61,14 @@
-  template <class T> void Insert(const T &t) {
+  template <class T> MutableIterator Insert(const T &t) {
     if (++entries_ >= buckets_) UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
 #ifdef DEBUG
     assert(initialized_);
 #endif
     for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
-      if (equal_(i->GetKey(), invalid_)) { *i = t; return; }
+      if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
       if (++i == end_) { i = begin_; }
     }
   }
-- cgit v1.2.3
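The probing hash table change above makes Insert hand back a MutableIterator to the entry it just stored instead of returning void, so a caller can adjust the stored entry immediately without probing a second time. A minimal sketch of such a caller, under assumptions: the Entry type here is hypothetical, and util::IdentityHash, Table::Size, and the (start, allocated) constructor are taken on faith from probing_hash_table.hh:

    #include "util/probing_hash_table.hh"

    #include <stdint.h>
    #include <vector>

    // Hypothetical entry type; real entries come from the packing layers in lm/.
    struct Entry {
      typedef uint64_t Key;
      uint64_t key;
      float prob;
      uint64_t GetKey() const { return key; }
    };

    int main() {
      typedef util::ProbingHashTable<Entry, util::IdentityHash> Table;
      // Zero-filled backing doubles as "every bucket holds the invalid key 0".
      std::vector<char> backing(Table::Size(10 /* entries */, 1.5 /* multiplier */));
      Table table(&backing[0], backing.size());

      Entry e;
      e.key = 42;
      e.prob = -0.5f;
      Table::MutableIterator it = table.Insert(e);  // now returns the new slot
      it->prob += 1.0f;  // adjust in place; no second Find needed
      return 0;
    }

The change is backward-compatible for callers that ignored the old void return, and it keeps the insert-then-adjust pattern to a single probe chain.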