summaryrefslogtreecommitdiff
path: root/klm/lm
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2011-10-24 18:17:24 +0100
committerKenneth Heafield <github@kheafield.com>2011-10-24 18:17:24 +0100
commit3106cf8eca76df8b46d139b8f5ce5002200d660d (patch)
tree637930011a082de54bf21a97078cb67483ea248c /klm/lm
parentb2171f53c6c597ac4326f63250269aa13df84718 (diff)
KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative.
Diffstat (limited to 'klm/lm')
-rw-r--r--klm/lm/config.hh6
-rw-r--r--klm/lm/enumerate_vocab.hh2
-rw-r--r--klm/lm/left.hh8
-rw-r--r--klm/lm/model.cc4
-rw-r--r--klm/lm/read_arpa.cc2
-rw-r--r--klm/lm/search_hashed.cc16
-rw-r--r--klm/lm/search_trie.cc2
-rw-r--r--klm/lm/sri.cc108
-rw-r--r--klm/lm/sri.hh102
-rw-r--r--klm/lm/vocab.hh2
10 files changed, 23 insertions, 229 deletions
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
index 227b8512..8564661b 100644
--- a/klm/lm/config.hh
+++ b/klm/lm/config.hh
@@ -8,10 +8,12 @@
/* Configuration for ngram model. Separate header to reduce pollution. */
-namespace lm { namespace ngram {
-
+namespace lm {
+
class EnumerateVocab;
+namespace ngram {
+
struct Config {
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
diff --git a/klm/lm/enumerate_vocab.hh b/klm/lm/enumerate_vocab.hh
index e734316b..27263621 100644
--- a/klm/lm/enumerate_vocab.hh
+++ b/klm/lm/enumerate_vocab.hh
@@ -5,7 +5,6 @@
#include "util/string_piece.hh"
namespace lm {
-namespace ngram {
/* If you need the actual strings in the vocabulary, inherit from this class
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does
@@ -23,7 +22,6 @@ class EnumerateVocab {
EnumerateVocab() {}
};
-} // namespace ngram
} // namespace lm
#endif // LM_ENUMERATE_VOCAB__
diff --git a/klm/lm/left.hh b/klm/lm/left.hh
index bb3f5539..15464c82 100644
--- a/klm/lm/left.hh
+++ b/klm/lm/left.hh
@@ -176,16 +176,18 @@ template <class M> class RuleScore {
float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
float *back = backoffs, *back2 = backoffs2;
unsigned char next_use;
- FullScoreReturn ret;
- ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use));
+
+ // First word
+ ProcessRet(model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use));
if (!next_use) {
left_done_ = true;
out_.right = in.right;
return;
}
+ // Words after the first, so extending a bigram to begin with
unsigned char extend_length = 2;
for (const uint64_t *i = in.left.pointers + 1; i < in.left.pointers + in.left.length; ++i, ++extend_length) {
- ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use));
+ ProcessRet(model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use));
if (!next_use) {
left_done_ = true;
out_.right = in.right;
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index 25f1ab7c..e4c1ec1d 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -91,8 +91,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
- if (ret.ngram_length - 1 < in_state.length) {
- ret.prob = std::accumulate(in_state.backoff + ret.ngram_length - 1, in_state.backoff + in_state.length, ret.prob);
+ for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {
+ ret.prob += *i;
}
return ret;
}
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index 455bc4ba..dce73f77 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -38,6 +38,8 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
}
if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic)
UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?");
+ UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?");
+ UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst.");
UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\.");
}
while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc
index 334adf12..247832b0 100644
--- a/klm/lm/search_hashed.cc
+++ b/klm/lm/search_hashed.cc
@@ -87,14 +87,14 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
ReadNGramHeader(f, n);
// vocab ids of words in reverse order
- WordIndex vocab_ids[n];
- uint64_t keys[n - 1];
+ std::vector<WordIndex> vocab_ids(n);
+ std::vector<uint64_t> keys(n-1);
typename Store::Packing::Value value;
typename Middle::MutableIterator found;
for (size_t i = 0; i < count; ++i) {
- ReadNGram(f, n, vocab, vocab_ids, value, warn);
+ ReadNGram(f, n, vocab, &*vocab_ids.begin(), value, warn);
- keys[0] = detail::CombineWordHash(static_cast<uint64_t>(*vocab_ids), vocab_ids[1]);
+ keys[0] = detail::CombineWordHash(static_cast<uint64_t>(vocab_ids.front()), vocab_ids[1]);
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
}
@@ -106,9 +106,9 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
util::FloatEnc fix_prob;
for (lower = n - 3; ; --lower) {
if (lower == -1) {
- fix_prob.f = unigrams[vocab_ids[0]].prob;
+ fix_prob.f = unigrams[vocab_ids.front()].prob;
fix_prob.i &= ~util::kSignBit;
- unigrams[vocab_ids[0]].prob = fix_prob.f;
+ unigrams[vocab_ids.front()].prob = fix_prob.f;
break;
}
if (middle[lower].UnsafeMutableFind(keys[lower], found)) {
@@ -120,8 +120,8 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
break;
}
}
- if (lower != static_cast<int>(n) - 3) FixSRI(lower, fix_prob.f, n, keys, vocab_ids, unigrams, middle);
- activate(vocab_ids, n);
+ if (lower != static_cast<int>(n) - 3) FixSRI(lower, fix_prob.f, n, &*keys.begin(), &*vocab_ids.begin(), unigrams, middle);
+ activate(&*vocab_ids.begin(), n);
}
store.FinishedInserting();
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 5d8c70db..e3cf9547 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -493,7 +493,7 @@ template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_pre
util::scoped_FILE unigram_file;
{
std::string name(file_prefix + "unigrams");
- unigram_file.reset(OpenOrThrow(name.c_str(), "r"));
+ unigram_file.reset(OpenOrThrow(name.c_str(), "r+"));
util::RemoveOrThrow(name.c_str());
}
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
diff --git a/klm/lm/sri.cc b/klm/lm/sri.cc
deleted file mode 100644
index 825f699b..00000000
--- a/klm/lm/sri.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "lm/lm_exception.hh"
-#include "lm/sri.hh"
-
-#include <Ngram.h>
-#include <Vocab.h>
-
-#include <errno.h>
-
-namespace lm {
-namespace sri {
-
-Vocabulary::Vocabulary() : sri_(new Vocab) {}
-
-Vocabulary::~Vocabulary() {}
-
-WordIndex Vocabulary::Index(const char *str) const {
- WordIndex ret = sri_->getIndex(str);
- // NGram wants the index of Vocab_Unknown for unknown words, but for some reason SRI returns Vocab_None here :-(.
- if (ret == Vocab_None) {
- return not_found_;
- } else {
- return ret;
- }
-}
-
-const char *Vocabulary::Word(WordIndex index) const {
- return sri_->getWord(index);
-}
-
-void Vocabulary::FinishedLoading() {
- SetSpecial(
- sri_->ssIndex(),
- sri_->seIndex(),
- sri_->unkIndex());
-}
-
-namespace {
-Ngram *MakeSRIModel(const char *file_name, unsigned int ngram_length, Vocab &sri_vocab) {
- sri_vocab.unkIsWord() = true;
- std::auto_ptr<Ngram> ret(new Ngram(sri_vocab, ngram_length));
- File file(file_name, "r");
- errno = 0;
- if (!ret->read(file)) {
- UTIL_THROW(FormatLoadException, "reading file " << file_name << " with SRI failed.");
- }
- return ret.release();
-}
-} // namespace
-
-Model::Model(const char *file_name, unsigned int ngram_length) : sri_(MakeSRIModel(file_name, ngram_length, *vocab_.sri_)) {
- if (!sri_->setorder()) {
- UTIL_THROW(FormatLoadException, "Can't have an SRI model with order 0.");
- }
- vocab_.FinishedLoading();
- State begin_state = State();
- begin_state.valid_length_ = 1;
- if (kMaxOrder > 1) {
- begin_state.history_[0] = vocab_.BeginSentence();
- if (kMaxOrder > 2) begin_state.history_[1] = Vocab_None;
- }
- State null_state = State();
- null_state.valid_length_ = 0;
- if (kMaxOrder > 1) null_state.history_[0] = Vocab_None;
- Init(begin_state, null_state, vocab_, sri_->setorder());
- not_found_ = vocab_.NotFound();
-}
-
-Model::~Model() {}
-
-namespace {
-
-/* Argh SRI's wordProb knows the ngram length but doesn't return it. One more
- * reason you should use my model. */
-// TODO(stolcke): fix SRILM so I don't have to do this.
-unsigned int MatchedLength(Ngram &model, const WordIndex new_word, const SRIVocabIndex *const_history) {
- unsigned int out_length = 0;
- // This gets the length of context used, which is ngram_length - 1 unless new_word is OOV in which case it is 0.
- model.contextID(new_word, const_history, out_length);
- return out_length + 1;
-}
-
-} // namespace
-
-FullScoreReturn Model::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
- // If you get a compiler in this function, change SRIVocabIndex in sri.hh to match the one found in SRI's Vocab.h.
- const SRIVocabIndex *const_history;
- SRIVocabIndex local_history[Order()];
- if (in_state.valid_length_ < kMaxOrder - 1) {
- const_history = in_state.history_;
- } else {
- std::copy(in_state.history_, in_state.history_ + in_state.valid_length_, local_history);
- local_history[in_state.valid_length_] = Vocab_None;
- const_history = local_history;
- }
- FullScoreReturn ret;
- ret.ngram_length = MatchedLength(*sri_, new_word, const_history);
- out_state.history_[0] = new_word;
- out_state.valid_length_ = std::min<unsigned char>(ret.ngram_length, Order() - 1);
- std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1);
- if (out_state.valid_length_ < kMaxOrder - 1) {
- out_state.history_[out_state.valid_length_] = Vocab_None;
- }
- ret.prob = sri_->wordProb(new_word, const_history);
- return ret;
-}
-
-} // namespace sri
-} // namespace lm
diff --git a/klm/lm/sri.hh b/klm/lm/sri.hh
deleted file mode 100644
index b57e9b73..00000000
--- a/klm/lm/sri.hh
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef LM_SRI__
-#define LM_SRI__
-
-#include "lm/facade.hh"
-#include "util/murmur_hash.hh"
-
-#include <cmath>
-#include <exception>
-#include <memory>
-
-class Ngram;
-class Vocab;
-
-/* The ngram length reported uses some random API I found and may be wrong.
- *
- * See ngram, which should return equivalent results.
- */
-
-namespace lm {
-namespace sri {
-
-static const unsigned int kMaxOrder = 6;
-
-/* This should match VocabIndex found in SRI's Vocab.h
- * The reason I define this here independently is that SRI's headers
- * pollute and increase compile time.
- * It's difficult to extract this from their header and anyway would
- * break packaging.
- * If these differ there will be a compiler error in ActuallyCall.
- */
-typedef unsigned int SRIVocabIndex;
-
-class State {
- public:
- // You shouldn't need to touch these, but they're public so State will be a POD.
- // If valid_length_ < kMaxOrder - 1 then history_[valid_length_] == Vocab_None.
- SRIVocabIndex history_[kMaxOrder - 1];
- unsigned char valid_length_;
-};
-
-inline bool operator==(const State &left, const State &right) {
- if (left.valid_length_ != right.valid_length_) {
- return false;
- }
- for (const SRIVocabIndex *l = left.history_, *r = right.history_;
- l != left.history_ + left.valid_length_;
- ++l, ++r) {
- if (*l != *r) return false;
- }
- return true;
-}
-
-inline size_t hash_value(const State &state) {
- return util::MurmurHashNative(&state.history_, sizeof(SRIVocabIndex) * state.valid_length_);
-}
-
-class Vocabulary : public base::Vocabulary {
- public:
- Vocabulary();
-
- ~Vocabulary();
-
- WordIndex Index(const StringPiece &str) const {
- std::string temp(str.data(), str.length());
- return Index(temp.c_str());
- }
- WordIndex Index(const std::string &str) const {
- return Index(str.c_str());
- }
- WordIndex Index(const char *str) const;
-
- const char *Word(WordIndex index) const;
-
- private:
- friend class Model;
- void FinishedLoading();
-
- // The parent class isn't copyable so auto_ptr is the same as scoped_ptr
- // but without the boost dependence.
- mutable std::auto_ptr<Vocab> sri_;
-};
-
-class Model : public base::ModelFacade<Model, State, Vocabulary> {
- public:
- Model(const char *file_name, unsigned int ngram_length);
-
- ~Model();
-
- FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
-
- private:
- Vocabulary vocab_;
-
- mutable std::auto_ptr<Ngram> sri_;
-
- WordIndex not_found_;
-};
-
-} // namespace sri
-} // namespace lm
-
-#endif // LM_SRI__
diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh
index 41e97052..4cf68196 100644
--- a/klm/lm/vocab.hh
+++ b/klm/lm/vocab.hh
@@ -15,10 +15,10 @@
namespace lm {
class ProbBackoff;
+class EnumerateVocab;
namespace ngram {
class Config;
-class EnumerateVocab;
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len);