From 3106cf8eca76df8b46d139b8f5ce5002200d660d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 24 Oct 2011 18:17:24 +0100 Subject: KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative. --- klm/lm/search_trie.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'klm/lm/search_trie.cc') diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 5d8c70db..e3cf9547 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -493,7 +493,7 @@ template void BuildTrie(const std::string &file_pre util::scoped_FILE unigram_file; { std::string name(file_prefix + "unigrams"); - unigram_file.reset(OpenOrThrow(name.c_str(), "r")); + unigram_file.reset(OpenOrThrow(name.c_str(), "r+")); util::RemoveOrThrow(name.c_str()); } sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); -- cgit v1.2.3 From b036e03e9db226fde7e6b0e69d86bdb5741f8006 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 3 Nov 2011 19:53:53 +0000 Subject: Bugfix trie building --- klm/lm/search_trie.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'klm/lm/search_trie.cc') diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index e3cf9547..633bcdf4 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -24,10 +24,8 @@ #include #include #include +#include "util/portability.hh" -#include -#include -#include namespace lm { namespace ngram { @@ -271,7 +269,7 @@ template class WriteEntries { contexts_(contexts), unigrams_(unigrams), middle_(middle), - longest_(longest), + longest_(longest), bigram_pack_((order == 2) ? static_cast(longest_) : static_cast(*middle_)), order_(order), sri_(sri) {} @@ -334,6 +332,7 @@ template class BlankManager { void Visit(const WordIndex *to, unsigned char length, float prob) { basis_[length - 1] = prob; + // Try to match everything except the last word, which is expected to be different. unsigned char overlap = std::min(length - 1, been_length_); const WordIndex *cur; WordIndex *pre; @@ -350,14 +349,15 @@ template class BlankManager { UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} + assert(*lower_basis != kBadProb); unsigned char based_on = lower_basis - basis_ + 1; for (; cur != to + length - 1; ++blank, ++cur, ++pre) { - assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. basis_[blank - 1] = kBadProb; } + *pre = *cur; been_length_ = length; } -- cgit v1.2.3 From 635a8d31de50b5514cb471cb79bbe2cd3f23b0b5 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 3 Nov 2011 19:58:37 +0000 Subject: Oops introduced some of Hieu's windows stuff --- klm/lm/search_trie.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'klm/lm/search_trie.cc') diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 633bcdf4..4bd3f4ee 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -24,8 +24,10 @@ #include #include #include -#include "util/portability.hh" +#include +#include +#include namespace lm { namespace ngram { @@ -269,7 +271,7 @@ template class WriteEntries { contexts_(contexts), unigrams_(unigrams), middle_(middle), - longest_(longest), + longest_(longest), bigram_pack_((order == 2) ? static_cast(longest_) : static_cast(*middle_)), order_(order), sri_(sri) {} @@ -332,7 +334,6 @@ template class BlankManager { void Visit(const WordIndex *to, unsigned char length, float prob) { basis_[length - 1] = prob; - // Try to match everything except the last word, which is expected to be different. unsigned char overlap = std::min(length - 1, been_length_); const WordIndex *cur; WordIndex *pre; @@ -349,9 +350,9 @@ template class BlankManager { UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} - assert(*lower_basis != kBadProb); unsigned char based_on = lower_basis - basis_ + 1; for (; cur != to + length - 1; ++blank, ++cur, ++pre) { + assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. -- cgit v1.2.3