diff options
author | Kenneth Heafield <github@kheafield.com> | 2011-11-03 19:53:53 +0000 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2011-11-03 19:53:53 +0000 |
commit | b036e03e9db226fde7e6b0e69d86bdb5741f8006 (patch) | |
tree | b6adcd4192987b53ed6c17cd9ed8872c2e9ecb67 /klm/lm | |
parent | 976e492a10d88df932acbff3fec8142edc990929 (diff) |
Bugfix trie building
Diffstat (limited to 'klm/lm')
-rw-r--r-- | klm/lm/search_trie.cc | 10 |
1 file changed, 5 insertions, 5 deletions
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index e3cf9547..633bcdf4 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -24,10 +24,8 @@ #include <limits> #include <numeric> #include <vector> +#include "util/portability.hh" -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/stat.h> namespace lm { namespace ngram { @@ -271,7 +269,7 @@ template <class Quant, class Bhiksha> class WriteEntries { contexts_(contexts), unigrams_(unigrams), middle_(middle), - longest_(longest), + longest_(longest), bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)), order_(order), sri_(sri) {} @@ -334,6 +332,7 @@ template <class Doing> class BlankManager { void Visit(const WordIndex *to, unsigned char length, float prob) { basis_[length - 1] = prob; + // Try to match everything except the last word, which is expected to be different. unsigned char overlap = std::min<unsigned char>(length - 1, been_length_); const WordIndex *cur; WordIndex *pre; @@ -350,14 +349,15 @@ template <class Doing> class BlankManager { UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} + assert(*lower_basis != kBadProb); unsigned char based_on = lower_basis - basis_ + 1; for (; cur != to + length - 1; ++blank, ++cur, ++pre) { - assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. basis_[blank - 1] = kBadProb; } + *pre = *cur; been_length_ = length; } |