summary | refs | log | tree | commit | diff
path: root/klm/lm
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2011-11-03 19:53:53 +0000
committer Kenneth Heafield <github@kheafield.com> 2011-11-03 19:53:53 +0000
commit 013639d6246ec4bd09fee9ad0172cd9d802f29b5 (patch)
tree 64b93cf88dda210ac31ee46920faa85e5369a81e /klm/lm
parent ca5b590478a2cb1c23a706b76e6a881ce8701716 (diff)
Bugfix trie building
Diffstat (limited to 'klm/lm')
-rw-r--r-- klm/lm/search_trie.cc 10
1 files changed, 5 insertions, 5 deletions
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index e3cf9547..633bcdf4 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -24,10 +24,8 @@
#include <limits>
#include <numeric>
#include <vector>
+#include "util/portability.hh"
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
namespace lm {
namespace ngram {
@@ -271,7 +269,7 @@ template <class Quant, class Bhiksha> class WriteEntries {
contexts_(contexts),
unigrams_(unigrams),
middle_(middle),
- longest_(longest),
+ longest_(longest),
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
order_(order),
sri_(sri) {}
@@ -334,6 +332,7 @@ template <class Doing> class BlankManager {
void Visit(const WordIndex *to, unsigned char length, float prob) {
basis_[length - 1] = prob;
+ // Try to match everything except the last word, which is expected to be different.
unsigned char overlap = std::min<unsigned char>(length - 1, been_length_);
const WordIndex *cur;
WordIndex *pre;
@@ -350,14 +349,15 @@ template <class Doing> class BlankManager {
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
const float *lower_basis;
for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {}
+ assert(*lower_basis != kBadProb);
unsigned char based_on = lower_basis - basis_ + 1;
for (; cur != to + length - 1; ++blank, ++cur, ++pre) {
- assert(*lower_basis != kBadProb);
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
*pre = *cur;
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
basis_[blank - 1] = kBadProb;
}
+ *pre = *cur;
been_length_ = length;
}