From d3e2ec203a5cf550320caa8023ac3dd103b0be7d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Oct 2014 00:42:37 -0400 Subject: new kenlm --- klm/lm/builder/ngram.hh | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'klm/lm/builder/ngram.hh') diff --git a/klm/lm/builder/ngram.hh b/klm/lm/builder/ngram.hh index f5681516..0472bcb1 100644 --- a/klm/lm/builder/ngram.hh +++ b/klm/lm/builder/ngram.hh @@ -1,5 +1,5 @@ -#ifndef LM_BUILDER_NGRAM__ -#define LM_BUILDER_NGRAM__ +#ifndef LM_BUILDER_NGRAM_H +#define LM_BUILDER_NGRAM_H #include "lm/weights.hh" #include "lm/word_index.hh" @@ -26,7 +26,7 @@ union Payload { class NGram { public: - NGram(void *begin, std::size_t order) + NGram(void *begin, std::size_t order) : begin_(static_cast(begin)), end_(begin_ + order) {} const uint8_t *Base() const { return reinterpret_cast(begin_); } @@ -38,12 +38,12 @@ class NGram { end_ = begin_ + difference; } - // Would do operator++ but that can get confusing for a stream. + // Would do operator++ but that can get confusing for a stream. void NextInMemory() { ReBase(&Value() + 1); } - // Lower-case in deference to STL. + // Lower-case in deference to STL. const WordIndex *begin() const { return begin_; } WordIndex *begin() { return begin_; } const WordIndex *end() const { return end_; } @@ -61,7 +61,7 @@ class NGram { return order * sizeof(WordIndex) + sizeof(Payload); } std::size_t TotalSize() const { - // Compiler should optimize this. + // Compiler should optimize this. return TotalSize(Order()); } static std::size_t OrderFromSize(std::size_t size) { @@ -69,6 +69,31 @@ class NGram { assert(size == TotalSize(ret)); return ret; } + + // manipulate msb to signal that ngram can be pruned + /*mjd**********************************************************************/ + + bool IsMarked() const { + return Value().count >> (sizeof(Value().count) * 8 - 1); + } + + void Mark() { + Value().count |= (1ul << (sizeof(Value().count) * 8 - 1)); + } + + void Unmark() { + Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1)); + } + + uint64_t UnmarkedCount() const { + return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1)); + } + + uint64_t CutoffCount() const { + return IsMarked() ? 0 : UnmarkedCount(); + } + + /*mjd**********************************************************************/ private: WordIndex *begin_, *end_; @@ -81,4 +106,4 @@ const WordIndex kEOS = 2; } // namespace builder } // namespace lm -#endif // LM_BUILDER_NGRAM__ +#endif // LM_BUILDER_NGRAM_H -- cgit v1.2.3