From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 18 Jan 2013 17:12:51 +0000 Subject: KenLM dffafbf with lmplz source (but not built) --- klm/lm/builder/ngram.hh | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 klm/lm/builder/ngram.hh (limited to 'klm/lm/builder/ngram.hh') diff --git a/klm/lm/builder/ngram.hh b/klm/lm/builder/ngram.hh new file mode 100644 index 00000000..2984ed0b --- /dev/null +++ b/klm/lm/builder/ngram.hh @@ -0,0 +1,84 @@ +#ifndef LM_BUILDER_NGRAM__ +#define LM_BUILDER_NGRAM__ + +#include "lm/weights.hh" +#include "lm/word_index.hh" + +#include + +#include +#include +#include + +namespace lm { +namespace builder { + +struct Uninterpolated { + float prob; // Uninterpolated probability. + float gamma; // Interpolation weight for lower order. +}; + +union Payload { + uint64_t count; + Uninterpolated uninterp; + ProbBackoff complete; +}; + +class NGram { + public: + NGram(void *begin, std::size_t order) + : begin_(static_cast(begin)), end_(begin_ + order) {} + + const uint8_t *Base() const { return reinterpret_cast(begin_); } + uint8_t *Base() { return reinterpret_cast(begin_); } + + void ReBase(void *to) { + std::size_t difference = end_ - begin_; + begin_ = reinterpret_cast(to); + end_ = begin_ + difference; + } + + // Would do operator++ but that can get confusing for a stream. + void NextInMemory() { + ReBase(&Value() + 1); + } + + // Lower-case in deference to STL. + const WordIndex *begin() const { return begin_; } + WordIndex *begin() { return begin_; } + const WordIndex *end() const { return end_; } + WordIndex *end() { return end_; } + + const Payload &Value() const { return *reinterpret_cast(end_); } + Payload &Value() { return *reinterpret_cast(end_); } + + uint64_t &Count() { return Value().count; } + const uint64_t Count() const { return Value().count; } + + std::size_t Order() const { return end_ - begin_; } + + static std::size_t TotalSize(std::size_t order) { + return order * sizeof(WordIndex) + sizeof(Payload); + } + std::size_t TotalSize() const { + // Compiler should optimize this. + return TotalSize(Order()); + } + static std::size_t OrderFromSize(std::size_t size) { + std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); + assert(size == TotalSize(ret)); + return ret; + } + + private: + WordIndex *begin_, *end_; +}; + +const WordIndex kUNK = 0; +const WordIndex kBOS = 1; +const WordIndex kEOS = 2; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_NGRAM__ -- cgit v1.2.3