diff options
author | Kenneth Heafield <github@kheafield.com> | 2013-01-18 17:12:51 +0000 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2013-01-18 17:12:51 +0000 |
commit | d884099e0db8b4510847ec106b59ef7dca3c245b (patch) | |
tree | b45a3f17eb002e224a7b728e0f985a15e2503196 /klm/lm/builder/ngram.hh | |
parent | bae5fe99037ae7e101953ad0df118127191c711c (diff) |
KenLM dffafbf with lmplz source (but not built)
Diffstat (limited to 'klm/lm/builder/ngram.hh')
-rw-r--r-- | klm/lm/builder/ngram.hh | 84 |
1 files changed, 84 insertions, 0 deletions
diff --git a/klm/lm/builder/ngram.hh b/klm/lm/builder/ngram.hh new file mode 100644 index 00000000..2984ed0b --- /dev/null +++ b/klm/lm/builder/ngram.hh @@ -0,0 +1,84 @@ +#ifndef LM_BUILDER_NGRAM__ +#define LM_BUILDER_NGRAM__ + +#include "lm/weights.hh" +#include "lm/word_index.hh" + +#include <cstddef> + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +namespace lm { +namespace builder { + +struct Uninterpolated { + float prob; // Uninterpolated probability. + float gamma; // Interpolation weight for lower order. +}; + +union Payload { + uint64_t count; + Uninterpolated uninterp; + ProbBackoff complete; +}; + +class NGram { + public: + NGram(void *begin, std::size_t order) + : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {} + + const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); } + uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); } + + void ReBase(void *to) { + std::size_t difference = end_ - begin_; + begin_ = reinterpret_cast<WordIndex*>(to); + end_ = begin_ + difference; + } + + // Would do operator++ but that can get confusing for a stream. + void NextInMemory() { + ReBase(&Value() + 1); + } + + // Lower-case in deference to STL. + const WordIndex *begin() const { return begin_; } + WordIndex *begin() { return begin_; } + const WordIndex *end() const { return end_; } + WordIndex *end() { return end_; } + + const Payload &Value() const { return *reinterpret_cast<const Payload *>(end_); } + Payload &Value() { return *reinterpret_cast<Payload *>(end_); } + + uint64_t &Count() { return Value().count; } + const uint64_t Count() const { return Value().count; } + + std::size_t Order() const { return end_ - begin_; } + + static std::size_t TotalSize(std::size_t order) { + return order * sizeof(WordIndex) + sizeof(Payload); + } + std::size_t TotalSize() const { + // Compiler should optimize this. + return TotalSize(Order()); + } + static std::size_t OrderFromSize(std::size_t size) { + std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); + assert(size == TotalSize(ret)); + return ret; + } + + private: + WordIndex *begin_, *end_; +}; + +const WordIndex kUNK = 0; +const WordIndex kBOS = 1; +const WordIndex kEOS = 2; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_NGRAM__ |