summaryrefslogtreecommitdiff
path: root/klm/lm/vocab.hh
diff options
context:
space:
mode:
Diffstat (limited to 'klm/lm/vocab.hh')
-rw-r--r--klm/lm/vocab.hh4
1 files changed, 4 insertions, 0 deletions
diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh
index c92518e4..9d218fff 100644
--- a/klm/lm/vocab.hh
+++ b/klm/lm/vocab.hh
@@ -61,6 +61,7 @@ class SortedVocabulary : public base::Vocabulary {
}
}
+ // Size for purposes of file writing
static size_t Size(std::size_t entries, const Config &config);
// Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary.
@@ -77,6 +78,9 @@ class SortedVocabulary : public base::Vocabulary {
// Reorders reorder_vocab so that the IDs are sorted.
void FinishedLoading(ProbBackoff *reorder_vocab);
+ // Trie stores the correct counts including <unk> in the header. If this was previously sized based on a count exluding <unk>, padding with 8 bytes will make it the correct size based on a count including <unk>.
+ std::size_t UnkCountChangePadding() const { return SawUnk() ? 0 : sizeof(uint64_t); }
+
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(int fd, EnumerateVocab *to);