summaryrefslogtreecommitdiff
path: root/klm/lm/builder/pipeline.hh
diff options
context:
space:
mode:
Diffstat (limited to 'klm/lm/builder/pipeline.hh')
-rw-r--r--klm/lm/builder/pipeline.hh9
1 files changed, 5 insertions, 4 deletions
diff --git a/klm/lm/builder/pipeline.hh b/klm/lm/builder/pipeline.hh
index f1d6c5f6..845e5481 100644
--- a/klm/lm/builder/pipeline.hh
+++ b/klm/lm/builder/pipeline.hh
@@ -3,6 +3,7 @@
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
+#include "lm/word_index.hh"
#include "util/stream/config.hh"
#include "util/file_piece.hh"
@@ -19,9 +20,9 @@ struct PipelineConfig {
util::stream::ChainConfig read_backoffs;
bool verbose_header;
- // Amount of memory to assume that the vocabulary hash table will use. This
- // is subtracted from total memory for CorpusCount.
- std::size_t assume_vocab_hash_size;
+ // Estimated vocabulary size. Used for sizing CorpusCount memory and
+ // initial probing hash table sizing, also in CorpusCount.
+ lm::WordIndex vocab_estimate;
// Minimum block size to tolerate.
std::size_t minimum_block;
@@ -33,7 +34,7 @@ struct PipelineConfig {
std::size_t TotalMemory() const { return sort.total_memory; }
};
-// Takes ownership of text_file.
+// Takes ownership of text_file and out_arpa.
void Pipeline(PipelineConfig config, int text_file, int out_arpa);
}} // namespaces