summaryrefslogtreecommitdiff
path: root/klm/lm/builder/pipeline.hh
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2014-10-13 00:42:37 -0400
committerChris Dyer <redpony@gmail.com>2014-10-13 00:42:37 -0400
commitd3e2ec203a5cf550320caa8023ac3dd103b0be7d (patch)
tree0c340e6e87f42ae048892d91b3084a59b27e0f03 /klm/lm/builder/pipeline.hh
parentf974c887c357adf3703fc1072a391e016d3d05ed (diff)
new kenlm
Diffstat (limited to 'klm/lm/builder/pipeline.hh')
-rw-r--r--klm/lm/builder/pipeline.hh38
1 files changed, 35 insertions, 3 deletions
diff --git a/klm/lm/builder/pipeline.hh b/klm/lm/builder/pipeline.hh
index 845e5481..09e1a4d5 100644
--- a/klm/lm/builder/pipeline.hh
+++ b/klm/lm/builder/pipeline.hh
@@ -1,8 +1,10 @@
-#ifndef LM_BUILDER_PIPELINE__
-#define LM_BUILDER_PIPELINE__
+#ifndef LM_BUILDER_PIPELINE_H
+#define LM_BUILDER_PIPELINE_H
+#include "lm/builder/adjust_counts.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
+#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/stream/config.hh"
#include "util/file_piece.hh"
@@ -18,6 +20,8 @@ struct PipelineConfig {
util::stream::SortConfig sort;
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;
+
+ // Include a header in the ARPA with some statistics?
bool verbose_header;
// Estimated vocabulary size. Used for sizing CorpusCount memory and
@@ -30,6 +34,34 @@ struct PipelineConfig {
// Number of blocks to use. This will be overridden to 1 if everything fits.
std::size_t block_count;
+ // n-gram count thresholds for pruning. 0 values means no pruning for
+ // corresponding n-gram order
+ std::vector<uint64_t> prune_thresholds; //mjd
+
+ // What to do with discount failures.
+ DiscountConfig discount;
+
+ // Compute collapsed q values instead of probability and backoff
+ bool output_q;
+
+ /* Computing the perplexity of LMs with different vocabularies is hard. For
+ * example, the lowest perplexity is attained by a unigram model that
+ * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
+ * interpolated models will sum to more than 1 because <unk> is duplicated
+ * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
+ * 1 but comes with its own problems). This option will make the vocabulary
+ * a particular size by replicating <unk> multiple times for purposes of
+ * computing vocabulary size. It has no effect if the actual vocabulary is
+ * larger. This parameter serves the same purpose as IRSTLM's "dub".
+ */
+ uint64_t vocab_size_for_unk;
+
+ /* What to do the first time <s>, </s>, or <unk> appears in the input. If
+ * this is anything but THROW_UP, then the symbol will always be treated as
+ * whitespace.
+ */
+ WarningAction disallowed_symbol_action;
+
const std::string &TempPrefix() const { return sort.temp_prefix; }
std::size_t TotalMemory() const { return sort.total_memory; }
};
@@ -38,4 +70,4 @@ struct PipelineConfig {
void Pipeline(PipelineConfig config, int text_file, int out_arpa);
}} // namespaces
-#endif // LM_BUILDER_PIPELINE__
+#endif // LM_BUILDER_PIPELINE_H