diff options
Diffstat (limited to 'klm/lm/builder/pipeline.hh')
-rw-r--r-- | klm/lm/builder/pipeline.hh | 38 |
1 file changed, 35 insertions, 3 deletions
diff --git a/klm/lm/builder/pipeline.hh b/klm/lm/builder/pipeline.hh
index 845e5481..09e1a4d5 100644
--- a/klm/lm/builder/pipeline.hh
+++ b/klm/lm/builder/pipeline.hh
@@ -1,8 +1,10 @@
-#ifndef LM_BUILDER_PIPELINE__
-#define LM_BUILDER_PIPELINE__
+#ifndef LM_BUILDER_PIPELINE_H
+#define LM_BUILDER_PIPELINE_H
 
+#include "lm/builder/adjust_counts.hh"
 #include "lm/builder/initial_probabilities.hh"
 #include "lm/builder/header_info.hh"
+#include "lm/lm_exception.hh"
 #include "lm/word_index.hh"
 #include "util/stream/config.hh"
 #include "util/file_piece.hh"
@@ -18,6 +20,8 @@ struct PipelineConfig {
   util::stream::SortConfig sort;
   InitialProbabilitiesConfig initial_probs;
   util::stream::ChainConfig read_backoffs;
+
+  // Include a header in the ARPA with some statistics?
   bool verbose_header;
 
   // Estimated vocabulary size. Used for sizing CorpusCount memory and
@@ -30,6 +34,34 @@ struct PipelineConfig {
   // Number of blocks to use. This will be overridden to 1 if everything fits.
   std::size_t block_count;
 
+  // n-gram count thresholds for pruning. 0 values means no pruning for
+  // corresponding n-gram order
+  std::vector<uint64_t> prune_thresholds; //mjd
+
+  // What to do with discount failures.
+  DiscountConfig discount;
+
+  // Compute collapsed q values instead of probability and backoff
+  bool output_q;
+
+  /* Computing the perplexity of LMs with different vocabularies is hard. For
+   * example, the lowest perplexity is attained by a unigram model that
+   * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
+   * interpolated models will sum to more than 1 because <unk> is duplicated
+   * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
+   * 1 but comes with its own problems). This option will make the vocabulary
+   * a particular size by replicating <unk> multiple times for purposes of
+   * computing vocabulary size. It has no effect if the actual vocabulary is
+   * larger. This parameter serves the same purpose as IRSTLM's "dub".
+   */
+  uint64_t vocab_size_for_unk;
+
+  /* What to do the first time <s>, </s>, or <unk> appears in the input. If
+   * this is anything but THROW_UP, then the symbol will always be treated as
+   * whitespace.
+   */
+  WarningAction disallowed_symbol_action;
+
   const std::string &TempPrefix() const { return sort.temp_prefix; }
   std::size_t TotalMemory() const { return sort.total_memory; }
 };
@@ -38,4 +70,4 @@ struct PipelineConfig {
 void Pipeline(PipelineConfig config, int text_file, int out_arpa);
 
 }} // namespaces
-#endif // LM_BUILDER_PIPELINE__
+#endif // LM_BUILDER_PIPELINE_H