Merge branch 'const_reorder_2' into softsyn_2

author: Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:15:13 -0500
committer: Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:15:13 -0500
commit: 6829a0bc624b02ebefc79f8cf9ec89d7d64a7c30 (patch)
tree: 125dfb20f73342873476c793995397b26fd202dd /klm/lm/builder/pipeline.hh
parent: b455a108a21f4ba5a58ab1bc53a8d2bf4d829067 (diff)
parent: 7468e8d85e99b4619442c7afaf4a0d92870111bb (diff)
1 files changed, 35 insertions, 3 deletions
diff --git a/klm/lm/builder/pipeline.hh b/klm/lm/builder/pipeline.hh
index 845e5481..09e1a4d5 100644
--- a/klm/lm/builder/pipeline.hh
+++ b/klm/lm/builder/pipeline.hh
@@ -1,8 +1,10 @@
-#ifndef LM_BUILDER_PIPELINE__
-#define LM_BUILDER_PIPELINE__
+#ifndef LM_BUILDER_PIPELINE_H
+#define LM_BUILDER_PIPELINE_H
 
+#include "lm/builder/adjust_counts.hh"
 #include "lm/builder/initial_probabilities.hh"
 #include "lm/builder/header_info.hh"
+#include "lm/lm_exception.hh"
 #include "lm/word_index.hh"
 #include "util/stream/config.hh"
 #include "util/file_piece.hh"
@@ -18,6 +20,8 @@ struct PipelineConfig {
   util::stream::SortConfig sort;
   InitialProbabilitiesConfig initial_probs;
   util::stream::ChainConfig read_backoffs;
+
+  // Include a header in the ARPA with some statistics?
   bool verbose_header;
 
   // Estimated vocabulary size.  Used for sizing CorpusCount memory and
@@ -30,6 +34,34 @@ struct PipelineConfig {
   // Number of blocks to use.  This will be overridden to 1 if everything fits.
   std::size_t block_count;
 
+  // n-gram count thresholds for pruning.  A value of 0 means no pruning for
+  // the corresponding n-gram order.
+  std::vector<uint64_t> prune_thresholds; //mjd
+
+  // What to do with discount failures.
+  DiscountConfig discount;
+
+  // Compute collapsed q values instead of probability and backoff
+  bool output_q;
+  
+  /* Computing the perplexity of LMs with different vocabularies is hard.  For
+   * example, the lowest perplexity is attained by a unigram model that
+   * predicts p(<unk>) = 1 and has no other vocabulary.  Also, linearly
+   * interpolated models will sum to more than 1 because <unk> is duplicated
+   * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
+   * 1 but comes with its own problems).  This option will make the vocabulary
+   * a particular size by replicating <unk> multiple times for purposes of
+   * computing vocabulary size.  It has no effect if the actual vocabulary is
+   * larger.  This parameter serves the same purpose as IRSTLM's "dub".
+   */
+  uint64_t vocab_size_for_unk;
+
+  /* What to do the first time <s>, </s>, or <unk> appears in the input.  If
+   * this is anything but THROW_UP, then the symbol will always be treated as
+   * whitespace.
+   */
+  WarningAction disallowed_symbol_action;
+
   const std::string &TempPrefix() const { return sort.temp_prefix; }
   std::size_t TotalMemory() const { return sort.total_memory; }
 };
@@ -38,4 +70,4 @@ struct PipelineConfig {
 void Pipeline(PipelineConfig config, int text_file, int out_arpa);
 
 }} // namespaces
-#endif // LM_BUILDER_PIPELINE__
+#endif // LM_BUILDER_PIPELINE_H
author	Wu, Ke <wuke@cs.umd.edu>	2014-12-17 16:15:13 -0500
committer	Wu, Ke <wuke@cs.umd.edu>	2014-12-17 16:15:13 -0500
commit	6829a0bc624b02ebefc79f8cf9ec89d7d64a7c30 (patch)
tree	125dfb20f73342873476c793995397b26fd202dd /klm/lm/builder/pipeline.hh
parent	b455a108a21f4ba5a58ab1bc53a8d2bf4d829067 (diff)
parent	7468e8d85e99b4619442c7afaf4a0d92870111bb (diff)