From b35a7f3a96ff8ae42e15922dd6949bf9f5d15501 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Tue, 22 Jan 2013 21:37:49 +0000
Subject: KenLM 58da338b

---
 klm/lm/Makefile.am             |   4 +-
 klm/lm/build_binary.cc         | 228 -------------------------------------
 klm/lm/build_binary_main.cc    | 228 +++++++++++++++++++++++++++++++++++++
 klm/lm/builder/Makefile.am     |   2 +-
 klm/lm/builder/discount.hh     |   2 +-
 klm/lm/builder/lmplz_main.cc   |  94 ++++++++++++++++
 klm/lm/builder/main.cc         |  94 ----------------
 klm/lm/filter/filter_main.cc   | 248 ++++++++++++++++++++++++++++++++++++++++
 klm/lm/filter/main.cc          | 249 -----------------------------------------
 klm/lm/filter/phrase.hh        |   1 +
 klm/lm/filter/vocab.hh         |   1 +
 klm/lm/fragment.cc             |  37 ------
 klm/lm/fragment_main.cc        |  37 ++++++
 klm/lm/kenlm_max_order_main.cc |   6 +
 klm/lm/max_order.cc            |   6 -
 klm/lm/ngram_query.cc          |  47 --------
 klm/lm/query_main.cc           |  47 ++++++++
 17 files changed, 666 insertions(+), 665 deletions(-)
 delete mode 100644 klm/lm/build_binary.cc
 create mode 100644 klm/lm/build_binary_main.cc
 create mode 100644 klm/lm/builder/lmplz_main.cc
 delete mode 100644 klm/lm/builder/main.cc
 create mode 100644 klm/lm/filter/filter_main.cc
 delete mode 100644 klm/lm/filter/main.cc
 delete mode 100644 klm/lm/fragment.cc
 create mode 100644 klm/lm/fragment_main.cc
 create mode 100644 klm/lm/kenlm_max_order_main.cc
 delete mode 100644 klm/lm/max_order.cc
 delete mode 100644 klm/lm/ngram_query.cc
 create mode 100644 klm/lm/query_main.cc

(limited to 'klm/lm')
diff --git a/klm/lm/Makefile.am b/klm/lm/Makefile.am
index 45f40c43..48b0ba34 100644
--- a/klm/lm/Makefile.am
+++ b/klm/lm/Makefile.am
@@ -1,9 +1,9 @@
 bin_PROGRAMS = build_binary ngram_query
 
-build_binary_SOURCES = build_binary.cc
+build_binary_SOURCES = build_binary_main.cc
 build_binary_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz
 
-ngram_query_SOURCES = ngram_query.cc
+ngram_query_SOURCES = query_main.cc
 ngram_query_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz
 
 #noinst_PROGRAMS = \
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
deleted file mode 100644
index ab2c0c32..00000000
--- a/klm/lm/build_binary.cc
+++ /dev/null
@@ -1,228 +0,0 @@
-#include "lm/model.hh"
-#include "lm/sizes.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <algorithm>
-#include <cstdlib>
-#include <exception>
-#include <iostream>
-#include <iomanip>
-#include <limits>
-
-#include <math.h>
-#include <stdlib.h>
-
-#ifdef WIN32
-#include "util/getopt.hh"
-#else
-#include <unistd.h>
-#endif
-
-namespace lm {
-namespace ngram {
-namespace {
-
-void Usage(const char *name, const char *default_mem) {
-  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
-"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
-"   Default is -100.  The ARPA file will always take precedence.\n"
-"-s allows models to be built even if they do not have <s> and </s>.\n"
-"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
-"-w mmap|after determines how writing is done.\n"
-"   mmap maps the binary file and writes to it.  Default for trie.\n"
-"   after allocates anonymous memory, builds, and writes.  Default for probing.\n"
-"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
-"   model files.  order1.arpa must be an ARPA file.  All others may be ARPA or\n"
-"   the same data structure as being built.  All files must have the same\n"
-"   vocabulary.  For probing, the unigrams must be in the same order.\n\n"
-"type is either probing or trie.  Default is probing.\n\n"
-"probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
-"-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
-"trie is a straightforward trie with bit-level packing.  It uses the least\n"
-"memory and is still faster than SRI or IRST.  Building the trie format uses an\n"
-"on-disk sort to save memory.\n"
-"-T is the temporary directory prefix.  Default is the output file name.\n"
-"-S determines memory use for sorting.  Default is " << default_mem << ".  This is compatible\n"
-"   with GNU sort.  The number is followed by a unit: \% for percent of physical\n"
-"   memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y.  \n"
-"   Default unit is K for Kilobytes.\n"
-"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
-"-b sets backoff quantization bits.  Requires -q and defaults to that value.\n"
-"-a compresses pointers using an array of offsets.  The parameter is the\n"
-"   maximum number of bits encoded by the array.  Memory is minimized subject\n"
-"   to the maximum, so pick 255 to minimize memory.\n\n"
-"Get a memory estimate by passing an ARPA file without an output file name.\n";
-  exit(1);
-}
-
-// I could really use boost::lexical_cast right about now.  
-float ParseFloat(const char *from) {
-  char *end;
-  float ret = strtod(from, &end);
-  if (*end) throw util::ParseNumberException(from);
-  return ret;
-}
-unsigned long int ParseUInt(const char *from) {
-  char *end;
-  unsigned long int ret = strtoul(from, &end, 10);
-  if (*end) throw util::ParseNumberException(from);
-  return ret;
-}
-
-uint8_t ParseBitCount(const char *from) {
-  unsigned long val = ParseUInt(from);
-  if (val > 25) {
-    util::ParseNumberException e(from);
-    e << " bit counts are limited to 25.";
-  }
-  return val;
-}
-
-void ParseFileList(const char *from, std::vector<std::string> &to) {
-  to.clear();
-  while (true) {
-    const char *i;
-    for (i = from; *i && *i != ' '; ++i) {}
-    to.push_back(std::string(from, i - from));
-    if (!*i) break;
-    from = i + 1;
-  }
-}
-
-void ProbingQuantizationUnsupported() {
-  std::cerr << "Quantization is only implemented in the trie data structure." << std::endl;
-  exit(1);
-}
-
-} // namespace ngram
-} // namespace lm
-} // namespace
-
-int main(int argc, char *argv[]) {
-  using namespace lm::ngram;
-
-  const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G";
-
-  try {
-    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
-    lm::ngram::Config config;
-    config.building_memory = util::ParseSize(default_mem);
-    int opt;
-    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) {
-      switch(opt) {
-        case 'q':
-          config.prob_bits = ParseBitCount(optarg);
-          if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
-          quantize = true;
-          break;
-        case 'b':
-          config.backoff_bits = ParseBitCount(optarg);
-          set_backoff_bits = true;
-          break;
-        case 'a':
-          config.pointer_bhiksha_bits = ParseBitCount(optarg);
-          bhiksha = true;
-          break;
-        case 'u':
-          config.unknown_missing_logprob = ParseFloat(optarg);
-          break;
-        case 'p':
-          config.probing_multiplier = ParseFloat(optarg);
-          break;
-        case 't': // legacy
-        case 'T':
-          config.temporary_directory_prefix = optarg;
-          break;
-        case 'm': // legacy
-          config.building_memory = ParseUInt(optarg) * 1048576;
-          break;
-        case 'S':
-          config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg));
-          break;
-        case 'w':
-          set_write_method = true;
-          if (!strcmp(optarg, "mmap")) {
-            config.write_method = Config::WRITE_MMAP;
-          } else if (!strcmp(optarg, "after")) {
-            config.write_method = Config::WRITE_AFTER;
-          } else {
-            Usage(argv[0], default_mem);
-          }
-          break;
-        case 's':
-          config.sentence_marker_missing = lm::SILENT;
-          break;
-        case 'i':
-          config.positive_log_probability = lm::SILENT;
-          break;
-        case 'r':
-          rest = true;
-          ParseFileList(optarg, config.rest_lower_files);
-          config.rest_function = Config::REST_LOWER;
-          break;
-        default:
-          Usage(argv[0], default_mem);
-      }
-    }
-    if (!quantize && set_backoff_bits) {
-      std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
-      abort();
-    }
-    if (optind + 1 == argc) {
-      ShowSizes(argv[optind], config);
-      return 0;
-    }
-    const char *model_type;
-    const char *from_file;
-
-    if (optind + 2 == argc) {
-      model_type = "probing";
-      from_file = argv[optind];
-      config.write_mmap = argv[optind + 1];
-    } else if (optind + 3 == argc) {
-      model_type = argv[optind];
-      from_file = argv[optind + 1];
-      config.write_mmap = argv[optind + 2];
-    } else {
-      Usage(argv[0], default_mem);
-    }
-    if (!strcmp(model_type, "probing")) {
-      if (!set_write_method) config.write_method = Config::WRITE_AFTER;
-      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
-      if (rest) {
-        RestProbingModel(from_file, config);
-      } else {
-        ProbingModel(from_file, config);
-      }
-    } else if (!strcmp(model_type, "trie")) {
-      if (rest) {
-        std::cerr << "Rest + trie is not supported yet." << std::endl;
-        return 1;
-      }
-      if (!set_write_method) config.write_method = Config::WRITE_MMAP;
-      if (quantize) {
-        if (bhiksha) {
-          QuantArrayTrieModel(from_file, config);
-        } else {
-          QuantTrieModel(from_file, config);
-        }
-      } else {
-        if (bhiksha) {
-          ArrayTrieModel(from_file, config);
-        } else {
-          TrieModel(from_file, config);
-        }
-      }
-    } else {
-      Usage(argv[0], default_mem);
-    }
-  }
-  catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    std::cerr << "ERROR" << std::endl;
-    return 1;
-  }
-  std::cerr << "SUCCESS" << std::endl;
-  return 0;
-}
diff --git a/klm/lm/build_binary_main.cc b/klm/lm/build_binary_main.cc
new file mode 100644
index 00000000..ab2c0c32
--- /dev/null
+++ b/klm/lm/build_binary_main.cc
@@ -0,0 +1,228 @@
+#include "lm/model.hh"
+#include "lm/sizes.hh"
+#include "util/file_piece.hh"
+#include "util/usage.hh"
+
+#include <algorithm>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <iomanip>
+#include <limits>
+
+#include <math.h>
+#include <stdlib.h>
+
+#ifdef WIN32
+#include "util/getopt.hh"
+#else
+#include <unistd.h>
+#endif
+
+namespace lm {
+namespace ngram {
+namespace { // helpers private to this translation unit
+
+void Usage(const char *name, const char *default_mem) { // writes the help text to stderr, then exit(1); never returns
+  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
+"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
+"   Default is -100.  The ARPA file will always take precedence.\n"
+"-s allows models to be built even if they do not have <s> and </s>.\n"
+"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
+"-w mmap|after determines how writing is done.\n"
+"   mmap maps the binary file and writes to it.  Default for trie.\n"
+"   after allocates anonymous memory, builds, and writes.  Default for probing.\n"
+"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
+"   model files.  order1.arpa must be an ARPA file.  All others may be ARPA or\n"
+"   the same data structure as being built.  All files must have the same\n"
+"   vocabulary.  For probing, the unigrams must be in the same order.\n\n"
+"type is either probing or trie.  Default is probing.\n\n"
+"probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
+"-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
+"trie is a straightforward trie with bit-level packing.  It uses the least\n"
+"memory and is still faster than SRI or IRST.  Building the trie format uses an\n"
+"on-disk sort to save memory.\n"
+"-T is the temporary directory prefix.  Default is the output file name.\n"
+"-S determines memory use for sorting.  Default is " << default_mem << ".  This is compatible\n"
+"   with GNU sort.  The number is followed by a unit: \% for percent of physical\n"
+"   memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y.  \n"
+"   Default unit is K for Kilobytes.\n"
+"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
+"-b sets backoff quantization bits.  Requires -q and defaults to that value.\n"
+"-a compresses pointers using an array of offsets.  The parameter is the\n"
+"   maximum number of bits encoded by the array.  Memory is minimized subject\n"
+"   to the maximum, so pick 255 to minimize memory.\n\n"
+"Get a memory estimate by passing an ARPA file without an output file name.\n";
+  exit(1);
+}
+
+// I could really use boost::lexical_cast right about now.  
+float ParseFloat(const char *from) { // whole-string strtod parse, narrowed to float
+  char *end;
+  float ret = strtod(from, &end);
+  if (*end) throw util::ParseNumberException(from); // reject trailing characters; an empty string passes and yields 0
+  return ret;
+}
+unsigned long int ParseUInt(const char *from) { // whole-string base-10 strtoul parse
+  char *end;
+  unsigned long int ret = strtoul(from, &end, 10);
+  if (*end) throw util::ParseNumberException(from); // reject trailing characters; an empty string passes and yields 0
+  return ret;
+}
+
+uint8_t ParseBitCount(const char *from) { // parses a bit count for -q, -b and -a
+  unsigned long val = ParseUInt(from);
+  if (val > 25) {
+    util::ParseNumberException e(from);
+    e << " bit counts are limited to 25."; // NOTE(review): e is never thrown, so val > 25 still reaches the return; the -a help text suggests 255 - confirm the intended limit
+  }
+  return val; // implicit narrowing from unsigned long to uint8_t
+}
+
+void ParseFileList(const char *from, std::vector<std::string> &to) { // splits from on single spaces into to
+  to.clear(); // earlier contents of to are discarded
+  while (true) {
+    const char *i;
+    for (i = from; *i && *i != ' '; ++i) {} // advance i to the next space or the terminating NUL
+    to.push_back(std::string(from, i - from)); // may be empty: empty input or consecutive spaces give "" entries
+    if (!*i) break;
+    from = i + 1; // continue after the space
+  }
+}
+
+void ProbingQuantizationUnsupported() { // reports that quantization needs the trie type, then exit(1)
+  std::cerr << "Quantization is only implemented in the trie data structure." << std::endl;
+  exit(1);
+}
+
+} // namespace
+} // namespace ngram
+} // namespace lm
+
+int main(int argc, char *argv[]) { // parses options, then prints a size estimate or constructs the requested model type
+  using namespace lm::ngram;
+
+  const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; // default for -S: 80% when GuessPhysicalMemory() is nonzero, else 1G
+
+  try {
+    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
+    lm::ngram::Config config;
+    config.building_memory = util::ParseSize(default_mem);
+    int opt;
+    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) {
+      switch(opt) {
+        case 'q':
+          config.prob_bits = ParseBitCount(optarg);
+          if (!set_backoff_bits) config.backoff_bits = config.prob_bits; // backoff bits follow -q unless -b was already seen
+          quantize = true;
+          break;
+        case 'b':
+          config.backoff_bits = ParseBitCount(optarg);
+          set_backoff_bits = true;
+          break;
+        case 'a':
+          config.pointer_bhiksha_bits = ParseBitCount(optarg);
+          bhiksha = true;
+          break;
+        case 'u':
+          config.unknown_missing_logprob = ParseFloat(optarg);
+          break;
+        case 'p':
+          config.probing_multiplier = ParseFloat(optarg);
+          break;
+        case 't': // legacy
+        case 'T':
+          config.temporary_directory_prefix = optarg;
+          break;
+        case 'm': // legacy
+          config.building_memory = ParseUInt(optarg) * 1048576; // value is scaled by 2^20, i.e. given in megabytes
+          break;
+        case 'S':
+          config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg)); // clamp to what std::size_t can hold
+          break;
+        case 'w':
+          set_write_method = true;
+          if (!strcmp(optarg, "mmap")) {
+            config.write_method = Config::WRITE_MMAP;
+          } else if (!strcmp(optarg, "after")) {
+            config.write_method = Config::WRITE_AFTER;
+          } else {
+            Usage(argv[0], default_mem);
+          }
+          break;
+        case 's':
+          config.sentence_marker_missing = lm::SILENT;
+          break;
+        case 'i':
+          config.positive_log_probability = lm::SILENT;
+          break;
+        case 'r':
+          rest = true;
+          ParseFileList(optarg, config.rest_lower_files);
+          config.rest_function = Config::REST_LOWER;
+          break;
+        default:
+          Usage(argv[0], default_mem); // Usage() exits, so no break is needed
+      }
+    }
+    if (!quantize && set_backoff_bits) {
+      std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
+      abort(); // NOTE(review): this path aborts, while other argument errors exit(1) through Usage()
+    }
+    if (optind + 1 == argc) { // exactly one positional argument: report sizes for it and stop
+      ShowSizes(argv[optind], config);
+      return 0;
+    }
+    const char *model_type;
+    const char *from_file;
+
+    if (optind + 2 == argc) { // two positionals: input output, with the type defaulting to probing
+      model_type = "probing";
+      from_file = argv[optind];
+      config.write_mmap = argv[optind + 1];
+    } else if (optind + 3 == argc) { // three positionals: type input output
+      model_type = argv[optind];
+      from_file = argv[optind + 1];
+      config.write_mmap = argv[optind + 2];
+    } else {
+      Usage(argv[0], default_mem); // exits, so model_type and from_file are always set past this point
+    }
+    if (!strcmp(model_type, "probing")) {
+      if (!set_write_method) config.write_method = Config::WRITE_AFTER; // default write method for probing
+      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+      if (rest) {
+        RestProbingModel(from_file, config); // result is discarded; presumably construction builds and writes config.write_mmap - confirm in lm/model.hh
+      } else {
+        ProbingModel(from_file, config);
+      }
+    } else if (!strcmp(model_type, "trie")) {
+      if (rest) {
+        std::cerr << "Rest + trie is not supported yet." << std::endl;
+        return 1;
+      }
+      if (!set_write_method) config.write_method = Config::WRITE_MMAP; // default write method for trie
+      if (quantize) { // four trie variants, selected by -q (quantize) and -a (bhiksha)
+        if (bhiksha) {
+          QuantArrayTrieModel(from_file, config);
+        } else {
+          QuantTrieModel(from_file, config);
+        }
+      } else {
+        if (bhiksha) {
+          ArrayTrieModel(from_file, config);
+        } else {
+          TrieModel(from_file, config);
+        }
+      }
+    } else {
+      Usage(argv[0], default_mem); // unrecognized type name
+    }
+  }
+  catch (const std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    std::cerr << "ERROR" << std::endl;
+    return 1;
+  }
+  std::cerr << "SUCCESS" << std::endl; // reached only when nothing above threw or returned early
+  return 0;
+}
diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am
index b5c147fd..317e03ce 100644
--- a/klm/lm/builder/Makefile.am
+++ b/klm/lm/builder/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = builder
 
 builder_SOURCES = \
-  main.cc \
+  lmplz_main.cc \
   adjust_counts.cc \
   adjust_counts.hh \
   corpus_count.cc \
diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh
index 754fb20d..4d0aa4fd 100644
--- a/klm/lm/builder/discount.hh
+++ b/klm/lm/builder/discount.hh
@@ -3,7 +3,7 @@
 
 #include <algorithm>
 
-#include <inttypes.h>
+#include <stdint.h>
 
 namespace lm {
 namespace builder {
diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc
new file mode 100644
index 00000000..90b9dca2
--- /dev/null
+++ b/klm/lm/builder/lmplz_main.cc
@@ -0,0 +1,94 @@
+#include "lm/builder/pipeline.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include "util/usage.hh"
+
+#include <iostream>
+
+#include <boost/program_options.hpp>
+
+namespace { // helpers private to this translation unit
+class SizeNotify { // callable that parses a size string with util::ParseSize and stores the result through a reference
+  public:
+    SizeNotify(std::size_t &out) : behind_(out) {} // keeps a reference, so out must outlive this object and its copies
+
+    void operator()(const std::string &from) {
+      behind_ = util::ParseSize(from); // overwrite the bound variable with the parsed size
+    }
+
+  private:
+    std::size_t &behind_; // destination written by operator()
+};
+
+boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) { // string-valued option whose notifier is SizeNotify(to); default_value is given as text
+  return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
+}
+
+} // namespace
+
+int main(int argc, char *argv[]) { // parses options, then runs lm::builder::Pipeline on file descriptors 0 and 1
+  try {
+    namespace po = boost::program_options;
+    po::options_description options("Language model building options");
+    lm::builder::PipelineConfig pipeline; // option values are stored directly into this config
+
+    options.add_options()
+      ("order,o", po::value<std::size_t>(&pipeline.order)->required(), "Order of the model")
+      ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
+      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
+      ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
+      ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step")
+      ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
+      ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
+      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
+      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
+      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
+    if (argc == 1) { // no arguments at all: print the description and option list to stderr, then return 1
+      std::cerr << 
+        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
+        "Please cite:\n"
+        "@inproceedings{kenlm,\n"
+        "author    = {Kenneth Heafield},\n"
+        "title     = {{KenLM}: Faster and Smaller Language Model Queries},\n"
+        "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
+        "month     = {July}, year={2011},\n"
+        "address   = {Edinburgh, UK},\n"
+        "publisher = {Association for Computational Linguistics},\n"
+        "}\n\n"
+        "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
+        "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
+        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
+        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
+        "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
+        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).\n\n";
+      std::cerr << options << std::endl;
+      return 1;
+    }
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, options), vm);
+    po::notify(vm); // per Boost.Program_options, this runs the notifiers (the SizeOption callbacks) and enforces required()
+
+    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
+
+    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
+    // TODO: evaluate options for these.  
+    initial.adder_in.total_memory = 32768;
+    initial.adder_in.block_count = 2;
+    initial.adder_out.total_memory = 32768;
+    initial.adder_out.block_count = 2;
+    pipeline.read_backoffs = initial.adder_out; // read_backoffs reuses the adder_out settings assigned just above
+
+    // Read from stdin
+    try {
+      lm::builder::Pipeline(pipeline, 0, 1); // 0 and 1 are the stdin and stdout descriptors named in the help text
+    } catch (const util::MallocException &e) { // allocation failure: suggest a smaller -S and echo the value that was used
+      std::cerr << e.what() << std::endl;
+      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
+      return 1;
+    }
+    util::PrintUsage(std::cerr);
+  } catch (const std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+} // falling off the end of main returns 0 on the success path
diff --git a/klm/lm/builder/main.cc b/klm/lm/builder/main.cc
deleted file mode 100644
index 90b9dca2..00000000
--- a/klm/lm/builder/main.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "lm/builder/pipeline.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <iostream>
-
-#include <boost/program_options.hpp>
-
-namespace {
-class SizeNotify {
-  public:
-    SizeNotify(std::size_t &out) : behind_(out) {}
-
-    void operator()(const std::string &from) {
-      behind_ = util::ParseSize(from);
-    }
-
-  private:
-    std::size_t &behind_;
-};
-
-boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
-  return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
-}
-
-} // namespace
-
-int main(int argc, char *argv[]) {
-  try {
-    namespace po = boost::program_options;
-    po::options_description options("Language model building options");
-    lm::builder::PipelineConfig pipeline;
-
-    options.add_options()
-      ("order,o", po::value<std::size_t>(&pipeline.order)->required(), "Order of the model")
-      ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
-      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
-      ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
-      ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step")
-      ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
-      ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
-      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
-      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
-      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
-    if (argc == 1) {
-      std::cerr << 
-        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
-        "Please cite:\n"
-        "@inproceedings{kenlm,\n"
-        "author    = {Kenneth Heafield},\n"
-        "title     = {{KenLM}: Faster and Smaller Language Model Queries},\n"
-        "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
-        "month     = {July}, year={2011},\n"
-        "address   = {Edinburgh, UK},\n"
-        "publisher = {Association for Computational Linguistics},\n"
-        "}\n\n"
-        "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
-        "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
-        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
-        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
-        "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
-        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).\n\n";
-      std::cerr << options << std::endl;
-      return 1;
-    }
-    po::variables_map vm;
-    po::store(po::parse_command_line(argc, argv, options), vm);
-    po::notify(vm);
-
-    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
-
-    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
-    // TODO: evaluate options for these.  
-    initial.adder_in.total_memory = 32768;
-    initial.adder_in.block_count = 2;
-    initial.adder_out.total_memory = 32768;
-    initial.adder_out.block_count = 2;
-    pipeline.read_backoffs = initial.adder_out;
-
-    // Read from stdin
-    try {
-      lm::builder::Pipeline(pipeline, 0, 1);
-    } catch (const util::MallocException &e) {
-      std::cerr << e.what() << std::endl;
-      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
-      return 1;
-    }
-    util::PrintUsage(std::cerr);
-  } catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    return 1;
-  }
-}
diff --git a/klm/lm/filter/filter_main.cc b/klm/lm/filter/filter_main.cc
new file mode 100644
index 00000000..1a4ba84f
--- /dev/null
+++ b/klm/lm/filter/filter_main.cc
@@ -0,0 +1,248 @@
+#include "lm/filter/arpa_io.hh"
+#include "lm/filter/format.hh"
+#include "lm/filter/phrase.hh"
+#ifndef NTHREAD
+#include "lm/filter/thread.hh"
+#endif
+#include "lm/filter/vocab.hh"
+#include "lm/filter/wrapper.hh"
+#include "util/file_piece.hh"
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+namespace lm {
+namespace {
+
+void DisplayHelp(const char *name) { // Print the usage text to stderr; name is the program name shown on the Usage line.
+  std::cerr
+    << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n"
+    "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n"
+    "    parser.\n"
+    "single mode treats the entire input as a single sentence.\n"
+    "multiple mode filters to multiple sentences in parallel.  Each sentence is on\n"
+    "    a separate line.  A separate file is created for each sentence by appending\n"
+    "    the 0-indexed line number to the output file name.\n"
+    "union mode produces one filtered model that is the union of models created by\n"
+    "    multiple mode.\n\n"
+    "context means only the context (all but last word) has to pass the filter, but\n"
+    "    the entire n-gram is output.\n\n"
+    "phrase means that the vocabulary is actually tab-delimited phrases and that the\n"
+    "    phrases can generate the n-gram when assembled in arbitrary order and\n"
+    "    clipped.  Currently works with multiple or union mode.\n\n"
+    "The file format is set by [raw|arpa] with default arpa:\n"
+    "raw means space-separated tokens, optionally followed by a tab and arbitrary\n"
+    "    text.  This is useful for ngram count files.\n"
+    "arpa means the ARPA file format for n-gram language models.\n\n"
+#ifndef NTHREAD
+    "threads:m sets m threads (default: concurrency detected by boost)\n"
+    "batch_size:m sets the batch size for threading.  Expect memory usage from this\n"
+    "    of 2*threads*batch_size n-grams.\n\n"
+#else
+    "This binary was compiled with -DNTHREAD, disabling threading.  If you wanted\n"
+    "    threading, compile without this flag against Boost >=1.42.0.\n\n"
+#endif
+    "There are two inputs: vocabulary and model.  Either may be given as a file\n"
+    "    while the other is on stdin.  Specify the type given as a file using\n"
+    "    vocab: or model: before the file name.  \n\n"
+    "For ARPA format, the output must be seekable.  For raw format, it can be a\n"
+    "    stream i.e. /dev/stdout\n";
+}
+
+typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; // Mode keyword from the command line; MODE_UNSET marks "no mode keyword seen yet".
+typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; // File format: "arpa" selects FORMAT_ARPA, "raw" selects FORMAT_COUNT.
+
+struct Config { // Settings for one filter run; the constructor supplies defaults and main() overrides them from argv.
+  Config() : 
+#ifndef NTHREAD
+  batch_size(25000),
+  threads(boost::thread::hardware_concurrency()),
+#endif
+  phrase(false),
+  context(false),
+  mode(MODE_UNSET), // was left indeterminate before; callers must still pick a real mode
+  format(FORMAT_ARPA) {
+#ifndef NTHREAD
+    if (!threads) threads = 1; // hardware_concurrency() gave 0, so fall back to a single thread
+#endif
+  }
+
+#ifndef NTHREAD
+  size_t batch_size; // batch size handed to the threaded Controller (batch_size:m)
+  size_t threads;    // number of threads (threads:m); never 0 after construction
+#endif
+  bool phrase;       // set by the "phrase" keyword
+  bool context;      // set by the "context" keyword
+  FilterMode mode;   // copy/single/multiple/union, or MODE_UNSET until one is chosen
+  Format format;     // FORMAT_ARPA unless "raw" was given
+};
+
+// Run filter over in_lm: through a threaded Controller when more than one
+// thread is configured, otherwise directly (always directly under NTHREAD).
+template <class Format, class Filter, class OutputBuffer, class Output> void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) {
+#ifndef NTHREAD
+  if (config.threads != 1) {
+    typedef Controller<Filter, OutputBuffer, Output> ThreadedFilter;
+    ThreadedFilter controller(config.batch_size, config.threads * 2, config.threads, filter, output);
+    Format::RunFilter(in_lm, controller, output);
+    return;
+  }
+#endif
+  Format::RunFilter(in_lm, filter, output);
+}
+
+template <class Format, class Filter, class OutputBuffer, class Output> void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { // filter is a by-value copy owned by this call
+  if (!config.context) {
+    RunThreadedFilter<Format, Filter, OutputBuffer, Output>(config, in_lm, filter, output);
+    return;
+  }
+  ContextFilter<Filter> wrapped(filter); // "context" keyword given: run the ContextFilter wrapper instead of the bare filter
+  RunThreadedFilter<Format, ContextFilter<Filter>, OutputBuffer, Output>(config, in_lm, wrapped, output);
+}
+
+template <class Format, class Binary> void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) {
+  // Wrap binary in a BinaryFilter and pass it on to the shared context/threading dispatch.
+  RunContextFilter<Format, BinaryFilter<Binary>, BinaryOutputBuffer, typename Format::Output>(config, in_lm, BinaryFilter<Binary>(binary), out);
+}
+
+template <class Format> void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { // Build the filter and output matching config.mode, then run it over in_lm.
+  if (config.mode == MODE_MULTIPLE) { // multiple mode writes through Format::Multiple rather than Format::Output
+    if (config.phrase) {
+      typedef phrase::Multiple Filter;
+      phrase::Substrings substrings;
+      typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); // ReadMultiple's result is the second constructor argument (presumably the sentence count -- confirm)
+      RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(substrings), out);
+    } else {
+      typedef vocab::Multiple Filter;
+      boost::unordered_map<std::string, std::vector<unsigned int> > words;
+      typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); // same pattern as the phrase branch, with the vocab reader
+      RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(words), out);
+    }
+    return;
+  }
+
+  typename Format::Output out(out_name); // shared by copy, single and union; constructed before any vocabulary is read
+
+  if (config.mode == MODE_COPY) { // copy mode never touches in_vocab
+    Format::Copy(in_lm, out);
+    return;
+  }
+
+  if (config.mode == MODE_SINGLE) {
+    vocab::Single::Words words;
+    vocab::ReadSingle(in_vocab, words);
+    DispatchBinaryFilter<Format, vocab::Single>(config, in_lm, vocab::Single(words), out);
+    return;
+  }
+
+  if (config.mode == MODE_UNION) {
+    if (config.phrase) {
+      phrase::Substrings substrings;
+      phrase::ReadMultiple(in_vocab, substrings); // return value deliberately unused in union mode
+      DispatchBinaryFilter<Format, phrase::Union>(config, in_lm, phrase::Union(substrings), out);
+    } else {
+      vocab::Union::Words words;
+      vocab::ReadMultiple(in_vocab, words); // return value deliberately unused in union mode
+      DispatchBinaryFilter<Format, vocab::Union>(config, in_lm, vocab::Union(words), out);
+    }
+    return;
+  }
+} // any other mode value (e.g. MODE_UNSET) falls through after out has been constructed, with no filtering done
+
+} // namespace
+} // namespace lm
+
+int main(int argc, char *argv[]) { // Parse the keyword arguments, open the vocabulary and model inputs, then dispatch on format.
+  if (argc < 4) { // fewer than three arguments after the program name
+    lm::DisplayHelp(argv[0]);
+    return 1;
+  }
+
+  // I used to have boost::program_options, but some users didn't want to compile boost.
+  lm::Config config;
+  config.mode = lm::MODE_UNSET;
+  for (int i = 1; i < argc - 2; ++i) { // the last two arguments are the input and output names
+    const char *str = argv[i];
+    if (!std::strcmp(str, "copy")) {
+      config.mode = lm::MODE_COPY;
+    } else if (!std::strcmp(str, "single")) {
+      config.mode = lm::MODE_SINGLE;
+    } else if (!std::strcmp(str, "multiple")) {
+      config.mode = lm::MODE_MULTIPLE;
+    } else if (!std::strcmp(str, "union")) {
+      config.mode = lm::MODE_UNION;
+    } else if (!std::strcmp(str, "phrase")) {
+      config.phrase = true;
+    } else if (!std::strcmp(str, "context")) {
+      config.context = true;
+    } else if (!std::strcmp(str, "arpa")) {
+      config.format = lm::FORMAT_ARPA;
+    } else if (!std::strcmp(str, "raw")) {
+      config.format = lm::FORMAT_COUNT;
+#ifndef NTHREAD
+    } else if (!std::strncmp(str, "threads:", 8)) {
+      config.threads = boost::lexical_cast<size_t>(str + 8); // NOTE(review): lexical_cast throws on malformed numbers and nothing here catches it -- confirm
+      if (!config.threads) {
+        std::cerr << "Specify at least one thread." << std::endl;
+        return 1;
+      }
+    } else if (!std::strncmp(str, "batch_size:", 11)) {
+      config.batch_size = boost::lexical_cast<size_t>(str + 11);
+      if (config.batch_size < 5000) { // small batches only warn; zero is rejected
+        std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
+        if (!config.batch_size) return 1;
+      }
+#endif
+    } else { // unknown keyword
+      lm::DisplayHelp(argv[0]);
+      return 1;
+    }
+  }
+  
+  if (config.mode == lm::MODE_UNSET) { // a mode keyword is mandatory
+    lm::DisplayHelp(argv[0]);
+    return 1;
+  }
+
+  if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
+    std::cerr << "Phrase constraint currently only works in multiple or union mode.  If you really need it for single, put everything on one line and use union." << std::endl;
+    return 1;
+  }
+
+  bool cmd_is_model = true; // true: the named file is the model and the vocabulary comes from std::cin
+  const char *cmd_input = argv[argc - 2];
+  if (!std::strncmp(cmd_input, "vocab:", 6)) { // std:: qualified to match <cstring> and the calls above
+    cmd_is_model = false;
+    cmd_input += 6;
+  } else if (!std::strncmp(cmd_input, "model:", 6)) {
+    cmd_input += 6;
+  } else if (std::strchr(cmd_input, ':')) { // some other prefix: refuse rather than guess
+    errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
+  } else {
+    std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
+  }
+  std::ifstream cmd_file;
+  std::istream *vocab;
+  if (cmd_is_model) {
+    vocab = &std::cin;
+  } else {
+    cmd_file.open(cmd_input, std::ios::in);
+    if (!cmd_file) {
+      err(2, "Could not open input file %s", cmd_input);
+    }
+    vocab = &cmd_file;
+  }
+
+  util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); // model from the named file, otherwise from descriptor 0
+
+  if (config.format == lm::FORMAT_ARPA) {
+    lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
+  } else if (config.format == lm::FORMAT_COUNT) {
+    lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
+  }
+  return 0;
+}
diff --git a/klm/lm/filter/main.cc b/klm/lm/filter/main.cc
deleted file mode 100644
index c42243e2..00000000
--- a/klm/lm/filter/main.cc
+++ /dev/null
@@ -1,249 +0,0 @@
-#include "lm/filter/arpa_io.hh"
-#include "lm/filter/format.hh"
-#include "lm/filter/phrase.hh"
-#ifndef NTHREAD
-#include "lm/filter/thread.hh"
-#endif
-#include "lm/filter/vocab.hh"
-#include "lm/filter/wrapper.hh"
-#include "util/file_piece.hh"
-
-#include <boost/ptr_container/ptr_vector.hpp>
-
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <memory>
-
-namespace lm {
-namespace {
-
-void DisplayHelp(const char *name) {
-  std::cerr
-    << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n"
-    "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n"
-    "    parser.\n"
-    "single mode treats the entire input as a single sentence.\n"
-    "multiple mode filters to multiple sentences in parallel.  Each sentence is on\n"
-    "    a separate line.  A separate file is created for each file by appending the\n"
-    "    0-indexed line number to the output file name.\n"
-    "union mode produces one filtered model that is the union of models created by\n"
-    "    multiple mode.\n\n"
-    "context means only the context (all but last word) has to pass the filter, but\n"
-    "    the entire n-gram is output.\n\n"
-    "phrase means that the vocabulary is actually tab-delimited phrases and that the\n"
-    "    phrases can generate the n-gram when assembled in arbitrary order and\n"
-    "    clipped.  Currently works with multiple or union mode.\n\n"
-    "The file format is set by [raw|arpa] with default arpa:\n"
-    "raw means space-separated tokens, optionally followed by a tab and arbitrary\n"
-    "    text.  This is useful for ngram count files.\n"
-    "arpa means the ARPA file format for n-gram language models.\n\n"
-#ifndef NTHREAD
-    "threads:m sets m threads (default: conccurrency detected by boost)\n"
-    "batch_size:m sets the batch size for threading.  Expect memory usage from this\n"
-    "    of 2*threads*batch_size n-grams.\n\n"
-#else
-    "This binary was compiled with -DNTHREAD, disabling threading.  If you wanted\n"
-    "    threading, compile without this flag against Boost >=1.42.0.\n\n"
-#endif
-    "There are two inputs: vocabulary and model.  Either may be given as a file\n"
-    "    while the other is on stdin.  Specify the type given as a file using\n"
-    "    vocab: or model: before the file name.  \n\n"
-    "For ARPA format, the output must be seekable.  For raw format, it can be a\n"
-    "    stream i.e. /dev/stdout\n";
-}
-
-typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION} FilterMode;
-typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
-
-struct Config {
-  Config() : 
-#ifndef NTHREAD
-  batch_size(25000),
-  threads(boost::thread::hardware_concurrency()),
-#endif
-  phrase(false),
-  context(false),
-  format(FORMAT_ARPA)
-  {
-#ifndef NTHREAD
-    if (!threads) threads = 1;
-#endif
-  }
-
-#ifndef NTHREAD
-  size_t batch_size;
-  size_t threads;
-#endif
-  bool phrase;
-  bool context;
-  FilterMode mode;
-  Format format;
-};
-
-template <class Format, class Filter, class OutputBuffer, class Output> void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) {
-#ifndef NTHREAD
-  if (config.threads == 1) {
-#endif
-    Format::RunFilter(in_lm, filter, output);
-#ifndef NTHREAD
-  } else {
-    typedef Controller<Filter, OutputBuffer, Output> Threaded;
-    Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output);
-    Format::RunFilter(in_lm, threading, output);
-  }
-#endif
-}
-
-template <class Format, class Filter, class OutputBuffer, class Output> void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) {
-  if (config.context) {
-    ContextFilter<Filter> context_filter(filter);
-    RunThreadedFilter<Format, ContextFilter<Filter>, OutputBuffer, Output>(config, in_lm, context_filter, output);
-  } else {
-    RunThreadedFilter<Format, Filter, OutputBuffer, Output>(config, in_lm, filter, output);
-  }
-}
-
-template <class Format, class Binary> void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) {
-  typedef BinaryFilter<Binary> Filter;
-  RunContextFilter<Format, Filter, BinaryOutputBuffer, typename Format::Output>(config, in_lm, Filter(binary), out);
-}
-
-template <class Format> void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) {
-  if (config.mode == MODE_MULTIPLE) {
-    if (config.phrase) {
-      typedef phrase::Multiple Filter;
-      phrase::Substrings substrings;
-      typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings));
-      RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(substrings), out);
-    } else {
-      typedef vocab::Multiple Filter;
-      boost::unordered_map<std::string, std::vector<unsigned int> > words;
-      typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words));
-      RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(words), out);
-    }
-    return;
-  }
-
-  typename Format::Output out(out_name);
-
-  if (config.mode == MODE_COPY) {
-    Format::Copy(in_lm, out);
-    return;
-  }
-
-  if (config.mode == MODE_SINGLE) {
-    vocab::Single::Words words;
-    vocab::ReadSingle(in_vocab, words);
-    DispatchBinaryFilter<Format, vocab::Single>(config, in_lm, vocab::Single(words), out);
-    return;
-  }
-
-  if (config.mode == MODE_UNION) {
-    if (config.phrase) {
-      phrase::Substrings substrings;
-      phrase::ReadMultiple(in_vocab, substrings);
-      DispatchBinaryFilter<Format, phrase::Union>(config, in_lm, phrase::Union(substrings), out);
-    } else {
-      vocab::Union::Words words;
-      vocab::ReadMultiple(in_vocab, words);
-      DispatchBinaryFilter<Format, vocab::Union>(config, in_lm, vocab::Union(words), out);
-    }
-    return;
-  }
-}
-
-} // namespace
-} // namespace lm
-
-int main(int argc, char *argv[]) {
-  if (argc < 4) {
-    lm::DisplayHelp(argv[0]);
-    return 1;
-  }
-
-  // I used to have boost::program_options, but some users didn't want to compile boost.  
-  lm::Config config;
-  boost::optional<lm::FilterMode> mode;
-  for (int i = 1; i < argc - 2; ++i) {
-    const char *str = argv[i];
-    if (!std::strcmp(str, "copy")) {
-      mode = lm::MODE_COPY;
-    } else if (!std::strcmp(str, "single")) {
-      mode = lm::MODE_SINGLE;
-    } else if (!std::strcmp(str, "multiple")) {
-      mode = lm::MODE_MULTIPLE;
-    } else if (!std::strcmp(str, "union")) {
-      mode = lm::MODE_UNION;
-    } else if (!std::strcmp(str, "phrase")) {
-      config.phrase = true;
-    } else if (!std::strcmp(str, "context")) {
-      config.context = true;
-    } else if (!std::strcmp(str, "arpa")) {
-      config.format = lm::FORMAT_ARPA;
-    } else if (!std::strcmp(str, "raw")) {
-      config.format = lm::FORMAT_COUNT;
-#ifndef NTHREAD
-    } else if (!std::strncmp(str, "threads:", 8)) {
-      config.threads = boost::lexical_cast<size_t>(str + 8);
-      if (!config.threads) {
-        std::cerr << "Specify at least one thread." << std::endl;
-        return 1;
-      }
-    } else if (!std::strncmp(str, "batch_size:", 11)) {
-      config.batch_size = boost::lexical_cast<size_t>(str + 11);
-      if (config.batch_size < 5000) {
-        std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
-        if (!config.batch_size) return 1;
-      }
-#endif
-    } else {
-      lm::DisplayHelp(argv[0]);
-      return 1;
-    }
-  }
-  
-  if (!mode) {
-    lm::DisplayHelp(argv[0]);
-    return 1;
-  }
-  config.mode = *mode;
-
-  if (config.phrase && config.mode != lm::MODE_UNION && mode != lm::MODE_MULTIPLE) {
-    std::cerr << "Phrase constraint currently only works in multiple or union mode.  If you really need it for single, put everything on one line and use union." << std::endl;
-    return 1;
-  }
-
-  bool cmd_is_model = true;
-  const char *cmd_input = argv[argc - 2];
-  if (!strncmp(cmd_input, "vocab:", 6)) {
-    cmd_is_model = false;
-    cmd_input += 6;
-  } else if (!strncmp(cmd_input, "model:", 6)) {
-    cmd_input += 6;
-  } else if (strchr(cmd_input, ':')) {
-    errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
-  } else {
-    std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
-  }
-  std::ifstream cmd_file;
-  std::istream *vocab;
-  if (cmd_is_model) {
-    vocab = &std::cin;
-  } else {
-    cmd_file.open(cmd_input, std::ios::in);
-    if (!cmd_file) {
-      err(2, "Could not open input file %s", cmd_input);
-    }
-    vocab = &cmd_file;
-  }
-
-  util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
-
-  if (config.format == lm::FORMAT_ARPA) {
-    lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
-  } else if (config.format == lm::FORMAT_COUNT) {
-    lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
-  }
-  return 0;
-}
diff --git a/klm/lm/filter/phrase.hh b/klm/lm/filter/phrase.hh
index 07479dea..b4edff41 100644
--- a/klm/lm/filter/phrase.hh
+++ b/klm/lm/filter/phrase.hh
@@ -57,6 +57,7 @@ class Substrings {
     LM_FILTER_PHRASE_METHOD(Right, right)
     LM_FILTER_PHRASE_METHOD(Phrase, phrase)
 
+#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
     // sentence_id must be non-decreasing.  Iterators are over words in the phrase.  
     template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
       // Iterate over all substrings.  
diff --git a/klm/lm/filter/vocab.hh b/klm/lm/filter/vocab.hh
index e2b6adff..7f0fadaa 100644
--- a/klm/lm/filter/vocab.hh
+++ b/klm/lm/filter/vocab.hh
@@ -5,6 +5,7 @@
 
 #include "util/multi_intersection.hh"
 #include "util/string_piece.hh"
+#include "util/string_piece_hash.hh"
 #include "util/tokenize_piece.hh"
 
 #include <boost/noncopyable.hpp>
diff --git a/klm/lm/fragment.cc b/klm/lm/fragment.cc
deleted file mode 100644
index 0267cd4e..00000000
--- a/klm/lm/fragment.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "lm/binary_format.hh"
-#include "lm/model.hh"
-#include "lm/left.hh"
-#include "util/tokenize_piece.hh"
-
-template <class Model> void Query(const char *name) {
-  Model model(name);
-  std::string line;
-  lm::ngram::ChartState ignored;
-  while (getline(std::cin, line)) {
-    lm::ngram::RuleScore<Model> scorer(model, ignored);
-    for (util::TokenIter<util::SingleCharacter, true> i(line, ' '); i; ++i) {
-      scorer.Terminal(model.GetVocabulary().Index(*i));
-    }
-    std::cout << scorer.Finish() << '\n';
-  }
-}
-
-int main(int argc, char *argv[]) {
-  if (argc != 2) {
-    std::cerr << "Expected model file name." << std::endl;
-    return 1;
-  }
-  const char *name = argv[1];
-  lm::ngram::ModelType model_type = lm::ngram::PROBING;
-  lm::ngram::RecognizeBinary(name, model_type);
-  switch (model_type) {
-    case lm::ngram::PROBING:
-      Query<lm::ngram::ProbingModel>(name);
-      break;
-    case lm::ngram::REST_PROBING:
-      Query<lm::ngram::RestProbingModel>(name);
-      break;
-    default:
-      std::cerr << "Model type not supported yet." << std::endl;
-  }
-}
diff --git a/klm/lm/fragment_main.cc b/klm/lm/fragment_main.cc
new file mode 100644
index 00000000..0267cd4e
--- /dev/null
+++ b/klm/lm/fragment_main.cc
@@ -0,0 +1,37 @@
+#include "lm/binary_format.hh"
+#include "lm/model.hh"
+#include "lm/left.hh"
+#include "util/tokenize_piece.hh"
+
+// Load the model called name, then print one RuleScore result per line of stdin.
+template <class Model> void Query(const char *name) {
+  Model model(name);
+  std::string sentence;
+  lm::ngram::ChartState scratch; // handed to every RuleScore; never inspected here
+  while (getline(std::cin, sentence)) {
+    lm::ngram::RuleScore<Model> scorer(model, scratch);
+    util::TokenIter<util::SingleCharacter, true> word(sentence, ' ');
+    for (; word; ++word) scorer.Terminal(model.GetVocabulary().Index(*word));
+    std::cout << scorer.Finish() << '\n';
+  }
+}
+
+int main(int argc, char *argv[]) { // Takes exactly one argument, the model file; input lines come from stdin.
+  if (argc != 2) {
+    std::cerr << "Expected model file name." << std::endl;
+    return 1;
+  }
+  lm::ngram::ModelType model_type = lm::ngram::PROBING; // default kept when RecognizeBinary leaves it untouched
+  lm::ngram::RecognizeBinary(argv[1], model_type); // return value unused; only model_type matters here
+  switch (model_type) {
+    case lm::ngram::PROBING:
+      Query<lm::ngram::ProbingModel>(argv[1]);
+      break;
+    case lm::ngram::REST_PROBING:
+      Query<lm::ngram::RestProbingModel>(argv[1]);
+      break;
+    default:
+      std::cerr << "Model type not supported yet." << std::endl;
+      return 1; // signal the failure instead of exiting with status 0
+  }
+}
diff --git a/klm/lm/kenlm_max_order_main.cc b/klm/lm/kenlm_max_order_main.cc
new file mode 100644
index 00000000..94221201
--- /dev/null
+++ b/klm/lm/kenlm_max_order_main.cc
@@ -0,0 +1,6 @@
+#include "lm/max_order.hh"
+#include <iostream>
+
+int main(int argc, char *argv[]) { // argc and argv are not used; the program only reports KENLM_MAX_ORDER.
+  std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; // written to stderr, not stdout
+}
diff --git a/klm/lm/max_order.cc b/klm/lm/max_order.cc
deleted file mode 100644
index 94221201..00000000
--- a/klm/lm/max_order.cc
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "lm/max_order.hh"
-#include <iostream>
-
-int main(int argc, char *argv[]) {
-  std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl;
-}
diff --git a/klm/lm/ngram_query.cc b/klm/lm/ngram_query.cc
deleted file mode 100644
index 49757d9a..00000000
--- a/klm/lm/ngram_query.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-#include "lm/ngram_query.hh"
-
-int main(int argc, char *argv[]) {
-  if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) {
-    std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl;
-    std::cerr << "Input is wrapped in <s> and </s> unless null is passed." << std::endl;
-    return 1;
-  }
-  try {
-    bool sentence_context = (argc == 2);
-    using namespace lm::ngram;
-    ModelType model_type;
-    if (RecognizeBinary(argv[1], model_type)) {
-      switch(model_type) {
-        case PROBING:
-          Query<lm::ngram::ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
-          break;
-        case REST_PROBING:
-          Query<lm::ngram::RestProbingModel>(argv[1], sentence_context, std::cin, std::cout);
-          break;
-        case TRIE:
-          Query<TrieModel>(argv[1], sentence_context, std::cin, std::cout);
-          break;
-        case QUANT_TRIE:
-          Query<QuantTrieModel>(argv[1], sentence_context, std::cin, std::cout);
-          break;
-        case ARRAY_TRIE:
-          Query<ArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
-          break;
-        case QUANT_ARRAY_TRIE:
-          Query<QuantArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
-          break;
-        default:
-          std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
-          abort();
-      }
-    } else {
-      Query<ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
-    }
-    std::cerr << "Total time including destruction:\n";
-    util::PrintUsage(std::cerr);
-  } catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    return 1;
-  }
-  return 0;
-}
diff --git a/klm/lm/query_main.cc b/klm/lm/query_main.cc
new file mode 100644
index 00000000..49757d9a
--- /dev/null
+++ b/klm/lm/query_main.cc
@@ -0,0 +1,47 @@
+#include "lm/ngram_query.hh"
+
+int main(int argc, char *argv[]) { // Usage: lm_file [null]; queries are read from stdin and results go to stdout.
+  if (argc != 2 && !(argc == 3 && !strcmp(argv[2], "null"))) {
+    std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl;
+    std::cerr << "Input is wrapped in <s> and </s> unless null is passed." << std::endl;
+    return 1;
+  }
+  try {
+    using namespace lm::ngram;
+    bool wrap_in_sentence = (argc == 2); // false when the optional "null" argument was given
+    ModelType detected;
+    if (!RecognizeBinary(argv[1], detected)) {
+      Query<ProbingModel>(argv[1], wrap_in_sentence, std::cin, std::cout); // not recognized as binary: load with ProbingModel
+    } else {
+      switch (detected) {
+        case PROBING:
+          Query<ProbingModel>(argv[1], wrap_in_sentence, std::cin, std::cout);
+          break;
+        case REST_PROBING:
+          Query<RestProbingModel>(argv[1], wrap_in_sentence, std::cin, std::cout);
+          break;
+        case TRIE:
+          Query<TrieModel>(argv[1], wrap_in_sentence, std::cin, std::cout);
+          break;
+        case QUANT_TRIE:
+          Query<QuantTrieModel>(argv[1], wrap_in_sentence, std::cin, std::cout);
+          break;
+        case ARRAY_TRIE:
+          Query<ArrayTrieModel>(argv[1], wrap_in_sentence, std::cin, std::cout);
+          break;
+        case QUANT_ARRAY_TRIE:
+          Query<QuantArrayTrieModel>(argv[1], wrap_in_sentence, std::cin, std::cout);
+          break;
+        default:
+          std::cerr << "Unrecognized kenlm model type " << detected << std::endl;
+          abort();
+      }
+    }
+    std::cerr << "Total time including destruction:\n";
+    util::PrintUsage(std::cerr);
+  } catch (const std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+  return 0;
+}
-- 
cgit v1.2.3