From 516c132fb683b5bf77ae3230a1b3709beb57618e Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Tue, 22 Jan 2013 21:37:49 +0000
Subject: KenLM 58da338b

---
 klm/lm/builder/Makefile.am   |  2 +-
 klm/lm/builder/discount.hh   |  2 +-
 klm/lm/builder/lmplz_main.cc | 94 ++++++++++++++++++++++++++++++++++++++++++++
 klm/lm/builder/main.cc       | 94 --------------------------------------------
 4 files changed, 96 insertions(+), 96 deletions(-)
 create mode 100644 klm/lm/builder/lmplz_main.cc
 delete mode 100644 klm/lm/builder/main.cc

(limited to 'klm/lm/builder')
diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am
index b5c147fd..317e03ce 100644
--- a/klm/lm/builder/Makefile.am
+++ b/klm/lm/builder/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = builder
 
 builder_SOURCES = \
-  main.cc \
+  lmplz_main.cc \
   adjust_counts.cc \
   adjust_counts.hh \
   corpus_count.cc \
diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh
index 754fb20d..4d0aa4fd 100644
--- a/klm/lm/builder/discount.hh
+++ b/klm/lm/builder/discount.hh
@@ -3,7 +3,7 @@
 
 #include <algorithm>
 
-#include <inttypes.h>
+#include <stdint.h>
 
 namespace lm {
 namespace builder {
diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc
new file mode 100644
index 00000000..90b9dca2
--- /dev/null
+++ b/klm/lm/builder/lmplz_main.cc
@@ -0,0 +1,94 @@
+#include "lm/builder/pipeline.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include "util/usage.hh"
+
+#include <iostream>
+
+#include <boost/program_options.hpp>
+
+namespace {
+class SizeNotify {
+  public:
+    SizeNotify(std::size_t &out) : behind_(out) {}
+
+    void operator()(const std::string &from) {
+      behind_ = util::ParseSize(from);
+    }
+
+  private:
+    std::size_t &behind_;
+};
+
+boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
+  return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
+}
+
+} // namespace
+
+int main(int argc, char *argv[]) {
+  try {
+    namespace po = boost::program_options;
+    po::options_description options("Language model building options");
+    lm::builder::PipelineConfig pipeline;
+
+    options.add_options()
+      ("order,o", po::value<std::size_t>(&pipeline.order)->required(), "Order of the model")
+      ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
+      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
+      ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
+      ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step")
+      ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
+      ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
+      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
+      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
+      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
+    if (argc == 1) {
+      std::cerr << 
+        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
+        "Please cite:\n"
+        "@inproceedings{kenlm,\n"
+        "author    = {Kenneth Heafield},\n"
+        "title     = {{KenLM}: Faster and Smaller Language Model Queries},\n"
+        "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
+        "month     = {July}, year={2011},\n"
+        "address   = {Edinburgh, UK},\n"
+        "publisher = {Association for Computational Linguistics},\n"
+        "}\n\n"
+        "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
+        "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
+        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
+        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
+        "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
+        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).\n\n";
+      std::cerr << options << std::endl;
+      return 1;
+    }
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, options), vm);
+    po::notify(vm);
+
+    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
+
+    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
+    // TODO: evaluate options for these.  
+    initial.adder_in.total_memory = 32768;
+    initial.adder_in.block_count = 2;
+    initial.adder_out.total_memory = 32768;
+    initial.adder_out.block_count = 2;
+    pipeline.read_backoffs = initial.adder_out;
+
+    // Read from stdin
+    try {
+      lm::builder::Pipeline(pipeline, 0, 1);
+    } catch (const util::MallocException &e) {
+      std::cerr << e.what() << std::endl;
+      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
+      return 1;
+    }
+    util::PrintUsage(std::cerr);
+  } catch (const std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/klm/lm/builder/main.cc b/klm/lm/builder/main.cc
deleted file mode 100644
index 90b9dca2..00000000
--- a/klm/lm/builder/main.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "lm/builder/pipeline.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <iostream>
-
-#include <boost/program_options.hpp>
-
-namespace {
-class SizeNotify {
-  public:
-    SizeNotify(std::size_t &out) : behind_(out) {}
-
-    void operator()(const std::string &from) {
-      behind_ = util::ParseSize(from);
-    }
-
-  private:
-    std::size_t &behind_;
-};
-
-boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
-  return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
-}
-
-} // namespace
-
-int main(int argc, char *argv[]) {
-  try {
-    namespace po = boost::program_options;
-    po::options_description options("Language model building options");
-    lm::builder::PipelineConfig pipeline;
-
-    options.add_options()
-      ("order,o", po::value<std::size_t>(&pipeline.order)->required(), "Order of the model")
-      ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
-      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
-      ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
-      ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step")
-      ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
-      ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
-      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
-      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
-      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
-    if (argc == 1) {
-      std::cerr << 
-        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
-        "Please cite:\n"
-        "@inproceedings{kenlm,\n"
-        "author    = {Kenneth Heafield},\n"
-        "title     = {{KenLM}: Faster and Smaller Language Model Queries},\n"
-        "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
-        "month     = {July}, year={2011},\n"
-        "address   = {Edinburgh, UK},\n"
-        "publisher = {Association for Computational Linguistics},\n"
-        "}\n\n"
-        "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
-        "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
-        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
-        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
-        "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
-        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).\n\n";
-      std::cerr << options << std::endl;
-      return 1;
-    }
-    po::variables_map vm;
-    po::store(po::parse_command_line(argc, argv, options), vm);
-    po::notify(vm);
-
-    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
-
-    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
-    // TODO: evaluate options for these.  
-    initial.adder_in.total_memory = 32768;
-    initial.adder_in.block_count = 2;
-    initial.adder_out.total_memory = 32768;
-    initial.adder_out.block_count = 2;
-    pipeline.read_backoffs = initial.adder_out;
-
-    // Read from stdin
-    try {
-      lm::builder::Pipeline(pipeline, 0, 1);
-    } catch (const util::MallocException &e) {
-      std::cerr << e.what() << std::endl;
-      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
-      return 1;
-    }
-    util::PrintUsage(std::cerr);
-  } catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    return 1;
-  }
-}
-- 
cgit v1.2.3