new version of kenlm

author: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-13 16:18:34 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-13 16:18:34 -0500
commit: 66e5956906e61b047d2fd451f3053916cbc92433 (patch)
tree: 5bd4222506deae0c8e5f4c001bb6f7505b73f846 /klm/lm/build_binary.cc
parent: 6d2a75d7deb35fcb2fac674190c19e0a0143aaed (diff)
1 file changed, 106 insertions, 6 deletions
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index 4db631a2..ec034640 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -1,13 +1,113 @@
 #include "lm/model.hh"
+#include "util/file_piece.hh"
 
 #include <iostream>
+#include <iomanip>
+
+#include <math.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+namespace lm {
+namespace ngram {
+namespace {
+
+void Usage(const char *name) {  // Writes the command-line help to stderr, using name as the program name, then exits.
+  std::cerr << "Usage: " << name << " [-u unknown_probability] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
+"Where type is one of probing, trie, or sorted:\n\n"
+"probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
+"-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
+"trie is a straightforward trie with bit-level packing.  It uses the least\n"
+"memory and is still faster than SRI or IRST.  Building the trie format uses an\n"
+"on-disk sort to save memory.\n"
+"-t is the temporary directory prefix.  Default is the output file name.\n"
+"-m is the amount of memory to use, in MB.  Default is 1024MB (1GB).\n\n"
+"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
+"It uses more memory than trie and is also slower, so there's no real reason to\n"
+"use it.\n\n"
+"See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
+"Passing only an input file will print memory usage of each data structure.\n"
+"If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";
+  exit(1);  // Does not return; exit status 1 reports the usage error to the caller's shell.
+}
+
+// Stand-in for boost::lexical_cast: parses all of from as a float; throws util::ParseNumberException if no number is found or text follows it.
+float ParseFloat(const char *from) {
+  char *end;
+  float ret = strtod(from, &end);
+  if (end == from || *end) throw util::ParseNumberException(from);  // end == from: strtod converted nothing (e.g. empty string).
+  return ret;
+}
+unsigned long int ParseUInt(const char *from) {  // Parses all of from as a base-10 integer; throws util::ParseNumberException otherwise.
+  char *end;
+  unsigned long int ret = strtoul(from, &end, 10);  // NOTE: strtoul itself still accepts a leading sign.
+  if (end == from || *end) throw util::ParseNumberException(from);  // end == from: strtoul converted nothing (e.g. empty string).
+  return ret;
+}
+
+void ShowSizes(const char *file, const lm::ngram::Config &config) {  // Reads the n-gram counts from the ARPA file and prints each model type's Size() to stdout.
+  std::vector<uint64_t> counts;
+  util::FilePiece f(file);
+  lm::ReadARPACounts(f, counts);
+  std::size_t probing_size = ProbingModel::Size(counts, config);
+  // Column width is the decimal digit count of probing_size (taken to be the largest); floor(log10)+1 is also right for exact powers of ten, and 0 is guarded.
+  long int length = std::max<long int>(5, probing_size ? lrint(floor(log10(static_cast<double>(probing_size)))) + 1 : 1);
+  std::cout << "Memory usage:\ntype    ";
+  // Pad so the 5-character "bytes" heading is right-aligned with the numbers below it.
+  for (long int i = 0; i < length - 5; ++i) std::cout << ' ';
+  std::cout << "bytes\n"
+    "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
+    "trie    " << std::setw(length) << TrieModel::Size(counts, config) << "\n"
+    "sorted  " << std::setw(length) << SortedModel::Size(counts, config) << "\n";
+}
+
+} // namespace
+} // namespace ngram
+} // namespace lm
 
 int main(int argc, char *argv[]) {
-  if (argc != 3) {
-    std::cerr << "Usage: " << argv[0] << " input.arpa output.mmap" << std::endl;
-    return 1;
-  }
+  using namespace lm::ngram;
+
   lm::ngram::Config config;
-  config.write_mmap = argv[2];
-  lm::ngram::Model(argv[1], config);
+  int opt;
+  while ((opt = getopt(argc, argv, "u:p:t:m:")) != -1) {  // Each option letter is followed by ':', so every option takes an argument.
+    switch(opt) {
+      case 'u':
+        config.unknown_missing_prob = ParseFloat(optarg);
+        break;
+      case 'p':
+        config.probing_multiplier = ParseFloat(optarg);  // NOTE(review): the usage text says this must be >1.0, but it is not checked here.
+        break;
+      case 't':
+        config.temporary_directory_prefix = optarg;
+        break;
+      case 'm':
+        config.building_memory = ParseUInt(optarg) * 1048576;  // Megabytes to bytes (1048576 = 2^20).
+        break;
+      default:
+        Usage(argv[0]);  // Unrecognized option; Usage() calls exit(1), so control does not continue.
+    }
+  }
+  if (optind + 1 == argc) {  // One positional argument: only print the size table.
+    ShowSizes(argv[optind], config);
+  } else if (optind + 2 == argc) {  // Input and output only: build a ProbingModel.
+    config.write_mmap = argv[optind + 1];
+    ProbingModel(argv[optind], config);
+  } else if (optind + 3 == argc) {  // Model type, input and output.
+    const char *model_type = argv[optind];
+    const char *from_file = argv[optind + 1];
+    config.write_mmap = argv[optind + 2];
+    if (!strcmp(model_type, "probing")) {  // NOTE(review): strcmp is declared in <string.h>, which this file does not include directly.
+      ProbingModel(from_file, config);
+    } else if (!strcmp(model_type, "sorted")) {
+      SortedModel(from_file, config);
+    } else if (!strcmp(model_type, "trie")) {
+      TrieModel(from_file, config);
+    } else {
+      Usage(argv[0]);  // Unknown model type name.
+    }
+  } else {
+    Usage(argv[0]);  // Zero or more than three positional arguments.
+  }
+  return 0;
 }
author	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-13 16:18:34 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-13 16:18:34 -0500
commit	66e5956906e61b047d2fd451f3053916cbc92433 (patch)
tree	5bd4222506deae0c8e5f4c001bb6f7505b73f846 /klm/lm/build_binary.cc
parent	6d2a75d7deb35fcb2fac674190c19e0a0143aaed (diff)