Quantization

author: Kenneth Heafield <kenlm@kheafield.com> 2011-06-26 18:40:15 -0400
committer: Patrick Simianer <p@simianer.de> 2011-09-23 19:13:57 +0200
commit: 9308fcdbdbe6564d25fb848139d3f55254d491be (patch)
tree: 67fdaa819488e231b5d70b2227527510571f2108 /klm/lm/build_binary.cc
parent: 83a6ea9b5a4064a83035b505350618fbf81baeff (diff)
1 files changed, 68 insertions, 15 deletions
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index 91ad2fb9..4552c419 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -15,22 +15,21 @@ namespace ngram {
 namespace {
 
 void Usage(const char *name) {
-  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-n] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
+  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-n] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [type] input.arpa output.mmap\n\n"
 "-u sets the default log10 probability for <unk> if the ARPA file does not have\n"
 "one.\n"
 "-s allows models to be built even if they do not have <s> and </s>.\n"
 "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
-"type is one of probing, trie, or sorted:\n\n"
+"type is either probing or trie:\n\n"
 "probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
 "-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
 "trie is a straightforward trie with bit-level packing.  It uses the least\n"
 "memory and is still faster than SRI or IRST.  Building the trie format uses an\n"
 "on-disk sort to save memory.\n"
 "-t is the temporary directory prefix.  Default is the output file name.\n"
-"-m limits memory use for sorting.  Measured in MB.  Default is 1024MB.\n\n"
-/*"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
-"It uses more memory than trie and is also slower, so there's no real reason to\n"
-"use it.\n\n"*/
+"-m limits memory use for sorting.  Measured in MB.  Default is 1024MB.\n"
+"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
+"-b sets backoff quantization bits.  Requires -q and defaults to that value.\n\n"
 "See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
 "Passing only an input file will print memory usage of each data structure.\n"
 "If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";
@@ -51,19 +50,53 @@ unsigned long int ParseUInt(const char *from) {
   return ret;
 }
 
+// Parses a -q/-b bit count via ParseUInt; throws ParseNumberException when the value exceeds 25.
+uint8_t ParseBitCount(const char *from) {
+  const unsigned long val = ParseUInt(from);
+  if (val <= 25) return static_cast<uint8_t>(val);
+  util::ParseNumberException e(from);
+  e << " bit counts are limited to 25.";  // Message now matches the limit enforced above.
+  throw e;  // Previously the exception was built but never thrown, so bad values slipped through.
+}
+
 void ShowSizes(const char *file, const lm::ngram::Config &config) {
   std::vector<uint64_t> counts;
   util::FilePiece f(file);
   lm::ReadARPACounts(f, counts);
-  std::size_t probing_size = ProbingModel::Size(counts, config);
-  // probing is always largest so use it to determine number of columns.  
-  long int length = std::max<long int>(5, lrint(ceil(log10(probing_size))));
+  std::size_t sizes[3];  // Estimates in print order: probing, trie, quantized trie.
+  sizes[0] = ProbingModel::Size(counts, config);
+  sizes[1] = TrieModel::Size(counts, config);
+  sizes[2] = QuantTrieModel::Size(counts, config);
+  std::size_t max_length = *std::max_element(sizes, sizes + 3);  // Largest estimate sets the column width.
+  std::size_t min_length = *std::min_element(sizes, sizes + 3);  // Smallest estimate selects the unit (B/kB/MB/GB).
+  std::size_t divide;
+  char prefix;
+  if (min_length < (1 << 10) * 10) {
+    prefix = ' ';
+    divide = 1;
+  } else if (min_length < (1 << 20) * 10) {
+    prefix = 'k';
+    divide = 1 << 10;
+  } else if (min_length < (1ULL << 30) * 10) {
+    prefix = 'M';
+    divide = 1 << 20;
+  } else {
+    prefix = 'G';
+    divide = 1 << 30;
+  }
+  long int length = std::max<long int>(2, lrint(ceil(log10(max_length / divide))));
   std::cout << "Memory estimate:\ntype    ";
   // right align bytes.  
-  for (long int i = 0; i < length - 5; ++i) std::cout << ' ';
-  std::cout << "bytes\n"
-    "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
-    "trie    " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
+  for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
+  std::cout << prefix << "B\n"
+    "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
+    "trie    " << std::setw(length) << (sizes[1] / divide) << " without quantization\n"
+    "trie    " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n";
+}
+
+void ProbingQuantizationUnsupported() {  // Reports that quantization needs the trie format, then exits with status 1.
+  std::cerr << "Quantization is only implemented in the trie data structure." << '\n' << std::flush;
+  exit(1);
 }
 
 } // namespace ngram
@@ -73,11 +106,21 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
 int main(int argc, char *argv[]) {
   using namespace lm::ngram;
 
+  bool quantize = false, set_backoff_bits = false;
   try {
     lm::ngram::Config config;
     int opt;
-    while ((opt = getopt(argc, argv, "siu:p:t:m:")) != -1) {
+    while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:")) != -1) {
       switch(opt) {
+        case 'q':
+          config.prob_bits = ParseBitCount(optarg);
+          if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
+          quantize = true;
+          break;
+        case 'b':
+          config.backoff_bits = ParseBitCount(optarg);
+          set_backoff_bits = true;
+          break;
         case 'u':
           config.unknown_missing_logprob = ParseFloat(optarg);
           break;
@@ -100,19 +143,29 @@ int main(int argc, char *argv[]) {
           Usage(argv[0]);
       }
     }
+    if (!quantize && set_backoff_bits) {
+      std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
+      abort();
+    }
     if (optind + 1 == argc) {
       ShowSizes(argv[optind], config);
     } else if (optind + 2 == argc) {
       config.write_mmap = argv[optind + 1];
+      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
       ProbingModel(argv[optind], config);
     } else if (optind + 3 == argc) {
       const char *model_type = argv[optind];
       const char *from_file = argv[optind + 1];
       config.write_mmap = argv[optind + 2];
       if (!strcmp(model_type, "probing")) {
+        if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
         ProbingModel(from_file, config);
       } else if (!strcmp(model_type, "trie")) {
-        TrieModel(from_file, config);
+        if (quantize) {
+          QuantTrieModel(from_file, config);
+        } else {
+          TrieModel(from_file, config);
+        }
       } else {
         Usage(argv[0]);
       }
author	Kenneth Heafield <kenlm@kheafield.com>	2011-06-26 18:40:15 -0400
committer	Patrick Simianer <p@simianer.de>	2011-09-23 19:13:57 +0200
commit	9308fcdbdbe6564d25fb848139d3f55254d491be (patch)
tree	67fdaa819488e231b5d70b2227527510571f2108 /klm/lm/build_binary.cc
parent	83a6ea9b5a4064a83035b505350618fbf81baeff (diff)