From 205893513c8343fdc55789e427fab4c8b536dc12 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 26 Jun 2011 18:40:15 -0400 Subject: Quantization --- klm/lm/build_binary.cc | 83 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 15 deletions(-) (limited to 'klm/lm/build_binary.cc') diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index 91ad2fb9..4552c419 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -15,22 +15,21 @@ namespace ngram { namespace { void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-n] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n" + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-n] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [type] input.arpa output.mmap\n\n" "-u sets the default log10 probability for if the ARPA file does not have\n" "one.\n" "-s allows models to be built even if they do not have and .\n" "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" -"type is one of probing, trie, or sorted:\n\n" +"type is either probing or trie:\n\n" "probing uses a probing hash table. It is the fastest but uses the most memory.\n" "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" "trie is a straightforward trie with bit-level packing. It uses the least\n" "memory and is still faster than SRI or IRST. Building the trie format uses an\n" "on-disk sort to save memory.\n" "-t is the temporary directory prefix. Default is the output file name.\n" -"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n\n" -/*"sorted is like probing but uses a sorted uniform map instead of a hash table.\n" -"It uses more memory than trie and is also slower, so there's no real reason to\n" -"use it.\n\n"*/ +"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n" +"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" +"-b sets backoff quantization bits. Requires -q and defaults to that value.\n\n" "See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n" "Passing only an input file will print memory usage of each data structure.\n" "If the ARPA file does not have , -u sets 's probability; default 0.0.\n"; @@ -51,19 +50,53 @@ unsigned long int ParseUInt(const char *from) { return ret; } +uint8_t ParseBitCount(const char *from) { + unsigned long val = ParseUInt(from); + if (val > 25) { + util::ParseNumberException e(from); + e << " bit counts are limited to 256."; + } + return val; +} + void ShowSizes(const char *file, const lm::ngram::Config &config) { std::vector counts; util::FilePiece f(file); lm::ReadARPACounts(f, counts); - std::size_t probing_size = ProbingModel::Size(counts, config); - // probing is always largest so use it to determine number of columns. - long int length = std::max(5, lrint(ceil(log10(probing_size)))); + std::size_t sizes[3]; + sizes[0] = ProbingModel::Size(counts, config); + sizes[1] = TrieModel::Size(counts, config); + sizes[2] = QuantTrieModel::Size(counts, config); + std::size_t max_length = *std::max_element(sizes, sizes + 3); + std::size_t min_length = *std::max_element(sizes, sizes + 3); + std::size_t divide; + char prefix; + if (min_length < (1 << 10) * 10) { + prefix = ' '; + divide = 1; + } else if (min_length < (1 << 20) * 10) { + prefix = 'k'; + divide = 1 << 10; + } else if (min_length < (1ULL << 30) * 10) { + prefix = 'M'; + divide = 1 << 20; + } else { + prefix = 'G'; + divide = 1 << 30; + } + long int length = std::max(2, lrint(ceil(log10(max_length / divide)))); std::cout << "Memory estimate:\ntype "; // right align bytes. - for (long int i = 0; i < length - 5; ++i) std::cout << ' '; - std::cout << "bytes\n" - "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n" - "trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n"; + for (long int i = 0; i < length - 2; ++i) std::cout << ' '; + std::cout << prefix << "B\n" + "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" + "trie " << std::setw(length) << (sizes[1] / divide) << " without quantization\n" + "trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"; +} + +void ProbingQuantizationUnsupported() { + std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; + exit(1); } } // namespace ngram @@ -73,11 +106,21 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { int main(int argc, char *argv[]) { using namespace lm::ngram; + bool quantize = false, set_backoff_bits = false; try { lm::ngram::Config config; int opt; - while ((opt = getopt(argc, argv, "siu:p:t:m:")) != -1) { + while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:")) != -1) { switch(opt) { + case 'q': + config.prob_bits = ParseBitCount(optarg); + if (!set_backoff_bits) config.backoff_bits = config.prob_bits; + quantize = true; + break; + case 'b': + config.backoff_bits = ParseBitCount(optarg); + set_backoff_bits = true; + break; case 'u': config.unknown_missing_logprob = ParseFloat(optarg); break; @@ -100,19 +143,29 @@ int main(int argc, char *argv[]) { Usage(argv[0]); } } + if (!quantize && set_backoff_bits) { + std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; + abort(); + } if (optind + 1 == argc) { ShowSizes(argv[optind], config); } else if (optind + 2 == argc) { config.write_mmap = argv[optind + 1]; + if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); ProbingModel(argv[optind], config); } else if (optind + 3 == argc) { const char *model_type = argv[optind]; const char *from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; if (!strcmp(model_type, "probing")) { + if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); ProbingModel(from_file, config); } else if (!strcmp(model_type, "trie")) { - TrieModel(from_file, config); + if (quantize) { + QuantTrieModel(from_file, config); + } else { + TrieModel(from_file, config); + } } else { Usage(argv[0]); } -- cgit v1.2.3