diff options
Diffstat (limited to 'klm/lm/build_binary.cc')
-rw-r--r-- | klm/lm/build_binary.cc | 77 |
1 files changed, 25 insertions, 52 deletions
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index 2b8c9d5b..ab2c0c32 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -1,10 +1,14 @@ #include "lm/model.hh" +#include "lm/sizes.hh" #include "util/file_piece.hh" +#include "util/usage.hh" +#include <algorithm> #include <cstdlib> #include <exception> #include <iostream> #include <iomanip> +#include <limits> #include <math.h> #include <stdlib.h> @@ -19,8 +23,8 @@ namespace lm { namespace ngram { namespace { -void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" +void Usage(const char *name, const char *default_mem) { + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for <unk> if the ARPA file does not have one.\n" " Default is -100. The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have <s> and </s>.\n" @@ -38,8 +42,11 @@ void Usage(const char *name) { "trie is a straightforward trie with bit-level packing. It uses the least\n" "memory and is still faster than SRI or IRST. Building the trie format uses an\n" "on-disk sort to save memory.\n" -"-t is the temporary directory prefix. Default is the output file name.\n" -"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n" +"-T is the temporary directory prefix. Default is the output file name.\n" +"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" +" with GNU sort. The number is followed by a unit: \% for percent of physical\n" +" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" +" Default unit is K for Kilobytes.\n" "-q turns quantization on and sets the number of bits (e.g. -q 8).\n" "-b sets backoff quantization bits. Requires -q and defaults to that value.\n" "-a compresses pointers using an array of offsets. The parameter is the\n" @@ -83,47 +90,6 @@ void ParseFileList(const char *from, std::vector<std::string> &to) { } } -void ShowSizes(const char *file, const lm::ngram::Config &config) { - std::vector<uint64_t> counts; - util::FilePiece f(file); - lm::ReadARPACounts(f, counts); - uint64_t sizes[6]; - sizes[0] = ProbingModel::Size(counts, config); - sizes[1] = RestProbingModel::Size(counts, config); - sizes[2] = TrieModel::Size(counts, config); - sizes[3] = QuantTrieModel::Size(counts, config); - sizes[4] = ArrayTrieModel::Size(counts, config); - sizes[5] = QuantArrayTrieModel::Size(counts, config); - uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); - uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); - uint64_t divide; - char prefix; - if (min_length < (1 << 10) * 10) { - prefix = ' '; - divide = 1; - } else if (min_length < (1 << 20) * 10) { - prefix = 'k'; - divide = 1 << 10; - } else if (min_length < (1ULL << 30) * 10) { - prefix = 'M'; - divide = 1 << 20; - } else { - prefix = 'G'; - divide = 1 << 30; - } - long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide)))); - std::cout << "Memory estimate:\ntype "; - // right align bytes. - for (long int i = 0; i < length - 2; ++i) std::cout << ' '; - std::cout << prefix << "B\n" - "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" - "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" - "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" - "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" - "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" - "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; -} - void ProbingQuantizationUnsupported() { std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; exit(1); @@ -136,11 +102,14 @@ void ProbingQuantizationUnsupported() { int main(int argc, char *argv[]) { using namespace lm::ngram; + const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; + try { bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; lm::ngram::Config config; + config.building_memory = util::ParseSize(default_mem); int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:sir:")) != -1) { + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); @@ -161,12 +130,16 @@ int main(int argc, char *argv[]) { case 'p': config.probing_multiplier = ParseFloat(optarg); break; - case 't': + case 't': // legacy + case 'T': config.temporary_directory_prefix = optarg; break; - case 'm': + case 'm': // legacy config.building_memory = ParseUInt(optarg) * 1048576; break; + case 'S': + config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg)); + break; case 'w': set_write_method = true; if (!strcmp(optarg, "mmap")) { @@ -174,7 +147,7 @@ int main(int argc, char *argv[]) { } else if (!strcmp(optarg, "after")) { config.write_method = Config::WRITE_AFTER; } else { - Usage(argv[0]); + Usage(argv[0], default_mem); } break; case 's': @@ -189,7 +162,7 @@ int main(int argc, char *argv[]) { config.rest_function = Config::REST_LOWER; break; default: - Usage(argv[0]); + Usage(argv[0], default_mem); } } if (!quantize && set_backoff_bits) { @@ -212,7 +185,7 @@ int main(int argc, char *argv[]) { from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; } else { - Usage(argv[0]); + Usage(argv[0], default_mem); } if (!strcmp(model_type, "probing")) { if (!set_write_method) config.write_method = Config::WRITE_AFTER; @@ -242,7 +215,7 @@ int main(int argc, char *argv[]) { } } } else { - Usage(argv[0]); + Usage(argv[0], default_mem); } } catch (const std::exception &e) { |