diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-13 16:18:34 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-13 16:18:34 -0500 |
commit | 66e5956906e61b047d2fd451f3053916cbc92433 (patch) | |
tree | 5bd4222506deae0c8e5f4c001bb6f7505b73f846 /klm/lm/build_binary.cc | |
parent | 6d2a75d7deb35fcb2fac674190c19e0a0143aaed (diff) |
new version of kenlm
Diffstat (limited to 'klm/lm/build_binary.cc')
-rw-r--r-- | klm/lm/build_binary.cc | 112 |
1 files changed, 106 insertions, 6 deletions
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index 4db631a2..ec034640 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -1,13 +1,113 @@ #include "lm/model.hh" +#include "util/file_piece.hh" #include <iostream> +#include <iomanip> + +#include <math.h> +#include <stdlib.h> +#include <unistd.h> + +namespace lm { +namespace ngram { +namespace { + +void Usage(const char *name) { + std::cerr << "Usage: " << name << " [-u unknown_probability] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n" +"Where type is one of probing, trie, or sorted:\n\n" +"probing uses a probing hash table. It is the fastest but uses the most memory.\n" +"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" +"trie is a straightforward trie with bit-level packing. It uses the least\n" +"memory and is still faster than SRI or IRST. Building the trie format uses an\n" +"on-disk sort to save memory.\n" +"-t is the temporary directory prefix. Default is the output file name.\n" +"-m is the amount of memory to use, in MB. Default is 1024MB (1GB).\n\n" +"sorted is like probing but uses a sorted uniform map instead of a hash table.\n" +"It uses more memory than trie and is also slower, so there's no real reason to\n" +"use it.\n\n" +"See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n" +"Passing only an input file will print memory usage of each data structure.\n" +"If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n"; + exit(1); +} + +// I could really use boost::lexical_cast right about now. +float ParseFloat(const char *from) { + char *end; + float ret = strtod(from, &end); + if (*end) throw util::ParseNumberException(from); + return ret; +} +unsigned long int ParseUInt(const char *from) { + char *end; + unsigned long int ret = strtoul(from, &end, 10); + if (*end) throw util::ParseNumberException(from); + return ret; +} + +void ShowSizes(const char *file, const lm::ngram::Config &config) { + std::vector<uint64_t> counts; + util::FilePiece f(file); + lm::ReadARPACounts(f, counts); + std::size_t probing_size = ProbingModel::Size(counts, config); + // probing is always largest so use it to determine number of columns. + long int length = std::max<long int>(5, lrint(ceil(log10(probing_size)))); + std::cout << "Memory usage:\ntype "; + // right align bytes. + for (long int i = 0; i < length - 5; ++i) std::cout << ' '; + std::cout << "bytes\n" + "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n" + "trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n" + "sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n"; +} + +} // namespace ngram +} // namespace lm +} // namespace int main(int argc, char *argv[]) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] << " input.arpa output.mmap" << std::endl; - return 1; - } + using namespace lm::ngram; + lm::ngram::Config config; - config.write_mmap = argv[2]; - lm::ngram::Model(argv[1], config); + int opt; + while ((opt = getopt(argc, argv, "u:p:t:m:")) != -1) { + switch(opt) { + case 'u': + config.unknown_missing_prob = ParseFloat(optarg); + break; + case 'p': + config.probing_multiplier = ParseFloat(optarg); + break; + case 't': + config.temporary_directory_prefix = optarg; + break; + case 'm': + config.building_memory = ParseUInt(optarg) * 1048576; + break; + default: + Usage(argv[0]); + } + } + if (optind + 1 == argc) { + ShowSizes(argv[optind], config); + } else if (optind + 2 == argc) { + config.write_mmap = argv[optind + 1]; + ProbingModel(argv[optind], config); + } else if (optind + 3 == argc) { + const char *model_type = argv[optind]; + const char *from_file = argv[optind + 1]; + config.write_mmap = argv[optind + 2]; + if (!strcmp(model_type, "probing")) { + ProbingModel(from_file, config); + } else if (!strcmp(model_type, "sorted")) { + SortedModel(from_file, config); + } else if (!strcmp(model_type, "trie")) { + TrieModel(from_file, config); + } else { + Usage(argv[0]); + } + } else { + Usage(argv[0]); + } + return 0; } |