summaryrefslogtreecommitdiff
path: root/klm/lm/build_binary.cc
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2011-08-18 12:14:01 +0100
committerKenneth Heafield <github@kheafield.com>2011-08-18 12:14:01 +0100
commit7607b0a7873f52d6e3ea387bf88c773cbb55f8ee (patch)
tree908fd94fea8d09725bc86ec9b3752b89c78338e5 /klm/lm/build_binary.cc
parentd92124ccc866192e4cdc689f2b41f0324d35dd3b (diff)
KenLM update: Bhiksha's trick, simple test for lms without unk, auto-detect binary files instead of requiring them to be specified at runtime.
Diffstat (limited to 'klm/lm/build_binary.cc')
-rw-r--r--klm/lm/build_binary.cc54
1 files changed, 35 insertions, 19 deletions
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index 4552c419..b7aee4de 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -15,12 +15,12 @@ namespace ngram {
namespace {
void Usage(const char *name) {
- std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-n] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [type] input.arpa output.mmap\n\n"
-"-u sets the default log10 probability for <unk> if the ARPA file does not have\n"
-"one.\n"
+ std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-c bits] [type] input.arpa [output.mmap]\n\n"
+"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
+" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
-"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
-"type is either probing or trie:\n\n"
+"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n\n"
+"type is either probing or trie. Default is probing.\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
"trie is a straightforward trie with bit-level packing. It uses the least\n"
@@ -29,10 +29,11 @@ void Usage(const char *name) {
"-t is the temporary directory prefix. Default is the output file name.\n"
"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n"
"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
-"-b sets backoff quantization bits. Requires -q and defaults to that value.\n\n"
-"See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
-"Passing only an input file will print memory usage of each data structure.\n"
-"If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";
+"-b sets backoff quantization bits. Requires -q and defaults to that value.\n"
+"-a compresses pointers using an array of offsets. The parameter is the\n"
+" maximum number of bits encoded by the array. Memory is minimized subject\n"
+" to the maximum, so pick 255 to minimize memory.\n\n"
+"Get a memory estimate by passing an ARPA file without an output file name.\n";
exit(1);
}
@@ -63,12 +64,14 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::vector<uint64_t> counts;
util::FilePiece f(file);
lm::ReadARPACounts(f, counts);
- std::size_t sizes[3];
+ std::size_t sizes[5];
sizes[0] = ProbingModel::Size(counts, config);
sizes[1] = TrieModel::Size(counts, config);
sizes[2] = QuantTrieModel::Size(counts, config);
- std::size_t max_length = *std::max_element(sizes, sizes + 3);
- std::size_t min_length = *std::max_element(sizes, sizes + 3);
+ sizes[3] = ArrayTrieModel::Size(counts, config);
+ sizes[4] = QuantArrayTrieModel::Size(counts, config);
+ std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
+ std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
std::size_t divide;
char prefix;
if (min_length < (1 << 10) * 10) {
@@ -91,7 +94,9 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::cout << prefix << "B\n"
"probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << (sizes[1] / divide) << " without quantization\n"
- "trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n";
+ "trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
+ "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
+ "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
}
void ProbingQuantizationUnsupported() {
@@ -106,11 +111,11 @@ void ProbingQuantizationUnsupported() {
int main(int argc, char *argv[]) {
using namespace lm::ngram;
- bool quantize = false, set_backoff_bits = false;
try {
+ bool quantize = false, set_backoff_bits = false, bhiksha = false;
lm::ngram::Config config;
int opt;
- while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:")) != -1) {
+ while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:a:")) != -1) {
switch(opt) {
case 'q':
config.prob_bits = ParseBitCount(optarg);
@@ -121,6 +126,9 @@ int main(int argc, char *argv[]) {
config.backoff_bits = ParseBitCount(optarg);
set_backoff_bits = true;
break;
+ case 'a':
+ config.pointer_bhiksha_bits = ParseBitCount(optarg);
+ bhiksha = true;
case 'u':
config.unknown_missing_logprob = ParseFloat(optarg);
break;
@@ -162,9 +170,17 @@ int main(int argc, char *argv[]) {
ProbingModel(from_file, config);
} else if (!strcmp(model_type, "trie")) {
if (quantize) {
- QuantTrieModel(from_file, config);
+ if (bhiksha) {
+ QuantArrayTrieModel(from_file, config);
+ } else {
+ QuantTrieModel(from_file, config);
+ }
} else {
- TrieModel(from_file, config);
+ if (bhiksha) {
+ ArrayTrieModel(from_file, config);
+ } else {
+ TrieModel(from_file, config);
+ }
}
} else {
Usage(argv[0]);
@@ -173,9 +189,9 @@ int main(int argc, char *argv[]) {
Usage(argv[0]);
}
}
- catch (std::exception &e) {
+ catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
- abort();
+ return 1;
}
return 0;
}