From 2b63fa0755954edf467a2421997eaf72771260cf Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 16 May 2012 13:24:08 -0700 Subject: Big kenlm change includes lower order models for probing only. And other stuff. --- klm/lm/build_binary.cc | 97 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 33 deletions(-) (limited to 'klm/lm/build_binary.cc') diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index 8cbb69d0..c4a01cb4 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -66,16 +66,28 @@ uint8_t ParseBitCount(const char *from) { return val; } +void ParseFileList(const char *from, std::vector &to) { + to.clear(); + while (true) { + const char *i; + for (i = from; *i && *i != ' '; ++i) {} + to.push_back(std::string(from, i - from)); + if (!*i) break; + from = i + 1; + } +} + void ShowSizes(const char *file, const lm::ngram::Config &config) { std::vector counts; util::FilePiece f(file); lm::ReadARPACounts(f, counts); - std::size_t sizes[5]; + std::size_t sizes[6]; sizes[0] = ProbingModel::Size(counts, config); - sizes[1] = TrieModel::Size(counts, config); - sizes[2] = QuantTrieModel::Size(counts, config); - sizes[3] = ArrayTrieModel::Size(counts, config); - sizes[4] = QuantArrayTrieModel::Size(counts, config); + sizes[1] = RestProbingModel::Size(counts, config); + sizes[2] = TrieModel::Size(counts, config); + sizes[3] = QuantTrieModel::Size(counts, config); + sizes[4] = ArrayTrieModel::Size(counts, config); + sizes[5] = QuantArrayTrieModel::Size(counts, config); std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t)); std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t)); std::size_t divide; @@ -99,10 +111,11 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { for (long int i = 0; i < length - 2; ++i) std::cout << ' '; std::cout << prefix << "B\n" "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" - "trie " << std::setw(length) << (sizes[1] / divide) << " without quantization\n" - "trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" - "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" - "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; + "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r -p " << config.probing_multiplier << "\n" + "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" + "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" + "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" + "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; } void ProbingQuantizationUnsupported() { @@ -118,10 +131,10 @@ int main(int argc, char *argv[]) { using namespace lm::ngram; try { - bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false; + bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; lm::ngram::Config config; int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) { + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:sir:")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); @@ -164,6 +177,11 @@ int main(int argc, char *argv[]) { case 'i': config.positive_log_probability = lm::SILENT; break; + case 'r': + rest = true; + ParseFileList(optarg, config.rest_lower_files); + config.rest_function = Config::REST_LOWER; + break; default: Usage(argv[0]); } @@ -174,35 +192,48 @@ int main(int argc, char *argv[]) { } if (optind + 1 == argc) { ShowSizes(argv[optind], config); - } else if (optind + 2 == argc) { + return 0; + } + const char *model_type; + const char *from_file; + + if (optind + 2 == argc) { + model_type = "probing"; + from_file = argv[optind]; config.write_mmap = argv[optind + 1]; - if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); - ProbingModel(argv[optind], config); } else if (optind + 3 == argc) { - const char *model_type = argv[optind]; - const char *from_file = argv[optind + 1]; + model_type = argv[optind]; + from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; - if (!strcmp(model_type, "probing")) { - if (!set_write_method) config.write_method = Config::WRITE_AFTER; - if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); + } else { + Usage(argv[0]); + } + if (!strcmp(model_type, "probing")) { + if (!set_write_method) config.write_method = Config::WRITE_AFTER; + if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); + if (rest) { + RestProbingModel(from_file, config); + } else { ProbingModel(from_file, config); - } else if (!strcmp(model_type, "trie")) { - if (!set_write_method) config.write_method = Config::WRITE_MMAP; - if (quantize) { - if (bhiksha) { - QuantArrayTrieModel(from_file, config); - } else { - QuantTrieModel(from_file, config); - } + } + } else if (!strcmp(model_type, "trie")) { + if (rest) { + std::cerr << "Rest + trie is not supported yet." << std::endl; + return 1; + } + if (!set_write_method) config.write_method = Config::WRITE_MMAP; + if (quantize) { + if (bhiksha) { + QuantArrayTrieModel(from_file, config); } else { - if (bhiksha) { - ArrayTrieModel(from_file, config); - } else { - TrieModel(from_file, config); - } + QuantTrieModel(from_file, config); } } else { - Usage(argv[0]); + if (bhiksha) { + ArrayTrieModel(from_file, config); + } else { + TrieModel(from_file, config); + } } } else { Usage(argv[0]); -- cgit v1.2.3