From 516c132fb683b5bf77ae3230a1b3709beb57618e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 22 Jan 2013 21:37:49 +0000 Subject: KenLM 58da338b --- klm/lm/Makefile.am | 4 +- klm/lm/build_binary.cc | 228 ------------------------------------- klm/lm/build_binary_main.cc | 228 +++++++++++++++++++++++++++++++++++++ klm/lm/builder/Makefile.am | 2 +- klm/lm/builder/discount.hh | 2 +- klm/lm/builder/lmplz_main.cc | 94 ++++++++++++++++ klm/lm/builder/main.cc | 94 ---------------- klm/lm/filter/filter_main.cc | 248 ++++++++++++++++++++++++++++++++++++++++ klm/lm/filter/main.cc | 249 ----------------------------------------- klm/lm/filter/phrase.hh | 1 + klm/lm/filter/vocab.hh | 1 + klm/lm/fragment.cc | 37 ------ klm/lm/fragment_main.cc | 37 ++++++ klm/lm/kenlm_max_order_main.cc | 6 + klm/lm/max_order.cc | 6 - klm/lm/ngram_query.cc | 47 -------- klm/lm/query_main.cc | 47 ++++++++ 17 files changed, 666 insertions(+), 665 deletions(-) delete mode 100644 klm/lm/build_binary.cc create mode 100644 klm/lm/build_binary_main.cc create mode 100644 klm/lm/builder/lmplz_main.cc delete mode 100644 klm/lm/builder/main.cc create mode 100644 klm/lm/filter/filter_main.cc delete mode 100644 klm/lm/filter/main.cc delete mode 100644 klm/lm/fragment.cc create mode 100644 klm/lm/fragment_main.cc create mode 100644 klm/lm/kenlm_max_order_main.cc delete mode 100644 klm/lm/max_order.cc delete mode 100644 klm/lm/ngram_query.cc create mode 100644 klm/lm/query_main.cc (limited to 'klm/lm') diff --git a/klm/lm/Makefile.am b/klm/lm/Makefile.am index 45f40c43..48b0ba34 100644 --- a/klm/lm/Makefile.am +++ b/klm/lm/Makefile.am @@ -1,9 +1,9 @@ bin_PROGRAMS = build_binary ngram_query -build_binary_SOURCES = build_binary.cc +build_binary_SOURCES = build_binary_main.cc build_binary_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz -ngram_query_SOURCES = ngram_query.cc +ngram_query_SOURCES = query_main.cc ngram_query_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz #noinst_PROGRAMS = \ diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc deleted file mode 100644 index ab2c0c32..00000000 --- a/klm/lm/build_binary.cc +++ /dev/null @@ -1,228 +0,0 @@ -#include "lm/model.hh" -#include "lm/sizes.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef WIN32 -#include "util/getopt.hh" -#else -#include -#endif - -namespace lm { -namespace ngram { -namespace { - -void Usage(const char *name, const char *default_mem) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" -"-u sets the log10 probability for if the ARPA file does not have one.\n" -" Default is -100. The ARPA file will always take precedence.\n" -"-s allows models to be built even if they do not have and .\n" -"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" -"-w mmap|after determines how writing is done.\n" -" mmap maps the binary file and writes to it. Default for trie.\n" -" after allocates anonymous memory, builds, and writes. Default for probing.\n" -"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" -" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" -" the same data structure as being built. All files must have the same\n" -" vocabulary. For probing, the unigrams must be in the same order.\n\n" -"type is either probing or trie. Default is probing.\n\n" -"probing uses a probing hash table. It is the fastest but uses the most memory.\n" -"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" -"trie is a straightforward trie with bit-level packing. It uses the least\n" -"memory and is still faster than SRI or IRST. Building the trie format uses an\n" -"on-disk sort to save memory.\n" -"-T is the temporary directory prefix. Default is the output file name.\n" -"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" -" with GNU sort. The number is followed by a unit: \% for percent of physical\n" -" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" -" Default unit is K for Kilobytes.\n" -"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" -"-b sets backoff quantization bits. Requires -q and defaults to that value.\n" -"-a compresses pointers using an array of offsets. The parameter is the\n" -" maximum number of bits encoded by the array. Memory is minimized subject\n" -" to the maximum, so pick 255 to minimize memory.\n\n" -"Get a memory estimate by passing an ARPA file without an output file name.\n"; - exit(1); -} - -// I could really use boost::lexical_cast right about now. -float ParseFloat(const char *from) { - char *end; - float ret = strtod(from, &end); - if (*end) throw util::ParseNumberException(from); - return ret; -} -unsigned long int ParseUInt(const char *from) { - char *end; - unsigned long int ret = strtoul(from, &end, 10); - if (*end) throw util::ParseNumberException(from); - return ret; -} - -uint8_t ParseBitCount(const char *from) { - unsigned long val = ParseUInt(from); - if (val > 25) { - util::ParseNumberException e(from); - e << " bit counts are limited to 25."; - } - return val; -} - -void ParseFileList(const char *from, std::vector &to) { - to.clear(); - while (true) { - const char *i; - for (i = from; *i && *i != ' '; ++i) {} - to.push_back(std::string(from, i - from)); - if (!*i) break; - from = i + 1; - } -} - -void ProbingQuantizationUnsupported() { - std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; - exit(1); -} - -} // namespace ngram -} // namespace lm -} // namespace - -int main(int argc, char *argv[]) { - using namespace lm::ngram; - - const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; - - try { - bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; - lm::ngram::Config config; - config.building_memory = util::ParseSize(default_mem); - int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { - switch(opt) { - case 'q': - config.prob_bits = ParseBitCount(optarg); - if (!set_backoff_bits) config.backoff_bits = config.prob_bits; - quantize = true; - break; - case 'b': - config.backoff_bits = ParseBitCount(optarg); - set_backoff_bits = true; - break; - case 'a': - config.pointer_bhiksha_bits = ParseBitCount(optarg); - bhiksha = true; - break; - case 'u': - config.unknown_missing_logprob = ParseFloat(optarg); - break; - case 'p': - config.probing_multiplier = ParseFloat(optarg); - break; - case 't': // legacy - case 'T': - config.temporary_directory_prefix = optarg; - break; - case 'm': // legacy - config.building_memory = ParseUInt(optarg) * 1048576; - break; - case 'S': - config.building_memory = std::min(static_cast(std::numeric_limits::max()), util::ParseSize(optarg)); - break; - case 'w': - set_write_method = true; - if (!strcmp(optarg, "mmap")) { - config.write_method = Config::WRITE_MMAP; - } else if (!strcmp(optarg, "after")) { - config.write_method = Config::WRITE_AFTER; - } else { - Usage(argv[0], default_mem); - } - break; - case 's': - config.sentence_marker_missing = lm::SILENT; - break; - case 'i': - config.positive_log_probability = lm::SILENT; - break; - case 'r': - rest = true; - ParseFileList(optarg, config.rest_lower_files); - config.rest_function = Config::REST_LOWER; - break; - default: - Usage(argv[0], default_mem); - } - } - if (!quantize && set_backoff_bits) { - std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; - abort(); - } - if (optind + 1 == argc) { - ShowSizes(argv[optind], config); - return 0; - } - const char *model_type; - const char *from_file; - - if (optind + 2 == argc) { - model_type = "probing"; - from_file = argv[optind]; - config.write_mmap = argv[optind + 1]; - } else if (optind + 3 == argc) { - model_type = argv[optind]; - from_file = argv[optind + 1]; - config.write_mmap = argv[optind + 2]; - } else { - Usage(argv[0], default_mem); - } - if (!strcmp(model_type, "probing")) { - if (!set_write_method) config.write_method = Config::WRITE_AFTER; - if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); - if (rest) { - RestProbingModel(from_file, config); - } else { - ProbingModel(from_file, config); - } - } else if (!strcmp(model_type, "trie")) { - if (rest) { - std::cerr << "Rest + trie is not supported yet." << std::endl; - return 1; - } - if (!set_write_method) config.write_method = Config::WRITE_MMAP; - if (quantize) { - if (bhiksha) { - QuantArrayTrieModel(from_file, config); - } else { - QuantTrieModel(from_file, config); - } - } else { - if (bhiksha) { - ArrayTrieModel(from_file, config); - } else { - TrieModel(from_file, config); - } - } - } else { - Usage(argv[0], default_mem); - } - } - catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - std::cerr << "ERROR" << std::endl; - return 1; - } - std::cerr << "SUCCESS" << std::endl; - return 0; -} diff --git a/klm/lm/build_binary_main.cc b/klm/lm/build_binary_main.cc new file mode 100644 index 00000000..ab2c0c32 --- /dev/null +++ b/klm/lm/build_binary_main.cc @@ -0,0 +1,228 @@ +#include "lm/model.hh" +#include "lm/sizes.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef WIN32 +#include "util/getopt.hh" +#else +#include +#endif + +namespace lm { +namespace ngram { +namespace { + +void Usage(const char *name, const char *default_mem) { + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" +"-u sets the log10 probability for if the ARPA file does not have one.\n" +" Default is -100. The ARPA file will always take precedence.\n" +"-s allows models to be built even if they do not have and .\n" +"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-w mmap|after determines how writing is done.\n" +" mmap maps the binary file and writes to it. Default for trie.\n" +" after allocates anonymous memory, builds, and writes. Default for probing.\n" +"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" +" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" +" the same data structure as being built. All files must have the same\n" +" vocabulary. For probing, the unigrams must be in the same order.\n\n" +"type is either probing or trie. Default is probing.\n\n" +"probing uses a probing hash table. It is the fastest but uses the most memory.\n" +"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" +"trie is a straightforward trie with bit-level packing. It uses the least\n" +"memory and is still faster than SRI or IRST. Building the trie format uses an\n" +"on-disk sort to save memory.\n" +"-T is the temporary directory prefix. Default is the output file name.\n" +"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" +" with GNU sort. The number is followed by a unit: \% for percent of physical\n" +" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" +" Default unit is K for Kilobytes.\n" +"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" +"-b sets backoff quantization bits. Requires -q and defaults to that value.\n" +"-a compresses pointers using an array of offsets. The parameter is the\n" +" maximum number of bits encoded by the array. Memory is minimized subject\n" +" to the maximum, so pick 255 to minimize memory.\n\n" +"Get a memory estimate by passing an ARPA file without an output file name.\n"; + exit(1); +} + +// I could really use boost::lexical_cast right about now. +float ParseFloat(const char *from) { + char *end; + float ret = strtod(from, &end); + if (*end) throw util::ParseNumberException(from); + return ret; +} +unsigned long int ParseUInt(const char *from) { + char *end; + unsigned long int ret = strtoul(from, &end, 10); + if (*end) throw util::ParseNumberException(from); + return ret; +} + +uint8_t ParseBitCount(const char *from) { + unsigned long val = ParseUInt(from); + if (val > 25) { + util::ParseNumberException e(from); + e << " bit counts are limited to 25."; + } + return val; +} + +void ParseFileList(const char *from, std::vector &to) { + to.clear(); + while (true) { + const char *i; + for (i = from; *i && *i != ' '; ++i) {} + to.push_back(std::string(from, i - from)); + if (!*i) break; + from = i + 1; + } +} + +void ProbingQuantizationUnsupported() { + std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; + exit(1); +} + +} // namespace ngram +} // namespace lm +} // namespace + +int main(int argc, char *argv[]) { + using namespace lm::ngram; + + const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; + + try { + bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; + lm::ngram::Config config; + config.building_memory = util::ParseSize(default_mem); + int opt; + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { + switch(opt) { + case 'q': + config.prob_bits = ParseBitCount(optarg); + if (!set_backoff_bits) config.backoff_bits = config.prob_bits; + quantize = true; + break; + case 'b': + config.backoff_bits = ParseBitCount(optarg); + set_backoff_bits = true; + break; + case 'a': + config.pointer_bhiksha_bits = ParseBitCount(optarg); + bhiksha = true; + break; + case 'u': + config.unknown_missing_logprob = ParseFloat(optarg); + break; + case 'p': + config.probing_multiplier = ParseFloat(optarg); + break; + case 't': // legacy + case 'T': + config.temporary_directory_prefix = optarg; + break; + case 'm': // legacy + config.building_memory = ParseUInt(optarg) * 1048576; + break; + case 'S': + config.building_memory = std::min(static_cast(std::numeric_limits::max()), util::ParseSize(optarg)); + break; + case 'w': + set_write_method = true; + if (!strcmp(optarg, "mmap")) { + config.write_method = Config::WRITE_MMAP; + } else if (!strcmp(optarg, "after")) { + config.write_method = Config::WRITE_AFTER; + } else { + Usage(argv[0], default_mem); + } + break; + case 's': + config.sentence_marker_missing = lm::SILENT; + break; + case 'i': + config.positive_log_probability = lm::SILENT; + break; + case 'r': + rest = true; + ParseFileList(optarg, config.rest_lower_files); + config.rest_function = Config::REST_LOWER; + break; + default: + Usage(argv[0], default_mem); + } + } + if (!quantize && set_backoff_bits) { + std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; + abort(); + } + if (optind + 1 == argc) { + ShowSizes(argv[optind], config); + return 0; + } + const char *model_type; + const char *from_file; + + if (optind + 2 == argc) { + model_type = "probing"; + from_file = argv[optind]; + config.write_mmap = argv[optind + 1]; + } else if (optind + 3 == argc) { + model_type = argv[optind]; + from_file = argv[optind + 1]; + config.write_mmap = argv[optind + 2]; + } else { + Usage(argv[0], default_mem); + } + if (!strcmp(model_type, "probing")) { + if (!set_write_method) config.write_method = Config::WRITE_AFTER; + if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); + if (rest) { + RestProbingModel(from_file, config); + } else { + ProbingModel(from_file, config); + } + } else if (!strcmp(model_type, "trie")) { + if (rest) { + std::cerr << "Rest + trie is not supported yet." << std::endl; + return 1; + } + if (!set_write_method) config.write_method = Config::WRITE_MMAP; + if (quantize) { + if (bhiksha) { + QuantArrayTrieModel(from_file, config); + } else { + QuantTrieModel(from_file, config); + } + } else { + if (bhiksha) { + ArrayTrieModel(from_file, config); + } else { + TrieModel(from_file, config); + } + } + } else { + Usage(argv[0], default_mem); + } + } + catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + std::cerr << "ERROR" << std::endl; + return 1; + } + std::cerr << "SUCCESS" << std::endl; + return 0; +} diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am index b5c147fd..317e03ce 100644 --- a/klm/lm/builder/Makefile.am +++ b/klm/lm/builder/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = builder builder_SOURCES = \ - main.cc \ + lmplz_main.cc \ adjust_counts.cc \ adjust_counts.hh \ corpus_count.cc \ diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh index 754fb20d..4d0aa4fd 100644 --- a/klm/lm/builder/discount.hh +++ b/klm/lm/builder/discount.hh @@ -3,7 +3,7 @@ #include -#include +#include namespace lm { namespace builder { diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc new file mode 100644 index 00000000..90b9dca2 --- /dev/null +++ b/klm/lm/builder/lmplz_main.cc @@ -0,0 +1,94 @@ +#include "lm/builder/pipeline.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include + +#include + +namespace { +class SizeNotify { + public: + SizeNotify(std::size_t &out) : behind_(out) {} + + void operator()(const std::string &from) { + behind_ = util::ParseSize(from); + } + + private: + std::size_t &behind_; +}; + +boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { + return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + namespace po = boost::program_options; + po::options_description options("Language model building options"); + lm::builder::PipelineConfig pipeline; + + options.add_options() + ("order,o", po::value(&pipeline.order)->required(), "Order of the model") + ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") + ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") + ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") + ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") + ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") + ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") + ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") + ("vocab_file", po::value(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") + ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); + if (argc == 1) { + std::cerr << + "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" + "Please cite:\n" + "@inproceedings{kenlm,\n" + "author = {Kenneth Heafield},\n" + "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" + "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" + "month = {July}, year={2011},\n" + "address = {Edinburgh, UK},\n" + "publisher = {Association for Computational Linguistics},\n" + "}\n\n" + "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" + "the model (-o) is the only mandatory option. As this is an on-disk program,\n" + "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" + "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" + "Valid units are \% for percentage of memory (supported platforms only) and (in\n" + "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; + std::cerr << options << std::endl; + return 1; + } + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + po::notify(vm); + + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); + + lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; + // TODO: evaluate options for these. + initial.adder_in.total_memory = 32768; + initial.adder_in.block_count = 2; + initial.adder_out.total_memory = 32768; + initial.adder_out.block_count = 2; + pipeline.read_backoffs = initial.adder_out; + + // Read from stdin + try { + lm::builder::Pipeline(pipeline, 0, 1); + } catch (const util::MallocException &e) { + std::cerr << e.what() << std::endl; + std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; + return 1; + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/klm/lm/builder/main.cc b/klm/lm/builder/main.cc deleted file mode 100644 index 90b9dca2..00000000 --- a/klm/lm/builder/main.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "lm/builder/pipeline.hh" -#include "util/file.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include - -#include - -namespace { -class SizeNotify { - public: - SizeNotify(std::size_t &out) : behind_(out) {} - - void operator()(const std::string &from) { - behind_ = util::ParseSize(from); - } - - private: - std::size_t &behind_; -}; - -boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { - return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); -} - -} // namespace - -int main(int argc, char *argv[]) { - try { - namespace po = boost::program_options; - po::options_description options("Language model building options"); - lm::builder::PipelineConfig pipeline; - - options.add_options() - ("order,o", po::value(&pipeline.order)->required(), "Order of the model") - ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") - ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") - ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") - ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") - ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") - ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") - ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") - ("vocab_file", po::value(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") - ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); - if (argc == 1) { - std::cerr << - "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" - "Please cite:\n" - "@inproceedings{kenlm,\n" - "author = {Kenneth Heafield},\n" - "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" - "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" - "month = {July}, year={2011},\n" - "address = {Edinburgh, UK},\n" - "publisher = {Association for Computational Linguistics},\n" - "}\n\n" - "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" - "the model (-o) is the only mandatory option. As this is an on-disk program,\n" - "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" - "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" - "Valid units are \% for percentage of memory (supported platforms only) and (in\n" - "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; - std::cerr << options << std::endl; - return 1; - } - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - po::notify(vm); - - util::NormalizeTempPrefix(pipeline.sort.temp_prefix); - - lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; - // TODO: evaluate options for these. - initial.adder_in.total_memory = 32768; - initial.adder_in.block_count = 2; - initial.adder_out.total_memory = 32768; - initial.adder_out.block_count = 2; - pipeline.read_backoffs = initial.adder_out; - - // Read from stdin - try { - lm::builder::Pipeline(pipeline, 0, 1); - } catch (const util::MallocException &e) { - std::cerr << e.what() << std::endl; - std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; - return 1; - } - util::PrintUsage(std::cerr); - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } -} diff --git a/klm/lm/filter/filter_main.cc b/klm/lm/filter/filter_main.cc new file mode 100644 index 00000000..1a4ba84f --- /dev/null +++ b/klm/lm/filter/filter_main.cc @@ -0,0 +1,248 @@ +#include "lm/filter/arpa_io.hh" +#include "lm/filter/format.hh" +#include "lm/filter/phrase.hh" +#ifndef NTHREAD +#include "lm/filter/thread.hh" +#endif +#include "lm/filter/vocab.hh" +#include "lm/filter/wrapper.hh" +#include "util/file_piece.hh" + +#include + +#include +#include +#include +#include + +namespace lm { +namespace { + +void DisplayHelp(const char *name) { + std::cerr + << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" + "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" + " parser.\n" + "single mode treats the entire input as a single sentence.\n" + "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" + " a separate line. A separate file is created for each file by appending the\n" + " 0-indexed line number to the output file name.\n" + "union mode produces one filtered model that is the union of models created by\n" + " multiple mode.\n\n" + "context means only the context (all but last word) has to pass the filter, but\n" + " the entire n-gram is output.\n\n" + "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" + " phrases can generate the n-gram when assembled in arbitrary order and\n" + " clipped. Currently works with multiple or union mode.\n\n" + "The file format is set by [raw|arpa] with default arpa:\n" + "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" + " text. This is useful for ngram count files.\n" + "arpa means the ARPA file format for n-gram language models.\n\n" +#ifndef NTHREAD + "threads:m sets m threads (default: conccurrency detected by boost)\n" + "batch_size:m sets the batch size for threading. Expect memory usage from this\n" + " of 2*threads*batch_size n-grams.\n\n" +#else + "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" + " threading, compile without this flag against Boost >=1.42.0.\n\n" +#endif + "There are two inputs: vocabulary and model. Either may be given as a file\n" + " while the other is on stdin. Specify the type given as a file using\n" + " vocab: or model: before the file name. \n\n" + "For ARPA format, the output must be seekable. For raw format, it can be a\n" + " stream i.e. /dev/stdout\n"; +} + +typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; +typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; + +struct Config { + Config() : +#ifndef NTHREAD + batch_size(25000), + threads(boost::thread::hardware_concurrency()), +#endif + phrase(false), + context(false), + format(FORMAT_ARPA) + { +#ifndef NTHREAD + if (!threads) threads = 1; +#endif + } + +#ifndef NTHREAD + size_t batch_size; + size_t threads; +#endif + bool phrase; + bool context; + FilterMode mode; + Format format; +}; + +template void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { +#ifndef NTHREAD + if (config.threads == 1) { +#endif + Format::RunFilter(in_lm, filter, output); +#ifndef NTHREAD + } else { + typedef Controller Threaded; + Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); + Format::RunFilter(in_lm, threading, output); + } +#endif +} + +template void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { + if (config.context) { + ContextFilter context_filter(filter); + RunThreadedFilter, OutputBuffer, Output>(config, in_lm, context_filter, output); + } else { + RunThreadedFilter(config, in_lm, filter, output); + } +} + +template void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { + typedef BinaryFilter Filter; + RunContextFilter(config, in_lm, Filter(binary), out); +} + +template void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { + if (config.mode == MODE_MULTIPLE) { + if (config.phrase) { + typedef phrase::Multiple Filter; + phrase::Substrings substrings; + typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); + RunContextFilter(config, in_lm, Filter(substrings), out); + } else { + typedef vocab::Multiple Filter; + boost::unordered_map > words; + typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); + RunContextFilter(config, in_lm, Filter(words), out); + } + return; + } + + typename Format::Output out(out_name); + + if (config.mode == MODE_COPY) { + Format::Copy(in_lm, out); + return; + } + + if (config.mode == MODE_SINGLE) { + vocab::Single::Words words; + vocab::ReadSingle(in_vocab, words); + DispatchBinaryFilter(config, in_lm, vocab::Single(words), out); + return; + } + + if (config.mode == MODE_UNION) { + if (config.phrase) { + phrase::Substrings substrings; + phrase::ReadMultiple(in_vocab, substrings); + DispatchBinaryFilter(config, in_lm, phrase::Union(substrings), out); + } else { + vocab::Union::Words words; + vocab::ReadMultiple(in_vocab, words); + DispatchBinaryFilter(config, in_lm, vocab::Union(words), out); + } + return; + } +} + +} // namespace +} // namespace lm + +int main(int argc, char *argv[]) { + if (argc < 4) { + lm::DisplayHelp(argv[0]); + return 1; + } + + // I used to have boost::program_options, but some users didn't want to compile boost. + lm::Config config; + config.mode = lm::MODE_UNSET; + for (int i = 1; i < argc - 2; ++i) { + const char *str = argv[i]; + if (!std::strcmp(str, "copy")) { + config.mode = lm::MODE_COPY; + } else if (!std::strcmp(str, "single")) { + config.mode = lm::MODE_SINGLE; + } else if (!std::strcmp(str, "multiple")) { + config.mode = lm::MODE_MULTIPLE; + } else if (!std::strcmp(str, "union")) { + config.mode = lm::MODE_UNION; + } else if (!std::strcmp(str, "phrase")) { + config.phrase = true; + } else if (!std::strcmp(str, "context")) { + config.context = true; + } else if (!std::strcmp(str, "arpa")) { + config.format = lm::FORMAT_ARPA; + } else if (!std::strcmp(str, "raw")) { + config.format = lm::FORMAT_COUNT; +#ifndef NTHREAD + } else if (!std::strncmp(str, "threads:", 8)) { + config.threads = boost::lexical_cast(str + 8); + if (!config.threads) { + std::cerr << "Specify at least one thread." << std::endl; + return 1; + } + } else if (!std::strncmp(str, "batch_size:", 11)) { + config.batch_size = boost::lexical_cast(str + 11); + if (config.batch_size < 5000) { + std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; + if (!config.batch_size) return 1; + } +#endif + } else { + lm::DisplayHelp(argv[0]); + return 1; + } + } + + if (config.mode == lm::MODE_UNSET) { + lm::DisplayHelp(argv[0]); + return 1; + } + + if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { + std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl; + return 1; + } + + bool cmd_is_model = true; + const char *cmd_input = argv[argc - 2]; + if (!strncmp(cmd_input, "vocab:", 6)) { + cmd_is_model = false; + cmd_input += 6; + } else if (!strncmp(cmd_input, "model:", 6)) { + cmd_input += 6; + } else if (strchr(cmd_input, ':')) { + errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); + } else { + std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; + } + std::ifstream cmd_file; + std::istream *vocab; + if (cmd_is_model) { + vocab = &std::cin; + } else { + cmd_file.open(cmd_input, std::ios::in); + if (!cmd_file) { + err(2, "Could not open input file %s", cmd_input); + } + vocab = &cmd_file; + } + + util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); + + if (config.format == lm::FORMAT_ARPA) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } else if (config.format == lm::FORMAT_COUNT) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } + return 0; +} diff --git a/klm/lm/filter/main.cc b/klm/lm/filter/main.cc deleted file mode 100644 index c42243e2..00000000 --- a/klm/lm/filter/main.cc +++ /dev/null @@ -1,249 +0,0 @@ -#include "lm/filter/arpa_io.hh" -#include "lm/filter/format.hh" -#include "lm/filter/phrase.hh" -#ifndef NTHREAD -#include "lm/filter/thread.hh" -#endif -#include "lm/filter/vocab.hh" -#include "lm/filter/wrapper.hh" -#include "util/file_piece.hh" - -#include - -#include -#include -#include -#include - -namespace lm { -namespace { - -void DisplayHelp(const char *name) { - std::cerr - << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" - "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" - " parser.\n" - "single mode treats the entire input as a single sentence.\n" - "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" - " a separate line. A separate file is created for each file by appending the\n" - " 0-indexed line number to the output file name.\n" - "union mode produces one filtered model that is the union of models created by\n" - " multiple mode.\n\n" - "context means only the context (all but last word) has to pass the filter, but\n" - " the entire n-gram is output.\n\n" - "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" - " phrases can generate the n-gram when assembled in arbitrary order and\n" - " clipped. Currently works with multiple or union mode.\n\n" - "The file format is set by [raw|arpa] with default arpa:\n" - "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" - " text. This is useful for ngram count files.\n" - "arpa means the ARPA file format for n-gram language models.\n\n" -#ifndef NTHREAD - "threads:m sets m threads (default: conccurrency detected by boost)\n" - "batch_size:m sets the batch size for threading. Expect memory usage from this\n" - " of 2*threads*batch_size n-grams.\n\n" -#else - "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" - " threading, compile without this flag against Boost >=1.42.0.\n\n" -#endif - "There are two inputs: vocabulary and model. Either may be given as a file\n" - " while the other is on stdin. Specify the type given as a file using\n" - " vocab: or model: before the file name. \n\n" - "For ARPA format, the output must be seekable. For raw format, it can be a\n" - " stream i.e. /dev/stdout\n"; -} - -typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION} FilterMode; -typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; - -struct Config { - Config() : -#ifndef NTHREAD - batch_size(25000), - threads(boost::thread::hardware_concurrency()), -#endif - phrase(false), - context(false), - format(FORMAT_ARPA) - { -#ifndef NTHREAD - if (!threads) threads = 1; -#endif - } - -#ifndef NTHREAD - size_t batch_size; - size_t threads; -#endif - bool phrase; - bool context; - FilterMode mode; - Format format; -}; - -template void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { -#ifndef NTHREAD - if (config.threads == 1) { -#endif - Format::RunFilter(in_lm, filter, output); -#ifndef NTHREAD - } else { - typedef Controller Threaded; - Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); - Format::RunFilter(in_lm, threading, output); - } -#endif -} - -template void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { - if (config.context) { - ContextFilter context_filter(filter); - RunThreadedFilter, OutputBuffer, Output>(config, in_lm, context_filter, output); - } else { - RunThreadedFilter(config, in_lm, filter, output); - } -} - -template void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { - typedef BinaryFilter Filter; - RunContextFilter(config, in_lm, Filter(binary), out); -} - -template void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { - if (config.mode == MODE_MULTIPLE) { - if (config.phrase) { - typedef phrase::Multiple Filter; - phrase::Substrings substrings; - typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); - RunContextFilter(config, in_lm, Filter(substrings), out); - } else { - typedef vocab::Multiple Filter; - boost::unordered_map > words; - typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); - RunContextFilter(config, in_lm, Filter(words), out); - } - return; - } - - typename Format::Output out(out_name); - - if (config.mode == MODE_COPY) { - Format::Copy(in_lm, out); - return; - } - - if (config.mode == MODE_SINGLE) { - vocab::Single::Words words; - vocab::ReadSingle(in_vocab, words); - DispatchBinaryFilter(config, in_lm, vocab::Single(words), out); - return; - } - - if (config.mode == MODE_UNION) { - if (config.phrase) { - phrase::Substrings substrings; - phrase::ReadMultiple(in_vocab, substrings); - DispatchBinaryFilter(config, in_lm, phrase::Union(substrings), out); - } else { - vocab::Union::Words words; - vocab::ReadMultiple(in_vocab, words); - DispatchBinaryFilter(config, in_lm, vocab::Union(words), out); - } - return; - } -} - -} // namespace -} // namespace lm - -int main(int argc, char *argv[]) { - if (argc < 4) { - lm::DisplayHelp(argv[0]); - return 1; - } - - // I used to have boost::program_options, but some users didn't want to compile boost. - lm::Config config; - boost::optional mode; - for (int i = 1; i < argc - 2; ++i) { - const char *str = argv[i]; - if (!std::strcmp(str, "copy")) { - mode = lm::MODE_COPY; - } else if (!std::strcmp(str, "single")) { - mode = lm::MODE_SINGLE; - } else if (!std::strcmp(str, "multiple")) { - mode = lm::MODE_MULTIPLE; - } else if (!std::strcmp(str, "union")) { - mode = lm::MODE_UNION; - } else if (!std::strcmp(str, "phrase")) { - config.phrase = true; - } else if (!std::strcmp(str, "context")) { - config.context = true; - } else if (!std::strcmp(str, "arpa")) { - config.format = lm::FORMAT_ARPA; - } else if (!std::strcmp(str, "raw")) { - config.format = lm::FORMAT_COUNT; -#ifndef NTHREAD - } else if (!std::strncmp(str, "threads:", 8)) { - config.threads = boost::lexical_cast(str + 8); - if (!config.threads) { - std::cerr << "Specify at least one thread." << std::endl; - return 1; - } - } else if (!std::strncmp(str, "batch_size:", 11)) { - config.batch_size = boost::lexical_cast(str + 11); - if (config.batch_size < 5000) { - std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; - if (!config.batch_size) return 1; - } -#endif - } else { - lm::DisplayHelp(argv[0]); - return 1; - } - } - - if (!mode) { - lm::DisplayHelp(argv[0]); - return 1; - } - config.mode = *mode; - - if (config.phrase && config.mode != lm::MODE_UNION && mode != lm::MODE_MULTIPLE) { - std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl; - return 1; - } - - bool cmd_is_model = true; - const char *cmd_input = argv[argc - 2]; - if (!strncmp(cmd_input, "vocab:", 6)) { - cmd_is_model = false; - cmd_input += 6; - } else if (!strncmp(cmd_input, "model:", 6)) { - cmd_input += 6; - } else if (strchr(cmd_input, ':')) { - errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); - } else { - std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; - } - std::ifstream cmd_file; - std::istream *vocab; - if (cmd_is_model) { - vocab = &std::cin; - } else { - cmd_file.open(cmd_input, std::ios::in); - if (!cmd_file) { - err(2, "Could not open input file %s", cmd_input); - } - vocab = &cmd_file; - } - - util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); - - if (config.format == lm::FORMAT_ARPA) { - lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); - } else if (config.format == lm::FORMAT_COUNT) { - lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); - } - return 0; -} diff --git a/klm/lm/filter/phrase.hh b/klm/lm/filter/phrase.hh index 07479dea..b4edff41 100644 --- a/klm/lm/filter/phrase.hh +++ b/klm/lm/filter/phrase.hh @@ -57,6 +57,7 @@ class Substrings { LM_FILTER_PHRASE_METHOD(Right, right) LM_FILTER_PHRASE_METHOD(Phrase, phrase) +#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization // sentence_id must be non-decreasing. Iterators are over words in the phrase. template void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { // Iterate over all substrings. diff --git a/klm/lm/filter/vocab.hh b/klm/lm/filter/vocab.hh index e2b6adff..7f0fadaa 100644 --- a/klm/lm/filter/vocab.hh +++ b/klm/lm/filter/vocab.hh @@ -5,6 +5,7 @@ #include "util/multi_intersection.hh" #include "util/string_piece.hh" +#include "util/string_piece_hash.hh" #include "util/tokenize_piece.hh" #include diff --git a/klm/lm/fragment.cc b/klm/lm/fragment.cc deleted file mode 100644 index 0267cd4e..00000000 --- a/klm/lm/fragment.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "lm/binary_format.hh" -#include "lm/model.hh" -#include "lm/left.hh" -#include "util/tokenize_piece.hh" - -template void Query(const char *name) { - Model model(name); - std::string line; - lm::ngram::ChartState ignored; - while (getline(std::cin, line)) { - lm::ngram::RuleScore scorer(model, ignored); - for (util::TokenIter i(line, ' '); i; ++i) { - scorer.Terminal(model.GetVocabulary().Index(*i)); - } - std::cout << scorer.Finish() << '\n'; - } -} - -int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Expected model file name." << std::endl; - return 1; - } - const char *name = argv[1]; - lm::ngram::ModelType model_type = lm::ngram::PROBING; - lm::ngram::RecognizeBinary(name, model_type); - switch (model_type) { - case lm::ngram::PROBING: - Query(name); - break; - case lm::ngram::REST_PROBING: - Query(name); - break; - default: - std::cerr << "Model type not supported yet." << std::endl; - } -} diff --git a/klm/lm/fragment_main.cc b/klm/lm/fragment_main.cc new file mode 100644 index 00000000..0267cd4e --- /dev/null +++ b/klm/lm/fragment_main.cc @@ -0,0 +1,37 @@ +#include "lm/binary_format.hh" +#include "lm/model.hh" +#include "lm/left.hh" +#include "util/tokenize_piece.hh" + +template void Query(const char *name) { + Model model(name); + std::string line; + lm::ngram::ChartState ignored; + while (getline(std::cin, line)) { + lm::ngram::RuleScore scorer(model, ignored); + for (util::TokenIter i(line, ' '); i; ++i) { + scorer.Terminal(model.GetVocabulary().Index(*i)); + } + std::cout << scorer.Finish() << '\n'; + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Expected model file name." << std::endl; + return 1; + } + const char *name = argv[1]; + lm::ngram::ModelType model_type = lm::ngram::PROBING; + lm::ngram::RecognizeBinary(name, model_type); + switch (model_type) { + case lm::ngram::PROBING: + Query(name); + break; + case lm::ngram::REST_PROBING: + Query(name); + break; + default: + std::cerr << "Model type not supported yet." << std::endl; + } +} diff --git a/klm/lm/kenlm_max_order_main.cc b/klm/lm/kenlm_max_order_main.cc new file mode 100644 index 00000000..94221201 --- /dev/null +++ b/klm/lm/kenlm_max_order_main.cc @@ -0,0 +1,6 @@ +#include "lm/max_order.hh" +#include + +int main(int argc, char *argv[]) { + std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; +} diff --git a/klm/lm/max_order.cc b/klm/lm/max_order.cc deleted file mode 100644 index 94221201..00000000 --- a/klm/lm/max_order.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "lm/max_order.hh" -#include - -int main(int argc, char *argv[]) { - std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; -} diff --git a/klm/lm/ngram_query.cc b/klm/lm/ngram_query.cc deleted file mode 100644 index 49757d9a..00000000 --- a/klm/lm/ngram_query.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "lm/ngram_query.hh" - -int main(int argc, char *argv[]) { - if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { - std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl; - std::cerr << "Input is wrapped in and unless null is passed." << std::endl; - return 1; - } - try { - bool sentence_context = (argc == 2); - using namespace lm::ngram; - ModelType model_type; - if (RecognizeBinary(argv[1], model_type)) { - switch(model_type) { - case PROBING: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case REST_PROBING: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case QUANT_TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case ARRAY_TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case QUANT_ARRAY_TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - default: - std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; - abort(); - } - } else { - Query(argv[1], sentence_context, std::cin, std::cout); - } - std::cerr << "Total time including destruction:\n"; - util::PrintUsage(std::cerr); - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } - return 0; -} diff --git a/klm/lm/query_main.cc b/klm/lm/query_main.cc new file mode 100644 index 00000000..49757d9a --- /dev/null +++ b/klm/lm/query_main.cc @@ -0,0 +1,47 @@ +#include "lm/ngram_query.hh" + +int main(int argc, char *argv[]) { + if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { + std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl; + std::cerr << "Input is wrapped in and unless null is passed." << std::endl; + return 1; + } + try { + bool sentence_context = (argc == 2); + using namespace lm::ngram; + ModelType model_type; + if (RecognizeBinary(argv[1], model_type)) { + switch(model_type) { + case PROBING: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case REST_PROBING: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case ARRAY_TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_ARRAY_TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + default: + std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; + abort(); + } + } else { + Query(argv[1], sentence_context, std::cin, std::cout); + } + std::cerr << "Total time including destruction:\n"; + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + return 0; +} -- cgit v1.2.3