From 516c132fb683b5bf77ae3230a1b3709beb57618e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 22 Jan 2013 21:37:49 +0000 Subject: KenLM 58da338b --- klm/lm/builder/Makefile.am | 2 +- klm/lm/builder/discount.hh | 2 +- klm/lm/builder/lmplz_main.cc | 94 ++++++++++++++++++++++++++++++++++++++++++++ klm/lm/builder/main.cc | 94 -------------------------------------------- 4 files changed, 96 insertions(+), 96 deletions(-) create mode 100644 klm/lm/builder/lmplz_main.cc delete mode 100644 klm/lm/builder/main.cc (limited to 'klm/lm/builder') diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am index b5c147fd..317e03ce 100644 --- a/klm/lm/builder/Makefile.am +++ b/klm/lm/builder/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = builder builder_SOURCES = \ - main.cc \ + lmplz_main.cc \ adjust_counts.cc \ adjust_counts.hh \ corpus_count.cc \ diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh index 754fb20d..4d0aa4fd 100644 --- a/klm/lm/builder/discount.hh +++ b/klm/lm/builder/discount.hh @@ -3,7 +3,7 @@ #include -#include +#include namespace lm { namespace builder { diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc new file mode 100644 index 00000000..90b9dca2 --- /dev/null +++ b/klm/lm/builder/lmplz_main.cc @@ -0,0 +1,94 @@ +#include "lm/builder/pipeline.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include + +#include + +namespace { +class SizeNotify { + public: + SizeNotify(std::size_t &out) : behind_(out) {} + + void operator()(const std::string &from) { + behind_ = util::ParseSize(from); + } + + private: + std::size_t &behind_; +}; + +boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { + return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + namespace po = boost::program_options; + po::options_description options("Language model building options"); + lm::builder::PipelineConfig pipeline; + + options.add_options() + ("order,o", po::value(&pipeline.order)->required(), "Order of the model") + ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") + ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") + ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") + ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") + ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") + ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") + ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") + ("vocab_file", po::value(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") + ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); + if (argc == 1) { + std::cerr << + "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" + "Please cite:\n" + "@inproceedings{kenlm,\n" + "author = {Kenneth Heafield},\n" + "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" + "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" + "month = {July}, year={2011},\n" + "address = {Edinburgh, UK},\n" + "publisher = {Association for Computational Linguistics},\n" + "}\n\n" + "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" + "the model (-o) is the only mandatory option. As this is an on-disk program,\n" + "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" + "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" + "Valid units are \% for percentage of memory (supported platforms only) and (in\n" + "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; + std::cerr << options << std::endl; + return 1; + } + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + po::notify(vm); + + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); + + lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; + // TODO: evaluate options for these. + initial.adder_in.total_memory = 32768; + initial.adder_in.block_count = 2; + initial.adder_out.total_memory = 32768; + initial.adder_out.block_count = 2; + pipeline.read_backoffs = initial.adder_out; + + // Read from stdin + try { + lm::builder::Pipeline(pipeline, 0, 1); + } catch (const util::MallocException &e) { + std::cerr << e.what() << std::endl; + std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; + return 1; + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/klm/lm/builder/main.cc b/klm/lm/builder/main.cc deleted file mode 100644 index 90b9dca2..00000000 --- a/klm/lm/builder/main.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "lm/builder/pipeline.hh" -#include "util/file.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include - -#include - -namespace { -class SizeNotify { - public: - SizeNotify(std::size_t &out) : behind_(out) {} - - void operator()(const std::string &from) { - behind_ = util::ParseSize(from); - } - - private: - std::size_t &behind_; -}; - -boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { - return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); -} - -} // namespace - -int main(int argc, char *argv[]) { - try { - namespace po = boost::program_options; - po::options_description options("Language model building options"); - lm::builder::PipelineConfig pipeline; - - options.add_options() - ("order,o", po::value(&pipeline.order)->required(), "Order of the model") - ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") - ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") - ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") - ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") - ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") - ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") - ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") - ("vocab_file", po::value(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") - ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); - if (argc == 1) { - std::cerr << - "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" - "Please cite:\n" - "@inproceedings{kenlm,\n" - "author = {Kenneth Heafield},\n" - "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" - "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" - "month = {July}, year={2011},\n" - "address = {Edinburgh, UK},\n" - "publisher = {Association for Computational Linguistics},\n" - "}\n\n" - "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" - "the model (-o) is the only mandatory option. As this is an on-disk program,\n" - "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" - "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" - "Valid units are \% for percentage of memory (supported platforms only) and (in\n" - "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; - std::cerr << options << std::endl; - return 1; - } - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - po::notify(vm); - - util::NormalizeTempPrefix(pipeline.sort.temp_prefix); - - lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; - // TODO: evaluate options for these. - initial.adder_in.total_memory = 32768; - initial.adder_in.block_count = 2; - initial.adder_out.total_memory = 32768; - initial.adder_out.block_count = 2; - pipeline.read_backoffs = initial.adder_out; - - // Read from stdin - try { - lm::builder::Pipeline(pipeline, 0, 1); - } catch (const util::MallocException &e) { - std::cerr << e.what() << std::endl; - std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; - return 1; - } - util::PrintUsage(std::cerr); - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } -} -- cgit v1.2.3