1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#include "lm/builder/pipeline.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/usage.hh"
#include <iostream>
#include <boost/program_options.hpp>
namespace {
// Callback object for string-valued size options: parses the option text with
// util::ParseSize and writes the result into a caller-supplied std::size_t.
//
// Only a reference to the destination is stored, so the destination must
// remain alive for as long as any copy of this functor may be invoked.
class SizeNotify {
  public:
    // Bind the functor to the variable that receives the parsed size.
    // explicit: a std::size_t lvalue should never silently turn into a notifier.
    explicit SizeNotify(std::size_t &out) : behind_(out) {}

    // Parse `from` and store the result in the bound variable.
    // Any exception raised by util::ParseSize propagates to the caller.
    void operator()(const std::string &from) const {
      behind_ = util::ParseSize(from);
    }

  private:
    // Destination for the parsed value; not owned.
    std::size_t &behind_;
};
// Create a string-valued option description for a memory size.
//
//   to            - variable that SizeNotify fills in with the parsed size
//                   when the option's notifier is invoked.
//   default_value - textual default (e.g. "64M") used if the option is absent.
//
// Returns the typed_value pointer produced by boost::program_options::value;
// callers in this file hand it straight to options_description::add_options.
boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
  namespace po = boost::program_options;
  po::typed_value<std::string> *size_value = po::value<std::string>();
  // Register the parser first, then the default, in the same order as a chained call.
  size_value = size_value->notifier(SizeNotify(to));
  size_value = size_value->default_value(default_value);
  return size_value;
}
} // namespace
// Program entry point: fills a lm::builder::PipelineConfig from the command
// line and runs lm::builder::Pipeline with it.
//
// Returns 1 when invoked without arguments (after printing help to stderr),
// when any std::exception escapes option parsing or the pipeline, or when the
// pipeline throws util::MallocException; returns 0 otherwise.
int main(int argc, char *argv[]) {
  try {
    namespace po = boost::program_options;
    po::options_description options("Language model building options");
    lm::builder::PipelineConfig pipeline;
    // Most options write directly into fields of `pipeline`; size-valued ones
    // go through SizeOption so that strings such as "64M" or "80%" are parsed.
    options.add_options()
      ("order,o", po::value<std::size_t>(&pipeline.order)->required(), "Order of the model")
      ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
      // Default is a percentage when util::GuessPhysicalMemory() returns nonzero, otherwise a fixed 1G.
      ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
      ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step")
      ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
      ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");

    // No arguments at all: show the description, citation and option list, then fail.
    if (argc == 1) {
      // Note: '%' is written without a backslash; "\%" is not a valid C++ escape sequence.
      std::cerr <<
        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
        "Please cite:\n"
        "@inproceedings{kenlm,\n"
        "author = {Kenneth Heafield},\n"
        "title = {{KenLM}: Faster and Smaller Language Model Queries},\n"
        "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
        "month = {July}, year={2011},\n"
        "address = {Edinburgh, UK},\n"
        "publisher = {Association for Computational Linguistics},\n"
        "}\n\n"
        "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
        "the model (-o) is the only mandatory option. As this is an on-disk program,\n"
        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
        "Valid units are % for percentage of memory (supported platforms only) and (in\n"
        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n";
      std::cerr << options << std::endl;
      return 1;
    }

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, options), vm);
    // notify() enforces required options and runs the notifiers, including the
    // SizeNotify callbacks installed by SizeOption.
    po::notify(vm);

    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);

    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
    // TODO: evaluate options for these.
    initial.adder_in.total_memory = 32768;
    initial.adder_in.block_count = 2;
    initial.adder_out.total_memory = 32768;
    initial.adder_out.block_count = 2;
    pipeline.read_backoffs = initial.adder_out;

    // Read from stdin. The arguments 0 and 1 are presumably the stdin/stdout
    // file descriptors, matching the help text above; confirm against Pipeline.
    try {
      lm::builder::Pipeline(pipeline, 0, 1);
    } catch (const util::MallocException &e) {
      // Allocation failed: report it and point the user at the -S setting they used.
      std::cerr << e.what() << std::endl;
      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
      return 1;
    }
    util::PrintUsage(std::cerr);
  } catch (const std::exception &e) {
    // Covers option-parsing errors as well as failures inside the pipeline.
    std::cerr << e.what() << std::endl;
    return 1;
  }
  return 0;
}
|