summaryrefslogtreecommitdiff
path: root/klm/lm/build_binary.cc
diff options
context:
space:
mode:
Diffstat (limited to 'klm/lm/build_binary.cc')
-rw-r--r--klm/lm/build_binary.cc97
1 files changed, 64 insertions, 33 deletions
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index 8cbb69d0..c4a01cb4 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -66,16 +66,28 @@ uint8_t ParseBitCount(const char *from) {
return val;
}
+void ParseFileList(const char *from, std::vector<std::string> &to) {
+ to.clear();
+ while (true) {
+ const char *i;
+ for (i = from; *i && *i != ' '; ++i) {}
+ to.push_back(std::string(from, i - from));
+ if (!*i) break;
+ from = i + 1;
+ }
+}
+
void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::vector<uint64_t> counts;
util::FilePiece f(file);
lm::ReadARPACounts(f, counts);
- std::size_t sizes[5];
+ std::size_t sizes[6];
sizes[0] = ProbingModel::Size(counts, config);
- sizes[1] = TrieModel::Size(counts, config);
- sizes[2] = QuantTrieModel::Size(counts, config);
- sizes[3] = ArrayTrieModel::Size(counts, config);
- sizes[4] = QuantArrayTrieModel::Size(counts, config);
+ sizes[1] = RestProbingModel::Size(counts, config);
+ sizes[2] = TrieModel::Size(counts, config);
+ sizes[3] = QuantTrieModel::Size(counts, config);
+ sizes[4] = ArrayTrieModel::Size(counts, config);
+ sizes[5] = QuantArrayTrieModel::Size(counts, config);
std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
std::size_t divide;
@@ -99,10 +111,11 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
std::cout << prefix << "B\n"
"probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
- "trie " << std::setw(length) << (sizes[1] / divide) << " without quantization\n"
- "trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
- "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
- "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
+ "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r -p " << config.probing_multiplier << "\n"
+ "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n"
+ "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
+ "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
+ "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
}
void ProbingQuantizationUnsupported() {
@@ -118,10 +131,10 @@ int main(int argc, char *argv[]) {
using namespace lm::ngram;
try {
- bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false;
+ bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
lm::ngram::Config config;
int opt;
- while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) {
+ while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:sir:")) != -1) {
switch(opt) {
case 'q':
config.prob_bits = ParseBitCount(optarg);
@@ -164,6 +177,11 @@ int main(int argc, char *argv[]) {
case 'i':
config.positive_log_probability = lm::SILENT;
break;
+ case 'r':
+ rest = true;
+ ParseFileList(optarg, config.rest_lower_files);
+ config.rest_function = Config::REST_LOWER;
+ break;
default:
Usage(argv[0]);
}
@@ -174,35 +192,48 @@ int main(int argc, char *argv[]) {
}
if (optind + 1 == argc) {
ShowSizes(argv[optind], config);
- } else if (optind + 2 == argc) {
+ return 0;
+ }
+ const char *model_type;
+ const char *from_file;
+
+ if (optind + 2 == argc) {
+ model_type = "probing";
+ from_file = argv[optind];
config.write_mmap = argv[optind + 1];
- if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
- ProbingModel(argv[optind], config);
} else if (optind + 3 == argc) {
- const char *model_type = argv[optind];
- const char *from_file = argv[optind + 1];
+ model_type = argv[optind];
+ from_file = argv[optind + 1];
config.write_mmap = argv[optind + 2];
- if (!strcmp(model_type, "probing")) {
- if (!set_write_method) config.write_method = Config::WRITE_AFTER;
- if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+ } else {
+ Usage(argv[0]);
+ }
+ if (!strcmp(model_type, "probing")) {
+ if (!set_write_method) config.write_method = Config::WRITE_AFTER;
+ if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+ if (rest) {
+ RestProbingModel(from_file, config);
+ } else {
ProbingModel(from_file, config);
- } else if (!strcmp(model_type, "trie")) {
- if (!set_write_method) config.write_method = Config::WRITE_MMAP;
- if (quantize) {
- if (bhiksha) {
- QuantArrayTrieModel(from_file, config);
- } else {
- QuantTrieModel(from_file, config);
- }
+ }
+ } else if (!strcmp(model_type, "trie")) {
+ if (rest) {
+ std::cerr << "Rest + trie is not supported yet." << std::endl;
+ return 1;
+ }
+ if (!set_write_method) config.write_method = Config::WRITE_MMAP;
+ if (quantize) {
+ if (bhiksha) {
+ QuantArrayTrieModel(from_file, config);
} else {
- if (bhiksha) {
- ArrayTrieModel(from_file, config);
- } else {
- TrieModel(from_file, config);
- }
+ QuantTrieModel(from_file, config);
}
} else {
- Usage(argv[0]);
+ if (bhiksha) {
+ ArrayTrieModel(from_file, config);
+ } else {
+ TrieModel(from_file, config);
+ }
}
} else {
Usage(argv[0]);