klm/lm/build_binary.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

#include "lm/model.hh"
#include "util/file_piece.hh"

#include <cerrno>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <iomanip>

#include <math.h>
#include <stdlib.h>
#include <unistd.h>

namespace lm {
namespace ngram {
namespace {

// Writes the command-line help to stderr and then ends the process with exit
// status 1.  `name` is echoed as the program name on the first line.
void Usage(const char *name) {
  // Everything that follows the program name on the help screen.
  static const char kHelpBody[] =
      " [-u unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
      "-u sets the default probability for <unk> if the ARPA file does not have one.\n"
      "-s allows models to be built even if they do not have <s> and </s>.\n\n"
      "type is one of probing, trie, or sorted:\n\n"
      "probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
      "-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
      "trie is a straightforward trie with bit-level packing.  It uses the least\n"
      "memory and is still faster than SRI or IRST.  Building the trie format uses an\n"
      "on-disk sort to save memory.\n"
      "-t is the temporary directory prefix.  Default is the output file name.\n"
      "-m limits memory use for sorting.  Measured in MB.  Default is 1024MB.\n\n"
      /* Help for the "sorted" type is intentionally left out of the output:
      "sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
      "It uses more memory than trie and is also slower, so there's no real reason to\n"
      "use it.\n\n"*/
      "See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
      "Passing only an input file will print memory usage of each data structure.\n"
      "If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";

  std::cerr << "Usage: " << name << kHelpBody;
  exit(1);
}

// I could really use boost::lexical_cast right about now.
//
// Parses the whole of `from` as a floating-point number and returns it as a
// float.  Throws util::ParseNumberException if no number could be converted
// (for example an empty string) or if any characters follow the number.
float ParseFloat(const char *from) {
  char *end;
  float ret = strtod(from, &end);
  // strtod leaves end == from when it converts nothing.  Checking only *end
  // would let an empty argument through and silently read it as 0.
  if (end == from || *end) throw util::ParseNumberException(from);
  return ret;
}
// Parses the whole of `from` as a non-negative base-10 integer.  Throws
// util::ParseNumberException if the text is empty, contains a minus sign, has
// trailing characters after the digits, or does not fit in unsigned long.
unsigned long int ParseUInt(const char *from) {
  // strtoul accepts a leading '-' and returns the negated value wrapped into
  // the unsigned range, so "-1" would become a huge number.  A valid unsigned
  // decimal never contains '-', so any occurrence is rejected up front.
  for (const char *c = from; *c; ++c) {
    if (*c == '-') throw util::ParseNumberException(from);
  }
  char *end;
  errno = 0;  // strtoul reports overflow only through errno.
  unsigned long int ret = strtoul(from, &end, 10);
  // end == from means nothing was converted (e.g. an empty string).
  if (end == from || *end || errno == ERANGE) throw util::ParseNumberException(from);
  return ret;
}

// Prints a small table to stdout with the sizes reported by
// ProbingModel::Size and TrieModel::Size for the n-gram counts that
// lm::ReadARPACounts reads from `file`, using the settings in `config`.
void ShowSizes(const char *file, const lm::ngram::Config &config) {
  std::vector<uint64_t> counts;
  util::FilePiece f(file);
  lm::ReadARPACounts(f, counts);
  std::size_t probing_size = ProbingModel::Size(counts, config);
  // probing is always largest so use it to determine number of columns.
  // Count decimal digits with integer arithmetic: ceil(log10(n)) is one short
  // for exact powers of ten (100000 has 6 digits, not 5) and is not meaningful
  // for n == 0.
  int length = 0;
  for (std::size_t remaining = probing_size; remaining != 0; remaining /= 10) ++length;
  // The column is never narrower than its 5-character heading "bytes".
  if (length < 5) length = 5;
  std::cout << "Memory estimate:\ntype    ";
  // right align bytes.
  for (int i = 0; i < length - 5; ++i) std::cout << ' ';
  std::cout << "bytes\n"
    "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
    "trie    " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
/*    "sorted  " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
}

} // namespace
} // namespace ngram
} // namespace lm

// Installed through std::set_terminate in main.  Rethrows the exception that
// is currently being handled so that its message can be written to stderr,
// then aborts the process.
void terminate_handler() {
  const char *const unknown_message = "A non-standard exception was thrown.";
  try {
    // Rethrow so the catch clauses below can inspect the exception's type.
    throw;
  } catch (const std::exception &error) {
    std::cerr << error.what() << std::endl;
  } catch (...) {
    std::cerr << unknown_message << std::endl;
  }
  std::abort();
}

// Entry point.  Reads the option flags into a Config, then decides what to do
// from the number of positional arguments left over:
//   1 (input)               -> print size estimates only
//   2 (input, output)       -> build a probing model
//   3 (type, input, output) -> build the named model type
// Anything else prints the usage text.
int main(int argc, char *argv[]) {
  using namespace lm::ngram;

  std::set_terminate(terminate_handler);

  lm::ngram::Config config;
  const char *const kFlags = "su:p:t:m:";
  int flag;
  while ((flag = getopt(argc, argv, kFlags)) != -1) {
    if (flag == 'u') {
      config.unknown_missing_prob = ParseFloat(optarg);
    } else if (flag == 'p') {
      config.probing_multiplier = ParseFloat(optarg);
    } else if (flag == 't') {
      config.temporary_directory_prefix = optarg;
    } else if (flag == 'm') {
      // The argument is in megabytes; the config field is filled in bytes.
      config.building_memory = ParseUInt(optarg) * 1048576;
    } else if (flag == 's') {
      config.sentence_marker_missing = lm::ngram::Config::SILENT;
    } else {
      // Any other value returned by getopt.
      Usage(argv[0]);
    }
  }

  // The model objects below are created as temporaries and dropped at once;
  // presumably constructing one with config.write_mmap set is what produces
  // the output file -- confirm against lm/model.hh.
  switch (argc - optind) {
    case 1:
      ShowSizes(argv[optind], config);
      break;
    case 2:
      config.write_mmap = argv[optind + 1];
      ProbingModel(argv[optind], config);
      break;
    case 3: {
      const char *kind = argv[optind];
      const char *arpa_path = argv[optind + 1];
      config.write_mmap = argv[optind + 2];
      if (strcmp(kind, "probing") == 0) {
        ProbingModel(arpa_path, config);
      } else if (strcmp(kind, "sorted") == 0) {
        SortedModel(arpa_path, config);
      } else if (strcmp(kind, "trie") == 0) {
        TrieModel(arpa_path, config);
      } else {
        Usage(argv[0]);
      }
      break;
    }
    default:
      Usage(argv[0]);
  }
  return 0;
}