#include "lm/model.hh"
#include "util/file_piece.hh"
#include <iostream>
#include <iomanip>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
namespace lm {
namespace ngram {
namespace {
void Usage(const char *name) {
  std::cerr << "Usage: " << name << " [-u unknown_probability] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
    "Where type is one of probing, trie, or sorted:\n\n"
    "probing uses a probing hash table. It is the fastest but uses the most memory.\n"
    "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
    "trie is a straightforward trie with bit-level packing. It uses the least\n"
    "memory and is still faster than SRI or IRST. Building the trie format uses an\n"
    "on-disk sort to save memory.\n"
    "-t is the temporary directory prefix. Default is the output file name.\n"
    "-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n\n"
    /*"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
    "It uses more memory than trie and is also slower, so there's no real reason to\n"
    "use it.\n\n"*/
    "See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
    "Passing only an input file will print memory usage of each data structure.\n"
    "If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";
  exit(1);
}
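
// Example invocations corresponding to the help text above (a sketch only; the
// executable name and the file names input.arpa / output.mmap are placeholders):
//   build_binary input.arpa                               # print memory estimates only
//   build_binary input.arpa output.mmap                   # default probing data structure
//   build_binary -p 1.5 probing input.arpa output.mmap    # probing with a custom multiplier
//   build_binary -t /tmp/trie_tmp -m 1024 trie input.arpa output.mmap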
// I could really use boost::lexical_cast right about now.
float ParseFloat(const char *from) {
  char *end;
  float ret = strtod(from, &end);
  if (*end) throw util::ParseNumberException(from);
  return ret;
}

unsigned long int ParseUInt(const char *from) {
  char *end;
  unsigned long int ret = strtoul(from, &end, 10);
  if (*end) throw util::ParseNumberException(from);
  return ret;
}
void ShowSizes(const char *file, const lm::ngram::Config &config) {
  std::vector<uint64_t> counts;
  util::FilePiece f(file);
  lm::ReadARPACounts(f, counts);
  std::size_t probing_size = ProbingModel::Size(counts, config);
  // probing is always largest so use it to determine number of columns.
  long int length = std::max<long int>(5, lrint(ceil(log10(probing_size))));
  std::cout << "Memory estimate:\ntype    ";
  // right align bytes.
  for (long int i = 0; i < length - 5; ++i) std::cout << ' ';
  std::cout << "bytes\n"
    "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
    "trie    " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
  /*"sorted  " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
}
} // namespace
} // namespace ngram
} // namespace lm

int main(int argc, char *argv[]) {
  using namespace lm::ngram;

  lm::ngram::Config config;
  int opt;
  while ((opt = getopt(argc, argv, "u:p:t:m:")) != -1) {
    switch (opt) {
      case 'u':
        config.unknown_missing_prob = ParseFloat(optarg);
        break;
      case 'p':
        config.probing_multiplier = ParseFloat(optarg);
        break;
      case 't':
        config.temporary_directory_prefix = optarg;
        break;
      case 'm':
        // Convert megabytes to bytes.
        config.building_memory = ParseUInt(optarg) * 1048576;
        break;
      default:
        Usage(argv[0]);
    }
  }

  if (optind + 1 == argc) {
    // Only an ARPA file was given: print memory estimates for each data structure.
    ShowSizes(argv[optind], config);
  } else if (optind + 2 == argc) {
    // input.arpa output.mmap: build with the default probing data structure.
    config.write_mmap = argv[optind + 1];
    ProbingModel(argv[optind], config);
  } else if (optind + 3 == argc) {
    // type input.arpa output.mmap: build the requested data structure.
    const char *model_type = argv[optind];
    const char *from_file = argv[optind + 1];
    config.write_mmap = argv[optind + 2];
    if (!strcmp(model_type, "probing")) {
      ProbingModel(from_file, config);
    } else if (!strcmp(model_type, "sorted")) {
      SortedModel(from_file, config);
    } else if (!strcmp(model_type, "trie")) {
      TrieModel(from_file, config);
    } else {
      Usage(argv[0]);
    }
  } else {
    Usage(argv[0]);
  }
  return 0;
}
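
// Sketch of querying a binary file produced by this tool (kept in a comment so
// this translation unit still has a single main). It assumes the default
// probing format, which matches the lm::ngram::Model typedef, and a
// hypothetical file name "output.mmap".
//
//   #include "lm/model.hh"
//   #include <iostream>
//   int main() {
//     lm::ngram::Model model("output.mmap");
//     lm::ngram::State state(model.BeginSentenceState()), out_state;
//     const char *words[] = {"this", "is", "a", "test"};
//     float total = 0.0;
//     for (unsigned i = 0; i < sizeof(words) / sizeof(*words); ++i) {
//       // Score returns the log10 probability and advances the n-gram state.
//       total += model.Score(state, model.GetVocabulary().Index(words[i]), out_state);
//       state = out_state;
//     }
//     std::cout << "log10 probability: " << total << std::endl;
//     return 0;
//   }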