diff options
| author | Kenneth Heafield <github@kheafield.com> | 2013-01-22 21:37:49 +0000 | 
|---|---|---|
| committer | Kenneth Heafield <github@kheafield.com> | 2013-01-22 21:37:49 +0000 | 
| commit | b35a7f3a96ff8ae42e15922dd6949bf9f5d15501 (patch) | |
| tree | 26edbe78931ffc50864a899c087d851005fe560b /klm/lm/build_binary_main.cc | |
| parent | 51a412aa7f5f50035cf28a274a70508c839f3d40 (diff) | |
KenLM 58da338b
Diffstat (limited to 'klm/lm/build_binary_main.cc')
| -rw-r--r-- | klm/lm/build_binary_main.cc | 228 | 
1 files changed, 228 insertions, 0 deletions
diff --git a/klm/lm/build_binary_main.cc b/klm/lm/build_binary_main.cc new file mode 100644 index 00000000..ab2c0c32 --- /dev/null +++ b/klm/lm/build_binary_main.cc @@ -0,0 +1,228 @@ +#include "lm/model.hh" +#include "lm/sizes.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include <algorithm> +#include <cstdlib> +#include <exception> +#include <iostream> +#include <iomanip> +#include <limits> + +#include <math.h> +#include <stdlib.h> + +#ifdef WIN32 +#include "util/getopt.hh" +#else +#include <unistd.h> +#endif + +namespace lm { +namespace ngram { +namespace { + +void Usage(const char *name, const char *default_mem) { +  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" +"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n" +"   Default is -100.  The ARPA file will always take precedence.\n" +"-s allows models to be built even if they do not have <s> and </s>.\n" +"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-w mmap|after determines how writing is done.\n" +"   mmap maps the binary file and writes to it.  Default for trie.\n" +"   after allocates anonymous memory, builds, and writes.  Default for probing.\n" +"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" +"   model files.  order1.arpa must be an ARPA file.  All others may be ARPA or\n" +"   the same data structure as being built.  All files must have the same\n" +"   vocabulary.  For probing, the unigrams must be in the same order.\n\n" +"type is either probing or trie.  Default is probing.\n\n" +"probing uses a probing hash table.  It is the fastest but uses the most memory.\n" +"-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n" +"trie is a straightforward trie with bit-level packing.  It uses the least\n" +"memory and is still faster than SRI or IRST.  Building the trie format uses an\n" +"on-disk sort to save memory.\n" +"-T is the temporary directory prefix.  Default is the output file name.\n" +"-S determines memory use for sorting.  Default is " << default_mem << ".  This is compatible\n" +"   with GNU sort.  The number is followed by a unit: \% for percent of physical\n" +"   memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y.  \n" +"   Default unit is K for Kilobytes.\n" +"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" +"-b sets backoff quantization bits.  Requires -q and defaults to that value.\n" +"-a compresses pointers using an array of offsets.  The parameter is the\n" +"   maximum number of bits encoded by the array.  Memory is minimized subject\n" +"   to the maximum, so pick 255 to minimize memory.\n\n" +"Get a memory estimate by passing an ARPA file without an output file name.\n"; +  exit(1); +} + +// I could really use boost::lexical_cast right about now.   +float ParseFloat(const char *from) { +  char *end; +  float ret = strtod(from, &end); +  if (*end) throw util::ParseNumberException(from); +  return ret; +} +unsigned long int ParseUInt(const char *from) { +  char *end; +  unsigned long int ret = strtoul(from, &end, 10); +  if (*end) throw util::ParseNumberException(from); +  return ret; +} + +uint8_t ParseBitCount(const char *from) { +  unsigned long val = ParseUInt(from); +  if (val > 25) { +    util::ParseNumberException e(from); +    e << " bit counts are limited to 25."; +  } +  return val; +} + +void ParseFileList(const char *from, std::vector<std::string> &to) { +  to.clear(); +  while (true) { +    const char *i; +    for (i = from; *i && *i != ' '; ++i) {} +    to.push_back(std::string(from, i - from)); +    if (!*i) break; +    from = i + 1; +  } +} + +void ProbingQuantizationUnsupported() { +  std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; +  exit(1); +} + +} // namespace ngram +} // namespace lm +} // namespace + +int main(int argc, char *argv[]) { +  using namespace lm::ngram; + +  const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; + +  try { +    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; +    lm::ngram::Config config; +    config.building_memory = util::ParseSize(default_mem); +    int opt; +    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { +      switch(opt) { +        case 'q': +          config.prob_bits = ParseBitCount(optarg); +          if (!set_backoff_bits) config.backoff_bits = config.prob_bits; +          quantize = true; +          break; +        case 'b': +          config.backoff_bits = ParseBitCount(optarg); +          set_backoff_bits = true; +          break; +        case 'a': +          config.pointer_bhiksha_bits = ParseBitCount(optarg); +          bhiksha = true; +          break; +        case 'u': +          config.unknown_missing_logprob = ParseFloat(optarg); +          break; +        case 'p': +          config.probing_multiplier = ParseFloat(optarg); +          break; +        case 't': // legacy +        case 'T': +          config.temporary_directory_prefix = optarg; +          break; +        case 'm': // legacy +          config.building_memory = ParseUInt(optarg) * 1048576; +          break; +        case 'S': +          config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg)); +          break; +        case 'w': +          set_write_method = true; +          if (!strcmp(optarg, "mmap")) { +            config.write_method = Config::WRITE_MMAP; +          } else if (!strcmp(optarg, "after")) { +            config.write_method = Config::WRITE_AFTER; +          } else { +            Usage(argv[0], default_mem); +          } +          break; +        case 's': +          config.sentence_marker_missing = lm::SILENT; +          break; +        case 'i': +          config.positive_log_probability = lm::SILENT; +          break; +        case 'r': +          rest = true; +          ParseFileList(optarg, config.rest_lower_files); +          config.rest_function = Config::REST_LOWER; +          break; +        default: +          Usage(argv[0], default_mem); +      } +    } +    if (!quantize && set_backoff_bits) { +      std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; +      abort(); +    } +    if (optind + 1 == argc) { +      ShowSizes(argv[optind], config); +      return 0; +    } +    const char *model_type; +    const char *from_file; + +    if (optind + 2 == argc) { +      model_type = "probing"; +      from_file = argv[optind]; +      config.write_mmap = argv[optind + 1]; +    } else if (optind + 3 == argc) { +      model_type = argv[optind]; +      from_file = argv[optind + 1]; +      config.write_mmap = argv[optind + 2]; +    } else { +      Usage(argv[0], default_mem); +    } +    if (!strcmp(model_type, "probing")) { +      if (!set_write_method) config.write_method = Config::WRITE_AFTER; +      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); +      if (rest) { +        RestProbingModel(from_file, config); +      } else { +        ProbingModel(from_file, config); +      } +    } else if (!strcmp(model_type, "trie")) { +      if (rest) { +        std::cerr << "Rest + trie is not supported yet." << std::endl; +        return 1; +      } +      if (!set_write_method) config.write_method = Config::WRITE_MMAP; +      if (quantize) { +        if (bhiksha) { +          QuantArrayTrieModel(from_file, config); +        } else { +          QuantTrieModel(from_file, config); +        } +      } else { +        if (bhiksha) { +          ArrayTrieModel(from_file, config); +        } else { +          TrieModel(from_file, config); +        } +      } +    } else { +      Usage(argv[0], default_mem); +    } +  } +  catch (const std::exception &e) { +    std::cerr << e.what() << std::endl; +    std::cerr << "ERROR" << std::endl; +    return 1; +  } +  std::cerr << "SUCCESS" << std::endl; +  return 0; +}  | 
