diff options
| author | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-13 16:18:34 -0500 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-13 16:18:34 -0500 | 
| commit | 66e5956906e61b047d2fd451f3053916cbc92433 (patch) | |
| tree | 5bd4222506deae0c8e5f4c001bb6f7505b73f846 /klm/lm/build_binary.cc | |
| parent | 6d2a75d7deb35fcb2fac674190c19e0a0143aaed (diff) | |
new version of kenlm
Diffstat (limited to 'klm/lm/build_binary.cc')
| -rw-r--r-- | klm/lm/build_binary.cc | 112 | 
1 files changed, 106 insertions, 6 deletions
| diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index 4db631a2..ec034640 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -1,13 +1,113 @@  #include "lm/model.hh" +#include "util/file_piece.hh"  #include <iostream> +#include <iomanip> + +#include <math.h> +#include <stdlib.h> +#include <unistd.h> + +namespace lm { +namespace ngram { +namespace { + +void Usage(const char *name) { +  std::cerr << "Usage: " << name << " [-u unknown_probability] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n" +"Where type is one of probing, trie, or sorted:\n\n" +"probing uses a probing hash table.  It is the fastest but uses the most memory.\n" +"-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n" +"trie is a straightforward trie with bit-level packing.  It uses the least\n" +"memory and is still faster than SRI or IRST.  Building the trie format uses an\n" +"on-disk sort to save memory.\n" +"-t is the temporary directory prefix.  Default is the output file name.\n" +"-m is the amount of memory to use, in MB.  Default is 1024MB (1GB).\n\n" +"sorted is like probing but uses a sorted uniform map instead of a hash table.\n" +"It uses more memory than trie and is also slower, so there's no real reason to\n" +"use it.\n\n" +"See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n" +"Passing only an input file will print memory usage of each data structure.\n" +"If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n"; +  exit(1); +} + +// I could really use boost::lexical_cast right about now.   +float ParseFloat(const char *from) { +  char *end; +  float ret = strtod(from, &end); +  if (*end) throw util::ParseNumberException(from); +  return ret; +} +unsigned long int ParseUInt(const char *from) { +  char *end; +  unsigned long int ret = strtoul(from, &end, 10); +  if (*end) throw util::ParseNumberException(from); +  return ret; +} + +void ShowSizes(const char *file, const lm::ngram::Config &config) { +  std::vector<uint64_t> counts; +  util::FilePiece f(file); +  lm::ReadARPACounts(f, counts); +  std::size_t probing_size = ProbingModel::Size(counts, config); +  // probing is always largest so use it to determine number of columns.   +  long int length = std::max<long int>(5, lrint(ceil(log10(probing_size)))); +  std::cout << "Memory usage:\ntype    "; +  // right align bytes.   +  for (long int i = 0; i < length - 5; ++i) std::cout << ' '; +  std::cout << "bytes\n" +    "probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n" +    "trie    " << std::setw(length) << TrieModel::Size(counts, config) << "\n" +    "sorted  " << std::setw(length) << SortedModel::Size(counts, config) << "\n"; +} + +} // namespace ngram +} // namespace lm +} // namespace  int main(int argc, char *argv[]) { -  if (argc != 3) { -    std::cerr << "Usage: " << argv[0] << " input.arpa output.mmap" << std::endl; -    return 1; -  } +  using namespace lm::ngram; +    lm::ngram::Config config; -  config.write_mmap = argv[2]; -  lm::ngram::Model(argv[1], config); +  int opt; +  while ((opt = getopt(argc, argv, "u:p:t:m:")) != -1) { +    switch(opt) { +      case 'u': +        config.unknown_missing_prob = ParseFloat(optarg); +        break; +      case 'p': +        config.probing_multiplier = ParseFloat(optarg); +        break; +      case 't': +        config.temporary_directory_prefix = optarg; +        break; +      case 'm': +        config.building_memory = ParseUInt(optarg) * 1048576; +        break; +      default: +        Usage(argv[0]); +    } +  } +  if (optind + 1 == argc) { +    ShowSizes(argv[optind], config); +  } else if (optind + 2 == argc) { +    config.write_mmap = argv[optind + 1]; +    ProbingModel(argv[optind], config); +  } else if (optind + 3 == argc) { +    const char *model_type = argv[optind]; +    const char *from_file = argv[optind + 1]; +    config.write_mmap = argv[optind + 2]; +    if (!strcmp(model_type, "probing")) { +      ProbingModel(from_file, config); +    } else if (!strcmp(model_type, "sorted")) { +      SortedModel(from_file, config); +    } else if (!strcmp(model_type, "trie")) { +      TrieModel(from_file, config); +    } else { +      Usage(argv[0]); +    } +  } else { +    Usage(argv[0]); +  } +  return 0;  } | 
