diff options
-rw-r--r-- | klm/lm/config.hh | 83 | ||||
-rw-r--r-- | klm/lm/read_arpa.hh | 64 |
2 files changed, 147 insertions, 0 deletions
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
new file mode 100644
index 00000000..88240b5f
--- /dev/null
+++ b/klm/lm/config.hh
@@ -0,0 +1,83 @@
+#ifndef LM_CONFIG__
+#define LM_CONFIG__
+
+#include <iosfwd>
+
+#include "util/mmap.hh"
+
+/* Configuration for ngram model. Separate header to reduce pollution. */
+
+namespace lm { namespace ngram {
+
+class EnumerateVocab;
+
+struct Config {
+  // EFFECTIVE FOR BOTH ARPA AND BINARY READS
+
+  // Where to log messages including the progress bar. Set to NULL for
+  // silence.
+  std::ostream *messages;
+
+  // This will be called with every string in the vocabulary. See
+  // enumerate_vocab.hh for more detail. Config does not take ownership; you
+  // are still responsible for deleting it (or stack allocating).
+  EnumerateVocab *enumerate_vocab;
+
+
+
+  // ONLY EFFECTIVE WHEN READING ARPA
+
+  // What to do when <unk> isn't in the provided model.
+  typedef enum {THROW_UP, COMPLAIN, SILENT} UnknownMissing;
+  UnknownMissing unknown_missing;
+
+  // The probability to substitute for <unk> if it's missing from the model.
+  // No effect if the model has <unk> or unknown_missing == THROW_UP.
+  float unknown_missing_prob;
+
+  // Size multiplier for probing hash table. Must be > 1. Space is linear in
+  // this. Time is probing_multiplier / (probing_multiplier - 1). No effect
+  // for sorted variant.
+  // If you find yourself setting this to a low number, consider using the
+  // Sorted version instead which has lower memory consumption.
+  float probing_multiplier;
+
+  // Amount of memory to use for building. The actual memory usage will be
+  // higher since this just sets sort buffer size. Only applies to trie
+  // models.
+  std::size_t building_memory;
+
+  // Template for temporary directory appropriate for passing to mkdtemp.
+  // The characters XXXXXX are appended before passing to mkdtemp. Only
+  // applies to trie. If NULL, defaults to write_mmap. If that's NULL,
+  // defaults to input file name.
+  const char *temporary_directory_prefix;
+
+  // Level of complaining to do when loading an ARPA instead of a binary format.
+  typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;
+  ARPALoadComplain arpa_complain;
+
+  // While loading an ARPA file, also write out this binary format file. Set
+  // to NULL to disable.
+  const char *write_mmap;
+
+  // Include the vocab in the binary file? Only effective if write_mmap != NULL.
+  bool include_vocab;
+
+
+
+  // ONLY EFFECTIVE WHEN READING BINARY
+
+  // How to get the giant array into memory: lazy mmap, populate, read etc.
+  // See util/mmap.hh for details of LoadMethod.
+  util::LoadMethod load_method;
+
+
+
+  // Set defaults.
+  Config();
+};
+
+} /* namespace ngram */ } /* namespace lm */
+
+#endif // LM_CONFIG__
diff --git a/klm/lm/read_arpa.hh b/klm/lm/read_arpa.hh
new file mode 100644
index 00000000..cabdb195
--- /dev/null
+++ b/klm/lm/read_arpa.hh
@@ -0,0 +1,64 @@
+#ifndef LM_READ_ARPA__
+#define LM_READ_ARPA__
+
+#include "lm/lm_exception.hh"
+#include "lm/word_index.hh"
+#include "lm/weights.hh"
+#include "util/file_piece.hh"
+
+#include <cstddef>
+#include <iosfwd>
+#include <vector>
+
+namespace lm {
+
+void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
+void ReadARPACounts(std::istream &in, std::vector<uint64_t> &number);
+void ReadNGramHeader(util::FilePiece &in, unsigned int length);
+void ReadNGramHeader(std::istream &in, unsigned int length);
+
+void ReadBackoff(util::FilePiece &in, Prob &weights);
+void ReadBackoff(util::FilePiece &in, ProbBackoff &weights);
+
+void ReadEnd(util::FilePiece &in);
+void ReadEnd(std::istream &in);
+
+template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams) {
+  try {
+    float prob = f.ReadFloat();
+    if (prob > 0) UTIL_THROW(FormatLoadException, "Positive probability " << prob);
+    if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
+    ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited())];
+    value.prob = prob;
+    ReadBackoff(f, value);
+  } catch(util::Exception &e) {
+    e << " in the 1-gram at byte " << f.Offset();
+    throw;
+  }
+}
+
+template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, ProbBackoff *unigrams) {
+  ReadNGramHeader(f, 1);
+  for (std::size_t i = 0; i < count; ++i) {
+    Read1Gram(f, vocab, unigrams);
+  }
+  vocab.FinishedLoading(unigrams);
+}
+
+template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights) {
+  try {
+    weights.prob = f.ReadFloat();
+    if (weights.prob > 0) UTIL_THROW(FormatLoadException, "Positive probability " << weights.prob);
+    for (unsigned char i = n; i > 0; --i) {  // Fill back-to-front; never forms a pointer before the array.
+      reverse_indices[i - 1] = vocab.Index(f.ReadDelimited());
+    }
+    ReadBackoff(f, weights);
+  } catch(util::Exception &e) {
+    e << " in the " << static_cast<unsigned int>(n) << "-gram at byte " << f.Offset();  // n is an unsigned char; cast so it prints as a number, not a raw character.
+    throw;
+  }
+}
+
+} // namespace lm
+
+#endif // LM_READ_ARPA__ |