Diffstat (limited to 'klm/lm')
| -rw-r--r-- | klm/lm/bhiksha.cc      |  1 |
| -rw-r--r-- | klm/lm/bhiksha.hh      |  2 |
| -rw-r--r-- | klm/lm/build_binary.cc |  8 |
| -rw-r--r-- | klm/lm/left.hh         |  2 |
| -rw-r--r-- | klm/lm/max_order.cc    |  6 |
| -rw-r--r-- | klm/lm/max_order.hh    | 26 |
| -rw-r--r-- | klm/lm/model.cc        | 11 |
| -rw-r--r-- | klm/lm/model.hh        |  1 |
| -rw-r--r-- | klm/lm/quantize.hh     |  4 |
| -rw-r--r-- | klm/lm/read_arpa.cc    | 16 |
| -rw-r--r-- | klm/lm/search_trie.cc  | 20 |
| -rw-r--r-- | klm/lm/state.hh        | 10 |
| -rw-r--r-- | klm/lm/trie.hh         |  2 |
| -rw-r--r-- | klm/lm/trie_sort.cc    | 24 |
| -rw-r--r-- | klm/lm/trie_sort.hh    |  4 |
| -rw-r--r-- | klm/lm/value.hh        |  2 |
| -rw-r--r-- | klm/lm/value_build.hh  |  6 |
| -rw-r--r-- | klm/lm/vocab.hh        |  4 |
18 files changed, 93 insertions, 56 deletions
diff --git a/klm/lm/bhiksha.cc b/klm/lm/bhiksha.cc
index cdeafb47..870a4eee 100644
--- a/klm/lm/bhiksha.cc
+++ b/klm/lm/bhiksha.cc
@@ -1,6 +1,7 @@
 #include "lm/bhiksha.hh"
 #include "lm/config.hh"
 #include "util/file.hh"
+#include "util/exception.hh"
 
 #include <limits>
diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh
index 5182ee2e..9734f3ab 100644
--- a/klm/lm/bhiksha.hh
+++ b/klm/lm/bhiksha.hh
@@ -23,7 +23,7 @@
 namespace lm {
 namespace ngram {
 
-class Config;
+struct Config;
 
 namespace trie {
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index c4a01cb4..49901c9e 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -25,7 +25,11 @@ void Usage(const char *name) {
 "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
 "-w mmap|after determines how writing is done.\n"
 "   mmap maps the binary file and writes to it.  Default for trie.\n"
-"   after allocates anonymous memory, builds, and writes.  Default for probing.\n\n"
+"   after allocates anonymous memory, builds, and writes.  Default for probing.\n"
+"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
+"   model files.  order1.arpa must be an ARPA file.  All others may be ARPA or\n"
+"   the same data structure as being built.  All files must have the same\n"
+"   vocabulary.  For probing, the unigrams must be in the same order.\n\n"
 "type is either probing or trie.  Default is probing.\n\n"
 "probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
 "-p sets the space multiplier and must be >1.0.  The default is 1.5.\n\n"
@@ -111,7 +115,7 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
   for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
   std::cout << prefix << "B\n"
     "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
-    "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r -p " << config.probing_multiplier << "\n"
+    "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n"
     "trie    " << std::setw(length) << (sizes[2] / divide) << " without quantization\n"
     "trie    " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
     "trie    " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
diff --git a/klm/lm/left.hh b/klm/lm/left.hh
index c00af88a..8c27232e 100644
--- a/klm/lm/left.hh
+++ b/klm/lm/left.hh
@@ -111,7 +111,7 @@ template <class M> class RuleScore {
         return;
       }
 
-      float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
+      float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1];
       float *back = backoffs, *back2 = backoffs2;
       unsigned char next_use = out_.right.length;
diff --git a/klm/lm/max_order.cc b/klm/lm/max_order.cc
new file mode 100644
index 00000000..94221201
--- /dev/null
+++ b/klm/lm/max_order.cc
@@ -0,0 +1,6 @@
+#include "lm/max_order.hh"
+#include <iostream>
+
+int main(int argc, char *argv[]) {
+  std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl;
+}
diff --git a/klm/lm/max_order.hh b/klm/lm/max_order.hh
index aff9de27..bc8687cd 100644
--- a/klm/lm/max_order.hh
+++ b/klm/lm/max_order.hh
@@ -1,14 +1,12 @@
-#ifndef LM_MAX_ORDER__
-#define LM_MAX_ORDER__
-namespace lm {
-namespace ngram {
-// If you need higher order, change this and recompile.
-// Having this limit means that State can be
-// (kMaxOrder - 1) * sizeof(float) bytes instead of
-// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
-const unsigned char kMaxOrder = 5;
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_MAX_ORDER__
+/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
+ * If not, this is the default maximum order.
+ * Having this limit means that State can be
+ * (kMaxOrder - 1) * sizeof(float) bytes instead of
+ * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
+ */
+#ifndef KENLM_MAX_ORDER
+#define KENLM_MAX_ORDER 6
+#endif
+#ifndef KENLM_ORDER_MESSAGE
+#define KENLM_ORDER_MESSAGE "Edit klm/lm/max_order.hh."
+#endif
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index a2d31ce0..b46333a4 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -5,6 +5,7 @@
 #include "lm/search_hashed.hh"
 #include "lm/search_trie.hh"
 #include "lm/read_arpa.hh"
+#include "util/have.hh"
 #include "util/murmur_hash.hh"
 
 #include <algorithm>
@@ -47,7 +48,14 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
   P::Init(begin_sentence, null_context, vocab_, search_.Order());
 }
 
+namespace {
+void CheckMaxOrder(size_t order) {
+  UTIL_THROW_IF(order > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << order << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ".  " << KENLM_ORDER_MESSAGE);
+}
+} // namespace
+
 template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
+  CheckMaxOrder(params.counts.size());
   SetupMemory(start, params.counts, config);
   vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
   search_.LoadedBinary();
@@ -60,8 +68,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
     std::vector<uint64_t> counts;
     // File counts do not include pruned trigrams that extend to quadgrams etc.  These will be fixed by search_.
     ReadARPACounts(f, counts);
-
-    if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ".  Edit lm/max_order.hh, set kMaxOrder to at least this value, and recompile.");
+    CheckMaxOrder(counts.size());
 
     if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model.");
     if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
diff --git a/klm/lm/model.hh b/klm/lm/model.hh
index be872178..6dee9419 100644
--- a/klm/lm/model.hh
+++ b/klm/lm/model.hh
@@ -5,7 +5,6 @@
 #include "lm/binary_format.hh"
 #include "lm/config.hh"
 #include "lm/facade.hh"
-#include "lm/max_order.hh"
 #include "lm/quantize.hh"
 #include "lm/search_hashed.hh"
 #include "lm/search_trie.hh"
diff --git a/klm/lm/quantize.hh b/klm/lm/quantize.hh
index 3e9153e3..abed0112 100644
--- a/klm/lm/quantize.hh
+++ b/klm/lm/quantize.hh
@@ -17,7 +17,7 @@
 namespace lm {
 namespace ngram {
 
-class Config;
+struct Config;
 
 /* Store values directly and don't quantize. */
 class DontQuantize {
@@ -217,7 +217,7 @@ class SeparatelyQuantize {
     const Bins &LongestTable() const { return longest_; }
 
   private:
-    Bins tables_[kMaxOrder - 1][2];
+    Bins tables_[KENLM_MAX_ORDER - 1][2];
 
     Bins longest_;
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index 2d9a337d..70727e4c 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -7,9 +7,14 @@
 #include <vector>
 
 #include <ctype.h>
+#include <math.h>
 #include <string.h>
 #include <stdint.h>
 
+#ifdef WIN32
+#include <float.h>
+#endif
+
 namespace lm {
 
 // 1 for '\t', '\n', and ' '.  This is stricter than isspace.
@@ -93,7 +98,16 @@ void ReadBackoff(util::FilePiece &in, float &backoff) {
     case '\t':
       backoff = in.ReadFloat();
       if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff;
-      if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");
+      {
+#ifdef WIN32
+        int float_class = _fpclass(backoff);
+        UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff);
+#else
+        int float_class = fpclassify(backoff);
+        UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);
+#endif
+      }
+      UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff");
       break;
     case '\n':
       backoff = ngram::kNoExtensionBackoff;
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 18e80d5a..832cc9f7 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -180,7 +180,7 @@ const float kBadProb = std::numeric_limits<float>::infinity();
 
 class SRISucks {
   public:
    SRISucks() {
-      for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i)
+      for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i)
        i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
     }
@@ -196,7 +196,7 @@ class SRISucks {
     }
 
     void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
-      for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
+      for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) {
         it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
       }
       messages_[0].Apply(it_, unigram_file);
@@ -221,10 +221,10 @@
 
   private:
     // This used to be one array.  Then I needed to separate it by order for quantization to work.
-    std::vector<float> values_[kMaxOrder - 1];
-    BackoffMessages messages_[kMaxOrder - 1];
+    std::vector<float> values_[KENLM_MAX_ORDER - 1];
+    BackoffMessages messages_[KENLM_MAX_ORDER - 1];
 
-    float *it_[kMaxOrder - 1];
+    float *it_[KENLM_MAX_ORDER - 1];
 };
 
 class FindBlanks {
@@ -337,7 +337,7 @@ struct Gram {
 template <class Doing> class BlankManager {
   public:
     BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) {
-      for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb;
+      for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb;
     }
 
     void Visit(const WordIndex *to, unsigned char length, float prob) {
@@ -373,10 +373,10 @@ template <class Doing> class BlankManager {
   private:
     const unsigned char total_order_;
 
-    WordIndex been_[kMaxOrder];
+    WordIndex been_[KENLM_MAX_ORDER];
     unsigned char been_length_;
 
-    float basis_[kMaxOrder];
+    float basis_[KENLM_MAX_ORDER];
 
     Doing &doing_;
 };
@@ -470,8 +470,8 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
 } // namespace
 
 template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
-  RecordReader inputs[kMaxOrder - 1];
-  RecordReader contexts[kMaxOrder - 1];
+  RecordReader inputs[KENLM_MAX_ORDER - 1];
+  RecordReader contexts[KENLM_MAX_ORDER - 1];
 
   for (unsigned char i = 2; i <= counts.size(); ++i) {
     inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
diff --git a/klm/lm/state.hh b/klm/lm/state.hh
index c7438414..830e40aa 100644
--- a/klm/lm/state.hh
+++ b/klm/lm/state.hh
@@ -32,7 +32,7 @@ class State {
 
     // Call this before using raw memcmp.
     void ZeroRemaining() {
-      for (unsigned char i = length; i < kMaxOrder - 1; ++i) {
+      for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
         words[i] = 0;
         backoff[i] = 0.0;
       }
@@ -42,8 +42,8 @@ class State {
     // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
     // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
-    WordIndex words[kMaxOrder - 1];
-    float backoff[kMaxOrder - 1];
+    WordIndex words[KENLM_MAX_ORDER - 1];
+    float backoff[KENLM_MAX_ORDER - 1];
     unsigned char length;
 };
@@ -72,11 +72,11 @@ struct Left {
   }
 
   void ZeroRemaining() {
-    for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i)
+    for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i)
       *i = 0;
   }
 
-  uint64_t pointers[kMaxOrder - 1];
+  uint64_t pointers[KENLM_MAX_ORDER - 1];
   unsigned char length;
   bool full;
 };
diff --git a/klm/lm/trie.hh b/klm/lm/trie.hh
index eff93292..034a1414 100644
--- a/klm/lm/trie.hh
+++ b/klm/lm/trie.hh
@@ -11,7 +11,7 @@
 namespace lm {
 namespace ngram {
-class Config;
+struct Config;
 
 namespace trie {
 
 struct NodeRange {
diff --git a/klm/lm/trie_sort.cc b/klm/lm/trie_sort.cc
index b80fed02..0d83221e 100644
--- a/klm/lm/trie_sort.cc
+++ b/klm/lm/trie_sort.cc
@@ -148,13 +148,17 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f
 } // namespace
 
 void RecordReader::Init(FILE *file, std::size_t entry_size) {
-  rewind(file);
-  file_ = file;
+  entry_size_ = entry_size;
   data_.reset(malloc(entry_size));
   UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer");
-  remains_ = true;
-  entry_size_ = entry_size;
-  ++*this;
+  file_ = file;
+  if (file) {
+    rewind(file);
+    remains_ = true;
+    ++*this;
+  } else {
+    remains_ = false;
+  }
 }
 
 void RecordReader::Overwrite(const void *start, std::size_t amount) {
@@ -169,9 +173,13 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) {
 }
 
 void RecordReader::Rewind() {
-  rewind(file_);
-  remains_ = true;
-  ++*this;
+  if (file_) {
+    rewind(file_);
+    remains_ = true;
+    ++*this;
+  } else {
+    remains_ = false;
+  }
 }
 
 SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
diff --git a/klm/lm/trie_sort.hh b/klm/lm/trie_sort.hh
index 3036319d..1e6fce51 100644
--- a/klm/lm/trie_sort.hh
+++ b/klm/lm/trie_sort.hh
@@ -25,7 +25,7 @@ namespace lm {
 class PositiveProbWarn;
 namespace ngram {
 class SortedVocabulary;
-class Config;
+struct Config;
 
 namespace trie {
@@ -107,7 +107,7 @@ class SortedFiles {
 
     util::scoped_fd unigram_;
 
-    util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1];
+    util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
 };
 
 } // namespace trie
diff --git a/klm/lm/value.hh b/klm/lm/value.hh
index 85e53f14..ba716713 100644
--- a/klm/lm/value.hh
+++ b/klm/lm/value.hh
@@ -6,7 +6,7 @@
 #include "lm/weights.hh"
 #include "util/bit_packing.hh"
 
-#include <inttypes.h>
+#include <stdint.h>
 
 namespace lm {
 namespace ngram {
diff --git a/klm/lm/value_build.hh b/klm/lm/value_build.hh
index 687a41a0..461e6a5c 100644
--- a/klm/lm/value_build.hh
+++ b/klm/lm/value_build.hh
@@ -10,9 +10,9 @@
 namespace lm {
 namespace ngram {
 
-class Config;
-class BackoffValue;
-class RestValue;
+struct Config;
+struct BackoffValue;
+struct RestValue;
 
 class NoRestBuild {
   public:
diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh
index c3efcb4a..a25432f9 100644
--- a/klm/lm/vocab.hh
+++ b/klm/lm/vocab.hh
@@ -13,11 +13,11 @@
 #include <vector>
 
 namespace lm {
-class ProbBackoff;
+struct ProbBackoff;
 class EnumerateVocab;
 
 namespace ngram {
-class Config;
+struct Config;
 
 namespace detail {
 uint64_t HashForVocab(const char *str, std::size_t len);
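
Note on the kMaxOrder -> KENLM_MAX_ORDER migration above: the maximum n-gram order is now a preprocessor macro defaulting to 6, overridable with -DKENLM_MAX_ORDER from the build system instead of by editing a header and recompiling.  A minimal self-contained C++ sketch of the pattern follows; the file name and DemoState are illustrative only, not part of this commit.

    // standalone_order_check.cc -- hypothetical sketch mimicking klm/lm/max_order.hh.
    // Build with the default:      g++ standalone_order_check.cc
    // Or raise the limit:          g++ -DKENLM_MAX_ORDER=7 standalone_order_check.cc
    #include <iostream>

    #ifndef KENLM_MAX_ORDER
    #define KENLM_MAX_ORDER 6  // same default the new header establishes
    #endif

    // State-like struct: because the order is a compile-time constant, the
    // arrays live inline, avoiding a pointer plus malloc overhead per state.
    struct DemoState {
      unsigned int words[KENLM_MAX_ORDER - 1];
      float backoff[KENLM_MAX_ORDER - 1];
      unsigned char length;
    };

    int main() {
      std::cout << "Maximum supported n-gram order: " << KENLM_MAX_ORDER << "\n";
      std::cout << "sizeof(DemoState) = " << sizeof(DemoState) << " bytes\n";
      return 0;
    }

This is also why model.cc's CheckMaxOrder error reports KENLM_ORDER_MESSAGE: a build that sets -DKENLM_MAX_ORDER can point users at the build system rather than at klm/lm/max_order.hh.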
