From f82833385b7cf3c01dc2f92830119dfe3ebc573e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 16 Aug 2012 17:02:56 -0400 Subject: KenLM update. Remove a couple of segfaults for weird input. Other oddball stuff. --- klm/lm/bhiksha.cc | 1 + klm/lm/bhiksha.hh | 2 +- klm/lm/build_binary.cc | 8 ++++++-- klm/lm/left.hh | 2 +- klm/lm/max_order.cc | 6 ++++++ klm/lm/max_order.hh | 26 ++++++++++++-------------- klm/lm/model.cc | 11 +++++++++-- klm/lm/model.hh | 1 - klm/lm/quantize.hh | 4 ++-- klm/lm/read_arpa.cc | 16 +++++++++++++++- klm/lm/search_trie.cc | 20 ++++++++++---------- klm/lm/state.hh | 10 +++++----- klm/lm/trie.hh | 2 +- klm/lm/trie_sort.cc | 24 ++++++++++++++++-------- klm/lm/trie_sort.hh | 4 ++-- klm/lm/value.hh | 2 +- klm/lm/value_build.hh | 6 +++--- klm/lm/vocab.hh | 4 ++-- klm/util/file.cc | 10 ++++++++++ klm/util/file.hh | 8 +++++++- klm/util/file_piece.cc | 2 +- klm/util/have.hh | 2 +- klm/util/mmap.cc | 16 +--------------- klm/util/string_piece.hh | 5 +++++ 24 files changed, 118 insertions(+), 74 deletions(-) create mode 100644 klm/lm/max_order.cc diff --git a/klm/lm/bhiksha.cc b/klm/lm/bhiksha.cc index cdeafb47..870a4eee 100644 --- a/klm/lm/bhiksha.cc +++ b/klm/lm/bhiksha.cc @@ -1,6 +1,7 @@ #include "lm/bhiksha.hh" #include "lm/config.hh" #include "util/file.hh" +#include "util/exception.hh" #include diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh index 5182ee2e..9734f3ab 100644 --- a/klm/lm/bhiksha.hh +++ b/klm/lm/bhiksha.hh @@ -23,7 +23,7 @@ namespace lm { namespace ngram { -class Config; +struct Config; namespace trie { diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index c4a01cb4..49901c9e 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -25,7 +25,11 @@ void Usage(const char *name) { "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" "-w mmap|after determines how writing is done.\n" " mmap maps the binary file and writes to it. Default for trie.\n" -" after allocates anonymous memory, builds, and writes. Default for probing.\n\n" +" after allocates anonymous memory, builds, and writes. Default for probing.\n" +"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" +" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" +" the same data structure as being built. All files must have the same\n" +" vocabulary. For probing, the unigrams must be in the same order.\n\n" "type is either probing or trie. Default is probing.\n\n" "probing uses a probing hash table. It is the fastest but uses the most memory.\n" "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" @@ -111,7 +115,7 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { for (long int i = 0; i < length - 2; ++i) std::cout << ' '; std::cout << prefix << "B\n" "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" - "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r -p " << config.probing_multiplier << "\n" + "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" diff --git a/klm/lm/left.hh b/klm/lm/left.hh index c00af88a..8c27232e 100644 --- a/klm/lm/left.hh +++ b/klm/lm/left.hh @@ -111,7 +111,7 @@ template class RuleScore { return; } - float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1]; + float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; float *back = backoffs, *back2 = backoffs2; unsigned char next_use = out_.right.length; diff --git a/klm/lm/max_order.cc b/klm/lm/max_order.cc new file mode 100644 index 00000000..94221201 --- /dev/null +++ b/klm/lm/max_order.cc @@ -0,0 +1,6 @@ +#include "lm/max_order.hh" +#include + +int main(int argc, char *argv[]) { + std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; +} diff --git a/klm/lm/max_order.hh b/klm/lm/max_order.hh index aff9de27..bc8687cd 100644 --- a/klm/lm/max_order.hh +++ b/klm/lm/max_order.hh @@ -1,14 +1,12 @@ -#ifndef LM_MAX_ORDER__ -#define LM_MAX_ORDER__ -namespace lm { -namespace ngram { -// If you need higher order, change this and recompile. -// Having this limit means that State can be -// (kMaxOrder - 1) * sizeof(float) bytes instead of -// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead -const unsigned char kMaxOrder = 5; - -} // namespace ngram -} // namespace lm - -#endif // LM_MAX_ORDER__ +/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. + * If not, this is the default maximum order. + * Having this limit means that State can be + * (kMaxOrder - 1) * sizeof(float) bytes instead of + * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead + */ +#ifndef KENLM_MAX_ORDER +#define KENLM_MAX_ORDER 6 +#endif +#ifndef KENLM_ORDER_MESSAGE +#define KENLM_ORDER_MESSAGE "Edit klm/lm/max_order.hh." +#endif diff --git a/klm/lm/model.cc b/klm/lm/model.cc index a2d31ce0..b46333a4 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -5,6 +5,7 @@ #include "lm/search_hashed.hh" #include "lm/search_trie.hh" #include "lm/read_arpa.hh" +#include "util/have.hh" #include "util/murmur_hash.hh" #include @@ -47,7 +48,14 @@ template GenericModel::Ge P::Init(begin_sentence, null_context, vocab_, search_.Order()); } +namespace { +void CheckMaxOrder(size_t order) { + UTIL_THROW_IF(order > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << order << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); +} +} // namespace + template void GenericModel::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { + CheckMaxOrder(params.counts.size()); SetupMemory(start, params.counts, config); vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); search_.LoadedBinary(); @@ -60,8 +68,7 @@ template void GenericModel counts; // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. ReadARPACounts(f, counts); - - if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit lm/max_order.hh, set kMaxOrder to at least this value, and recompile."); + CheckMaxOrder(counts.size()); if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); diff --git a/klm/lm/model.hh b/klm/lm/model.hh index be872178..6dee9419 100644 --- a/klm/lm/model.hh +++ b/klm/lm/model.hh @@ -5,7 +5,6 @@ #include "lm/binary_format.hh" #include "lm/config.hh" #include "lm/facade.hh" -#include "lm/max_order.hh" #include "lm/quantize.hh" #include "lm/search_hashed.hh" #include "lm/search_trie.hh" diff --git a/klm/lm/quantize.hh b/klm/lm/quantize.hh index 3e9153e3..abed0112 100644 --- a/klm/lm/quantize.hh +++ b/klm/lm/quantize.hh @@ -17,7 +17,7 @@ namespace lm { namespace ngram { -class Config; +struct Config; /* Store values directly and don't quantize. */ class DontQuantize { @@ -217,7 +217,7 @@ class SeparatelyQuantize { const Bins &LongestTable() const { return longest_; } private: - Bins tables_[kMaxOrder - 1][2]; + Bins tables_[KENLM_MAX_ORDER - 1][2]; Bins longest_; diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index 2d9a337d..70727e4c 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -7,9 +7,14 @@ #include #include +#include #include #include +#ifdef WIN32 +#include +#endif + namespace lm { // 1 for '\t', '\n', and ' '. This is stricter than isspace. @@ -93,7 +98,16 @@ void ReadBackoff(util::FilePiece &in, float &backoff) { case '\t': backoff = in.ReadFloat(); if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff; - if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); + { +#ifdef WIN32 + int float_class = _fpclass(backoff); + UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff); +#else + int float_class = fpclassify(backoff); + UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff); +#endif + } + UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff"); break; case '\n': backoff = ngram::kNoExtensionBackoff; diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 18e80d5a..832cc9f7 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -180,7 +180,7 @@ const float kBadProb = std::numeric_limits::infinity(); class SRISucks { public: SRISucks() { - for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i) + for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i) i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1)); } @@ -196,7 +196,7 @@ class SRISucks { } void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { - for (unsigned char i = 0; i < kMaxOrder - 1; ++i) { + for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) { it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); } messages_[0].Apply(it_, unigram_file); @@ -221,10 +221,10 @@ class SRISucks { private: // This used to be one array. Then I needed to separate it by order for quantization to work. - std::vector values_[kMaxOrder - 1]; - BackoffMessages messages_[kMaxOrder - 1]; + std::vector values_[KENLM_MAX_ORDER - 1]; + BackoffMessages messages_[KENLM_MAX_ORDER - 1]; - float *it_[kMaxOrder - 1]; + float *it_[KENLM_MAX_ORDER - 1]; }; class FindBlanks { @@ -337,7 +337,7 @@ struct Gram { template class BlankManager { public: BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) { - for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb; + for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb; } void Visit(const WordIndex *to, unsigned char length, float prob) { @@ -373,10 +373,10 @@ template class BlankManager { private: const unsigned char total_order_; - WordIndex been_[kMaxOrder]; + WordIndex been_[KENLM_MAX_ORDER]; unsigned char been_length_; - float basis_[kMaxOrder]; + float basis_[KENLM_MAX_ORDER]; Doing &doing_; }; @@ -470,8 +470,8 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c } // namespace template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { - RecordReader inputs[kMaxOrder - 1]; - RecordReader contexts[kMaxOrder - 1]; + RecordReader inputs[KENLM_MAX_ORDER - 1]; + RecordReader contexts[KENLM_MAX_ORDER - 1]; for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff))); diff --git a/klm/lm/state.hh b/klm/lm/state.hh index c7438414..830e40aa 100644 --- a/klm/lm/state.hh +++ b/klm/lm/state.hh @@ -32,7 +32,7 @@ class State { // Call this before using raw memcmp. void ZeroRemaining() { - for (unsigned char i = length; i < kMaxOrder - 1; ++i) { + for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) { words[i] = 0; backoff[i] = 0.0; } @@ -42,8 +42,8 @@ class State { // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD. // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit. - WordIndex words[kMaxOrder - 1]; - float backoff[kMaxOrder - 1]; + WordIndex words[KENLM_MAX_ORDER - 1]; + float backoff[KENLM_MAX_ORDER - 1]; unsigned char length; }; @@ -72,11 +72,11 @@ struct Left { } void ZeroRemaining() { - for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i) + for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i) *i = 0; } - uint64_t pointers[kMaxOrder - 1]; + uint64_t pointers[KENLM_MAX_ORDER - 1]; unsigned char length; bool full; }; diff --git a/klm/lm/trie.hh b/klm/lm/trie.hh index eff93292..034a1414 100644 --- a/klm/lm/trie.hh +++ b/klm/lm/trie.hh @@ -11,7 +11,7 @@ namespace lm { namespace ngram { -class Config; +struct Config; namespace trie { struct NodeRange { diff --git a/klm/lm/trie_sort.cc b/klm/lm/trie_sort.cc index b80fed02..0d83221e 100644 --- a/klm/lm/trie_sort.cc +++ b/klm/lm/trie_sort.cc @@ -148,13 +148,17 @@ template FILE *MergeSortedFiles(FILE *first_file, FILE *second_f } // namespace void RecordReader::Init(FILE *file, std::size_t entry_size) { - rewind(file); - file_ = file; + entry_size_ = entry_size; data_.reset(malloc(entry_size)); UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer"); - remains_ = true; - entry_size_ = entry_size; - ++*this; + file_ = file; + if (file) { + rewind(file); + remains_ = true; + ++*this; + } else { + remains_ = false; + } } void RecordReader::Overwrite(const void *start, std::size_t amount) { @@ -169,9 +173,13 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) { } void RecordReader::Rewind() { - rewind(file_); - remains_ = true; - ++*this; + if (file_) { + rewind(file_); + remains_ = true; + ++*this; + } else { + remains_ = false; + } } SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { diff --git a/klm/lm/trie_sort.hh b/klm/lm/trie_sort.hh index 3036319d..1e6fce51 100644 --- a/klm/lm/trie_sort.hh +++ b/klm/lm/trie_sort.hh @@ -25,7 +25,7 @@ namespace lm { class PositiveProbWarn; namespace ngram { class SortedVocabulary; -class Config; +struct Config; namespace trie { @@ -107,7 +107,7 @@ class SortedFiles { util::scoped_fd unigram_; - util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1]; + util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; }; } // namespace trie diff --git a/klm/lm/value.hh b/klm/lm/value.hh index 85e53f14..ba716713 100644 --- a/klm/lm/value.hh +++ b/klm/lm/value.hh @@ -6,7 +6,7 @@ #include "lm/weights.hh" #include "util/bit_packing.hh" -#include +#include namespace lm { namespace ngram { diff --git a/klm/lm/value_build.hh b/klm/lm/value_build.hh index 687a41a0..461e6a5c 100644 --- a/klm/lm/value_build.hh +++ b/klm/lm/value_build.hh @@ -10,9 +10,9 @@ namespace lm { namespace ngram { -class Config; -class BackoffValue; -class RestValue; +struct Config; +struct BackoffValue; +struct RestValue; class NoRestBuild { public: diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh index c3efcb4a..a25432f9 100644 --- a/klm/lm/vocab.hh +++ b/klm/lm/vocab.hh @@ -13,11 +13,11 @@ #include namespace lm { -class ProbBackoff; +struct ProbBackoff; class EnumerateVocab; namespace ngram { -class Config; +struct Config; namespace detail { uint64_t HashForVocab(const char *str, std::size_t len); diff --git a/klm/util/file.cc b/klm/util/file.cc index 6a3885a7..98f13983 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -44,6 +44,16 @@ int OpenReadOrThrow(const char *name) { return ret; } +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif + return ret; +} + uint64_t SizeFile(int fd) { #if defined(_WIN32) || defined(_WIN64) __int64 ret = _filelengthi64(fd); diff --git a/klm/util/file.hh b/klm/util/file.hh index 5c57e2a9..8af1ff4f 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -65,7 +65,10 @@ class scoped_FILE { std::FILE *file_; }; +// Open for read only. int OpenReadOrThrow(const char *name); +// Create file if it doesn't exist, truncate if it does. Opened for write. +int CreateOrThrow(const char *name); // Return value for SizeFile when it can't size properly. const uint64_t kBadSize = (uint64_t)-1; @@ -91,10 +94,13 @@ class TempMaker { public: explicit TempMaker(const std::string &prefix); + // These will already be unlinked for you. int Make() const; - std::FILE *MakeFile() const; + // This will force you to close the fd instead of leaving it open. + std::string Name(scoped_fd &opened) const; + private: std::string base_; }; diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index a205995a..19a68728 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -27,7 +27,7 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { #ifdef HAVE_ZLIB GZException::GZException(gzFile file) { int num; - *this << gzerror( file, &num) << " from zlib"; + *this << gzerror(file, &num) << " from zlib"; } #endif // HAVE_ZLIB diff --git a/klm/util/have.hh b/klm/util/have.hh index b8181e99..1d76a7fc 100644 --- a/klm/util/have.hh +++ b/klm/util/have.hh @@ -13,7 +13,7 @@ #endif #ifndef HAVE_BOOST -#define HAVE_BOOST +//#define HAVE_BOOST #endif #ifndef HAVE_THREADS diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 576fd4cc..bc9e3f81 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -19,8 +19,8 @@ #include #include #else -#include #include +#include #endif namespace util { @@ -171,20 +171,6 @@ void *MapZeroedWrite(int fd, std::size_t size) { return MapOrThrow(size, true, kFileFlags, false, fd, 0); } -namespace { - -int CreateOrThrow(const char *name) { - int ret; -#if defined(_WIN32) || defined(_WIN64) - UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); -#else - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); -#endif - return ret; -} - -} // namespace - void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { file.reset(CreateOrThrow(name)); try { diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 5de053aa..be6a643d 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -85,6 +85,11 @@ U_NAMESPACE_BEGIN #include #include +#ifdef WIN32 +#undef max +#undef min +#endif + class StringPiece { public: typedef size_t size_type; -- cgit v1.2.3