From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 18 Jan 2013 17:12:51 +0000 Subject: KenLM dffafbf with lmplz source (but not built) --- klm/lm/builder/corpus_count.cc | 223 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 klm/lm/builder/corpus_count.cc (limited to 'klm/lm/builder/corpus_count.cc') diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc new file mode 100644 index 00000000..8c3de57d --- /dev/null +++ b/klm/lm/builder/corpus_count.cc @@ -0,0 +1,223 @@ +#include "lm/builder/corpus_count.hh" + +#include "lm/builder/ngram.hh" +#include "lm/lm_exception.hh" +#include "lm/word_index.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/murmur_hash.hh" +#include "util/probing_hash_table.hh" +#include "util/scoped.hh" +#include "util/stream/chain.hh" +#include "util/stream/timer.hh" +#include "util/tokenize_piece.hh" + +#include +#include + +#include + +#include + +namespace lm { +namespace builder { +namespace { + +class VocabHandout { + public: + explicit VocabHandout(int fd) { + util::scoped_fd duped(util::DupOrThrow(fd)); + word_list_.reset(util::FDOpenOrThrow(duped)); + + Lookup(""); // Force 0 + Lookup(""); // Force 1 + Lookup(""); // Force 2 + } + + WordIndex Lookup(const StringPiece &word) { + uint64_t hashed = util::MurmurHashNative(word.data(), word.size()); + std::pair ret(seen_.insert(std::pair(hashed, seen_.size()))); + if (ret.second) { + char null_delimit = 0; + util::WriteOrThrow(word_list_.get(), word.data(), word.size()); + util::WriteOrThrow(word_list_.get(), &null_delimit, 1); + UTIL_THROW_IF(seen_.size() >= std::numeric_limits::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh."); + } + return ret.first->second; + } + + WordIndex Size() const { + return seen_.size(); + } + + private: + typedef boost::unordered_map Seen; + + Seen seen_; + + util::scoped_FILE word_list_; +}; + +class DedupeHash : public std::unary_function { + public: + explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {} + + std::size_t operator()(const WordIndex *start) const { + return util::MurmurHashNative(start, size_); + } + + private: + const std::size_t size_; +}; + +class DedupeEquals : public std::binary_function { + public: + explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} + + bool operator()(const WordIndex *first, const WordIndex *second) const { + return !memcmp(first, second, size_); + } + + private: + const std::size_t size_; +}; + +struct DedupeEntry { + typedef WordIndex *Key; + Key GetKey() const { return key; } + Key key; + static DedupeEntry Construct(WordIndex *at) { + DedupeEntry ret; + ret.key = at; + return ret; + } +}; + +typedef util::ProbingHashTable Dedupe; + +const float kProbingMultiplier = 1.5; + +class Writer { + public: + Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) + : block_(position), gram_(block_->Get(), order), + dedupe_invalid_(order, std::numeric_limits::max()), + dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), + buffer_(new WordIndex[order - 1]), + block_size_(position.GetChain().BlockSize()) { + dedupe_.Clear(DedupeEntry::Construct(&dedupe_invalid_[0])); + assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); + if (order == 1) { + // Add special words. AdjustCounts is responsible if order != 1. + AddUnigramWord(kUNK); + AddUnigramWord(kBOS); + } + } + + ~Writer() { + block_->SetValidSize(reinterpret_cast(gram_.begin()) - static_cast(block_->Get())); + (++block_).Poison(); + } + + // Write context with a bunch of + void StartSentence() { + for (WordIndex *i = gram_.begin(); i != gram_.end() - 1; ++i) { + *i = kBOS; + } + } + + void Append(WordIndex word) { + *(gram_.end() - 1) = word; + Dedupe::MutableIterator at; + bool found = dedupe_.FindOrInsert(DedupeEntry::Construct(gram_.begin()), at); + if (found) { + // Already present. + NGram already(at->key, gram_.Order()); + ++(already.Count()); + // Shift left by one. + memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); + return; + } + // Complete the write. + gram_.Count() = 1; + // Prepare the next n-gram. + if (reinterpret_cast(gram_.begin()) + gram_.TotalSize() != static_cast(block_->Get()) + block_size_) { + NGram last(gram_); + gram_.NextInMemory(); + std::copy(last.begin() + 1, last.end(), gram_.begin()); + return; + } + // Block end. Need to store the context in a temporary buffer. + std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); + dedupe_.Clear(DedupeEntry::Construct(&dedupe_invalid_[0])); + block_->SetValidSize(block_size_); + gram_.ReBase((++block_)->Get()); + std::copy(buffer_.get(), buffer_.get() + gram_.Order() - 1, gram_.begin()); + } + + private: + void AddUnigramWord(WordIndex index) { + *gram_.begin() = index; + gram_.Count() = 0; + gram_.NextInMemory(); + if (gram_.Base() == static_cast(block_->Get()) + block_size_) { + block_->SetValidSize(block_size_); + gram_.ReBase((++block_)->Get()); + } + } + + util::stream::Link block_; + + NGram gram_; + + // This is the memory behind the invalid value in dedupe_. + std::vector dedupe_invalid_; + // Hash table combiner implementation. + Dedupe dedupe_; + + // Small buffer to hold existing ngrams when shifting across a block boundary. + boost::scoped_array buffer_; + + const std::size_t block_size_; +}; + +} // namespace + +float CorpusCount::DedupeMultiplier(std::size_t order) { + return kProbingMultiplier * static_cast(sizeof(DedupeEntry)) / static_cast(NGram::TotalSize(order)); +} + +CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) + : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count), + dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), + dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) { + token_count_ = 0; + type_count_ = 0; +} + +void CorpusCount::Run(const util::stream::ChainPosition &position) { + UTIL_TIMER("(%w s) Counted n-grams\n"); + + VocabHandout vocab(vocab_write_); + const WordIndex end_sentence = vocab.Lookup(""); + Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); + uint64_t count = 0; + try { + while(true) { + StringPiece line(from_.ReadLine()); + writer.StartSentence(); + for (util::TokenIter w(line, " \t"); w; ++w) { + WordIndex word = vocab.Lookup(*w); + UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing in the future."); + writer.Append(word); + ++count; + } + writer.Append(end_sentence); + } + } catch (const util::EndOfFileException &e) {} + token_count_ = count; + type_count_ = vocab.Size(); +} + +} // namespace builder +} // namespace lm -- cgit v1.2.3 From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 20 Jan 2013 12:31:03 +0000 Subject: Better delimiters, cross-platform fixes --- klm/lm/builder/corpus_count.cc | 3 ++- klm/lm/filter/arpa_io.cc | 36 +++++++++++------------------------- klm/lm/filter/arpa_io.hh | 27 ++++++++++----------------- klm/util/stream/sort.hh | 5 +++-- klm/util/stream/timer.hh | 8 +++++--- 5 files changed, 31 insertions(+), 48 deletions(-) (limited to 'klm/lm/builder/corpus_count.cc') diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index 8c3de57d..abea4ed0 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) { const WordIndex end_sentence = vocab.Lookup(""); Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); uint64_t count = 0; + StringPiece delimiters("\0\t\r ", 4); try { while(true) { StringPiece line(from_.ReadLine()); writer.StartSentence(); - for (util::TokenIter w(line, " \t"); w; ++w) { + for (util::TokenIter w(line, delimiters); w; ++w) { WordIndex word = vocab.Lookup(*w); UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing in the future."); writer.Append(word); diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc index caf8df95..f8568ac4 100644 --- a/klm/lm/filter/arpa_io.cc +++ b/klm/lm/filter/arpa_io.cc @@ -12,38 +12,24 @@ namespace lm { -ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") { - what_.append(message.data(), message.size()); +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { + *this << message; } ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { - what_ = "Error: "; - what_.append(message.data(), message.size()); - what_ += " in line '"; - what_.append(line.data(), line.size()); - what_ += "'."; + *this << message << " in line " << line; } -ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() - : what_(std::string(message) + " file " + file_name), file_name_(file_name) { - if (errno) { - char buf[1024]; - buf[0] = 0; -#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE - const char *add = buf; - if (!strerror_r(errno, buf, 1024)) { -#else - const char *add = strerror_r(errno, buf, 1024); - if (add) { -#endif - what_ += " :"; - what_ += add; - } - } +ARPAInputException::~ARPAInputException() throw() {} + +ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() { + *this << message << " in file " << file_name; } +ARPAOutputException::~ARPAOutputException() throw() {} + // Seeking is the responsibility of the caller. -void WriteCounts(std::ostream &out, const std::vector &number) { +void WriteCounts(std::ostream &out, const std::vector &number) { out << "\n\\data\\\n"; for (unsigned int i = 0; i < number.size(); ++i) { out << "ngram " << i+1 << "=" << number[i] << '\n'; @@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector &number) { out << '\n'; } -size_t SizeNeededForCounts(const std::vector &number) { +size_t SizeNeededForCounts(const std::vector &number) { std::ostringstream buf; WriteCounts(buf, number); return buf.tellp(); diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh index 90f48447..5b31620b 100644 --- a/klm/lm/filter/arpa_io.hh +++ b/klm/lm/filter/arpa_io.hh @@ -16,6 +16,7 @@ #include #include +#include namespace util { class FilePiece; } @@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception { public: explicit ARPAInputException(const StringPiece &message) throw(); explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); - virtual ~ARPAInputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } - - private: - std::string what_; + virtual ~ARPAInputException() throw(); }; -class ARPAOutputException : public std::exception { +class ARPAOutputException : public util::ErrnoException { public: ARPAOutputException(const char *prefix, const std::string &file_name) throw(); - virtual ~ARPAOutputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } + virtual ~ARPAOutputException() throw(); const std::string &File() const throw() { return file_name_; } private: - std::string what_; const std::string file_name_; }; // Handling for the counts of n-grams at the beginning of ARPA files. -size_t SizeNeededForCounts(const std::vector &number); +size_t SizeNeededForCounts(const std::vector &number); /* Writes an ARPA file. This has to be seekable so the counts can be written * at the end. Hence, I just have it own a std::fstream instead of accepting - * a separately held std::ostream. + * a separately held std::ostream. TODO: use the fast one from estimation. */ class ARPAOutput : boost::noncopyable { public: @@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable { boost::scoped_array buffer_; std::fstream file_; size_t fast_counter_; - std::vector counts_; + std::vector counts_; }; -template void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) { +template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { ReadNGramHeader(in, length); out.BeginLength(length); - for (size_t i = 0; i < number; ++i) { + for (uint64_t i = 0; i < number; ++i) { StringPiece line = in.ReadLine(); util::TokenIter tabber(line, '\t'); if (!tabber) throw ARPAInputException("blank line", line); @@ -107,7 +100,7 @@ template void ReadNGrams(util::FilePiece &in, unsigned int length } template void ReadARPA(util::FilePiece &in_lm, Output &out) { - std::vector number; + std::vector number; ReadARPACounts(in_lm, number); out.ReserveForCounts(SizeNeededForCounts(number)); for (unsigned int i = 0; i < number.size(); ++i) { diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index df57fa41..a86f160f 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,8 +259,9 @@ template class MergingReader { while (in_offsets_->RemainingBlocks()) { // Use bigger buffers if there's less remaining. - uint64_t per_buffer = std::max(static_cast(buffer_size_), - static_cast(total_memory_ / in_offsets_->RemainingBlocks())); + uint64_t per_buffer = static_cast(std::max( + buffer_size_, + static_cast((static_cast(total_memory_) / in_offsets_->RemainingBlocks())))); per_buffer -= per_buffer % entry_size; assert(per_buffer); diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh index 50e94fe8..7e1a5885 100644 --- a/klm/util/stream/timer.hh +++ b/klm/util/stream/timer.hh @@ -1,14 +1,16 @@ #ifndef UTIL_STREAM_TIMER__ #define UTIL_STREAM_TIMER__ -#include +// Sorry Jon, this was adding library dependencies in Moses and people complained. + +/*#include #if BOOST_VERSION >= 104800 #include #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) #else -//#warning Using Boost older than 1.48. Timing information will not be available. +//#warning Using Boost older than 1.48. Timing information will not be available.*/ #define UTIL_TIMER(str) -#endif +//#endif #endif // UTIL_STREAM_TIMER__ -- cgit v1.2.3