From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Fri, 18 Jan 2013 17:12:51 +0000
Subject: KenLM dffafbf with lmplz source (but not built)

---
 klm/lm/builder/corpus_count.cc | 223 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 klm/lm/builder/corpus_count.cc

(limited to 'klm/lm/builder/corpus_count.cc')
diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
new file mode 100644
index 00000000..8c3de57d
--- /dev/null
+++ b/klm/lm/builder/corpus_count.cc
@@ -0,0 +1,223 @@
+#include "lm/builder/corpus_count.hh"
+
+#include "lm/builder/ngram.hh"
+#include "lm/lm_exception.hh"
+#include "lm/word_index.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include "util/murmur_hash.hh"
+#include "util/probing_hash_table.hh"
+#include "util/scoped.hh"
+#include "util/stream/chain.hh"
+#include "util/stream/timer.hh"
+#include "util/tokenize_piece.hh"
+
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+
+#include <functional>
+
+#include <stdint.h>
+
+namespace lm {
+namespace builder {
+namespace {
+
+class VocabHandout {
+  public:
+    explicit VocabHandout(int fd) {
+      util::scoped_fd duped(util::DupOrThrow(fd));
+      word_list_.reset(util::FDOpenOrThrow(duped));
+      
+      Lookup("<unk>"); // Force 0
+      Lookup("<s>"); // Force 1
+      Lookup("</s>"); // Force 2
+    }
+
+    WordIndex Lookup(const StringPiece &word) {
+      uint64_t hashed = util::MurmurHashNative(word.data(), word.size());
+      std::pair<Seen::iterator, bool> ret(seen_.insert(std::pair<uint64_t, lm::WordIndex>(hashed, seen_.size())));
+      if (ret.second) {
+        char null_delimit = 0;
+        util::WriteOrThrow(word_list_.get(), word.data(), word.size());
+        util::WriteOrThrow(word_list_.get(), &null_delimit, 1);
+        UTIL_THROW_IF(seen_.size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words.  Change WordIndex to uint64_t in lm/word_index.hh.");
+      }
+      return ret.first->second;
+    }
+
+    WordIndex Size() const {
+      return seen_.size();
+    }
+
+  private:
+    typedef boost::unordered_map<uint64_t, lm::WordIndex> Seen;
+
+    Seen seen_;
+
+    util::scoped_FILE word_list_;
+};
+
+class DedupeHash : public std::unary_function<const WordIndex *, bool> {
+  public:
+    explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {}
+
+    std::size_t operator()(const WordIndex *start) const {
+      return util::MurmurHashNative(start, size_);
+    }
+    
+  private:
+    const std::size_t size_;
+};
+
+class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
+  public:
+    explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
+    
+    bool operator()(const WordIndex *first, const WordIndex *second) const {
+      return !memcmp(first, second, size_);
+    } 
+    
+  private:
+    const std::size_t size_;
+};
+
+struct DedupeEntry {
+  typedef WordIndex *Key;
+  Key GetKey() const { return key; }
+  Key key;
+  static DedupeEntry Construct(WordIndex *at) {
+    DedupeEntry ret;
+    ret.key = at;
+    return ret;
+  }
+};
+
+typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
+
+const float kProbingMultiplier = 1.5;
+
+class Writer {
+  public:
+    Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) 
+      : block_(position), gram_(block_->Get(), order),
+        dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
+        dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
+        buffer_(new WordIndex[order - 1]),
+        block_size_(position.GetChain().BlockSize()) {
+      dedupe_.Clear(DedupeEntry::Construct(&dedupe_invalid_[0]));
+      assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
+      if (order == 1) {
+        // Add special words.  AdjustCounts is responsible if order != 1.    
+        AddUnigramWord(kUNK);
+        AddUnigramWord(kBOS);
+      }
+    }
+
+    ~Writer() {
+      block_->SetValidSize(reinterpret_cast<const uint8_t*>(gram_.begin()) - static_cast<const uint8_t*>(block_->Get()));
+      (++block_).Poison();
+    }
+
+    // Write context with a bunch of <s>
+    void StartSentence() {
+      for (WordIndex *i = gram_.begin(); i != gram_.end() - 1; ++i) {
+        *i = kBOS;
+      }
+    }
+
+    void Append(WordIndex word) {
+      *(gram_.end() - 1) = word;
+      Dedupe::MutableIterator at;
+      bool found = dedupe_.FindOrInsert(DedupeEntry::Construct(gram_.begin()), at);
+      if (found) {
+        // Already present.
+        NGram already(at->key, gram_.Order());
+        ++(already.Count());
+        // Shift left by one.
+        memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
+        return;
+      }
+      // Complete the write.  
+      gram_.Count() = 1;
+      // Prepare the next n-gram.  
+      if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
+        NGram last(gram_);
+        gram_.NextInMemory();
+        std::copy(last.begin() + 1, last.end(), gram_.begin());
+        return;
+      }
+      // Block end.  Need to store the context in a temporary buffer.  
+      std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
+      dedupe_.Clear(DedupeEntry::Construct(&dedupe_invalid_[0]));
+      block_->SetValidSize(block_size_);
+      gram_.ReBase((++block_)->Get());
+      std::copy(buffer_.get(), buffer_.get() + gram_.Order() - 1, gram_.begin());
+    }
+
+  private:
+    void AddUnigramWord(WordIndex index) {
+      *gram_.begin() = index;
+      gram_.Count() = 0;
+      gram_.NextInMemory();
+      if (gram_.Base() == static_cast<uint8_t*>(block_->Get()) + block_size_) {
+        block_->SetValidSize(block_size_);
+        gram_.ReBase((++block_)->Get());
+      }
+    }
+
+    util::stream::Link block_;
+
+    NGram gram_;
+
+    // This is the memory behind the invalid value in dedupe_.
+    std::vector<WordIndex> dedupe_invalid_;
+    // Hash table combiner implementation.
+    Dedupe dedupe_;
+
+    // Small buffer to hold existing ngrams when shifting across a block boundary.  
+    boost::scoped_array<WordIndex> buffer_;
+
+    const std::size_t block_size_;
+};
+
+} // namespace
+
+float CorpusCount::DedupeMultiplier(std::size_t order) {
+  return kProbingMultiplier * static_cast<float>(sizeof(DedupeEntry)) / static_cast<float>(NGram::TotalSize(order));
+}
+
+CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) 
+  : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
+    dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
+    dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
+  token_count_ = 0;
+  type_count_ = 0;
+}
+
+void CorpusCount::Run(const util::stream::ChainPosition &position) {
+  UTIL_TIMER("(%w s) Counted n-grams\n");
+
+  VocabHandout vocab(vocab_write_);
+  const WordIndex end_sentence = vocab.Lookup("</s>");
+  Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
+  uint64_t count = 0;
+  try {
+    while(true) {
+      StringPiece line(from_.ReadLine());
+      writer.StartSentence();
+      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+        WordIndex word = vocab.Lookup(*w);
+        UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");
+        writer.Append(word);
+        ++count;
+      }
+      writer.Append(end_sentence);
+    }
+  } catch (const util::EndOfFileException &e) {}
+  token_count_ = count;
+  type_count_ = vocab.Size();
+}
+
+} // namespace builder
+} // namespace lm
-- 
cgit v1.2.3


From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Sun, 20 Jan 2013 12:31:03 +0000
Subject: Better delimiters, cross-platform fixes

---
 klm/lm/builder/corpus_count.cc |  3 ++-
 klm/lm/filter/arpa_io.cc       | 36 +++++++++++-------------------------
 klm/lm/filter/arpa_io.hh       | 27 ++++++++++-----------------
 klm/util/stream/sort.hh        |  5 +++--
 klm/util/stream/timer.hh       |  8 +++++---
 5 files changed, 31 insertions(+), 48 deletions(-)

(limited to 'klm/lm/builder/corpus_count.cc')

diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index 8c3de57d..abea4ed0 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
   const WordIndex end_sentence = vocab.Lookup("</s>");
   Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
   uint64_t count = 0;
+  StringPiece delimiters("\0\t\r ", 4);
   try {
     while(true) {
       StringPiece line(from_.ReadLine());
       writer.StartSentence();
-      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+      for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
         WordIndex word = vocab.Lookup(*w);
         UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");
         writer.Append(word);
diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc
index caf8df95..f8568ac4 100644
--- a/klm/lm/filter/arpa_io.cc
+++ b/klm/lm/filter/arpa_io.cc
@@ -12,38 +12,24 @@
 
 namespace lm {
 
-ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") {
-  what_.append(message.data(), message.size());
+ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
+  *this << message;
 }
 
 ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
-  what_ = "Error: ";
-  what_.append(message.data(), message.size());
-  what_ += " in line '";
-  what_.append(line.data(), line.size());
-  what_ += "'.";
+  *this << message << " in line " << line;
 }
 
-ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw()
-  : what_(std::string(message) + " file " + file_name), file_name_(file_name) {
-  if (errno) {
-    char buf[1024];
-    buf[0] = 0;
-#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
-    const char *add = buf;
-    if (!strerror_r(errno, buf, 1024)) {
-#else
-    const char *add = strerror_r(errno, buf, 1024);
-    if (add) {
-#endif
-      what_ += " :";
-      what_ += add;
-    }
-  }
+ARPAInputException::~ARPAInputException() throw() {}
+
+ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() {
+  *this << message << " in file " << file_name;
 }
 
+ARPAOutputException::~ARPAOutputException() throw() {}
+
 // Seeking is the responsibility of the caller.
-void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
+void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
   out << "\n\\data\\\n";
   for (unsigned int i = 0; i < number.size(); ++i) {
     out << "ngram " << i+1 << "=" << number[i] << '\n';
@@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
   out << '\n';
 }
 
-size_t SizeNeededForCounts(const std::vector<size_t> &number) {
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
   std::ostringstream buf;
   WriteCounts(buf, number);
   return buf.tellp();
diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh
index 90f48447..5b31620b 100644
--- a/klm/lm/filter/arpa_io.hh
+++ b/klm/lm/filter/arpa_io.hh
@@ -16,6 +16,7 @@
 
 #include <err.h>
 #include <string.h>
+#include <stdint.h>
 
 namespace util { class FilePiece; }
 
@@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception {
   public:
     explicit ARPAInputException(const StringPiece &message) throw();
     explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
-    virtual ~ARPAInputException() throw() {}
-
-    const char *what() const throw() { return what_.c_str(); }
-
-  private:
-    std::string what_;
+    virtual ~ARPAInputException() throw();
 };
 
-class ARPAOutputException : public std::exception {
+class ARPAOutputException : public util::ErrnoException {
   public:
     ARPAOutputException(const char *prefix, const std::string &file_name) throw();
-    virtual ~ARPAOutputException() throw() {}
-
-    const char *what() const throw() { return what_.c_str(); }
+    virtual ~ARPAOutputException() throw();
 
     const std::string &File() const throw() { return file_name_; }
 
   private:
-    std::string what_;
     const std::string file_name_;
 };
 
 // Handling for the counts of n-grams at the beginning of ARPA files.
-size_t SizeNeededForCounts(const std::vector<size_t> &number);
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
 
 /* Writes an ARPA file.  This has to be seekable so the counts can be written
  * at the end.  Hence, I just have it own a std::fstream instead of accepting
- * a separately held std::ostream.  
+ * a separately held std::ostream.  TODO: use the fast one from estimation.
  */
 class ARPAOutput : boost::noncopyable {
   public:
@@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable {
     boost::scoped_array<char> buffer_;
     std::fstream file_;
     size_t fast_counter_;
-    std::vector<size_t> counts_;
+    std::vector<uint64_t> counts_;
 };
 
 
-template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) {
+template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
   ReadNGramHeader(in, length);
   out.BeginLength(length);
-  for (size_t i = 0; i < number; ++i) {
+  for (uint64_t i = 0; i < number; ++i) {
     StringPiece line = in.ReadLine();
     util::TokenIter<util::SingleCharacter> tabber(line, '\t');
     if (!tabber) throw ARPAInputException("blank line", line);
@@ -107,7 +100,7 @@ template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length
 }
 
 template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
-  std::vector<size_t> number;
+  std::vector<uint64_t> number;
   ReadARPACounts(in_lm, number);
   out.ReserveForCounts(SizeNeededForCounts(number));
   for (unsigned int i = 0; i < number.size(); ++i) {
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
index df57fa41..a86f160f 100644
--- a/klm/util/stream/sort.hh
+++ b/klm/util/stream/sort.hh
@@ -259,8 +259,9 @@ template <class Compare, class Combine> class MergingReader {
 
       while (in_offsets_->RemainingBlocks()) {
         // Use bigger buffers if there's less remaining.
-        uint64_t per_buffer = std::max(static_cast<uint64_t>(buffer_size_),
-                                       static_cast<uint64_t>(total_memory_ / in_offsets_->RemainingBlocks()));
+        uint64_t per_buffer = static_cast<uint64_t>(std::max<std::size_t>(
+            buffer_size_,
+            static_cast<std::size_t>((static_cast<uint64_t>(total_memory_) / in_offsets_->RemainingBlocks()))));
         per_buffer -= per_buffer % entry_size;
         assert(per_buffer);
 
diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh
index 50e94fe8..7e1a5885 100644
--- a/klm/util/stream/timer.hh
+++ b/klm/util/stream/timer.hh
@@ -1,14 +1,16 @@
 #ifndef UTIL_STREAM_TIMER__
 #define UTIL_STREAM_TIMER__
 
-#include <boost/version.hpp>
+// Sorry Jon, this was adding library dependencies in Moses and people complained.
+
+/*#include <boost/version.hpp>
 
 #if BOOST_VERSION >= 104800
 #include <boost/timer/timer.hpp>
 #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str))
 #else
-//#warning Using Boost older than 1.48. Timing information will not be available.
+//#warning Using Boost older than 1.48. Timing information will not be available.*/
 #define UTIL_TIMER(str) 
-#endif
+//#endif
 
 #endif // UTIL_STREAM_TIMER__
-- 
cgit v1.2.3