From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Fri, 18 Jan 2013 17:12:51 +0000
Subject: KenLM dffafbf with lmplz source (but not built)

---
 klm/util/stream/block.hh          |  43 +++
 klm/util/stream/chain.cc          | 155 +++++++++++
 klm/util/stream/chain.hh          | 198 ++++++++++++++
 klm/util/stream/config.hh         |  32 +++
 klm/util/stream/io.cc             |  64 +++++
 klm/util/stream/io.hh             |  76 ++++++
 klm/util/stream/io_test.cc        |  38 +++
 klm/util/stream/line_input.cc     |  52 ++++
 klm/util/stream/line_input.hh     |  22 ++
 klm/util/stream/multi_progress.cc |  86 ++++++
 klm/util/stream/multi_progress.hh |  90 +++++++
 klm/util/stream/sort.hh           | 542 ++++++++++++++++++++++++++++++++++++++
 klm/util/stream/sort_test.cc      |  62 +++++
 klm/util/stream/stream.hh         |  74 ++++++
 klm/util/stream/stream_test.cc    |  35 +++
 klm/util/stream/timer.hh          |  14 +
 16 files changed, 1583 insertions(+)
 create mode 100644 klm/util/stream/block.hh
 create mode 100644 klm/util/stream/chain.cc
 create mode 100644 klm/util/stream/chain.hh
 create mode 100644 klm/util/stream/config.hh
 create mode 100644 klm/util/stream/io.cc
 create mode 100644 klm/util/stream/io.hh
 create mode 100644 klm/util/stream/io_test.cc
 create mode 100644 klm/util/stream/line_input.cc
 create mode 100644 klm/util/stream/line_input.hh
 create mode 100644 klm/util/stream/multi_progress.cc
 create mode 100644 klm/util/stream/multi_progress.hh
 create mode 100644 klm/util/stream/sort.hh
 create mode 100644 klm/util/stream/sort_test.cc
 create mode 100644 klm/util/stream/stream.hh
 create mode 100644 klm/util/stream/stream_test.cc
 create mode 100644 klm/util/stream/timer.hh

(limited to 'klm/util/stream')
diff --git a/klm/util/stream/block.hh b/klm/util/stream/block.hh
new file mode 100644
index 00000000..11aa991e
--- /dev/null
+++ b/klm/util/stream/block.hh
@@ -0,0 +1,43 @@
+#ifndef UTIL_STREAM_BLOCK__
+#define UTIL_STREAM_BLOCK__
+
+#include <cstddef>
+#include <stdint.h>
+
+namespace util {
+namespace stream {
+
+class Block {
+  public:
+    Block() : mem_(NULL), valid_size_(0) {}
+
+    Block(void *mem, std::size_t size) : mem_(mem), valid_size_(size) {}
+
+    void SetValidSize(std::size_t to) { valid_size_ = to; }
+    // Read might fill in less than Allocated at EOF.   
+    std::size_t ValidSize() const { return valid_size_; }
+
+    void *Get() { return mem_; }
+    const void *Get() const { return mem_; }
+
+    const void *ValidEnd() const { 
+      return reinterpret_cast<const uint8_t*>(mem_) + valid_size_;
+    }
+
+    operator bool() const { return mem_ != NULL; }
+    bool operator!() const { return mem_ == NULL; }
+ 
+  private:
+    friend class Link;
+    void SetToPoison() {
+      mem_ = NULL;
+    }
+
+    void *mem_;
+    std::size_t valid_size_;
+};
+
+} // namespace stream
+} // namespace util
+
+#endif // UTIL_STREAM_BLOCK__
diff --git a/klm/util/stream/chain.cc b/klm/util/stream/chain.cc
new file mode 100644
index 00000000..46708c60
--- /dev/null
+++ b/klm/util/stream/chain.cc
@@ -0,0 +1,155 @@
+#include "util/stream/chain.hh"
+
+#include "util/stream/io.hh"
+
+#include "util/exception.hh"
+#include "util/pcqueue.hh"
+
+#include <cstdlib>
+#include <new>
+#include <iostream>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace util {
+namespace stream {
+
+ChainConfigException::ChainConfigException() throw() { *this << "Chain configured with "; }
+ChainConfigException::~ChainConfigException() throw() {}
+
+Thread::~Thread() {
+  thread_.join();
+}
+
+void Thread::UnhandledException(const std::exception &e) {
+  std::cerr << e.what() << std::endl;
+  abort();
+}
+
+void Recycler::Run(const ChainPosition &position) {
+  for (Link l(position); l; ++l) {
+    l->SetValidSize(position.GetChain().BlockSize());
+  }
+}
+
+const Recycler kRecycle = Recycler();
+
+Chain::Chain(const ChainConfig &config) : config_(config), complete_called_(false) {
+  UTIL_THROW_IF(!config.entry_size, ChainConfigException, "zero-size entries.");
+  UTIL_THROW_IF(!config.block_count, ChainConfigException, "block count zero");
+  UTIL_THROW_IF(config.total_memory < config.entry_size * config.block_count, ChainConfigException, config.total_memory << " total memory, too small for " << config.block_count << " blocks of containing entries of size " << config.entry_size);
+  // Round down block size to a multiple of entry size.   
+  block_size_ = config.total_memory / (config.block_count * config.entry_size) * config.entry_size;
+}
+
+Chain::~Chain() {
+  Wait();
+}
+
+ChainPosition Chain::Add() {
+  if (!Running()) Start();
+  PCQueue<Block> &in = queues_.back();
+  queues_.push_back(new PCQueue<Block>(config_.block_count));
+  return ChainPosition(in, queues_.back(), this, progress_);
+}
+
+Chain &Chain::operator>>(const WriteAndRecycle &writer) {
+  threads_.push_back(new Thread(Complete(), writer));
+  return *this;
+}
+
+void Chain::Wait(bool release_memory) {
+  if (queues_.empty()) {
+    assert(threads_.empty());
+    return; // Nothing to wait for.  
+  }
+  if (!complete_called_) CompleteLoop();
+  threads_.clear();
+  for (std::size_t i = 0; queues_.front().Consume(); ++i) {
+    if (i == config_.block_count) {
+      std::cerr << "Chain ending without poison." << std::endl;
+      abort();
+    }
+  }
+  queues_.clear();
+  progress_.Finished();
+  complete_called_ = false;
+  if (release_memory) memory_.reset();
+}
+
+void Chain::Start() {
+  Wait(false);
+  if (!memory_.get()) {
+    // Allocate memory.  
+    assert(threads_.empty());
+    assert(queues_.empty());
+    std::size_t malloc_size = block_size_ * config_.block_count;
+    memory_.reset(MallocOrThrow(malloc_size));
+  }
+  // This queue can accomodate all blocks.    
+  queues_.push_back(new PCQueue<Block>(config_.block_count));
+  // Populate the lead queue with blocks.  
+  uint8_t *base = static_cast<uint8_t*>(memory_.get());
+  for (std::size_t i = 0; i < config_.block_count; ++i) {
+    queues_.front().Produce(Block(base, block_size_));
+    base += block_size_;
+  }
+}
+
+ChainPosition Chain::Complete() {
+  assert(Running());
+  UTIL_THROW_IF(complete_called_, util::Exception, "CompleteLoop() called twice");
+  complete_called_ = true;
+  return ChainPosition(queues_.back(), queues_.front(), this, progress_);
+}
+
+Link::Link() : in_(NULL), out_(NULL), poisoned_(true) {}
+
+void Link::Init(const ChainPosition &position) {
+  UTIL_THROW_IF(in_, util::Exception, "Link::Init twice");
+  in_ = position.in_;
+  out_ = position.out_;
+  poisoned_ = false;
+  progress_ = position.progress_;
+  in_->Consume(current_);
+}
+
+Link::Link(const ChainPosition &position) : in_(NULL) {
+  Init(position);
+}
+
+Link::~Link() {
+  if (current_) {
+    // Probably an exception unwinding.  
+    std::cerr << "Last input should have been poison." << std::endl;
+//    abort();
+  } else {
+    if (!poisoned_) {
+      // Pass the poison!
+      out_->Produce(current_);
+    }
+  }
+}
+
+Link &Link::operator++() {
+  assert(current_);
+  progress_ += current_.ValidSize();
+  out_->Produce(current_);
+  in_->Consume(current_);
+  if (!current_) {
+    poisoned_ = true;
+    out_->Produce(current_);
+  }
+  return *this;
+}
+
+void Link::Poison() {
+  assert(!poisoned_);
+  current_.SetToPoison();
+  out_->Produce(current_);
+  poisoned_ = true;
+}
+
+} // namespace stream
+} // namespace util
diff --git a/klm/util/stream/chain.hh b/klm/util/stream/chain.hh
new file mode 100644
index 00000000..154b9b33
--- /dev/null
+++ b/klm/util/stream/chain.hh
@@ -0,0 +1,198 @@
+#ifndef UTIL_STREAM_CHAIN__
+#define UTIL_STREAM_CHAIN__
+
+#include "util/stream/block.hh"
+#include "util/stream/config.hh"
+#include "util/stream/multi_progress.hh"
+#include "util/scoped.hh"
+
+#include <boost/ptr_container/ptr_vector.hpp>
+#include <boost/thread/thread.hpp>
+
+#include <cstddef>
+
+#include <assert.h>
+
+namespace util {
+template <class T> class PCQueue;
+namespace stream {
+
+class ChainConfigException : public Exception {
+  public:
+    ChainConfigException() throw();
+    ~ChainConfigException() throw();
+};
+
+class Chain;
+// Specifies position in chain for Link constructor.
+class ChainPosition {
+  public:
+    const Chain &GetChain() const { return *chain_; }
+  private:
+    friend class Chain;
+    friend class Link;
+    ChainPosition(PCQueue<Block> &in, PCQueue<Block> &out, Chain *chain, MultiProgress &progress) 
+      : in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {}
+
+    PCQueue<Block> *in_, *out_;
+
+    Chain *chain_;
+
+    WorkerProgress progress_;
+};
+
+// Position is usually ChainPosition but if there are multiple streams involved, this can be ChainPositions.  
+class Thread {
+  public:
+    template <class Position, class Worker> Thread(const Position &position, const Worker &worker)
+      : thread_(boost::ref(*this), position, worker) {}
+
+    ~Thread();
+
+    template <class Position, class Worker> void operator()(const Position &position, Worker &worker) {
+      try {
+        worker.Run(position);
+      } catch (const std::exception &e) {
+        UnhandledException(e);
+      }
+    }
+
+  private:
+    void UnhandledException(const std::exception &e);
+
+    boost::thread thread_;
+};
+
+class Recycler {
+  public:
+    void Run(const ChainPosition &position);
+};
+
+extern const Recycler kRecycle;
+class WriteAndRecycle;
+
+class Chain {
+  private:
+    template <class T, void (T::*ptr)(const ChainPosition &) = &T::Run> struct CheckForRun {
+      typedef Chain type;
+    };
+
+  public:
+    explicit Chain(const ChainConfig &config);
+
+    ~Chain();
+
+    void ActivateProgress() {
+      assert(!Running());
+      progress_.Activate();
+    }
+
+    void SetProgressTarget(uint64_t target) {
+      progress_.SetTarget(target);
+    }
+
+    std::size_t EntrySize() const {
+      return config_.entry_size;
+    }
+    std::size_t BlockSize() const {
+      return block_size_;
+    }
+
+    // Two ways to add to the chain: Add() or operator>>.  
+    ChainPosition Add();
+
+    // This is for adding threaded workers with a Run method.  
+    template <class Worker> typename CheckForRun<Worker>::type &operator>>(const Worker &worker) {
+      assert(!complete_called_);
+      threads_.push_back(new Thread(Add(), worker));
+      return *this;
+    }
+
+    // Avoid copying the worker.  
+    template <class Worker> typename CheckForRun<Worker>::type &operator>>(const boost::reference_wrapper<Worker> &worker) {
+      assert(!complete_called_);
+      threads_.push_back(new Thread(Add(), worker));
+      return *this;
+    }
+
+    // Note that Link and Stream also define operator>> outside this class.  
+
+    // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor.  
+    void CompleteLoop() {
+      threads_.push_back(new Thread(Complete(), kRecycle));
+    }
+
+    Chain &operator>>(const Recycler &recycle) {
+      CompleteLoop();
+      return *this;
+    }
+
+    Chain &operator>>(const WriteAndRecycle &writer);
+
+    // Chains are reusable.  Call Wait to wait for everything to finish and free memory.  
+    void Wait(bool release_memory = true);
+
+    // Waits for the current chain to complete (if any) then starts again.  
+    void Start();
+
+    bool Running() const { return !queues_.empty(); }
+
+  private:
+    ChainPosition Complete();
+
+    ChainConfig config_;
+
+    std::size_t block_size_;
+
+    scoped_malloc memory_;
+
+    boost::ptr_vector<PCQueue<Block> > queues_;
+
+    bool complete_called_;
+
+    boost::ptr_vector<Thread> threads_;
+
+    MultiProgress progress_;
+};
+
+// Create the link in the worker thread using the position token.
+class Link {
+  public:
+    // Either default construct and Init or just construct all at once.
+    Link();
+    void Init(const ChainPosition &position);
+
+    explicit Link(const ChainPosition &position);
+
+    ~Link();
+
+    Block &operator*() { return current_; }
+    const Block &operator*() const { return current_; }
+
+    Block *operator->() { return &current_; }
+    const Block *operator->() const { return &current_; }
+
+    Link &operator++();
+
+    operator bool() const { return current_; }
+
+    void Poison();
+
+  private:
+    Block current_;
+    PCQueue<Block> *in_, *out_;
+ 
+    bool poisoned_;
+
+    WorkerProgress progress_;
+};
+
+inline Chain &operator>>(Chain &chain, Link &link) {
+  link.Init(chain.Add());
+  return chain;
+}
+
+} // namespace stream
+} // namespace util
+
+#endif // UTIL_STREAM_CHAIN__
diff --git a/klm/util/stream/config.hh b/klm/util/stream/config.hh
new file mode 100644
index 00000000..1eeb3a8a
--- /dev/null
+++ b/klm/util/stream/config.hh
@@ -0,0 +1,32 @@
+#ifndef UTIL_STREAM_CONFIG__
+#define UTIL_STREAM_CONFIG__
+
+#include <cstddef>
+#include <string>
+
+namespace util { namespace stream {
+
+struct ChainConfig {
+  ChainConfig() {}
+
+  ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory)
+    : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {}
+
+  std::size_t entry_size;
+  std::size_t block_count;
+  // Chain's constructor will make this a multiple of entry_size. 
+  std::size_t total_memory;
+};
+
+struct SortConfig {
+  std::string temp_prefix;
+
+  // Size of each input/output buffer.
+  std::size_t buffer_size;
+
+  // Total memory to use when running alone.
+  std::size_t total_memory;
+};
+
+}} // namespaces
+#endif // UTIL_STREAM_CONFIG__
diff --git a/klm/util/stream/io.cc b/klm/util/stream/io.cc
new file mode 100644
index 00000000..c7ad2980
--- /dev/null
+++ b/klm/util/stream/io.cc
@@ -0,0 +1,64 @@
+#include "util/stream/io.hh"
+
+#include "util/file.hh"
+#include "util/stream/chain.hh"
+
+#include <cstddef>
+
+namespace util {
+namespace stream {
+
+ReadSizeException::ReadSizeException() throw() {}
+ReadSizeException::~ReadSizeException() throw() {}
+
+void Read::Run(const ChainPosition &position) {
+  const std::size_t block_size = position.GetChain().BlockSize();
+  const std::size_t entry_size = position.GetChain().EntrySize();
+  for (Link link(position); link; ++link) {
+    std::size_t got = util::ReadOrEOF(file_, link->Get(), block_size);
+    UTIL_THROW_IF(got % entry_size, ReadSizeException, "File ended with " << got << " bytes, not a multiple of " << entry_size << "."); 
+    if (got == 0) {
+      link.Poison();
+      return;
+    } else {
+      link->SetValidSize(got);
+    }
+  }
+}
+
+void PRead::Run(const ChainPosition &position) {
+  scoped_fd owner;
+  if (own_) owner.reset(file_);
+  uint64_t size = SizeOrThrow(file_);
+  UTIL_THROW_IF(size % static_cast<uint64_t>(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize());
+  std::size_t block_size = position.GetChain().BlockSize();
+  Link link(position);
+  uint64_t offset = 0;
+  for (; offset + block_size < size; offset += block_size, ++link) {
+    PReadOrThrow(file_, link->Get(), block_size, offset);
+    link->SetValidSize(block_size);
+  }
+  if (size - offset) {
+    PReadOrThrow(file_, link->Get(), size - offset, offset);
+    link->SetValidSize(size - offset);
+    ++link;
+  }
+  link.Poison();
+}
+
+void Write::Run(const ChainPosition &position) {
+  for (Link link(position); link; ++link) {
+    WriteOrThrow(file_, link->Get(), link->ValidSize());
+  }
+}
+
+void WriteAndRecycle::Run(const ChainPosition &position) {
+  const std::size_t block_size = position.GetChain().BlockSize();
+  for (Link link(position); link; ++link) {
+    WriteOrThrow(file_, link->Get(), link->ValidSize());
+    link->SetValidSize(block_size);
+  }
+}
+
+} // namespace stream
+} // namespace util
diff --git a/klm/util/stream/io.hh b/klm/util/stream/io.hh
new file mode 100644
index 00000000..934b6b3f
--- /dev/null
+++ b/klm/util/stream/io.hh
@@ -0,0 +1,76 @@
+#ifndef UTIL_STREAM_IO__
+#define UTIL_STREAM_IO__
+
+#include "util/exception.hh"
+#include "util/file.hh"
+
+namespace util {
+namespace stream {
+
+class ChainPosition;
+
+class ReadSizeException : public util::Exception {
+  public:
+    ReadSizeException() throw();
+    ~ReadSizeException() throw();
+};
+
+class Read {
+  public:
+    explicit Read(int fd) : file_(fd) {}
+    void Run(const ChainPosition &position); 
+  private:
+    int file_;
+};
+
+// Like read but uses pread so that the file can be accessed from multiple threads.  
+class PRead {
+  public:
+    explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {}
+    void Run(const ChainPosition &position);
+  private:
+    int file_;
+    bool own_;
+};
+
+class Write {
+  public:
+    explicit Write(int fd) : file_(fd) {}
+    void Run(const ChainPosition &position);
+  private:
+    int file_;
+};
+
+class WriteAndRecycle {
+  public:
+    explicit WriteAndRecycle(int fd) : file_(fd) {}
+    void Run(const ChainPosition &position);
+  private:
+    int file_;
+};
+
+// Reuse the same file over and over again to buffer output.  
+class FileBuffer {
+  public:
+    explicit FileBuffer(int fd) : file_(fd) {}
+
+    WriteAndRecycle Sink() const {
+      util::SeekOrThrow(file_.get(), 0);
+      return WriteAndRecycle(file_.get());
+    }
+
+    PRead Source() const {
+      return PRead(file_.get());
+    }
+
+    uint64_t Size() const {
+      return SizeOrThrow(file_.get());
+    }
+
+  private:
+    scoped_fd file_;
+};
+
+} // namespace stream
+} // namespace util
+#endif // UTIL_STREAM_IO__
diff --git a/klm/util/stream/io_test.cc b/klm/util/stream/io_test.cc
new file mode 100644
index 00000000..82108335
--- /dev/null
+++ b/klm/util/stream/io_test.cc
@@ -0,0 +1,38 @@
+#include "util/stream/io.hh"
+
+#include "util/stream/chain.hh"
+#include "util/file.hh"
+
+#define BOOST_TEST_MODULE IOTest
+#include <boost/test/unit_test.hpp>
+
+#include <unistd.h>
+
+namespace util { namespace stream { namespace {
+
+BOOST_AUTO_TEST_CASE(CopyFile) {
+  std::string temps("io_test_temp");
+
+  scoped_fd in(MakeTemp(temps));
+  for (uint64_t i = 0; i < 100000; ++i) {
+    WriteOrThrow(in.get(), &i, sizeof(uint64_t));
+  }
+  SeekOrThrow(in.get(), 0);
+  scoped_fd out(MakeTemp(temps));
+
+  ChainConfig config;
+  config.entry_size = 8;
+  config.total_memory = 1024;
+  config.block_count = 10;
+
+  Chain(config) >> PRead(in.get()) >> Write(out.get());
+
+  SeekOrThrow(out.get(), 0);
+  for (uint64_t i = 0; i < 100000; ++i) {
+    uint64_t got;
+    ReadOrThrow(out.get(), &got, sizeof(uint64_t));
+    BOOST_CHECK_EQUAL(i, got);
+  }
+}
+
+}}} // namespaces
diff --git a/klm/util/stream/line_input.cc b/klm/util/stream/line_input.cc
new file mode 100644
index 00000000..dafa5020
--- /dev/null
+++ b/klm/util/stream/line_input.cc
@@ -0,0 +1,52 @@
+#include "util/stream/line_input.hh"
+
+#include "util/exception.hh"
+#include "util/file.hh"
+#include "util/read_compressed.hh"
+#include "util/stream/chain.hh"
+
+#include <algorithm>
+#include <vector>
+
+namespace util { namespace stream {
+
+void LineInput::Run(const ChainPosition &position) {
+  ReadCompressed reader(fd_);
+  // Holding area for beginning of line to be placed in next block.
+  std::vector<char> carry;
+  
+  for (Link block(position); ; ++block) {
+    char *to = static_cast<char*>(block->Get());
+    char *begin = to;
+    char *end = to + position.GetChain().BlockSize();
+    std::copy(carry.begin(), carry.end(), to);
+    to += carry.size();
+    while (to != end) {
+      std::size_t got = reader.Read(to, end - to);
+      if (!got) {
+        // EOF
+        block->SetValidSize(to - begin);
+        ++block;
+        block.Poison();
+        return;
+      }
+      to += got;
+    }
+
+    // Find the last newline.
+    char *newline;
+    for (newline = to - 1; ; --newline) {
+      UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ".  Is this a text file?");
+      if (*newline == '\n') break;
+    }
+    
+    // Copy everything after the last newline to the carry.
+    carry.clear();
+    carry.resize(to - (newline + 1));
+    std::copy(newline + 1, to, &*carry.begin());
+
+    block->SetValidSize(newline + 1 - begin);
+  }
+}
+
+}} // namespaces
diff --git a/klm/util/stream/line_input.hh b/klm/util/stream/line_input.hh
new file mode 100644
index 00000000..86db1dd0
--- /dev/null
+++ b/klm/util/stream/line_input.hh
@@ -0,0 +1,22 @@
+#ifndef UTIL_STREAM_LINE_INPUT__
+#define UTIL_STREAM_LINE_INPUT__
+namespace util {namespace stream {
+
+class ChainPosition;
+
+/* Worker that reads input into blocks, ensuring that blocks contain whole
+ * lines.  Assumes that the maximum size of a line is less than the block size
+ */
+class LineInput {
+  public:
+    // Takes ownership upon thread execution.
+    explicit LineInput(int fd);
+
+    void Run(const ChainPosition &position);
+
+  private:
+    int fd_;
+};
+
+}} // namespaces
+#endif // UTIL_STREAM_LINE_INPUT__
diff --git a/klm/util/stream/multi_progress.cc b/klm/util/stream/multi_progress.cc
new file mode 100644
index 00000000..8ba10386
--- /dev/null
+++ b/klm/util/stream/multi_progress.cc
@@ -0,0 +1,86 @@
+#include "util/stream/multi_progress.hh"
+
+// TODO: merge some functionality with the simple progress bar?
+#include "util/ersatz_progress.hh"
+
+#include <iostream>
+#include <limits>
+
+#include <string.h>
+
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <unistd.h>
+#endif
+
+namespace util { namespace stream {
+
+namespace {
+const char kDisplayCharacters[] = "-+*#0123456789";
+
+uint64_t Next(unsigned char stone, uint64_t complete) {
+  return (static_cast<uint64_t>(stone + 1) * complete + MultiProgress::kWidth - 1) / MultiProgress::kWidth;
+}
+
+} // namespace
+
+MultiProgress::MultiProgress() : active_(false), complete_(std::numeric_limits<uint64_t>::max()), character_handout_(0) {}
+
+MultiProgress::~MultiProgress() {
+  if (active_ && complete_ != std::numeric_limits<uint64_t>::max())
+    std::cerr << '\n';
+}
+
+void MultiProgress::Activate() {
+  active_ = 
+#if !defined(_WIN32) && !defined(_WIN64)
+    // Is stderr a terminal?  
+    (isatty(2) == 1)
+#else
+    true
+#endif
+    ;
+}
+
+void MultiProgress::SetTarget(uint64_t complete) {
+  if (!active_) return;
+  complete_ = complete;
+  if (!complete) complete_ = 1;
+  memset(display_, 0, sizeof(display_));
+  character_handout_ = 0;
+  std::cerr << kProgressBanner;
+}
+
+WorkerProgress MultiProgress::Add() {
+  if (!active_)
+    return WorkerProgress(std::numeric_limits<uint64_t>::max(), *this, '\0');
+  std::size_t character_index;
+  {
+    boost::unique_lock<boost::mutex> lock(mutex_);
+    character_index = character_handout_++;
+    if (character_handout_ == sizeof(kDisplayCharacters) - 1)
+      character_handout_ = 0;
+  }
+  return WorkerProgress(Next(0, complete_), *this, kDisplayCharacters[character_index]);
+}
+
+void MultiProgress::Finished() {
+  if (!active_ || complete_ == std::numeric_limits<uint64_t>::max()) return;
+  std::cerr << '\n';
+  complete_ = std::numeric_limits<uint64_t>::max();
+}
+
+void MultiProgress::Milestone(WorkerProgress &worker) {
+  if (!active_ || complete_ == std::numeric_limits<uint64_t>::max()) return;
+  unsigned char stone = std::min(static_cast<uint64_t>(kWidth), worker.current_ * kWidth / complete_);
+  for (char *i = &display_[worker.stone_]; i < &display_[stone]; ++i) {
+    *i = worker.character_;
+  }
+  worker.next_ = Next(stone, complete_);
+  worker.stone_ = stone;
+  {
+    boost::unique_lock<boost::mutex> lock(mutex_);
+    std::cerr << '\r' << display_ << std::flush;
+  }
+}
+
+}} // namespaces
diff --git a/klm/util/stream/multi_progress.hh b/klm/util/stream/multi_progress.hh
new file mode 100644
index 00000000..c4dd45a9
--- /dev/null
+++ b/klm/util/stream/multi_progress.hh
@@ -0,0 +1,90 @@
+/* Progress bar suitable for chains of workers */
+#ifndef UTIL_MULTI_PROGRESS__
+#define UTIL_MULTI_PROGRESS__
+
+#include <boost/thread/mutex.hpp>
+
+#include <cstddef>
+
+#include <stdint.h>
+
+namespace util { namespace stream {
+
+class WorkerProgress;
+
+class MultiProgress {
+  public:
+    static const unsigned char kWidth = 100;
+
+    MultiProgress();
+
+    ~MultiProgress();
+
+    // Turns on showing (requires SetTarget too).
+    void Activate();
+
+    void SetTarget(uint64_t complete);
+
+    WorkerProgress Add();
+
+    void Finished();
+
+  private:
+    friend class WorkerProgress;
+    void Milestone(WorkerProgress &worker);
+
+    bool active_;
+
+    uint64_t complete_;
+
+    boost::mutex mutex_;
+
+    // \0 at the end.  
+    char display_[kWidth + 1];
+
+    std::size_t character_handout_;
+
+    MultiProgress(const MultiProgress &);
+    MultiProgress &operator=(const MultiProgress &);
+};
+
+class WorkerProgress {
+  public:
+    // Default contrutor must be initialized with operator= later.  
+    WorkerProgress() : parent_(NULL) {}
+
+    // Not threadsafe for the same worker by default.  
+    WorkerProgress &operator++() {
+      if (++current_ >= next_) {
+        parent_->Milestone(*this);
+      }
+      return *this;
+    }
+
+    WorkerProgress &operator+=(uint64_t amount) {
+      current_ += amount;
+      if (current_ >= next_) {
+        parent_->Milestone(*this);
+      }
+      return *this;
+    }
+
+  private:
+    friend class MultiProgress;
+    WorkerProgress(uint64_t next, MultiProgress &parent, char character) 
+      : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {}
+
+    uint64_t current_, next_;
+
+    MultiProgress *parent_;
+
+    // Previous milestone reached.  
+    unsigned char stone_;
+
+    // Character to display in bar.  
+    char character_;
+};
+
+}} // namespaces
+
+#endif // UTIL_MULTI_PROGRESS__
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
new file mode 100644
index 00000000..be6c11ea
--- /dev/null
+++ b/klm/util/stream/sort.hh
@@ -0,0 +1,542 @@
+/* Usage:
+ * Sort<Compare> sorter(temp, compare);
+ * Chain(config) >> Read(file) >> sorter.Unsorted();
+ * Stream stream;
+ * Chain chain(config) >> sorter.Sorted(internal_config, lazy_config) >> stream;
+ * 
+ * Note that sorter must outlive any threads that use Unsorted or Sorted.  
+ *
+ * Combiners take the form:
+ * bool operator()(void *into, const void *option, const Compare &compare) const
+ * which returns true iff a combination happened.  The sorting algorithm
+ * guarantees compare(into, option).  But it does not guarantee 
+ * compare(option, into).  
+ * Currently, combining is only done in merge steps, not during on-the-fly
+ * sort.  Use a hash table for that.  
+ */
+
+#ifndef UTIL_STREAM_SORT__
+#define UTIL_STREAM_SORT__
+
+#include "util/stream/chain.hh"
+#include "util/stream/config.hh"
+#include "util/stream/io.hh"
+#include "util/stream/stream.hh"
+#include "util/stream/timer.hh"
+
+#include "util/file.hh"
+#include "util/scoped.hh"
+#include "util/sized_iterator.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <queue>
+#include <string>
+
+namespace util {
+namespace stream {
+
+struct NeverCombine {
+  template <class Compare> bool operator()(const void *, const void *, const Compare &) const { 
+    return false;
+  }
+};
+
+// Manage the offsets of sorted blocks in a file.  
+class Offsets {
+  public:
+    explicit Offsets(int fd) : log_(fd) {
+      Reset();
+    }
+
+    int File() const { return log_; }
+
+    void Append(uint64_t length) {
+      if (!length) return;
+      ++block_count_;
+      if (length == cur_.length) {
+        ++cur_.run;
+        return;
+      }
+      WriteOrThrow(log_, &cur_, sizeof(Entry));
+      cur_.length = length;
+      cur_.run = 1;
+    }
+
+    void FinishedAppending() {
+      WriteOrThrow(log_, &cur_, sizeof(Entry));
+      SeekOrThrow(log_, sizeof(Entry)); // Skip 0,0 at beginning.
+      cur_.run = 0;
+      if (block_count_) {
+        ReadOrThrow(log_, &cur_, sizeof(Entry));
+        assert(cur_.length);
+        assert(cur_.run);
+      }
+    }
+
+    uint64_t RemainingBlocks() const { return block_count_; }
+
+    uint64_t TotalOffset() const { return output_sum_; }
+
+    uint64_t PeekSize() const {
+      return cur_.length;
+    }
+
+    uint64_t NextSize() {
+      assert(block_count_);
+      uint64_t ret = cur_.length;
+      output_sum_ += ret;
+
+      --cur_.run;
+      --block_count_;
+      if (!cur_.run && block_count_) {
+        ReadOrThrow(log_, &cur_, sizeof(Entry));
+        assert(cur_.length);
+        assert(cur_.run);
+      }
+      return ret;
+    }
+
+    void Reset() {
+      SeekOrThrow(log_, 0);
+      ResizeOrThrow(log_, 0);
+      cur_.length = 0;
+      cur_.run = 0;
+      block_count_ = 0;
+      output_sum_ = 0;
+    }
+
+  private:
+    int log_;
+
+    struct Entry {
+      uint64_t length;
+      uint64_t run;
+    };
+    Entry cur_;
+
+    uint64_t block_count_;
+
+    uint64_t output_sum_;
+};
+
+// A priority queue of entries backed by file buffers
+template <class Compare> class MergeQueue {
+  public:
+    MergeQueue(int fd, std::size_t buffer_size, std::size_t entry_size, const Compare &compare)
+      : queue_(Greater(compare)), in_(fd), buffer_size_(buffer_size), entry_size_(entry_size) {}
+
+    void Push(void *base, uint64_t offset, uint64_t amount) {
+      queue_.push(Entry(base, in_, offset, amount, buffer_size_));
+    }
+
+    const void *Top() const {
+      return queue_.top().Current();
+    }
+
+    void Pop() {
+      Entry top(queue_.top());
+      queue_.pop();
+      if (top.Increment(in_, buffer_size_, entry_size_))
+        queue_.push(top);
+    }
+
+    std::size_t Size() const {
+      return queue_.size();
+    }
+
+    bool Empty() const {
+      return queue_.empty();
+    }
+
+  private:
+    // Priority queue contains these entries.  
+    class Entry {
+      public:
+        Entry() {}
+
+        Entry(void *base, int fd, uint64_t offset, uint64_t amount, std::size_t buf_size) {
+          offset_ = offset;
+          remaining_ = amount;
+          buffer_end_ = static_cast<uint8_t*>(base) + buf_size;
+          Read(fd, buf_size);
+        }
+
+        bool Increment(int fd, std::size_t buf_size, std::size_t entry_size) {
+          current_ += entry_size;
+          if (current_ != buffer_end_) return true;
+          return Read(fd, buf_size);
+        }
+
+        const void *Current() const { return current_; }
+
+      private:
+        bool Read(int fd, std::size_t buf_size) {
+          current_ = buffer_end_ - buf_size;
+          std::size_t amount;
+          if (static_cast<uint64_t>(buf_size) < remaining_) {
+            amount = buf_size;
+          } else if (!remaining_) {
+            return false;
+          } else {
+            amount = remaining_;
+            buffer_end_ = current_ + remaining_;
+          }
+          PReadOrThrow(fd, current_, amount, offset_);
+          offset_ += amount;
+          assert(current_ <= buffer_end_);
+          remaining_ -= amount;
+          return true;
+        }
+
+        // Buffer
+        uint8_t *current_, *buffer_end_;
+        // File
+        uint64_t remaining_, offset_;
+    };
+
+    // Wrapper comparison function for queue entries.   
+    class Greater : public std::binary_function<const Entry &, const Entry &, bool> {
+      public:
+        explicit Greater(const Compare &compare) : compare_(compare) {}
+
+        bool operator()(const Entry &first, const Entry &second) const {
+          return compare_(second.Current(), first.Current());
+        }
+
+      private:
+        const Compare compare_;
+    };
+
+    typedef std::priority_queue<Entry, std::vector<Entry>, Greater> Queue;
+    Queue queue_;
+
+    const int in_;
+    const std::size_t buffer_size_;
+    const std::size_t entry_size_;
+};
+
+/* A worker object that merges.  If the number of pieces to merge exceeds the
+ * arity, it outputs multiple sorted blocks, recording to out_offsets.  
+ * However, users will only every see a single sorted block out output because
+ * Sort::Sorted insures the arity is higher than the number of pieces before
+ * returning this.   
+ */
+template <class Compare, class Combine> class MergingReader {
+  public:
+    MergingReader(int in, Offsets *in_offsets, Offsets *out_offsets, std::size_t buffer_size, std::size_t total_memory, const Compare &compare, const Combine &combine) :
+        compare_(compare), combine_(combine),
+        in_(in),
+        in_offsets_(in_offsets), out_offsets_(out_offsets),
+        buffer_size_(buffer_size), total_memory_(total_memory) {}
+
+    void Run(const ChainPosition &position) {
+      Run(position, false);
+    }
+
+    void Run(const ChainPosition &position, bool assert_one) {
+      // Special case: nothing to read.  
+      if (!in_offsets_->RemainingBlocks()) {
+        Link l(position);
+        l.Poison();
+        return;
+      }
+      // If there's just one entry, just read.
+      if (in_offsets_->RemainingBlocks() == 1) {
+        // Sequencing is important.
+        uint64_t offset = in_offsets_->TotalOffset();
+        uint64_t amount = in_offsets_->NextSize();
+        ReadSingle(offset, amount, position);
+        if (out_offsets_) out_offsets_->Append(amount);
+        return;
+      }
+
+      Stream str(position);
+      scoped_malloc buffer(MallocOrThrow(total_memory_));
+      uint8_t *const buffer_end = static_cast<uint8_t*>(buffer.get()) + total_memory_;
+
+      const std::size_t entry_size = position.GetChain().EntrySize();
+
+      while (in_offsets_->RemainingBlocks()) {
+        // Use bigger buffers if there's less remaining.
+        uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks());
+        per_buffer -= per_buffer % entry_size;
+        assert(per_buffer);
+
+        // Populate queue.
+        MergeQueue<Compare> queue(in_, per_buffer, entry_size, compare_);
+        for (uint8_t *buf = static_cast<uint8_t*>(buffer.get()); 
+            in_offsets_->RemainingBlocks() && (buf + std::min(per_buffer, in_offsets_->PeekSize()) <= buffer_end);) {
+          uint64_t offset = in_offsets_->TotalOffset();
+          uint64_t size = in_offsets_->NextSize();
+          queue.Push(buf, offset, size);
+          buf += static_cast<std::size_t>(std::min<uint64_t>(size, per_buffer));
+        }
+        // This shouldn't happen but it's probably better to die than loop indefinitely.
+        if (queue.Size() < 2 && in_offsets_->RemainingBlocks()) {
+          std::cerr << "Bug in sort implementation: not merging at least two stripes." << std::endl;
+          abort();
+        }
+        if (assert_one && in_offsets_->RemainingBlocks()) {
+          std::cerr << "Bug in sort implementation: should only be one merge group for lazy sort" << std::endl;
+          abort();
+        }
+
+        uint64_t written = 0;
+        // Merge including combiner support.  
+        memcpy(str.Get(), queue.Top(), entry_size);
+        for (queue.Pop(); !queue.Empty(); queue.Pop()) {
+          if (!combine_(str.Get(), queue.Top(), compare_)) {
+            ++written; ++str;
+            memcpy(str.Get(), queue.Top(), entry_size);
+          }
+        }
+        ++written; ++str;
+        if (out_offsets_)
+          out_offsets_->Append(written * entry_size);
+      }
+      str.Poison();
+    }
+
+  private: 
+    void ReadSingle(uint64_t offset, const uint64_t size, const ChainPosition &position) {
+      // Special case: only one to read.  
+      const uint64_t end = offset + size;
+      const uint64_t block_size = position.GetChain().BlockSize();
+      Link l(position);
+      for (; offset + block_size < end; ++l, offset += block_size) {
+        PReadOrThrow(in_, l->Get(), block_size, offset);
+        l->SetValidSize(block_size);
+      }
+      PReadOrThrow(in_, l->Get(), end - offset, offset);
+      l->SetValidSize(end - offset);
+      (++l).Poison();
+      return;
+    }
+   
+    Compare compare_;
+    Combine combine_;
+
+    int in_;
+
+  protected:
+    Offsets *in_offsets_;
+
+  private:
+    Offsets *out_offsets_;
+ 
+    std::size_t buffer_size_;
+    std::size_t total_memory_;
+};
+
+// The lazy step owns the remaining files.  This keeps track of them.  
+template <class Compare, class Combine> class OwningMergingReader : public MergingReader<Compare, Combine> {
+  private:
+    typedef MergingReader<Compare, Combine> P;
+  public:
+    OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine) 
+      : P(data, NULL, NULL, buffer, lazy, compare, combine),
+        data_(data),
+        offsets_(offsets) {}
+
+    void Run(const ChainPosition &position) {
+      P::in_offsets_ = &offsets_;
+      scoped_fd data(data_);
+      scoped_fd offsets_file(offsets_.File());
+      P::Run(position, true);
+    }
+
+  private:
+    int data_;
+    Offsets offsets_;
+};
+
+// Don't use this directly.  Worker that sorts blocks.   
+template <class Compare> class BlockSorter {
+  public:
+    BlockSorter(Offsets &offsets, const Compare &compare) :
+      offsets_(&offsets), compare_(compare) {}
+
+    void Run(const ChainPosition &position) {
+      const std::size_t entry_size = position.GetChain().EntrySize();
+      for (Link link(position); link; ++link) {
+        // Record the size of each block in a separate file.    
+        offsets_->Append(link->ValidSize());
+        void *end = static_cast<uint8_t*>(link->Get()) + link->ValidSize();
+        std::sort(
+            SizedIt(link->Get(), entry_size),
+            SizedIt(end, entry_size),
+            compare_);
+      }
+      offsets_->FinishedAppending();
+    }
+
+  private:
+    Offsets *offsets_;
+    SizedCompare<Compare> compare_;
+};
+
+class BadSortConfig : public Exception {
+  public:
+    BadSortConfig() throw() {}
+    ~BadSortConfig() throw() {}
+};
+
+template <class Compare, class Combine = NeverCombine> class Sort {
+  public:
+    Sort(Chain &in, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = Combine())
+      : config_(config),
+        data_(MakeTemp(config.temp_prefix)),
+        offsets_file_(MakeTemp(config.temp_prefix)), offsets_(offsets_file_.get()),
+        compare_(compare), combine_(combine),
+        entry_size_(in.EntrySize()) {
+      UTIL_THROW_IF(!entry_size_, BadSortConfig, "Sorting entries of size 0");
+      // Make buffer_size a multiple of the entry_size.  
+      config_.buffer_size -= config_.buffer_size % entry_size_;
+      UTIL_THROW_IF(!config_.buffer_size, BadSortConfig, "Sort buffer too small");
+      UTIL_THROW_IF(config_.total_memory < config_.buffer_size * 4, BadSortConfig, "Sorting memory " << config_.total_memory << " is too small for four buffers (two read and two write).");
+      in >> BlockSorter<Compare>(offsets_, compare_) >> WriteAndRecycle(data_.get());
+    }
+
+    uint64_t Size() const {
+      return SizeOrThrow(data_.get());
+    }
+
+    // Do merge sort, terminating when lazy merge could be done with the
+    // specified memory.  Return the minimum memory necessary to do lazy merge.
+    std::size_t Merge(std::size_t lazy_memory) {
+      if (offsets_.RemainingBlocks() <= 1) return 0;
+      const uint64_t lazy_arity = std::max<uint64_t>(1, lazy_memory / config_.buffer_size);
+      uint64_t size = Size();
+      /* No overflow because
+       * offsets_.RemainingBlocks() * config_.buffer_size <= lazy_memory ||
+       * size < lazy_memory
+       */
+      if (offsets_.RemainingBlocks() <= lazy_arity || size <= static_cast<uint64_t>(lazy_memory))
+        return std::min<std::size_t>(size, offsets_.RemainingBlocks() * config_.buffer_size);
+
+      scoped_fd data2(MakeTemp(config_.temp_prefix));
+      int fd_in = data_.get(), fd_out = data2.get();
+      scoped_fd offsets2_file(MakeTemp(config_.temp_prefix));
+      Offsets offsets2(offsets2_file.get());
+      Offsets *offsets_in = &offsets_, *offsets_out = &offsets2;
+
+      // Double buffered writing.  
+      ChainConfig chain_config;
+      chain_config.entry_size = entry_size_;
+      chain_config.block_count = 2;
+      chain_config.total_memory = config_.buffer_size * 2;
+      Chain chain(chain_config);
+
+      while (offsets_in->RemainingBlocks() > lazy_arity) {
+        if (size <= static_cast<uint64_t>(lazy_memory)) break;
+        std::size_t reading_memory = config_.total_memory - 2 * config_.buffer_size;
+        if (size < static_cast<uint64_t>(reading_memory)) {
+          reading_memory = static_cast<std::size_t>(size);
+        }
+        SeekOrThrow(fd_in, 0);
+        chain >>
+          MergingReader<Compare, Combine>(
+              fd_in,
+              offsets_in, offsets_out,
+              config_.buffer_size,
+              reading_memory,
+              compare_, combine_) >>
+          WriteAndRecycle(fd_out);
+        chain.Wait();
+        offsets_out->FinishedAppending();
+        ResizeOrThrow(fd_in, 0);
+        offsets_in->Reset();
+        std::swap(fd_in, fd_out);
+        std::swap(offsets_in, offsets_out);
+        size = SizeOrThrow(fd_in);
+      }
+
+      SeekOrThrow(fd_in, 0);
+      if (fd_in == data2.get()) {
+        data_.reset(data2.release());
+        offsets_file_.reset(offsets2_file.release());
+        offsets_ = offsets2;
+      }
+      if (offsets_.RemainingBlocks() <= 1) return 0;
+      // No overflow because the while loop exited.
+      return std::min(size, offsets_.RemainingBlocks() * static_cast<uint64_t>(config_.buffer_size));
+    }
+
+    // Output to chain, using this amount of memory, maximum, for lazy merge
+    // sort.  
+    void Output(Chain &out, std::size_t lazy_memory) {
+      Merge(lazy_memory);
+      out.SetProgressTarget(Size());
+      out >> OwningMergingReader<Compare, Combine>(data_.get(), offsets_, config_.buffer_size, lazy_memory, compare_, combine_);
+      data_.release();
+      offsets_file_.release();
+    }
+
+    /* If a pipeline step is reading sorted input and writing to a different
+     * sort order, then there's a trade-off between using RAM to read lazily
+     * (avoiding copying the file) and using RAM to increase block size and, 
+     * therefore, decrease the number of merge sort passes in the next
+     * iteration.  
+     * 
+     * Merge sort takes log_{arity}(pieces) passes.  Thus, each time the chain
+     * block size is multiplied by arity, the number of output passes decreases
+     * by one.  Up to a constant, then, log_{arity}(chain) is the number of
+     * passes saved.  Chain simply divides the memory evenly over all blocks.
+     * 
+     * Lazy sort saves this many passes (up to a constant)
+     *   log_{arity}((memory-lazy)/block_count) + 1
+     * Non-lazy sort saves this many passes (up to the same constant):
+     *   log_{arity}(memory/block_count)
+     * Add log_{arity}(block_count) to both:
+     *   log_{arity}(memory-lazy) + 1 versus log_{arity}(memory)
+     * Take arity to the power of both sizes (arity > 1)
+     *   (memory - lazy)*arity versus memory
+     * Solve for lazy
+     *   lazy = memory * (arity - 1) / arity
+     */
+    std::size_t DefaultLazy() {
+      float arity = static_cast<float>(config_.total_memory / config_.buffer_size);
+      return static_cast<std::size_t>(static_cast<float>(config_.total_memory) * (arity - 1.0) / arity);
+    }
+
+    // Same as Output with default lazy memory setting.
+    void Output(Chain &out) {
+      Output(out, DefaultLazy());
+    }
+
+    // Completely merge sort and transfer ownership to the caller.
+    int StealCompleted() {
+      // Merge all the way.
+      Merge(0);
+      SeekOrThrow(data_.get(), 0);
+      offsets_file_.reset();
+      return data_.release();
+    }
+
+  private:
+    SortConfig config_;
+
+    scoped_fd data_;
+
+    scoped_fd offsets_file_;
+    Offsets offsets_;
+
+    const Compare compare_;
+    const Combine combine_;
+    const std::size_t entry_size_;
+};
+
+// returns bytes to be read on demand.  
+template <class Compare, class Combine> uint64_t BlockingSort(Chain &chain, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = NeverCombine()) {
+  Sort<Compare, Combine> sorter(chain, config, compare, combine);
+  chain.Wait(true);
+  uint64_t size = sorter.Size();
+  sorter.Output(chain);
+  return size;
+}
+
+} // namespace stream
+} // namespace util
+
+#endif // UTIL_STREAM_SORT__
diff --git a/klm/util/stream/sort_test.cc b/klm/util/stream/sort_test.cc
new file mode 100644
index 00000000..fd7705cd
--- /dev/null
+++ b/klm/util/stream/sort_test.cc
@@ -0,0 +1,62 @@
+#include "util/stream/sort.hh"
+
+#define BOOST_TEST_MODULE SortTest
+#include <boost/test/unit_test.hpp>
+
+#include <algorithm>
+
+#include <unistd.h>
+
+namespace util { namespace stream { namespace {
+
+struct CompareUInt64 : public std::binary_function<const void *, const void *, bool> {
+  bool operator()(const void *first, const void *second) const {
+    return *static_cast<const uint64_t*>(first) < *reinterpret_cast<const uint64_t*>(second);
+  }
+};
+
+const uint64_t kSize = 100000;
+
+struct Putter {
+  Putter(std::vector<uint64_t> &shuffled) : shuffled_(shuffled) {}
+
+  void Run(const ChainPosition &position) {
+    Stream put_shuffled(position);
+    for (uint64_t i = 0; i < shuffled_.size(); ++i, ++put_shuffled) {
+      *static_cast<uint64_t*>(put_shuffled.Get()) = shuffled_[i];
+    }
+    put_shuffled.Poison();
+  }
+  std::vector<uint64_t> &shuffled_;
+};
+
+BOOST_AUTO_TEST_CASE(FromShuffled) {
+  std::vector<uint64_t> shuffled;
+  shuffled.reserve(kSize);
+  for (uint64_t i = 0; i < kSize; ++i) {
+    shuffled.push_back(i);
+  }
+  std::random_shuffle(shuffled.begin(), shuffled.end());
+  
+  ChainConfig config;
+  config.entry_size = 8;
+  config.total_memory = 800;
+  config.block_count = 3;
+
+  SortConfig merge_config;
+  merge_config.temp_prefix = "sort_test_temp";
+  merge_config.buffer_size = 800;
+  merge_config.total_memory = 3300;
+
+  Chain chain(config);
+  chain >> Putter(shuffled);
+  BlockingSort(chain, merge_config, CompareUInt64(), NeverCombine());
+  Stream sorted;
+  chain >> sorted >> kRecycle;
+  for (uint64_t i = 0; i < kSize; ++i, ++sorted) {
+    BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(sorted.Get()));
+  }
+  BOOST_CHECK(!sorted);
+}
+
+}}} // namespaces
diff --git a/klm/util/stream/stream.hh b/klm/util/stream/stream.hh
new file mode 100644
index 00000000..6ff45b82
--- /dev/null
+++ b/klm/util/stream/stream.hh
@@ -0,0 +1,74 @@
+#ifndef UTIL_STREAM_STREAM__
+#define UTIL_STREAM_STREAM__
+
+#include "util/stream/chain.hh"
+
+#include <boost/noncopyable.hpp>
+
+#include <assert.h>
+#include <stdint.h>
+
+namespace util {
+namespace stream {
+
+class Stream : boost::noncopyable {
+  public:
+    Stream() : current_(NULL), end_(NULL) {}
+
+    void Init(const ChainPosition &position) {
+      entry_size_ = position.GetChain().EntrySize();
+      block_size_ = position.GetChain().BlockSize();
+      block_it_.Init(position);
+      StartBlock();
+    }
+
+    explicit Stream(const ChainPosition &position) {
+      Init(position);
+    }
+
+    operator bool() const { return current_ != NULL; }
+    bool operator!() const { return current_ == NULL; }
+
+    const void *Get() const { return current_; }
+    void *Get() { return current_; }
+
+    void Poison() {
+      block_it_->SetValidSize(current_ - static_cast<uint8_t*>(block_it_->Get()));
+      ++block_it_;
+      block_it_.Poison();
+    }
+    
+    Stream &operator++() {
+      assert(*this);
+      assert(current_ < end_);
+      current_ += entry_size_;
+      if (current_ == end_) {
+        ++block_it_;
+        StartBlock();
+      }
+      return *this;
+    }
+
+  private:
+    void StartBlock() {
+      for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {}
+      current_ = static_cast<uint8_t*>(block_it_->Get());
+      end_ = current_ + block_it_->ValidSize();
+    }
+
+    uint8_t *current_, *end_;
+
+    std::size_t entry_size_;
+    std::size_t block_size_;
+
+    Link block_it_;
+};
+
+inline Chain &operator>>(Chain &chain, Stream &stream) {
+  stream.Init(chain.Add());
+  return chain;
+}
+
+} // namespace stream
+} // namespace util
+#endif // UTIL_STREAM_STREAM__
diff --git a/klm/util/stream/stream_test.cc b/klm/util/stream/stream_test.cc
new file mode 100644
index 00000000..6575d50d
--- /dev/null
+++ b/klm/util/stream/stream_test.cc
@@ -0,0 +1,35 @@
+#include "util/stream/io.hh"
+
+#include "util/stream/stream.hh"
+#include "util/file.hh"
+
+#define BOOST_TEST_MODULE StreamTest
+#include <boost/test/unit_test.hpp>
+
+#include <unistd.h>
+
+namespace util { namespace stream { namespace {
+
+BOOST_AUTO_TEST_CASE(StreamTest) {
+  scoped_fd in(MakeTemp("io_test_temp"));
+  for (uint64_t i = 0; i < 100000; ++i) {
+    WriteOrThrow(in.get(), &i, sizeof(uint64_t));
+  }
+  SeekOrThrow(in.get(), 0);
+
+  ChainConfig config;
+  config.entry_size = 8;
+  config.total_memory = 100;
+  config.block_count = 12;
+
+  Stream s;
+  Chain chain(config);
+  chain >> Read(in.get()) >> s >> kRecycle;
+  uint64_t i = 0;
+  for (; s; ++s, ++i) {
+    BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(s.Get()));
+  }
+  BOOST_CHECK_EQUAL(100000ULL, i);
+}
+
+}}} // namespaces
diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh
new file mode 100644
index 00000000..50e94fe8
--- /dev/null
+++ b/klm/util/stream/timer.hh
@@ -0,0 +1,14 @@
+#ifndef UTIL_STREAM_TIMER__
+#define UTIL_STREAM_TIMER__
+
+#include <boost/version.hpp>
+
+#if BOOST_VERSION >= 104800
+#include <boost/timer/timer.hpp>
+#define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str))
+#else
+//#warning Using Boost older than 1.48. Timing information will not be available.
+#define UTIL_TIMER(str) 
+#endif
+
+#endif // UTIL_STREAM_TIMER__
-- 
cgit v1.2.3


From 8b9aae7cff1efd1be195cdd000b21546bd5fca04 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@Chriss-MacBook-Air.local>
Date: Sat, 19 Jan 2013 19:09:48 -0500
Subject: updated version of boost.m4 and automatically build kenneth's LM
 builder

---
 Makefile.am                            |   2 +
 configure.ac                           |   7 +-
 corpus/cut-corpus.pl                   |   2 +-
 klm/lm/builder/Makefile.am             |  28 +++
 klm/util/Makefile.am                   |   2 +-
 klm/util/double-conversion/Makefile.am |   2 +-
 klm/util/stream/Makefile.am            |  20 ++
 klm/util/stream/sort.hh                |   3 +-
 m4/boost.m4                            | 322 +++++++++++++++++++++++++--------
 9 files changed, 311 insertions(+), 77 deletions(-)
 create mode 100644 klm/lm/builder/Makefile.am
 create mode 100644 klm/util/stream/Makefile.am

(limited to 'klm/util/stream')

diff --git a/Makefile.am b/Makefile.am
index c2444928..17190d27 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,8 +5,10 @@ SUBDIRS = \
   utils \
   mteval \
   klm/util/double-conversion \
+  klm/util/stream \
   klm/util \
   klm/lm \
+  klm/lm/builder \
   klm/search \
   decoder \
   training \
diff --git a/configure.ac b/configure.ac
index d6030752..a1e5ad84 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cdec],[2013-01-15])
+AC_INIT([cdec],[2013-01-19])
 AC_CONFIG_SRCDIR([decoder/cdec.cc])
 AM_INIT_AUTOMAKE
 AC_CONFIG_HEADERS(config.h)
@@ -15,7 +15,10 @@ BOOST_REQUIRE([1.44])
 BOOST_PROGRAM_OPTIONS
 BOOST_SYSTEM
 BOOST_SERIALIZATION
+BOOST_CHRONO
+BOOST_TIMER
 BOOST_TEST
+BOOST_THREADS
 AM_PATH_PYTHON
 AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H))
 AC_CHECK_LIB(dl, dlopen)
@@ -111,8 +114,10 @@ AC_CONFIG_FILES([word-aligner/Makefile])
 
 # KenLM stuff
 AC_CONFIG_FILES([klm/util/double-conversion/Makefile])
+AC_CONFIG_FILES([klm/util/stream/Makefile])
 AC_CONFIG_FILES([klm/util/Makefile])
 AC_CONFIG_FILES([klm/lm/Makefile])
+AC_CONFIG_FILES([klm/lm/builder/Makefile])
 AC_CONFIG_FILES([klm/search/Makefile])
 
 # training stuff
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
index 7daac0e2..0af3b23c 100755
--- a/corpus/cut-corpus.pl
+++ b/corpus/cut-corpus.pl
@@ -22,7 +22,7 @@ for my $ff (@ind) {
 
 while(<>) {
   chomp;
-  my @fields = split / \|\|\| /;
+  my @fields = split /\s*\|\|\|\s*/;
   my @sf;
   for my $i (@o) {
     my $y = $fields[$i];
diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am
new file mode 100644
index 00000000..00444256
--- /dev/null
+++ b/klm/lm/builder/Makefile.am
@@ -0,0 +1,28 @@
+bin_PROGRAMS = builder
+
+builder_SOURCES = \
+  main.cc \
+  adjust_counts.cc \
+  adjust_counts.hh \
+  corpus_count.cc \
+  corpus_count.hh \
+  discount.hh \
+  header_info.hh \
+  initial_probabilities.cc \
+  initial_probabilities.hh \
+  interpolate.cc \
+  interpolate.hh \
+  joint_order.hh \
+  multi_stream.hh \
+  ngram.hh \
+  ngram_stream.hh \
+  pipeline.cc \
+  pipeline.hh \
+  print.cc \
+  print.hh \
+  sort.hh
+
+builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_TIMER_LIBS) $(BOOST_CHRONO_LIBS) $(BOOST_THREAD_LIBS)
+
+AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm
+
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index 294ebc0a..248cc844 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -54,4 +54,4 @@ libklm_util_a_SOURCES = \
   string_piece.cc \
 	usage.cc
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion
+AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion
diff --git a/klm/util/double-conversion/Makefile.am b/klm/util/double-conversion/Makefile.am
index eb6616f7..dfcfb009 100644
--- a/klm/util/double-conversion/Makefile.am
+++ b/klm/util/double-conversion/Makefile.am
@@ -20,4 +20,4 @@ libklm_util_double_a_SOURCES = \
   fixed-dtoa.cc \
   strtod.cc
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion
+AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion
diff --git a/klm/util/stream/Makefile.am b/klm/util/stream/Makefile.am
new file mode 100644
index 00000000..f18cbedb
--- /dev/null
+++ b/klm/util/stream/Makefile.am
@@ -0,0 +1,20 @@
+noinst_LIBRARIES = libklm_util_stream.a
+
+libklm_util_stream_a_SOURCES = \
+  block.hh \
+  chain.cc \
+  chain.hh \
+  config.hh \
+  io.cc \
+  io.hh \
+  line_input.cc \
+  line_input.hh \
+  multi_progress.cc \
+  multi_progress.hh \
+  sort.hh \
+  stream.hh \
+  timer.hh
+
+AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm
+
+#-I$(top_srcdir)/klm/util/double-conversion
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
index be6c11ea..df57fa41 100644
--- a/klm/util/stream/sort.hh
+++ b/klm/util/stream/sort.hh
@@ -259,7 +259,8 @@ template <class Compare, class Combine> class MergingReader {
 
       while (in_offsets_->RemainingBlocks()) {
         // Use bigger buffers if there's less remaining.
-        uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks());
+        uint64_t per_buffer = std::max(static_cast<uint64_t>(buffer_size_),
+                                       static_cast<uint64_t>(total_memory_ / in_offsets_->RemainingBlocks()));
         per_buffer -= per_buffer % entry_size;
         assert(per_buffer);
 
diff --git a/m4/boost.m4 b/m4/boost.m4
index 7e0ed075..027e039b 100644
--- a/m4/boost.m4
+++ b/m4/boost.m4
@@ -1,5 +1,5 @@
 # boost.m4: Locate Boost headers and libraries for autoconf-based projects.
-# Copyright (C) 2007, 2008, 2009  Benoit Sigoure <tsuna@lrde.epita.fr>
+# Copyright (C) 2007, 2008, 2009, 2010, 2011  Benoit Sigoure <tsuna@lrde.epita.fr>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 m4_define([_BOOST_SERIAL], [m4_translit([
-# serial 12
+# serial 16
 ], [#
 ], [])])
 
@@ -45,15 +45,19 @@ m4_define([_BOOST_SERIAL], [m4_translit([
 # Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL.  If you don't, don't worry,
 # simply read the README, it will show you what to do step by step.
 
-m4_pattern_forbid([^_?BOOST_])
+m4_pattern_forbid([^_?(BOOST|Boost)_])
 
 
 # _BOOST_SED_CPP(SED-PROGRAM, PROGRAM,
 #                [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
 # --------------------------------------------------------
 # Same as AC_EGREP_CPP, but leave the result in conftest.i.
-# PATTERN is *not* overquoted, as in AC_EGREP_CPP.  It could be useful
-# to turn this into a macro which extracts the value of any macro.
+#
+# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP.  It is expanded
+# in double-quotes, so escape your double quotes.
+#
+# It could be useful to turn this into a macro which extracts the
+# value of any macro.
 m4_define([_BOOST_SED_CPP],
 [AC_LANG_PREPROC_REQUIRE()dnl
 AC_REQUIRE([AC_PROG_SED])dnl
@@ -98,6 +102,7 @@ set x $boost_version_req 0 0 0
 IFS=$boost_save_IFS
 shift
 boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"`
+boost_version_req_string=$[1].$[2].$[3]
 AC_ARG_WITH([boost],
    [AS_HELP_STRING([--with-boost=DIR],
                    [prefix of Boost $1 @<:@guess@:>@])])dnl
@@ -113,9 +118,9 @@ if test x"$BOOST_ROOT" != x; then
   fi
 fi
 AC_SUBST([DISTCHECK_CONFIGURE_FLAGS],
-         ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])
+         ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl
 boost_save_CPPFLAGS=$CPPFLAGS
-  AC_CACHE_CHECK([for Boost headers version >= $boost_version_req],
+  AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string],
     [boost_cv_inc_path],
     [boost_cv_inc_path=no
 AC_LANG_PUSH([C++])dnl
@@ -183,24 +188,25 @@ AC_LANG_POP([C++])dnl
     ])
     case $boost_cv_inc_path in #(
       no)
-        boost_errmsg="cannot find Boost headers version >= $boost_version_req"
+        boost_errmsg="cannot find Boost headers version >= $boost_version_req_string"
         m4_if([$2], [],  [AC_MSG_ERROR([$boost_errmsg])],
                         [AC_MSG_NOTICE([$boost_errmsg])])
         $2
         ;;#(
       yes)
         BOOST_CPPFLAGS=
-        AC_DEFINE([HAVE_BOOST], [1],
-                  [Defined if the requested minimum BOOST version is satisfied])
         ;;#(
       *)
-        AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])
+        AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl
         ;;
     esac
+  if test x"$boost_cv_inc_path" != xno; then
+  AC_DEFINE([HAVE_BOOST], [1],
+            [Defined if the requested minimum BOOST version is satisfied])
   AC_CACHE_CHECK([for Boost's header version],
     [boost_cv_lib_version],
     [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl
-     _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;g;}],
+     _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}],
                     [#include <boost/version.hpp>
 boost-lib-version = BOOST_LIB_VERSION],
     [boost_cv_lib_version=`cat conftest.i`])])
@@ -211,6 +217,7 @@ boost-lib-version = BOOST_LIB_VERSION],
         AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version])
         ;;
     esac
+fi
 CPPFLAGS=$boost_save_CPPFLAGS
 ])# BOOST_REQUIRE
 
@@ -220,7 +227,7 @@ CPPFLAGS=$boost_save_CPPFLAGS
 # on the command line, static versions of the libraries will be looked up.
 AC_DEFUN([BOOST_STATIC],
   [AC_ARG_ENABLE([static-boost],
-     [AC_HELP_STRING([--enable-static-boost],
+     [AS_HELP_STRING([--enable-static-boost],
                [Prefer the static boost libraries over the shared ones [no]])],
      [enable_static_boost=yes],
      [enable_static_boost=no])])# BOOST_STATIC
@@ -290,6 +297,7 @@ dnl The else branch is huge and wasn't intended on purpose.
 AC_LANG_PUSH([C++])dnl
 AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl
 AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl
 AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl
 BOOST_FIND_HEADER([$3])
 boost_save_CPPFLAGS=$CPPFLAGS
@@ -371,8 +379,8 @@ for boost_rtopt_ in $boost_rtopt '' -d; do
     boost_tmp_lib=$with_boost
     test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include}
     for boost_ldpath in "$boost_tmp_lib/lib" '' \
-             /opt/local/lib /usr/local/lib /opt/lib /usr/lib \
-             "$with_boost" C:/Boost/lib /lib /usr/lib64 /lib64
+             /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \
+             "$with_boost" C:/Boost/lib /lib*
     do
       test -e "$boost_ldpath" || continue
       boost_save_LDFLAGS=$LDFLAGS
@@ -395,7 +403,16 @@ dnl generated only once above (before we start the for loops).
       LDFLAGS=$boost_save_LDFLAGS
       LIBS=$boost_save_LIBS
       if test x"$Boost_lib" = xyes; then
-        Boost_lib_LDFLAGS="-L$boost_ldpath -R$boost_ldpath"
+        # Because Boost is often installed in non-standard locations we want to
+        # hardcode the path to the library (with rpath).  Here we assume that
+        # Libtool's macro was already invoked so we can steal its variable
+        # hardcode_libdir_flag_spec in order to get the right flags for ld.
+        boost_save_libdir=$libdir
+        libdir=$boost_ldpath
+        eval boost_rpath=\"$hardcode_libdir_flag_spec\"
+        libdir=$boost_save_libdir
+        Boost_lib_LDFLAGS="-L$boost_ldpath $boost_rpath"
+        Boost_lib_LDPATH="$boost_ldpath"
         break 6
       else
         boost_failed_libs="$boost_failed_libs@$boost_lib@"
@@ -410,14 +427,17 @@ rm -f conftest.$ac_objext
 ])
 case $Boost_lib in #(
   no) _AC_MSG_LOG_CONFTEST
-    AC_MSG_ERROR([cannot not find the flags to link with Boost $1])
+    AC_MSG_ERROR([cannot find the flags to link with Boost $1])
     ;;
 esac
-AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])
-AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl
+AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl
 CPPFLAGS=$boost_save_CPPFLAGS
 AS_VAR_POPDEF([Boost_lib])dnl
 AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl
+AS_VAR_POPDEF([Boost_lib_LDPATH])dnl
 AS_VAR_POPDEF([Boost_lib_LIBS])dnl
 AC_LANG_POP([C++])dnl
 fi
@@ -432,17 +452,31 @@ fi
 # The page http://beta.boost.org/doc/libs is useful: it gives the first release
 # version of each library (among other things).
 
+# BOOST_DEFUN(LIBRARY, CODE)
+# --------------------------
+# Define BOOST_<LIBRARY-UPPERCASE> as a macro that runs CODE.
+#
+# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN.
+m4_define([BOOST_DEFUN],
+[m4_indir([AC_DEFUN],
+          m4_toupper([BOOST_$1]),
+[m4_pushdef([BOOST_Library], [$1])dnl
+$2
+m4_popdef([BOOST_Library])dnl
+])
+])
+
 # BOOST_ARRAY()
 # -------------
 # Look for Boost.Array
-AC_DEFUN([BOOST_ARRAY],
+BOOST_DEFUN([Array],
 [BOOST_FIND_HEADER([boost/array.hpp])])
 
 
 # BOOST_ASIO()
 # ------------
 # Look for Boost.Asio (new in Boost 1.35).
-AC_DEFUN([BOOST_ASIO],
+BOOST_DEFUN([Asio],
 [AC_REQUIRE([BOOST_SYSTEM])dnl
 BOOST_FIND_HEADER([boost/asio.hpp])])
 
@@ -450,14 +484,41 @@ BOOST_FIND_HEADER([boost/asio.hpp])])
 # BOOST_BIND()
 # ------------
 # Look for Boost.Bind
-AC_DEFUN([BOOST_BIND],
+BOOST_DEFUN([Bind],
 [BOOST_FIND_HEADER([boost/bind.hpp])])
 
 
+# BOOST_CHRONO()
+# ------------------
+# Look for Boost.Chrono
+BOOST_DEFUN([Chrono],
+[# Do we have to check for Boost.System?  This link-time dependency was
+# added as of 1.35.0.  If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+  BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_system_save_LIBS=$LIBS
+boost_system_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([chrono], [$1],
+               [boost/chrono.hpp],
+               [boost::chrono::system_clock::time_point d = boost::chrono::system_clock::now();])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+    AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"])
+fi
+LIBS=$boost_system_save_LIBS
+LDFLAGS=$boost_system_save_LDFLAGS
+
+])# BOOST_CHRONO
+
+
 # BOOST_CONVERSION()
 # ------------------
 # Look for Boost.Conversion (cast / lexical_cast)
-AC_DEFUN([BOOST_CONVERSION],
+BOOST_DEFUN([Conversion],
 [BOOST_FIND_HEADER([boost/cast.hpp])
 BOOST_FIND_HEADER([boost/lexical_cast.hpp])
 ])# BOOST_CONVERSION
@@ -467,12 +528,31 @@ BOOST_FIND_HEADER([boost/lexical_cast.hpp])
 # -----------------------------------
 # Look for Boost.Date_Time.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_DATE_TIME],
+BOOST_DEFUN([Date_Time],
 [BOOST_FIND_LIB([date_time], [$1],
                 [boost/date_time/posix_time/posix_time.hpp],
                 [boost::posix_time::ptime t;])
 ])# BOOST_DATE_TIME
 
+# BOOST_TIMER([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.Timer.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Timer],
+[#check for Boost.System
+BOOST_SYSTEM([$1])
+boost_system_save_LIBS=$LIBS
+boost_system_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([timer], [$1],
+                [boost/timer/timer.hpp],
+                [boost::timer::auto_cpu_timer t;])
+AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"])
+LIBS=$boost_system_save_LIBS
+LDFLAGS=$boost_system_save_LDFLAGS
+])# BOOST_TIMER
 
 # BOOST_FILESYSTEM([PREFERRED-RT-OPT])
 # ------------------------------------
@@ -480,7 +560,7 @@ AC_DEFUN([BOOST_DATE_TIME],
 # the documentation of BOOST_FIND_LIB above.
 # Do not check for boost/filesystem.hpp because this file was introduced in
 # 1.34.
-AC_DEFUN([BOOST_FILESYSTEM],
+BOOST_DEFUN([Filesystem],
 [# Do we have to check for Boost.System?  This link-time dependency was
 # added as of 1.35.0.  If we have a version <1.35, we must not attempt to
 # find Boost.System as it didn't exist by then.
@@ -494,6 +574,9 @@ LIBS="$LIBS $BOOST_SYSTEM_LIBS"
 LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
 BOOST_FIND_LIB([filesystem], [$1],
                 [boost/filesystem/path.hpp], [boost::filesystem::path p;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+    AC_SUBST([BOOST_FILESYSTEM_LIBS], ["$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"])
+fi
 LIBS=$boost_filesystem_save_LIBS
 LDFLAGS=$boost_filesystem_save_LDFLAGS
 ])# BOOST_FILESYSTEM
@@ -502,7 +585,7 @@ LDFLAGS=$boost_filesystem_save_LDFLAGS
 # BOOST_FOREACH()
 # ---------------
 # Look for Boost.Foreach
-AC_DEFUN([BOOST_FOREACH],
+BOOST_DEFUN([Foreach],
 [BOOST_FIND_HEADER([boost/foreach.hpp])])
 
 
@@ -513,14 +596,14 @@ AC_DEFUN([BOOST_FOREACH],
 # standalone.  It can't be compiled because it triggers the following error:
 # boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std'
 #                                                  does not name a type
-AC_DEFUN([BOOST_FORMAT],
+BOOST_DEFUN([Format],
 [BOOST_FIND_HEADER([boost/format.hpp])])
 
 
 # BOOST_FUNCTION()
 # ----------------
 # Look for Boost.Function
-AC_DEFUN([BOOST_FUNCTION],
+BOOST_DEFUN([Function],
 [BOOST_FIND_HEADER([boost/function.hpp])])
 
 
@@ -528,37 +611,60 @@ AC_DEFUN([BOOST_FUNCTION],
 # -------------------------------
 # Look for Boost.Graphs.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_GRAPH],
+BOOST_DEFUN([Graph],
 [BOOST_FIND_LIB([graph], [$1],
                 [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;])
 ])# BOOST_GRAPH
 
 
 # BOOST_IOSTREAMS([PREFERRED-RT-OPT])
-# -------------------------------
+# -----------------------------------
 # Look for Boost.IOStreams.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_IOSTREAMS],
+BOOST_DEFUN([IOStreams],
 [BOOST_FIND_LIB([iostreams], [$1],
                 [boost/iostreams/device/file_descriptor.hpp],
-                [boost::iostreams::file_descriptor fd(0); fd.close();])
+                [boost::iostreams::file_descriptor fd; fd.close();])
 ])# BOOST_IOSTREAMS
 
 
 # BOOST_HASH()
 # ------------
 # Look for Boost.Functional/Hash
-AC_DEFUN([BOOST_HASH],
+BOOST_DEFUN([Hash],
 [BOOST_FIND_HEADER([boost/functional/hash.hpp])])
 
 
 # BOOST_LAMBDA()
 # --------------
 # Look for Boost.Lambda
-AC_DEFUN([BOOST_LAMBDA],
+BOOST_DEFUN([Lambda],
 [BOOST_FIND_HEADER([boost/lambda/lambda.hpp])])
 
 
+# BOOST_LOG([PREFERRED-RT-OPT])
+# -----------------------------
+# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Log],
+[BOOST_FIND_LIB([log], [$1],
+    [boost/log/core/core.hpp],
+    [boost::log::attribute a; a.get_value();])
+])# BOOST_LOG
+
+
+# BOOST_LOG_SETUP([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Log_Setup],
+[AC_REQUIRE([BOOST_LOG])dnl
+BOOST_FIND_LIB([log_setup], [$1],
+    [boost/log/utility/init/from_settings.hpp],
+    [boost::log::basic_settings<char> bs; bs.empty();])
+])# BOOST_LOG_SETUP
+
+
 # BOOST_MATH()
 # ------------
 # Look for Boost.Math
@@ -567,21 +673,21 @@ AC_DEFUN([BOOST_LAMBDA],
 # libboost_math_c99f, libboost_math_c99l, libboost_math_tr1,
 # libboost_math_tr1f, libboost_math_tr1l).  This macro must be fixed to do the
 # right thing anyway.
-AC_DEFUN([BOOST_MATH],
+BOOST_DEFUN([Math],
 [BOOST_FIND_HEADER([boost/math/special_functions.hpp])])
 
 
 # BOOST_MULTIARRAY()
 # ------------------
 # Look for Boost.MultiArray
-AC_DEFUN([BOOST_MULTIARRAY],
+BOOST_DEFUN([MultiArray],
 [BOOST_FIND_HEADER([boost/multi_array.hpp])])
 
 
 # BOOST_NUMERIC_CONVERSION()
 # --------------------------
 # Look for Boost.NumericConversion (policy-based numeric conversion)
-AC_DEFUN([BOOST_NUMERIC_CONVERSION],
+BOOST_DEFUN([Numeric_Conversion],
 [BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp])
 ])# BOOST_NUMERIC_CONVERSION
 
@@ -589,32 +695,76 @@ AC_DEFUN([BOOST_NUMERIC_CONVERSION],
 # BOOST_OPTIONAL()
 # ----------------
 # Look for Boost.Optional
-AC_DEFUN([BOOST_OPTIONAL],
+BOOST_DEFUN([Optional],
 [BOOST_FIND_HEADER([boost/optional.hpp])])
 
 
 # BOOST_PREPROCESSOR()
 # --------------------
 # Look for Boost.Preprocessor
-AC_DEFUN([BOOST_PREPROCESSOR],
+BOOST_DEFUN([Preprocessor],
 [BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])])
 
 
+# BOOST_UNORDERED()
+# -----------------
+# Look for Boost.Unordered
+BOOST_DEFUN([Unordered],
+[BOOST_FIND_HEADER([boost/unordered_map.hpp])])
+
+
+# BOOST_UUID()
+# ------------
+# Look for Boost.Uuid
+BOOST_DEFUN([Uuid],
+[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])])
+
+
 # BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT])
 # -----------------------------------------
-# Look for Boost.Program_options.  For the documentation of PREFERRED-RT-OPT, see
-# the documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_PROGRAM_OPTIONS],
+# Look for Boost.Program_options.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Program_Options],
 [BOOST_FIND_LIB([program_options], [$1],
                 [boost/program_options.hpp],
                 [boost::program_options::options_description d("test");])
 ])# BOOST_PROGRAM_OPTIONS
 
 
+
+# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG)
+# ------------------------------------
+# Save VARIABLE, and define it via `python-config --FLAG`.
+# Substitute BOOST_PYTHON_VARIABLE.
+m4_define([_BOOST_PYTHON_CONFIG],
+[AC_SUBST([BOOST_PYTHON_$1],
+          [`python-config --$2 2>/dev/null`])dnl
+boost_python_save_$1=$$1
+$1="$$1 $BOOST_PYTHON_$1"])
+
+
+# BOOST_PYTHON([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Python.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Python],
+[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes])
+_BOOST_PYTHON_CONFIG([LDFLAGS],   [ldflags])
+_BOOST_PYTHON_CONFIG([LIBS],      [libs])
+m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl
+BOOST_FIND_LIB([python], [$1],
+               [boost/python.hpp],
+               [], [BOOST_PYTHON_MODULE(empty) {}])
+CPPFLAGS=$boost_python_save_CPPFLAGS
+LDFLAGS=$boost_python_save_LDFLAGS
+LIBS=$boost_python_save_LIBS
+])# BOOST_PYTHON
+
+
 # BOOST_REF()
 # -----------
 # Look for Boost.Ref
-AC_DEFUN([BOOST_REF],
+BOOST_DEFUN([Ref],
 [BOOST_FIND_HEADER([boost/ref.hpp])])
 
 
@@ -622,7 +772,7 @@ AC_DEFUN([BOOST_REF],
 # -------------------------------
 # Look for Boost.Regex.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_REGEX],
+BOOST_DEFUN([Regex],
 [BOOST_FIND_LIB([regex], [$1],
                 [boost/regex.hpp],
                 [boost::regex exp("*"); boost::regex_match("foo", exp);])
@@ -633,19 +783,19 @@ AC_DEFUN([BOOST_REGEX],
 # ---------------------------------------
 # Look for Boost.Serialization.  For the documentation of PREFERRED-RT-OPT, see
 # the documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_SERIALIZATION],
+BOOST_DEFUN([Serialization],
 [BOOST_FIND_LIB([serialization], [$1],
                 [boost/archive/text_oarchive.hpp],
                 [std::ostream* o = 0; // Cheap way to get an ostream...
                 boost::archive::text_oarchive t(*o);])
-])# BOOST_SIGNALS
+])# BOOST_SERIALIZATION
 
 
 # BOOST_SIGNALS([PREFERRED-RT-OPT])
 # ---------------------------------
 # Look for Boost.Signals.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_SIGNALS],
+BOOST_DEFUN([Signals],
 [BOOST_FIND_LIB([signals], [$1],
                 [boost/signal.hpp],
                 [boost::signal<void ()> s;])
@@ -655,7 +805,7 @@ AC_DEFUN([BOOST_SIGNALS],
 # BOOST_SMART_PTR()
 # -----------------
 # Look for Boost.SmartPtr
-AC_DEFUN([BOOST_SMART_PTR],
+BOOST_DEFUN([Smart_Ptr],
 [BOOST_FIND_HEADER([boost/scoped_ptr.hpp])
 BOOST_FIND_HEADER([boost/shared_ptr.hpp])
 ])
@@ -664,14 +814,14 @@ BOOST_FIND_HEADER([boost/shared_ptr.hpp])
 # BOOST_STATICASSERT()
 # --------------------
 # Look for Boost.StaticAssert
-AC_DEFUN([BOOST_STATICASSERT],
+BOOST_DEFUN([StaticAssert],
 [BOOST_FIND_HEADER([boost/static_assert.hpp])])
 
 
 # BOOST_STRING_ALGO()
 # -------------------
 # Look for Boost.StringAlgo
-AC_DEFUN([BOOST_STRING_ALGO],
+BOOST_DEFUN([String_Algo],
 [BOOST_FIND_HEADER([boost/algorithm/string.hpp])
 ])
 
@@ -681,7 +831,7 @@ AC_DEFUN([BOOST_STRING_ALGO],
 # Look for Boost.System.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.  This library was introduced in Boost
 # 1.35.0.
-AC_DEFUN([BOOST_SYSTEM],
+BOOST_DEFUN([System],
 [BOOST_FIND_LIB([system], [$1],
                 [boost/system/error_code.hpp],
                 [boost::system::error_code e; e.clear();])
@@ -692,7 +842,7 @@ AC_DEFUN([BOOST_SYSTEM],
 # ------------------------------
 # Look for Boost.Test.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_TEST],
+BOOST_DEFUN([Test],
 [m4_pattern_allow([^BOOST_CHECK$])dnl
 BOOST_FIND_LIB([unit_test_framework], [$1],
                [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);],
@@ -707,25 +857,49 @@ BOOST_FIND_LIB([unit_test_framework], [$1],
 # Look for Boost.Thread.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
 # FIXME: Provide an alias "BOOST_THREAD".
-AC_DEFUN([BOOST_THREADS],
+BOOST_DEFUN([Threads],
 [dnl Having the pthread flag is required at least on GCC3 where
 dnl boost/thread.hpp would complain if we try to compile without
 dnl -pthread on GNU/Linux.
 AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
 boost_threads_save_LIBS=$LIBS
+boost_threads_save_LDFLAGS=$LDFLAGS
 boost_threads_save_CPPFLAGS=$CPPFLAGS
-LIBS="$LIBS $boost_cv_pthread_flag"
+# Link-time dependency from thread to system was added as of 1.49.0.
+if test $boost_major_version -ge 149; then
+BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
 # Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3,
 # boost/thread.hpp will trigger a #error if -pthread isn't used:
 #   boost/config/requires_threads.hpp:47:5: #error "Compiler threading support
 #   is not turned on. Please set the correct command line options for
 #   threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)"
 CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag"
-BOOST_FIND_LIB([thread], [$1],
-                [boost/thread.hpp], [boost::thread t; boost::mutex m;])
-BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $boost_cv_pthread_flag"
+
+# When compiling for the Windows platform, the threads library is named
+# differently.
+case $host_os in
+  (*mingw*)
+    BOOST_FIND_LIB([thread_win32], [$1],
+                   [boost/thread.hpp], [boost::thread t; boost::mutex m;])
+    BOOST_THREAD_LDFLAGS=$BOOST_THREAD_WIN32_LDFLAGS
+    BOOST_THREAD_LDPATH=$BOOST_THREAD_WIN32_LDPATH
+    BOOST_THREAD_LIBS=$BOOST_THREAD_WIN32_LIBS
+  ;;
+  (*)
+    BOOST_FIND_LIB([thread], [$1],
+                   [boost/thread.hpp], [boost::thread t; boost::mutex m;])
+  ;;
+esac
+
+BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+BOOST_THREAD_LDFLAGS="$BOOST_SYSTEM_LDFLAGS"
 BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag"
 LIBS=$boost_threads_save_LIBS
+LDFLAGS=$boost_threads_save_LDFLAGS
 CPPFLAGS=$boost_threads_save_CPPFLAGS
 ])# BOOST_THREADS
 
@@ -733,14 +907,14 @@ CPPFLAGS=$boost_threads_save_CPPFLAGS
 # BOOST_TOKENIZER()
 # -----------------
 # Look for Boost.Tokenizer
-AC_DEFUN([BOOST_TOKENIZER],
+BOOST_DEFUN([Tokenizer],
 [BOOST_FIND_HEADER([boost/tokenizer.hpp])])
 
 
 # BOOST_TRIBOOL()
 # ---------------
 # Look for Boost.Tribool
-AC_DEFUN([BOOST_TRIBOOL],
+BOOST_DEFUN([Tribool],
 [BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp])
 BOOST_FIND_HEADER([boost/logic/tribool.hpp])
 ])
@@ -749,14 +923,14 @@ BOOST_FIND_HEADER([boost/logic/tribool.hpp])
 # BOOST_TUPLE()
 # -------------
 # Look for Boost.Tuple
-AC_DEFUN([BOOST_TUPLE],
+BOOST_DEFUN([Tuple],
 [BOOST_FIND_HEADER([boost/tuple/tuple.hpp])])
 
 
 # BOOST_TYPETRAITS()
 # --------------------
 # Look for Boost.TypeTraits
-AC_DEFUN([BOOST_TYPETRAITS],
+BOOST_DEFUN([TypeTraits],
 [BOOST_FIND_HEADER([boost/type_traits.hpp])])
 
 
@@ -764,14 +938,14 @@ AC_DEFUN([BOOST_TYPETRAITS],
 # ---------------
 # Look for Boost.Utility (noncopyable, result_of, base-from-member idiom,
 # etc.)
-AC_DEFUN([BOOST_UTILITY],
+BOOST_DEFUN([Utility],
 [BOOST_FIND_HEADER([boost/utility.hpp])])
 
 
 # BOOST_VARIANT()
 # ---------------
 # Look for Boost.Variant.
-AC_DEFUN([BOOST_VARIANT],
+BOOST_DEFUN([Variant],
 [BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp])
 BOOST_FIND_HEADER([boost/variant.hpp])])
 
@@ -782,15 +956,15 @@ BOOST_FIND_HEADER([boost/variant.hpp])])
 # call BOOST_THREADS first.
 # Look for Boost.Wave.  For the documentation of PREFERRED-RT-OPT, see the
 # documentation of BOOST_FIND_LIB above.
-AC_DEFUN([BOOST_WAVE],
+BOOST_DEFUN([Wave],
 [AC_REQUIRE([BOOST_FILESYSTEM])dnl
 AC_REQUIRE([BOOST_DATE_TIME])dnl
 boost_wave_save_LIBS=$LIBS
 boost_wave_save_LDFLAGS=$LDFLAGS
 m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl
-LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS\
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \
 $BOOST_THREAD_LIBS"
-LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS\
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \
 $BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS"
 BOOST_FIND_LIB([wave], [$1],
                 [boost/wave.hpp],
@@ -803,7 +977,7 @@ LDFLAGS=$boost_wave_save_LDFLAGS
 # BOOST_XPRESSIVE()
 # -----------------
 # Look for Boost.Xpressive (new since 1.36.0).
-AC_DEFUN([BOOST_XPRESSIVE],
+BOOST_DEFUN([Xpressive],
 [BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])])
 
 
@@ -893,8 +1067,9 @@ AC_DEFUN([_BOOST_FIND_COMPILER_TAG],
 [AC_REQUIRE([AC_PROG_CXX])dnl
 AC_REQUIRE([AC_CANONICAL_HOST])dnl
 AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag],
-[AC_LANG_PUSH([C++])dnl
-  boost_cv_lib_tag=unknown
+[boost_cv_lib_tag=unknown
+if test x$boost_cv_inc_path != xno; then
+  AC_LANG_PUSH([C++])dnl
   # The following tests are mostly inspired by boost/config/auto_link.hpp
   # The list is sorted to most recent/common to oldest compiler (in order
   # to increase the likelihood of finding the right compiler with the
@@ -908,8 +1083,12 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag]
   #   como, edg, kcc, bck, mp, sw, tru, xlc
   # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines
   # the same defines as GCC's).
-  # TODO: Move the test on GCC 4.4 up once it's released.
   for i in \
+    _BOOST_gcc_test(4, 8) \
+    _BOOST_gcc_test(4, 7) \
+    _BOOST_gcc_test(4, 6) \
+    _BOOST_gcc_test(4, 5) \
+    _BOOST_gcc_test(4, 4) \
     _BOOST_gcc_test(4, 3) \
     _BOOST_gcc_test(4, 2) \
     _BOOST_gcc_test(4, 1) \
@@ -929,7 +1108,6 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag]
     "defined __ICC && (defined __unix || defined __unix__) @ il" \
     "defined __ICL @ iw" \
     "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \
-    _BOOST_gcc_test(4, 4) \
     _BOOST_gcc_test(2, 95) \
     "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \
     "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \
@@ -969,7 +1147,7 @@ AC_LANG_POP([C++])dnl
       boost_cv_lib_tag=
       ;;
   esac
-])dnl end of AC_CACHE_CHECK
+fi])dnl end of AC_CACHE_CHECK
 ])# _BOOST_FIND_COMPILER_TAG
 
 
-- 
cgit v1.2.3


From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Sun, 20 Jan 2013 12:31:03 +0000
Subject: Better delimiters, cross-platform fixes

---
 klm/lm/builder/corpus_count.cc |  3 ++-
 klm/lm/filter/arpa_io.cc       | 36 +++++++++++-------------------------
 klm/lm/filter/arpa_io.hh       | 27 ++++++++++-----------------
 klm/util/stream/sort.hh        |  5 +++--
 klm/util/stream/timer.hh       |  8 +++++---
 5 files changed, 31 insertions(+), 48 deletions(-)

(limited to 'klm/util/stream')

diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index 8c3de57d..abea4ed0 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
   const WordIndex end_sentence = vocab.Lookup("</s>");
   Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
   uint64_t count = 0;
+  StringPiece delimiters("\0\t\r ", 4);
   try {
     while(true) {
       StringPiece line(from_.ReadLine());
       writer.StartSentence();
-      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+      for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
         WordIndex word = vocab.Lookup(*w);
         UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");
         writer.Append(word);
diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc
index caf8df95..f8568ac4 100644
--- a/klm/lm/filter/arpa_io.cc
+++ b/klm/lm/filter/arpa_io.cc
@@ -12,38 +12,24 @@
 
 namespace lm {
 
-ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") {
-  what_.append(message.data(), message.size());
+ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
+  *this << message;
 }
 
 ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
-  what_ = "Error: ";
-  what_.append(message.data(), message.size());
-  what_ += " in line '";
-  what_.append(line.data(), line.size());
-  what_ += "'.";
+  *this << message << " in line " << line;
 }
 
-ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw()
-  : what_(std::string(message) + " file " + file_name), file_name_(file_name) {
-  if (errno) {
-    char buf[1024];
-    buf[0] = 0;
-#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
-    const char *add = buf;
-    if (!strerror_r(errno, buf, 1024)) {
-#else
-    const char *add = strerror_r(errno, buf, 1024);
-    if (add) {
-#endif
-      what_ += " :";
-      what_ += add;
-    }
-  }
+ARPAInputException::~ARPAInputException() throw() {}
+
+ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() {
+  *this << message << " in file " << file_name;
 }
 
+ARPAOutputException::~ARPAOutputException() throw() {}
+
 // Seeking is the responsibility of the caller.
-void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
+void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
   out << "\n\\data\\\n";
   for (unsigned int i = 0; i < number.size(); ++i) {
     out << "ngram " << i+1 << "=" << number[i] << '\n';
@@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
   out << '\n';
 }
 
-size_t SizeNeededForCounts(const std::vector<size_t> &number) {
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
   std::ostringstream buf;
   WriteCounts(buf, number);
   return buf.tellp();
diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh
index 90f48447..5b31620b 100644
--- a/klm/lm/filter/arpa_io.hh
+++ b/klm/lm/filter/arpa_io.hh
@@ -16,6 +16,7 @@
 
 #include <err.h>
 #include <string.h>
+#include <stdint.h>
 
 namespace util { class FilePiece; }
 
@@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception {
   public:
     explicit ARPAInputException(const StringPiece &message) throw();
     explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
-    virtual ~ARPAInputException() throw() {}
-
-    const char *what() const throw() { return what_.c_str(); }
-
-  private:
-    std::string what_;
+    virtual ~ARPAInputException() throw();
 };
 
-class ARPAOutputException : public std::exception {
+class ARPAOutputException : public util::ErrnoException {
   public:
     ARPAOutputException(const char *prefix, const std::string &file_name) throw();
-    virtual ~ARPAOutputException() throw() {}
-
-    const char *what() const throw() { return what_.c_str(); }
+    virtual ~ARPAOutputException() throw();
 
     const std::string &File() const throw() { return file_name_; }
 
   private:
-    std::string what_;
     const std::string file_name_;
 };
 
 // Handling for the counts of n-grams at the beginning of ARPA files.
-size_t SizeNeededForCounts(const std::vector<size_t> &number);
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
 
 /* Writes an ARPA file.  This has to be seekable so the counts can be written
  * at the end.  Hence, I just have it own a std::fstream instead of accepting
- * a separately held std::ostream.  
+ * a separately held std::ostream.  TODO: use the fast one from estimation.
  */
 class ARPAOutput : boost::noncopyable {
   public:
@@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable {
     boost::scoped_array<char> buffer_;
     std::fstream file_;
     size_t fast_counter_;
-    std::vector<size_t> counts_;
+    std::vector<uint64_t> counts_;
 };
 
 
-template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) {
+template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
   ReadNGramHeader(in, length);
   out.BeginLength(length);
-  for (size_t i = 0; i < number; ++i) {
+  for (uint64_t i = 0; i < number; ++i) {
     StringPiece line = in.ReadLine();
     util::TokenIter<util::SingleCharacter> tabber(line, '\t');
     if (!tabber) throw ARPAInputException("blank line", line);
@@ -107,7 +100,7 @@ template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length
 }
 
 template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
-  std::vector<size_t> number;
+  std::vector<uint64_t> number;
   ReadARPACounts(in_lm, number);
   out.ReserveForCounts(SizeNeededForCounts(number));
   for (unsigned int i = 0; i < number.size(); ++i) {
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
index df57fa41..a86f160f 100644
--- a/klm/util/stream/sort.hh
+++ b/klm/util/stream/sort.hh
@@ -259,8 +259,9 @@ template <class Compare, class Combine> class MergingReader {
 
       while (in_offsets_->RemainingBlocks()) {
         // Use bigger buffers if there's less remaining.
-        uint64_t per_buffer = std::max(static_cast<uint64_t>(buffer_size_),
-                                       static_cast<uint64_t>(total_memory_ / in_offsets_->RemainingBlocks()));
+        uint64_t per_buffer = static_cast<uint64_t>(std::max<std::size_t>(
+            buffer_size_,
+            static_cast<std::size_t>((static_cast<uint64_t>(total_memory_) / in_offsets_->RemainingBlocks()))));
         per_buffer -= per_buffer % entry_size;
         assert(per_buffer);
 
diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh
index 50e94fe8..7e1a5885 100644
--- a/klm/util/stream/timer.hh
+++ b/klm/util/stream/timer.hh
@@ -1,14 +1,16 @@
 #ifndef UTIL_STREAM_TIMER__
 #define UTIL_STREAM_TIMER__
 
-#include <boost/version.hpp>
+// Sorry Jon, this was adding library dependencies in Moses and people complained.
+
+/*#include <boost/version.hpp>
 
 #if BOOST_VERSION >= 104800
 #include <boost/timer/timer.hpp>
 #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str))
 #else
-//#warning Using Boost older than 1.48. Timing information will not be available.
+//#warning Using Boost older than 1.48. Timing information will not be available.*/
 #define UTIL_TIMER(str) 
-#endif
+//#endif
 
 #endif // UTIL_STREAM_TIMER__
-- 
cgit v1.2.3