summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
Diffstat (limited to 'klm/util')
-rw-r--r--klm/util/Makefile.am18
-rw-r--r--klm/util/ersatz_progress.cc47
-rw-r--r--klm/util/ersatz_progress.hh50
-rw-r--r--klm/util/exception.cc35
-rw-r--r--klm/util/exception.hh72
-rw-r--r--klm/util/file_piece.cc224
-rw-r--r--klm/util/file_piece.hh105
-rw-r--r--klm/util/file_piece_test.cc41
-rw-r--r--klm/util/joint_sort.hh145
-rw-r--r--klm/util/joint_sort_test.cc50
-rw-r--r--klm/util/key_value_packing.hh122
-rw-r--r--klm/util/key_value_packing_test.cc75
-rw-r--r--klm/util/mmap.cc95
-rw-r--r--klm/util/mmap.hh101
-rw-r--r--klm/util/murmur_hash.cc129
-rw-r--r--klm/util/murmur_hash.hh14
-rw-r--r--klm/util/probing_hash_table.hh97
-rw-r--r--klm/util/probing_hash_table_test.cc30
-rw-r--r--klm/util/proxy_iterator.hh94
-rw-r--r--klm/util/scoped.cc12
-rw-r--r--klm/util/scoped.hh66
-rw-r--r--klm/util/sorted_uniform.hh139
-rw-r--r--klm/util/sorted_uniform_test.cc116
-rw-r--r--klm/util/string_piece.cc57
-rw-r--r--klm/util/string_piece.hh260
25 files changed, 2194 insertions, 0 deletions
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
new file mode 100644
index 00000000..d3aea6b7
--- /dev/null
+++ b/klm/util/Makefile.am
@@ -0,0 +1,18 @@
+if HAVE_GTEST
+noinst_PROGRAMS = \
+ scorer_test
+TESTS = scorer_test
+endif
+
+noinst_LIBRARIES = libklm_util.a
+
+libklm_util_a_SOURCES = \
+ ersatz_progress.cc \
+ exception.cc \
+ file_piece.cc \
+ mmap.cc \
+ murmur_hash.cc \
+ scoped.cc \
+ string_piece.cc
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I..
diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc
new file mode 100644
index 00000000..09e3a106
--- /dev/null
+++ b/klm/util/ersatz_progress.cc
@@ -0,0 +1,47 @@
+#include "util/ersatz_progress.hh"
+
+#include <algorithm>
+#include <ostream>
+#include <limits>
+#include <string>
+
+namespace util {
+
+namespace { const unsigned char kWidth = 100; }
+
+ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::size_t>::max()), complete_(next_), out_(NULL) {}
+
+ErsatzProgress::~ErsatzProgress() {
+ if (!out_) return;
+ for (; stones_written_ < kWidth; ++stones_written_) {
+ (*out_) << '*';
+ }
+ *out_ << '\n';
+}
+
+ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete)
+ : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
+ if (!out_) {
+ next_ = std::numeric_limits<std::size_t>::max();
+ return;
+ }
+ *out_ << message << "\n----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
+}
+
+void ErsatzProgress::Milestone() {
+ if (!out_) { current_ = 0; return; }
+ if (!complete_) return;
+ unsigned char stone = std::min(static_cast<std::size_t>(kWidth), (current_ * kWidth) / complete_);
+
+ for (; stones_written_ < stone; ++stones_written_) {
+ (*out_) << '*';
+ }
+
+ if (current_ >= complete_) {
+ next_ = std::numeric_limits<std::size_t>::max();
+ } else {
+ next_ = std::max(next_, (stone * complete_) / kWidth);
+ }
+}
+
+} // namespace util
diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh
new file mode 100644
index 00000000..ea6c3bb9
--- /dev/null
+++ b/klm/util/ersatz_progress.hh
@@ -0,0 +1,50 @@
+#ifndef UTIL_ERSATZ_PROGRESS__
+#define UTIL_ERSATZ_PROGRESS__
+
+#include <iosfwd>
+#include <string>
+
+// Ersatz version of boost::progress so core language model doesn't depend on
+// boost. Also adds option to print nothing.
+
+namespace util {
+class ErsatzProgress {
+ public:
+ // No output.
+ ErsatzProgress();
+
+ // Null means no output. The null value is useful for passing along the ostream pointer from another caller.
+ ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete);
+
+ ~ErsatzProgress();
+
+ ErsatzProgress &operator++() {
+ if (++current_ == next_) Milestone();
+ return *this;
+ }
+
+ ErsatzProgress &operator+=(std::size_t amount) {
+ if ((current_ += amount) >= next_) Milestone();
+ return *this;
+ }
+
+ void Set(std::size_t to) {
+ if ((current_ = to) >= next_) Milestone();
+ Milestone();
+ }
+
+ private:
+ void Milestone();
+
+ std::size_t current_, next_, complete_;
+ unsigned char stones_written_;
+ std::ostream *out_;
+
+ // noncopyable
+ ErsatzProgress(const ErsatzProgress &other);
+ ErsatzProgress &operator=(const ErsatzProgress &other);
+};
+
+} // namespace util
+
+#endif // UTIL_ERSATZ_PROGRESS__
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
new file mode 100644
index 00000000..dd337a76
--- /dev/null
+++ b/klm/util/exception.cc
@@ -0,0 +1,35 @@
+#include "util/exception.hh"
+
+#include <errno.h>
+#include <string.h>
+
+namespace util {
+
+Exception::Exception() throw() {}
+Exception::~Exception() throw() {}
+
+namespace {
+// The XOPEN version.
+const char *HandleStrerror(int ret, const char *buf) {
+ if (!ret) return buf;
+ return NULL;
+}
+
+// The GNU version.
+const char *HandleStrerror(const char *ret, const char *buf) {
+ return ret;
+}
+} // namespace
+
+ErrnoException::ErrnoException() throw() : errno_(errno) {
+ char buf[200];
+ buf[0] = 0;
+ const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
+ if (add) {
+ *this << add << ' ';
+ }
+}
+
+ErrnoException::~ErrnoException() throw() {}
+
+} // namespace util
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
new file mode 100644
index 00000000..124689cf
--- /dev/null
+++ b/klm/util/exception.hh
@@ -0,0 +1,72 @@
+#ifndef UTIL_EXCEPTION__
+#define UTIL_EXCEPTION__
+
+#include "util/string_piece.hh"
+
+#include <exception>
+#include <sstream>
+#include <string>
+
+namespace util {
+
+class Exception : public std::exception {
+ public:
+ Exception() throw();
+ virtual ~Exception() throw();
+
+ const char *what() const throw() { return what_.c_str(); }
+
+ // This helps restrict operator<< defined below.
+ template <class T> struct ExceptionTag {
+ typedef T Identity;
+ };
+
+ std::string &Str() {
+ return what_;
+ }
+
+ protected:
+ std::string what_;
+};
+
+/* This implements the normal operator<< for Exception and all its children.
+ * SNIFAE means it only applies to Exception. Think of this as an ersatz
+ * boost::enable_if.
+ */
+template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
+ // Argh I had a stringstream in the exception, but the only way to get the string is by calling str(). But that's a temporary string, so virtual const char *what() const can't actually return it.
+ std::stringstream stream;
+ stream << data;
+ e.Str() += stream.str();
+ return e;
+}
+template <class Except> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const char *data) {
+ e.Str() += data;
+ return e;
+}
+template <class Except> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const std::string &data) {
+ e.Str() += data;
+ return e;
+}
+template <class Except> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const StringPiece &str) {
+ e.Str().append(str.data(), str.length());
+ return e;
+}
+
+#define UTIL_THROW(Exception, Modify) { Exception UTIL_e; {UTIL_e << Modify;} throw UTIL_e; }
+
+class ErrnoException : public Exception {
+ public:
+ ErrnoException() throw();
+
+ virtual ~ErrnoException() throw();
+
+ int Error() { return errno_; }
+
+ private:
+ int errno_;
+};
+
+} // namespace util
+
+#endif // UTIL_EXCEPTION__
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
new file mode 100644
index 00000000..2b439499
--- /dev/null
+++ b/klm/util/file_piece.cc
@@ -0,0 +1,224 @@
+#include "util/file_piece.hh"
+
+#include "util/exception.hh"
+
+#include <iostream>
+#include <string>
+#include <limits>
+
+#include <assert.h>
+#include <cstdlib>
+#include <ctype.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace util {
+
+EndOfFileException::EndOfFileException() throw() {
+ *this << "End of file";
+}
+EndOfFileException::~EndOfFileException() throw() {}
+
+ParseNumberException::ParseNumberException(StringPiece value) throw() {
+ *this << "Could not parse \"" << value << "\" into a float";
+}
+
+int OpenReadOrThrow(const char *name) {
+ int ret = open(name, O_RDONLY);
+ if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading");
+ return ret;
+}
+
+off_t SizeFile(int fd) {
+ struct stat sb;
+ if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
+ return sb.st_size;
+}
+
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
+ file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+ progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
+ Initialize(name, show_progress, min_buffer);
+}
+
+FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_t min_buffer) :
+ file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+ progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
+ Initialize(name, show_progress, min_buffer);
+}
+
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
+ if (total_size_ == kBadSize) {
+ fallback_to_read_ = true;
+ if (show_progress)
+ *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
+ } else {
+ fallback_to_read_ = false;
+ }
+ default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
+ position_ = NULL;
+ position_end_ = NULL;
+ mapped_offset_ = 0;
+ at_end_ = false;
+ Shift();
+}
+
+float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
+ SkipSpaces();
+ while (last_space_ < position_) {
+ if (at_end_) {
+ // Hallucinate a null off the end of the file.
+ std::string buffer(position_, position_end_);
+ char *end;
+ float ret = std::strtof(buffer.c_str(), &end);
+ if (buffer.c_str() == end) throw ParseNumberException(buffer);
+ position_ += end - buffer.c_str();
+ return ret;
+ }
+ Shift();
+ }
+ char *end;
+ float ret = std::strtof(position_, &end);
+ if (end == position_) throw ParseNumberException(ReadDelimited());
+ position_ = end;
+ return ret;
+}
+
+void FilePiece::SkipSpaces() throw (EndOfFileException) {
+ for (; ; ++position_) {
+ if (position_ == position_end_) Shift();
+ if (!isspace(*position_)) return;
+ }
+}
+
+const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
+ for (const char *i = position_; i <= last_space_; ++i) {
+ if (isspace(*i)) return i;
+ }
+ while (!at_end_) {
+ size_t skip = position_end_ - position_;
+ Shift();
+ for (const char *i = position_ + skip; i <= last_space_; ++i) {
+ if (isspace(*i)) return i;
+ }
+ }
+ return position_end_;
+}
+
+StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
+ const char *start = position_;
+ do {
+ for (const char *i = start; i < position_end_; ++i) {
+ if (*i == delim) {
+ StringPiece ret(position_, i - position_);
+ position_ = i + 1;
+ return ret;
+ }
+ }
+ size_t skip = position_end_ - position_;
+ Shift();
+ start = position_ + skip;
+ } while (!at_end_);
+ StringPiece ret(position_, position_end_ - position_);
+ position_ = position_end_;
+ return position_;
+}
+
+void FilePiece::Shift() throw(EndOfFileException) {
+ if (at_end_) throw EndOfFileException();
+ off_t desired_begin = position_ - data_.begin() + mapped_offset_;
+ progress_.Set(desired_begin);
+
+ if (!fallback_to_read_) MMapShift(desired_begin);
+ // Notice an mmap failure might set the fallback.
+ if (fallback_to_read_) ReadShift(desired_begin);
+
+ for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
+ if (isspace(*last_space_)) break;
+ }
+}
+
+void FilePiece::MMapShift(off_t desired_begin) throw() {
+ // Use mmap.
+ off_t ignore = desired_begin % page_;
+ // Duplicate request for Shift means give more data.
+ if (position_ == data_.begin() + ignore) {
+ default_map_size_ *= 2;
+ }
+ // Local version so that in case of failure it doesn't overwrite the class variable.
+ off_t mapped_offset = desired_begin - ignore;
+
+ off_t mapped_size;
+ if (default_map_size_ >= static_cast<size_t>(total_size_ - mapped_offset)) {
+ at_end_ = true;
+ mapped_size = total_size_ - mapped_offset;
+ } else {
+ mapped_size = default_map_size_;
+ }
+
+ // Forcibly clear the existing mmap first.
+ data_.reset();
+ data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
+ if (data_.get() == MAP_FAILED) {
+ fallback_to_read_ = true;
+ if (desired_begin) {
+ if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
+ }
+ return;
+ }
+ mapped_offset_ = mapped_offset;
+ position_ = data_.begin() + ignore;
+ position_end_ = data_.begin() + mapped_size;
+}
+
+void FilePiece::ReadShift(off_t desired_begin) throw() {
+ assert(fallback_to_read_);
+ if (data_.source() != scoped_memory::MALLOC_ALLOCATED) {
+ // First call.
+ data_.reset();
+ data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
+ if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
+ position_ = data_.begin();
+ position_end_ = position_;
+ }
+
+ // Bytes [data_.begin(), position_) have been consumed.
+ // Bytes [position_, position_end_) have been read into the buffer.
+
+ // Start at the beginning of the buffer if there's nothing useful in it.
+ if (position_ == position_end_) {
+ mapped_offset_ += (position_end_ - data_.begin());
+ position_ = data_.begin();
+ position_end_ = position_;
+ }
+
+ std::size_t already_read = position_end_ - data_.begin();
+
+ if (already_read == default_map_size_) {
+ if (position_ == data_.begin()) {
+ // Buffer too small.
+ std::size_t valid_length = position_end_ - position_;
+ default_map_size_ *= 2;
+ data_.call_realloc(default_map_size_);
+ if (!data_.get()) UTIL_THROW(ErrnoException, "realloc failed for " << default_map_size_);
+ position_ = data_.begin();
+ position_end_ = position_ + valid_length;
+ } else {
+ size_t moving = position_end_ - position_;
+ memmove(data_.get(), position_, moving);
+ position_ = data_.begin();
+ position_end_ = position_ + moving;
+ already_read = moving;
+ }
+ }
+
+ ssize_t read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
+ if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
+ if (read_return == 0) at_end_ = true;
+ position_end_ += read_return;
+}
+
+} // namespace util
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
new file mode 100644
index 00000000..704f0ac6
--- /dev/null
+++ b/klm/util/file_piece.hh
@@ -0,0 +1,105 @@
+#ifndef UTIL_FILE_PIECE__
+#define UTIL_FILE_PIECE__
+
+#include "util/ersatz_progress.hh"
+#include "util/exception.hh"
+#include "util/mmap.hh"
+#include "util/scoped.hh"
+#include "util/string_piece.hh"
+
+#include <string>
+
+#include <cstddef>
+
+namespace util {
+
+class EndOfFileException : public Exception {
+ public:
+ EndOfFileException() throw();
+ ~EndOfFileException() throw();
+};
+
+class ParseNumberException : public Exception {
+ public:
+ explicit ParseNumberException(StringPiece value) throw();
+ ~ParseNumberException() throw() {}
+};
+
+int OpenReadOrThrow(const char *name);
+
+// Return value for SizeFile when it can't size properly.
+const off_t kBadSize = -1;
+off_t SizeFile(int fd);
+
+class FilePiece {
+ public:
+ // 32 MB default.
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ // Takes ownership of fd. name is used for messages.
+ explicit FilePiece(const char *name, int fd, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+
+ char get() throw(EndOfFileException) {
+ if (position_ == position_end_) Shift();
+ return *(position_++);
+ }
+
+ // Memory backing the returned StringPiece may vanish on the next call.
+ // Leaves the delimiter, if any, to be returned by get().
+ StringPiece ReadDelimited() throw(EndOfFileException) {
+ SkipSpaces();
+ return Consume(FindDelimiterOrEOF());
+ }
+ // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
+ // It is similar to getline in that way.
+ StringPiece ReadLine(char delim = '\n') throw(EndOfFileException);
+
+ float ReadFloat() throw(EndOfFileException, ParseNumberException);
+
+ void SkipSpaces() throw (EndOfFileException);
+
+ off_t Offset() const {
+ return position_ - data_.begin() + mapped_offset_;
+ }
+
+ // Only for testing.
+ void ForceFallbackToRead() {
+ fallback_to_read_ = true;
+ }
+
+ private:
+ void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
+
+ StringPiece Consume(const char *to) {
+ StringPiece ret(position_, to - position_);
+ position_ = to;
+ return ret;
+ }
+
+ const char *FindDelimiterOrEOF() throw(EndOfFileException);
+
+ void Shift() throw (EndOfFileException);
+ // Backends to Shift().
+ void MMapShift(off_t desired_begin) throw ();
+ void ReadShift(off_t desired_begin) throw ();
+
+ const char *position_, *last_space_, *position_end_;
+
+ scoped_fd file_;
+ const off_t total_size_;
+ const off_t page_;
+
+ size_t default_map_size_;
+ off_t mapped_offset_;
+
+ // Order matters: file_ should always be destroyed after this.
+ scoped_memory data_;
+
+ bool at_end_;
+ bool fallback_to_read_;
+
+ ErsatzProgress progress_;
+};
+
+} // namespace util
+
+#endif // UTIL_FILE_PIECE__
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
new file mode 100644
index 00000000..befb7866
--- /dev/null
+++ b/klm/util/file_piece_test.cc
@@ -0,0 +1,41 @@
+#include "util/file_piece.hh"
+
+#define BOOST_TEST_MODULE FilePieceTest
+#include <boost/test/unit_test.hpp>
+#include <fstream>
+#include <iostream>
+
+namespace util {
+namespace {
+
+/* mmap implementation */
+BOOST_AUTO_TEST_CASE(MMapLine) {
+ std::fstream ref("file_piece.cc", std::ios::in);
+ FilePiece test("file_piece.cc", NULL, 1);
+ std::string ref_line;
+ while (getline(ref, ref_line)) {
+ StringPiece test_line(test.ReadLine());
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+ if (!test_line.empty() || !ref_line.empty()) {
+ BOOST_CHECK_EQUAL(ref_line, test_line);
+ }
+ }
+}
+
+/* read() implementation */
+BOOST_AUTO_TEST_CASE(ReadLine) {
+ std::fstream ref("file_piece.cc", std::ios::in);
+ FilePiece test("file_piece.cc", NULL, 1);
+ test.ForceFallbackToRead();
+ std::string ref_line;
+ while (getline(ref, ref_line)) {
+ StringPiece test_line(test.ReadLine());
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+ if (!test_line.empty() || !ref_line.empty()) {
+ BOOST_CHECK_EQUAL(ref_line, test_line);
+ }
+ }
+}
+
+} // namespace
+} // namespace util
diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh
new file mode 100644
index 00000000..a2f1c01d
--- /dev/null
+++ b/klm/util/joint_sort.hh
@@ -0,0 +1,145 @@
+#ifndef UTIL_JOINT_SORT__
+#define UTIL_JOINT_SORT__
+
+/* A terrifying amount of C++ to coax std::sort into soring one range while
+ * also permuting another range the same way.
+ */
+
+#include "util/proxy_iterator.hh"
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+
+namespace util {
+
+namespace detail {
+
+template <class KeyIter, class ValueIter> class JointProxy;
+
+template <class KeyIter, class ValueIter> class JointIter {
+ public:
+ JointIter() {}
+
+ JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {}
+
+ bool operator==(const JointIter<KeyIter, ValueIter> &other) const { return key_ == other.key_; }
+
+ bool operator<(const JointIter<KeyIter, ValueIter> &other) const { return (key_ < other.key_); }
+
+ std::ptrdiff_t operator-(const JointIter<KeyIter, ValueIter> &other) const { return key_ - other.key_; }
+
+ JointIter<KeyIter, ValueIter> &operator+=(std::ptrdiff_t amount) {
+ key_ += amount;
+ value_ += amount;
+ return *this;
+ }
+
+ void swap(const JointIter &other) {
+ std::swap(key_, other.key_);
+ std::swap(value_, other.value_);
+ }
+
+ private:
+ friend class JointProxy<KeyIter, ValueIter>;
+ KeyIter key_;
+ ValueIter value_;
+};
+
+template <class KeyIter, class ValueIter> class JointProxy {
+ private:
+ typedef JointIter<KeyIter, ValueIter> InnerIterator;
+
+ public:
+ typedef struct {
+ typename std::iterator_traits<KeyIter>::value_type key;
+ typename std::iterator_traits<ValueIter>::value_type value;
+ const typename std::iterator_traits<KeyIter>::value_type &GetKey() const { return key; }
+ } value_type;
+
+ JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
+ JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
+
+ operator const value_type() const {
+ value_type ret;
+ ret.key = *inner_.key_;
+ ret.value = *inner_.value_;
+ return ret;
+ }
+
+ JointProxy &operator=(const JointProxy &other) {
+ *inner_.key_ = *other.inner_.key_;
+ *inner_.value_ = *other.inner_.value_;
+ return *this;
+ }
+
+ JointProxy &operator=(const value_type &other) {
+ *inner_.key_ = other.key;
+ *inner_.value_ = other.value;
+ return *this;
+ }
+
+ typename std::iterator_traits<KeyIter>::reference GetKey() const {
+ return *(inner_.key_);
+ }
+
+ void swap(JointProxy<KeyIter, ValueIter> &other) {
+ std::swap(*inner_.key_, *other.inner_.key_);
+ std::swap(*inner_.value_, *other.inner_.value_);
+ }
+
+ private:
+ friend class ProxyIterator<JointProxy<KeyIter, ValueIter> >;
+
+ InnerIterator &Inner() { return inner_; }
+ const InnerIterator &Inner() const { return inner_; }
+ InnerIterator inner_;
+};
+
+template <class Proxy, class Less> class LessWrapper : public std::binary_function<const typename Proxy::value_type &, const typename Proxy::value_type &, bool> {
+ public:
+ explicit LessWrapper(const Less &less) : less_(less) {}
+
+ bool operator()(const Proxy &left, const Proxy &right) const {
+ return less_(left.GetKey(), right.GetKey());
+ }
+ bool operator()(const Proxy &left, const typename Proxy::value_type &right) const {
+ return less_(left.GetKey(), right.GetKey());
+ }
+ bool operator()(const typename Proxy::value_type &left, const Proxy &right) const {
+ return less_(left.GetKey(), right.GetKey());
+ }
+ bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const {
+ return less_(left.GetKey(), right.GetKey());
+ }
+
+ private:
+ const Less less_;
+};
+
+} // namespace detail
+
+template <class KeyIter, class ValueIter, class Less> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) {
+ ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > full_begin(detail::JointProxy<KeyIter, ValueIter>(key_begin, value_begin));
+ detail::LessWrapper<detail::JointProxy<KeyIter, ValueIter>, Less> less_wrap(less);
+ std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap);
+}
+
+
+template <class KeyIter, class ValueIter> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) {
+ JointSort(key_begin, key_end, value_begin, std::less<typename std::iterator_traits<KeyIter>::value_type>());
+}
+
+} // namespace util
+
+namespace std {
+template <class KeyIter, class ValueIter> void swap(util::detail::JointIter<KeyIter, ValueIter> &left, util::detail::JointIter<KeyIter, ValueIter> &right) {
+ left.swap(right);
+}
+
+template <class KeyIter, class ValueIter> void swap(util::detail::JointProxy<KeyIter, ValueIter> &left, util::detail::JointProxy<KeyIter, ValueIter> &right) {
+ left.swap(right);
+}
+} // namespace std
+
+#endif // UTIL_JOINT_SORT__
diff --git a/klm/util/joint_sort_test.cc b/klm/util/joint_sort_test.cc
new file mode 100644
index 00000000..4dc85916
--- /dev/null
+++ b/klm/util/joint_sort_test.cc
@@ -0,0 +1,50 @@
+#include "util/joint_sort.hh"
+
+#define BOOST_TEST_MODULE JointSortTest
+#include <boost/test/unit_test.hpp>
+
+namespace util { namespace {
+
+BOOST_AUTO_TEST_CASE(just_flip) {
+ char keys[2];
+ int values[2];
+ keys[0] = 1; values[0] = 327;
+ keys[1] = 0; values[1] = 87897;
+ JointSort<char *, int *>(keys + 0, keys + 2, values + 0);
+ BOOST_CHECK_EQUAL(0, keys[0]);
+ BOOST_CHECK_EQUAL(87897, values[0]);
+ BOOST_CHECK_EQUAL(1, keys[1]);
+ BOOST_CHECK_EQUAL(327, values[1]);
+}
+
+BOOST_AUTO_TEST_CASE(three) {
+ char keys[3];
+ int values[3];
+ keys[0] = 1; values[0] = 327;
+ keys[1] = 2; values[1] = 87897;
+ keys[2] = 0; values[2] = 10;
+ JointSort<char *, int *>(keys + 0, keys + 3, values + 0);
+ BOOST_CHECK_EQUAL(0, keys[0]);
+ BOOST_CHECK_EQUAL(1, keys[1]);
+ BOOST_CHECK_EQUAL(2, keys[2]);
+}
+
+BOOST_AUTO_TEST_CASE(char_int) {
+ char keys[4];
+ int values[4];
+ keys[0] = 3; values[0] = 327;
+ keys[1] = 1; values[1] = 87897;
+ keys[2] = 2; values[2] = 10;
+ keys[3] = 0; values[3] = 24347;
+ JointSort<char *, int *>(keys + 0, keys + 4, values + 0);
+ BOOST_CHECK_EQUAL(0, keys[0]);
+ BOOST_CHECK_EQUAL(24347, values[0]);
+ BOOST_CHECK_EQUAL(1, keys[1]);
+ BOOST_CHECK_EQUAL(87897, values[1]);
+ BOOST_CHECK_EQUAL(2, keys[2]);
+ BOOST_CHECK_EQUAL(10, values[2]);
+ BOOST_CHECK_EQUAL(3, keys[3]);
+ BOOST_CHECK_EQUAL(327, values[3]);
+}
+
+}} // namespace anonymous util
diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh
new file mode 100644
index 00000000..450512ac
--- /dev/null
+++ b/klm/util/key_value_packing.hh
@@ -0,0 +1,122 @@
+#ifndef UTIL_KEY_VALUE_PACKING__
+#define UTIL_KEY_VALUE_PACKING__
+
+/* Why such a general interface? I'm planning on doing bit-level packing. */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+#include <inttypes.h>
+
+namespace util {
+
+template <class Key, class Value> struct Entry {
+ Key key;
+ Value value;
+
+ const Key &GetKey() const { return key; }
+ const Value &GetValue() const { return value; }
+
+ void Set(const Key &key_in, const Value &value_in) {
+ SetKey(key_in);
+ SetValue(value_in);
+ }
+ void SetKey(const Key &key_in) { key = key_in; }
+ void SetValue(const Value &value_in) { value = value_in; }
+
+ bool operator<(const Entry<Key, Value> &other) const { return GetKey() < other.GetKey(); }
+};
+
+// And now for a brief interlude to specialize std::swap.
+} // namespace util
+namespace std {
+template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) {
+ swap(first.key, second.key);
+ swap(first.value, second.value);
+}
+}// namespace std
+namespace util {
+
+template <class KeyT, class ValueT> class AlignedPacking {
+ public:
+ typedef KeyT Key;
+ typedef ValueT Value;
+
+ public:
+ static const std::size_t kBytes = sizeof(Entry<Key, Value>);
+ static const std::size_t kBits = kBytes * 8;
+
+ typedef Entry<Key, Value> * MutableIterator;
+ typedef const Entry<Key, Value> * ConstIterator;
+ typedef const Entry<Key, Value> & ConstReference;
+
+ static MutableIterator FromVoid(void *start) {
+ return reinterpret_cast<MutableIterator>(start);
+ }
+
+ static Entry<Key, Value> Make(const Key &key, const Value &value) {
+ Entry<Key, Value> ret;
+ ret.Set(key, value);
+ return ret;
+ }
+};
+
+template <class KeyT, class ValueT> class ByteAlignedPacking {
+ public:
+ typedef KeyT Key;
+ typedef ValueT Value;
+
+ private:
+#pragma pack(push)
+#pragma pack(1)
+ struct RawEntry {
+ Key key;
+ Value value;
+
+ const Key &GetKey() const { return key; }
+ const Value &GetValue() const { return value; }
+
+ void Set(const Key &key_in, const Value &value_in) {
+ SetKey(key_in);
+ SetValue(value_in);
+ }
+ void SetKey(const Key &key_in) { key = key_in; }
+ void SetValue(const Value &value_in) { value = value_in; }
+
+ bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); }
+ };
+#pragma pack(pop)
+
+ friend void std::swap<>(RawEntry&, RawEntry&);
+
+ public:
+ typedef RawEntry *MutableIterator;
+ typedef const RawEntry *ConstIterator;
+ typedef RawEntry &ConstReference;
+
+ static const std::size_t kBytes = sizeof(RawEntry);
+ static const std::size_t kBits = kBytes * 8;
+
+ static MutableIterator FromVoid(void *start) {
+ return MutableIterator(reinterpret_cast<RawEntry*>(start));
+ }
+
+ static RawEntry Make(const Key &key, const Value &value) {
+ RawEntry ret;
+ ret.Set(key, value);
+ return ret;
+ }
+};
+
+} // namespace util
+namespace std {
+template <class Key, class Value> void swap(
+ typename util::ByteAlignedPacking<Key, Value>::RawEntry &first,
+ typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) {
+ swap(first.key, second.key);
+ swap(first.value, second.value);
+}
+}// namespace std
+
+#endif // UTIL_KEY_VALUE_PACKING__
diff --git a/klm/util/key_value_packing_test.cc b/klm/util/key_value_packing_test.cc
new file mode 100644
index 00000000..a0d33fd7
--- /dev/null
+++ b/klm/util/key_value_packing_test.cc
@@ -0,0 +1,75 @@
+#include "util/key_value_packing.hh"
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <boost/random/variate_generator.hpp>
+#include <boost/scoped_array.hpp>
+#define BOOST_TEST_MODULE KeyValueStoreTest
+#include <boost/test/unit_test.hpp>
+
+#include <limits>
+#include <stdlib.h>
+
+namespace util {
+namespace {
+
+BOOST_AUTO_TEST_CASE(basic_in_out) {
+ typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
+ void *backing = malloc(Packing::kBytes * 2);
+ Packing::MutableIterator i(Packing::FromVoid(backing));
+ i->SetKey(10);
+ BOOST_CHECK_EQUAL(10, i->GetKey());
+ i->SetValue(3);
+ BOOST_CHECK_EQUAL(3, i->GetValue());
+ ++i;
+ i->SetKey(5);
+ BOOST_CHECK_EQUAL(5, i->GetKey());
+ i->SetValue(42);
+ BOOST_CHECK_EQUAL(42, i->GetValue());
+
+ Packing::ConstIterator c(i);
+ BOOST_CHECK_EQUAL(5, c->GetKey());
+ --c;
+ BOOST_CHECK_EQUAL(10, c->GetKey());
+ BOOST_CHECK_EQUAL(42, i->GetValue());
+
+ BOOST_CHECK_EQUAL(5, i->GetKey());
+ free(backing);
+}
+
+BOOST_AUTO_TEST_CASE(simple_sort) {
+ typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
+ char foo[Packing::kBytes * 4];
+ Packing::MutableIterator begin(Packing::FromVoid(foo));
+ Packing::MutableIterator i = begin;
+ i->SetKey(0); ++i;
+ i->SetKey(2); ++i;
+ i->SetKey(3); ++i;
+ i->SetKey(1); ++i;
+ std::sort(begin, i);
+ BOOST_CHECK_EQUAL(0, begin[0].GetKey());
+ BOOST_CHECK_EQUAL(1, begin[1].GetKey());
+ BOOST_CHECK_EQUAL(2, begin[2].GetKey());
+ BOOST_CHECK_EQUAL(3, begin[3].GetKey());
+}
+
+BOOST_AUTO_TEST_CASE(big_sort) {
+ typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
+ boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]);
+ Packing::MutableIterator begin(Packing::FromVoid(memory.get()));
+
+ boost::mt19937 rng;
+ boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max());
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range);
+
+ for (size_t i = 0; i < 1000; ++i) {
+ (begin + i)->SetKey(gen());
+ }
+ std::sort(begin, begin + 1000);
+ for (size_t i = 0; i < 999; ++i) {
+ BOOST_CHECK(begin[i] < begin[i+1]);
+ }
+}
+
+} // namespace
+} // namespace util
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
new file mode 100644
index 00000000..648b5d0a
--- /dev/null
+++ b/klm/util/mmap.cc
@@ -0,0 +1,95 @@
+#include "util/exception.hh"
+#include "util/mmap.hh"
+#include "util/scoped.hh"
+
+#include <assert.h>
+#include <err.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+namespace util {
+
+scoped_mmap::~scoped_mmap() {
+ if (data_ != (void*)-1) {
+ if (munmap(data_, size_))
+ err(1, "munmap failed ");
+ }
+}
+
+void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
+ switch(source_) {
+ case MMAP_ALLOCATED:
+ scoped_mmap(data_, size_);
+ break;
+ case ARRAY_ALLOCATED:
+ delete [] reinterpret_cast<char*>(data_);
+ break;
+ case MALLOC_ALLOCATED:
+ free(data_);
+ break;
+ case NONE_ALLOCATED:
+ break;
+ }
+ data_ = data;
+ size_ = size;
+ source_ = source;
+}
+
+void scoped_memory::call_realloc(std::size_t size) {
+ assert(source_ == MALLOC_ALLOCATED || source_ == NONE_ALLOCATED);
+ void *new_data = realloc(data_, size);
+ if (!new_data) {
+ reset();
+ } else {
+ reset(new_data, size, MALLOC_ALLOCATED);
+ }
+}
+
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset) {
+#ifdef MAP_POPULATE // Linux specific
+ if (prefault) {
+ flags |= MAP_POPULATE;
+ }
+ int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
+#else
+ int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
+#endif
+ void *ret = mmap(NULL, size, protect, flags, fd, offset);
+ if (ret == MAP_FAILED) {
+ UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+ }
+ return ret;
+}
+
+void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset) {
+ return MapOrThrow(size, false, MAP_FILE | MAP_PRIVATE, prefault, fd, offset);
+}
+
+void *MapAnonymous(std::size_t size) {
+ return MapOrThrow(size, true,
+#ifdef MAP_ANONYMOUS
+ MAP_ANONYMOUS // Linux
+#else
+ MAP_ANON // BSD
+#endif
+ | MAP_PRIVATE, false, -1, 0);
+}
+
+void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem) {
+ file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
+ if (-1 == file.get())
+ UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing");
+ if (-1 == ftruncate(file.get(), size))
+ UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
+ try {
+ mem.reset(MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0), size);
+ } catch (ErrnoException &e) {
+ e << " in file " << name;
+ throw;
+ }
+}
+
+} // namespace util
diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh
new file mode 100644
index 00000000..c9068ec9
--- /dev/null
+++ b/klm/util/mmap.hh
@@ -0,0 +1,101 @@
+#ifndef UTIL_MMAP__
+#define UTIL_MMAP__
+// Utilities for mmaped files.
+
+#include "util/scoped.hh"
+
+#include <cstddef>
+
+#include <sys/types.h>
+
+namespace util {
+
+// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
+class scoped_mmap {
+ public:
+ scoped_mmap() : data_((void*)-1), size_(0) {}
+ scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {}
+ ~scoped_mmap();
+
+ void *get() const { return data_; }
+
+ const char *begin() const { return reinterpret_cast<char*>(data_); }
+ const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
+ std::size_t size() const { return size_; }
+
+ void reset(void *data, std::size_t size) {
+ scoped_mmap other(data_, size_);
+ data_ = data;
+ size_ = size;
+ }
+
+ void reset() {
+ reset((void*)-1, 0);
+ }
+
+ private:
+ void *data_;
+ std::size_t size_;
+
+ scoped_mmap(const scoped_mmap &);
+ scoped_mmap &operator=(const scoped_mmap &);
+};
+
+/* For when the memory might come from mmap, new char[], or malloc. Uses NULL
+ * and 0 for blanks even though mmap signals errors with (void*)-1). The reset
+ * function checks that blank for mmap.
+ */
+class scoped_memory {
+ public:
+ typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc;
+
+ scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}
+
+ ~scoped_memory() { reset(); }
+
+ void *get() const { return data_; }
+ const char *begin() const { return reinterpret_cast<char*>(data_); }
+ const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
+ std::size_t size() const { return size_; }
+
+ Alloc source() const { return source_; }
+
+ void reset() { reset(NULL, 0, NONE_ALLOCATED); }
+
+ void reset(void *data, std::size_t size, Alloc from);
+
+ // realloc allows the current data to escape hence the need for this call
+ // If realloc fails, destroys the original too and get() returns NULL.
+ void call_realloc(std::size_t to);
+
+ private:
+
+ void *data_;
+ std::size_t size_;
+
+ Alloc source_;
+
+ scoped_memory(const scoped_memory &);
+ scoped_memory &operator=(const scoped_memory &);
+};
+
+struct scoped_mapped_file {
+ scoped_fd fd;
+ scoped_mmap mem;
+};
+
+// Wrapper around mmap to check it worked and hide some platform macros.
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0);
+void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset = 0);
+
+void *MapAnonymous(std::size_t size);
+
+// Open file name with mmap of size bytes, all of which are initially zero.
+void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem);
+inline void MapZeroedWrite(const char *name, std::size_t size, scoped_mapped_file &out) {
+ MapZeroedWrite(name, size, out.fd, out.mem);
+}
+
+} // namespace util
+
+#endif // UTIL_SCOPED__
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
new file mode 100644
index 00000000..d58a0727
--- /dev/null
+++ b/klm/util/murmur_hash.cc
@@ -0,0 +1,129 @@
+/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
+ * code is released to the public domain. For business purposes, Murmurhash is
+ * under the MIT license."
+ * This is modified from the original:
+ * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
+ * length changed to unsigned int.
+ * placed in namespace util
+ * add MurmurHashNative
+ * default option = 0 for seed
+ */
+
+#include "util/murmur_hash.hh"
+
+namespace util {
+
+//-----------------------------------------------------------------------------
+// MurmurHash2, 64-bit versions, by Austin Appleby
+
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+
+// 64-bit hash for 64-bit platforms
+
+uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
+{
+ const uint64_t m = 0xc6a4a7935bd1e995ULL;
+ const int r = 47;
+
+ uint64_t h = seed ^ (len * m);
+
+ const uint64_t * data = (const uint64_t *)key;
+ const uint64_t * end = data + (len/8);
+
+ while(data != end)
+ {
+ uint64_t k = *data++;
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h ^= k;
+ h *= m;
+ }
+
+ const unsigned char * data2 = (const unsigned char*)data;
+
+ switch(len & 7)
+ {
+ case 7: h ^= uint64_t(data2[6]) << 48;
+ case 6: h ^= uint64_t(data2[5]) << 40;
+ case 5: h ^= uint64_t(data2[4]) << 32;
+ case 4: h ^= uint64_t(data2[3]) << 24;
+ case 3: h ^= uint64_t(data2[2]) << 16;
+ case 2: h ^= uint64_t(data2[1]) << 8;
+ case 1: h ^= uint64_t(data2[0]);
+ h *= m;
+ };
+
+ h ^= h >> r;
+ h *= m;
+ h ^= h >> r;
+
+ return h;
+}
+
+
+// 64-bit hash for 32-bit platforms
+
+uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
+{
+ const unsigned int m = 0x5bd1e995;
+ const int r = 24;
+
+ unsigned int h1 = seed ^ len;
+ unsigned int h2 = 0;
+
+ const unsigned int * data = (const unsigned int *)key;
+
+ while(len >= 8)
+ {
+ unsigned int k1 = *data++;
+ k1 *= m; k1 ^= k1 >> r; k1 *= m;
+ h1 *= m; h1 ^= k1;
+ len -= 4;
+
+ unsigned int k2 = *data++;
+ k2 *= m; k2 ^= k2 >> r; k2 *= m;
+ h2 *= m; h2 ^= k2;
+ len -= 4;
+ }
+
+ if(len >= 4)
+ {
+ unsigned int k1 = *data++;
+ k1 *= m; k1 ^= k1 >> r; k1 *= m;
+ h1 *= m; h1 ^= k1;
+ len -= 4;
+ }
+
+ switch(len)
+ {
+ case 3: h2 ^= ((unsigned char*)data)[2] << 16;
+ case 2: h2 ^= ((unsigned char*)data)[1] << 8;
+ case 1: h2 ^= ((unsigned char*)data)[0];
+ h2 *= m;
+ };
+
+ h1 ^= h2 >> 18; h1 *= m;
+ h2 ^= h1 >> 22; h2 *= m;
+ h1 ^= h2 >> 17; h1 *= m;
+ h2 ^= h1 >> 19; h2 *= m;
+
+ uint64_t h = h1;
+
+ h = (h << 32) | h2;
+
+ return h;
+}
+
+uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
+ if (sizeof(int) == 4) {
+ return MurmurHash64B(key, len, seed);
+ } else {
+ return MurmurHash64A(key, len, seed);
+ }
+}
+
+} // namespace util
diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh
new file mode 100644
index 00000000..638aaeb2
--- /dev/null
+++ b/klm/util/murmur_hash.hh
@@ -0,0 +1,14 @@
+#ifndef UTIL_MURMUR_HASH__
+#define UTIL_MURMUR_HASH__
+#include <cstddef>
+#include <stdint.h>
+
+namespace util {
+
+uint64_t MurmurHash64A(const void * key, std::size_t len, unsigned int seed = 0);
+uint64_t MurmurHash64B(const void * key, std::size_t len, unsigned int seed = 0);
+uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed = 0);
+
+} // namespace util
+
+#endif // UTIL_MURMUR_HASH__
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
new file mode 100644
index 00000000..c3529a7e
--- /dev/null
+++ b/klm/util/probing_hash_table.hh
@@ -0,0 +1,97 @@
+#ifndef UTIL_PROBING_HASH_TABLE__
+#define UTIL_PROBING_HASH_TABLE__
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
+
+#include <assert.h>
+
+namespace util {
+
+/* Non-standard hash table
+ * Buckets must be set at the beginning and must be greater than maximum number
+ * of elements, else an infinite loop happens.
+ * Memory management and initialization is externalized to make it easier to
+ * serialize these to disk and load them quickly.
+ * Uses linear probing to find value.
+ * Only insert and lookup operations.
+ */
+
+template <class PackingT, class HashT, class EqualT = std::equal_to<typename PackingT::Key> > class ProbingHashTable {
+ public:
+ typedef PackingT Packing;
+ typedef typename Packing::Key Key;
+ typedef typename Packing::MutableIterator MutableIterator;
+ typedef typename Packing::ConstIterator ConstIterator;
+
+ typedef HashT Hash;
+ typedef EqualT Equal;
+
+ static std::size_t Size(std::size_t entries, float multiplier) {
+ return std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))) * Packing::kBytes;
+ }
+
+ // Must be assigned to later.
+ ProbingHashTable()
+#ifdef DEBUG
+ : initialized_(false), entries_(0)
+#endif
+ {}
+
+ ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
+ : begin_(Packing::FromVoid(start)),
+ buckets_(allocated / Packing::kBytes),
+ end_(begin_ + (allocated / Packing::kBytes)),
+ invalid_(invalid),
+ hash_(hash_func),
+ equal_(equal_func)
+#ifdef DEBUG
+ , initialized_(true),
+ entries_(0)
+#endif
+ {}
+
+ template <class T> void Insert(const T &t) {
+#ifdef DEBUG
+ assert(initialized_);
+ assert(++entries_ < buckets_);
+#endif
+ for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
+ if (equal_(i->GetKey(), invalid_)) { *i = t; return; }
+ if (++i == end_) { i = begin_; }
+ }
+ }
+
+ void FinishedInserting() {}
+
+ void LoadedBinary() {}
+
+ template <class Key> bool Find(const Key key, ConstIterator &out) const {
+#ifdef DEBUG
+ assert(initialized_);
+#endif
+ for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) {
+ Key got(i->GetKey());
+ if (equal_(got, key)) { out = i; return true; }
+ if (equal_(got, invalid_)) { return false; }
+ if (++i == end_) { i = begin_; }
+ }
+ }
+
+ private:
+ MutableIterator begin_;
+ std::size_t buckets_;
+ MutableIterator end_;
+ Key invalid_;
+ Hash hash_;
+ Equal equal_;
+#ifdef DEBUG
+ bool initialized_;
+ std::size_t entries_;
+#endif
+};
+
+} // namespace util
+
+#endif // UTIL_PROBING_HASH_TABLE__
diff --git a/klm/util/probing_hash_table_test.cc b/klm/util/probing_hash_table_test.cc
new file mode 100644
index 00000000..ff2f5af3
--- /dev/null
+++ b/klm/util/probing_hash_table_test.cc
@@ -0,0 +1,30 @@
+#include "util/probing_hash_table.hh"
+
+#include "util/key_value_packing.hh"
+
+#define BOOST_TEST_MODULE ProbingHashTableTest
+#include <boost/test/unit_test.hpp>
+#include <boost/functional/hash.hpp>
+
+namespace util {
+namespace {
+
+typedef AlignedPacking<char, uint64_t> Packing;
+typedef ProbingHashTable<Packing, boost::hash<char> > Table;
+
+BOOST_AUTO_TEST_CASE(simple) {
+ char mem[Table::Size(10, 1.2)];
+ memset(mem, 0, sizeof(mem));
+
+ Table table(mem, sizeof(mem));
+ Packing::ConstIterator i = Packing::ConstIterator();
+ BOOST_CHECK(!table.Find(2, i));
+ table.Insert(Packing::Make(3, 328920));
+ BOOST_REQUIRE(table.Find(3, i));
+ BOOST_CHECK_EQUAL(3, i->GetKey());
+ BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), i->GetValue());
+ BOOST_CHECK(!table.Find(2, i));
+}
+
+} // namespace
+} // namespace util
diff --git a/klm/util/proxy_iterator.hh b/klm/util/proxy_iterator.hh
new file mode 100644
index 00000000..1c5b7089
--- /dev/null
+++ b/klm/util/proxy_iterator.hh
@@ -0,0 +1,94 @@
+#ifndef UTIL_PROXY_ITERATOR__
+#define UTIL_PROXY_ITERATOR__
+
+#include <cstddef>
+#include <iterator>
+
+/* This is a RandomAccessIterator that uses a proxy to access the underlying
+ * data. Useful for packing data at bit offsets but still using STL
+ * algorithms.
+ *
+ * Normally I would use boost::iterator_facade but some people are too lazy to
+ * install boost and still want to use my language model. It's amazing how
+ * many operators an iterator has.
+ *
+ * The Proxy needs to provide:
+ * class InnerIterator;
+ * InnerIterator &Inner();
+ * const InnerIterator &Inner() const;
+ *
+ * InnerIterator has to implement:
+ * operator==(InnerIterator)
+ * operator<(InnerIterator)
+ * operator+=(std::ptrdiff_t)
+ * operator-(InnerIterator)
+ * and of course whatever Proxy needs to dereference it.
+ *
+ * It's also a good idea to specialize std::swap for Proxy.
+ */
+
+namespace util {
+template <class Proxy> class ProxyIterator {
+ private:
+ // Self.
+ typedef ProxyIterator<Proxy> S;
+ typedef typename Proxy::InnerIterator InnerIterator;
+
+ public:
+ typedef std::random_access_iterator_tag iterator_category;
+ typedef typename Proxy::value_type value_type;
+ typedef std::ptrdiff_t difference_type;
+ typedef Proxy reference;
+ typedef Proxy * pointer;
+
+ ProxyIterator() {}
+
+ // For cast from non const to const.
+ template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {}
+ explicit ProxyIterator(const Proxy &p) : p_(p) {}
+
+ // p_'s operator= does value copying, but here we want iterator copying.
+ S &operator=(const S &other) {
+ I() = other.I();
+ return *this;
+ }
+
+ bool operator==(const S &other) const { return I() == other.I(); }
+ bool operator!=(const S &other) const { return !(*this == other); }
+ bool operator<(const S &other) const { return I() < other.I(); }
+ bool operator>(const S &other) const { return other < *this; }
+ bool operator<=(const S &other) const { return !(*this > other); }
+ bool operator>=(const S &other) const { return !(*this < other); }
+
+ S &operator++() { return *this += 1; }
+ S operator++(int) { S ret(*this); ++*this; return ret; }
+ S &operator+=(std::ptrdiff_t amount) { I() += amount; return *this; }
+ S operator+(std::ptrdiff_t amount) const { S ret(*this); ret += amount; return ret; }
+
+ S &operator--() { return *this -= 1; }
+ S operator--(int) { S ret(*this); --*this; return ret; }
+ S &operator-=(std::ptrdiff_t amount) { I() += (-amount); return *this; }
+ S operator-(std::ptrdiff_t amount) const { S ret(*this); ret -= amount; return ret; }
+
+ std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); }
+
+ Proxy operator*() { return p_; }
+ const Proxy operator*() const { return p_; }
+ Proxy *operator->() { return &p_; }
+ const Proxy *operator->() const { return &p_; }
+ Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); }
+
+ private:
+ InnerIterator &I() { return p_.Inner(); }
+ const InnerIterator &I() const { return p_.Inner(); }
+
+ Proxy p_;
+};
+
+template <class Proxy> ProxyIterator<Proxy> operator+(std::ptrdiff_t amount, const ProxyIterator<Proxy> &it) {
+ return it + amount;
+}
+
+} // namespace util
+
+#endif // UTIL_PROXY_ITERATOR__
diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc
new file mode 100644
index 00000000..61394ffc
--- /dev/null
+++ b/klm/util/scoped.cc
@@ -0,0 +1,12 @@
+#include "util/scoped.hh"
+
+#include <err.h>
+#include <unistd.h>
+
+namespace util {
+
+scoped_fd::~scoped_fd() {
+ if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_);
+}
+
+} // namespace util
diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh
new file mode 100644
index 00000000..ef62a74f
--- /dev/null
+++ b/klm/util/scoped.hh
@@ -0,0 +1,66 @@
+#ifndef UTIL_SCOPED__
+#define UTIL_SCOPED__
+
+/* Other scoped objects in the style of scoped_ptr. */
+
+#include <cstddef>
+
+namespace util {
+
+template <class T, class R, R (*Free)(T*)> class scoped_thing {
+ public:
+ explicit scoped_thing(T *c = static_cast<T*>(0)) : c_(c) {}
+
+ ~scoped_thing() { if (c_) Free(c_); }
+
+ void reset(T *c) {
+ if (c_) Free(c_);
+ c_ = c;
+ }
+
+ T &operator*() { return *c_; }
+ T &operator->() { return *c_; }
+
+ T *get() { return c_; }
+ const T *get() const { return c_; }
+
+ private:
+ T *c_;
+
+ scoped_thing(const scoped_thing &);
+ scoped_thing &operator=(const scoped_thing &);
+};
+
+class scoped_fd {
+ public:
+ scoped_fd() : fd_(-1) {}
+
+ explicit scoped_fd(int fd) : fd_(fd) {}
+
+ ~scoped_fd();
+
+ void reset(int to) {
+ scoped_fd other(fd_);
+ fd_ = to;
+ }
+
+ int get() const { return fd_; }
+
+ int operator*() const { return fd_; }
+
+ int release() {
+ int ret = fd_;
+ fd_ = -1;
+ return ret;
+ }
+
+ private:
+ int fd_;
+
+ scoped_fd(const scoped_fd &);
+ scoped_fd &operator=(const scoped_fd &);
+};
+
+} // namespace util
+
+#endif // UTIL_SCOPED__
diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh
new file mode 100644
index 00000000..96ec4866
--- /dev/null
+++ b/klm/util/sorted_uniform.hh
@@ -0,0 +1,139 @@
+#ifndef UTIL_SORTED_UNIFORM__
+#define UTIL_SORTED_UNIFORM__
+
+#include <algorithm>
+#include <cstddef>
+
+#include <assert.h>
+#include <inttypes.h>
+
+namespace util {
+
+inline std::size_t Pivot(uint64_t off, uint64_t range, std::size_t width) {
+ std::size_t ret = static_cast<std::size_t>(static_cast<float>(off) / static_cast<float>(range) * static_cast<float>(width));
+ // Cap for floating point rounding
+ return (ret < width) ? ret : width - 1;
+}
+/*inline std::size_t Pivot(uint32_t off, uint32_t range, std::size_t width) {
+ return static_cast<std::size_t>(static_cast<uint64_t>(off) * static_cast<uint64_t>(width) / static_cast<uint64_t>(range));
+}
+inline std::size_t Pivot(uint16_t off, uint16_t range, std::size_t width) {
+ return static_cast<std::size_t>(static_cast<std::size_t>(off) * width / static_cast<std::size_t>(range));
+}
+inline std::size_t Pivot(unsigned char off, unsigned char range, std::size_t width) {
+ return static_cast<std::size_t>(static_cast<std::size_t>(off) * width / static_cast<std::size_t>(range));
+}*/
+
+template <class Iterator, class Key> bool SortedUniformFind(Iterator begin, Iterator end, const Key key, Iterator &out) {
+ if (begin == end) return false;
+ Key below(begin->GetKey());
+ if (key <= below) {
+ if (key == below) { out = begin; return true; }
+ return false;
+ }
+ // Make the range [begin, end].
+ --end;
+ Key above(end->GetKey());
+ if (key >= above) {
+ if (key == above) { out = end; return true; }
+ return false;
+ }
+
+ // Search the range [begin + 1, end - 1] knowing that *begin == below, *end == above.
+ while (end - begin > 1) {
+ Iterator pivot(begin + (1 + Pivot(key - below, above - below, static_cast<std::size_t>(end - begin - 1))));
+ Key mid(pivot->GetKey());
+ if (mid < key) {
+ begin = pivot;
+ below = mid;
+ } else if (mid > key) {
+ end = pivot;
+ above = mid;
+ } else {
+ out = pivot;
+ return true;
+ }
+ }
+ return false;
+}
+
+// To use this template, you need to define a Pivot function to match Key.
+template <class PackingT> class SortedUniformMap {
+ public:
+ typedef PackingT Packing;
+ typedef typename Packing::ConstIterator ConstIterator;
+
+ public:
+ // Offer consistent API with probing hash.
+ static std::size_t Size(std::size_t entries, float ignore = 0.0) {
+ return sizeof(uint64_t) + entries * Packing::kBytes;
+ }
+
+ SortedUniformMap()
+#ifdef DEBUG
+ : initialized_(false), loaded_(false)
+#endif
+ {}
+
+ SortedUniformMap(void *start, std::size_t allocated) :
+ begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)),
+ end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start))
+#ifdef DEBUG
+ , initialized_(true), loaded_(false)
+#endif
+ {}
+
+ void LoadedBinary() {
+#ifdef DEBUG
+ assert(initialized_);
+ assert(!loaded_);
+ loaded_ = true;
+#endif
+ // Restore the size.
+ end_ = begin_ + *size_ptr_;
+ }
+
+ // Caller responsible for not exceeding specified size. Do not call after FinishedInserting.
+ template <class T> void Insert(const T &t) {
+#ifdef DEBUG
+ assert(initialized_);
+ assert(!loaded_);
+#endif
+ *end_ = t;
+ ++end_;
+ }
+
+ void FinishedInserting() {
+#ifdef DEBUG
+ assert(initialized_);
+ assert(!loaded_);
+ loaded_ = true;
+#endif
+ std::sort(begin_, end_);
+ *size_ptr_ = (end_ - begin_);
+ }
+
+ // Do not call before FinishedInserting.
+ template <class Key> bool Find(const Key key, ConstIterator &out) const {
+#ifdef DEBUG
+ assert(initialized_);
+ assert(loaded_);
+#endif
+ return SortedUniformFind<ConstIterator, Key>(ConstIterator(begin_), ConstIterator(end_), key, out);
+ }
+
+ ConstIterator begin() const { return begin_; }
+ ConstIterator end() const { return end_; }
+
+ private:
+ typename Packing::MutableIterator begin_, end_;
+ uint64_t *size_ptr_;
+#ifdef DEBUG
+ bool initialized_;
+ bool loaded_;
+#endif
+};
+
+} // namespace util
+
+#endif // UTIL_SORTED_UNIFORM__
diff --git a/klm/util/sorted_uniform_test.cc b/klm/util/sorted_uniform_test.cc
new file mode 100644
index 00000000..4aa4c8aa
--- /dev/null
+++ b/klm/util/sorted_uniform_test.cc
@@ -0,0 +1,116 @@
+#include "util/sorted_uniform.hh"
+
+#include "util/key_value_packing.hh"
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <boost/random/variate_generator.hpp>
+#include <boost/scoped_array.hpp>
+#include <boost/unordered_map.hpp>
+#define BOOST_TEST_MODULE SortedUniformTest
+#include <boost/test/unit_test.hpp>
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+namespace util {
+namespace {
+
+template <class Map, class Key, class Value> void Check(const Map &map, const boost::unordered_map<Key, Value> &reference, const Key key) {
+ typename boost::unordered_map<Key, Value>::const_iterator ref = reference.find(key);
+ typename Map::ConstIterator i = typename Map::ConstIterator();
+ if (ref == reference.end()) {
+ BOOST_CHECK(!map.Find(key, i));
+ } else {
+ // g++ can't tell that require will crash and burn.
+ BOOST_REQUIRE(map.Find(key, i));
+ BOOST_CHECK_EQUAL(ref->second, i->GetValue());
+ }
+}
+
+typedef SortedUniformMap<AlignedPacking<uint64_t, uint32_t> > TestMap;
+
+BOOST_AUTO_TEST_CASE(empty) {
+ char buf[TestMap::Size(0)];
+ TestMap map(buf, TestMap::Size(0));
+ map.FinishedInserting();
+ TestMap::ConstIterator i;
+ BOOST_CHECK(!map.Find(42, i));
+}
+
+BOOST_AUTO_TEST_CASE(one) {
+ char buf[TestMap::Size(1)];
+ TestMap map(buf, sizeof(buf));
+ Entry<uint64_t, uint32_t> e;
+ e.Set(42,2);
+ map.Insert(e);
+ map.FinishedInserting();
+ TestMap::ConstIterator i = TestMap::ConstIterator();
+ BOOST_REQUIRE(map.Find(42, i));
+ BOOST_CHECK(i == map.begin());
+ BOOST_CHECK(!map.Find(43, i));
+ BOOST_CHECK(!map.Find(41, i));
+}
+
+template <class Key> void RandomTest(Key upper, size_t entries, size_t queries) {
+ typedef unsigned char Value;
+ typedef SortedUniformMap<AlignedPacking<Key, unsigned char> > Map;
+ boost::scoped_array<char> buffer(new char[Map::Size(entries)]);
+ Map map(buffer.get(), entries);
+ boost::mt19937 rng;
+ boost::uniform_int<Key> range_key(0, upper);
+ boost::uniform_int<Value> range_value(0, 255);
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<Key> > gen_key(rng, range_key);
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<unsigned char> > gen_value(rng, range_value);
+
+ boost::unordered_map<Key, unsigned char> reference;
+ Entry<Key, unsigned char> ent;
+ for (size_t i = 0; i < entries; ++i) {
+ Key key = gen_key();
+ unsigned char value = gen_value();
+ if (reference.insert(std::make_pair(key, value)).second) {
+ ent.Set(key, value);
+ map.Insert(Entry<Key, unsigned char>(ent));
+ }
+ }
+ map.FinishedInserting();
+
+ // Random queries.
+ for (size_t i = 0; i < queries; ++i) {
+ const Key key = gen_key();
+ Check<Map, Key, unsigned char>(map, reference, key);
+ }
+
+ typename boost::unordered_map<Key, unsigned char>::const_iterator it = reference.begin();
+ for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) {
+ Check<Map, Key, unsigned char>(map, reference, it->second);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(basic) {
+ RandomTest<uint8_t>(11, 10, 200);
+}
+
+BOOST_AUTO_TEST_CASE(tiny_dense_random) {
+ RandomTest<uint8_t>(11, 50, 200);
+}
+
+BOOST_AUTO_TEST_CASE(small_dense_random) {
+ RandomTest<uint8_t>(100, 100, 200);
+}
+
+BOOST_AUTO_TEST_CASE(small_sparse_random) {
+ RandomTest<uint8_t>(200, 15, 200);
+}
+
+BOOST_AUTO_TEST_CASE(medium_sparse_random) {
+ RandomTest<uint16_t>(32000, 1000, 2000);
+}
+
+BOOST_AUTO_TEST_CASE(sparse_random) {
+ RandomTest<uint64_t>(std::numeric_limits<uint64_t>::max(), 100000, 2000);
+}
+
+} // namespace
+} // namespace util
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc
new file mode 100644
index 00000000..6917a6bc
--- /dev/null
+++ b/klm/util/string_piece.cc
@@ -0,0 +1,57 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copied from strings/stringpiece.cc with modifications
+
+#include "util/string_piece.hh"
+
+#ifdef USE_BOOST
+#include <boost/functional/hash/hash.hpp>
+#endif
+
+#include <algorithm>
+#include <iostream>
+
+#ifdef USE_ICU
+U_NAMESPACE_BEGIN
+#endif
+
+std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+ o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
+ return o;
+}
+
+#ifdef USE_BOOST
+size_t hash_value(const StringPiece &str) {
+ return boost::hash_range(str.data(), str.data() + str.length());
+}
+#endif
+
+#ifdef USE_ICU
+U_NAMESPACE_END
+#endif
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
new file mode 100644
index 00000000..58008d13
--- /dev/null
+++ b/klm/util/string_piece.hh
@@ -0,0 +1,260 @@
+/* If you use ICU in your program, then compile with -DUSE_ICU -licui18n. If
+ * you don't use ICU, then this will use the Google implementation from Chrome.
+ * This has been modified from the original version to let you choose.
+ */
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copied from strings/stringpiece.h with modifications
+//
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece. The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+
+#ifndef BASE_STRING_PIECE_H__
+#define BASE_STRING_PIECE_H__
+
+//Uncomment this line if you use ICU in your code.
+//#define USE_ICU
+//Uncomment this line if you want boost hashing for your StringPieces.
+//#define USE_BOOST
+
+#include <cstring>
+#include <iosfwd>
+
+#ifdef USE_ICU
+#include <unicode/stringpiece.h>
+U_NAMESPACE_BEGIN
+#else
+
+#include <algorithm>
+#include <ostream>
+#include <string>
+#include <string.h>
+
+class StringPiece {
+ public:
+ typedef size_t size_type;
+
+ private:
+ const char* ptr_;
+ size_type length_;
+
+ public:
+ // We provide non-explicit singleton constructors so users can pass
+ // in a "const char*" or a "string" wherever a "StringPiece" is
+ // expected.
+ StringPiece() : ptr_(NULL), length_(0) { }
+ StringPiece(const char* str)
+ : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
+ StringPiece(const std::string& str)
+ : ptr_(str.data()), length_(str.size()) { }
+ StringPiece(const char* offset, size_type len)
+ : ptr_(offset), length_(len) { }
+
+ // data() may return a pointer to a buffer with embedded NULs, and the
+ // returned buffer may or may not be null terminated. Therefore it is
+ // typically a mistake to pass data() to a routine that expects a NUL
+ // terminated string.
+ const char* data() const { return ptr_; }
+ size_type size() const { return length_; }
+ size_type length() const { return length_; }
+ bool empty() const { return length_ == 0; }
+
+ void clear() { ptr_ = NULL; length_ = 0; }
+ void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
+ void set(const char* str) {
+ ptr_ = str;
+ length_ = str ? strlen(str) : 0;
+ }
+ void set(const void* data, size_type len) {
+ ptr_ = reinterpret_cast<const char*>(data);
+ length_ = len;
+ }
+
+ char operator[](size_type i) const { return ptr_[i]; }
+
+ void remove_prefix(size_type n) {
+ ptr_ += n;
+ length_ -= n;
+ }
+
+ void remove_suffix(size_type n) {
+ length_ -= n;
+ }
+
+ int compare(const StringPiece& x) const {
+ int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
+ if (r == 0) {
+ if (length_ < x.length_) r = -1;
+ else if (length_ > x.length_) r = +1;
+ }
+ return r;
+ }
+
+ std::string as_string() const {
+ // std::string doesn't like to take a NULL pointer even with a 0 size.
+ return std::string(!empty() ? data() : "", size());
+ }
+
+ void CopyToString(std::string* target) const;
+ void AppendToString(std::string* target) const;
+
+ // Does "this" start with "x"
+ bool starts_with(const StringPiece& x) const {
+ return ((length_ >= x.length_) &&
+ (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
+ }
+
+ // Does "this" end with "x"
+ bool ends_with(const StringPiece& x) const {
+ return ((length_ >= x.length_) &&
+ (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
+ }
+
+ // standard STL container boilerplate
+ typedef char value_type;
+ typedef const char* pointer;
+ typedef const char& reference;
+ typedef const char& const_reference;
+ typedef ptrdiff_t difference_type;
+ static const size_type npos;
+ typedef const char* const_iterator;
+ typedef const char* iterator;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+ iterator begin() const { return ptr_; }
+ iterator end() const { return ptr_ + length_; }
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(ptr_ + length_);
+ }
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(ptr_);
+ }
+
+ size_type max_size() const { return length_; }
+ size_type capacity() const { return length_; }
+
+ size_type copy(char* buf, size_type n, size_type pos = 0) const;
+
+ size_type find(const StringPiece& s, size_type pos = 0) const;
+ size_type find(char c, size_type pos = 0) const;
+ size_type rfind(const StringPiece& s, size_type pos = npos) const;
+ size_type rfind(char c, size_type pos = npos) const;
+
+ size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
+ size_type find_first_of(char c, size_type pos = 0) const {
+ return find(c, pos);
+ }
+ size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
+ size_type find_first_not_of(char c, size_type pos = 0) const;
+ size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
+ size_type find_last_of(char c, size_type pos = npos) const {
+ return rfind(c, pos);
+ }
+ size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
+ size_type find_last_not_of(char c, size_type pos = npos) const;
+
+ StringPiece substr(size_type pos, size_type n = npos) const;
+
+ static int wordmemcmp(const char* p, const char* p2, size_type N) {
+ return memcmp(p, p2, N);
+ }
+};
+
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+ if (x.size() != y.size())
+ return false;
+
+ return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+ return !(x == y);
+}
+
+#endif
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+ const int r = std::memcmp(x.data(), y.data(),
+ std::min(x.size(), y.size()));
+ return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+ return y < x;
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+ return !(x > y);
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+ return !(x < y);
+}
+
+// allow StringPiece to be logged (needed for unit testing).
+extern std::ostream& operator<<(std::ostream& o, const StringPiece& piece);
+
+#ifdef USE_BOOST
+size_t hash_value(const StringPiece &str);
+
+/* Support for lookup of StringPiece in boost::unordered_map<std::string> */
+struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> {
+ size_t operator()(const StringPiece &str) const {
+ return hash_value(str);
+ }
+};
+
+struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const std::string &, bool> {
+ bool operator()(const StringPiece &first, const StringPiece &second) const {
+ return first == second;
+ }
+};
+template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
+ return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+}
+template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
+ return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+}
+#endif
+
+#ifdef USE_ICU
+U_NAMESPACE_END
+#endif
+
+#endif // BASE_STRING_PIECE_H__