partial merge, ruleid feature

author: Patrick Simianer <p@simianer.de> 2011-09-09 15:33:35 +0200
committer: Patrick Simianer <p@simianer.de> 2011-09-23 19:13:58 +0200
commit: fe6acf199a5749f0a604a95e8d7af59bccc7505e (patch)
tree: 5aaad9282a65deb3b3b560bd3e643b845a42987a /klm/util
parent: ef74e67449515ff68f598f06ffc9d221eb13f919 (diff)
8 files changed, 281 insertions, 175 deletions
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index b35d80c8..9f47d559 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -107,9 +107,20 @@ void BitPackingSanity();
 uint8_t RequiredBits(uint64_t max_value);
 
 struct BitsMask {
+  static BitsMask ByMax(uint64_t max_value) {
+    BitsMask ret;
+    ret.FromMax(max_value);
+    return ret;
+  }
+  static BitsMask ByBits(uint8_t bits) {
+    BitsMask ret;
+    ret.bits = bits;
+    ret.mask = (1ULL << bits) - 1;
+    return ret;
+  }
   void FromMax(uint64_t max_value) {
     bits = RequiredBits(max_value);
-    mask = (1 << bits) - 1;
+    mask = (1ULL << bits) - 1;
   }
   uint8_t bits;
   uint64_t mask;
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 84f9fe7c..62280970 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -1,5 +1,9 @@
 #include "util/exception.hh"
 
+#ifdef __GXX_RTTI
+#include <typeinfo>
+#endif
+
 #include <errno.h>
 #include <string.h>
 
@@ -22,6 +26,30 @@ const char *Exception::what() const throw() {
   return text_.c_str();
 }
 
+void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
+  /* The child class might have set some text, but we want this to come first.
+   * Another option would be passing this information to the constructor, but
+   * then child classes would have to accept constructor arguments and pass
+   * them down.  
+   */
+  text_ = stream_.str();
+  stream_.str("");
+  stream_ << file << ':' << line;
+  if (func) stream_ << " in " << func << " threw ";
+  if (child_name) {
+    stream_ << child_name;
+  } else {
+#ifdef __GXX_RTTI
+    stream_ << typeid(this).name();
+#else
+    stream_ << "an exception";
+#endif
+  }
+  if (condition) stream_ << " because `" << condition;
+  stream_ << "'.\n";
+  stream_ << text_;
+}
+
 namespace {
 // The XOPEN version.
 const char *HandleStrerror(int ret, const char *buf) {
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
index c6936914..81675a57 100644
--- a/klm/util/exception.hh
+++ b/klm/util/exception.hh
@@ -1,8 +1,6 @@
 #ifndef UTIL_EXCEPTION__
 #define UTIL_EXCEPTION__
 
-#include "util/string_piece.hh"
-
 #include <exception>
 #include <sstream>
 #include <string>
@@ -22,6 +20,14 @@ class Exception : public std::exception {
     // Not threadsafe, but probably doesn't matter.  FWIW, Boost's exception guidance implies that what() isn't threadsafe.  
     const char *what() const throw();
 
+    // For use by the UTIL_THROW macros.  
+    void SetLocation(
+        const char *file,
+        unsigned int line,
+        const char *func,
+        const char *child_name,
+        const char *condition);
+
   private:
     template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
 
@@ -43,7 +49,49 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
   return e;
 }
 
-#define UTIL_THROW(Exception, Modify) { Exception UTIL_e; {UTIL_e << Modify;} throw UTIL_e; }
+#ifdef __GNUC__
+#define UTIL_FUNC_NAME __PRETTY_FUNCTION__
+#else
+#ifdef _WIN32
+#define UTIL_FUNC_NAME __FUNCTION__
+#else
+#define UTIL_FUNC_NAME NULL
+#endif
+#endif
+
+#define UTIL_SET_LOCATION(UTIL_e, child, condition) do { \
+  (UTIL_e).SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, (child), (condition)); \
+} while (0)
+
+/* Create an instance of Exception, add the message Modify, and throw it.
+ * Modify is appended to the what() message and can contain << for ostream
+ * operations.  
+ *
+ * do .. while kludge to swallow trailing ; character
+ * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .  
+ */
+#define UTIL_THROW(Exception, Modify) do { \
+  Exception UTIL_e; \
+  UTIL_SET_LOCATION(UTIL_e, #Exception, NULL); \
+  UTIL_e << Modify; \
+  throw UTIL_e; \
+} while (0)
+
+#define UTIL_THROW_VAR(Var, Modify) do { \
+  Exception &UTIL_e = (Var); \
+  UTIL_SET_LOCATION(UTIL_e, NULL, NULL); \
+  UTIL_e << Modify; \
+  throw UTIL_e; \
+} while (0)
+
+#define UTIL_THROW_IF(Condition, Exception, Modify) do { \
+  if (Condition) { \
+    Exception UTIL_e; \
+    UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \
+    UTIL_e << Modify; \
+    throw UTIL_e; \
+  } \
+} while (0)
 
 class ErrnoException : public Exception {
   public:
@@ -51,7 +99,7 @@ class ErrnoException : public Exception {
 
     virtual ~ErrnoException() throw();
 
-    int Error() { return errno_; }
+    int Error() const throw() { return errno_; }
 
   private:
     int errno_;
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index f447a70c..cbe4234f 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -41,8 +41,8 @@ GZException::GZException(void *file) {
 const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 
 int OpenReadOrThrow(const char *name) {
-  int ret = open(name, O_RDONLY);
-  if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading");
+  int ret;
+  UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
   return ret;
 }
 
@@ -52,13 +52,13 @@ off_t SizeFile(int fd) {
   return sb.st_size;
 }
 
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) : 
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : 
   file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
   progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
   Initialize(name, show_progress, min_buffer);
 }
 
-FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) : 
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer)  : 
   file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
   progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
   Initialize(name, show_progress, min_buffer);
@@ -78,7 +78,7 @@ FilePiece::~FilePiece() {
 #endif
 }
 
-StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
+StringPiece FilePiece::ReadLine(char delim) {
   size_t skip = 0;
   while (true) {
     for (const char *i = position_ + skip; i < position_end_; ++i) {
@@ -97,20 +97,20 @@ StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileExcepti
   }
 }
 
-float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
+float FilePiece::ReadFloat() {
   return ReadNumber<float>();
 }
-double FilePiece::ReadDouble() throw(GZException, EndOfFileException, ParseNumberException) {
+double FilePiece::ReadDouble() {
   return ReadNumber<double>();
 }
-long int FilePiece::ReadLong() throw(GZException, EndOfFileException, ParseNumberException) {
+long int FilePiece::ReadLong() {
   return ReadNumber<long int>();
 }
-unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, ParseNumberException) {
+unsigned long int FilePiece::ReadULong() {
   return ReadNumber<unsigned long int>();
 }
 
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer)  {
 #ifdef HAVE_ZLIB
   gz_file_ = NULL;
 #endif
@@ -163,7 +163,7 @@ void ParseNumber(const char *begin, char *&end, unsigned long int &out) {
 }
 } // namespace
 
-template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileException, ParseNumberException) {
+template <class T> T FilePiece::ReadNumber() {
   SkipSpaces();
   while (last_space_ < position_) {
     if (at_end_) {
@@ -186,7 +186,7 @@ template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileExcepti
   return ret;
 }
 
-const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException, EndOfFileException) {
+const char *FilePiece::FindDelimiterOrEOF(const bool *delim)  {
   size_t skip = 0;
   while (true) {
     for (const char *i = position_ + skip; i < position_end_; ++i) {
@@ -201,7 +201,7 @@ const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException,
   }
 }
 
-void FilePiece::Shift() throw(GZException, EndOfFileException) {
+void FilePiece::Shift() {
   if (at_end_) {
     progress_.Finished();
     throw EndOfFileException();
@@ -217,7 +217,7 @@ void FilePiece::Shift() throw(GZException, EndOfFileException) {
   }
 }
 
-void FilePiece::MMapShift(off_t desired_begin) throw() {
+void FilePiece::MMapShift(off_t desired_begin) {
   // Use mmap.  
   off_t ignore = desired_begin % page_;
   // Duplicate request for Shift means give more data.  
@@ -259,25 +259,23 @@ void FilePiece::MMapShift(off_t desired_begin) throw() {
   progress_.Set(desired_begin);
 }
 
-void FilePiece::TransitionToRead() throw (GZException) {
+void FilePiece::TransitionToRead() {
   assert(!fallback_to_read_);
   fallback_to_read_ = true;
   data_.reset();
   data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
-  if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
+  UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_);
   position_ = data_.begin();
   position_end_ = position_;
 
 #ifdef HAVE_ZLIB
   assert(!gz_file_);
   gz_file_ = gzdopen(file_.get(), "r");
-  if (!gz_file_) {
-    UTIL_THROW(GZException, "zlib failed to open " << file_name_);
-  }
+  UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
 #endif
 }
 
-void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
+void FilePiece::ReadShift() {
   assert(fallback_to_read_);
   // Bytes [data_.begin(), position_) have been consumed.  
   // Bytes [position_, position_end_) have been read into the buffer.  
@@ -297,7 +295,7 @@ void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
       std::size_t valid_length = position_end_ - position_;
       default_map_size_ *= 2;
       data_.call_realloc(default_map_size_);
-      if (!data_.get()) UTIL_THROW(ErrnoException, "realloc failed for " << default_map_size_);
+      UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_);
       position_ = data_.begin();
       position_end_ = position_ + valid_length;
     } else {
@@ -320,7 +318,7 @@ void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
   }
 #else
   read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
-  if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
+  UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed");
   progress_.Set(mapped_offset_);
 #endif
   if (read_return == 0) {
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index 870ae5a3..a5c00910 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -45,13 +45,13 @@ off_t SizeFile(int fd);
 class FilePiece {
   public:
     // 32 MB default.
-    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
+    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
     // Takes ownership of fd.  name is used for messages.  
-    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
+    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
 
     ~FilePiece();
      
-    char get() throw(GZException, EndOfFileException) { 
+    char get() { 
       if (position_ == position_end_) {
         Shift();
         if (at_end_) throw EndOfFileException();
@@ -60,22 +60,22 @@ class FilePiece {
     }
 
     // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().  
-    StringPiece ReadDelimited(const bool *delim = kSpaces) throw(GZException, EndOfFileException) {
+    StringPiece ReadDelimited(const bool *delim = kSpaces) {
       SkipSpaces(delim);
       return Consume(FindDelimiterOrEOF(delim));
     }
 
     // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
     // It is similar to getline in that way.  
-    StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
+    StringPiece ReadLine(char delim = '\n');
 
-    float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException);
-    double ReadDouble() throw(GZException, EndOfFileException, ParseNumberException);
-    long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException);
-    unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException);
+    float ReadFloat();
+    double ReadDouble();
+    long int ReadLong();
+    unsigned long int ReadULong();
 
     // Skip spaces defined by isspace.  
-    void SkipSpaces(const bool *delim = kSpaces) throw (GZException, EndOfFileException) {
+    void SkipSpaces(const bool *delim = kSpaces) {
       for (; ; ++position_) {
         if (position_ == position_end_) Shift();
         if (!delim[static_cast<unsigned char>(*position_)]) return;
@@ -89,9 +89,9 @@ class FilePiece {
     const std::string &FileName() const { return file_name_; }
     
   private:
-    void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException);
+    void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
 
-    template <class T> T ReadNumber() throw(GZException, EndOfFileException, ParseNumberException);
+    template <class T> T ReadNumber();
 
     StringPiece Consume(const char *to) {
       StringPiece ret(position_, to - position_);
@@ -99,14 +99,14 @@ class FilePiece {
       return ret;
     }
 
-    const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException);
+    const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
 
-    void Shift() throw (EndOfFileException, GZException);
+    void Shift();
     // Backends to Shift().
-    void MMapShift(off_t desired_begin) throw ();
+    void MMapShift(off_t desired_begin);
 
-    void TransitionToRead() throw (GZException);
-    void ReadShift() throw (GZException, EndOfFileException);
+    void TransitionToRead();
+    void ReadShift();
 
     const char *position_, *last_space_, *position_end_;
 
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index d58a0727..fec47fd9 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -1,129 +1,129 @@
-/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
- * code is released to the public domain. For business purposes, Murmurhash is
- * under the MIT license."
- * This is modified from the original:
- * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.  
- * length changed to unsigned int.  
- * placed in namespace util
- * add MurmurHashNative
- * default option = 0 for seed
- */
-
-#include "util/murmur_hash.hh"
-
-namespace util {
-
-//-----------------------------------------------------------------------------
-// MurmurHash2, 64-bit versions, by Austin Appleby
-
-// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment 
-// and endian-ness issues if used across multiple platforms.
-
-// 64-bit hash for 64-bit platforms
-
-uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
-{
-  const uint64_t m = 0xc6a4a7935bd1e995ULL;
-  const int r = 47;
-
-  uint64_t h = seed ^ (len * m);
-
-  const uint64_t * data = (const uint64_t *)key;
-  const uint64_t * end = data + (len/8);
-
-  while(data != end)
-  {
-    uint64_t k = *data++;
-
-    k *= m; 
-    k ^= k >> r; 
-    k *= m; 
-    
-    h ^= k;
-    h *= m; 
-  }
-
-  const unsigned char * data2 = (const unsigned char*)data;
-
-  switch(len & 7)
-  {
-  case 7: h ^= uint64_t(data2[6]) << 48;
-  case 6: h ^= uint64_t(data2[5]) << 40;
-  case 5: h ^= uint64_t(data2[4]) << 32;
-  case 4: h ^= uint64_t(data2[3]) << 24;
-  case 3: h ^= uint64_t(data2[2]) << 16;
-  case 2: h ^= uint64_t(data2[1]) << 8;
-  case 1: h ^= uint64_t(data2[0]);
-          h *= m;
-  };
- 
-  h ^= h >> r;
-  h *= m;
-  h ^= h >> r;
-
-  return h;
-} 
-
-
-// 64-bit hash for 32-bit platforms
-
-uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
-{
-  const unsigned int m = 0x5bd1e995;
-  const int r = 24;
-
-  unsigned int h1 = seed ^ len;
-  unsigned int h2 = 0;
-
-  const unsigned int * data = (const unsigned int *)key;
-
-  while(len >= 8)
-  {
-    unsigned int k1 = *data++;
-    k1 *= m; k1 ^= k1 >> r; k1 *= m;
-    h1 *= m; h1 ^= k1;
-    len -= 4;
-
-    unsigned int k2 = *data++;
-    k2 *= m; k2 ^= k2 >> r; k2 *= m;
-    h2 *= m; h2 ^= k2;
-    len -= 4;
-  }
-
-  if(len >= 4)
-  {
-    unsigned int k1 = *data++;
-    k1 *= m; k1 ^= k1 >> r; k1 *= m;
-    h1 *= m; h1 ^= k1;
-    len -= 4;
-  }
-
-  switch(len)
-  {
-  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
-  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
-  case 1: h2 ^= ((unsigned char*)data)[0];
-      h2 *= m;
-  };
-
-  h1 ^= h2 >> 18; h1 *= m;
-  h2 ^= h1 >> 22; h2 *= m;
-  h1 ^= h2 >> 17; h1 *= m;
-  h2 ^= h1 >> 19; h2 *= m;
-
-  uint64_t h = h1;
-
-  h = (h << 32) | h2;
-
-  return h;
-}
-
-uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
-  if (sizeof(int) == 4) {
-    return MurmurHash64B(key, len, seed);
-  } else {
-    return MurmurHash64A(key, len, seed);
-  }
-}
-
-} // namespace util
+/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
+ * code is released to the public domain. For business purposes, Murmurhash is
+ * under the MIT license."
+ * This is modified from the original:
+ * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.  
+ * length changed to unsigned int.  
+ * placed in namespace util
+ * add MurmurHashNative
+ * default option = 0 for seed
+ */
+
+#include "util/murmur_hash.hh"
+
+namespace util {
+
+//-----------------------------------------------------------------------------
+// MurmurHash2, 64-bit versions, by Austin Appleby
+
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment 
+// and endian-ness issues if used across multiple platforms.
+
+// 64-bit hash for 64-bit platforms
+
+uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
+{
+  const uint64_t m = 0xc6a4a7935bd1e995ULL;
+  const int r = 47;
+
+  uint64_t h = seed ^ (len * m);
+
+  const uint64_t * data = (const uint64_t *)key;
+  const uint64_t * end = data + (len/8);
+
+  while(data != end)
+  {
+    uint64_t k = *data++;
+
+    k *= m; 
+    k ^= k >> r; 
+    k *= m; 
+    
+    h ^= k;
+    h *= m; 
+  }
+
+  const unsigned char * data2 = (const unsigned char*)data;
+
+  switch(len & 7)
+  {
+  case 7: h ^= uint64_t(data2[6]) << 48;
+  case 6: h ^= uint64_t(data2[5]) << 40;
+  case 5: h ^= uint64_t(data2[4]) << 32;
+  case 4: h ^= uint64_t(data2[3]) << 24;
+  case 3: h ^= uint64_t(data2[2]) << 16;
+  case 2: h ^= uint64_t(data2[1]) << 8;
+  case 1: h ^= uint64_t(data2[0]);
+          h *= m;
+  };
+ 
+  h ^= h >> r;
+  h *= m;
+  h ^= h >> r;
+
+  return h;
+} 
+
+
+// 64-bit hash for 32-bit platforms
+
+uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
+{
+  const unsigned int m = 0x5bd1e995;
+  const int r = 24;
+
+  unsigned int h1 = seed ^ len;
+  unsigned int h2 = 0;
+
+  const unsigned int * data = (const unsigned int *)key;
+
+  while(len >= 8)
+  {
+    unsigned int k1 = *data++;
+    k1 *= m; k1 ^= k1 >> r; k1 *= m;
+    h1 *= m; h1 ^= k1;
+    len -= 4;
+
+    unsigned int k2 = *data++;
+    k2 *= m; k2 ^= k2 >> r; k2 *= m;
+    h2 *= m; h2 ^= k2;
+    len -= 4;
+  }
+
+  if(len >= 4)
+  {
+    unsigned int k1 = *data++;
+    k1 *= m; k1 ^= k1 >> r; k1 *= m;
+    h1 *= m; h1 ^= k1;
+    len -= 4;
+  }
+
+  switch(len)
+  {
+  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
+  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
+  case 1: h2 ^= ((unsigned char*)data)[0];
+      h2 *= m;
+  };
+
+  h1 ^= h2 >> 18; h1 *= m;
+  h2 ^= h1 >> 22; h2 *= m;
+  h1 ^= h2 >> 17; h1 *= m;
+  h2 ^= h1 >> 19; h2 *= m;
+
+  uint64_t h = h1;
+
+  h = (h << 32) | h2;
+
+  return h;
+}
+
+uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
+  if (sizeof(int) == 4) {
+    return MurmurHash64B(key, len, seed);
+  } else {
+    return MurmurHash64A(key, len, seed);
+  }
+}
+
+} // namespace util
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 00be0ed7..2ec342a6 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -57,7 +57,7 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
         equal_(equal_func),
         entries_(0)
 #ifdef DEBUG
-        , initialized_(true),
+        , initialized_(true)
 #endif
     {}
 
diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh
index 84d7aa02..0d6ecbbd 100644
--- a/klm/util/sorted_uniform.hh
+++ b/klm/util/sorted_uniform.hh
@@ -12,7 +12,7 @@ namespace util {
 template <class T> class IdentityAccessor {
   public:
     typedef T Key;
-    T operator()(const uint64_t *in) const { return *in; }
+    T operator()(const T *in) const { return *in; }
 };
 
 struct Pivot64 {
@@ -101,6 +101,27 @@ template <class Iterator, class Accessor, class Pivot> bool SortedUniformFind(co
   return BoundedSortedUniformFind<Iterator, Accessor, Pivot>(accessor, begin, below, end, above, key, out);
 }
 
+// May return begin - 1.
+template <class Iterator, class Accessor> Iterator BinaryBelow(
+    const Accessor &accessor,
+    Iterator begin,
+    Iterator end,
+    const typename Accessor::Key key) {
+  while (end > begin) {
+    Iterator pivot(begin + (end - begin) / 2);
+    typename Accessor::Key mid(accessor(pivot));
+    if (mid < key) {
+      begin = pivot + 1;
+    } else if (mid > key) {
+      end = pivot;
+    } else {
+      for (++pivot; (pivot < end) && accessor(pivot) == mid; ++pivot) {}
+      return pivot - 1;
+    }
+  }
+  return begin - 1;
+}
+
 // To use this template, you need to define a Pivot function to match Key.  
 template <class PackingT> class SortedUniformMap {
   public:
author	Patrick Simianer <p@simianer.de>	2011-09-09 15:33:35 +0200
committer	Patrick Simianer <p@simianer.de>	2011-09-23 19:13:58 +0200
commit	fe6acf199a5749f0a604a95e8d7af59bccc7505e (patch)
tree	5aaad9282a65deb3b3b560bd3e643b845a42987a /klm/util
parent	ef74e67449515ff68f598f06ffc9d221eb13f919 (diff)