diff options
Diffstat (limited to 'klm/util')
| -rw-r--r-- | klm/util/bit_packing.hh | 13 | ||||
| -rw-r--r-- | klm/util/exception.cc | 28 | ||||
| -rw-r--r-- | klm/util/exception.hh | 56 | ||||
| -rw-r--r-- | klm/util/file_piece.cc | 42 | ||||
| -rw-r--r-- | klm/util/file_piece.hh | 34 | ||||
| -rw-r--r-- | klm/util/murmur_hash.cc | 258 | ||||
| -rw-r--r-- | klm/util/probing_hash_table.hh | 2 | ||||
| -rw-r--r-- | klm/util/sorted_uniform.hh | 23 | 
8 files changed, 281 insertions, 175 deletions
| diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index b35d80c8..9f47d559 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -107,9 +107,20 @@ void BitPackingSanity();  uint8_t RequiredBits(uint64_t max_value);  struct BitsMask { +  static BitsMask ByMax(uint64_t max_value) { +    BitsMask ret; +    ret.FromMax(max_value); +    return ret; +  } +  static BitsMask ByBits(uint8_t bits) { +    BitsMask ret; +    ret.bits = bits; +    ret.mask = (1ULL << bits) - 1; +    return ret; +  }    void FromMax(uint64_t max_value) {      bits = RequiredBits(max_value); -    mask = (1 << bits) - 1; +    mask = (1ULL << bits) - 1;    }    uint8_t bits;    uint64_t mask; diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 84f9fe7c..62280970 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -1,5 +1,9 @@  #include "util/exception.hh" +#ifdef __GXX_RTTI +#include <typeinfo> +#endif +  #include <errno.h>  #include <string.h> @@ -22,6 +26,30 @@ const char *Exception::what() const throw() {    return text_.c_str();  } +void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) { +  /* The child class might have set some text, but we want this to come first. +   * Another option would be passing this information to the constructor, but +   * then child classes would have to accept constructor arguments and pass +   * them down.   +   */ +  text_ = stream_.str(); +  stream_.str(""); +  stream_ << file << ':' << line; +  if (func) stream_ << " in " << func << " threw "; +  if (child_name) { +    stream_ << child_name; +  } else { +#ifdef __GXX_RTTI +    stream_ << typeid(this).name(); +#else +    stream_ << "an exception"; +#endif +  } +  if (condition) stream_ << " because `" << condition; +  stream_ << "'.\n"; +  stream_ << text_; +} +  namespace {  // The XOPEN version.  const char *HandleStrerror(int ret, const char *buf) { diff --git a/klm/util/exception.hh b/klm/util/exception.hh index c6936914..81675a57 100644 --- a/klm/util/exception.hh +++ b/klm/util/exception.hh @@ -1,8 +1,6 @@  #ifndef UTIL_EXCEPTION__  #define UTIL_EXCEPTION__ -#include "util/string_piece.hh" -  #include <exception>  #include <sstream>  #include <string> @@ -22,6 +20,14 @@ class Exception : public std::exception {      // Not threadsafe, but probably doesn't matter.  FWIW, Boost's exception guidance implies that what() isn't threadsafe.        const char *what() const throw(); +    // For use by the UTIL_THROW macros.   +    void SetLocation( +        const char *file, +        unsigned int line, +        const char *func, +        const char *child_name, +        const char *condition); +    private:      template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); @@ -43,7 +49,49 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep    return e;  } -#define UTIL_THROW(Exception, Modify) { Exception UTIL_e; {UTIL_e << Modify;} throw UTIL_e; } +#ifdef __GNUC__ +#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ +#else +#ifdef _WIN32 +#define UTIL_FUNC_NAME __FUNCTION__ +#else +#define UTIL_FUNC_NAME NULL +#endif +#endif + +#define UTIL_SET_LOCATION(UTIL_e, child, condition) do { \ +  (UTIL_e).SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, (child), (condition)); \ +} while (0) + +/* Create an instance of Exception, add the message Modify, and throw it. + * Modify is appended to the what() message and can contain << for ostream + * operations.   + * + * do .. while kludge to swallow trailing ; character + * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .   + */ +#define UTIL_THROW(Exception, Modify) do { \ +  Exception UTIL_e; \ +  UTIL_SET_LOCATION(UTIL_e, #Exception, NULL); \ +  UTIL_e << Modify; \ +  throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_VAR(Var, Modify) do { \ +  Exception &UTIL_e = (Var); \ +  UTIL_SET_LOCATION(UTIL_e, NULL, NULL); \ +  UTIL_e << Modify; \ +  throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_IF(Condition, Exception, Modify) do { \ +  if (Condition) { \ +    Exception UTIL_e; \ +    UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \ +    UTIL_e << Modify; \ +    throw UTIL_e; \ +  } \ +} while (0)  class ErrnoException : public Exception {    public: @@ -51,7 +99,7 @@ class ErrnoException : public Exception {      virtual ~ErrnoException() throw(); -    int Error() { return errno_; } +    int Error() const throw() { return errno_; }    private:      int errno_; diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index f447a70c..cbe4234f 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -41,8 +41,8 @@ GZException::GZException(void *file) {  const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};  int OpenReadOrThrow(const char *name) { -  int ret = open(name, O_RDONLY); -  if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading"); +  int ret; +  UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);    return ret;  } @@ -52,13 +52,13 @@ off_t SizeFile(int fd) {    return sb.st_size;  } -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :  +FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :     file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),    progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {    Initialize(name, show_progress, min_buffer);  } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :  +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer)  :     file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),    progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {    Initialize(name, show_progress, min_buffer); @@ -78,7 +78,7 @@ FilePiece::~FilePiece() {  #endif  } -StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) { +StringPiece FilePiece::ReadLine(char delim) {    size_t skip = 0;    while (true) {      for (const char *i = position_ + skip; i < position_end_; ++i) { @@ -97,20 +97,20 @@ StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileExcepti    }  } -float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) { +float FilePiece::ReadFloat() {    return ReadNumber<float>();  } -double FilePiece::ReadDouble() throw(GZException, EndOfFileException, ParseNumberException) { +double FilePiece::ReadDouble() {    return ReadNumber<double>();  } -long int FilePiece::ReadLong() throw(GZException, EndOfFileException, ParseNumberException) { +long int FilePiece::ReadLong() {    return ReadNumber<long int>();  } -unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, ParseNumberException) { +unsigned long int FilePiece::ReadULong() {    return ReadNumber<unsigned long int>();  } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) { +void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer)  {  #ifdef HAVE_ZLIB    gz_file_ = NULL;  #endif @@ -163,7 +163,7 @@ void ParseNumber(const char *begin, char *&end, unsigned long int &out) {  }  } // namespace -template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileException, ParseNumberException) { +template <class T> T FilePiece::ReadNumber() {    SkipSpaces();    while (last_space_ < position_) {      if (at_end_) { @@ -186,7 +186,7 @@ template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileExcepti    return ret;  } -const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException, EndOfFileException) { +const char *FilePiece::FindDelimiterOrEOF(const bool *delim)  {    size_t skip = 0;    while (true) {      for (const char *i = position_ + skip; i < position_end_; ++i) { @@ -201,7 +201,7 @@ const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException,    }  } -void FilePiece::Shift() throw(GZException, EndOfFileException) { +void FilePiece::Shift() {    if (at_end_) {      progress_.Finished();      throw EndOfFileException(); @@ -217,7 +217,7 @@ void FilePiece::Shift() throw(GZException, EndOfFileException) {    }  } -void FilePiece::MMapShift(off_t desired_begin) throw() { +void FilePiece::MMapShift(off_t desired_begin) {    // Use mmap.      off_t ignore = desired_begin % page_;    // Duplicate request for Shift means give more data.   @@ -259,25 +259,23 @@ void FilePiece::MMapShift(off_t desired_begin) throw() {    progress_.Set(desired_begin);  } -void FilePiece::TransitionToRead() throw (GZException) { +void FilePiece::TransitionToRead() {    assert(!fallback_to_read_);    fallback_to_read_ = true;    data_.reset();    data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); -  if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_); +  UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_);    position_ = data_.begin();    position_end_ = position_;  #ifdef HAVE_ZLIB    assert(!gz_file_);    gz_file_ = gzdopen(file_.get(), "r"); -  if (!gz_file_) { -    UTIL_THROW(GZException, "zlib failed to open " << file_name_); -  } +  UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);  #endif  } -void FilePiece::ReadShift() throw(GZException, EndOfFileException) { +void FilePiece::ReadShift() {    assert(fallback_to_read_);    // Bytes [data_.begin(), position_) have been consumed.      // Bytes [position_, position_end_) have been read into the buffer.   @@ -297,7 +295,7 @@ void FilePiece::ReadShift() throw(GZException, EndOfFileException) {        std::size_t valid_length = position_end_ - position_;        default_map_size_ *= 2;        data_.call_realloc(default_map_size_); -      if (!data_.get()) UTIL_THROW(ErrnoException, "realloc failed for " << default_map_size_); +      UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_);        position_ = data_.begin();        position_end_ = position_ + valid_length;      } else { @@ -320,7 +318,7 @@ void FilePiece::ReadShift() throw(GZException, EndOfFileException) {    }  #else    read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); -  if (read_return == -1) UTIL_THROW(ErrnoException, "read failed"); +  UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed");    progress_.Set(mapped_offset_);  #endif    if (read_return == 0) { diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 870ae5a3..a5c00910 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -45,13 +45,13 @@ off_t SizeFile(int fd);  class FilePiece {    public:      // 32 MB default. -    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException); +    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);      // Takes ownership of fd.  name is used for messages.   -    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException); +    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);      ~FilePiece(); -    char get() throw(GZException, EndOfFileException) {  +    char get() {         if (position_ == position_end_) {          Shift();          if (at_end_) throw EndOfFileException(); @@ -60,22 +60,22 @@ class FilePiece {      }      // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().   -    StringPiece ReadDelimited(const bool *delim = kSpaces) throw(GZException, EndOfFileException) { +    StringPiece ReadDelimited(const bool *delim = kSpaces) {        SkipSpaces(delim);        return Consume(FindDelimiterOrEOF(delim));      }      // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.      // It is similar to getline in that way.   -    StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException); +    StringPiece ReadLine(char delim = '\n'); -    float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException); -    double ReadDouble() throw(GZException, EndOfFileException, ParseNumberException); -    long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException); -    unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException); +    float ReadFloat(); +    double ReadDouble(); +    long int ReadLong(); +    unsigned long int ReadULong();      // Skip spaces defined by isspace.   -    void SkipSpaces(const bool *delim = kSpaces) throw (GZException, EndOfFileException) { +    void SkipSpaces(const bool *delim = kSpaces) {        for (; ; ++position_) {          if (position_ == position_end_) Shift();          if (!delim[static_cast<unsigned char>(*position_)]) return; @@ -89,9 +89,9 @@ class FilePiece {      const std::string &FileName() const { return file_name_; }    private: -    void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException); +    void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); -    template <class T> T ReadNumber() throw(GZException, EndOfFileException, ParseNumberException); +    template <class T> T ReadNumber();      StringPiece Consume(const char *to) {        StringPiece ret(position_, to - position_); @@ -99,14 +99,14 @@ class FilePiece {        return ret;      } -    const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException); +    const char *FindDelimiterOrEOF(const bool *delim = kSpaces); -    void Shift() throw (EndOfFileException, GZException); +    void Shift();      // Backends to Shift(). -    void MMapShift(off_t desired_begin) throw (); +    void MMapShift(off_t desired_begin); -    void TransitionToRead() throw (GZException); -    void ReadShift() throw (GZException, EndOfFileException); +    void TransitionToRead(); +    void ReadShift();      const char *position_, *last_space_, *position_end_; diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index d58a0727..fec47fd9 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -1,129 +1,129 @@ -/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
 - * code is released to the public domain. For business purposes, Murmurhash is
 - * under the MIT license."
 - * This is modified from the original:
 - * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.  
 - * length changed to unsigned int.  
 - * placed in namespace util
 - * add MurmurHashNative
 - * default option = 0 for seed
 - */
 -
 -#include "util/murmur_hash.hh"
 -
 -namespace util {
 -
 -//-----------------------------------------------------------------------------
 -// MurmurHash2, 64-bit versions, by Austin Appleby
 -
 -// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment 
 -// and endian-ness issues if used across multiple platforms.
 -
 -// 64-bit hash for 64-bit platforms
 -
 -uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
 -{
 -  const uint64_t m = 0xc6a4a7935bd1e995ULL;
 -  const int r = 47;
 -
 -  uint64_t h = seed ^ (len * m);
 -
 -  const uint64_t * data = (const uint64_t *)key;
 -  const uint64_t * end = data + (len/8);
 -
 -  while(data != end)
 -  {
 -    uint64_t k = *data++;
 -
 -    k *= m; 
 -    k ^= k >> r; 
 -    k *= m; 
 -    
 -    h ^= k;
 -    h *= m; 
 -  }
 -
 -  const unsigned char * data2 = (const unsigned char*)data;
 -
 -  switch(len & 7)
 -  {
 -  case 7: h ^= uint64_t(data2[6]) << 48;
 -  case 6: h ^= uint64_t(data2[5]) << 40;
 -  case 5: h ^= uint64_t(data2[4]) << 32;
 -  case 4: h ^= uint64_t(data2[3]) << 24;
 -  case 3: h ^= uint64_t(data2[2]) << 16;
 -  case 2: h ^= uint64_t(data2[1]) << 8;
 -  case 1: h ^= uint64_t(data2[0]);
 -          h *= m;
 -  };
 - 
 -  h ^= h >> r;
 -  h *= m;
 -  h ^= h >> r;
 -
 -  return h;
 -} 
 -
 -
 -// 64-bit hash for 32-bit platforms
 -
 -uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
 -{
 -  const unsigned int m = 0x5bd1e995;
 -  const int r = 24;
 -
 -  unsigned int h1 = seed ^ len;
 -  unsigned int h2 = 0;
 -
 -  const unsigned int * data = (const unsigned int *)key;
 -
 -  while(len >= 8)
 -  {
 -    unsigned int k1 = *data++;
 -    k1 *= m; k1 ^= k1 >> r; k1 *= m;
 -    h1 *= m; h1 ^= k1;
 -    len -= 4;
 -
 -    unsigned int k2 = *data++;
 -    k2 *= m; k2 ^= k2 >> r; k2 *= m;
 -    h2 *= m; h2 ^= k2;
 -    len -= 4;
 -  }
 -
 -  if(len >= 4)
 -  {
 -    unsigned int k1 = *data++;
 -    k1 *= m; k1 ^= k1 >> r; k1 *= m;
 -    h1 *= m; h1 ^= k1;
 -    len -= 4;
 -  }
 -
 -  switch(len)
 -  {
 -  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
 -  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
 -  case 1: h2 ^= ((unsigned char*)data)[0];
 -      h2 *= m;
 -  };
 -
 -  h1 ^= h2 >> 18; h1 *= m;
 -  h2 ^= h1 >> 22; h2 *= m;
 -  h1 ^= h2 >> 17; h1 *= m;
 -  h2 ^= h1 >> 19; h2 *= m;
 -
 -  uint64_t h = h1;
 -
 -  h = (h << 32) | h2;
 -
 -  return h;
 -}
 -
 -uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
 -  if (sizeof(int) == 4) {
 -    return MurmurHash64B(key, len, seed);
 -  } else {
 -    return MurmurHash64A(key, len, seed);
 -  }
 -}
 -
 -} // namespace util
 +/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All + * code is released to the public domain. For business purposes, Murmurhash is + * under the MIT license." + * This is modified from the original: + * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.   + * length changed to unsigned int.   + * placed in namespace util + * add MurmurHashNative + * default option = 0 for seed + */ + +#include "util/murmur_hash.hh" + +namespace util { + +//----------------------------------------------------------------------------- +// MurmurHash2, 64-bit versions, by Austin Appleby + +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment  +// and endian-ness issues if used across multiple platforms. + +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) +{ +  const uint64_t m = 0xc6a4a7935bd1e995ULL; +  const int r = 47; + +  uint64_t h = seed ^ (len * m); + +  const uint64_t * data = (const uint64_t *)key; +  const uint64_t * end = data + (len/8); + +  while(data != end) +  { +    uint64_t k = *data++; + +    k *= m;  +    k ^= k >> r;  +    k *= m;  +     +    h ^= k; +    h *= m;  +  } + +  const unsigned char * data2 = (const unsigned char*)data; + +  switch(len & 7) +  { +  case 7: h ^= uint64_t(data2[6]) << 48; +  case 6: h ^= uint64_t(data2[5]) << 40; +  case 5: h ^= uint64_t(data2[4]) << 32; +  case 4: h ^= uint64_t(data2[3]) << 24; +  case 3: h ^= uint64_t(data2[2]) << 16; +  case 2: h ^= uint64_t(data2[1]) << 8; +  case 1: h ^= uint64_t(data2[0]); +          h *= m; +  }; +  +  h ^= h >> r; +  h *= m; +  h ^= h >> r; + +  return h; +}  + + +// 64-bit hash for 32-bit platforms + +uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) +{ +  const unsigned int m = 0x5bd1e995; +  const int r = 24; + +  unsigned int h1 = seed ^ len; +  unsigned int h2 = 0; + +  const unsigned int * data = (const unsigned int *)key; + +  while(len >= 8) +  { +    unsigned int k1 = *data++; +    k1 *= m; k1 ^= k1 >> r; k1 *= m; +    h1 *= m; h1 ^= k1; +    len -= 4; + +    unsigned int k2 = *data++; +    k2 *= m; k2 ^= k2 >> r; k2 *= m; +    h2 *= m; h2 ^= k2; +    len -= 4; +  } + +  if(len >= 4) +  { +    unsigned int k1 = *data++; +    k1 *= m; k1 ^= k1 >> r; k1 *= m; +    h1 *= m; h1 ^= k1; +    len -= 4; +  } + +  switch(len) +  { +  case 3: h2 ^= ((unsigned char*)data)[2] << 16; +  case 2: h2 ^= ((unsigned char*)data)[1] << 8; +  case 1: h2 ^= ((unsigned char*)data)[0]; +      h2 *= m; +  }; + +  h1 ^= h2 >> 18; h1 *= m; +  h2 ^= h1 >> 22; h2 *= m; +  h1 ^= h2 >> 17; h1 *= m; +  h2 ^= h1 >> 19; h2 *= m; + +  uint64_t h = h1; + +  h = (h << 32) | h2; + +  return h; +} + +uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) { +  if (sizeof(int) == 4) { +    return MurmurHash64B(key, len, seed); +  } else { +    return MurmurHash64A(key, len, seed); +  } +} + +} // namespace util diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 00be0ed7..2ec342a6 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -57,7 +57,7 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac          equal_(equal_func),          entries_(0)  #ifdef DEBUG -        , initialized_(true), +        , initialized_(true)  #endif      {} diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh index 84d7aa02..0d6ecbbd 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -12,7 +12,7 @@ namespace util {  template <class T> class IdentityAccessor {    public:      typedef T Key; -    T operator()(const uint64_t *in) const { return *in; } +    T operator()(const T *in) const { return *in; }  };  struct Pivot64 { @@ -101,6 +101,27 @@ template <class Iterator, class Accessor, class Pivot> bool SortedUniformFind(co    return BoundedSortedUniformFind<Iterator, Accessor, Pivot>(accessor, begin, below, end, above, key, out);  } +// May return begin - 1. +template <class Iterator, class Accessor> Iterator BinaryBelow( +    const Accessor &accessor, +    Iterator begin, +    Iterator end, +    const typename Accessor::Key key) { +  while (end > begin) { +    Iterator pivot(begin + (end - begin) / 2); +    typename Accessor::Key mid(accessor(pivot)); +    if (mid < key) { +      begin = pivot + 1; +    } else if (mid > key) { +      end = pivot; +    } else { +      for (++pivot; (pivot < end) && accessor(pivot) == mid; ++pivot) {} +      return pivot - 1; +    } +  } +  return begin - 1; +} +  // To use this template, you need to define a Pivot function to match Key.    template <class PackingT> class SortedUniformMap {    public: | 
