From edb0cc0cbae1e75e4aeedb6360eab325effe6573 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 9 Sep 2011 15:33:35 +0200
Subject: partial merge, ruleid feature
---
klm/util/bit_packing.hh | 13 ++-
klm/util/exception.cc | 28 +++++
klm/util/exception.hh | 56 ++++++++-
klm/util/file_piece.cc | 42 ++++---
klm/util/file_piece.hh | 34 +++---
klm/util/murmur_hash.cc | 258 ++++++++++++++++++++---------------------
klm/util/probing_hash_table.hh | 2 +-
klm/util/sorted_uniform.hh | 23 +++-
8 files changed, 281 insertions(+), 175 deletions(-)
(limited to 'klm/util')
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index b35d80c8..9f47d559 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -107,9 +107,20 @@ void BitPackingSanity();
uint8_t RequiredBits(uint64_t max_value);
struct BitsMask {
+ static BitsMask ByMax(uint64_t max_value) {
+ BitsMask ret;
+ ret.FromMax(max_value);
+ return ret;
+ }
+ static BitsMask ByBits(uint8_t bits) {
+ BitsMask ret;
+ ret.bits = bits;
+ ret.mask = (1ULL << bits) - 1;
+ return ret;
+ }
void FromMax(uint64_t max_value) {
bits = RequiredBits(max_value);
- mask = (1 << bits) - 1;
+ mask = (1ULL << bits) - 1;
}
uint8_t bits;
uint64_t mask;
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 84f9fe7c..62280970 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -1,5 +1,9 @@
#include "util/exception.hh"
+#ifdef __GXX_RTTI
+#include
+#endif
+
#include
#include
@@ -22,6 +26,30 @@ const char *Exception::what() const throw() {
return text_.c_str();
}
+void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
+ /* The child class might have set some text, but we want this to come first.
+ * Another option would be passing this information to the constructor, but
+ * then child classes would have to accept constructor arguments and pass
+ * them down.
+ */
+ text_ = stream_.str();
+ stream_.str("");
+ stream_ << file << ':' << line;
+ if (func) stream_ << " in " << func << " threw ";
+ if (child_name) {
+ stream_ << child_name;
+ } else {
+#ifdef __GXX_RTTI
+ stream_ << typeid(this).name();
+#else
+ stream_ << "an exception";
+#endif
+ }
+ if (condition) stream_ << " because `" << condition;
+ stream_ << "'.\n";
+ stream_ << text_;
+}
+
namespace {
// The XOPEN version.
const char *HandleStrerror(int ret, const char *buf) {
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
index c6936914..81675a57 100644
--- a/klm/util/exception.hh
+++ b/klm/util/exception.hh
@@ -1,8 +1,6 @@
#ifndef UTIL_EXCEPTION__
#define UTIL_EXCEPTION__
-#include "util/string_piece.hh"
-
#include
#include
#include
@@ -22,6 +20,14 @@ class Exception : public std::exception {
// Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe.
const char *what() const throw();
+ // For use by the UTIL_THROW macros.
+ void SetLocation(
+ const char *file,
+ unsigned int line,
+ const char *func,
+ const char *child_name,
+ const char *condition);
+
private:
template friend typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data);
@@ -43,7 +49,49 @@ template typename Except::template ExceptionTag();
}
-double FilePiece::ReadDouble() throw(GZException, EndOfFileException, ParseNumberException) {
+double FilePiece::ReadDouble() {
return ReadNumber();
}
-long int FilePiece::ReadLong() throw(GZException, EndOfFileException, ParseNumberException) {
+long int FilePiece::ReadLong() {
return ReadNumber();
}
-unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, ParseNumberException) {
+unsigned long int FilePiece::ReadULong() {
return ReadNumber();
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
#endif
@@ -163,7 +163,7 @@ void ParseNumber(const char *begin, char *&end, unsigned long int &out) {
}
} // namespace
-template T FilePiece::ReadNumber() throw(GZException, EndOfFileException, ParseNumberException) {
+template T FilePiece::ReadNumber() {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
@@ -186,7 +186,7 @@ template T FilePiece::ReadNumber() throw(GZException, EndOfFileExcepti
return ret;
}
-const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException, EndOfFileException) {
+const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
@@ -201,7 +201,7 @@ const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException,
}
}
-void FilePiece::Shift() throw(GZException, EndOfFileException) {
+void FilePiece::Shift() {
if (at_end_) {
progress_.Finished();
throw EndOfFileException();
@@ -217,7 +217,7 @@ void FilePiece::Shift() throw(GZException, EndOfFileException) {
}
}
-void FilePiece::MMapShift(off_t desired_begin) throw() {
+void FilePiece::MMapShift(off_t desired_begin) {
// Use mmap.
off_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
@@ -259,25 +259,23 @@ void FilePiece::MMapShift(off_t desired_begin) throw() {
progress_.Set(desired_begin);
}
-void FilePiece::TransitionToRead() throw (GZException) {
+void FilePiece::TransitionToRead() {
assert(!fallback_to_read_);
fallback_to_read_ = true;
data_.reset();
data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
- if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
+ UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_);
position_ = data_.begin();
position_end_ = position_;
#ifdef HAVE_ZLIB
assert(!gz_file_);
gz_file_ = gzdopen(file_.get(), "r");
- if (!gz_file_) {
- UTIL_THROW(GZException, "zlib failed to open " << file_name_);
- }
+ UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
#endif
}
-void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
+void FilePiece::ReadShift() {
assert(fallback_to_read_);
// Bytes [data_.begin(), position_) have been consumed.
// Bytes [position_, position_end_) have been read into the buffer.
@@ -297,7 +295,7 @@ void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
std::size_t valid_length = position_end_ - position_;
default_map_size_ *= 2;
data_.call_realloc(default_map_size_);
- if (!data_.get()) UTIL_THROW(ErrnoException, "realloc failed for " << default_map_size_);
+ UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_);
position_ = data_.begin();
position_end_ = position_ + valid_length;
} else {
@@ -320,7 +318,7 @@ void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
}
#else
read_return = read(file_.get(), static_cast(data_.get()) + already_read, default_map_size_ - already_read);
- if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
+ UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed");
progress_.Set(mapped_offset_);
#endif
if (read_return == 0) {
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index 870ae5a3..a5c00910 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -45,13 +45,13 @@ off_t SizeFile(int fd);
class FilePiece {
public:
// 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
// Takes ownership of fd. name is used for messages.
- explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
~FilePiece();
- char get() throw(GZException, EndOfFileException) {
+ char get() {
if (position_ == position_end_) {
Shift();
if (at_end_) throw EndOfFileException();
@@ -60,22 +60,22 @@ class FilePiece {
}
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
- StringPiece ReadDelimited(const bool *delim = kSpaces) throw(GZException, EndOfFileException) {
+ StringPiece ReadDelimited(const bool *delim = kSpaces) {
SkipSpaces(delim);
return Consume(FindDelimiterOrEOF(delim));
}
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
// It is similar to getline in that way.
- StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
+ StringPiece ReadLine(char delim = '\n');
- float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException);
- double ReadDouble() throw(GZException, EndOfFileException, ParseNumberException);
- long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException);
- unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException);
+ float ReadFloat();
+ double ReadDouble();
+ long int ReadLong();
+ unsigned long int ReadULong();
// Skip spaces defined by isspace.
- void SkipSpaces(const bool *delim = kSpaces) throw (GZException, EndOfFileException) {
+ void SkipSpaces(const bool *delim = kSpaces) {
for (; ; ++position_) {
if (position_ == position_end_) Shift();
if (!delim[static_cast(*position_)]) return;
@@ -89,9 +89,9 @@ class FilePiece {
const std::string &FileName() const { return file_name_; }
private:
- void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException);
+ void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
- template T ReadNumber() throw(GZException, EndOfFileException, ParseNumberException);
+ template T ReadNumber();
StringPiece Consume(const char *to) {
StringPiece ret(position_, to - position_);
@@ -99,14 +99,14 @@ class FilePiece {
return ret;
}
- const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException);
+ const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
- void Shift() throw (EndOfFileException, GZException);
+ void Shift();
// Backends to Shift().
- void MMapShift(off_t desired_begin) throw ();
+ void MMapShift(off_t desired_begin);
- void TransitionToRead() throw (GZException);
- void ReadShift() throw (GZException, EndOfFileException);
+ void TransitionToRead();
+ void ReadShift();
const char *position_, *last_space_, *position_end_;
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index d58a0727..fec47fd9 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -1,129 +1,129 @@
-/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
- * code is released to the public domain. For business purposes, Murmurhash is
- * under the MIT license."
- * This is modified from the original:
- * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
- * length changed to unsigned int.
- * placed in namespace util
- * add MurmurHashNative
- * default option = 0 for seed
- */
-
-#include "util/murmur_hash.hh"
-
-namespace util {
-
-//-----------------------------------------------------------------------------
-// MurmurHash2, 64-bit versions, by Austin Appleby
-
-// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
-// and endian-ness issues if used across multiple platforms.
-
-// 64-bit hash for 64-bit platforms
-
-uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
-{
- const uint64_t m = 0xc6a4a7935bd1e995ULL;
- const int r = 47;
-
- uint64_t h = seed ^ (len * m);
-
- const uint64_t * data = (const uint64_t *)key;
- const uint64_t * end = data + (len/8);
-
- while(data != end)
- {
- uint64_t k = *data++;
-
- k *= m;
- k ^= k >> r;
- k *= m;
-
- h ^= k;
- h *= m;
- }
-
- const unsigned char * data2 = (const unsigned char*)data;
-
- switch(len & 7)
- {
- case 7: h ^= uint64_t(data2[6]) << 48;
- case 6: h ^= uint64_t(data2[5]) << 40;
- case 5: h ^= uint64_t(data2[4]) << 32;
- case 4: h ^= uint64_t(data2[3]) << 24;
- case 3: h ^= uint64_t(data2[2]) << 16;
- case 2: h ^= uint64_t(data2[1]) << 8;
- case 1: h ^= uint64_t(data2[0]);
- h *= m;
- };
-
- h ^= h >> r;
- h *= m;
- h ^= h >> r;
-
- return h;
-}
-
-
-// 64-bit hash for 32-bit platforms
-
-uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
-{
- const unsigned int m = 0x5bd1e995;
- const int r = 24;
-
- unsigned int h1 = seed ^ len;
- unsigned int h2 = 0;
-
- const unsigned int * data = (const unsigned int *)key;
-
- while(len >= 8)
- {
- unsigned int k1 = *data++;
- k1 *= m; k1 ^= k1 >> r; k1 *= m;
- h1 *= m; h1 ^= k1;
- len -= 4;
-
- unsigned int k2 = *data++;
- k2 *= m; k2 ^= k2 >> r; k2 *= m;
- h2 *= m; h2 ^= k2;
- len -= 4;
- }
-
- if(len >= 4)
- {
- unsigned int k1 = *data++;
- k1 *= m; k1 ^= k1 >> r; k1 *= m;
- h1 *= m; h1 ^= k1;
- len -= 4;
- }
-
- switch(len)
- {
- case 3: h2 ^= ((unsigned char*)data)[2] << 16;
- case 2: h2 ^= ((unsigned char*)data)[1] << 8;
- case 1: h2 ^= ((unsigned char*)data)[0];
- h2 *= m;
- };
-
- h1 ^= h2 >> 18; h1 *= m;
- h2 ^= h1 >> 22; h2 *= m;
- h1 ^= h2 >> 17; h1 *= m;
- h2 ^= h1 >> 19; h2 *= m;
-
- uint64_t h = h1;
-
- h = (h << 32) | h2;
-
- return h;
-}
-
-uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
- if (sizeof(int) == 4) {
- return MurmurHash64B(key, len, seed);
- } else {
- return MurmurHash64A(key, len, seed);
- }
-}
-
-} // namespace util
+/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
+ * code is released to the public domain. For business purposes, Murmurhash is
+ * under the MIT license."
+ * This is modified from the original:
+ * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
+ * length changed to unsigned int.
+ * placed in namespace util
+ * add MurmurHashNative
+ * default option = 0 for seed
+ */
+
+#include "util/murmur_hash.hh"
+
+namespace util {
+
+//-----------------------------------------------------------------------------
+// MurmurHash2, 64-bit versions, by Austin Appleby
+
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+
+// 64-bit hash for 64-bit platforms
+
+uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
+{
+ const uint64_t m = 0xc6a4a7935bd1e995ULL;
+ const int r = 47;
+
+ uint64_t h = seed ^ (len * m);
+
+ const uint64_t * data = (const uint64_t *)key;
+ const uint64_t * end = data + (len/8);
+
+ while(data != end)
+ {
+ uint64_t k = *data++;
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h ^= k;
+ h *= m;
+ }
+
+ const unsigned char * data2 = (const unsigned char*)data;
+
+ switch(len & 7)
+ {
+ case 7: h ^= uint64_t(data2[6]) << 48;
+ case 6: h ^= uint64_t(data2[5]) << 40;
+ case 5: h ^= uint64_t(data2[4]) << 32;
+ case 4: h ^= uint64_t(data2[3]) << 24;
+ case 3: h ^= uint64_t(data2[2]) << 16;
+ case 2: h ^= uint64_t(data2[1]) << 8;
+ case 1: h ^= uint64_t(data2[0]);
+ h *= m;
+ };
+
+ h ^= h >> r;
+ h *= m;
+ h ^= h >> r;
+
+ return h;
+}
+
+
+// 64-bit hash for 32-bit platforms
+
+uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
+{
+ const unsigned int m = 0x5bd1e995;
+ const int r = 24;
+
+ unsigned int h1 = seed ^ len;
+ unsigned int h2 = 0;
+
+ const unsigned int * data = (const unsigned int *)key;
+
+ while(len >= 8)
+ {
+ unsigned int k1 = *data++;
+ k1 *= m; k1 ^= k1 >> r; k1 *= m;
+ h1 *= m; h1 ^= k1;
+ len -= 4;
+
+ unsigned int k2 = *data++;
+ k2 *= m; k2 ^= k2 >> r; k2 *= m;
+ h2 *= m; h2 ^= k2;
+ len -= 4;
+ }
+
+ if(len >= 4)
+ {
+ unsigned int k1 = *data++;
+ k1 *= m; k1 ^= k1 >> r; k1 *= m;
+ h1 *= m; h1 ^= k1;
+ len -= 4;
+ }
+
+ switch(len)
+ {
+ case 3: h2 ^= ((unsigned char*)data)[2] << 16;
+ case 2: h2 ^= ((unsigned char*)data)[1] << 8;
+ case 1: h2 ^= ((unsigned char*)data)[0];
+ h2 *= m;
+ };
+
+ h1 ^= h2 >> 18; h1 *= m;
+ h2 ^= h1 >> 22; h2 *= m;
+ h1 ^= h2 >> 17; h1 *= m;
+ h2 ^= h1 >> 19; h2 *= m;
+
+ uint64_t h = h1;
+
+ h = (h << 32) | h2;
+
+ return h;
+}
+
+uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
+ if (sizeof(int) == 4) {
+ return MurmurHash64B(key, len, seed);
+ } else {
+ return MurmurHash64A(key, len, seed);
+ }
+}
+
+} // namespace util
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 00be0ed7..2ec342a6 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -57,7 +57,7 @@ template class IdentityAccessor {
public:
typedef T Key;
- T operator()(const uint64_t *in) const { return *in; }
+ T operator()(const T *in) const { return *in; }
};
struct Pivot64 {
@@ -101,6 +101,27 @@ template bool SortedUniformFind(co
return BoundedSortedUniformFind(accessor, begin, below, end, above, key, out);
}
+// May return begin - 1.
+template Iterator BinaryBelow(
+ const Accessor &accessor,
+ Iterator begin,
+ Iterator end,
+ const typename Accessor::Key key) {
+ while (end > begin) {
+ Iterator pivot(begin + (end - begin) / 2);
+ typename Accessor::Key mid(accessor(pivot));
+ if (mid < key) {
+ begin = pivot + 1;
+ } else if (mid > key) {
+ end = pivot;
+ } else {
+ for (++pivot; (pivot < end) && accessor(pivot) == mid; ++pivot) {}
+ return pivot - 1;
+ }
+ }
+ return begin - 1;
+}
+
// To use this template, you need to define a Pivot function to match Key.
template class SortedUniformMap {
public:
--
cgit v1.2.3