#ifndef UTIL_PROBING_HASH_TABLE_H #define UTIL_PROBING_HASH_TABLE_H #include "util/exception.hh" #include "util/scoped.hh" #include <algorithm> #include <cstddef> #include <functional> #include <vector> #include <assert.h> #include <stdint.h> namespace util { /* Thrown when table grows too large */ class ProbingSizeException : public Exception { public: ProbingSizeException() throw() {} ~ProbingSizeException() throw() {} }; // std::identity is an SGI extension :-( struct IdentityHash { template <class T> T operator()(T arg) const { return arg; } }; template <class EntryT, class HashT, class EqualT> class AutoProbing; /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number * of elements, else it throws ProbingSizeException. * Memory management and initialization is externalized to make it easier to * serialize these to disk and load them quickly. * Uses linear probing to find value. * Only insert and lookup operations. */ template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable { public: typedef EntryT Entry; typedef typename Entry::Key Key; typedef const Entry *ConstIterator; typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; static uint64_t Size(uint64_t entries, float multiplier) { uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries))); return buckets * sizeof(Entry); } // Must be assigned to later. ProbingHashTable() : entries_(0) #ifdef DEBUG , initialized_(false) #endif {} ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : begin_(reinterpret_cast<MutableIterator>(start)), buckets_(allocated / sizeof(Entry)), end_(begin_ + buckets_), invalid_(invalid), hash_(hash_func), equal_(equal_func), entries_(0) #ifdef DEBUG , initialized_(true) #endif {} void Relocate(void *new_base) { begin_ = reinterpret_cast<MutableIterator>(new_base); end_ = begin_ + buckets_; } template <class T> MutableIterator Insert(const T &t) { #ifdef DEBUG assert(initialized_); #endif UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); return UncheckedInsert(t); } // Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map! template <class T> bool FindOrInsert(const T &t, MutableIterator &out) { #ifdef DEBUG assert(initialized_); #endif for (MutableIterator i = Ideal(t);;) { Key got(i->GetKey()); if (equal_(got, t.GetKey())) { out = i; return true; } if (equal_(got, invalid_)) { UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); *i = t; out = i; return false; } if (++i == end_) i = begin_; } } void FinishedInserting() {} // Don't change anything related to GetKey, template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { #ifdef DEBUG assert(initialized_); #endif for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { out = i; return true; } if (equal_(got, invalid_)) return false; if (++i == end_) i = begin_; } } // Like UnsafeMutableFind, but the key must be there. template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) { for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { return i; } assert(!equal_(got, invalid_)); if (++i == end_) i = begin_; } } template <class Key> bool Find(const Key key, ConstIterator &out) const { #ifdef DEBUG assert(initialized_); #endif for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { out = i; return true; } if (equal_(got, invalid_)) return false; if (++i == end_) i = begin_; } } // Like Find but we're sure it must be there. template <class Key> ConstIterator MustFind(const Key key) const { for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { return i; } assert(!equal_(got, invalid_)); if (++i == end_) i = begin_; } } void Clear() { Entry invalid; invalid.SetKey(invalid_); std::fill(begin_, end_, invalid); entries_ = 0; } // Return number of entries assuming no serialization went on. std::size_t SizeNoSerialization() const { return entries_; } // Return memory size expected by Double. std::size_t DoubleTo() const { return buckets_ * 2 * sizeof(Entry); } // Inform the table that it has double the amount of memory. // Pass clear_new = false if you are sure the new memory is initialized // properly (to invalid_) i.e. by mremap. void Double(void *new_base, bool clear_new = true) { begin_ = static_cast<MutableIterator>(new_base); MutableIterator old_end = begin_ + buckets_; buckets_ *= 2; end_ = begin_ + buckets_; if (clear_new) { Entry invalid; invalid.SetKey(invalid_); std::fill(old_end, end_, invalid); } std::vector<Entry> rolled_over; // Move roll-over entries to a buffer because they might not roll over anymore. This should be small. for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) { rolled_over.push_back(*i); i->SetKey(invalid_); } /* Re-insert everything. Entries might go backwards to take over a * recently opened gap, stay, move to new territory, or wrap around. If * an entry wraps around, it might go to a pointer greater than i (which * can happen at the beginning) and it will be revisited to possibly fill * in a gap created later. */ Entry temp; for (MutableIterator i = begin_; i != old_end; ++i) { if (!equal_(i->GetKey(), invalid_)) { temp = *i; i->SetKey(invalid_); UncheckedInsert(temp); } } // Put the roll-over entries back in. for (typename std::vector<Entry>::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) { UncheckedInsert(*i); } } // Mostly for tests, check consistency of every entry. void CheckConsistency() { MutableIterator last; for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {} UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full"); MutableIterator i; // Beginning can be wrap-arounds. for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) { MutableIterator ideal = Ideal(*i); UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_)); } MutableIterator pre_gap = i; for (; i != end_; ++i) { if (equal_(i->GetKey(), invalid_)) { pre_gap = i; continue; } MutableIterator ideal = Ideal(*i); UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_)); } } private: friend class AutoProbing<Entry, Hash, Equal>; template <class T> MutableIterator Ideal(const T &t) { return begin_ + (hash_(t.GetKey()) % buckets_); } template <class T> MutableIterator UncheckedInsert(const T &t) { for (MutableIterator i(Ideal(t));;) { if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } if (++i == end_) { i = begin_; } } } MutableIterator begin_; std::size_t buckets_; MutableIterator end_; Key invalid_; Hash hash_; Equal equal_; std::size_t entries_; #ifdef DEBUG bool initialized_; #endif }; // Resizable linear probing hash table. This owns the memory. template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing { private: typedef ProbingHashTable<EntryT, HashT, EqualT> Backend; public: static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) { return Backend::Size(size, multiplier); } typedef EntryT Entry; typedef typename Entry::Key Key; typedef const Entry *ConstIterator; typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) { threshold_ = initial_size * 1.2; Clear(); } // Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup. template <class T> MutableIterator Insert(const T &t) { DoubleIfNeeded(); return backend_.UncheckedInsert(t); } template <class T> bool FindOrInsert(const T &t, MutableIterator &out) { DoubleIfNeeded(); return backend_.FindOrInsert(t, out); } template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { return backend_.UnsafeMutableFind(key, out); } template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) { return backend_.UnsafeMutableMustFind(key); } template <class Key> bool Find(const Key key, ConstIterator &out) const { return backend_.Find(key, out); } template <class Key> ConstIterator MustFind(const Key key) const { return backend_.MustFind(key); } std::size_t Size() const { return backend_.SizeNoSerialization(); } void Clear() { backend_.Clear(); } private: void DoubleIfNeeded() { if (Size() < threshold_) return; mem_.call_realloc(backend_.DoubleTo()); allocated_ = backend_.DoubleTo(); backend_.Double(mem_.get()); threshold_ *= 2; } std::size_t allocated_; util::scoped_malloc mem_; Backend backend_; std::size_t threshold_; }; } // namespace util #endif // UTIL_PROBING_HASH_TABLE_H