summaryrefslogtreecommitdiff
path: root/utils/fast_sparse_vector.h
diff options
context:
space:
mode:
Diffstat (limited to 'utils/fast_sparse_vector.h')
-rw-r--r--utils/fast_sparse_vector.h187
1 files changed, 144 insertions, 43 deletions
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index c4a44b99..b8b3df1f 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -7,6 +7,7 @@
// important: indexes are integers
// important: iterators may return elements in any order
+#include <cmath>
#include <cstring>
#include <climits>
#include <map>
@@ -21,18 +22,35 @@
#define L2_CACHE_LINE 128
// this should just be a typedef to pair<int,T> on the new c++
+// I have to avoid this since I want to use unions and c++-98
+// does not let unions have types with constructors in them
+// this type bypasses default constructors. use with caution!
+// this should work as long as T is in a acceptable state to
+// have its destructor called when initialized with all zeros
template <typename T>
struct PairIntT {
- int first;
- T second;
const PairIntT& operator=(const std::pair<const int, T>& v) {
- first = v.first;
- second = v.second;
+ std::memcpy(this, &v, sizeof(PairIntT));
return *this;
}
operator const std::pair<const int, T>&() const {
return *reinterpret_cast<const std::pair<const int, T>*>(this);
}
+ int& first() {
+ return reinterpret_cast<std::pair<int, T>*>(this)->first;
+ }
+ T& second() {
+ return reinterpret_cast<std::pair<int, T>*>(this)->second;
+ }
+ const int& first() const {
+ return reinterpret_cast<const std::pair<int, T>*>(this)->first;
+ }
+ const T& second() const {
+ return reinterpret_cast<const std::pair<int, T>*>(this)->second;
+ }
+ private:
+ // very bad way of bypassing the default constructor on T
+ char data_[sizeof(std::pair<int, T>)];
};
BOOST_STATIC_ASSERT(sizeof(PairIntT<float>) == sizeof(std::pair<int,float>));
@@ -40,7 +58,7 @@ template <typename T, int LOCAL_MAX = (sizeof(T) == sizeof(float) ? 15 : 7)>
class FastSparseVector {
public:
struct const_iterator {
- const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(v.is_local_) {
+ const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(!v.is_remote_) {
if (local_) {
local_it_ = &v.data_.local[is_end ? v.local_size_ : 0];
} else {
@@ -85,43 +103,89 @@ class FastSparseVector {
}
};
public:
- FastSparseVector() : local_size_(0), is_local_(true) {}
+ FastSparseVector() : local_size_(0), is_remote_(false) { std::memset(&data_, 0, sizeof(data_)); }
+ explicit FastSparseVector(const std::vector<T>& init) : local_size_(0), is_remote_(false) {
+ for (int i = 0; i < init.size(); ++i) set_value(i, init[i]);
+ }
~FastSparseVector() {
- if (!is_local_) delete data_.rbmap;
+ clear();
}
FastSparseVector(const FastSparseVector& other) {
std::memcpy(this, &other, sizeof(FastSparseVector));
- if (is_local_) return;
- data_.rbmap = new std::map<int, T>(*data_.rbmap);
+ if (is_remote_) data_.rbmap = new std::map<int, T>(*data_.rbmap);
+ }
+ void erase(int k) {
+ if (is_remote_) {
+ data_.rbmap->erase(k);
+ } else {
+ for (int i = 0; i < local_size_; ++i) {
+ if (data_.local[i].first() == k) {
+ for (int j = i+1; j < local_size_; ++j) {
+ data_.local[j-1].first() = data_.local[j].first();
+ data_.local[j-1].second() = data_.local[j].second();
+ }
+ }
+ }
+ }
}
const FastSparseVector& operator=(const FastSparseVector& other) {
- if (!is_local_) delete data_.rbmap;
+ if (is_remote_) delete data_.rbmap;
std::memcpy(this, &other, sizeof(FastSparseVector));
- if (is_local_) return *this;
- data_.rbmap = new std::map<int, T>(*data_.rbmap);
+ if (is_remote_)
+ data_.rbmap = new std::map<int, T>(*data_.rbmap);
return *this;
}
+ T const& get_singleton() const {
+ assert(size()==1);
+ return begin()->second;
+ }
+ bool nonzero(int k) const {
+ return static_cast<bool>(value(k));
+ }
inline void set_value(int k, const T& v) {
get_or_create_bin(k) = v;
}
+ inline T& add_value(int k, const T& v) {
+ return get_or_create_bin(k) += v;
+ }
+ inline T get(int k) const {
+ return value(k);
+ }
inline T value(int k) const {
- if (is_local_) {
+ if (is_remote_) {
+ typename std::map<int, T>::const_iterator it = data_.rbmap->find(k);
+ if (it != data_.rbmap->end()) return it->second;
+ } else {
for (int i = 0; i < local_size_; ++i) {
const PairIntT<T>& p = data_.local[i];
- if (p.first == k) return p.second;
+ if (p.first() == k) return p.second();
}
- } else {
- typename std::map<int, T>::const_iterator it = data_.rbmap->find(k);
- if (it != data_.rbmap->end()) return it->second;
}
return T();
}
+ T l2norm_sq() const {
+ T sum = T();
+ for (const_iterator it = begin(), e = end(); it != e; ++it)
+ sum += it->second * it->second;
+ return sum;
+ }
+ T l2norm() const {
+ return sqrt(l2norm_sq());
+ }
+ // if values are binary, gives |A intersect B|/|A union B|
+ template<typename S>
+ S tanimoto_coef(const FastSparseVector<S> &vec) const {
+ const S dp=dot(vec);
+ return dp/(l2norm_sq()+vec.l2norm_sq()-dp);
+ }
inline size_t size() const {
- if (is_local_) return local_size_;
- return data_.rbmap->size();
+ if (is_remote_)
+ return data_.rbmap->size();
+ else
+ return local_size_;
}
inline void clear() {
- if (!is_local_) delete data_.rbmap;
+ if (is_remote_) delete data_.rbmap;
local_size_ = 0;
}
inline bool empty() const {
@@ -135,6 +199,14 @@ class FastSparseVector {
}
return *this;
}
+ template <typename O>
+ inline FastSparseVector& operator+=(const FastSparseVector<O>& other) {
+ const typename FastSparseVector<O>::const_iterator end = other.end();
+ for (typename FastSparseVector<O>::const_iterator it = other.begin(); it != end; ++it) {
+ get_or_create_bin(it->first) += it->second;
+ }
+ return *this;
+ }
inline FastSparseVector& operator-=(const FastSparseVector& other) {
const typename FastSparseVector::const_iterator end = other.end();
for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) {
@@ -143,24 +215,24 @@ class FastSparseVector {
return *this;
}
inline FastSparseVector& operator*=(const T& scalar) {
- if (is_local_) {
- for (int i = 0; i < local_size_; ++i)
- data_.local[i].second *= scalar;
- } else {
+ if (is_remote_) {
const typename std::map<int, T>::iterator end = data_.rbmap->end();
for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it)
it->second *= scalar;
+ } else {
+ for (int i = 0; i < local_size_; ++i)
+ data_.local[i].second() *= scalar;
}
return *this;
}
inline FastSparseVector& operator/=(const T& scalar) {
- if (is_local_) {
- for (int i = 0; i < local_size_; ++i)
- data_.local[i].second /= scalar;
- } else {
+ if (is_remote_) {
const typename std::map<int, T>::iterator end = data_.rbmap->end();
for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it)
it->second /= scalar;
+ } else {
+ for (int i = 0; i < local_size_; ++i)
+ data_.local[i].second() /= scalar;
}
return *this;
}
@@ -182,38 +254,57 @@ class FastSparseVector {
T res = T();
for (const_iterator it = begin(), e = end(); it != e; ++it)
if (it->first < v.size()) res += it->second * v[it->first];
+ return res;
+ }
+ T dot(const FastSparseVector<T>& other) const {
+ T res = T();
+ for (const_iterator it = begin(), e = end(); it != e; ++it)
+ res += other.value(it->first) * it->second;
+ return res;
+ }
+ bool operator==(const FastSparseVector<T>& other) const {
+ if (other.size() != size()) return false;
+ for (const_iterator it = begin(), e = end(); it != e; ++it) {
+ if (other.value(it->first) != it->second) return false;
+ }
+ return true;
+ }
+ void swap(FastSparseVector<T>& other) {
+ char t[sizeof(data_)];
+ std::swap(other.is_remote_, is_remote_);
+ std::swap(other.local_size_, local_size_);
+ std::memcpy(t, &other.data_, sizeof(data_));
+ std::memcpy(&other.data_, &data_, sizeof(data_));
+ std::memcpy(&data_, t, sizeof(data_));
}
private:
- inline T& extend_vector(std::vector<T> &v,int i) {
+ static inline T& extend_vector(std::vector<T> &v,int i) {
if (i>=v.size())
v.resize(i+1);
return v[i];
}
inline T& get_or_create_bin(int k) {
- if (is_local_) {
- for (int i = 0; i < local_size_; ++i)
- if (data_.local[i].first == k) return data_.local[i].second;
- } else {
+ if (is_remote_) {
return (*data_.rbmap)[k];
+ } else {
+ for (int i = 0; i < local_size_; ++i)
+ if (data_.local[i].first() == k) return data_.local[i].second();
}
- assert(is_local_);
+ assert(!is_remote_);
// currently local!
if (local_size_ < LOCAL_MAX) {
PairIntT<T>& p = data_.local[local_size_];
++local_size_;
- p.first = k;
- return p.second;
+ p.first() = k;
+ p.second() = T();
+ return p.second();
} else {
swap_local_rbmap();
return (*data_.rbmap)[k];
}
}
void swap_local_rbmap() {
- if (is_local_) { // data is local, move to rbmap
- std::map<int, T>* m = new std::map<int, T>(&data_.local[0], &data_.local[local_size_]);
- data_.rbmap = m;
- is_local_ = false;
- } else { // data is in rbmap, move to local
+ if (is_remote_) { // data is in rbmap, move to local
assert(data_.rbmap->size() < LOCAL_MAX);
const std::map<int, T>* m = data_.rbmap;
local_size_ = m->size();
@@ -223,7 +314,11 @@ class FastSparseVector {
data_.local[i] = *it;
++i;
}
- is_local_ = true;
+ is_remote_ = false;
+ } else { // data is local, move to rbmap
+ std::map<int, T>* m = new std::map<int, T>(&data_.local[0], &data_.local[local_size_]);
+ data_.rbmap = m;
+ is_remote_ = true;
}
}
@@ -232,7 +327,7 @@ class FastSparseVector {
std::map<int, T>* rbmap;
} data_;
unsigned char local_size_;
- bool is_local_;
+ bool is_remote_;
};
template <typename T>
@@ -261,6 +356,12 @@ const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSpar
}
}
+template <class T>
+std::size_t hash_value(FastSparseVector<T> const& x) {
+ assert(!"not implemented");
+ return 0;
+}
+
namespace performance_checks {
// if you get a failure on the next line, you should adjust LOCAL_MAX for
// your architecture