From 9d393298ce21159907884ea9b7318c52585409ee Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 22 Apr 2011 13:38:32 -0400 Subject: make compatible with FastSparseVector --- utils/fast_sparse_vector.h | 187 ++++++++++++++++++++++++++++++++++----------- utils/sparse_vector.h | 48 +++++++++--- utils/ts.cc | 18 ++++- 3 files changed, 197 insertions(+), 56 deletions(-) (limited to 'utils') diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index c4a44b99..b8b3df1f 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -7,6 +7,7 @@ // important: indexes are integers // important: iterators may return elements in any order +#include #include #include #include @@ -21,18 +22,35 @@ #define L2_CACHE_LINE 128 // this should just be a typedef to pair on the new c++ +// I have to avoid this since I want to use unions and c++-98 +// does not let unions have types with constructors in them +// this type bypasses default constructors. use with caution! +// this should work as long as T is in a acceptable state to +// have its destructor called when initialized with all zeros template struct PairIntT { - int first; - T second; const PairIntT& operator=(const std::pair& v) { - first = v.first; - second = v.second; + std::memcpy(this, &v, sizeof(PairIntT)); return *this; } operator const std::pair&() const { return *reinterpret_cast*>(this); } + int& first() { + return reinterpret_cast*>(this)->first; + } + T& second() { + return reinterpret_cast*>(this)->second; + } + const int& first() const { + return reinterpret_cast*>(this)->first; + } + const T& second() const { + return reinterpret_cast*>(this)->second; + } + private: + // very bad way of bypassing the default constructor on T + char data_[sizeof(std::pair)]; }; BOOST_STATIC_ASSERT(sizeof(PairIntT) == sizeof(std::pair)); @@ -40,7 +58,7 @@ template class FastSparseVector { public: struct const_iterator { - const_iterator(const FastSparseVector& v, const bool is_end) : local_(v.is_local_) { + const_iterator(const FastSparseVector& v, const bool is_end) : local_(!v.is_remote_) { if (local_) { local_it_ = &v.data_.local[is_end ? v.local_size_ : 0]; } else { @@ -85,43 +103,89 @@ class FastSparseVector { } }; public: - FastSparseVector() : local_size_(0), is_local_(true) {} + FastSparseVector() : local_size_(0), is_remote_(false) { std::memset(&data_, 0, sizeof(data_)); } + explicit FastSparseVector(const std::vector& init) : local_size_(0), is_remote_(false) { + for (int i = 0; i < init.size(); ++i) set_value(i, init[i]); + } ~FastSparseVector() { - if (!is_local_) delete data_.rbmap; + clear(); } FastSparseVector(const FastSparseVector& other) { std::memcpy(this, &other, sizeof(FastSparseVector)); - if (is_local_) return; - data_.rbmap = new std::map(*data_.rbmap); + if (is_remote_) data_.rbmap = new std::map(*data_.rbmap); + } + void erase(int k) { + if (is_remote_) { + data_.rbmap->erase(k); + } else { + for (int i = 0; i < local_size_; ++i) { + if (data_.local[i].first() == k) { + for (int j = i+1; j < local_size_; ++j) { + data_.local[j-1].first() = data_.local[j].first(); + data_.local[j-1].second() = data_.local[j].second(); + } + } + } + } } const FastSparseVector& operator=(const FastSparseVector& other) { - if (!is_local_) delete data_.rbmap; + if (is_remote_) delete data_.rbmap; std::memcpy(this, &other, sizeof(FastSparseVector)); - if (is_local_) return *this; - data_.rbmap = new std::map(*data_.rbmap); + if (is_remote_) + data_.rbmap = new std::map(*data_.rbmap); return *this; } + T const& get_singleton() const { + assert(size()==1); + return begin()->second; + } + bool nonzero(int k) const { + return static_cast(value(k)); + } inline void set_value(int k, const T& v) { get_or_create_bin(k) = v; } + inline T& add_value(int k, const T& v) { + return get_or_create_bin(k) += v; + } + inline T get(int k) const { + return value(k); + } inline T value(int k) const { - if (is_local_) { + if (is_remote_) { + typename std::map::const_iterator it = data_.rbmap->find(k); + if (it != data_.rbmap->end()) return it->second; + } else { for (int i = 0; i < local_size_; ++i) { const PairIntT& p = data_.local[i]; - if (p.first == k) return p.second; + if (p.first() == k) return p.second(); } - } else { - typename std::map::const_iterator it = data_.rbmap->find(k); - if (it != data_.rbmap->end()) return it->second; } return T(); } + T l2norm_sq() const { + T sum = T(); + for (const_iterator it = begin(), e = end(); it != e; ++it) + sum += it->second * it->second; + return sum; + } + T l2norm() const { + return sqrt(l2norm_sq()); + } + // if values are binary, gives |A intersect B|/|A union B| + template + S tanimoto_coef(const FastSparseVector &vec) const { + const S dp=dot(vec); + return dp/(l2norm_sq()+vec.l2norm_sq()-dp); + } inline size_t size() const { - if (is_local_) return local_size_; - return data_.rbmap->size(); + if (is_remote_) + return data_.rbmap->size(); + else + return local_size_; } inline void clear() { - if (!is_local_) delete data_.rbmap; + if (is_remote_) delete data_.rbmap; local_size_ = 0; } inline bool empty() const { @@ -135,6 +199,14 @@ class FastSparseVector { } return *this; } + template + inline FastSparseVector& operator+=(const FastSparseVector& other) { + const typename FastSparseVector::const_iterator end = other.end(); + for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) { + get_or_create_bin(it->first) += it->second; + } + return *this; + } inline FastSparseVector& operator-=(const FastSparseVector& other) { const typename FastSparseVector::const_iterator end = other.end(); for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) { @@ -143,24 +215,24 @@ class FastSparseVector { return *this; } inline FastSparseVector& operator*=(const T& scalar) { - if (is_local_) { - for (int i = 0; i < local_size_; ++i) - data_.local[i].second *= scalar; - } else { + if (is_remote_) { const typename std::map::iterator end = data_.rbmap->end(); for (typename std::map::iterator it = data_.rbmap->begin(); it != end; ++it) it->second *= scalar; + } else { + for (int i = 0; i < local_size_; ++i) + data_.local[i].second() *= scalar; } return *this; } inline FastSparseVector& operator/=(const T& scalar) { - if (is_local_) { - for (int i = 0; i < local_size_; ++i) - data_.local[i].second /= scalar; - } else { + if (is_remote_) { const typename std::map::iterator end = data_.rbmap->end(); for (typename std::map::iterator it = data_.rbmap->begin(); it != end; ++it) it->second /= scalar; + } else { + for (int i = 0; i < local_size_; ++i) + data_.local[i].second() /= scalar; } return *this; } @@ -182,38 +254,57 @@ class FastSparseVector { T res = T(); for (const_iterator it = begin(), e = end(); it != e; ++it) if (it->first < v.size()) res += it->second * v[it->first]; + return res; + } + T dot(const FastSparseVector& other) const { + T res = T(); + for (const_iterator it = begin(), e = end(); it != e; ++it) + res += other.value(it->first) * it->second; + return res; + } + bool operator==(const FastSparseVector& other) const { + if (other.size() != size()) return false; + for (const_iterator it = begin(), e = end(); it != e; ++it) { + if (other.value(it->first) != it->second) return false; + } + return true; + } + void swap(FastSparseVector& other) { + char t[sizeof(data_)]; + std::swap(other.is_remote_, is_remote_); + std::swap(other.local_size_, local_size_); + std::memcpy(t, &other.data_, sizeof(data_)); + std::memcpy(&other.data_, &data_, sizeof(data_)); + std::memcpy(&data_, t, sizeof(data_)); } private: - inline T& extend_vector(std::vector &v,int i) { + static inline T& extend_vector(std::vector &v,int i) { if (i>=v.size()) v.resize(i+1); return v[i]; } inline T& get_or_create_bin(int k) { - if (is_local_) { - for (int i = 0; i < local_size_; ++i) - if (data_.local[i].first == k) return data_.local[i].second; - } else { + if (is_remote_) { return (*data_.rbmap)[k]; + } else { + for (int i = 0; i < local_size_; ++i) + if (data_.local[i].first() == k) return data_.local[i].second(); } - assert(is_local_); + assert(!is_remote_); // currently local! if (local_size_ < LOCAL_MAX) { PairIntT& p = data_.local[local_size_]; ++local_size_; - p.first = k; - return p.second; + p.first() = k; + p.second() = T(); + return p.second(); } else { swap_local_rbmap(); return (*data_.rbmap)[k]; } } void swap_local_rbmap() { - if (is_local_) { // data is local, move to rbmap - std::map* m = new std::map(&data_.local[0], &data_.local[local_size_]); - data_.rbmap = m; - is_local_ = false; - } else { // data is in rbmap, move to local + if (is_remote_) { // data is in rbmap, move to local assert(data_.rbmap->size() < LOCAL_MAX); const std::map* m = data_.rbmap; local_size_ = m->size(); @@ -223,7 +314,11 @@ class FastSparseVector { data_.local[i] = *it; ++i; } - is_local_ = true; + is_remote_ = false; + } else { // data is local, move to rbmap + std::map* m = new std::map(&data_.local[0], &data_.local[local_size_]); + data_.rbmap = m; + is_remote_ = true; } } @@ -232,7 +327,7 @@ class FastSparseVector { std::map* rbmap; } data_; unsigned char local_size_; - bool is_local_; + bool is_remote_; }; template @@ -261,6 +356,12 @@ const FastSparseVector operator-(const FastSparseVector& x, const FastSpar } } +template +std::size_t hash_value(FastSparseVector const& x) { + assert(!"not implemented"); + return 0; +} + namespace performance_checks { // if you get a failure on the next line, you should adjust LOCAL_MAX for // your architecture diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index de8c0291..dbeacbe8 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -645,6 +645,19 @@ SparseVector operator+(const SparseVector& a, const SparseVector& b) { return result += b; } +template +SparseVector operator*(const double& a, const SparseVector& b) { + SparseVector result = b; + return result *= a; +} + +#else + +#include "fast_sparse_vector.h" +#define SparseVector FastSparseVector + +#endif + template SparseVector operator*(const SparseVector& a, const double& b) { SparseVector result = a; @@ -658,23 +671,36 @@ SparseVector operator*(const SparseVector& a, const T& b) { } template -SparseVector operator*(const double& a, const SparseVector& b) { - SparseVector result = b; - return result *= a; +SparseVector operator/(const SparseVector& a, const double& b) { + SparseVector result = a; + return result *= b; } template -std::ostream &operator<<(std::ostream &out, const SparseVector &vec) -{ - return vec.operator<<(out); +SparseVector operator/(const SparseVector& a, const T& b) { + SparseVector result = a; + return result *= b; } -#else - -#include "fast_sparse_vector.h" -#define SparseVector FastSparseVector +template +inline void print(O &o,const SparseVector& v, const char* kvsep="=",const char* pairsep=" ",const char* pre="",const char* post="") { + o << pre; + bool first=true; + for (typename SparseVector::const_iterator i=v.begin(),e=v.end();i!=e;++i) { + if (first) + first=false; + else + o<first)<second; + } + o << post; +} -#endif +template +inline std::ostream& operator<<(std::ostream& out, const SparseVector& v) { + print(out, v); + return out; +} namespace B64 { void Encode(double objective, const SparseVector& v, std::ostream* out); diff --git a/utils/ts.cc b/utils/ts.cc index 28b5f9b1..563794c5 100644 --- a/utils/ts.cc +++ b/utils/ts.cc @@ -11,7 +11,7 @@ using namespace std; template -void Print(const T& x) { +void MPrint(const T& x) { typename T::const_iterator it = x.begin(); for (; it != x.end(); ++it) { cerr << it->first << ":" << it->second << " "; @@ -24,16 +24,30 @@ void test_unique() { T x; x.set_value(100, 1.0); x.set_value(100, 2.0); - Print(x); + MPrint(x); assert(x.size() == 1); assert(x.value(100) == 2.0); } +void test_logv() { + FastSparseVector x; + cerr << "FSV = " << sizeof(FastSparseVector) << endl; + x.set_value(999, prob_t(0.5)); + x.set_value(0, prob_t()); + x.set_value(1, prob_t(1)); + MPrint(x); + x += x; + MPrint(x); + x -= x; + MPrint(x); +} + int main() { cerr << sizeof(prob_t) << " " << sizeof(LogVal) << endl; cerr << " sizeof(FSV) = " << sizeof(FastSparseVector) << endl; cerr << "sizeof(FSV) = " << sizeof(FastSparseVector) << endl; test_unique >(); + test_logv(); // sranddev(); int c = 0; FastSparseVector p; -- cgit v1.2.3