diff options
-rw-r--r-- | utils/fast_sparse_vector.h | 191 | ||||
-rwxr-xr-x | utils/feature_vector.h | 1 | ||||
-rw-r--r-- | utils/sparse_vector.cc | 3 | ||||
-rw-r--r-- | utils/sparse_vector.h | 9 | ||||
-rw-r--r-- | utils/ts.cc | 41 |
5 files changed, 216 insertions, 29 deletions
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 91bf441d..c4a44b99 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -1,47 +1,102 @@ #ifndef _FAST_SPARSE_VECTOR_H_ #define _FAST_SPARSE_VECTOR_H_ +// FastSparseVector<T> is a integer indexed unordered map that supports very fast +// (mathematical) vector operations when the sizes are very small, and reasonably +// fast operations when the sizes are large. +// important: indexes are integers +// important: iterators may return elements in any order + +#include <cstring> +#include <climits> #include <map> #include <cassert> +#include <vector> #include <boost/static_assert.hpp> -// these are architecture dependent, they should be +// this is architecture dependent, it should be // detected in some way but it's probably easiest (for me) -// to just set them +// to just set it #define L2_CACHE_LINE 128 -// this should just be a typedef to pair<int,float> on the new c++ +// this should just be a typedef to pair<int,T> on the new c++ template <typename T> -struct PairIntFloat { +struct PairIntT { int first; T second; - const PairIntFloat& operator=(const std::pair<const int, float>& v) { + const PairIntT& operator=(const std::pair<const int, T>& v) { first = v.first; second = v.second; return *this; } - operator const std::pair<const int, float>&() const { - return *reinterpret_cast<const std::pair<const int, float>*>(this); + operator const std::pair<const int, T>&() const { + return *reinterpret_cast<const std::pair<const int, T>*>(this); } }; -BOOST_STATIC_ASSERT(sizeof(PairIntFloat<float>) == sizeof(std::pair<int,float>)); +BOOST_STATIC_ASSERT(sizeof(PairIntT<float>) == sizeof(std::pair<int,float>)); template <typename T, int LOCAL_MAX = (sizeof(T) == sizeof(float) ? 15 : 7)> class FastSparseVector { public: + struct const_iterator { + const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(v.is_local_) { + if (local_) { + local_it_ = &v.data_.local[is_end ? v.local_size_ : 0]; + } else { + if (is_end) + remote_it_ = v.data_.rbmap->end(); + else + remote_it_ = v.data_.rbmap->begin(); + } + } + const bool local_; + const PairIntT<T>* local_it_; + typename std::map<int, T>::const_iterator remote_it_; + const std::pair<const int, T>& operator*() const { + if (local_) + return *reinterpret_cast<const std::pair<const int, float>*>(local_it_); + else + return *remote_it_; + } + + const std::pair<const int, T>* operator->() const { + if (local_) + return reinterpret_cast<const std::pair<const int, T>*>(local_it_); + else + return &*remote_it_; + } + + const_iterator& operator++() { + if (local_) ++local_it_; else ++remote_it_; + return *this; + } + + inline bool operator==(const const_iterator& o) const { + if (o.local_ != local_) return false; + if (local_) { + return local_it_ == o.local_it_; + } else { + return remote_it_ == o.remote_it_; + } + } + inline bool operator!=(const const_iterator& o) const { + return !(o == *this); + } + }; + public: FastSparseVector() : local_size_(0), is_local_(true) {} ~FastSparseVector() { if (!is_local_) delete data_.rbmap; } FastSparseVector(const FastSparseVector& other) { - memcpy(this, &other, sizeof(FastSparseVector)); + std::memcpy(this, &other, sizeof(FastSparseVector)); if (is_local_) return; data_.rbmap = new std::map<int, T>(*data_.rbmap); } const FastSparseVector& operator=(const FastSparseVector& other) { if (!is_local_) delete data_.rbmap; - memcpy(this, &other, sizeof(FastSparseVector)); + std::memcpy(this, &other, sizeof(FastSparseVector)); if (is_local_) return *this; data_.rbmap = new std::map<int, T>(*data_.rbmap); return *this; @@ -52,11 +107,11 @@ class FastSparseVector { inline T value(int k) const { if (is_local_) { for (int i = 0; i < local_size_; ++i) { - const PairIntFloat<T>& p = data_.local[i]; + const PairIntT<T>& p = data_.local[i]; if (p.first == k) return p.second; } } else { - std::map<int, float>::const_iterator it = data_.rbmap->find(k); + typename std::map<int, T>::const_iterator it = data_.rbmap->find(k); if (it != data_.rbmap->end()) return it->second; } return T(); @@ -73,13 +128,67 @@ class FastSparseVector { return size() == 0; } inline FastSparseVector& operator+=(const FastSparseVector& other) { - if (!is_local_) { - } else if (!other.is_local_) { - } else { // both local + if (empty()) { *this = other; return *this; } + const typename FastSparseVector::const_iterator end = other.end(); + for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) { + get_or_create_bin(it->first) += it->second; + } + return *this; + } + inline FastSparseVector& operator-=(const FastSparseVector& other) { + const typename FastSparseVector::const_iterator end = other.end(); + for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) { + get_or_create_bin(it->first) -= it->second; } return *this; } + inline FastSparseVector& operator*=(const T& scalar) { + if (is_local_) { + for (int i = 0; i < local_size_; ++i) + data_.local[i].second *= scalar; + } else { + const typename std::map<int, T>::iterator end = data_.rbmap->end(); + for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it) + it->second *= scalar; + } + return *this; + } + inline FastSparseVector& operator/=(const T& scalar) { + if (is_local_) { + for (int i = 0; i < local_size_; ++i) + data_.local[i].second /= scalar; + } else { + const typename std::map<int, T>::iterator end = data_.rbmap->end(); + for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it) + it->second /= scalar; + } + return *this; + } + const_iterator begin() const { + return const_iterator(*this, false); + } + const_iterator end() const { + return const_iterator(*this, true); + } + void init_vector(std::vector<T> *vp) const { + init_vector(*vp); + } + void init_vector(std::vector<T> &v) const { + v.clear(); + for (const_iterator i=begin(),e=end();i!=e;++i) + extend_vector(v,i->first)=i->second; + } + T dot(const std::vector<T>& v) const { + T res = T(); + for (const_iterator it = begin(), e = end(); it != e; ++it) + if (it->first < v.size()) res += it->second * v[it->first]; + } private: + inline T& extend_vector(std::vector<T> &v,int i) { + if (i>=v.size()) + v.resize(i+1); + return v[i]; + } inline T& get_or_create_bin(int k) { if (is_local_) { for (int i = 0; i < local_size_; ++i) @@ -90,7 +199,7 @@ class FastSparseVector { assert(is_local_); // currently local! if (local_size_ < LOCAL_MAX) { - PairIntFloat<T>& p = data_.local[local_size_]; + PairIntT<T>& p = data_.local[local_size_]; ++local_size_; p.first = k; return p.second; @@ -119,17 +228,65 @@ class FastSparseVector { } union { - PairIntFloat<T> local[LOCAL_MAX]; + PairIntT<T> local[LOCAL_MAX]; std::map<int, T>* rbmap; } data_; unsigned char local_size_; bool is_local_; }; +template <typename T> +const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSparseVector<T>& y) { + if (x.size() > y.size()) { + FastSparseVector<T> res(x); + res += y; + return res; + } else { + FastSparseVector<T> res(y); + res += x; + return res; + } +} + +template <typename T> +const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSparseVector<T>& y) { + if (x.size() > y.size()) { + FastSparseVector<T> res(x); + res -= y; + return res; + } else { + FastSparseVector<T> res(y); + res -= x; + return res; + } +} + namespace performance_checks { // if you get a failure on the next line, you should adjust LOCAL_MAX for // your architecture BOOST_STATIC_ASSERT(sizeof(FastSparseVector<float>) == L2_CACHE_LINE); }; +#include "fdict.h" + +template <class O, typename T> +inline void print(O &o,const FastSparseVector<T>& v, const char* kvsep="=",const char* pairsep=" ",const char* pre="",const char* post="") { + o << pre; + bool first=true; + for (typename FastSparseVector<T>::const_iterator i=v.begin(),e=v.end();i!=e;++i) { + if (first) + first=false; + else + o<<pairsep; + o<<FD::Convert(i->first)<<kvsep<<i->second; + } + o << post; +} + +template <typename T> +inline std::ostream& operator<<(std::ostream& out, const FastSparseVector<T>& v) { + print(out, v); + return out; +} + #endif diff --git a/utils/feature_vector.h b/utils/feature_vector.h index be378a6a..733aa99e 100755 --- a/utils/feature_vector.h +++ b/utils/feature_vector.h @@ -6,7 +6,6 @@ #include "fdict.h" typedef double Featval; -typedef SparseVectorList<Featval> FeatureVectorList; typedef SparseVector<Featval> FeatureVector; typedef SparseVector<Featval> WeightVector; typedef std::vector<Featval> DenseWeightVector; diff --git a/utils/sparse_vector.cc b/utils/sparse_vector.cc index 6e42a216..24da5f39 100644 --- a/utils/sparse_vector.cc +++ b/utils/sparse_vector.cc @@ -3,6 +3,7 @@ #include <iostream> #include <cstring> +#include "fdict.h" #include "b64tools.h" using namespace std; @@ -10,7 +11,7 @@ using namespace std; namespace B64 { void Encode(double objective, const SparseVector<double>& v, ostream* out) { - const int num_feats = v.num_active(); + const int num_feats = v.size(); size_t tot_size = 0; const size_t off_objective = tot_size; tot_size += sizeof(double); // objective diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index 1bcb9502..de8c0291 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -1,6 +1,8 @@ #ifndef _SPARSE_VECTOR_H_ #define _SPARSE_VECTOR_H_ +#undef USE_FAST_SPARSE_VECTOR +#ifndef USE_FAST_SPARSE_VECTOR /* TODO: specialize for int value types, where it probably makes sense to check if adding/subtracting brings a value to 0, and remove it from the map (e.g. in a gibbs sampler). or add a separate policy argument for that. */ @@ -667,6 +669,13 @@ std::ostream &operator<<(std::ostream &out, const SparseVector<T> &vec) return vec.operator<<(out); } +#else + +#include "fast_sparse_vector.h" +#define SparseVector FastSparseVector + +#endif + namespace B64 { void Encode(double objective, const SparseVector<double>& v, std::ostream* out); // returns false if failed to decode diff --git a/utils/ts.cc b/utils/ts.cc index 1febed3c..28b5f9b1 100644 --- a/utils/ts.cc +++ b/utils/ts.cc @@ -10,25 +10,46 @@ using namespace std; +template <typename T> +void Print(const T& x) { + typename T::const_iterator it = x.begin(); + for (; it != x.end(); ++it) { + cerr << it->first << ":" << it->second << " "; + } + cerr << endl; +} + +template <typename T> +void test_unique() { + T x; + x.set_value(100, 1.0); + x.set_value(100, 2.0); + Print(x); + assert(x.size() == 1); + assert(x.value(100) == 2.0); +} + int main() { cerr << sizeof(prob_t) << " " << sizeof(LogVal<float>) << endl; cerr << " sizeof(FSV<float>) = " << sizeof(FastSparseVector<float>) << endl; cerr << "sizeof(FSV<double>) = " << sizeof(FastSparseVector<double>) << endl; - sranddev(); + test_unique<FastSparseVector<float> >(); +// sranddev(); int c = 0; + FastSparseVector<float> p; for (int i = 0; i < 1000000; ++i) { FastSparseVector<float> x; - //SparseVector<float> x; - for (int j = 0; j < 15; ++j) { - const int k = rand() % 1000; - const float v = rand() / 3.14f; + for (int j = 0; j < 10; ++j) { + const int k = rand() % 200; + const float v = 1000.0f / rand(); x.set_value(k,v); } - //SparseVector<float> y = x; - FastSparseVector<float> y = x; - y += x; - y = x; - if (y.value(50)) { c++; } + if (x.value(50)) { c++; } + p = x; + p += p; + FastSparseVector<float> y = p + p; + y *= 2; + y -= y; } cerr << "Counted " << c << " times\n"; return 0; |