adding functionality to fast_sparse_vector, getting ready for transition to it

author: Chris Dyer <cdyer@cs.cmu.edu> 2011-04-21 18:06:31 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-04-21 18:06:31 -0400
commit: b6634dff2cd515d9e6f95416512db893a08bde29 (patch)
tree: 02730df750245ef3d101deb469d0e10109b7d6f5 /utils
parent: 282bc08647eb6856f798da0c64dd67530e02842a (diff)
5 files changed, 216 insertions, 29 deletions
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 91bf441d..c4a44b99 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -1,47 +1,102 @@
 #ifndef _FAST_SPARSE_VECTOR_H_
 #define _FAST_SPARSE_VECTOR_H_
 
+// FastSparseVector<T> is a integer indexed unordered map that supports very fast
+// (mathematical) vector operations when the sizes are very small, and reasonably
+// fast operations when the sizes are large.
+// important: indexes are integers
+// important: iterators may return elements in any order
+
+#include <cstring>
+#include <climits>
 #include <map>
 #include <cassert>
+#include <vector>
 
 #include <boost/static_assert.hpp>
 
-// these are architecture dependent, they should be
+// this is architecture dependent, it should be
 // detected in some way but it's probably easiest (for me)
-// to just set them
+// to just set it
 #define L2_CACHE_LINE 128
 
-// this should just be a typedef to pair<int,float> on the new c++
+// this should just be a typedef to pair<int,T> on the new c++
 template <typename T>
-struct PairIntFloat {
+struct PairIntT {
   int first;
   T second;
-  const PairIntFloat& operator=(const std::pair<const int, float>& v) {
+  const PairIntT& operator=(const std::pair<const int, T>& v) {
     first = v.first;
     second = v.second;
     return *this;
   }
-  operator const std::pair<const int, float>&() const {
-    return *reinterpret_cast<const std::pair<const int, float>*>(this);
+  operator const std::pair<const int, T>&() const {
+    return *reinterpret_cast<const std::pair<const int, T>*>(this);
   }
 };
-BOOST_STATIC_ASSERT(sizeof(PairIntFloat<float>) == sizeof(std::pair<int,float>));
+BOOST_STATIC_ASSERT(sizeof(PairIntT<float>) == sizeof(std::pair<int,float>));
 
 template <typename T, int LOCAL_MAX = (sizeof(T) == sizeof(float) ? 15 : 7)>
 class FastSparseVector {
  public:
+  struct const_iterator {
+    const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(v.is_local_) {
+      if (local_) {
+        local_it_ = &v.data_.local[is_end ? v.local_size_ : 0];
+      } else {
+        if (is_end)
+          remote_it_ = v.data_.rbmap->end();
+        else
+          remote_it_ = v.data_.rbmap->begin();
+      }
+    }
+    const bool local_;
+    const PairIntT<T>* local_it_;
+    typename std::map<int, T>::const_iterator remote_it_;
+    const std::pair<const int, T>& operator*() const {
+      if (local_)
+        return *reinterpret_cast<const std::pair<const int, float>*>(local_it_);
+      else
+        return *remote_it_;
+    }
+
+    const std::pair<const int, T>* operator->() const {
+      if (local_)
+        return reinterpret_cast<const std::pair<const int, T>*>(local_it_);
+      else
+        return &*remote_it_;
+    }
+
+    const_iterator& operator++() {
+      if (local_) ++local_it_; else ++remote_it_;
+      return *this;
+    }
+
+    inline bool operator==(const const_iterator& o) const {
+      if (o.local_ != local_) return false;
+      if (local_) {
+        return local_it_ == o.local_it_;
+      } else {
+        return remote_it_ == o.remote_it_;
+      }
+    }
+    inline bool operator!=(const const_iterator& o) const {
+      return !(o == *this);
+    }
+  };
+ public:
   FastSparseVector() : local_size_(0), is_local_(true) {}
   ~FastSparseVector() {
     if (!is_local_) delete data_.rbmap;
   }
   FastSparseVector(const FastSparseVector& other) {
-    memcpy(this, &other, sizeof(FastSparseVector));
+    std::memcpy(this, &other, sizeof(FastSparseVector));
     if (is_local_) return;
     data_.rbmap = new std::map<int, T>(*data_.rbmap);
   }
   const FastSparseVector& operator=(const FastSparseVector& other) {
     if (!is_local_) delete data_.rbmap;
-    memcpy(this, &other, sizeof(FastSparseVector));
+    std::memcpy(this, &other, sizeof(FastSparseVector));
     if (is_local_) return *this;
     data_.rbmap = new std::map<int, T>(*data_.rbmap);
     return *this;
@@ -52,11 +107,11 @@ class FastSparseVector {
   inline T value(int k) const {
     if (is_local_) {
       for (int i = 0; i < local_size_; ++i) {
-        const PairIntFloat<T>& p = data_.local[i];
+        const PairIntT<T>& p = data_.local[i];
         if (p.first == k) return p.second;
       }
     } else {
-      std::map<int, float>::const_iterator it = data_.rbmap->find(k);
+      typename std::map<int, T>::const_iterator it = data_.rbmap->find(k);
       if (it != data_.rbmap->end()) return it->second;
     }
     return T();
@@ -73,13 +128,67 @@ class FastSparseVector {
     return size() == 0;
   }
   inline FastSparseVector& operator+=(const FastSparseVector& other) {
-    if (!is_local_) {
-    } else if (!other.is_local_) {
-    } else { // both local
+    if (empty()) { *this = other; return *this; }
+    const typename FastSparseVector::const_iterator end = other.end();
+    for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) {
+      get_or_create_bin(it->first) += it->second;
+    }
+    return *this;
+  }
+  inline FastSparseVector& operator-=(const FastSparseVector& other) {
+    const typename FastSparseVector::const_iterator end = other.end();
+    for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) {
+      get_or_create_bin(it->first) -= it->second;
     }
     return *this;
   }
+  inline FastSparseVector& operator*=(const T& scalar) {
+    if (is_local_) {
+      for (int i = 0; i < local_size_; ++i)
+        data_.local[i].second *= scalar;
+    } else {
+      const typename std::map<int, T>::iterator end = data_.rbmap->end();
+      for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it)
+        it->second *= scalar;
+    }
+    return *this;
+  }
+  inline FastSparseVector& operator/=(const T& scalar) {
+    if (is_local_) {
+      for (int i = 0; i < local_size_; ++i)
+        data_.local[i].second /= scalar;
+    } else {
+      const typename std::map<int, T>::iterator end = data_.rbmap->end();
+      for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it)
+        it->second /= scalar;
+    }
+    return *this;
+  }
+  const_iterator begin() const {
+    return const_iterator(*this, false);
+  }
+  const_iterator end() const {
+    return const_iterator(*this, true);
+  }
+  void init_vector(std::vector<T> *vp) const {
+    init_vector(*vp);
+  }
+  void init_vector(std::vector<T> &v) const {
+    v.clear();
+    for (const_iterator i=begin(),e=end();i!=e;++i)
+      extend_vector(v,i->first)=i->second;
+  }
+  T dot(const std::vector<T>& v) const {
+    T res = T();
+    for (const_iterator it = begin(), e = end(); it != e; ++it)
+      if (it->first < v.size()) res += it->second * v[it->first];
+  }
  private:
+  inline T& extend_vector(std::vector<T> &v,int i) {
+    if (i>=v.size())
+      v.resize(i+1);
+    return v[i];
+  }
   inline T& get_or_create_bin(int k) {
     if (is_local_) {
       for (int i = 0; i < local_size_; ++i)
@@ -90,7 +199,7 @@ class FastSparseVector {
     assert(is_local_);
     // currently local!
     if (local_size_ < LOCAL_MAX) {
-      PairIntFloat<T>& p = data_.local[local_size_];
+      PairIntT<T>& p = data_.local[local_size_];
       ++local_size_;
       p.first = k;
       return p.second;
@@ -119,17 +228,65 @@ class FastSparseVector {
   }
 
   union {
-    PairIntFloat<T> local[LOCAL_MAX];
+    PairIntT<T> local[LOCAL_MAX];
     std::map<int, T>* rbmap;
   } data_;
   unsigned char local_size_;
   bool is_local_;
 };
 
+template <typename T>
+const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
+  if (x.size() > y.size()) {
+    FastSparseVector<T> res(x);
+    res += y;
+    return res;
+  } else {
+    FastSparseVector<T> res(y);
+    res += x;
+    return res;
+  }
+}
+
+template <typename T>
+const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
+  if (x.size() > y.size()) {
+    FastSparseVector<T> res(x);
+    res -= y;
+    return res;
+  } else {
+    FastSparseVector<T> res(y);
+    res -= x;
+    return res;
+  }
+}
+
 namespace performance_checks {
   // if you get a failure on the next line, you should adjust LOCAL_MAX for
   // your architecture
   BOOST_STATIC_ASSERT(sizeof(FastSparseVector<float>) == L2_CACHE_LINE);
 };
 
+#include "fdict.h"
+
+template <class O, typename T>
+inline void print(O &o,const FastSparseVector<T>& v, const char* kvsep="=",const char* pairsep=" ",const char* pre="",const char* post="") {
+  o << pre;
+  bool first=true;
+  for (typename FastSparseVector<T>::const_iterator i=v.begin(),e=v.end();i!=e;++i) {
+    if (first)
+      first=false;
+    else
+      o<<pairsep;
+    o<<FD::Convert(i->first)<<kvsep<<i->second;
+  }
+  o << post;
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream& out, const FastSparseVector<T>& v) {
+  print(out, v);
+  return out;
+}
+
 #endif
diff --git a/utils/feature_vector.h b/utils/feature_vector.h
index be378a6a..733aa99e 100755
--- a/utils/feature_vector.h
+++ b/utils/feature_vector.h
@@ -6,7 +6,6 @@
 #include "fdict.h"
 
 typedef double Featval;
-typedef SparseVectorList<Featval> FeatureVectorList;
 typedef SparseVector<Featval> FeatureVector;
 typedef SparseVector<Featval> WeightVector;
 typedef std::vector<Featval> DenseWeightVector;
diff --git a/utils/sparse_vector.cc b/utils/sparse_vector.cc
index 6e42a216..24da5f39 100644
--- a/utils/sparse_vector.cc
+++ b/utils/sparse_vector.cc
@@ -3,6 +3,7 @@
 #include <iostream>
 #include <cstring>
 
+#include "fdict.h"
 #include "b64tools.h"
 
 using namespace std;
@@ -10,7 +11,7 @@ using namespace std;
 namespace B64 {
 
 void Encode(double objective, const SparseVector<double>& v, ostream* out) {
-  const int num_feats = v.num_active();
+  const int num_feats = v.size();
   size_t tot_size = 0;
   const size_t off_objective = tot_size;
   tot_size += sizeof(double);                   // objective
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index 1bcb9502..de8c0291 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -1,6 +1,8 @@
 #ifndef _SPARSE_VECTOR_H_
 #define _SPARSE_VECTOR_H_
 
+#undef USE_FAST_SPARSE_VECTOR
+#ifndef USE_FAST_SPARSE_VECTOR
 /*
 TODO: specialize for int value types, where it probably makes sense to check if adding/subtracting brings a value to 0, and remove it from the map (e.g. in a gibbs sampler).  or add a separate policy argument for that.
  */
@@ -667,6 +669,13 @@ std::ostream &operator<<(std::ostream &out, const SparseVector<T> &vec)
     return vec.operator<<(out);
 }
 
+#else
+
+#include "fast_sparse_vector.h"
+#define SparseVector FastSparseVector
+
+#endif
+
 namespace B64 {
   void Encode(double objective, const SparseVector<double>& v, std::ostream* out);
   // returns false if failed to decode
diff --git a/utils/ts.cc b/utils/ts.cc
index 1febed3c..28b5f9b1 100644
--- a/utils/ts.cc
+++ b/utils/ts.cc
@@ -10,25 +10,46 @@
 
 using namespace std;
 
+template <typename T>
+void Print(const T& x) {
+  typename T::const_iterator it = x.begin();
+  for (; it != x.end(); ++it) {
+    cerr << it->first << ":" << it->second << " ";
+  }
+  cerr << endl;
+}
+
+template <typename T>
+void test_unique() {
+  T x;
+  x.set_value(100, 1.0);
+  x.set_value(100, 2.0);
+  Print(x);
+  assert(x.size() == 1);
+  assert(x.value(100) == 2.0);
+}
+
 int main() {
   cerr << sizeof(prob_t) << " " << sizeof(LogVal<float>) << endl;
   cerr << " sizeof(FSV<float>) = " << sizeof(FastSparseVector<float>) << endl;
   cerr << "sizeof(FSV<double>) = " << sizeof(FastSparseVector<double>) << endl;
-  sranddev();
+  test_unique<FastSparseVector<float> >();
+//  sranddev();
   int c = 0;
+  FastSparseVector<float> p;
   for (int i = 0; i < 1000000; ++i) {
     FastSparseVector<float> x;
-    //SparseVector<float> x;
-    for (int j = 0; j < 15; ++j) {
-      const int k = rand() % 1000;
-      const float v = rand() / 3.14f;
+    for (int j = 0; j < 10; ++j) {
+      const int k = rand() % 200;
+      const float v = 1000.0f / rand();
       x.set_value(k,v);
     }
-    //SparseVector<float> y = x;
-    FastSparseVector<float> y = x;
-    y += x;
-    y = x;
-    if (y.value(50)) { c++; }
+    if (x.value(50)) { c++; }
+    p = x;
+    p += p;
+    FastSparseVector<float> y = p + p;
+    y *= 2;
+    y -= y;
   }
   cerr << "Counted " << c << " times\n";
   return 0;
author	Chris Dyer <cdyer@cs.cmu.edu>	2011-04-21 18:06:31 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2011-04-21 18:06:31 -0400
commit	b6634dff2cd515d9e6f95416512db893a08bde29 (patch)
tree	02730df750245ef3d101deb469d0e10109b7d6f5 /utils
parent	282bc08647eb6856f798da0c64dd67530e02842a (diff)