diff options
Diffstat (limited to 'utils')
| -rw-r--r-- | utils/fast_sparse_vector.h | 187 | ||||
| -rw-r--r-- | utils/sparse_vector.h | 48 | ||||
| -rw-r--r-- | utils/ts.cc | 18 | 
3 files changed, 197 insertions, 56 deletions
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index c4a44b99..b8b3df1f 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -7,6 +7,7 @@  // important: indexes are integers  // important: iterators may return elements in any order +#include <cmath>  #include <cstring>  #include <climits>  #include <map> @@ -21,18 +22,35 @@  #define L2_CACHE_LINE 128  // this should just be a typedef to pair<int,T> on the new c++ +// I have to avoid this since I want to use unions and c++-98 +// does not let unions have types with constructors in them +// this type bypasses default constructors. use with caution! +// this should work as long as T is in a acceptable state to +// have its destructor called when initialized with all zeros  template <typename T>  struct PairIntT { -  int first; -  T second;    const PairIntT& operator=(const std::pair<const int, T>& v) { -    first = v.first; -    second = v.second; +    std::memcpy(this, &v, sizeof(PairIntT));      return *this;    }    operator const std::pair<const int, T>&() const {      return *reinterpret_cast<const std::pair<const int, T>*>(this);    } +  int& first() { +    return reinterpret_cast<std::pair<int, T>*>(this)->first; +  } +  T& second() { +    return reinterpret_cast<std::pair<int, T>*>(this)->second; +  } +  const int& first() const { +    return reinterpret_cast<const std::pair<int, T>*>(this)->first; +  } +  const T& second() const { +    return reinterpret_cast<const std::pair<int, T>*>(this)->second; +  } + private: +  // very bad way of bypassing the default constructor on T +  char data_[sizeof(std::pair<int, T>)];  };  BOOST_STATIC_ASSERT(sizeof(PairIntT<float>) == sizeof(std::pair<int,float>)); @@ -40,7 +58,7 @@ template <typename T, int LOCAL_MAX = (sizeof(T) == sizeof(float) ? 15 : 7)>  class FastSparseVector {   public:    struct const_iterator { -    const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(v.is_local_) { +    const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(!v.is_remote_) {        if (local_) {          local_it_ = &v.data_.local[is_end ? v.local_size_ : 0];        } else { @@ -85,43 +103,89 @@ class FastSparseVector {      }    };   public: -  FastSparseVector() : local_size_(0), is_local_(true) {} +  FastSparseVector() : local_size_(0), is_remote_(false) { std::memset(&data_, 0, sizeof(data_)); } +  explicit FastSparseVector(const std::vector<T>& init) : local_size_(0), is_remote_(false) { +    for (int i = 0; i < init.size(); ++i) set_value(i, init[i]); +  }    ~FastSparseVector() { -    if (!is_local_) delete data_.rbmap; +    clear();    }    FastSparseVector(const FastSparseVector& other) {      std::memcpy(this, &other, sizeof(FastSparseVector)); -    if (is_local_) return; -    data_.rbmap = new std::map<int, T>(*data_.rbmap); +    if (is_remote_) data_.rbmap = new std::map<int, T>(*data_.rbmap); +  } +  void erase(int k) { +    if (is_remote_) { +      data_.rbmap->erase(k); +    } else { +      for (int i = 0; i < local_size_; ++i) { +        if (data_.local[i].first() == k) { +          for (int j = i+1; j < local_size_; ++j) { +            data_.local[j-1].first() = data_.local[j].first(); +            data_.local[j-1].second() = data_.local[j].second(); +          } +        } +      } +    }    }    const FastSparseVector& operator=(const FastSparseVector& other) { -    if (!is_local_) delete data_.rbmap; +    if (is_remote_) delete data_.rbmap;      std::memcpy(this, &other, sizeof(FastSparseVector)); -    if (is_local_) return *this; -    data_.rbmap = new std::map<int, T>(*data_.rbmap); +    if (is_remote_) +      data_.rbmap = new std::map<int, T>(*data_.rbmap);      return *this;    } +  T const& get_singleton() const { +    assert(size()==1); +    return begin()->second; +  } +  bool nonzero(int k) const { +    return static_cast<bool>(value(k)); +  }    inline void set_value(int k, const T& v) {      get_or_create_bin(k) = v;    } +  inline T& add_value(int k, const T& v) { +    return get_or_create_bin(k) += v; +  } +  inline T get(int k) const { +    return value(k); +  }    inline T value(int k) const { -    if (is_local_) { +    if (is_remote_) { +      typename std::map<int, T>::const_iterator it = data_.rbmap->find(k); +      if (it != data_.rbmap->end()) return it->second; +    } else {        for (int i = 0; i < local_size_; ++i) {          const PairIntT<T>& p = data_.local[i]; -        if (p.first == k) return p.second; +        if (p.first() == k) return p.second();        } -    } else { -      typename std::map<int, T>::const_iterator it = data_.rbmap->find(k); -      if (it != data_.rbmap->end()) return it->second;      }      return T();    } +  T l2norm_sq() const { +    T sum = T(); +    for (const_iterator it = begin(), e = end(); it != e; ++it) +      sum += it->second * it->second; +    return sum; +  } +  T l2norm() const { +    return sqrt(l2norm_sq()); +  } +  // if values are binary, gives |A intersect B|/|A union B| +  template<typename S> +  S tanimoto_coef(const FastSparseVector<S> &vec) const { +    const S dp=dot(vec); +    return dp/(l2norm_sq()+vec.l2norm_sq()-dp); +  }    inline size_t size() const { -    if (is_local_) return local_size_; -    return data_.rbmap->size(); +    if (is_remote_) +      return data_.rbmap->size(); +    else +      return local_size_;    }    inline void clear() { -    if (!is_local_) delete data_.rbmap; +    if (is_remote_) delete data_.rbmap;      local_size_ = 0;    }    inline bool empty() const { @@ -135,6 +199,14 @@ class FastSparseVector {      }      return *this;    } +  template <typename O> +  inline FastSparseVector& operator+=(const FastSparseVector<O>& other) { +    const typename FastSparseVector<O>::const_iterator end = other.end(); +    for (typename FastSparseVector<O>::const_iterator it = other.begin(); it != end; ++it) { +      get_or_create_bin(it->first) += it->second; +    } +    return *this; +  }    inline FastSparseVector& operator-=(const FastSparseVector& other) {      const typename FastSparseVector::const_iterator end = other.end();      for (typename FastSparseVector::const_iterator it = other.begin(); it != end; ++it) { @@ -143,24 +215,24 @@ class FastSparseVector {      return *this;    }    inline FastSparseVector& operator*=(const T& scalar) { -    if (is_local_) { -      for (int i = 0; i < local_size_; ++i) -        data_.local[i].second *= scalar; -    } else { +    if (is_remote_) {        const typename std::map<int, T>::iterator end = data_.rbmap->end();        for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it)          it->second *= scalar; +    } else { +      for (int i = 0; i < local_size_; ++i) +        data_.local[i].second() *= scalar;      }      return *this;    }    inline FastSparseVector& operator/=(const T& scalar) { -    if (is_local_) { -      for (int i = 0; i < local_size_; ++i) -        data_.local[i].second /= scalar; -    } else { +    if (is_remote_) {        const typename std::map<int, T>::iterator end = data_.rbmap->end();        for (typename std::map<int, T>::iterator it = data_.rbmap->begin(); it != end; ++it)          it->second /= scalar; +    } else { +      for (int i = 0; i < local_size_; ++i) +        data_.local[i].second() /= scalar;      }      return *this;    } @@ -182,38 +254,57 @@ class FastSparseVector {      T res = T();      for (const_iterator it = begin(), e = end(); it != e; ++it)        if (it->first < v.size()) res += it->second * v[it->first]; +    return res; +  } +  T dot(const FastSparseVector<T>& other) const { +    T res = T(); +    for (const_iterator it = begin(), e = end(); it != e; ++it) +      res += other.value(it->first) * it->second; +    return res; +  } +  bool operator==(const FastSparseVector<T>& other) const { +    if (other.size() != size()) return false; +    for (const_iterator it = begin(), e = end(); it != e; ++it) { +      if (other.value(it->first) != it->second) return false; +    } +    return true; +  } +  void swap(FastSparseVector<T>& other) { +    char t[sizeof(data_)]; +    std::swap(other.is_remote_, is_remote_); +    std::swap(other.local_size_, local_size_); +    std::memcpy(t, &other.data_, sizeof(data_)); +    std::memcpy(&other.data_, &data_, sizeof(data_)); +    std::memcpy(&data_, t, sizeof(data_));    }   private: -  inline T& extend_vector(std::vector<T> &v,int i) { +  static inline T& extend_vector(std::vector<T> &v,int i) {      if (i>=v.size())        v.resize(i+1);      return v[i];    }    inline T& get_or_create_bin(int k) { -    if (is_local_) { -      for (int i = 0; i < local_size_; ++i) -        if (data_.local[i].first == k) return data_.local[i].second; -    } else { +    if (is_remote_) {        return (*data_.rbmap)[k]; +    } else { +      for (int i = 0; i < local_size_; ++i) +        if (data_.local[i].first() == k) return data_.local[i].second();      } -    assert(is_local_); +    assert(!is_remote_);      // currently local!      if (local_size_ < LOCAL_MAX) {        PairIntT<T>& p = data_.local[local_size_];        ++local_size_; -      p.first = k; -      return p.second; +      p.first() = k; +      p.second() = T(); +      return p.second();      } else {        swap_local_rbmap();        return (*data_.rbmap)[k];        }    }    void swap_local_rbmap() { -    if (is_local_) { // data is local, move to rbmap -      std::map<int, T>* m = new std::map<int, T>(&data_.local[0], &data_.local[local_size_]); -      data_.rbmap = m; -      is_local_ = false; -    } else { // data is in rbmap, move to local +    if (is_remote_) { // data is in rbmap, move to local        assert(data_.rbmap->size() < LOCAL_MAX);        const std::map<int, T>* m = data_.rbmap;        local_size_ = m->size(); @@ -223,7 +314,11 @@ class FastSparseVector {          data_.local[i] = *it;          ++i;        } -      is_local_ = true; +      is_remote_ = false; +    } else { // data is local, move to rbmap +      std::map<int, T>* m = new std::map<int, T>(&data_.local[0], &data_.local[local_size_]); +      data_.rbmap = m; +      is_remote_ = true;      }    } @@ -232,7 +327,7 @@ class FastSparseVector {      std::map<int, T>* rbmap;    } data_;    unsigned char local_size_; -  bool is_local_; +  bool is_remote_;  };  template <typename T> @@ -261,6 +356,12 @@ const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSpar    }  } +template <class T> +std::size_t hash_value(FastSparseVector<T> const& x) { +  assert(!"not implemented"); +  return 0; +} +  namespace performance_checks {    // if you get a failure on the next line, you should adjust LOCAL_MAX for    // your architecture diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index de8c0291..dbeacbe8 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -646,6 +646,19 @@ SparseVector<T> operator+(const SparseVector<T>& a, const SparseVector<T>& b) {  }  template <class T> +SparseVector<T> operator*(const double& a, const SparseVector<T>& b) { +  SparseVector<T> result = b; +  return result *= a; +} + +#else + +#include "fast_sparse_vector.h" +#define SparseVector FastSparseVector + +#endif + +template <class T>  SparseVector<T> operator*(const SparseVector<T>& a, const double& b) {    SparseVector<T> result = a;    return result *= b; @@ -658,23 +671,36 @@ SparseVector<T> operator*(const SparseVector<T>& a, const T& b) {  }  template <class T> -SparseVector<T> operator*(const double& a, const SparseVector<T>& b) { -  SparseVector<T> result = b; -  return result *= a; +SparseVector<T> operator/(const SparseVector<T>& a, const double& b) { +  SparseVector<T> result = a; +  return result *= b;  }  template <class T> -std::ostream &operator<<(std::ostream &out, const SparseVector<T> &vec) -{ -    return vec.operator<<(out); +SparseVector<T> operator/(const SparseVector<T>& a, const T& b) { +  SparseVector<T> result = a; +  return result *= b;  } -#else - -#include "fast_sparse_vector.h" -#define SparseVector FastSparseVector +template <class O, typename T> +inline void print(O &o,const SparseVector<T>& v, const char* kvsep="=",const char* pairsep=" ",const char* pre="",const char* post="") { +  o << pre; +  bool first=true; +  for (typename SparseVector<T>::const_iterator i=v.begin(),e=v.end();i!=e;++i) { +    if (first) +      first=false; +    else +      o<<pairsep; +    o<<FD::Convert(i->first)<<kvsep<<i->second; +  } +  o << post; +} -#endif +template <typename T> +inline std::ostream& operator<<(std::ostream& out, const SparseVector<T>& v) { +  print(out, v); +  return out; +}  namespace B64 {    void Encode(double objective, const SparseVector<double>& v, std::ostream* out); diff --git a/utils/ts.cc b/utils/ts.cc index 28b5f9b1..563794c5 100644 --- a/utils/ts.cc +++ b/utils/ts.cc @@ -11,7 +11,7 @@  using namespace std;  template <typename T> -void Print(const T& x) { +void MPrint(const T& x) {    typename T::const_iterator it = x.begin();    for (; it != x.end(); ++it) {      cerr << it->first << ":" << it->second << " "; @@ -24,16 +24,30 @@ void test_unique() {    T x;    x.set_value(100, 1.0);    x.set_value(100, 2.0); -  Print(x); +  MPrint(x);    assert(x.size() == 1);    assert(x.value(100) == 2.0);  } +void test_logv() { +  FastSparseVector<prob_t> x; +  cerr << "FSV<prob_t> = " << sizeof(FastSparseVector<prob_t>) << endl; +  x.set_value(999, prob_t(0.5)); +  x.set_value(0, prob_t()); +  x.set_value(1, prob_t(1)); +  MPrint(x); +  x += x; +  MPrint(x); +  x -= x; +  MPrint(x); +} +  int main() {    cerr << sizeof(prob_t) << " " << sizeof(LogVal<float>) << endl;    cerr << " sizeof(FSV<float>) = " << sizeof(FastSparseVector<float>) << endl;    cerr << "sizeof(FSV<double>) = " << sizeof(FastSparseVector<double>) << endl;    test_unique<FastSparseVector<float> >(); +  test_logv();  //  sranddev();    int c = 0;    FastSparseVector<float> p;  | 
