From 2738a4d902757d60108dccb2ddbb778251be63e3 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 28 Apr 2011 15:48:43 -0400 Subject: replace old sparse vector with much faster version, major speed improvements for cdec --- utils/sparse_vector.h | 620 +------------------------------------------------- utils/ts.cc | 13 ++ 2 files changed, 15 insertions(+), 618 deletions(-) (limited to 'utils') diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index 7161a5a9..274220ef 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -1,538 +1,7 @@ #ifndef _SPARSE_VECTOR_H_ #define _SPARSE_VECTOR_H_ -#undef USE_FAST_SPARSE_VECTOR -#ifndef USE_FAST_SPARSE_VECTOR -/* -TODO: specialize for int value types, where it probably makes sense to check if adding/subtracting brings a value to 0, and remove it from the map (e.g. in a gibbs sampler). or add a separate policy argument for that. - */ - -//#define SPARSE_VECTOR_HASH -// if defined, use hash_map rather than map. map is probably faster/smaller for small vectors - -/* - use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list) -*/ -/* hack: index 0 never gets printed because cdyer is creative and efficient. features which have no weight got feature dict id 0, see, and the models all clobered that value. nobody wants to see it. except that vlad is also creative and efficient and stored the oracle bleu there. */ -/* NOTE: zero vals may or may not be dropped from map (sparse, but not guaranteed to be so). - - I rely on !v the same as !((bool)v) the same as v==0 and v() same as v(0). - - one exception: - - a local: - T sum = 0; - is used instead of - T sum; - - because T may be a primitive type, and - - T sum(); - - is parsed as a function decl :( - - the alternative T sum=T() is also be reasonable. i've switched to that. -*/ - -// this is a modified version of code originally written -// by Phil Blunsom - -#include -#include -#ifdef SPARSE_VECTOR_HASH -#include "hash.h" -# define SPARSE_VECTOR_MAP HASH_MAP -# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted) -#else -# define SPARSE_VECTOR_MAP std::map -# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) -#endif - -#include -#include -#include -#include -#include - -#include "fdict.h" -#include "small_vector.h" -#include "string_to.h" - -#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP -#include -#endif - -template -inline T & extend_vector(std::vector &v,int i) { - if (i>=v.size()) - v.resize(i+1); - return v[i]; -} - -template -class SparseVector { - void init_reserved() { - SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2); - } -public: - T const& get_singleton() const { - assert(values_.size()==1); - return values_.begin()->second; - } - - typedef SparseVector Self; - typedef SPARSE_VECTOR_MAP MapType; - typedef typename MapType::const_iterator const_iterator; - SparseVector() { - init_reserved(); - } - typedef typename MapType::value_type value_type; - typedef typename MapType::iterator iterator; - explicit SparseVector(std::vector const& v) { - init_reserved(); - iterator p=values_.begin(); - const T z=0; - for (unsigned i=0;i - void print(O &o,Str kvsep="=",Str pairsep=" ",Str pre="",Str post="") const { - o << pre; - bool first=true; - for (const_iterator i=values_.begin(),e=values_.end();i!=e;++i) { - if (first) - first=false; - else - o<first)<second; - } - o << post; - } - - static void error(std::string const& msg) { - throw std::runtime_error("SparseVector: "+msg); - } - - enum DupPolicy { - NO_DUPS, - KEEP_FIRST, - KEEP_LAST, - SUM - }; - - // either key val alternating whitespace sep, or key=val (kvsep char is '='). end at eof or terminator (non-ws) char - template - void read(S &s,DupPolicy dp=NO_DUPS,bool use_kvsep=true,char kvsep='=',bool use_pairsep=true,char optional_pairsep=';',bool stop_at_terminator=false,char terminator=')') { - values_.clear(); - std::string id; - WordID k; - T v; -#undef SPARSE_MUST_READ -#define SPARSE_MUST_READ(x) if (!(x)) error(#x); - int ki; - while (s) { - if (stop_at_terminator||use_pairsep) { - char c; - if (!(s>>c)) goto eof; - if (stop_at_terminator && c==terminator) return; - if (!use_pairsep || c!=optional_pairsep) - s.unget(); - } - if (!(s>>id)) goto eof; - if (use_kvsep && (ki=id.find(kvsep))!=std::string::npos) { - k=FD::Convert(std::string(id,0,ki)); - string_into(id.c_str()+ki+1,v); - } else { - k=FD::Convert(id); - if (!(s>>v)) error("reading value failed"); - } - std::pair vi=values_.insert(value_type(k,v)); - if (!vi.second) { - T &oldv=vi.first->second; - switch(dp) { - case NO_DUPS: error("read duplicate key with NO_DUPS. key=" - +FD::Convert(k)+" val="+to_string(v)+" old-val="+to_string(oldv)); - break; - case KEEP_FIRST: break; - case KEEP_LAST: oldv=v; break; - case SUM: oldv+=v; break; - } - } - } - goto good; - eof: - if (!s.eof()) error("reading key failed (before EOF)"); - good: - s.clear(); // we may have reached eof, but that's no error. - } - - friend inline std::ostream & operator<<(std::ostream &o,Self const& s) { - s.print(o); - return o; - } - - friend inline std::istream & operator>>(std::istream &o,Self & s) { - s.read(o); - return o; - } - - void init_vector(std::vector *vp) const { - init_vector(*vp); - } - - void init_vector(std::vector &v) const { - v.clear(); - for (const_iterator i=values_.begin(),e=values_.end();i!=e;++i) - extend_vector(v,i->first)=i->second; - } - - void set_new_value(int index, T const& val) { - assert(values_.find(index)==values_.end()); - values_[index]=val; - } - - - // warning: exploits the fact that 0 values are always removed from map. change this if you change that. - bool nonzero(int index) const { - typename MapType::const_iterator found = values_.find(index); - return found==values_.end() || !found->second; - } - - void remove_zeros() { - typename MapType::iterator it = values_.begin(); - for (; it != values_.end(); ++it) - if (!it->second) values_.erase(it); - } - - T get(int index) const { - typename MapType::const_iterator found = values_.find(index); - return found==values_.end()?T():found->second; - } - - T value(int i) const { return get(i); } - - // same as above but may add a 0 entry. TODO: check that people relying on no entry use get - T & operator[](int index){ - return values_[index]; - } - - inline void maybe_set_value(int index, const T &value) { - if (value) values_[index] = value; - } - - inline void set_value(int index, const T &value) { - values_[index] = value; - } - - inline void maybe_add(int index, const T& value) { - if (value) add_value(index,value); - } - - T& add_value(int index, const T &value) { -#if 1 - return values_[index]+=value; -#else - // this is not really going to be any faster, and we already rely on default init = 0 init - std::pair art=values_.insert(std::make_pair(index,value)); - T &val=art.first->second; - if (!art.second) val += value; // already existed - return val; -#endif - } - - - void store(std::valarray* target) const { - (*target) *= 0; - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) { - if (it->first >= target->size()) break; - (*target)[it->first] = it->second; - } - } - - int max_index() const { - if (empty()) return 0; - typename MapType::const_iterator found =values_.end(); - --found; - return found->first; - } - - // dot product with a unit vector of the same length - // as the sparse vector - T dot() const { - T sum = T(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += it->second; - return sum; - } - - template - S cosine_sim(const SparseVector &vec) const { - return dot(vec)/(l2norm()*vec.l2norm()); - } - - // if values are binary, gives |A intersect B|/|A union B| - template - S tanimoto_coef(const SparseVector &vec) const { - S dp=dot(vec); - return dp/(l2norm_sq()+vec.l2norm_sq()-dp); - } - - template - S dot(const SparseVector &vec) const { - S sum = S(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - { - typename MapType::const_iterator - found = vec.values_.find(it->first); - if (found != vec.values_.end()) - sum += it->second * found->second; - } - return sum; - } - - template - S dot(const std::vector &vec) const { - S sum = S(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - { - if (it->first < static_cast(vec.size())) - sum += it->second * vec[it->first]; - } - return sum; - } - - template - S dot(const S *vec) const { - // this is not range checked! - S sum = S(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += it->second * vec[it->first]; - std::cout << "dot(*vec) " << sum << std::endl; - return sum; - } - - T l1norm() const { - T sum = T(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += fabs(it->second); - return sum; - } - - T l2norm_sq() const { - T sum = T(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += it->second * it->second; - return sum; - } - - T l2norm() const { - return sqrt(l2norm_sq()); - } - - void erase(int key) { - values_.erase(key); -/* typename MapType::iterator found = values_.find(key); - if (found!=values_end()) - values_.erase(found);*/ - } - - template - void set_from(SparseVector const& other) { - for (typename MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { - values_[it->first]=it->second; - } - } - - SparseVector &operator+=(const SparseVector &other) { - for (typename MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { -// T v = - (values_[it->first] += it->second); -// if (!v) values_.erase(it->first); - } - return *this; - } - - template - SparseVector &operator+=(const SparseVector &other) { - for (typename SparseVector::MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { -// T v = - (values_[it->first] += it->second); -// if (!v) values_.erase(it->first); - } - return *this; - } - - SparseVector &operator-=(const SparseVector &other) { - for (typename MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { -// T v = - (values_[it->first] -= it->second); -// if (!v) values_.erase(it->first); - } - return *this; - } - - friend SparseVector operator -(SparseVector x,SparseVector const& y) { - x-=y; - return x; - } - friend SparseVector operator +(SparseVector x,SparseVector const& y) { - x+=y; - return x; - } - -private: - // DEPRECATED: becuase 0 values are dropped from the map, this doesn't even make sense if you have a fully populated (not really sparse re: what you'll ever use) vector - SparseVector &operator-=(T const& x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second -= x; - return *this; - } - - SparseVector &operator+=(T const& x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second += x; - return *this; - } -public: - SparseVector &operator/=(const T &x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second /= x; - return *this; - } - - SparseVector &operator*=(const T& x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second *= x; - return *this; - } - - SparseVector operator+(T const& x) const { - SparseVector result = *this; - return result += x; - } - - SparseVector operator-(T const& x) const { - SparseVector result = *this; - return result -= x; - } - - SparseVector operator/(T const& x) const { - SparseVector result = *this; - return result /= x; - } - - std::ostream &operator<<(std::ostream& out) const { - Write(true, &out); - return out; - } - - void Write(const bool with_semi, std::ostream* os) const { - bool first = true; - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) { - // by definition feature id 0 is a dummy value - if (!it->first) continue; - if (with_semi) { - (*os) << (first ? "" : ";") - << FD::Convert(it->first) << '=' << it->second; - } else { - (*os) << (first ? "" : " ") - << FD::Convert(it->first) << '=' << it->second; - } - first = false; - } - } - - bool operator==(Self const & other) const { - return size()==other.size() && contains_keys_of(other) && other.contains_i(*this); - } - - std::size_t hash_impl() const { - return boost::hash_range(begin(),end()); - } - - bool contains(Self const &o) const { - return size()>o.size() && contains(o); - } - - bool at_equals(int i,T const& val) const { - const_iterator it=values_.find(i); - if (it==values_.end()) return !val; - return it->second==val; - } - - bool contains_i(Self const& o) const { - for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) - if (!at_equals(i->first,i->second)) - return false; - return true; - } - - bool contains_keys_of(Self const& o) const { - for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) - if (values_.find(i->first)==values_.end()) - return false; - return true; - } - -#ifndef SPARSE_VECTOR_HASH - bool operator<(const SparseVector &other) const { - typename MapType::const_iterator it = values_.begin(); - typename MapType::const_iterator other_it = other.values_.begin(); - - for (; it != values_.end() && other_it != other.values_.end(); ++it, ++other_it) - { - if (it->first < other_it->first) return true; - if (it->first > other_it->first) return false; - if (it->second < other_it->second) return true; - if (it->second > other_it->second) return false; - } - return values_.size() < other.values_.size(); - } -#endif - - int size() const { return values_.size(); } - - int num_active() const { return values_.size(); } - bool empty() const { return values_.empty(); } - - const_iterator begin() const { return values_.begin(); } - const_iterator end() const { return values_.end(); } - - void clear() { - values_.clear(); - } - - void swap(SparseVector& other) { - values_.swap(other.values_); - } - - MapType values_; -private: +#if 0 #if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP friend class boost::serialization::access; @@ -568,96 +37,11 @@ private: BOOST_CLASS_TRACKING(SparseVector,track_never) #endif -template -inline void swap(SparseVector &a,SparseVector &b) { - a.swap(b); -} - -//like a pair but can live in a union, because it lacks default+copy ctors, dtor. -template -struct feature_val { - int fid; - T val; -}; - -template -inline feature_val featval(int fid,T const &val) { - feature_val f; - f.fid=fid; - f.val=val; - return f; -} - - -// doesn't support fast indexing directly -template -class SparseVectorList { - typedef feature_val Pair; - typedef SmallVector List; - typedef typename List::const_iterator const_iterator; - SparseVectorList() { } - template - SparseVectorList(I i,I const& end) { - int c=0; - for (;i const& v) { - for (unsigned i=0;i *to) const { - for (int i=0;iset_value(p[i].fid,p[i].val); - } - void copy_to(SparseVector *to) const { - to->clear(); - overlay(to); - } - SparseVector sparse() const { - SparseVector r; - copy_to(r); - return r; - } -private: - List p; -}; - -template -std::size_t hash_value(SparseVector const& x) { - return x.hash_impl(); -} - -template -SparseVector operator+(const SparseVector& a, const SparseVector& b) { - SparseVector result = a; - return result += b; -} - -template -SparseVector operator*(const double& a, const SparseVector& b) { - SparseVector result = b; - return result *= a; -} - -#else +#endif /// FIX #include "fast_sparse_vector.h" #define SparseVector FastSparseVector -#endif - template SparseVector operator*(const SparseVector& a, const double& b) { SparseVector result = a; diff --git a/utils/ts.cc b/utils/ts.cc index 563794c5..3694e076 100644 --- a/utils/ts.cc +++ b/utils/ts.cc @@ -40,6 +40,19 @@ void test_logv() { MPrint(x); x -= x; MPrint(x); + FastSparseVector y; + y = x; + for (int i = 1; i < 10; ++i) { + x.set_value(i, prob_t(i*1.3)); + y.set_value(i*2, prob_t(i*1.4)); + } + swap(x,y); + MPrint(y); + MPrint(x); + x = y; + y = y; + x = x; + MPrint(y); } int main() { -- cgit v1.2.3