From a2d9d0f96502c7d3c04303f3db36a8602d992287 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 23 Jul 2010 19:17:22 +0000 Subject: sparse_vector use google::dense_hash_map, fsa scan logging git-svn-id: https://ws10smt.googlecode.com/svn/trunk@383 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cdec.cc | 2 +- decoder/ff.h | 2 +- decoder/ff_from_fsa.h | 29 ++++++++-- decoder/ff_fsa.h | 122 +++++++++++++++++++++++++-------------- decoder/ff_sample_fsa.h | 15 +++-- decoder/hg.cc | 5 +- decoder/logval.h | 2 + decoder/sparse_vector.h | 100 ++++++++++++++++++++++++-------- decoder/value_array.h | 2 +- extools/mr_stripe_rule_reduce.cc | 2 +- 10 files changed, 196 insertions(+), 85 deletions(-) diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 6fb6d5a1..f366a08f 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -749,7 +749,7 @@ int main(int argc, char** argv) { } if (output_training_vector) { - acc_vec.clear_value(0); + acc_vec.erase(0); ++g_count; if (g_count % combine_size == 0) { if (encode_b64) { diff --git a/decoder/ff.h b/decoder/ff.h index e54ac149..0bfc8582 100644 --- a/decoder/ff.h +++ b/decoder/ff.h @@ -175,7 +175,7 @@ class ModelSet { // sets edge->feature_values_ and edge->edge_prob_ // NOTE: edge must not necessarily be in hg.edges_ but its TAIL nodes - // must be. + // must be. edge features are supposed to be overwritten, not added to (possibly because rule features aren't in ModelSet so need to be left alone void AddFeaturesToEdge(const SentenceMetadata& smeta, const Hypergraph& hg, const std::vector& node_states, diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index 6f2e27f0..f84bda31 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -39,6 +39,7 @@ public: FeatureVector* estimated_features, void* out_state) const { + ff.init_features(features); // estimated_features is fresh if (!ssz) { TRule const& rule=*edge.rule_; Sentence const& e = rule.e(); @@ -112,11 +113,11 @@ public: FeatureVector* final_features) const { Sentence const& ends=ff.end_phrase(); - SP ss=ff.start_state(); if (!ssz) { - AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,ss); + AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,0); return; } + SP ss=ff.start_state(); WP l=(WP)residual_state,lend=left_end(residual_state); SP rst=fsa_state(residual_state); if (lend==rst) { // implying we have an fsa state @@ -137,6 +138,15 @@ public: return StateSize()==0; // Fsa features don't get info about span } + static void test() { + WordID w1[1],w1b[1],w2[2]; + w1[0]=w2[0]=TD::Convert("hi"); + w2[1]=w1b[0]=TD::none; + assert(left_end(w1,w1+1)==w1+1); + assert(left_end(w1b,w1b+1)==w1b); + assert(left_end(w2,w2+2)==w2+1); + } + private: Impl ff; void Init() { @@ -147,8 +157,8 @@ private: SetStateSize(ff.state_bytes()+state_offset); } int M; // markov order (ctx len) - FeatureFunctionFromFsa() { } - // call this explicitly in constructor body: + FeatureFunctionFromFsa(); // not allowed. + int state_offset; // store left-words first, then fsa state int ssz; // bytes in fsa state /* @@ -183,10 +193,17 @@ private: inline void fstatecpy(void *dest,void const* src) const { std::memcpy(dest,src,ssz); } - - }; +#ifdef TEST_FSA +# include "tdict.cc" +# include "ff_sample_fsa.h" +int main() { + std::cerr<<"Testing left_end...\n"; + WordPenaltyFromFsa::test(); + return 0; +} +#endif #endif diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index 3a9478e2..e359cfd9 100755 --- a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -3,7 +3,13 @@ //SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h -//TODO: actually compile this; probably full of syntax errors. +#define FSA_DEBUG +#ifdef FSA_DEBUG +# include +# define FSADBG(x) do { std::cerr << x; } while(0) +#else +# define FSADBG(x) +#endif #include //C99 #include @@ -34,7 +40,6 @@ protected: if (h_start.size()!=sb) h_start.resize(sb); state_bytes_=sb; } - int fid_; // you can have more than 1 feature of course. void init_fid(std::string const& name) { // call this, though, if you have a single feature fid_=FD::Convert(name); @@ -54,6 +59,11 @@ protected: to_state(state,(char const*)begin,(char const*)end); } public: + //edges may have old features on them. override if you have more than 1 fid. we need to call this explicitly because edges may have old feature values already, and I chose to use add_value (+=) to simplify scanning a phrase, rather than set_value (=) for fsa ffs. could revisit this and use set_value and therefore sum + void init_features(FeatureVector *fv) const { + fv->set_value(fid_,0); + //erase(fid_) + } // return m: all strings x with the same final m+1 letters must end in this state /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */ int markov_order() const { return 0; } // override if you use state. order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though) @@ -87,6 +97,69 @@ public: }; +// init state is in cs; overwrite cs, ns repeatedly (alternatively). return resulting state +template +void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *features, void *cs,void *ns) { + // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. + void *os,*es; + WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking + if ((end-i)&1) { // odd # of words + os=cs; + es=ns; + i-=1; + goto odd; + } else { + es=cs; + os=ns; + } + for (;io + odd: + ff.Scan(smeta,*(i+1),os,es,features); // o->e + } + return es; +} + +// if you have a more efficient implementation for scanning a phrase than one word at a time (e.g. LM context using sliding window buffer rather than rotating through a fixed state size), you can override this +template +void Scan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i,WordID const* e,void const* state,void *next_state,FeatureVector *features) { + int ssz=ff.state_bytes(); + if (!ssz) { + for (;i +void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) { + int ssz=ff.state_bytes(); + if (ssz) { + Bytes state(ssz),state2(ssz); + void *cs=state.begin(),*ns=state2.begin(); + memcpy(cs,start_state,ff.state_bytes()); + FsaScan(ff,smeta,i,end,h_features,cs,ns); + } else + for (;i struct FsaTypedBase : public FsaFeatureFunctionBase { @@ -114,41 +187,20 @@ public: } }; + // usage (if you're lazy): // struct ShorterThanPrev : public FsaTypedBase,FsaTypedScan -template -struct FsaTypedScan { +template +struct FsaTypedScan : public FsaTypedBase { void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { Impl const* impl=static_cast(this); - impl->Scan(smeta,w,impl->state(st),impl->state(next_state),features); + impl->ScanTyped(smeta,w,impl->state(st),impl->state(next_state),features); + FSADBG("Scan "<state(st)<<" ->"<state(next_state)<<" = "<<(*features)[impl->fid_]< -void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features, void *cs,void *ns) { - // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. - void *os,*es; - WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking - if ((end-i)&1) { // odd # of words - os=cs; - es=ns; - i-=1; - goto odd; - } else { - es=cs; - os=ns; - } - for (;io - odd: - ff.Scan(smeta,*(i+1),os,es,h_features); // o->e - } - return es; -} - // do not use if state size is 0, please. const bool optimize_FsaScanner_zerostate=false; @@ -205,20 +257,6 @@ struct FsaScanner { }; -template -void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) { - int ssz=ff.state_bytes(); - if (ssz) { - Bytes state(ssz),state2(ssz); - void *cs=state.begin(),*ns=state2.begin(); - memcpy(cs,start_state,ff.state_bytes()); - FsaScan(ff,smeta,i,end,h_features,cs,ns); - } else - for (;i4 chars is usually longer than previous } @@ -73,8 +73,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { }; // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State) -struct ShorterThanPrev : public FsaTypedBase,FsaTypedScan { - typedef int State; // defines # of bytes in state and return type of state(void *) +struct ShorterThanPrev : FsaTypedScan { static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "ShorterThanPrev", @@ -88,22 +87,22 @@ struct ShorterThanPrev : public FsaTypedBase,FsaTypedScan } ShorterThanPrev(std::string const& param) { init_fid(usage(false,false)); - end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing +// end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing set_starts(-1,4); // estimate: anything <4 chars is usually shorter than previous } static const float val_per_target_word=-1; // evil anti-google int & len out-param: - void Scan(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const { + void ScanTyped(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const { len=wordlen(w); if (lenadd_value(fid_,val_per_target_word); } // already provided by FsaTypedScan - void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { - Scan(smeta,w,state(st),state(next_state),features); - } +/* void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { + ScanTyped(smeta,w,state(st),state(next_state),features); + } */ }; diff --git a/decoder/hg.cc b/decoder/hg.cc index f0d6b3d5..5a33fc97 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -54,12 +54,13 @@ struct ScaledTransitionEventWeightFunction { // safe to reinterpret a vector of these as a vector of prob_t (plain old data) struct TropicalValue { TropicalValue() : v_() {} - explicit TropicalValue(int v) { + TropicalValue(int v) { if (v == 0) v_ = prob_t::Zero(); else if (v == 1) v_ = prob_t::One(); else { cerr << "Bad value in TropicalValue(int).\n"; abort(); } } - explicit TropicalValue(const prob_t& v) : v_(v) {} + TropicalValue(unsigned v) : v_(v) {} + TropicalValue(const prob_t& v) : v_(v) {} inline TropicalValue& operator+=(const TropicalValue& o) { if (v_ < o.v_) v_ = o.v_; return *this; diff --git a/decoder/logval.h b/decoder/logval.h index c8c342a3..37f14ae5 100644 --- a/decoder/logval.h +++ b/decoder/logval.h @@ -13,6 +13,8 @@ class LogVal { public: LogVal() : s_(), v_(-std::numeric_limits::infinity()) {} explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(unsigned x) : s_(0), v_(std::log(x)) { } LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} static LogVal exp(T lnx) { return LogVal(lnx,false); } diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index b42e001a..0f3724f0 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -1,6 +1,17 @@ #ifndef _SPARSE_VECTOR_H_ #define _SPARSE_VECTOR_H_ -/* TODO: use dense_hash_map for sparsevector + +#define SPARSE_VECTOR_HASH + +#ifdef SPARSE_VECTOR_HASH +#include "hash.h" +# define SPARSE_VECTOR_MAP HASH_MAP +# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted) +#else +# define SPARSE_VECTOR_MAP std::map +# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) +#endif +/* use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list) */ /* hack: index 0 never gets printed because cdyer is creative and efficient. features which have no weight got feature dict id 0, see, and the models all clobered that value. nobody wants to see it. except that vlad is also creative and efficient and stored the oracle bleu there. */ @@ -27,21 +38,28 @@ inline T & extend_vector(std::vector &v,int i) { template class SparseVector { + void init_reserved() { + SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2); + } public: - typedef std::map MapType; - typedef typename std::map::const_iterator const_iterator; - SparseVector() {} + typedef SparseVector Self; + typedef SPARSE_VECTOR_MAP MapType; + typedef typename MapType::const_iterator const_iterator; + SparseVector() { + init_reserved(); + } explicit SparseVector(std::vector const& v) { + init_reserved(); typename MapType::iterator p=values_.begin(); - const T z=T(0); + const T z=0; for (unsigned i=0;i *vp) const { init_vector(*vp); } @@ -52,7 +70,6 @@ public: extend_vector(v,i->first)=i->second; } - void set_new_value(int index, T const& val) { assert(values_.find(index)==values_.end()); values_[index]=val; @@ -65,10 +82,10 @@ public: } - const T operator[](int index) const { + T operator[](int index) const { typename MapType::const_iterator found = values_.find(index); if (found == values_.end()) - return T(0); + return 0; else return found->second; } @@ -77,8 +94,11 @@ public: values_[index] = value; } - T add_value(int index, const T &value) { - return values_[index] += value; + T const& add_value(int index, const T &value) { + std::pair art=values_.insert(std::make_pair(index,value)); + T &val=art.first->second; + if (!art.second) val += value; // already existed + return val; } T value(int index) const { @@ -86,7 +106,7 @@ public: if (found != values_.end()) return found->second; else - return T(0); + return 0; } void store(std::valarray* target) const { @@ -184,13 +204,20 @@ public: return sqrt(l2norm_sq()); } + void erase(int key) { + values_.erase(key); +/* typename MapType::iterator found = values_.find(key); + if (found!=values_end()) + values_.erase(found);*/ + } + SparseVector &operator+=(const SparseVector &other) { for (typename MapType::const_iterator it = other.values_.begin(); it != other.values_.end(); ++it) { - T v = (values_[it->first] += it->second); - if (v == T()) - values_.erase(it->first); +// T v = + (values_[it->first] += it->second); +// if (v == 0) values_.erase(it->first); } return *this; } @@ -199,9 +226,9 @@ public: for (typename MapType::const_iterator it = other.values_.begin(); it != other.values_.end(); ++it) { - T v = (values_[it->first] -= it->second); - if (v == T(0)) - values_.erase(it->first); +// T v = + (values_[it->first] -= it->second); +// if (v == 0) values_.erase(it->first); } return *this; } @@ -282,6 +309,35 @@ public: } } + bool operator==(Self const & other) const { + return size()==other.size() && contains_keys_of(other) && other.contains_i(*this); + } + + bool contains(Self const &o) const { + return size()>o.size() && contains(o); + } + + bool at_equals(int i,T const& val) const { + const_iterator it=values_.find(i); + if (it==values_.end()) return val==0; + return it->second==val; + } + + bool contains_i(Self const& o) const { + for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) + if (!at_equals(i->first,i->second)) + return false; + return true; + } + + bool contains_keys_of(Self const& o) const { + for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) + if (values_.find(i)==values_.end()) + return false; + return true; + } + +#ifndef SPARSE_VECTOR_HASH bool operator<(const SparseVector &other) const { typename MapType::const_iterator it = values_.begin(); typename MapType::const_iterator other_it = other.values_.begin(); @@ -295,6 +351,7 @@ public: } return values_.size() < other.values_.size(); } +#endif int size() const { return values_.size(); } @@ -307,9 +364,6 @@ public: void clear() { values_.clear(); } - void clear_value(int index) { - values_.erase(index); - } void swap(SparseVector& other) { values_.swap(other.values_); @@ -328,7 +382,7 @@ class SparseVectorList { SparseVectorList() { } template SparseVectorList(I i,I const& end) { - const T z=T(0); + const T z=0; int c=0; for (;i const& v) { - const T z=T(0); + const T z=0; for (unsigned i=0;i& key, const bool bidir, ID2RuleStati it->second.counts.set_value(cur_marginal_id, tot); // prevent double counting of the joint - if (cur_marginal_id == kCE) it->second.counts.clear_value(kCFE); + if (cur_marginal_id == kCE) it->second.counts.erase(kCFE); } } -- cgit v1.2.3