From a2d9d0f96502c7d3c04303f3db36a8602d992287 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 23 Jul 2010 19:17:22 +0000 Subject: sparse_vector use google::dense_hash_map, fsa scan logging git-svn-id: https://ws10smt.googlecode.com/svn/trunk@383 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/ff_fsa.h | 122 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 42 deletions(-) (limited to 'decoder/ff_fsa.h') diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index 3a9478e2..e359cfd9 100755 --- a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -3,7 +3,13 @@ //SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h -//TODO: actually compile this; probably full of syntax errors. +#define FSA_DEBUG +#ifdef FSA_DEBUG +# include +# define FSADBG(x) do { std::cerr << x; } while(0) +#else +# define FSADBG(x) +#endif #include //C99 #include @@ -34,7 +40,6 @@ protected: if (h_start.size()!=sb) h_start.resize(sb); state_bytes_=sb; } - int fid_; // you can have more than 1 feature of course. void init_fid(std::string const& name) { // call this, though, if you have a single feature fid_=FD::Convert(name); @@ -54,6 +59,11 @@ protected: to_state(state,(char const*)begin,(char const*)end); } public: + //edges may have old features on them. override if you have more than 1 fid. we need to call this explicitly because edges may have old feature values already, and I chose to use add_value (+=) to simplify scanning a phrase, rather than set_value (=) for fsa ffs. could revisit this and use set_value and therefore sum + void init_features(FeatureVector *fv) const { + fv->set_value(fid_,0); + //erase(fid_) + } // return m: all strings x with the same final m+1 letters must end in this state /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */ int markov_order() const { return 0; } // override if you use state. order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though) @@ -87,6 +97,69 @@ public: }; +// init state is in cs; overwrite cs, ns repeatedly (alternatively). return resulting state +template +void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *features, void *cs,void *ns) { + // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. + void *os,*es; + WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking + if ((end-i)&1) { // odd # of words + os=cs; + es=ns; + i-=1; + goto odd; + } else { + es=cs; + os=ns; + } + for (;io + odd: + ff.Scan(smeta,*(i+1),os,es,features); // o->e + } + return es; +} + +// if you have a more efficient implementation for scanning a phrase than one word at a time (e.g. LM context using sliding window buffer rather than rotating through a fixed state size), you can override this +template +void Scan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i,WordID const* e,void const* state,void *next_state,FeatureVector *features) { + int ssz=ff.state_bytes(); + if (!ssz) { + for (;i +void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) { + int ssz=ff.state_bytes(); + if (ssz) { + Bytes state(ssz),state2(ssz); + void *cs=state.begin(),*ns=state2.begin(); + memcpy(cs,start_state,ff.state_bytes()); + FsaScan(ff,smeta,i,end,h_features,cs,ns); + } else + for (;i struct FsaTypedBase : public FsaFeatureFunctionBase { @@ -114,41 +187,20 @@ public: } }; + // usage (if you're lazy): // struct ShorterThanPrev : public FsaTypedBase,FsaTypedScan -template -struct FsaTypedScan { +template +struct FsaTypedScan : public FsaTypedBase { void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { Impl const* impl=static_cast(this); - impl->Scan(smeta,w,impl->state(st),impl->state(next_state),features); + impl->ScanTyped(smeta,w,impl->state(st),impl->state(next_state),features); + FSADBG("Scan "<state(st)<<" ->"<state(next_state)<<" = "<<(*features)[impl->fid_]< -void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features, void *cs,void *ns) { - // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. - void *os,*es; - WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking - if ((end-i)&1) { // odd # of words - os=cs; - es=ns; - i-=1; - goto odd; - } else { - es=cs; - os=ns; - } - for (;io - odd: - ff.Scan(smeta,*(i+1),os,es,h_features); // o->e - } - return es; -} - // do not use if state size is 0, please. const bool optimize_FsaScanner_zerostate=false; @@ -205,20 +257,6 @@ struct FsaScanner { }; -template -void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) { - int ssz=ff.state_bytes(); - if (ssz) { - Bytes state(ssz),state2(ssz); - void *cs=state.begin(),*ns=state2.begin(); - memcpy(cs,start_state,ff.state_bytes()); - FsaScan(ff,smeta,i,end,h_features,cs,ns); - } else - for (;i