From 90d6674878bfc231012bb8eb2a3eaa183eee5220 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 23 Jul 2010 04:21:51 +0000 Subject: fsa: stateless works, debug sample bigram {Longer,Shorter}ThanPrev git-svn-id: https://ws10smt.googlecode.com/svn/trunk@375 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cdec_ff.cc | 4 +- decoder/ff_from_fsa.h | 1 - decoder/ff_fsa.h | 77 +++++++++++++++++++++++---------- decoder/ff_sample_fsa.h | 111 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 169 insertions(+), 24 deletions(-) create mode 100755 decoder/ff_sample_fsa.h diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index ecb244d8..78c67fb3 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -8,13 +8,15 @@ #include "ff_factory.h" #include "ff_ruleshape.h" #include "ff_bleu.h" -#include "ff_from_fsa.h" +#include "ff_sample_fsa.h" boost::shared_ptr global_ff_registry; void register_feature_functions() { global_ff_registry->Register(new FFFactory); global_ff_registry->Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa + global_ff_registry->Register(new FFFactory >); + global_ff_registry->Register(new FFFactory >); //TODO: use for all features the new Register which requires usage(...) #ifdef HAVE_RANDLM global_ff_registry->Register("RandLM", new FFFactory); diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index cb7255d8..6f2e27f0 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -187,7 +187,6 @@ private: }; -typedef FeatureFunctionFromFsa WordPenaltyFromFsa; #endif diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index 3096f049..3a9478e2 100755 --- a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -30,6 +30,8 @@ protected: Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "" for lm. int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course) void set_state_bytes(int sb=0) { + if (start.size()!=sb) start.resize(sb); + if (h_start.size()!=sb) h_start.resize(sb); state_bytes_=sb; } @@ -37,8 +39,21 @@ protected: void init_fid(std::string const& name) { // call this, though, if you have a single feature fid_=FD::Convert(name); } + inline void static to_state(void *state,char const* begin,char const* end) { + std::memcpy(state,begin,end-begin); + } + inline void static to_state(void *state,char const* begin,int n) { + std::memcpy(state,begin,n); + } + template + inline void static to_state(void *state,T const* begin,int n) { + to_state(state,(char const*)begin,n); + } + template + inline void static to_state(void *state,T const* begin,T const* end) { + to_state(state,(char const*)begin,(char const*)end); + } public: - // return m: all strings x with the same final m+1 letters must end in this state /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */ int markov_order() const { return 0; } // override if you use state. order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though) @@ -64,12 +79,49 @@ public: //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support. however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping) // NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members - void Scan(SentenceMetadata const& smeta,WordID x,void const* state,void *next_state,FeatureVector *features) const { + void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const { } // don't set state-bytes etc. in ctor because it may depend on parsing param string - FsaFeatureFunctionBase() : start(0),h_start(0),state_bytes_(0) { } + FsaFeatureFunctionBase(int statesz=0) : start(statesz),h_start(statesz),state_bytes_(statesz) { } + +}; + +// if State is pod. sets state size and allocs start, h_start +template +struct FsaTypedBase : public FsaFeatureFunctionBase { +protected: + typedef St State; + static inline State & state(void *state) { + return *(State*)state; + } + static inline State const& state(void const* state) { + return *(State const*)state; + } + void set_starts(State const& s,State const& heuristic_s) { + if (0) { // already in ctor + start.resize(sizeof(State)); + h_start.resize(sizeof(State)); + } + state(start.begin())=s; + state(h_start.begin())=heuristic_s; + } + void set_h_start(State const& s) { + } +public: + int markov_order() const { return 1; } + FsaTypedBase() : FsaFeatureFunctionBase(sizeof(State)) { + } +}; +// usage (if you're lazy): +// struct ShorterThanPrev : public FsaTypedBase,FsaTypedScan +template +struct FsaTypedScan { + void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { + Impl const* impl=static_cast(this); + impl->Scan(smeta,w,impl->state(st),impl->state(next_state),features); + } }; @@ -169,26 +221,7 @@ void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, W //TODO: combine 2 FsaFeatures typelist style (can recurse for more) -// example: feature val = -1 * # of target words -struct WordPenaltyFsa : public FsaFeatureFunctionBase { - WordPenaltyFsa(std::string const& param) { - init_fid(usage(false,false)); - return; - //below are all defaults: - set_state_bytes(0); - start.clear(); - h_start.clear(); - } - static const float val_per_target_word=-1; - // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates. state and next_state may be same memory. - void Scan(SentenceMetadata const& smeta,WordID x,void const* state,void *next_state,FeatureVector *features) const { - features->add_value(fid_,val_per_target_word); - } - static std::string usage(bool param,bool verbose) { - return FeatureFunction::usage_helper("WordPenaltyFsa","","-1 per target word",param,verbose); - } -}; #endif diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h new file mode 100755 index 00000000..13a25387 --- /dev/null +++ b/decoder/ff_sample_fsa.h @@ -0,0 +1,111 @@ +#ifndef FF_SAMPLE_FSA_H +#define FF_SAMPLE_FSA_H + +#include "ff_from_fsa.h" + +// example: feature val = -1 * # of target words +struct WordPenaltyFsa : public FsaFeatureFunctionBase { + static std::string usage(bool param,bool verbose) { + return FeatureFunction::usage_helper( + "WordPenaltyFsa","","-1 per target word" + ,param,verbose); + } + + WordPenaltyFsa(std::string const& param) { + init_fid(usage(false,false)); + return; + //below are all defaults: + set_state_bytes(0); + start.clear(); + h_start.clear(); + } + static const float val_per_target_word=-1; + // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates. state and next_state may be same memory. + void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const { + features->add_value(fid_,val_per_target_word); + } +}; + +typedef FeatureFunctionFromFsa WordPenaltyFromFsa; + + +// +struct LongerThanPrev : public FsaFeatureFunctionBase { + static std::string usage(bool param,bool verbose) { + return FeatureFunction::usage_helper( + "LongerThanPrev", + "", + "stupid example stateful (bigram) feature: -1 per target word that's longer than the previous word (always fires for first word of sentence)", + param,verbose); + } + + static inline int &wordlen(void *state) { + return *(int*)state; + } + static inline int wordlen(void const* state) { + return *(int const*)state; + } + static inline int wordlen(WordID w) { + return std::strlen(TD::Convert(w)); + } + int markov_order() const { return 1; } + LongerThanPrev(std::string const& param) { + init_fid(usage(false,false)); + set_state_bytes(sizeof(int)); +// start.resize(state_bytes()); // this is done by set_state_bytes already. +// h_start.resize(state_bytes()); +// int ss=-1; +// wordcpy((WordID*)start.begin(),&ss,&ss+1); + //to_state(start.begin(),&ss,1); + wordlen(start.begin())=-1; // same as above. + wordlen(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous + } + + static const float val_per_target_word=-1; + void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const { + int prevlen=wordlen(state); + int len=wordlen(w); + wordlen(next_state)=len; + if (len>prevlen) + features->add_value(fid_,val_per_target_word); + } + +}; + +// similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State) +struct ShorterThanPrev : public FsaTypedBase,FsaTypedScan { + typedef int State; // defines # of bytes in state and return type of state(void *) + static std::string usage(bool param,bool verbose) { + return FeatureFunction::usage_helper( + "ShorterThanPrev", + "", + "stupid example stateful (bigram) feature: -1 per target word that's shorter than the previous word (always fires for end of sentence)", + param,verbose); + } + + static inline int wordlen(WordID w) { + return std::strlen(TD::Convert(w)); + } + ShorterThanPrev(std::string const& param) { + init_fid(usage(false,false)); + end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing + set_starts(-1,4); // estimate: anything <4 chars is usually shorter than previous + } + + static const float val_per_target_word=-1; + // evil anti-google int & len out-param: + void Scan(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const { + len=wordlen(w); + if (lenadd_value(fid_,val_per_target_word); + } + + // already provided by FsaTypedScan + void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { + Scan(smeta,w,state(st),state(next_state),features); + } + +}; + + +#endif -- cgit v1.2.3