From 9f34384f610e512488df4bb0f08e962ad95d6d13 Mon Sep 17 00:00:00 2001 From: graehl Date: Tue, 27 Jul 2010 04:59:37 +0000 Subject: fsa feature templated Accum interface, phrase interface allows exceeding markov order e.g. unigram state, 3gram lm. use Accum,set_value rather than clear,add_value. warning: 3gram fsa lm disagrees with bottom-up in 4th decimal place git-svn-id: https://ws10smt.googlecode.com/svn/trunk@431 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cdec.cc | 1 - decoder/cdec_ff.cc | 6 +- decoder/feature_accum.h | 129 ++++++++++++++++++++++ decoder/ff.h | 3 +- decoder/ff_factory.cc | 2 +- decoder/ff_from_fsa.h | 100 +++++++++-------- decoder/ff_fsa.h | 286 ++++++++++++++++++++++++++---------------------- decoder/ff_lm.cc | 62 ++--------- decoder/ff_lm.h | 37 ++++++- decoder/ff_lm_fsa.h | 95 +++++++++++++++- decoder/ff_sample_fsa.h | 2 + decoder/sentences.h | 5 +- decoder/sparse_vector.h | 16 ++- decoder/tdict.cc | 11 ++ decoder/tdict.h | 1 + decoder/value_array.h | 53 +++++++++ 16 files changed, 571 insertions(+), 238 deletions(-) create mode 100755 decoder/feature_accum.h diff --git a/decoder/cdec.cc b/decoder/cdec.cc index bc1c348d..76551cfa 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -190,7 +190,6 @@ void InitCommandLine(int argc, char** argv, OracleBleu &ob, po::variables_map* c exit(1); } - if (conf.count("usage")) { cout<usage(str("usage",conf),true,true)< global_ff_registry; void register_feature_functions() { global_ff_registry->Register(new FFFactory); global_ff_registry->Register(new FFFactory >); // same as LM but using fsa wrapper + + /* // sample_fsa: global_ff_registry->Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa global_ff_registry->Register(new FFFactory >); global_ff_registry->Register(new FFFactory >); + */ + //TODO: use for all features the new Register which requires usage(...) #ifdef HAVE_RANDLM global_ff_registry->Register("RandLM", new FFFactory); diff --git a/decoder/feature_accum.h b/decoder/feature_accum.h new file mode 100755 index 00000000..851b29db --- /dev/null +++ b/decoder/feature_accum.h @@ -0,0 +1,129 @@ +#ifndef FEATURE_ACCUM_H +#define FEATURE_ACCUM_H + +#include "ff.h" +#include "sparse_vector.h" +#include "value_array.h" + +struct SparseFeatureAccumulator : public FeatureVector { + typedef FeatureVector State; + SparseFeatureAccumulator() { } + template + FeatureVector const& describe(FF const& ) { return *this; } + void Store(FeatureVector *fv) const { + fv->set_from(*this); + } + template + void Store(FF const& /* ff */,FeatureVector *fv) const { + fv->set_from(*this); + } + template + void Add(FF const& /* ff */,FeatureVector const& fv) { + (*this)+=fv; + } + void Add(FeatureVector const& fv) { + (*this)+=fv; + } + /* + SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {} + FeatureAccumulator(Features const& fids) {} + FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {} + void Add(Features const& fids,FeatureVector const& fv) { + *this += fv; + } + */ + void Add(int i,Featval v) { + (*this)[i]+=v; + } + void Add(Features const& fids,int i,Featval v) { + (*this)[i]+=v; + } +}; + +struct SingleFeatureAccumulator { + typedef Featval State; + typedef SingleFeatureAccumulator Self; + State v; + /* + void operator +=(State const& o) { + v+=o; + } + */ + void operator +=(Self const& s) { + v+=s.v; + } + SingleFeatureAccumulator() : v() {} + template + State const& describe(FF const& ) const { return v; } + + template + void Store(FF const& ff,FeatureVector *fv) const { + fv->set_value(ff.fid_,v); + } + void Store(Features const& fids,FeatureVector *fv) const { + assert(fids.size()==1); + fv->set_value(fids[0],v); + } + /* + SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); } + SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv) + { + assert(fids.size()==1); + v=fv.get_singleton(); + } + */ + + template + void Add(FF const& ff,FeatureVector const& fv) { + v+=fv.get(ff.fid_); + } + void Add(FeatureVector const& fv) { + v+=fv.get_singleton(); + } + + void Add(Features const& fids,FeatureVector const& fv) { + v += fv.get(fids[0]); + } + void Add(Featval dv) { + v+=dv; + } + void Add(int,Featval dv) { + v+=dv; + } + void Add(FeatureVector const& fids,int i,Featval dv) { + assert(fids.size()==1 && i==0); + v+=dv; + } +}; + + +#if 0 +// omitting this so we can default construct an accum. might be worth resurrecting in the future +struct ArrayFeatureAccumulator : public ValueArray { + typedef ValueArray State; + template + ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) { + for (int i=0,e=iset_value(fids[i],(*this)[i]); + } + void Add(Features const& fids,FeatureVector const& fv) { + for (int i=0,e=i Features; // set of features ids class FeatureFunction { public: std::string name; // set by FF factory using usage() - bool debug; // also set by FF factory checking param for immediate initial "debug" + bool debug_; // also set by FF factory checking param for immediate initial "debug" + bool debug() const { return debug_; } FeatureFunction() : state_size_() {} explicit FeatureFunction(int state_size) : state_size_(state_size) {} virtual ~FeatureFunction(); diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index aec82d38..88991fbf 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -43,7 +43,7 @@ shared_ptr FFRegistry::Create(const string& ffname, const strin } res = it->second->Create(p); res->name=ffname; - res->debug=debug; + res->debug_=debug; } return res; } diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index f50e0fdc..7fa6be67 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -5,8 +5,8 @@ #define FSA_FF_DEBUG 0 #if FSA_FF_DEBUG -# define FSAFFDBG(e,x) FSADBGif(debug,e,x) -# define FSAFFDBGnl(e) FSADBGif_nl(debug,e) +# define FSAFFDBG(e,x) FSADBGif(debug(),e,x) +# define FSAFFDBGnl(e) FSADBGif_nl(debug(),e) #else # define FSAFFDBG(e,x) # define FSAFFDBGnl(e) @@ -30,7 +30,7 @@ class FeatureFunctionFromFsa : public FeatureFunction { typedef WordID const* WP; public: FeatureFunctionFromFsa(std::string const& param) : ff(param) { - debug=true; // because factory won't set until after we construct. + debug_=true; // because factory won't set until after we construct. Init(); } @@ -48,18 +48,27 @@ public: FeatureVector* estimated_features, void* out_state) const { - ff.init_features(features); // estimated_features is fresh + TRule const& rule=*edge.rule_; + Sentence const& e = rule.e(); + typename Impl::Accum accum,h_accum; if (!ssz) { - TRule const& rule=*edge.rule_; - Sentence const& e = rule.e(); - for (int j = 0; j < e.size(); ++j) { // items in target side of rule - if (e[j] < 1) { // variable - } else { + Sentence phrase; + phrase.reserve(e.size()); + for (int j=0,je=e.size();;++j) { // items in target side of rule + if (je==j || e[j]<1) { // end or variable + if (phrase.size()) { + FSAFFDBG(edge," ["< fsa(ff,smeta,edge); // this holds our current state and eventuallybecomes our right state if we saw enough words - TRule const& rule=*edge.rule_; - Sentence const& e = rule.e(); for (int j = 0; j < e.size(); ++j) { // items in target side of rule if (e[j] < 1) { // variable SP a = ant_contexts[-e[j]]; // variables a* are referring to this child derivation state. @@ -91,11 +98,11 @@ public: left_out=(W)left_full; // heuristic known now fsa.reset(h_start); - fsa.scan(left_begin,left_full,estimated_features); // save heuristic (happens once only) - fsa.scan(al+ntofill,ale,features); // because of markov order, fully filled left words scored starting at h_start put us in the right state to score the extra words (which are forgotten) + fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens once only) + fsa.scan(al+ntofill,ale,&accum); // because of markov order, fully filled left words scored starting at h_start put us in the right state to score the extra words (which are forgotten) al+=ntofill; // we used up the first ntofill words of al to end up in some known state via exactly M words total (M-ntofill were there beforehand). now we can scan the remaining al words of this child } else { // more to score / state to update (left already full) - fsa.scan(al,ale,features); + fsa.scan(al,ale,&accum); } if (anw==M) // child had full state already fsa.reset(fsa_state(a)); @@ -108,22 +115,24 @@ public: *left_out++=ew; if (left_out==left_full) { // handle heuristic, once only, establish state fsa.reset(h_start); - fsa.scan(left_begin,left_full,estimated_features); // save heuristic (happens only once) + fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens only once) } } else - fsa.scan(ew,features); + fsa.scan(ew,&accum); } } void *out_fsa_state=fsa_state(out_state); if (left_out{"<") - FSAFFDBG(edge," end="<{"<{"<") + FSAFFDBG(edge," end="<{"<R if we only scan 1 word at a time, that's fine +#define FSA_SCORE_PHRASE 1 +// if true, special effort is made to give entire phrases to fsa models so they can understate their true markov_order but sometimes have more specific probs. #define FSA_DEBUG 0 @@ -47,29 +53,15 @@ #include "tdict.h" #include "hg.h" #include "sentences.h" +#include "feature_accum.h" typedef ValueArray Bytes; /* -usage: -struct SameFirstLetter : public FsaFeatureFunctionBase { -SameFirstLetter(string const& param) : FsaFeatureFunctionBase(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost - int markov_order() const { return 1; } - Featval Scan1(WordID w,void const* old_state,void *new_state) const { - char cw=TD::Convert(w)[0]; - char co=*(char const*)old_state; - *(char *)new_state = cw; - return cw==co?1:0; - } - void print_state(std::ostream &o,void const* st) const { - o<<*(char const*)st; - } - static std::string usage(bool param,bool verbose) { - return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose); - } -}; +usage: see ff_sample_fsa.h or ff_lm_fsa.h + + then, to decode, see ff_from_fsa.h (or TODO: left->right target-earley style rescoring) -// then, to decode, see ff_from_fsa.h */ template @@ -77,21 +69,26 @@ struct FsaFeatureFunctionBase { Impl const& d() const { return static_cast(*this); } Impl & d() { return static_cast(*this); } protected: - int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course) + int ssz; // don't forget to set this. default 0 (it may depend on params of course) Bytes start,h_start; // start state and estimated-features (heuristic) start state. set these. default empty. Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "" for lm. void set_state_bytes(int sb=0) { if (start.size()!=sb) start.resize(sb); if (h_start.size()!=sb) h_start.resize(sb); - state_bytes_=sb; + ssz=sb; } void set_end_phrase(WordID single) { end_phrase_=singleton_sentence(single); } - int fid_; // you can have more than 1 feature of course. - void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param - fid_=FD::Convert(d().name()); + // CALL 1 of these MANUALLY (because feature name(s) may depend on param, it's not done in ctor) + void InitFidNamed(std::string const& fname="") { + fid_=FD::Convert(name.empty()?name():fname); + Init(); + } + Features features_; + void Init() { + features_=FeatureFunction::single_feature(fid_); } inline void static to_state(void *state,char const* begin,char const* end) { @@ -119,11 +116,14 @@ protected: } public: + int fid_; // you can have more than 1 feature of course. + void state_copy(void *to,void const*from) const { - std::memcpy(to,from,state_bytes_); + if (ssz) + std::memcpy(to,from,ssz); } void state_zero(void *st) const { // you should call this if you don't know the state yet and want it to be hashed/compared properly - std::memset(st,0,state_bytes_); + std::memset(st,0,ssz); } // can override to different return type, e.g. just return feats: @@ -139,7 +139,7 @@ public: void print_state(std::ostream &o,void const*state) const { char const* i=(char const*)state; - char const* e=i+state_bytes_; + char const* e=i+ssz; for (;i!=e;++i) print_hex_byte(o,*i); } @@ -149,26 +149,26 @@ public: d().print_state(o,state); return o.str(); } + typedef SingleFeatureAccumulator Accum; - //edges may have old features on them. override if you have more than 1 fid. we need to call this explicitly because edges may have old feature values already, and I chose to use add_value (+=) to simplify scanning a phrase, rather than set_value (=) for fsa ffs. could revisit this and use set_value and therefore sum - void init_features(FeatureVector *fv) const { - fv->set_value(fid_,0); - //erase(fid_) - } // return m: all strings x with the same final m+1 letters must end in this state /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */ int markov_order() const { return 0; } // override if you use state. order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though) //TODO: if we wanted, we could mark certain states as maximal-context, but this would lose our fixed amount of left context in ff_from_fsa, and lose also our vector operations (have to scan left words 1 at a time, checking always to see where you change from h to inside - BUT, could detect equivalent LM states, which would be nice). - Features features() const { // override this if >1 fid - return FeatureFunction::single_feature(fid_); + Features features() const { + return features_; + } + + int n_features() const { + return features_.size(); } // override this (static) static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper("unnamed_fsa_feature","","",param,verbose); } - int state_bytes() const { return state_bytes_; } // or override this + int state_bytes() const { return ssz; } // or override this void const* start_state() const { return start.begin(); } @@ -176,92 +176,134 @@ public: return h_start.begin(); } Sentence const& end_phrase() const { return end_phrase_; } - // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates. state and next_state will never be the same memory. + // move from state to next_state after seeing word x, while emitting features->set_value(fid,val) possibly with duplicates. state and next_state will never be the same memory. //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support. however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping) - // different name because of inheritance method hiding; simple/common case; 1 fid +protected: + // overrides have different name because of inheritance method hiding; + + // simple/common case; 1 fid. these need not be overriden if you have multiple feature ids Featval Scan1(WordID w,void const* state,void *next_state) const { + assert(0); return 0; } + Featval Scan1Meta(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */, + WordID w,void const* state,void *next_state) const { + return d().Scan1(w,state,next_state); + } +public: + template + static inline T* state_as(void *p) { return (T*)p; } + template + static inline T const* state_as(void const* p) { return (T*)p; } + + // must override this or Scan1Meta or Scan1 + template + inline void ScanAccum(SentenceMetadata const& smeta,const Hypergraph::Edge& edge, + WordID w,void const* state,void *next_state,Accum *a) const { + Add(d().Scan1Meta(smeta,edge,smeta,edge,w,state,next_state),a); + } + + // bounce back and forth between two state vars starting at cs, returning end state location. if we required src=dest addr safe state updating, this concept wouldn't need to exist. + // recommend you override this if you score phrases differently than word-by-word. + template + void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { + // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. + if (!ssz) { + for (;io + odd: + d().ScanAccum(smeta,edge,i[0],os,es,accum); // o->e + } + return es; + } + + + + // override this (and SCAN_PHRASE_ACCUM_OVERRIDE ) if you want e.g. maximum possible order ngram scores with markov_order < n-1. in the future SparseFeatureAccumulator will probably be the only option for type-erased FSA ffs. you will be adding to accum, not setting + template + inline void ScanPhraseAccum(SentenceMetadata const& smeta,const Hypergraph::Edge& edge, + WordID const* i, WordID const* end,void const* state,void *next_state,Accum *accum) const { + if (!ssz) { + for (;i \ + void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { \ + ScanPhraseAccum(smeta,edge,i,end,cs,ns,accum); \ + return ns; \ + } \ + template \ + inline void ScanPhraseAccumOnly(SentenceMetadata const& smeta,const Hypergraph::Edge& edge, \ + WordID const* i, WordID const* end,void const* state,Accum *accum) const { \ + char s2[ssz]; ScanPhraseAccum(smeta,edge,i,end,state,(void*)s2,accum); \ + } - // NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members - inline void Scan(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,void const* state,void *next_state,FeatureVector *features) const { - maybe_add_feat(features,d().Scan1(w,state,next_state)); + // override this or bounce along with above. note: you can just call ScanPhraseAccum + // doesn't set state (for heuristic in ff_from_fsa) + template + inline void ScanPhraseAccumOnly(SentenceMetadata const& smeta,const Hypergraph::Edge& edge, + WordID const* i, WordID const* end,void const* state,Accum *accum) const { + char s1[ssz]; + char s2[ssz]; + state_copy(s1,state); + d().ScanPhraseAccumBounce(smeta,edge,i,end,(void*)s1,(void*)s2,accum); } - inline void maybe_add_feat(FeatureVector *features,Featval v) const { - features->maybe_add(fid_,v); + template + inline void Add(Featval v,Accum *a) const { // for single-feat only + a->Add(v); } - inline void add_feat(FeatureVector *features,Featval v) const { - features->add_value(fid_,v); + + inline void set_feat(FeatureVector *features,Featval v) const { + features->set_value(fid_,v); } // don't set state-bytes etc. in ctor because it may depend on parsing param string - FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : state_bytes_(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {} + FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : ssz(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {} }; -// init state is in cs; overwrite cs, ns repeatedly (alternatively). return resulting state -template -void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i, WordID const* end,FeatureVector *features, void *cs,void *ns) { - // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. - void *os,*es; - if ((end-i)&1) { // odd # of words - os=cs; - es=ns; - goto odd; - } else { - i+=1; - es=cs; - os=ns; - } - for (;io - odd: - ff.Scan(smeta,edge,i[0],os,es,features); // o->e - } - return es; -} - -// if you have a more efficient implementation for scanning a phrase than one word at a time (e.g. LM context using sliding window buffer rather than rotating through a fixed state size), you can override this -template -void Scan(FsaFF const& ff,SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i,WordID const* e,void const* state,void *next_state,FeatureVector *features) { - int ssz=ff.state_bytes(); - if (!ssz) { - for (;i -void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) { - int ssz=ff.state_bytes(); - if (ssz) { - Bytes state(ssz),state2(ssz); - void *cs=state.begin(),*ns=state2.begin(); - memcpy(cs,start_state,ff.state_bytes()); - FsaScan(ff,smeta,edge,i,end,h_features,cs,ns); - } else - for (;i +struct MultipleFeatureFsa : public FsaFeatureFunctionBase { + typedef SparseFeatureAccumulator Accum; +}; + + + // if State is pod. sets state size and allocs start, h_start // usage: @@ -316,9 +358,7 @@ public: }; -const bool optimize_FsaScanner_zerostate=false; - -// do not use if state size is 0. should crash (maybe won't if you set optimize_FsaScanner_zerostate true) +// keep a "current state" (bouncing back and forth) template struct FsaScanner { // enum {ALIGN=8}; @@ -347,28 +387,16 @@ struct FsaScanner { cs=st0; std::memcpy(st0,state,ssz); } - void scan(WordID w,FeatureVector *feat) { - if (optimize_FsaScanner_zerostate && !ssz) { - ff.Scan(smeta,edge,w,0,0,feat); - return; - } + template + void scan(WordID w,Accum *a) { void *ns=nexts(); - ff.Scan(smeta,edge,w,cs,ns,feat); + ff.ScanAccum(smeta,edge,w,cs,ns,a); cs=ns; } - - void scan(WordID const* i,WordID const* end,FeatureVector *feat) { -#if 1 - // faster. - if (optimize_FsaScanner_zerostate && !ssz) - for (;i + void scan(WordID const* i,WordID const* end,Accum *a) { + // faster. and allows greater-order excursions + cs=ff.ScanPhraseAccumBounce(smeta,edge,i,end,cs,nexts(),a); } }; diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index 0f44f8d3..3d81a599 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -1,5 +1,5 @@ #define LM_FSA_SHORTEN_CONTEXT 1 -// seems to work great - just not sure if it actually speeds anything up +// seems to work great - just not sure if it actually speeds anything up // virtual LogP contextBOW(const VocabIndex *context, unsigned length); /* backoff weight for truncating context */ // does that need to be used? i think so. @@ -188,7 +188,7 @@ struct LMClient { char request_buffer[16000]; }; -class LanguageModelImpl { +class LanguageModelImpl : public LanguageModelInterface { void init(int order) { //all these used to be const members, but that has no performance implication, and now there's less duplication. order_=order; @@ -251,21 +251,6 @@ class LanguageModelImpl { return ngram_.contextBOW((VocabIndex*)context,shortened_len); } - double ShortenContext(WordID * context,int len) { - int slen=ContextSize(context,len); - double p=ContextBOW(context,slen); - while (len>slen) { - --len; - context[len]=TD::none; - } - return p; - } - - /// NOT a negative logp, i.e. should be worse prob = more negative. that's what SRI wordProb returns, fortunately. - inline double clamp(double logp) const { - return logp < floor_ ? floor_ : logp; - } - inline double LookupProbForBufferContents(int i) { // int k = i; cerr << "P("; while(buffer_[k] > 0) { std::cerr << TD::Convert(buffer_[k++]) << " "; } double p = WordProb(buffer_[i], &buffer_[i+1]); @@ -457,7 +442,6 @@ public: int order_; int state_size_; public: - double floor_; WordID kSTART; WordID kSTOP; WordID kUNKNOWN; @@ -606,9 +590,6 @@ void LanguageModelFsa::set_ngram_order(int i) { } } } -namespace { -WordID empty_context=TD::none; -} LanguageModelFsa::LanguageModelFsa(string const& param) { int lmorder; @@ -617,29 +598,8 @@ LanguageModelFsa::LanguageModelFsa(string const& param) { set_ngram_order(lmorder); } -void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const { - //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name) - Featval p; - if (ctxlen_) { - WordID ctx[ngram_order_]; - state_copy(ctx,old_st); - ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies. - p=pimpl_->WordProb(w,ctx); -// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.). - WordID *nst=(WordID *)new_st; - nst[0]=w; // new most recent word - to_state(nst+1,ctx,ctxlen_-1); // rotate old words right -#if LM_FSA_SHORTEN_CONTEXT - pimpl_->ShortenContext(nst,ctxlen_); -#endif - } else { - p=pimpl_->WordProb(w,&empty_context); - } - add_feat(features,(p0;) { --i; @@ -660,7 +620,7 @@ LanguageModel::~LanguageModel() { } string LanguageModel::DebugStateToString(const void* state) const{ - return pimpl_->DebugStateToString(state); + return imp().DebugStateToString(state); } void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */, @@ -669,13 +629,13 @@ void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */, SparseVector* features, SparseVector* estimated_features, void* state) const { - features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state)); - estimated_features->set_value(fid_, pimpl_->EstimateProb(state)); + features->set_value(fid_, imp().LookupWords(*edge.rule_, ant_states, state)); + estimated_features->set_value(fid_, imp().EstimateProb(state)); } void LanguageModel::FinalTraversalFeatures(const void* ant_state, SparseVector* features) const { - features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); + features->set_value(fid_, imp().FinalTraversalCost(ant_state)); } #ifdef HAVE_RANDLM @@ -763,13 +723,13 @@ void LanguageModelRandLM::TraversalFeaturesImpl(const SentenceMetadata& smeta, SparseVector* estimated_features, void* state) const { (void) smeta; - features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state)); - estimated_features->set_value(fid_, pimpl_->EstimateProb(state)); + features->set_value(fid_, imp().LookupWords(*edge.rule_, ant_states, state)); + estimated_features->set_value(fid_, imp().EstimateProb(state)); } void LanguageModelRandLM::FinalTraversalFeatures(const void* ant_state, SparseVector* features) const { - features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); + features->set_value(fid_, imp().FinalTraversalCost(ant_state)); } #endif diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h index 935e283c..e682481d 100644 --- a/decoder/ff_lm.h +++ b/decoder/ff_lm.h @@ -8,7 +8,38 @@ #include "ff.h" #include "config.h" -class LanguageModelImpl; +class LanguageModelInterface { + public: + double floor_; + LanguageModelInterface() : floor_(-100) { } + virtual ~LanguageModelInterface() { } + + // not clamped to floor. log10prob + virtual double WordProb(WordID word, WordID const* context) = 0; + inline double WordProbFloored(WordID word, WordID const* context) { + return clamp(WordProb(word,context)); + } + // may be shorter than actual null-terminated length. context must be null terminated. len is just to save effort for subclasses that don't support contextID + virtual int ContextSize(WordID const* context,int len) = 0; + // use this as additional logprob when shortening the context as above + virtual double ContextBOW(WordID const* context,int shortened_len) = 0; // unlikely that you'll ever need to floor a backoff cost. i'd say impossible. + + inline double ShortenContext(WordID * context,int len) { + int slen=ContextSize(context,len); + double p=ContextBOW(context,slen); + while (len>slen) { + --len; + context[len]=TD::none; + } + return p; + } + /// should be worse prob = more negative. that's what SRI wordProb returns: log10(prob) + inline double clamp(double logp) const { + return logp < floor_ ? floor_ : logp; + } +}; + +struct LanguageModelImpl; class LanguageModel : public FeatureFunction { public: @@ -29,7 +60,9 @@ class LanguageModel : public FeatureFunction { void* out_context) const; private: int fid_; // conceptually const; mutable only to simplify constructor - mutable LanguageModelImpl* pimpl_; + //LanguageModelImpl &imp() { return *(LanguageModelImpl*)pimpl_; } + LanguageModelImpl & imp() const { return *(LanguageModelImpl*)pimpl_; } + /* mutable */ LanguageModelInterface* pimpl_; }; #ifdef HAVE_RANDLM diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index 55a8b497..3547d75b 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h @@ -1,27 +1,112 @@ #ifndef FF_LM_FSA_H #define FF_LM_FSA_H -//TODO: use SRI LM::contextID to shorten state -//TODO: expose ScanPhrase interface to achieve > ngram probs (e.g. unigram) with higher order lm - but that wouldn't apply to L->R maximal hook/sharing decoding +//FIXME: 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 + + +#define FSA_LM_DEBUG 0 +#if FSA_LM_DEBUG +# define FSALMDBG(e,x) FSADBGif(debug(),e,x) +# define FSALMDBGnl(e) FSADBGif_nl(debug(),e) +#else +# define FSALMDBG(e,x) +# define FSALMDBGnl(e) +#endif #include "ff_lm.h" #include "ff_from_fsa.h" +namespace { +WordID empty_context=TD::none; +} + struct LanguageModelFsa : public FsaFeatureFunctionBase { + typedef WordID * W; + typedef WordID const* WP; + // overrides; implementations in ff_lm.cc + typedef SingleFeatureAccumulator Accum; static std::string usage(bool,bool); LanguageModelFsa(std::string const& param); int markov_order() const { return ctxlen_; } - void Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const; - void print_state(std::ostream &,void *) const; + void print_state(std::ostream &,void const *) const; + inline Featval floored(Featval p) const { + return pleft;--e) + if (e[-1]!=TD::none) break; + //post: [left,e] are the seen left words + return e; + } + template + void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const { + if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all. + /* // this is forcing unigram prob always. we will instead build the phrase + if (!ctxlen_) { + Featval p=0; + for (;iWordProb(*i,e&mpty_context)); + Add(p,a); + return; + } */ + int nw=end-begin; + WP st=(WP)old_st; + WP st_end=st+ctxlen_; // may include some null already (or none if full) + int nboth=nw+ctxlen_; + WordID ctx[nboth+1]; + ctx[nboth]=TD::none; + // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0] + wordcpy_reverse(ctx,begin,end); + W ctx_score_end=ctx+nw; + wordcpy(ctx_score_end,st,st_end); // st already reversed +// FSALMDBG(edge," Scan("<ctx;--ctx_score_end) + p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end)); +#if LM_FSA_SHORTEN_CONTEXT + p+=pimpl_->ShortenContext(ctx,nboth + void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const { + if (!ctxlen_) { + Add(floored(pimpl_->WordProb(w,&empty_context)),a); + return; + } + //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name) + if (ctxlen_) { + WordID ctx[ngram_order_]; + state_copy(ctx,old_st); + ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies. + Featval p=pimpl_->WordProb(w,ctx); +// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.). + WordID *nst=(WordID *)new_st; + nst[0]=w; // new most recent word + to_state(nst+1,ctx,ctxlen_-1); // rotate old words right +#if LM_FSA_SHORTEN_CONTEXT + p+=pimpl_->ShortenContext(nst,ctxlen_); +#endif + Add(floored(p),a); + } + } // impl details: void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor double floor_; // log10prob minimum used (e.g. unk words) + private: int ngram_order_; int ctxlen_; // 1 less than above - LanguageModelImpl *pimpl_; + LanguageModelInterface *pimpl_; +public: + SCAN_PHRASE_ACCUM_OVERRIDE + }; typedef FeatureFunctionFromFsa LanguageModelFromFsa; diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h index 6e6ad30e..fb44dade 100755 --- a/decoder/ff_sample_fsa.h +++ b/decoder/ff_sample_fsa.h @@ -1,6 +1,8 @@ #ifndef FF_SAMPLE_FSA_H #define FF_SAMPLE_FSA_H +//TODO: fsa interface changed to include ScanPhrase* Scan*Accum. update these so they compile+work + #include "ff_from_fsa.h" // example: feature val = 1 * # of target words diff --git a/decoder/sentences.h b/decoder/sentences.h index 452fb21e..4c71f8f1 100755 --- a/decoder/sentences.h +++ b/decoder/sentences.h @@ -29,7 +29,10 @@ inline void wordcpy(WordID *dest,WordID const* src,int n) { inline void wordcpy(WordID *dest,WordID const* src,WordID const* src_end) { wordcpy(dest,src,src_end-src); } - +inline void wordcpy_reverse(WordID *dest,WordID const* src,WordID const* src_end) { + for(WordID const* i=src_end;i>src;) + *dest++=*--i; +} inline Sentence singleton_sentence(WordID t) { return Sentence(1,t); } diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index 8fc0f520..207489c5 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -60,6 +60,11 @@ class SparseVector { SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2); } public: + T const& get_singleton() const { + assert(values_.size()==1); + return values_.begin()->second; + } + typedef SparseVector Self; typedef SPARSE_VECTOR_MAP MapType; typedef typename MapType::const_iterator const_iterator; @@ -113,7 +118,7 @@ public: return values_[index]; } - void set_value(int index, const T &value) { + inline void set_value(int index, const T &value) { values_[index] = value; } @@ -236,6 +241,15 @@ public: values_.erase(found);*/ } + template + void set_from(SparseVector const& other) { + for (typename MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { + values_[it->first]=it->second; + } + } + SparseVector &operator+=(const SparseVector &other) { for (typename MapType::const_iterator it = other.values_.begin(); it != other.values_.end(); ++it) diff --git a/decoder/tdict.cc b/decoder/tdict.cc index 6794bc79..7b56d259 100644 --- a/decoder/tdict.cc +++ b/decoder/tdict.cc @@ -85,6 +85,17 @@ std::string TD::GetString(const std::vector& str) { return o.str(); } +std::string TD::GetString(WordID const* i,WordID const* e) { + ostringstream o; + bool sp=false; + for (;i* ids); static void GetWordIDs(const std::vector& strings, std::vector* ids); static std::string GetString(const std::vector& str); + static std::string GetString(WordID const* i,WordID const* e); static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); static unsigned int NumWords(); static WordID Convert(const std::string& s); diff --git a/decoder/value_array.h b/decoder/value_array.h index 6fa221ee..1a0d5f95 100755 --- a/decoder/value_array.h +++ b/decoder/value_array.h @@ -15,11 +15,33 @@ # include #endif +//TODO: use awesome type traits (and/or policy typelist argument) to provide these only when possible? +#define VALUE_ARRAY_ADD 1 +#define VALUE_ARRAY_MUL 1 +#define VALUE_ARRAY_BITWISE 0 +#define VALUE_ARRAY_OSTREAM 1 + +#if VALUE_ARRAY_OSTREAM +# include +#endif + // valarray like in that size is fixed (so saves space compared to vector), but same interface as vector (less resize/push_back/insert, of course) template > class ValueArray : A // private inheritance so stateless allocator adds no size. { + typedef ValueArray Self; public: +#if VALUE_ARRAY_OSTREAM + friend inline std::ostream & operator << (std::ostream &o,Self const& s) { + o<<'['; + for (unsigned i=0,e=s.size();i1?sizeof(T)/sizeof(T*):1; //space optimization: SV_MAX T will fit inside what would otherwise be a pointer to heap data. todo in the far future if bored. typedef T value_type; @@ -82,6 +104,37 @@ public: clear(); } +#undef VALUE_ARRAY_OPEQ +#define VALUE_ARRAY_OPEQ(op) template Self & operator op (ValueArray const& o) { assert(sz==o.sz); for (int i=0,e=sz;i<=e;++i) array[i] op o.array[i]; return *this; } +#if VALUE_ARRAY_ADD + VALUE_ARRAY_OPEQ(+=) + VALUE_ARRAY_OPEQ(-=) +#endif +#if VALUE_ARRAY_MUL + VALUE_ARRAY_OPEQ(*=) + VALUE_ARRAY_OPEQ(/=) +#endif +#if VALUE_ARRAY_BITWISE + VALUE_ARRAY_OPEQ(|=) + VALUE_ARRAY_OPEQ(*=) +#endif +#undef VALUE_ARRAY_OPEQ +#undef VALUE_ARRAY_BINOP +#define VALUE_ARRAY_BINOP(op,opeq) template friend inline Self operator op (Self x,ValueArray const& y) { x opeq y; return x; } +#if VALUE_ARRAY_ADD + VALUE_ARRAY_BINOP(+,+=) + VALUE_ARRAY_BINOP(-,-=) +#endif +#if VALUE_ARRAY_MUL + VALUE_ARRAY_BINOP(*,*=) + VALUE_ARRAY_BINOP(/,/=) +#endif +#if VALUE_ARRAY_BITWISE + VALUE_ARRAY_BINOP(|,|=) + VALUE_ARRAY_BINOP(*,*=) +#endif + +#undef VALUE_ARRAY_BINOP void clear() { for (size_type i = sz; i != 0; --i) { -- cgit v1.2.3