From cc7ee5e3f6ac08750b80e468b77e8ad732e48b77 Mon Sep 17 00:00:00 2001 From: graehl Date: Tue, 27 Jul 2010 04:59:37 +0000 Subject: fsa feature templated Accum interface, phrase interface allows exceeding markov order e.g. unigram state, 3gram lm. use Accum,set_value rather than clear,add_value. warning: 3gram fsa lm disagrees with bottom-up in 4th decimal place git-svn-id: https://ws10smt.googlecode.com/svn/trunk@431 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/ff_lm_fsa.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 5 deletions(-) (limited to 'decoder/ff_lm_fsa.h') diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index 55a8b497..3547d75b 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h @@ -1,27 +1,112 @@ #ifndef FF_LM_FSA_H #define FF_LM_FSA_H -//TODO: use SRI LM::contextID to shorten state -//TODO: expose ScanPhrase interface to achieve > ngram probs (e.g. unigram) with higher order lm - but that wouldn't apply to L->R maximal hook/sharing decoding +//FIXME: 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 + + +#define FSA_LM_DEBUG 0 +#if FSA_LM_DEBUG +# define FSALMDBG(e,x) FSADBGif(debug(),e,x) +# define FSALMDBGnl(e) FSADBGif_nl(debug(),e) +#else +# define FSALMDBG(e,x) +# define FSALMDBGnl(e) +#endif #include "ff_lm.h" #include "ff_from_fsa.h" +namespace { +WordID empty_context=TD::none; +} + struct LanguageModelFsa : public FsaFeatureFunctionBase { + typedef WordID * W; + typedef WordID const* WP; + // overrides; implementations in ff_lm.cc + typedef SingleFeatureAccumulator Accum; static std::string usage(bool,bool); LanguageModelFsa(std::string const& param); int markov_order() const { return ctxlen_; } - void Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const; - void print_state(std::ostream &,void *) const; + void print_state(std::ostream &,void const *) const; + inline Featval floored(Featval p) const { + return pleft;--e) + if (e[-1]!=TD::none) break; + //post: [left,e] are the seen left words + return e; + } + template + void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const { + if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all. + /* // this is forcing unigram prob always. we will instead build the phrase + if (!ctxlen_) { + Featval p=0; + for (;iWordProb(*i,e&mpty_context)); + Add(p,a); + return; + } */ + int nw=end-begin; + WP st=(WP)old_st; + WP st_end=st+ctxlen_; // may include some null already (or none if full) + int nboth=nw+ctxlen_; + WordID ctx[nboth+1]; + ctx[nboth]=TD::none; + // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0] + wordcpy_reverse(ctx,begin,end); + W ctx_score_end=ctx+nw; + wordcpy(ctx_score_end,st,st_end); // st already reversed +// FSALMDBG(edge," Scan("<ctx;--ctx_score_end) + p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end)); +#if LM_FSA_SHORTEN_CONTEXT + p+=pimpl_->ShortenContext(ctx,nboth + void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const { + if (!ctxlen_) { + Add(floored(pimpl_->WordProb(w,&empty_context)),a); + return; + } + //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name) + if (ctxlen_) { + WordID ctx[ngram_order_]; + state_copy(ctx,old_st); + ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies. + Featval p=pimpl_->WordProb(w,ctx); +// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.). + WordID *nst=(WordID *)new_st; + nst[0]=w; // new most recent word + to_state(nst+1,ctx,ctxlen_-1); // rotate old words right +#if LM_FSA_SHORTEN_CONTEXT + p+=pimpl_->ShortenContext(nst,ctxlen_); +#endif + Add(floored(p),a); + } + } // impl details: void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor double floor_; // log10prob minimum used (e.g. unk words) + private: int ngram_order_; int ctxlen_; // 1 less than above - LanguageModelImpl *pimpl_; + LanguageModelInterface *pimpl_; +public: + SCAN_PHRASE_ACCUM_OVERRIDE + }; typedef FeatureFunctionFromFsa LanguageModelFromFsa; -- cgit v1.2.3