diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-27 05:12:40 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-27 05:12:40 +0000 |
commit | 4c5df460c9da5c935438850ef7993463a9113286 (patch) | |
tree | dc9c94ae0f87b7ce4337c2a30b9c48f363e97d48 | |
parent | 684db46e977bca456e02e677d22ba5e4a33ae6ce (diff) |
disable fsa lm phrase due to slight (.05%) diff in scores - solve it later
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@434 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | decoder/ff_lm_fsa.h | 53 |
1 files changed, 28 insertions, 25 deletions
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index c1d875eb..4b0682d1 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h @@ -1,8 +1,9 @@ #ifndef FF_LM_FSA_H #define FF_LM_FSA_H -//FIXME: 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that) +//FIXME: when FSA_LM_PHRASE 1, 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0 +#define FSA_LM_PHRASE 0 #define FSA_LM_DEBUG 0 #if FSA_LM_DEBUG @@ -41,6 +42,31 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> { } template <class Accum> + void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const { + if (!ctxlen_) { + Add(floored(pimpl_->WordProb(w,&empty_context)),a); + return; + } + //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name) + if (ctxlen_) { + WordID ctx[ngram_order_]; + state_copy(ctx,old_st); + ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies. + Featval p=floored(pimpl_->WordProb(w,ctx)); +// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.). + WordID *nst=(WordID *)new_st; + nst[0]=w; // new most recent word + to_state(nst+1,ctx,ctxlen_-1); // rotate old words right +#if LM_FSA_SHORTEN_CONTEXT + p+=pimpl_->ShortenContext(nst,ctxlen_); +#endif + Add(p,a); + } + } + +#if FSA_LM_PHRASE + //FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?) + template <class Accum> void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const { if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all. /* // this is forcing unigram prob always. we will instead build the phrase @@ -75,29 +101,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> { Add(p,a); } - template <class Accum> - void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const { - if (!ctxlen_) { - Add(floored(pimpl_->WordProb(w,&empty_context)),a); - return; - } - //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name) - if (ctxlen_) { - WordID ctx[ngram_order_]; - state_copy(ctx,old_st); - ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies. - Featval p=floored(pimpl_->WordProb(w,ctx)); -// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.). - WordID *nst=(WordID *)new_st; - nst[0]=w; // new most recent word - to_state(nst+1,ctx,ctxlen_-1); // rotate old words right -#if LM_FSA_SHORTEN_CONTEXT - p+=pimpl_->ShortenContext(nst,ctxlen_); + SCAN_PHRASE_ACCUM_OVERRIDE #endif - Add(p,a); - } - } - // impl details: void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor double floor_; // log10prob minimum used (e.g. unk words) @@ -106,8 +111,6 @@ private: int ngram_order_; int ctxlen_; // 1 less than above LanguageModelInterface *pimpl_; -public: - SCAN_PHRASE_ACCUM_OVERRIDE }; |