summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-27 05:12:40 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-27 05:12:40 +0000
commit: 4c5df460c9da5c935438850ef7993463a9113286 (patch)
tree: dc9c94ae0f87b7ce4337c2a30b9c48f363e97d48
parent: 684db46e977bca456e02e677d22ba5e4a33ae6ce (diff)
disable fsa lm phrase due to slight (.05%) diff in scores - solve it later
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@434 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x decoder/ff_lm_fsa.h 53
1 files changed, 28 insertions, 25 deletions
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index c1d875eb..4b0682d1 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,8 +1,9 @@
#ifndef FF_LM_FSA_H
#define FF_LM_FSA_H
-//FIXME: 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that)
+//FIXME: when FSA_LM_PHRASE is 1, 3-gram scores differ from regular ff_lm in the 4th decimal digit. a difference that large is USUALLY a bug (there's far more actual precision available than that). this was observed with #define LM_FSA_SHORTEN_CONTEXT set to both 1 and 0 (so context shortening is not the cause). also, LM_FSA_SHORTEN_CONTEXT gives identical scores when FSA_LM_PHRASE is 0
+#define FSA_LM_PHRASE 0
#define FSA_LM_DEBUG 0
#if FSA_LM_DEBUG
@@ -41,6 +42,31 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
}
template <class Accum>
+ void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
+ if (!ctxlen_) {
+ Add(floored(pimpl_->WordProb(w,&empty_context)),a);
+ return;
+ }
+ //the variable-length array below is a C99 feature (not standard C++); if the compiler (e.g. msvc++) doesn't support it, #ifdef it out or use a stack-allocation call such as alloca()
+ if (ctxlen_) {
+ WordID ctx[ngram_order_];
+ state_copy(ctx,old_st);
+ ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies.
+ Featval p=floored(pimpl_->WordProb(w,ctx));
+// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
+ WordID *nst=(WordID *)new_st;
+ nst[0]=w; // new most recent word
+ to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+#if LM_FSA_SHORTEN_CONTEXT
+ p+=pimpl_->ShortenContext(nst,ctxlen_);
+#endif
+ Add(p,a);
+ }
+ }
+
+#if FSA_LM_PHRASE
+ //FIXME: either there is a bug somewhere in here, or the 3-gram LM we use really does give different scores for phrases (impossible? perhaps a nonzero backoff weight (BOW) when shortening the context past what the LM has?)
+ template <class Accum>
void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
/* // this is forcing unigram prob always. we will instead build the phrase
@@ -75,29 +101,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
Add(p,a);
}
- template <class Accum>
- void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
- if (!ctxlen_) {
- Add(floored(pimpl_->WordProb(w,&empty_context)),a);
- return;
- }
- //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name)
- if (ctxlen_) {
- WordID ctx[ngram_order_];
- state_copy(ctx,old_st);
- ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies.
- Featval p=floored(pimpl_->WordProb(w,ctx));
-// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
- WordID *nst=(WordID *)new_st;
- nst[0]=w; // new most recent word
- to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
-#if LM_FSA_SHORTEN_CONTEXT
- p+=pimpl_->ShortenContext(nst,ctxlen_);
+ SCAN_PHRASE_ACCUM_OVERRIDE
#endif
- Add(p,a);
- }
- }
-
// impl details:
void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor
double floor_; // log10prob minimum used (e.g. unk words)
@@ -106,8 +111,6 @@ private:
int ngram_order_;
int ctxlen_; // 1 less than above
LanguageModelInterface *pimpl_;
-public:
- SCAN_PHRASE_ACCUM_OVERRIDE
};