add debugging prints - still no idea what causes the ~.05% score difference when scoring a 3gram LM using phrases

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@446 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-28 07:10:09 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-28 07:10:09 +0000
commit: 64f43ca5010758b58326d727e359b5908de4fcb0 (patch)
tree: 3bbc3c68939bc964add941786bb623fae0eb3e72 /decoder/ff_lm_fsa.h
parent: 069ef91fd567783961891e5b94f0778a82193218 (diff)
1 files changed, 14 insertions, 7 deletions
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 108698ec..9ba7b2c5 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,9 +1,10 @@
 #ifndef FF_LM_FSA_H
 #define FF_LM_FSA_H
 
-//FIXME: when FSA_LM_PHRASE 1, 3gram has differences in 4th decimal digit, compared to regular ff_lm.  this is USUALLY a bug (there's way more actual precision in there).  this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that).  also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
+//FIXME: with FSA_LM_PHRASE 1, the 3gram fsa scores differ from regular ff_lm in about the 4th decimal digit (about .05%), especially with unk words.  a difference that large USUALLY indicates a bug (there's far more actual precision available than that).  this happens with #define LM_FSA_SHORTEN_CONTEXT set to either 1 or 0 (so it's not that).  also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
 
-#define FSA_LM_PHRASE 0
+// enabling for now - TODO: retest unigram+ more, and solve the score-difference puzzle described in the FIXME above
+#define FSA_LM_PHRASE 1
 
 #define FSA_LM_DEBUG 0
 #if FSA_LM_DEBUG
@@ -42,7 +43,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
   }
 
   template <class Accum>
-  void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
+  void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const {
+    Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
     if (!ctxlen_) {
       Add(floored(pimpl_->WordProb(w,&empty_context)),a);
       return;
@@ -53,6 +55,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
       state_copy(ctx,old_st);
       ctx[ctxlen_]=TD::none; // make this part of state?  wastes space but saves copies.
       Featval p=floored(pimpl_->WordProb(w,ctx));
+      FSALMDBG(de,"p("<<TD::Convert(w)<<"|"<<TD::Convert(ctx,ctx+ctxlen_)<<")="<<p);
+      FSALMDBGnl(de);
 // states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
       WordID *nst=(WordID *)new_st;
       nst[0]=w; // new most recent word
@@ -68,6 +72,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
   //FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?)
   template <class Accum>
   void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
+    Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
     if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
     /* // this is forcing unigram prob always.  we will instead build the phrase
     if (!ctxlen_) {
@@ -85,27 +90,29 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
     ctx[nboth]=TD::none;
     // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0]
     W ctx_score_end=wordcpy_reverse(ctx,begin,end);
-    assert(ctx_score_end==ctx+nw);
     wordcpy(ctx_score_end,st,st_end); // st already reversed.
+    assert(ctx_score_end==ctx+nw);
     // we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones.  most contexts are full except for the shortest source spans.
-//    FSALMDBG(edge," Scan("<<TD::GetString(ctx,ctx+nboth)<<')');
+    FSALMDBG(de," scan.r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
+    FSAFFDBG(de," r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
     Featval p=0;
     FSALMDBGnl(edge);
     for(;ctx_score_end>ctx;--ctx_score_end)
       p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end));
     //TODO: look for score discrepancy -
-    // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same 4th digit disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that.  perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
+    // i had some idea that maybe ShortenContext would return a different prob if the length provided was > ctxlen_; however, since the same disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that.  perhaps look at SCAN_PHRASE_ACCUM_OVERRIDE - make sure the overrides it generates do the right thing.
 #if LM_FSA_SHORTEN_CONTEXT
     p+=pimpl_->ShortenContext(ctx,nboth<ctxlen_?nboth:ctxlen_);
 #endif
     state_copy(new_st,ctx);
-    FSALMDBG(edge," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
+    FSALMDBG(de," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
     FSALMDBGnl(edge);
     Add(p,a);
   }
 
   SCAN_PHRASE_ACCUM_OVERRIDE
 #endif
+
   // impl details:
   void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows.  otherwise, it's the same as a "-o i" argument to constructor
   double floor_; // log10prob minimum used (e.g. unk words)
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-28 07:10:09 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-28 07:10:09 +0000
commit	64f43ca5010758b58326d727e359b5908de4fcb0 (patch)
tree	3bbc3c68939bc964add941786bb623fae0eb3e72 /decoder/ff_lm_fsa.h
parent	069ef91fd567783961891e5b94f0778a82193218 (diff)