fsa lm phrase mystery remains, but bool fsa::simple_phrase_score indicates whether stateless features should copy phrases from rules (e.g. unigram lm)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@444 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-28 05:25:56 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-28 05:25:56 +0000
commit: b2cbd45b1f8b1d2194624d43ffee12bf53d56a4d (patch)
tree: fc8c1912616a9340686f87077b253b6b6f386ef9 /decoder
parent: fd09366d32dff751f8611bbb8d52627886b84150 (diff)
5 files changed, 99 insertions, 59 deletions
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
index 7fa6be67..c517ec64 100755
--- a/decoder/ff_from_fsa.h
+++ b/decoder/ff_from_fsa.h
@@ -40,7 +40,7 @@ public:
 
   Features features() const { return ff.features(); }
 
-  // Log because it
+  // Log because it potentially stores info in edge.  otherwise the same as regular TraversalFeatures.
   void TraversalFeaturesLog(const SentenceMetadata& smeta,
                              Hypergraph::Edge& edge,
                              const std::vector<const void*>& ant_contexts,
@@ -51,62 +51,74 @@ public:
     TRule const& rule=*edge.rule_;
     Sentence const& e = rule.e();
     typename Impl::Accum accum,h_accum;
-    if (!ssz) {
-      Sentence phrase;
-      phrase.reserve(e.size());
-      for (int j=0,je=e.size();;++j) { // items in target side of rule
-        if (je==j || e[j]<1) { // end or variable
-          if (phrase.size()) {
-            FSAFFDBG(edge," ["<<TD::GetString(phrase)<<']');
-            ff.ScanPhraseAccum(smeta,edge,begin(phrase),end(phrase),0,0,&accum);
+    if (!ssz) { // special case for no state - but still build up longer phrases to score in case FSA overrides ScanPhraseAccum
+      if (Impl::simple_phrase_score) {
+        // save the effort of building up the contiguous rule phrases
+        for (int j=0,je=e.size();j<je;++j) // items in target side of rule
+          if (e[j]>=1) // token
+            ff.ScanAccum(smeta,edge,(WordID)e[j],NULL,NULL,&accum);
+        FSAFFDBG(edge," "<<TD::Convert(e[j]));
+      } else {
+        Sentence phrase;
+        phrase.reserve(e.size());
+        for (int j=0,je=e.size();;++j) { // items in target side of rule
+          if (je==j || e[j]<1) { // end or variable
+            if (phrase.size()) {
+              FSAFFDBG(edge," ["<<TD::GetString(phrase)<<']');
+              ff.ScanPhraseAccum(smeta,edge,begin(phrase),end(phrase),0,0,&accum);
+            }
+            if (je==j)
+              break;
+            phrase.clear();
+          } else { // word
+            WordID ew=e[j];
+            phrase.push_back(ew);
           }
-          if (je==j)
-            break;
-          phrase.clear();
-        } else { // word
-          WordID ew=e[j];
-          phrase.push_back(ew);
         }
       }
       accum.Store(ff,features);
+      FSAFFDBG(egde,"="<<accum->describe(ff));
       FSAFFDBGnl(edge);
       return;
     }
+
+    // bear with me, because this is hard to understand.  reminder: ant_contexts and out_state are left-words first (up to M, TD::none padded).  if all M words are present, then FSA state follows.  otherwise 0 bytes to keep memcmp/hash happy.
+
 //why do we compute heuristic in so many places?  well, because that's how we know what state we should score words in once we're full on our left context (because of markov order bound, we know the score will be the same no matter what came before that left context)
-    SP h_start=ff.heuristic_start_state();
     // these left_* refer to our output (out_state):
     W left_begin=(W)out_state;
     W left_out=left_begin; // [left,fsa_state) = left ctx words.  if left words aren't full, then null wordid
     WP left_full=left_end_full(out_state);
-    FsaScanner<Impl> fsa(ff,smeta,edge); // this holds our current state and eventuallybecomes our right state if we saw enough words
+    FsaScanner<Impl> fsa(ff,smeta,edge);
+    /* fsa holds our current state once we've seen our first M rule or child left-context words.  that state scores up the rest of the words at the time, and is replaced by the right state of any full child.  at the end, if we've got at least M left words in all, it becomes our right state (otherwise, we don't bother storing the partial state, which might seem useful any time we're built on by a rule that has our variable in the initial position - but without also storing the heuristic for that case, we just end up rescanning from scratch anyway to produce the heuristic.  so we just store all 0 bytes if we have less than M left words at the end. */
     for (int j = 0; j < e.size(); ++j) { // items in target side of rule
       if (e[j] < 1) { // variable
-        SP a = ant_contexts[-e[j]]; // variables a* are referring to this child derivation state.
-        FSAFFDBG(edge,' '<<describe_state(a));
-        WP al=(WP)a;
-        WP ale=left_end(a);
-        // scan(al,le) these - the same as below else.  macro for now; pull into closure object later?
+        // variables a* are referring to this child derivation state.
+        SP a = ant_contexts[-e[j]];
+        WP al=(WP)a,ale=left_end(a); // the child left words
         int anw=ale-al;
+        FSAFFDBG(edge,' '<<describe_state(a));
 // anw left words in child.  full if == M.  we will use them to fill our left words, and then score the rest fully, knowing what state we're in based on h_state -> our left words -> any number of interior words which are scored then hidden
-        if (left_out+anw<left_full) { // nothing to score after adding
+        if (left_out+anw<left_full) { // still short of M after adding - nothing to score (not even our heuristic)
           wordcpy(left_out,al,anw);
           left_out+=anw;
-        } else if (left_out<left_full) { // something to score AND newly full left context to fill
+        } else if (left_out<left_full) { // we had less than M before, and will have a tleast M after adding.  so score heuristic and the rest M+1,... score inside.
           int ntofill=left_full-left_out;
           assert(ntofill==M-(left_out-left_begin));
           wordcpy(left_out,al,ntofill);
           left_out=(W)left_full;
           // heuristic known now
-          fsa.reset(h_start);
+          fsa.reset(ff.heuristic_start_state());
           fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens once only)
           fsa.scan(al+ntofill,ale,&accum); // because of markov order, fully filled left words scored starting at h_start put us in the right state to score the extra words (which are forgotten)
           al+=ntofill; // we used up the first ntofill words of al to end up in some known state via exactly M words total (M-ntofill were there beforehand).  now we can scan the remaining al words of this child
         } else { // more to score / state to update (left already full)
           fsa.scan(al,ale,&accum);
         }
-        if (anw==M) // child had full state already
+        if (anw==M)
           fsa.reset(fsa_state(a));
-        assert(anw<=M);
+        // if child had full state already, we must assume there was a gap and use its right state (note: if the child derivation was exactly M words, then we still use its state even though it will be equal to our current; there's no way to distinguish between such an M word item and an e.g. 2*M+k word item, although it's extremely unlikely that you'd have a >M word item that happens to have the same left and right boundary words).
+        assert(anw<=M); // of course, we never store more than M left words in an item.
       } else { // single word
         WordID ew=e[j];
         FSAFFDBG(edge,' '<<TD::Convert(ew));
@@ -114,7 +126,7 @@ public:
         if (left_out<left_full) {
           *left_out++=ew;
           if (left_out==left_full) { // handle heuristic, once only, establish state
-            fsa.reset(h_start);
+            fsa.reset(ff.heuristic_start_state());
             fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens only once)
           }
         } else
@@ -124,8 +136,8 @@ public:
 
     void *out_fsa_state=fsa_state(out_state);
     if (left_out<left_full) { // finally: partial heuristic for unfilled items
-//      fsa.reset(h_start);      fsa.scan(left_begin,left_out,&h_accum);
-      ff.ScanPhraseAccumOnly(smeta,edge,left_begin,left_out,h_start,&h_accum);
+//      fsa.reset(ff.heuristic_start_state());      fsa.scan(left_begin,left_out,&h_accum);
+      ff.ScanPhraseAccumOnly(smeta,edge,left_begin,left_out,ff.heuristic_start_state(),&h_accum);
       do { *left_out++=TD::none; } while(left_out<left_full); // none-terminate so left_end(out_state) will know how many words
       ff.state_zero(out_fsa_state); // so we compare / hash correctly. don't know state yet because left context isn't full
     } else // or else store final right-state.  heuristic was already assigned
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index a5563511..8d2b0488 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -8,7 +8,7 @@
 
   state is some fixed width byte array.  could actually be a void *, WordID sequence, whatever.
 
-  TODO: there are a confusing array of default-implemented supposedly slightly more efficient overrides enabled; however, the two key differences are: do you score a phrase, or just word at a time (the latter constraining you to obey markov_order() everywhere.
+  TODO: there are a confusing array of default-implemented supposedly slightly more efficient overrides enabled; however, the two key differences are: do you score a phrase, or just word at a time (the latter constraining you to obey markov_order() everywhere.  you have to implement the word case no matter what.
 
   TODO: considerable simplification of implementation if Scan implementors are required to update state in place (using temporary copy if they need it), or e.g. using memmove (copy from end to beginning) to rotate state right.
 
@@ -20,9 +20,6 @@
   TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters.  this is on top of the nice heuristic for the unscored words, of course.  in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict.  probably not worth the effort.
 */
 
-#define FSA_SCORE_PHRASE 1
-// if true, special effort is made to give entire phrases to fsa models so they can understate their true markov_order but sometimes have more specific probs.
-
 #define FSA_DEBUG 0
 
 #if USE_INFO_EDGE
@@ -187,7 +184,7 @@ protected:
     assert(0);
     return 0;
   }
-  Featval Scan1Meta(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,
+  Featval Scan1Meta(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,
                     WordID w,void const* state,void *next_state) const {
     return d().Scan1(w,state,next_state);
   }
@@ -199,19 +196,19 @@ public:
 
   // must override this or Scan1Meta or Scan1
   template <class Accum>
-  inline void ScanAccum(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,
+  inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
                         WordID w,void const* state,void *next_state,Accum *a) const {
     Add(d().Scan1Meta(smeta,edge,w,state,next_state),a);
   }
 
   // bounce back and forth between two state vars starting at cs, returning end state location.  if we required src=dest addr safe state updating, this concept wouldn't need to exist.
-  // recommend you override this if you score phrases differently than word-by-word.
+  // required that you override this if you score phrases differently than word-by-word, however, you can just use the SCAN_PHRASE_ACCUM_OVERRIDE macro to do that in terms of ScanPhraseAccum
   template <class Accum>
-  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const {
+  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const {
     // extra code - IT'S FOR EFFICIENCY, MAN!  IT'S OK!  definitely no bugs here.
     if (!ssz) {
       for (;i<end;++i)
-        ScanAccum(smeta,edge,*i,0,0,accum);
+        d().ScanAccum(smeta,edge,*i,0,0,accum);
       return 0;
     }
     void *os,*es;
@@ -233,11 +230,14 @@ public:
   }
 
 
+  static const bool simple_phrase_score=true; // if d().simple_phrase_score_, then you should expect different Phrase scores for phrase length > M.  so, set this false if you provide ScanPhraseAccum (SCAN_PHRASE_ACCUM_OVERRIDE macro does this)
+
+  // override this (and use SCAN_PHRASE_ACCUM_OVERRIDE  ) if you want e.g. maximum possible order ngram scores with markov_order < n-1.  in the future SparseFeatureAccumulator will probably be the only option for type-erased FSA ffs.
   // note you'll still have to override ScanAccum
-  // override this (and SCAN_PHRASE_ACCUM_OVERRIDE  ) if you want e.g. maximum possible order ngram scores with markov_order < n-1.  in the future SparseFeatureAccumulator will probably be the only option for type-erased FSA ffs.  you will be adding to accum, not setting
   template <class Accum>
-  inline void ScanPhraseAccum(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,
-                              WordID const* i, WordID const* end,void const* state,void *next_state,Accum *accum) const {
+  void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
+                              WordID const* i, WordID const* end,
+                              void const* state,void *next_state,Accum *accum) const {
     if (!ssz) {
       for (;i<end;++i)
         d().ScanAccum(smeta,edge,*i,0,0,accum);
@@ -247,6 +247,7 @@ public:
     void *tst=tstate;
     bool odd=(end-i)&1;
     void *cs,*ns;
+    // we're going to use Bounce (word by word alternating of states) such that the final place is next_state
     if (odd) {
       cs=tst;
       ns=next_state;
@@ -259,24 +260,30 @@ public:
     assert(est==next_state);
   }
 
+
+
   // could replace this with a CRTP subclass providing these impls.
+  // the d() subclass dispatch is not needed because these will be defined in the subclass
 #define SCAN_PHRASE_ACCUM_OVERRIDE \
+  static const bool simple_phrase_score=false; \
   template <class Accum> \
-  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { \
-    ScanPhraseAccum(smeta,edge,i,end,cs,ns,accum); \
+  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { \
+    ScanPhraseAccum(smeta,edge,i,end,cs,ns,accum);  \
     return ns; \
   } \
   template <class Accum> \
-  inline void ScanPhraseAccumOnly(SentenceMetadata const& smeta,const Hypergraph::Edge& edge, \
-                              WordID const* i, WordID const* end,void const* state,Accum *accum) const { \
-    char s2[ssz]; ScanPhraseAccum(smeta,edge,i,end,state,(void*)s2,accum);                \
+  void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, \
+                              WordID const* i, WordID const* end, \
+                              void const* state,Accum *accum) const { \
+    char s2[ssz]; ScanPhraseAccum(smeta,edge,i,end,state,(void*)s2,accum); \
   }
 
   // override this or bounce along with above.  note: you can just call ScanPhraseAccum
   // doesn't set state (for heuristic in ff_from_fsa)
   template <class Accum>
-  inline void ScanPhraseAccumOnly(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,
-                              WordID const* i, WordID const* end,void const* state,Accum *accum) const {
+  void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
+                           WordID const* i, WordID const* end,
+                           void const* state,Accum *accum) const {
     char s1[ssz];
     char s2[ssz];
     state_copy(s1,state);
@@ -354,20 +361,20 @@ public:
   }
 
   // or this
-  Featval ScanT1(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,St const& from ,St & to) const {
+  Featval ScanT1(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,WordID w,St const& from ,St & to) const {
     return d().ScanT1S(w,from,to);
   }
 
   // or this (most general)
   template <class Accum>
-  inline void ScanT(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,St const& prev_st,St &new_st,Accum *a) const {
+  inline void ScanT(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,St const& prev_st,St &new_st,Accum *a) const {
     Add(d().ScanT1(smeta,edge,w,prev_st,new_st),a);
   }
 
   // note: you're on your own when it comes to Phrase overrides.  see FsaFeatureFunctionBase.  sorry.
 
   template <class Accum>
-  inline void ScanAccum(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,void const* st,void *next_state,Accum *a) const {
+  inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,void const* st,void *next_state,Accum *a) const {
     Impl const& im=d();
     FSADBG(edge,"Scan "<<FD::Convert(im.fid_)<<" = "<<a->describe(im)<<" "<<im.state(st)<<"->"<<TD::Convert(w)<<" ");
     im.ScanT(smeta,edge,w,state(st),state(next_state),a);
@@ -393,8 +400,8 @@ struct FsaScanner {
   inline void *nexts() const {
     return (cs==st0)?st1:st0;
   }
-  const Hypergraph::Edge& edge;
-  FsaScanner(FF const& ff,SentenceMetadata const& smeta,const Hypergraph::Edge& edge) : ff(ff),smeta(smeta),edge(edge)
+  Hypergraph::Edge const& edge;
+  FsaScanner(FF const& ff,SentenceMetadata const& smeta,Hypergraph::Edge const& edge) : ff(ff),smeta(smeta),edge(edge)
   {
     ssz=ff.state_bytes();
     int stride=((ssz+ALIGN-1)/ALIGN)*ALIGN; // round up to multiple of ALIGN
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 75778756..e8d3bbb0 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -4,6 +4,23 @@
 				   /* backoff weight for truncating context */
 // does that need to be used?  i think so.
 
+
+// if defined, and EDGE_INFO is defined, then --show_derivation will give a nice summary of all the components of the LM score, if you pass debug as the first param to the LM
+// (TODO) - because components are hidden within lm impl class, would have to pass edge in
+#define LM_DEBUG 0
+
+#define LM_DEBUG_DEBUG 0
+# define LMDBGif(i,e,x) do { if (i) { if (LM_DEBUG_CERR){std::cerr<<x;}  INFO_EDGE(e,x); if (LM_DEBUG_DEBUG){std::cerr<<"LMDBGif edge.info "<<&e<<" = "<<e.info()<<std::endl;}} } while(0)
+# define LMDBGif_nl(i,e) do { if (i) { if (LM_DEBUG_CERR) std::cerr<<std::endl; INFO_EDGE(e,"; "); } } while(0)
+#if LM_DEBUG
+# include <iostream>
+# define LMDBG(e,x) LMDBGif(debug(),e,x)
+# define LMDBGnl(e) LMDBGif_nl(debug(),e,x)
+#else
+# define LMDBG(e,x)
+# define LMDBGnl(e)
+#endif
+
 namespace {
 char const* usage_name="LanguageModel";
 char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 4b0682d1..108698ec 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -42,7 +42,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
   }
 
   template <class Accum>
-  void ScanAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
+  void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
     if (!ctxlen_) {
       Add(floored(pimpl_->WordProb(w,&empty_context)),a);
       return;
@@ -84,14 +84,17 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
     WordID ctx[nboth+1];
     ctx[nboth]=TD::none;
     // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0]
-    wordcpy_reverse(ctx,begin,end);
-    W ctx_score_end=ctx+nw;
-    wordcpy(ctx_score_end,st,st_end); // st already reversed
+    W ctx_score_end=wordcpy_reverse(ctx,begin,end);
+    assert(ctx_score_end==ctx+nw);
+    wordcpy(ctx_score_end,st,st_end); // st already reversed.
+    // we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones.  most contexts are full except for the shortest source spans.
 //    FSALMDBG(edge," Scan("<<TD::GetString(ctx,ctx+nboth)<<')');
     Featval p=0;
     FSALMDBGnl(edge);
     for(;ctx_score_end>ctx;--ctx_score_end)
       p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end));
+    //TODO: look for score discrepancy -
+    // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same 4th digit disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that.  perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
 #if LM_FSA_SHORTEN_CONTEXT
     p+=pimpl_->ShortenContext(ctx,nboth<ctxlen_?nboth:ctxlen_);
 #endif
diff --git a/decoder/sentences.h b/decoder/sentences.h
index 4c71f8f1..54b5ffb3 100755
--- a/decoder/sentences.h
+++ b/decoder/sentences.h
@@ -29,9 +29,10 @@ inline void wordcpy(WordID *dest,WordID const* src,int n) {
 inline void wordcpy(WordID *dest,WordID const* src,WordID const* src_end) {
   wordcpy(dest,src,src_end-src);
 }
-inline void wordcpy_reverse(WordID *dest,WordID const* src,WordID const* src_end) {
+inline WordID *wordcpy_reverse(WordID *dest,WordID const* src,WordID const* src_end) {
   for(WordID const* i=src_end;i>src;)
     *dest++=*--i;
+  return dest;
 }
 inline Sentence singleton_sentence(WordID t) {
   return Sentence(1,t);
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-28 05:25:56 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-28 05:25:56 +0000
commit	b2cbd45b1f8b1d2194624d43ffee12bf53d56a4d (patch)
tree	fc8c1912616a9340686f87077b253b6b6f386ef9 /decoder
parent	fd09366d32dff751f8611bbb8d52627886b84150 (diff)