fsa lm phrase mystery remains, but bool fsa::simple_phrase_score indicates whether stateless features should copy phrases from rules (e.g. unigram lm)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@444 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-28 05:25:56 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-28 05:25:56 +0000
commit: 36efbef92279e22888059ccd225aa7798fe9f4ae (patch)
tree: 114b1db2852cedd68a9c86661d162f02dc23f58a /decoder/ff_from_fsa.h
parent: 13ba3e7e405c7b1ecd78ed54a17cd05454eaf918 (diff)
1 files changed, 42 insertions, 30 deletions
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
index 7fa6be67..c517ec64 100755
--- a/decoder/ff_from_fsa.h
+++ b/decoder/ff_from_fsa.h
@@ -40,7 +40,7 @@ public:
 
   Features features() const { return ff.features(); }
 
-  // Log because it
+  // Log because it potentially stores info in edge.  otherwise the same as regular TraversalFeatures.
   void TraversalFeaturesLog(const SentenceMetadata& smeta,
                              Hypergraph::Edge& edge,
                              const std::vector<const void*>& ant_contexts,
@@ -51,62 +51,74 @@ public:
     TRule const& rule=*edge.rule_;
     Sentence const& e = rule.e();
     typename Impl::Accum accum,h_accum;
-    if (!ssz) {
-      Sentence phrase;
-      phrase.reserve(e.size());
-      for (int j=0,je=e.size();;++j) { // items in target side of rule
-        if (je==j || e[j]<1) { // end or variable
-          if (phrase.size()) {
-            FSAFFDBG(edge," ["<<TD::GetString(phrase)<<']');
-            ff.ScanPhraseAccum(smeta,edge,begin(phrase),end(phrase),0,0,&accum);
+    if (!ssz) { // special case for no state - but still build up longer phrases to score in case FSA overrides ScanPhraseAccum
+      if (Impl::simple_phrase_score) {
+        // save the effort of building up the contiguous rule phrases
+        for (int j=0,je=e.size();j<je;++j) // items in target side of rule
+          if (e[j]>=1) // token
+            ff.ScanAccum(smeta,edge,(WordID)e[j],NULL,NULL,&accum);
+        FSAFFDBG(edge," "<<TD::Convert(e[j]));
+      } else {
+        Sentence phrase;
+        phrase.reserve(e.size());
+        for (int j=0,je=e.size();;++j) { // items in target side of rule
+          if (je==j || e[j]<1) { // end or variable
+            if (phrase.size()) {
+              FSAFFDBG(edge," ["<<TD::GetString(phrase)<<']');
+              ff.ScanPhraseAccum(smeta,edge,begin(phrase),end(phrase),0,0,&accum);
+            }
+            if (je==j)
+              break;
+            phrase.clear();
+          } else { // word
+            WordID ew=e[j];
+            phrase.push_back(ew);
           }
-          if (je==j)
-            break;
-          phrase.clear();
-        } else { // word
-          WordID ew=e[j];
-          phrase.push_back(ew);
         }
       }
       accum.Store(ff,features);
+      FSAFFDBG(egde,"="<<accum->describe(ff));
       FSAFFDBGnl(edge);
       return;
     }
+
+    // bear with me, because this is hard to understand.  reminder: ant_contexts and out_state are left-words first (up to M, TD::none padded).  if all M words are present, then FSA state follows.  otherwise 0 bytes to keep memcmp/hash happy.
+
 //why do we compute heuristic in so many places?  well, because that's how we know what state we should score words in once we're full on our left context (because of markov order bound, we know the score will be the same no matter what came before that left context)
-    SP h_start=ff.heuristic_start_state();
     // these left_* refer to our output (out_state):
     W left_begin=(W)out_state;
     W left_out=left_begin; // [left,fsa_state) = left ctx words.  if left words aren't full, then null wordid
     WP left_full=left_end_full(out_state);
-    FsaScanner<Impl> fsa(ff,smeta,edge); // this holds our current state and eventuallybecomes our right state if we saw enough words
+    FsaScanner<Impl> fsa(ff,smeta,edge);
+    /* fsa holds our current state once we've seen our first M rule or child left-context words.  that state scores up the rest of the words at the time, and is replaced by the right state of any full child.  at the end, if we've got at least M left words in all, it becomes our right state (otherwise, we don't bother storing the partial state, which might seem useful any time we're built on by a rule that has our variable in the initial position - but without also storing the heuristic for that case, we just end up rescanning from scratch anyway to produce the heuristic.  so we just store all 0 bytes if we have less than M left words at the end. */
     for (int j = 0; j < e.size(); ++j) { // items in target side of rule
       if (e[j] < 1) { // variable
-        SP a = ant_contexts[-e[j]]; // variables a* are referring to this child derivation state.
-        FSAFFDBG(edge,' '<<describe_state(a));
-        WP al=(WP)a;
-        WP ale=left_end(a);
-        // scan(al,le) these - the same as below else.  macro for now; pull into closure object later?
+        // variables a* are referring to this child derivation state.
+        SP a = ant_contexts[-e[j]];
+        WP al=(WP)a,ale=left_end(a); // the child left words
         int anw=ale-al;
+        FSAFFDBG(edge,' '<<describe_state(a));
 // anw left words in child.  full if == M.  we will use them to fill our left words, and then score the rest fully, knowing what state we're in based on h_state -> our left words -> any number of interior words which are scored then hidden
-        if (left_out+anw<left_full) { // nothing to score after adding
+        if (left_out+anw<left_full) { // still short of M after adding - nothing to score (not even our heuristic)
           wordcpy(left_out,al,anw);
           left_out+=anw;
-        } else if (left_out<left_full) { // something to score AND newly full left context to fill
+        } else if (left_out<left_full) { // we had less than M before, and will have a tleast M after adding.  so score heuristic and the rest M+1,... score inside.
           int ntofill=left_full-left_out;
           assert(ntofill==M-(left_out-left_begin));
           wordcpy(left_out,al,ntofill);
           left_out=(W)left_full;
           // heuristic known now
-          fsa.reset(h_start);
+          fsa.reset(ff.heuristic_start_state());
           fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens once only)
           fsa.scan(al+ntofill,ale,&accum); // because of markov order, fully filled left words scored starting at h_start put us in the right state to score the extra words (which are forgotten)
           al+=ntofill; // we used up the first ntofill words of al to end up in some known state via exactly M words total (M-ntofill were there beforehand).  now we can scan the remaining al words of this child
         } else { // more to score / state to update (left already full)
           fsa.scan(al,ale,&accum);
         }
-        if (anw==M) // child had full state already
+        if (anw==M)
           fsa.reset(fsa_state(a));
-        assert(anw<=M);
+        // if child had full state already, we must assume there was a gap and use its right state (note: if the child derivation was exactly M words, then we still use its state even though it will be equal to our current; there's no way to distinguish between such an M word item and an e.g. 2*M+k word item, although it's extremely unlikely that you'd have a >M word item that happens to have the same left and right boundary words).
+        assert(anw<=M); // of course, we never store more than M left words in an item.
       } else { // single word
         WordID ew=e[j];
         FSAFFDBG(edge,' '<<TD::Convert(ew));
@@ -114,7 +126,7 @@ public:
         if (left_out<left_full) {
           *left_out++=ew;
           if (left_out==left_full) { // handle heuristic, once only, establish state
-            fsa.reset(h_start);
+            fsa.reset(ff.heuristic_start_state());
             fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens only once)
           }
         } else
@@ -124,8 +136,8 @@ public:
 
     void *out_fsa_state=fsa_state(out_state);
     if (left_out<left_full) { // finally: partial heuristic for unfilled items
-//      fsa.reset(h_start);      fsa.scan(left_begin,left_out,&h_accum);
-      ff.ScanPhraseAccumOnly(smeta,edge,left_begin,left_out,h_start,&h_accum);
+//      fsa.reset(ff.heuristic_start_state());      fsa.scan(left_begin,left_out,&h_accum);
+      ff.ScanPhraseAccumOnly(smeta,edge,left_begin,left_out,ff.heuristic_start_state(),&h_accum);
       do { *left_out++=TD::none; } while(left_out<left_full); // none-terminate so left_end(out_state) will know how many words
       ff.state_zero(out_fsa_state); // so we compare / hash correctly. don't know state yet because left context isn't full
     } else // or else store final right-state.  heuristic was already assigned
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-28 05:25:56 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-28 05:25:56 +0000
commit	36efbef92279e22888059ccd225aa7798fe9f4ae (patch)
tree	114b1db2852cedd68a9c86661d162f02dc23f58a /decoder/ff_from_fsa.h
parent	13ba3e7e405c7b1ecd78ed54a17cd05454eaf918 (diff)