Diffstat (limited to 'decoder')
 -rw-r--r--  decoder/cdec.cc        |  2
 -rw-r--r--  decoder/fdict.cc       |  1
 -rwxr-xr-x  decoder/ff_from_fsa.h  | 68
 -rw-r--r--  decoder/ff_lm.cc       |  8
 -rwxr-xr-x  decoder/ff_lm_fsa.h    | 21
 -rw-r--r--  decoder/hg.h           |  4
 6 files changed, 65 insertions, 39 deletions
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index 460e9f15..876dee18 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -133,7 +133,7 @@ void InitCommandLine(int argc, char** argv, OracleBleu &ob, po::variables_map* c
("ctf_no_exhaustive", "Do not fall back to exhaustive parse if coarse-to-fine parsing fails")
("beam_prune", po::value<double>(), "Prune paths from +LM forest, keep paths within exp(alpha>=0)")
("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices")
- ("promise_power",po::value<double>()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams. 0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit)")
+ ("promise_power",po::value<double>()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams. 0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit). note: for the same poplimit, this gives more search error unless very close to 0 (recommend disabled; even 0.01 is slightly worse than 0) which is a bad sign and suggests this isn't doing a good job; further it's slightly slower to LM cube rescore with 0.01 compared to 0, as well as giving (very insignificantly) lower BLEU. TODO: test under more conditions, or try idea with different formula, or prob. cube beams.")
("lexalign_use_null", "Support source-side null words in lexical translation")
("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
diff --git a/decoder/fdict.cc b/decoder/fdict.cc
index da80c260..65187685 100644
--- a/decoder/fdict.cc
+++ b/decoder/fdict.cc
@@ -111,7 +111,6 @@ std::string UrlDecodeString(const std::string & encoded) {
}
std::string UrlEncodeString(const std::string & decoded) {
- const char * sz_decoded = decoded.c_str();
size_t needed_length = decoded.length() * 3 + 3;
char stackalloc[64];
char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ?
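The function this hunk touches (UrlEncodeString) uses a small-buffer idiom: write into a fixed stack array when the worst-case output fits, otherwise heap-allocate, and free only the heap path. A standalone sketch of the pattern; triple_copy is a hypothetical stand-in for the real per-byte encoding loop:

#include <cstring>
#include <string>

std::string triple_copy(const std::string& s) {
  std::size_t needed = s.size() * 3 + 1;             // worst-case output plus terminator
  char stackalloc[64];
  char* buf = needed > sizeof(stackalloc) ? new char[needed] : stackalloc;
  char* p = buf;
  for (int rep = 0; rep < 3; ++rep) {                // stand-in for the real encoding work
    std::memcpy(p, s.data(), s.size());
    p += s.size();
  }
  *p = '\0';
  std::string out(buf);
  if (buf != stackalloc) delete[] buf;               // only the heap path needs freeing
  return out;
}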
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
index c517ec64..10ccfe6d 100755
--- a/decoder/ff_from_fsa.h
+++ b/decoder/ff_from_fsa.h
@@ -3,7 +3,9 @@
#include "ff_fsa.h"
-#define FSA_FF_DEBUG 0
+#ifndef FSA_FF_DEBUG
+# define FSA_FF_DEBUG 0
+#endif
#if FSA_FF_DEBUG
# define FSAFFDBG(e,x) FSADBGif(debug(),e,x)
# define FSAFFDBGnl(e) FSADBGif_nl(debug(),e)
@@ -49,35 +51,36 @@ public:
void* out_state) const
{
TRule const& rule=*edge.rule_;
- Sentence const& e = rule.e();
+ Sentence const& e = rule.e(); // items in target side of rule
typename Impl::Accum accum,h_accum;
if (!ssz) { // special case for no state - but still build up longer phrases to score in case FSA overrides ScanPhraseAccum
if (Impl::simple_phrase_score) {
- // save the effort of building up the contiguous rule phrases
- for (int j=0,je=e.size();j<je;++j) // items in target side of rule
+ // save the effort of building up the contiguous rule phrases - probably can just use the else branch, now that phrases aren't copied but are scanned off e directly.
+ for (int j=0,ee=e.size();j<ee;++j) {
if (e[j]>=1) // token
ff.ScanAccum(smeta,edge,(WordID)e[j],NULL,NULL,&accum);
- FSAFFDBG(edge," "<<TD::Convert(e[j]));
+ FSAFFDBG(edge," "<<TD::Convert(e[j]));
+ }
} else {
- Sentence phrase;
- phrase.reserve(e.size());
- for (int j=0,je=e.size();;++j) { // items in target side of rule
- if (je==j || e[j]<1) { // end or variable
- if (phrase.size()) {
- FSAFFDBG(edge," ["<<TD::GetString(phrase)<<']');
- ff.ScanPhraseAccum(smeta,edge,begin(phrase),end(phrase),0,0,&accum);
- }
- if (je==j)
- break;
- phrase.clear();
- } else { // word
- WordID ew=e[j];
- phrase.push_back(ew);
+#undef RHS_WORD
+#define RHS_WORD(j) (e[j]>=1)
+ for (int j=0,ee=e.size();;++j) { // items in target side of rule
+ for(;;++j) {
+ if (j>=ee) goto rhs_done; // j may go 1 past ee due to k possibly getting to end
+ if (RHS_WORD(j)) break;
}
+ // word @j
+ int k=j;
+ while(k<ee) if (!RHS_WORD(++k)) break;
+ //end or nonword @k - [j,k) is phrase
+ FSAFFDBG(edge," ["<<TD::GetString(&e[j],&e[k])<<']');
+ ff.ScanPhraseAccum(smeta,edge,&e[j],&e[k],0,0,&accum);
+ j=k;
}
}
+ rhs_done:
accum.Store(ff,features);
- FSAFFDBG(egde,"="<<accum->describe(ff));
+ FSAFFDBG(edge,"="<<accum.describe(ff));
FSAFFDBGnl(edge);
return;
}
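What the rewritten loop computes, as a goto-free sketch with hypothetical names: walk the rule's target side e, where e[j] >= 1 is a terminal word and e[j] < 1 refers to a child nonterminal, and hand each maximal run of terminals [j,k) to the FSA's phrase scanner as a pointer range instead of copying it into a temporary Sentence.

#include <vector>
typedef int WordID;

template <class ScanPhrase>
void scan_rhs_word_runs(const std::vector<WordID>& e, ScanPhrase scan_phrase) {
  const int ee = static_cast<int>(e.size());
  int j = 0;
  while (j < ee) {
    if (e[j] < 1) { ++j; continue; }        // nonterminal slot - not part of any phrase
    int k = j + 1;
    while (k < ee && e[k] >= 1) ++k;        // extend the run; bound checked before the read
    scan_phrase(e.data() + j, e.data() + k); // [j,k) is one contiguous rule phrase
    j = k;
  }
}

Note that the sketch tests k < ee before reading e[k]; the RHS_WORD(++k) form above can read the slot one past the end of e when a word run reaches the end of the rule, which is worth double-checking.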
@@ -91,8 +94,9 @@ public:
WP left_full=left_end_full(out_state);
FsaScanner<Impl> fsa(ff,smeta,edge);
/* fsa holds our current state once we've seen our first M rule or child left-context words. that state scores up the rest of the words at the time, and is replaced by the right state of any full child. at the end, if we've got at least M left words in all, it becomes our right state (otherwise, we don't bother storing the partial state, which might seem useful any time we're built on by a rule that has our variable in the initial position - but without also storing the heuristic for that case, we just end up rescanning from scratch anyway to produce the heuristic. so we just store all 0 bytes if we have less than M left words at the end. */
- for (int j = 0; j < e.size(); ++j) { // items in target side of rule
- if (e[j] < 1) { // variable
+ for (int j = 0,ee=e.size(); j < ee; ++j) { // items in target side of rule
+ s_rhs_next:
+ if (!RHS_WORD(j)) { // variable
// variables a* are referring to this child derivation state.
SP a = ant_contexts[-e[j]];
WP al=(WP)a,ale=left_end(a); // the child left words
@@ -121,7 +125,6 @@ public:
assert(anw<=M); // of course, we never store more than M left words in an item.
} else { // single word
WordID ew=e[j];
- FSAFFDBG(edge,' '<<TD::Convert(ew));
// some redundancy: non-vectorized version of above handling of left words of child item
if (left_out<left_full) {
*left_out++=ew;
@@ -129,11 +132,24 @@ public:
fsa.reset(ff.heuristic_start_state());
fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens only once)
}
- } else
- fsa.scan(ew,&accum);
+ } else {
+ if (Impl::simple_phrase_score) {
+ fsa.scan(ew,&accum); // single word scan isn't optimal if phrase is different
+ FSAFFDBG(edge,' '<<TD::Convert(ew));
+ } else {
+ int k=j;
+ while(k<ee) if (!RHS_WORD(++k)) break;
+ FSAFFDBG(edge," rule-phrase["<<TD::GetString(&e[j],&e[k])<<']');
+ fsa.scan(&e[j],&e[k],&accum);
+ if (k==ee) goto s_rhs_done;
+ j=k;
+ goto s_rhs_next;
+ }
+ }
}
}
-
+#undef RHS_WORD
+ s_rhs_done:
void *out_fsa_state=fsa_state(out_state);
if (left_out<left_full) { // finally: partial heuristic for unfilled items
// fsa.reset(ff.heuristic_start_state()); fsa.scan(left_begin,left_out,&h_accum);
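The second half of TraversalFeaturesImpl (hunks above) buffers the first M target-side words as the item's left context, scores them once from the heuristic start state the moment the buffer fills, and scans every later word (or word run) in the live FSA state. A simplified sketch of that control flow, with hypothetical names and ignoring child-item substates:

#include <vector>
typedef int WordID;

template <class Fsa, class Accum>
struct LeftContextFiller {
  std::vector<WordID> left;  // up to M left words that become part of this item's state
  std::size_t M;
  bool full() const { return left.size() >= M; }

  void add_word(WordID w, Fsa& fsa, Accum* accum, Accum* h_accum) {
    if (!full()) {
      left.push_back(w);
      if (full()) {                                    // buffer just filled: heuristic happens only once
        fsa.reset_to_heuristic_start();
        fsa.scan(left.data(), left.data() + left.size(), h_accum);
      }
    } else {
      fsa.scan(w, accum);                              // normal scoring once the left context is known
    }
  }
};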
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index e8d3bbb0..2b97bea8 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,5 +1,6 @@
#define LM_FSA_SHORTEN_CONTEXT 1
-// seems to work great - just not sure if it actually speeds anything up
+// seems to work great - just not sure if it actually speeds anything up. theoretically slightly more compact (more sharing) forest, but unlikely to make a big difference
+
// virtual LogP contextBOW(const VocabIndex *context, unsigned length);
/* backoff weight for truncating context */
// does that need to be used? i think so.
@@ -619,11 +620,12 @@ LanguageModelFsa::LanguageModelFsa(string const& param) {
void LanguageModelFsa::print_state(ostream &o,void const* st) const {
WordID const *wst=(WordID const*)st;
o<<'[';
- for (int i=ctxlen_;i>0;) {
+ bool sp=false;
+ for (int i=ctxlen_;i>0;sp=true) {
--i;
WordID w=wst[i];
if (w==TD::none) continue;
- if (i) o<<' ';
+ if (sp) o<<' ';
o << TD::Convert(w);
}
o<<']';
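The spacing fix above is needed because entries equal to TD::none are skipped, so whether to print a separator has to depend on whether a word was already printed, not on the loop index. A sketch of the idea, with the flag flipped only after an actual print:

#include <iostream>
#include <vector>

void print_bracketed(std::ostream& o, const std::vector<int>& ctx, int none) {
  o << '[';
  bool printed = false;
  for (std::size_t i = ctx.size(); i > 0; ) {
    --i;
    if (ctx[i] == none) continue;   // skipped entries must not produce stray spaces
    if (printed) o << ' ';
    o << ctx[i];
    printed = true;
  }
  o << ']';
}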
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 108698ec..9ba7b2c5 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,9 +1,10 @@
#ifndef FF_LM_FSA_H
#define FF_LM_FSA_H
-//FIXME: when FSA_LM_PHRASE 1, 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
+//FIXME: when FSA_LM_PHRASE 1, 3gram fsa has differences, especially with unk words, in about the 4th decimal digit (about .05%), compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
-#define FSA_LM_PHRASE 0
+// enabling for now - retest unigram+ more, solve above puzzle
+#define FSA_LM_PHRASE 1
#define FSA_LM_DEBUG 0
#if FSA_LM_DEBUG
@@ -42,7 +43,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
}
template <class Accum>
- void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
+ void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const {
+ Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
if (!ctxlen_) {
Add(floored(pimpl_->WordProb(w,&empty_context)),a);
return;
@@ -53,6 +55,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
state_copy(ctx,old_st);
ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies.
Featval p=floored(pimpl_->WordProb(w,ctx));
+ FSALMDBG(de,"p("<<TD::Convert(w)<<"|"<<TD::Convert(ctx,ctx+ctxlen_)<<")="<<p);
+ FSALMDBGnl(de);
// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
WordID *nst=(WordID *)new_st;
nst[0]=w; // new most recent word
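As the comment notes, states here are SRILM-style reversed contexts: index 0 is the most recent word, index 1 the word before it, and so on. A sketch of the corresponding state update (hypothetical helper; assumes new_st and old_st are distinct buffers of ctxlen words):

#include <algorithm>
typedef int WordID;

void shift_in_word(const WordID* old_st, WordID* new_st, WordID w, int ctxlen) {
  new_st[0] = w;                                         // new most recent word
  std::copy(old_st, old_st + (ctxlen - 1), new_st + 1);  // older words each age one position;
                                                         // the oldest word falls off the end
}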
@@ -68,6 +72,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
//FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?)
template <class Accum>
void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
+ Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
/* // this is forcing unigram prob always. we will instead build the phrase
if (!ctxlen_) {
@@ -85,27 +90,29 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
ctx[nboth]=TD::none;
// reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0]
W ctx_score_end=wordcpy_reverse(ctx,begin,end);
- assert(ctx_score_end==ctx+nw);
wordcpy(ctx_score_end,st,st_end); // st already reversed.
+ assert(ctx_score_end==ctx+nw);
// we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones). most contexts are full except for the shortest source spans.
-// FSALMDBG(edge," Scan("<<TD::GetString(ctx,ctx+nboth)<<')');
+ FSALMDBG(de," scan.r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
+ FSAFFDBG(de," r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
Featval p=0;
FSALMDBGnl(edge);
for(;ctx_score_end>ctx;--ctx_score_end)
p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end));
//TODO: look for score discrepancy -
- // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same 4th digit disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that. perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
+ // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that. perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
#if LM_FSA_SHORTEN_CONTEXT
p+=pimpl_->ShortenContext(ctx,nboth<ctxlen_?nboth:ctxlen_);
#endif
state_copy(new_st,ctx);
- FSALMDBG(edge," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
+ FSALMDBG(de," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
FSALMDBGnl(edge);
Add(p,a);
}
SCAN_PHRASE_ACCUM_OVERRIDE
#endif
+
// impl details:
void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor
double floor_; // log10prob minimum used (e.g. unk words)
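ScanPhraseAccum, as modified above, lays out reverse(phrase) followed by the old reversed state, terminated by TD::none, then scores the phrase words right-to-left, each conditioned on everything laid out after it, and takes the first ctxlen words as the new state. A simplified sketch (hypothetical helper; omits the prob floor and LM_FSA_SHORTEN_CONTEXT):

#include <algorithm>
#include <vector>
typedef int WordID;

template <class LM>
double scan_phrase(LM& lm, const WordID* begin, const WordID* end,
                   const WordID* old_st, int ctxlen, WordID none,
                   WordID* new_st) {
  const int nw = static_cast<int>(end - begin);
  std::vector<WordID> ctx(nw + ctxlen + 1);
  std::reverse_copy(begin, end, ctx.begin());            // phrase, most recent word first
  std::copy(old_st, old_st + ctxlen, ctx.begin() + nw);  // old state is already reversed
  ctx[nw + ctxlen] = none;                               // terminator the LM stops at
  double p = 0;
  for (WordID* w = ctx.data() + nw; w > ctx.data(); --w)
    p += lm.WordProb(w[-1], w);                          // p(word | reversed context after it)
  std::copy(ctx.begin(), ctx.begin() + ctxlen, new_st);  // new state: most recent ctxlen words
  return p;
}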
diff --git a/decoder/hg.h b/decoder/hg.h
index 6660a1f5..bef3bebd 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -2,7 +2,9 @@
#define _HG_H_
// define USE_INFO_EDGE 1 if you want lots of debug info shown with --show_derivations - otherwise it adds quite a bit of overhead if ffs have their logging enabled (e.g. ff_from_fsa)
-#define USE_INFO_EDGE 0
+#ifndef USE_INFO_EDGE
+# define USE_INFO_EDGE 0
+#endif
#if USE_INFO_EDGE
# define INFO_EDGE(e,msg) do { std::ostringstream &o=(e.info_);o<<msg; } while(0)
# define INFO_EDGEw(e,msg) do { std::ostringstream &o(e.info_);if (o.empty()) o<<' ';o<<msg; } while(0)
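Both this hunk and the FSA_FF_DEBUG change in ff_from_fsa.h switch to the same idiom: supply a default for a compile-time debug switch only if the build has not already defined it, so logging can be enabled with a -D flag without editing the header. A standalone illustration with hypothetical macro names:

#include <iostream>
#include <sstream>

#ifndef MY_DEBUG            // e.g. compile with -DMY_DEBUG=1 to enable
# define MY_DEBUG 0
#endif

#if MY_DEBUG
# define MY_DBG(msg) do { std::ostringstream o; o << msg; std::cerr << o.str() << '\n'; } while (0)
#else
# define MY_DBG(msg)        // compiles away entirely when disabled
#endif

int main() {
  MY_DBG("files changed: " << 6);
  return 0;
}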