12 files changed, 466 insertions, 94 deletions
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 0e83582f..2b518d62 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -96,7 +96,7 @@ struct Candidate {
     if (is_goal) {
       assert(tail.size() == 1);
       const string& ant_state = node_states[tail.front()];
-      models.AddFinalFeatures(ant_state, &out_edge_);
+      models.AddFinalFeatures(ant_state, &out_edge_, smeta);
     } else {
       models.AddFeaturesToEdge(smeta, out_hg, node_states, &out_edge_, &state_, &edge_estimate);
     }
@@ -344,7 +344,7 @@ struct NoPruningRescorer {
       if (is_goal) {
         assert(tail.size() == 1);
         const string& ant_state = node_states_[tail.front()];
-        models.AddFinalFeatures(ant_state, new_edge);
+        models.AddFinalFeatures(ant_state, new_edge,smeta);
       } else {
         prob_t edge_estimate; // this is a full intersection, so we disregard this
         models.AddFeaturesToEdge(smeta, out, node_states_, new_edge, &head_state, &edge_estimate);
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 069e07f1..ecb244d8 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -8,12 +8,13 @@
 #include "ff_factory.h"
 #include "ff_ruleshape.h"
 #include "ff_bleu.h"
+#include "ff_from_fsa.h"
 
 boost::shared_ptr<FFRegistry> global_ff_registry;
 
 void register_feature_functions() {
   global_ff_registry->Register(new FFFactory<LanguageModel>);
-
+  global_ff_registry->Register(new FFFactory<WordPenaltyFromFsa>); // like WordPenalty, but implemented using ff_fsa (note: WordPenaltyFsa adds -1 per word, WordPenalty adds -1/log(10))
   //TODO: use for all features the new Register which requires usage(...)
 #ifdef HAVE_RANDLM
   global_ff_registry->Register("RandLM", new FFFactory<LanguageModelRandLM>);
@@ -34,5 +35,4 @@ void register_feature_functions() {
   global_ff_registry->Register("CSplit_ReverseCharLM", new FFFactory<ReverseCharLMCSplitFeature>);
   global_ff_registry->Register("Tagger_BigramIdentity", new FFFactory<Tagger_BigramIdentity>);
   global_ff_registry->Register("LexicalPairIdentity", new FFFactory<LexicalPairIdentity>);
-};
-
+}
diff --git a/decoder/ff.cc b/decoder/ff.cc
index a20b743f..b6a541e3 100644
--- a/decoder/ff.cc
+++ b/decoder/ff.cc
@@ -30,12 +30,12 @@ string FeatureFunction::usage_helper(std::string const& name,std::string const&
   return r;
 }
 
-FeatureFunction::Features FeatureFunction::single_feature(WordID feat) {
+Features FeatureFunction::single_feature(WordID feat) {
   return Features(1,feat);
 }
 
-FeatureFunction::Features ModelSet::all_features(std::ostream *warn,bool warn0) {
-  typedef FeatureFunction::Features FFS;
+Features ModelSet::all_features(std::ostream *warn,bool warn0) {
+  typedef Features FFS;
   FFS ffs;
 #define WARNFF(x) do { if (warn) { *warn << "WARNING: "<< x ; *warn<<endl; } } while(0)
   typedef std::map<WordID,string> FFM;
@@ -74,7 +74,7 @@ FeatureFunction::Features ModelSet::all_features(std::ostream *warn,bool warn0)
 
 void ModelSet::show_features(std::ostream &out,std::ostream &warn,bool warn_zero_wt)
 {
-  typedef FeatureFunction::Features FFS;
+  typedef Features FFS;
   FFS ffs=all_features(&warn,warn_zero_wt);
   out << "Weight  Feature\n";
   for (unsigned i=0;i<ffs.size();++i) {
@@ -90,7 +90,7 @@ void ModelSet::show_features(std::ostream &out,std::ostream &warn,bool warn_zero
 
 // Hiero and Joshua use log_10(e) as the value, so I do to
 WordPenalty::WordPenalty(const string& param) :
-    fid_(FD::Convert("WordPenalty")),
+  fid_(FD::Convert("WordPenalty")),
     value_(-1.0 / log(10)) {
   if (!param.empty()) {
     cerr << "Warning WordPenalty ignoring parameter: " << param << endl;
@@ -118,11 +118,11 @@ SourceWordPenalty::SourceWordPenalty(const string& param) :
   }
 }
 
-FeatureFunction::Features SourceWordPenalty::features() const {
+Features SourceWordPenalty::features() const {
   return single_feature(fid_);
 }
 
-FeatureFunction::Features WordPenalty::features() const {
+Features WordPenalty::features() const {
   return single_feature(fid_);
 }
 
@@ -154,7 +154,7 @@ ArityPenalty::ArityPenalty(const std::string& param) :
   while (!fids_.empty() && fids_.back()==0) fids_.pop_back(); // pretty up features vector in case FD was frozen.  doesn't change anything
 }
 
-FeatureFunction::Features ArityPenalty::features() const {
+Features ArityPenalty::features() const {
   return Features(fids_.begin(),fids_.end());
 }
 
@@ -212,7 +212,7 @@ void ModelSet::AddFeaturesToEdge(const SentenceMetadata& smeta,
   edge->edge_prob_.logeq(edge->feature_values_.dot(weights_));
 }
 
-void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge) const {
+void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge,SentenceMetadata const& smeta) const {
   assert(1 == edge->rule_->Arity());
 
   for (int i = 0; i < models_.size(); ++i) {
@@ -223,7 +223,7 @@ void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge
       int spos = model_state_pos_[i];
       ant_state = &state[spos];
     }
-    ff.FinalTraversalFeatures(ant_state, &edge->feature_values_);
+    ff.FinalTraversalFeatures(smeta, ant_state, &edge->feature_values_);
   }
   edge->edge_prob_.logeq(edge->feature_values_.dot(weights_));
 }
diff --git a/decoder/ff.h b/decoder/ff.h
index 2b7c7fec..e54ac149 100644
--- a/decoder/ff.h
+++ b/decoder/ff.h
@@ -8,6 +8,8 @@
 class SentenceMetadata;
 class FeatureFunction;  // see definition below
 
+typedef std::vector<WordID> Features; // set of features ids
+
 // if you want to develop a new feature, inherit from this class and
 // override TraversalFeaturesImpl(...).  If it's a feature that returns /
 // depends on context, you may also need to implement
@@ -23,12 +25,8 @@ class FeatureFunction {
   static std::string usage(bool show_params,bool show_details) {
     return usage_helper("FIXME_feature_needs_name","[no parameters]","[no documentation yet]",show_params,show_details);
   }
-
-  typedef std::vector<WordID> Features; // set of features ids
-
-protected:
   static std::string usage_helper(std::string const& name,std::string const& params,std::string const& details,bool show_params,bool show_details);
-  static Features single_feature(WordID feat);
+  static Features single_feature(WordID feat); // one-element feature id list holding feat (parameter type spelled as in the ff.cc definition)
 public:
   // stateless feature that doesn't depend on source span: override and return true.  then your feature can be precomputed over rules.
   virtual bool rule_feature() const { return false; }
@@ -61,8 +59,17 @@ public:
   // if there's some state left when you transition to the goal state, score
   // it here.  For example, the language model computes the cost of adding
   // <s> and </s>.
+protected:
   virtual void FinalTraversalFeatures(const void* residual_state,
                                       FeatureVector* final_features) const;
+public:
+  // smeta-aware variant (this is the one ModelSet::AddFinalFeatures calls); the default ignores smeta and forwards to the 2-argument version above.  override either one (no need to do both)
+  virtual void FinalTraversalFeatures(const SentenceMetadata& smeta,
+                                      const void* residual_state,
+                                      FeatureVector* final_features) const {
+    FinalTraversalFeatures(residual_state,final_features);
+  }
+
 
  protected:
   // context is a pointer to a buffer of size NumBytesContext() that the
@@ -88,6 +95,7 @@ public:
   int state_size_;
 };
 
+
 // word penalty feature, for each word on the E side of a rule,
 // add value_
 class WordPenalty : public FeatureFunction {
@@ -176,12 +184,13 @@ class ModelSet {
                          prob_t* combination_cost_estimate = NULL) const;
 
   void AddFinalFeatures(const std::string& residual_context,
-                        Hypergraph::Edge* edge) const;
+                        Hypergraph::Edge* edge,
+                        SentenceMetadata const& smeta) const;
 
   bool empty() const { return models_.empty(); }
 
   bool stateless() const { return !state_size_; }
-  FeatureFunction::Features all_features(std::ostream *warnings=0,bool warn_fid_zero=false); // this will warn about duplicate features as well (one function overwrites the feature of another).  also resizes weights_ so it is large enough to hold the (0) weight for the largest reported feature id.  since 0 is a NULL feature id, it's never included.  if warn_fid_zero, then even the first 0 id is
+  Features all_features(std::ostream *warnings=0,bool warn_fid_zero=false); // this will warn about duplicate features as well (one function overwrites the feature of another).  also resizes weights_ so it is large enough to hold the (0) weight for the largest reported feature id.  since 0 is a NULL feature id, it's never included.  if warn_fid_zero, then even the first 0 id is
   void show_features(std::ostream &out,std::ostream &warn,bool warn_zero_wt=true); //show features and weights
  private:
   std::vector<const FeatureFunction*> models_;
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
new file mode 100755
index 00000000..3bd3f070
--- /dev/null
+++ b/decoder/ff_from_fsa.h
@@ -0,0 +1,189 @@
+#ifndef FF_FROM_FSA_H
+#define FF_FROM_FSA_H
+
+#include "ff_fsa.h"
+
+/* regular bottom-up scorer (a FeatureFunction) built from an Fsa feature Impl, which is held by value (member ff)
+   uses Impl's guarantee about markov order=M to score words as soon as M words of left context are known
+   encoding of state: M left WordIDs (padded at the right with TD::none if fewer were seen), then Impl's fsa state bytes
+
+   usage:
+   typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa; (calls into Impl are resolved statically)
+
+   Impl must provide:
+   usage, features, Scan, markov_order, state_bytes, start_state, heuristic_start_state, end_phrase (see ff_fsa.h)
+*/
+
+template <class Impl>
+class FeatureFunctionFromFsa : public FeatureFunction {
+  typedef void const* SP; // pointer to a (whole or fsa-only) state
+  typedef WordID *W; // writable left-word cursor
+  typedef WordID const* WP; // read-only left-word cursor
+public:
+  FeatureFunctionFromFsa(std::string const& param) : ff(param) {
+    Init();
+  }
+
+  static std::string usage(bool args,bool verbose) {
+    return Impl::usage(args,verbose);
+  }
+
+  Features features() const { return ff.features(); }
+
+  //TODO: add source span to Fsa FF interface, pass along
+  //TODO: read/debug VERY CAREFULLY
+  void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                             const Hypergraph::Edge& edge,
+                             const std::vector<const void*>& ant_contexts,
+                             FeatureVector* features,
+                             FeatureVector* estimated_features,
+                             void* out_state) const
+  {
+    if (!ssz) { // Impl keeps no state: just scan every terminal of the rule with null states
+      TRule const& rule=*edge.rule_;
+      Sentence const& e = rule.e();
+      for (int j = 0; j < e.size(); ++j) { // items in target side of rule
+        if (e[j] < 1) { // variable
+        } else {
+          WordID ew=e[j];
+          ff.Scan(smeta,ew,0,0,features);
+        }
+      }
+      return;
+    }
+
+    SP h_start=ff.heuristic_start_state();
+    W left_begin=(W)out_state;
+    W left_out=left_begin; // [left,fsa_state) = left ctx words.  if left words aren't full, then null wordid
+    WP left_full=left_end_full(out_state);
+    FsaScanner<Impl> fsa(ff,smeta);
+    TRule const& rule=*edge.rule_;
+    Sentence const& e = rule.e();
+    for (int j = 0; j < e.size(); ++j) { // items in target side of rule
+      if (e[j] < 1) { // variable
+        SP a = ant_contexts[-e[j]];
+        WP al=(WP)a;
+        WP ale=left_end(a);
+        // scan(al,le) these - the same as below else.  macro for now; pull into closure object later?
+        int nw=ale-al;
+        if (left_out+nw<left_full) { // nothing to score
+          wordcpy(left_out,al,nw);
+          left_out+=nw;
+        } else if (left_out<left_full) { // something to score AND left context to fill
+          int ntofill=left_full-left_out;
+          wordcpy(left_out,al,ntofill);
+          left_out=(W)left_full;
+          // heuristic known now
+          fsa.reset(h_start);
+          fsa.scan(left_begin,left_full,estimated_features); // save heuristic (happens once only)
+          al+=ntofill; // we used up the first ntofill words of al to end up in some known state via exactly M words total (M-ntofill were there beforehand).  now we can scan the remaining al words of this child
+          goto scan;
+        } else { // more to score / state to update
+        scan:
+          fsa.scan(al,ale,features);
+        }
+        if (nw>=M) // child stored all M left words, so its right state is known (it may have had a "gap" of already-scored words): continue from that state
+          fsa.reset(fsa_state(a));
+      } else { // single word
+        WordID ew=e[j];
+        // some redundancy: non-vectorized version of above handling of left words of child item
+        if (left_out<left_full) {
+          *left_out++=ew;
+          if (left_out==left_full) { // handle heuristic, once only, establish state
+            fsa.reset(h_start);
+            fsa.scan(left_begin,left_full,estimated_features); // save heuristic (happens only once)
+          }
+        } else
+          fsa.scan(ew,features);
+      }
+    }
+
+    if (left_out<left_full) { // finally: partial heuristic for unfilled items
+      fsa.reset(h_start);
+      fsa.scan(left_begin,left_out,estimated_features); // save heuristic (happens once)
+      clear_fsa_state(out_state); // 0 bytes so we compare / hash correctly. don't know state yet
+      while(left_out<left_full) *left_out++=TD::none; // mark as partial left word seq
+    } else // or else store final right-state.  heuristic was already assigned
+      fstatecpy(fsa_state(out_state),fsa.cs);
+  }
+
+  virtual void FinalTraversalFeatures(const SentenceMetadata& smeta,
+                                      const void* residual_state,
+                                      FeatureVector* final_features) const
+  {
+    WP l=(WP)residual_state,lend=left_end(residual_state);
+    SP rst=fsa_state(residual_state);
+    SP ss=ff.start_state();
+    Sentence const& ends=ff.end_phrase();
+    if (lend==rst) { // implying we have an fsa state
+      AccumFeatures(ff,smeta,l,lend,final_features,ss); // e.g. <s> score(full left unscored phrase)
+      AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,rst); // e.g. [ctx for last M words] score("</s>")
+    } else { // all we have is a single short phrase < M words before adding ends
+      int nl=lend-l;
+      Sentence whole(ends.size()+nl);
+      WordID *w=begin(whole);
+      wordcpy(w,l,nl);
+      wordcpy(w+nl,begin(ends),ends.size());
+      // whole = left-words + end-phrase
+      AccumFeatures(ff,smeta,w,end(whole),final_features,ss);
+    }
+  }
+
+  bool rule_feature() const {
+    return StateSize()==0; // Fsa features don't get info about span
+  }
+
+private:
+  Impl ff;
+  void Init() {
+//    FeatureFunction::name=Impl::usage(false,false); // already achieved by ff_factory.cc
+    M=ff.markov_order();
+    ssz=ff.state_bytes();
+    state_offset=sizeof(WordID)*M;
+    SetStateSize(ff.state_bytes()+state_offset);
+  }
+  int M; // markov order (ctx len)
+  FeatureFunctionFromFsa() {  } // private and unused: does not call Init(), so M/state_offset/ssz would be left unset
+  // Init() must be called explicitly in the constructor body (it reads ff, which has to be constructed first)
+  int state_offset; // store left-words first, then fsa state
+  int ssz; // bytes in fsa state
+  /*
+    state layout: left WordIds, followed by fsa state
+    left words have never been scored.  last ones remaining will be scored on FinalTraversalFeatures only.
+    right state is unknown until we have all M left words (less than M means TD::none will pad out right end).  unk right state will be zeroed out for proper hash/equal recombination.
+  */
+
+  static inline WordID const* left_end(WordID const* left, WordID const* e) {
+    while (e>left)
+      if (*--e!=TD::none) return e+1; // e is the last seen (non-padding) word
+    //post: [left,result) are the seen left words; with no seen words (all TD::none, or M==0) the range is empty
+    return left;
+  }
+  inline WP left_end(SP ant) const {
+    return left_end((WP)ant,(WP)fsa_state(ant));
+  }
+  inline WP left_end_full(SP ant) const {
+    return (WP)fsa_state(ant);
+  }
+  inline SP fsa_state(SP ant) const {
+    return ((char const*)ant+state_offset);
+  }
+  inline void *fsa_state(void * ant) const {
+    return ((char *)ant+state_offset);
+  }
+
+  void clear_fsa_state(void *ant) const { // when state is unknown
+    std::memset(fsa_state(ant),0,ssz);
+  }
+
+  inline void fstatecpy(void *dest,void const* src) const {
+    std::memcpy(dest,src,ssz);
+  }
+
+
+};
+
+typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
+
+
+#endif
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index a14f9913..3096f049 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -1,6 +1,8 @@
 #ifndef FF_FSA_H
 #define FF_FSA_H
 
+//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
+
 //TODO: actually compile this; probably full of syntax errors.
 
 #include <stdint.h> //C99
@@ -10,6 +12,7 @@
 #include "value_array.h" // used to hold state
 #include "tdict.h"
 #include "hg.h"
+#include "sentences.h"
 
 typedef ValueArray<uint8_t> Bytes;
 
@@ -18,92 +21,171 @@ typedef ValueArray<uint8_t> Bytes;
 
   state is some fixed width byte array.  could actually be a void *, WordID sequence, whatever.
 
- */
+*/
 
-// it's not necessary to inherit from this.
+// it's not necessary to inherit from this, but you probably should to save yourself some boilerplate.  defaults to no-state
 struct FsaFeatureFunctionBase {
-  std::string name,usage_short,usage_verbose;
-  int fid; // you can have more than 1 feature of course.
-  void InitFid() { // call this, though, if you have a single feature
-    fid=FD::Convert(name);
+protected:
+  Bytes start,h_start; // start state and estimated-features (heuristic) start state.  set these.  default empty.
+  Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm.
+  int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course)
+  void set_state_bytes(int sb=0) {
+    state_bytes_=sb;
   }
-  std::string usage(bool param,bool verbose) {
-    return FeatureFunction::usage_helper(name,usage_short,usage_verbose,param,verbose);
+
+  int fid_; // you can have more than 1 feature of course.
+  void init_fid(std::string const& name) { // call this, though, if you have a single feature
+    fid_=FD::Convert(name);
   }
+public:
 
-  FsaFeatureFunctionBase(std::string const& name,std::string const& usage_verbose="[no documentation yet]",std::string const& usage_short="[no parameters]") : name(name),usage_short(usage_short),usage_verbose(usage_verbose) {  }
+  // return m: all strings x with the same final m+1 letters must end in this state
+  /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */
+  int markov_order() const { return 0; } // override if you use state.  order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though)
+  //TODO: if we wanted, we could mark certain states as maximal-context, but this would lose our fixed amount of left context in ff_from_fsa, and lose also our vector operations (have to scan left words 1 at a time, checking always to see where you change from h to inside - BUT, could detect equivalent LM states, which would be nice).
 
-  int state_bytes; // don't forget to set this (it may depend on params of course)
-};
+  Features features() const { // override this if >1 fid
+    return FeatureFunction::single_feature(fid_);
+  }
 
-// example: feature val = -1 * # of target words
-struct TargetPenaltyFsa : public FsaFeatureFunctionBase {
-  TargetPenaltyFsa(std::string const& param) : FsaFeatureFunctionBase("TargetPenalty","","-1 per target word") { InitFid(); }
-  const float val_per_target_word=-1;
-  // state for backoff
+  // override this (static)
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper("unnamed_fsa_feature","","",param,verbose);
+  }
+  int state_bytes() const { return state_bytes_; } // or override this
+  void const* start_state() const {
+    return start.begin();
+  }
+  void const * heuristic_start_state() const {
+    return h_start.begin();
+  }
+  Sentence const& end_phrase() const { return end_phrase_; }
+  // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates.  state and next_state will never be the same memory.
+  //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support.  however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping)
 
-  // scan
-  void Scan(SentenceMetadata const& smeta,WordID x,void const* prev_state,FeatureVector *features) {
-    features->set_value(fid,val_per_target_word);
+  // NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members
+  void Scan(SentenceMetadata const& smeta,WordID x,void const* state,void *next_state,FeatureVector *features) const {
   }
 
-  // heuristic estimate of phrase
-  void Heuristic(WordID const* begin, WordID const* end,FeatureVector *h_features)
+  // don't set state-bytes etc. in ctor because it may depend on parsing param string
+  FsaFeatureFunctionBase() : start(0),h_start(0),state_bytes_(0) {  }
 
-  // return m: all strings x with the same final m+1 letters must end in this state
-  /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */
-  int MarkovOrder() const {
-    return 0;
+};
+
+
+
+// init state is in cs; scans words [i,end), writing successive states alternately into ns and cs.  returns whichever of cs/ns holds the resulting state (cs itself for an empty range)
+template <class FsaFF>
+void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features, void *cs,void *ns) {
+  if (i>=end) return cs; // empty range (possibly NULL,NULL): state unchanged, and end-1 below must not be formed from a NULL end
+  void *os,*es;
+  WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking
+  if ((end-i)&1) { // odd # of words
+    os=cs;
+    es=ns;
+    i-=1;
+    goto odd;
+  } else {
+    es=cs;
+    os=ns;
+  }
+  for (;i<e;i+=2) {
+    ff.Scan(smeta,*i,es,os,h_features); // e->o
+  odd:
+    ff.Scan(smeta,*(i+1),os,es,h_features); // o->e
+  }
+  return es;
+}
+
+// do not use if state size is 0, please.
+const bool optimize_FsaScanner_zerostate=false;
+
+template <class FF>
+struct FsaScanner {
+//  enum {ALIGN=8};
+  static const int ALIGN=8;
+  FF const& ff;
+  SentenceMetadata const& smeta;
+  int ssz; // ff.state_bytes()
+  Bytes states; // first is at begin, second is at (char*)begin+stride
+  void *st0; // states
+  void *st1; // states+stride
+  void *cs; // current state: always st0 or st1
+  inline void *nexts() const {
+    return (cs==st0)?st1:st0;
+  }
+  FsaScanner(FF const& ff,SentenceMetadata const& smeta) : ff(ff),smeta(smeta)
+  {
+    ssz=ff.state_bytes();
+    int stride=((ssz+ALIGN-1)/ALIGN)*ALIGN; // round up to multiple of ALIGN
+    states.resize(stride+ssz);
+    st0=states.begin();
+    st1=(char*)st0+stride;
+    cs=st0; // start on the first (zero-filled) buffer so scan()/nexts() are well defined even before reset()
+  }
+  void reset(void const* state) {
+    cs=st0;
+    std::memcpy(st0,state,ssz);
+  }
+  void scan(WordID w,FeatureVector *feat) {
+    if (optimize_FsaScanner_zerostate && !ssz) {
+      ff.Scan(smeta,w,0,0,feat);
+      return;
+    }
+    void *ns=nexts();
+    ff.Scan(smeta,w,cs,ns,feat);
+    cs=ns;
   }
 
+  void scan(WordID const* i,WordID const* end,FeatureVector *feat) {
+#if 1
+    // faster.
+    if (optimize_FsaScanner_zerostate && !ssz)
+      for (;i<end;++i)
+        ff.Scan(smeta,*i,0,0,feat);
+    else
+      cs=FsaScan(ff,smeta,i,end,feat,cs,nexts());
+#else
+    for (;i<end;++i)
+      scan(*i,feat);
+#endif
+  }
 };
 
-//TODO: combine 2 FsaFeatures typelist style (can recurse for more)
 
-// the type-erased interface
-struct FsaFeatureFunction {
-  virtual int MarkovOrder() const = 0;
-  virtual ~FsaFeatureFunction();
+template <class FF>
+void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) {
+  const int nbytes=ff.state_bytes();
+  if (!nbytes) { // no state to track: score each word of [i,end) with null states
+    for (WordID const* w=i;w<end;++w)
+      ff.Scan(smeta,*w,0,0,h_features);
+    return;
+  }
+  Bytes cur(nbytes),next(nbytes); // scratch pair that FsaScan alternates between; the final state is discarded
+  std::memcpy(cur.begin(),start_state,ff.state_bytes());
+  FsaScan(ff,smeta,i,end,h_features,cur.begin(),next.begin());
+}
 
-};
 
-// conforming to above interface, type erases FsaImpl
-// you might be wondering: why do this?  answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch
-template <class Impl>
-struct FsaFeatureFunctionDynamic : public FsaFeatureFunction {
-  Impl& d() { return static_cast<Impl&>(*this); }
-  Impl const& d() { return static_cast<Impl const&>(*this); }
-  int MarkovOrder() const { return d().MarkovOrder(); }
-};
+//TODO: combine 2 FsaFeatures typelist style (can recurse for more)
 
-//TODO: combine 2 (or N) FsaFeatureFunction (type erased)
-
-/* regular bottom up scorer from Fsa feature
-   uses guarantee about markov order=N to score ASAP
-   encoding of state: if less than N-1 (ctxlen) words
-
-   either:
-   struct FF : public FsaImpl,FeatureFunctionFromFsa<FF> (more efficient)
-
-   or:
-   struct FF : public FsaFeatureFunctionDynamic,FeatureFunctionFromFsa<FF> (code sharing, but double dynamic dispatch)
- */
-
-template <class Impl>
-struct FeatureFunctionFromFsa : public FeatureFunction {
-  Impl& d() { return static_cast<Impl&>(*this); }
-  Impl const& d() { return static_cast<Impl const&>(*this); }
-  int M; // markov order (ctx len)
-  FeatureFunctionFromFsa() {  }
-  Init() {
-    name=d().name;
-    M=d().MarkovOrder
-    SetStateSize(sizeof(WordID)*2*M);
-  } // can't do this in constructor because we come before d() in order
-
-  virtual Features Features() const { return d().Features(); }
-  bool rule_feature() const {
-    return StateSize()==0; // Fsa features don't get info about span
+// example: stateless fsa feature whose value is -1 * # of target words scanned
+struct WordPenaltyFsa : public FsaFeatureFunctionBase {
+  WordPenaltyFsa(std::string const& param) { // param is not used
+    init_fid(usage(false,false)); // feature id comes from the string usage(false,false) returns - presumably just the name "WordPenaltyFsa"; confirm against usage_helper
+    return;
+    //unreachable (after the return above); kept only to show the defaults a feature could set here:
+    set_state_bytes(0);
+    start.clear();
+    h_start.clear();
+  }
+  static const float val_per_target_word=-1; // NOTE(review): in-class initializer on a static const float is a compiler extension, not standard C++ - confirm the supported compilers accept it
+  // ignores x, state and next_state (all may be null); adds val_per_target_word to feature fid_ once per call, i.e. once per scanned word
+  void Scan(SentenceMetadata const& smeta,WordID x,void const* state,void *next_state,FeatureVector *features) const {
+    features->add_value(fid_,val_per_target_word);
+  }
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper("WordPenaltyFsa","","-1 per target word",param,verbose);
   }
 
 };
diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h
new file mode 100755
index 00000000..79672bdc
--- /dev/null
+++ b/decoder/ff_fsa_dynamic.h
@@ -0,0 +1,29 @@
+#ifndef FF_FSA_DYNAMIC_H
+#define FF_FSA_DYNAMIC_H
+
+#include "ff_fsa.h"
+
+// the type-erased interface
+/*
+struct FsaFeatureFunction {
+  virtual int markov_order() const = 0;
+  virtual ~FsaFeatureFunction();
+
+};
+
+// conforming to above interface, type erases FsaImpl
+// you might be wondering: why do this?  answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch
+template <class Impl>
+struct FsaFeatureFunctionDynamic : public FsaFeatureFunction {
+  Impl& d() { return static_cast<Impl&>(*this); }
+  Impl const& d() const { return static_cast<Impl const&>(*this); }
+  int markov_order() const { return d().markov_order(); }
+};
+
+//TODO: wrap every method in concrete fsaff and declare in interface above.
+//TODO: combine 2 (or N) FsaFeatureFunction (type erased)
+
+*/
+
+
+#endif
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 15e3f20e..28312b4f 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -535,7 +535,7 @@ LanguageModel::LanguageModel(const string& param) {
   SetStateSize(LanguageModelImpl::OrderToStateSize(order));
 }
 
-FeatureFunction::Features LanguageModel::features() const {
+Features LanguageModel::features() const {
   return single_feature(fid_);
 }
 
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
new file mode 100755
index 00000000..344cd992
--- /dev/null
+++ b/decoder/ff_lm_fsa.h
@@ -0,0 +1,15 @@
+#ifndef FF_LM_FSA_H
+#define FF_LM_FSA_H
+
+#include "ff_lm.h"
+#include "ff_from_fsa.h"
+
+struct LanguageModelFsa : public FsaFeatureFunctionBase { // struct, not class: FeatureFunctionFromFsa<LanguageModelFsa> needs public access to the constructor and usage()
+  static std::string usage(bool,bool);
+  LanguageModelFsa(std::string const& param);
+  // implementations in ff_lm.cc
+};
+
+typedef FeatureFunctionFromFsa<LanguageModelFsa> LanguageModelFromFsa;
+
+#endif
diff --git a/decoder/sentences.h b/decoder/sentences.h
index 6ab216bf..482d3be9 100755
--- a/decoder/sentences.h
+++ b/decoder/sentences.h
@@ -7,8 +7,30 @@
 #include "filelib.h"
 #include "tdict.h"
 #include "stringlib.h"
+#include <cstring>
 typedef std::vector<WordID> Sentence;
 
+// raw-pointer "iterators" over s; invalidated if s is modified.  an empty s gives begin==end==NULL.  the end iterator itself is never dereferenced (doing so is undefined behavior)
+inline WordID const* begin(Sentence const& s) {
+  return s.empty() ? 0 : &s[0];
+}
+inline WordID const* end(Sentence const& s) {
+  return s.empty() ? 0 : &s[0]+s.size();
+}
+inline WordID * begin(Sentence & s) {
+  return s.empty() ? 0 : &s[0];
+}
+inline WordID * end(Sentence & s) {
+  return s.empty() ? 0 : &s[0]+s.size();
+}
+inline void wordcpy(WordID *dest,WordID const* src,int n) { // copy n words from src to dest (byte-wise, via memcpy)
+  std::memcpy(dest,src,sizeof(WordID)*n);
+}
+inline void wordcpy(WordID *dest,WordID const* src,WordID const* src_end) { // copy the words [src,src_end) to dest
+  int const nwords=src_end-src; wordcpy(dest,src,nwords);
+}
+
+
 inline std::ostream & operator<<(std::ostream &out,Sentence const& s) {
   return out<<TD::GetString(s);
 }
diff --git a/decoder/small_vector.h b/decoder/small_vector.h
index 202b72c9..b5d86231 100644
--- a/decoder/small_vector.h
+++ b/decoder/small_vector.h
@@ -254,4 +254,9 @@ public:
 
 typedef SmallVector<int,2> SmallVectorInt;
 
+template <class T,int N>
+void memcpy(void *out,SmallVector<T,N> const& v) { // copy every element of v, as raw bytes, to out
+  std::memcpy(out,v.begin(),sizeof(T)*v.size());
+}
+
 #endif
diff --git a/decoder/value_array.h b/decoder/value_array.h
index 042247a1..0cb5c3d6 100755
--- a/decoder/value_array.h
+++ b/decoder/value_array.h
@@ -1,12 +1,15 @@
 #ifndef VALUE_ARRAY_H
 #define VALUE_ARRAY_H
 
+//TODO: option for non-constructed version (type_traits pod?), option for small array optimization (if sz < N, store inline in union, see small_vector.h)
+
 #include <cstdlib>
 #include <algorithm>
 #include <new>
 #include <boost/range.hpp>
 #include <boost/utility/enable_if.hpp>
 #include <boost/type_traits.hpp>
+#include <cstring>
 #ifdef USE_BOOST_SERIALIZE
 # include <boost/serialization/split_member.hpp>
 # include <boost/serialization/access.hpp>
@@ -17,7 +20,7 @@ template <class T, class A = std::allocator<T> >
 class ValueArray : A // private inheritance so stateless allocator adds no size.
 {
 public:
-  const int SV_MAX=sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1;
+  static const int SV_MAX=sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1;
   //space optimization: SV_MAX T will fit inside what would otherwise be a pointer to heap data.  todo in the far future if bored.
   typedef T value_type;
   typedef T& reference;
@@ -51,11 +54,21 @@ public:
   ValueArray() : sz(0), array(NULL) {}
 
   explicit ValueArray(size_type s, const_reference t = T())
-    : sz(s)
-    , array(A::allocate(s))
   {
+    init(s,t);
+  }
+
+protected:
+  inline void init(size_type s, const_reference t = T()) { // allocate s elements and construct each as a copy of t
+    sz=s; // overwrites sz/array without releasing any previous allocation
+    array=A::allocate(s);
     for (size_type i = 0; i != sz; ++i) { A::construct(array + i,t); }
   }
+public:
+  void resize(size_type s, const_reference t = T()) { // unlike std::vector::resize, old contents are not kept: all s elements become copies of t
+    clear(); // discard the current elements first
+    init(s,t);
+  }
 
   template <class I>
   ValueArray(I itr, I end)
@@ -65,7 +78,11 @@ public:
     copy_construct(itr,end,array);
   }
 
-  ~ValueArray()
+  ~ValueArray() {
+    clear();
+  }
+
+  void clear()
   {
     for (size_type i = sz; i != 0; --i) {
       A::destroy(array + (i - 1));
@@ -160,6 +177,10 @@ bool operator< (ValueArray<T,A> const& v1, ValueArray<T,A> const& v2)
                                        , v2.end() );
 }
 
+template <class T,class A>
+void memcpy(void *out,ValueArray<T,A> const& v) { // copy every element of v, as raw bytes, to out
+  std::memcpy(out,v.begin(),sizeof(T)*v.size());
+}
 
 
 #endif