From bb86637332d49f71c485df34576e464eaf053656 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 13 Sep 2011 17:36:23 +0100
Subject: get rid of bad Weights class so it no longer keeps a copy of a vector
 inside it

---
 training/mpi_online_optimize.cc | 69 ++++++++++++++----------------------------
 1 file changed, 24 insertions(+), 45 deletions(-)
(limited to 'training/mpi_online_optimize.cc')

diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 32033c19..2ef4a2e7 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -31,35 +31,6 @@ namespace mpi = boost::mpi;
 using namespace std;
 namespace po = boost::program_options;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
@@ -250,10 +221,25 @@ int main(int argc, char** argv) {
   if (!InitCommandLine(argc, argv, &conf))
     return 1;
 
+  vector<pair<string, int> > agenda;
+  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
+    return 1;
+  if (rank == 0)
+    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
+
+  assert(agenda.size() > 0);
+
+  if (1) { // hack to load the feature hash functions -- TODO this should not be in cdec.ini
+    const string& cur_config = agenda[0].first;
+    const unsigned max_iteration = agenda[0].second;
+    ReadFile ini_rf(cur_config);
+    Decoder decoder(ini_rf.stream());
+  }
+
   // load initial weights
-  Weights weights;
+  vector<weight_t> init_weights;
   if (conf.count("input_weights"))
-    weights.InitFromFile(conf["input_weights"].as<string>());
+    Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
 
   vector<int> frozen_fids;
   if (conf.count("frozen_features")) {
@@ -310,19 +296,12 @@ int main(int argc, char** argv) {
     rng.reset(new MT19937);
 
   SparseVector<double> x;
-  weights.InitSparseVector(&x);
+  Weights::InitSparseVector(init_weights, &x);
   TrainingObserver observer;
 
   int write_weights_every_ith = 100; // TODO configure
   int titer = -1;
 
-  vector<pair<string, int> > agenda;
-  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
-    return 1;
-  if (rank == 0)
-    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
-
-  vector<double> lambdas;
   for (int ai = 0; ai < agenda.size(); ++ai) {
     const string& cur_config = agenda[ai].first;
     const unsigned max_iteration = agenda[ai].second;
@@ -331,6 +310,8 @@ int main(int argc, char** argv) {
     // load cdec.ini and set up decoder
     ReadFile ini_rf(cur_config);
     Decoder decoder(ini_rf.stream());
+    vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+    if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); }
 
     if (rank == 0)
       o->ResetEpoch(); // resets the learning rate-- TODO is this good?
@@ -341,15 +322,13 @@ int main(int argc, char** argv) {
 #ifdef HAVE_MPI
       mpi::timer timer;
 #endif
-      weights.InitFromVector(x);
-      weights.InitVector(&lambdas);
+      x.init_vector(&lambdas);
       ++iter; ++titer;
       observer.Reset();
-      decoder.SetWeights(lambdas);
       if (rank == 0) {
         converged = (iter == max_iteration);
-        SanityCheck(lambdas);
-        ShowLargestFeatures(lambdas);
+        Weights::SanityCheck(lambdas);
+        Weights::ShowLargestFeatures(lambdas);
         string fname = "weights.cur.gz";
         if (iter % write_weights_every_ith == 0) {
           ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
@@ -360,7 +339,7 @@ int main(int argc, char** argv) {
           vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer);
           const string svv = vv.str();
           cerr << svv << endl;
-          weights.WriteToFile(fname, true, &svv);
+          Weights::WriteToFile(fname, lambdas, true, &svv);
         }
         for (int i = 0; i < size_per_proc; ++i) {
--
cgit v1.2.3

From 08c4a7fae8f0bec4f76c4e0928e357100eb7a1ca Mon Sep 17 00:00:00 2001
From: Guest_account Guest_account prguest11
Date: Tue, 11 Oct 2011 16:16:53 +0100
Subject: remove implicit conversion-to-double operator from LogVal that caused
 overflow errors, clean up some pf code

---
 decoder/aligner.cc              |  2 +-
 decoder/cfg.cc                  |  2 +-
 decoder/cfg_format.h            |  2 +-
 decoder/decoder.cc              | 10 ++++----
 decoder/hg.cc                   |  4 ++--
 decoder/rule_lexer.l            |  2 ++
 decoder/trule.h                 | 15 +++++++++++-
 gi/pf/brat.cc                   | 11 ---------
 gi/pf/cbgi.cc                   | 10 --------
 gi/pf/dpnaive.cc                | 12 ----------
 gi/pf/itg.cc                    | 11 ---------
 gi/pf/pfbrat.cc                 | 11 ---------
 gi/pf/pfdist.cc                 | 11 ---------
 gi/pf/pfnaive.cc                | 11 ---------
 mteval/mbr_kbest.cc             |  4 ++--
 phrasinator/ccrp_nt.h           | 24 +++++++++++++++----
 training/mpi_batch_optimize.cc  |  2 +-
 training/mpi_compute_cllh.cc    | 51 +++++++++++++++++++----------------------
 training/mpi_online_optimize.cc |  4 ++--
 utils/logval.h                  | 10 ++++----
 20 files changed, 78 insertions(+), 131 deletions(-)
(limited to 'training/mpi_online_optimize.cc')

diff --git a/decoder/aligner.cc b/decoder/aligner.cc
index 292ee123..53e059fb 100644
--- a/decoder/aligner.cc
+++ b/decoder/aligner.cc
@@ -165,7 +165,7 @@ inline void WriteProbGrid(const Array2D<prob_t>& m, ostream* pos) {
       if (m(i,j) == prob_t::Zero()) {
         os << "\t---X---";
       } else {
-        snprintf(b, 1024, "%0.5f", static_cast<double>(m(i,j)));
+        snprintf(b, 1024, "%0.5f", m(i,j).as_float());
         os << '\t' << b;
       }
     }
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 651978d2..cd7e66e9 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -639,7 +639,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {
     o << '['<& src, SparseVector* trg) {
   for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it)
-    trg->set_value(it->first, it->second);
+    trg->set_value(it->first, it->second.as_float());
 }
};
@@ -788,10 +788,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   const bool show_tree_structure=conf.count("show_tree_structure");
   if (!SILENT) forest_stats(forest,"  Init.
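A minimal sketch of the usage pattern the commit above introduces, assuming weight_t is double (as in cdec) and the static Weights helpers and Decoder::CurrentWeightVector() shown in the diff; the wrapper function and file name here are hypothetical:

    // sketch only: static Weights helpers over a caller-owned vector (no stored copy)
    #include <string>
    #include <vector>

    typedef double weight_t;  // assumption: cdec-style weight type

    struct Decoder {          // declaration as used in the diff above
      std::vector<weight_t>& CurrentWeightVector();
    };

    struct Weights {          // now static-only; signatures inferred from the diff
      static void InitFromFile(const std::string& fname, std::vector<weight_t>* w);
      static void WriteToFile(const std::string& fname,
                              const std::vector<weight_t>& w,
                              bool hide_zero_value_features,
                              const std::string* extra);
    };

    void SeedDecoderWeights(Decoder& decoder) {   // hypothetical helper
      std::vector<weight_t> init_weights;
      Weights::InitFromFile("weights.init.gz", &init_weights);  // hypothetical file
      // the decoder owns the live weight vector; seed it once, then update in place
      std::vector<weight_t>& lambdas = decoder.CurrentWeightVector();
      lambdas.swap(init_weights);
      init_weights.clear();
    }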
forest",show_tree_structure,oracle.show_derivation); if (conf.count("show_expected_length")) { - const PRPair res = - Inside, - PRWeightFunction >(forest); - cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; + const PRPair res = + Inside, + PRWeightFunction >(forest); + cerr << " Expected length (words): " << (res.r / res.p).as_float() << "\t" << res << endl; } if (conf.count("show_partition")) { diff --git a/decoder/hg.cc b/decoder/hg.cc index 3ad17f1a..180986d7 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -157,14 +157,14 @@ prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector* posts) co const ScaledEdgeProb weight(scale); const ScaledTransitionEventWeightFunction w2(scale); SparseVector pv; - const double inside = InsideOutside, ScaledTransitionEventWeightFunction>(*this, &pv, weight, w2); posts->resize(edges_.size()); for (int i = 0; i < edges_.size(); ++i) (*posts)[i] = prob_t(pv.value(i)); - return prob_t(inside); + return inside; } prob_t Hypergraph::ComputeBestPathThroughEdges(vector* post) const { diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l index 9331d8ed..083a5bb1 100644 --- a/decoder/rule_lexer.l +++ b/decoder/rule_lexer.l @@ -220,6 +220,8 @@ NT [^\t \[\],]+ std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n"; abort(); } + // const bool ignore_grammar_features = false; + // if (ignore_grammar_features) scfglex_num_feats = 0; TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity, scfglex_als, scfglex_num_als)); check_and_update_ctf_stack(rp); TRulePtr coarse_rp = ((ctf_level == 0) ? TRulePtr() : ctf_rule_stack.top()); diff --git a/decoder/trule.h b/decoder/trule.h index 4df4ec90..8eb2a059 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -5,7 +5,9 @@ #include #include #include -#include + +#include "boost/shared_ptr.hpp" +#include "boost/functional/hash.hpp" #include "sparse_vector.h" #include "wordid.h" @@ -162,4 +164,15 @@ class TRule { bool SanityCheck() const; }; +inline size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +inline bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + #endif diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 4c6ba3ef..7b60ef23 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; struct FSTState; -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc index 20204e8a..97f1ba34 100644 --- a/gi/pf/cbgi.cc +++ b/gi/pf/cbgi.cc @@ -27,16 +27,6 @@ double log_decay(unsigned x, const double& b) { return log(b - 1) - x * log(b); } -size_t hash_value(const TRule& r) { - // TODO fix hash function - size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_; - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == 
b.f_); -} - struct SimpleBase { SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : uniform_e(-log(esize)), diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index 582d1be7..608f73d5 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -20,18 +20,6 @@ namespace po = boost::program_options; static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index 2c2a86f9..ac3c16a3 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,17 +27,6 @@ ostream& operator<<(ostream& os, const vector& p) { return os << ']'; } -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 4c6ba3ef..7b60ef23 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; struct FSTState; -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index 18dfd03b..81abd61b 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -24,17 +24,6 @@ namespace po = boost::program_options; shared_ptr prng; -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 43c604c3..c30e7c4f 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -24,17 +24,6 @@ namespace po = boost::program_options; shared_ptr prng; -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 2867b36b..64a6a8bf 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -32,7 +32,7 @@ void InitCommandLine(int 
argc, char** argv, po::variables_map* conf) { } struct LossComparer { - bool operator()(const pair, double>& a, const pair, double>& b) const { + bool operator()(const pair, prob_t>& a, const pair, prob_t>& b) const { return a.second < b.second; } }; @@ -108,7 +108,7 @@ int main(int argc, char** argv) { ScoreP s = scorer->ScoreCandidate(list[j].first); double loss = 1.0 - s->ComputeScore(); if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); + double weighted_loss = loss * (joints[j] / marginal).as_float(); wl_acc += weighted_loss; if ((!output_list) && wl_acc > mbr_loss) break; } diff --git a/phrasinator/ccrp_nt.h b/phrasinator/ccrp_nt.h index 163b643a..811bce73 100644 --- a/phrasinator/ccrp_nt.h +++ b/phrasinator/ccrp_nt.h @@ -50,15 +50,26 @@ class CCRP_NoTable { return it->second; } - void increment(const Dish& dish) { - ++custs_[dish]; + int increment(const Dish& dish) { + int table_diff = 0; + if (++custs_[dish] == 1) + table_diff = 1; ++num_customers_; + return table_diff; } - void decrement(const Dish& dish) { - if ((--custs_[dish]) == 0) + int decrement(const Dish& dish) { + int table_diff = 0; + int nc = --custs_[dish]; + if (nc == 0) { custs_.erase(dish); + table_diff = -1; + } else if (nc < 0) { + std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; + abort(); + } --num_customers_; + return table_diff; } double prob(const Dish& dish, const double& p0) const { @@ -66,6 +77,11 @@ class CCRP_NoTable { return (at_table + p0 * concentration_) / (num_customers_ + concentration_); } + double logprob(const Dish& dish, const double& logp0) const { + const unsigned at_table = num_customers(dish); + return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + } + double log_crp_prob() const { return log_crp_prob(concentration_); } diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index 0ba8c530..046e921c 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -92,7 +92,7 @@ struct TrainingObserver : public DecoderObserver { void SetLocalGradientAndObjective(vector* g, double* o) const { *o = acc_obj; for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - (*g)[it->first] = it->second; + (*g)[it->first] = it->second.as_float(); } virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc index b496d196..d5caa745 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/mpi_compute_cllh.cc @@ -1,6 +1,4 @@ -#include #include -#include #include #include #include @@ -12,6 +10,7 @@ #include #include +#include "sentence_metadata.h" #include "verbose.h" #include "hg.h" #include "prob.h" @@ -52,7 +51,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { return true; } -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { +void ReadInstances(const string& fname, int rank, int size, vector* c) { + assert(fname != "-"); ReadFile rf(fname); istream& in = *rf.stream(); string line; @@ -60,20 +60,16 @@ void ReadTrainingCorpus(const string& fname, int rank, int size, vector* while(in) { getline(in, line); if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } + if (lc % size == rank) c->push_back(line); ++lc; } } static const double kMINUS_EPSILON = -1e-6; -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_obj = 0; 
- } +struct ConditionalLikelihoodObserver : public DecoderObserver { + + ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {} virtual void NotifyDecodingStart(const SentenceMetadata&) { cur_obj = 0; @@ -120,8 +116,10 @@ struct TrainingObserver : public DecoderObserver { } assert(!isnan(log_ref_z)); acc_obj += (cur_obj - log_ref_z); + trg_words += smeta.GetReference().size(); } + unsigned trg_words; double acc_obj; double cur_obj; int state; @@ -161,35 +159,32 @@ int main(int argc, char** argv) { if (conf.count("weights")) Weights::InitFromFile(conf["weights"].as(), &weights); - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - - vector corpus; vector ids; - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + vector corpus; + ReadInstances(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - - TrainingObserver observer; - double objective = 0; - observer.Reset(); if (rank == 0) - cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; + cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n"; - for (int i = 0; i < corpus.size(); ++i) { - decoder.SetId(ids[i]); + ConditionalLikelihoodObserver observer; + for (int i = 0; i < corpus.size(); ++i) decoder.Decode(corpus[i], &observer); - } + double objective = 0; + unsigned total_words = 0; #ifdef HAVE_MPI reduce(world, observer.acc_obj, objective, std::plus(), 0); + reduce(world, observer.trg_words, total_words, std::plus(), 0); #else objective = observer.acc_obj; #endif - if (rank == 0) - cout << "OBJECTIVE: " << objective << endl; + if (rank == 0) { + cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl; + cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl; + cout << " CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl; + cout << " PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl; + } return 0; } diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 2ef4a2e7..f87b7274 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -94,7 +94,7 @@ struct TrainingObserver : public DecoderObserver { void SetLocalGradientAndObjective(vector* g, double* o) const { *o = acc_obj; for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - (*g)[it->first] = it->second; + (*g)[it->first] = it->second.as_float(); } virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { @@ -158,7 +158,7 @@ struct TrainingObserver : public DecoderObserver { void GetGradient(SparseVector* g) const { g->clear(); for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - g->set_value(it->first, it->second); + g->set_value(it->first, it->second.as_float()); } int total_complete; diff --git a/utils/logval.h b/utils/logval.h index 6fdc2c42..8a59d0b1 100644 --- a/utils/logval.h +++ b/utils/logval.h @@ -25,12 +25,13 @@ class LogVal { typedef LogVal Self; LogVal() : s_(), v_(LOGVAL_LOG0) {} - explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + const Self& operator=(double x) { s_ = std::signbit(x); v_ = s_ ? 
std::log(-x) : std::log(x); return *this; } LogVal(init_minus_1) : s_(true),v_(0) { } LogVal(init_1) : s_(),v_(0) { } LogVal(init_0) : s_(),v_(LOGVAL_LOG0) { } - LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} - LogVal(unsigned x) : s_(0), v_(std::log(x)) { } + explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} + explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { } LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} LogVal(double lnx,init_lnx) : s_(),v_(lnx) {} static Self exp(T lnx) { return Self(lnx,false); } @@ -141,9 +142,6 @@ class LogVal { return pow(1/root); } - operator T() const { - if (s_) return -std::exp(v_); else return std::exp(v_); - } T as_float() const { if (s_) return -std::exp(v_); else return std::exp(v_); } -- cgit v1.2.3 From 708e8220d2cbc41fbbf2e8a713edf1d2cf3bc767 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 14 Oct 2011 11:51:12 +0100 Subject: remove FSA integration code. will have to be resurrected another day --- decoder/Makefile.am | 1 - decoder/apply_fsa_models.cc | 798 ---------------------------------------- decoder/cdec_ff.cc | 13 - decoder/feature_accum.h | 129 ------- decoder/ff_factory.h | 2 - decoder/ff_from_fsa.h | 304 --------------- decoder/ff_fsa.h | 401 -------------------- decoder/ff_fsa_data.h | 131 ------- decoder/ff_fsa_dynamic.h | 208 ----------- decoder/ff_lm.cc | 48 --- decoder/ff_lm_fsa.h | 140 ------- decoder/ff_register.h | 38 -- decoder/hg_test.cc | 16 +- training/mpi_online_optimize.cc | 2 + 14 files changed, 10 insertions(+), 2221 deletions(-) delete mode 100755 decoder/apply_fsa_models.cc delete mode 100755 decoder/feature_accum.h delete mode 100755 decoder/ff_from_fsa.h delete mode 100755 decoder/ff_fsa.h delete mode 100755 decoder/ff_fsa_data.h delete mode 100755 decoder/ff_fsa_dynamic.h delete mode 100755 decoder/ff_lm_fsa.h (limited to 'training/mpi_online_optimize.cc') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index ede1cff0..6b9360d8 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -42,7 +42,6 @@ libcdec_a_SOURCES = \ cfg.cc \ dwarf.cc \ ff_dwarf.cc \ - apply_fsa_models.cc \ rule_lexer.cc \ fst_translator.cc \ csplit.cc \ diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc deleted file mode 100755 index 3e93cadd..00000000 --- a/decoder/apply_fsa_models.cc +++ /dev/null @@ -1,798 +0,0 @@ -//see apply_fsa_models.README for notes on the l2r earley fsa+cfg intersection -//implementation in this file (also some comments in this file) -#define SAFE_VALGRIND 1 - -#include "apply_fsa_models.h" -#include -#include -#include -#include - -#include "writer.h" -#include "hg.h" -#include "ff_fsa_dynamic.h" -#include "ff_from_fsa.h" -#include "feature_vector.h" -#include "stringlib.h" -#include "apply_models.h" -#include "cfg.h" -#include "hg_cfg.h" -#include "utoa.h" -#include "hash.h" -#include "value_array.h" -#include "d_ary_heap.h" -#include "agenda.h" -#include "show.h" -#include "string_to.h" - - -#define DFSA(x) x -//fsa earley chart - -#define DPFSA(x) x -//prefix trie - -#define DBUILDTRIE(x) - -#define PRINT_PREFIX 1 -#if PRINT_PREFIX -# define IF_PRINT_PREFIX(x) x -#else -# define IF_PRINT_PREFIX(x) -#endif -// keep backpointers in prefix trie so you can print a meaningful node id - -static const unsigned FSA_AGENDA_RESERVE=10; // TODO: increase to 1<<24 (16M) - -using namespace std; - -//impl details (not exported). flat namespace for my ease. 
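// Illustrative aside (not from the original source): a minimal sketch of the
// "rule prefix trie with cost pushing" scheme described below as option 1 --
// each node indexes a recognized rule prefix (A -> x . *) and stores the best
// rule probability reachable through it, so expansion can proceed lazily,
// best-first (cf. left-branching binarization). Names here are hypothetical.
#include <map>
struct SketchTrieNode {
  double best;                             // max prob of any rule below this node
  std::map<WordID, SketchTrieNode*> next;  // one edge per RHS symbol (word or -NT)
  explicit SketchTrieNode(double p = 0) : best(p) {}
  SketchTrieNode* extend(WordID sym, double p) {
    SketchTrieNode*& d = next[sym];
    if (!d) d = new SketchTrieNode(p);
    else if (p > d->best) d->best = p;     // cf. maybe_improve() in the real code
    return d;
  }
};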
- -typedef CFG::RHS RHS; -typedef CFG::BinRhs BinRhs; -typedef CFG::NTs NTs; -typedef CFG::NT NT; -typedef CFG::NTHandle NTHandle; -typedef CFG::Rules Rules; -typedef CFG::Rule Rule; -typedef CFG::RuleHandle RuleHandle; - -namespace { - -/* - -1) A -> x . * (trie) - -this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps? - -vs. just - -2) * -> x . y - -here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first. - -vs. - -3) * -> x . * - -with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all. - -I'm using option 1. -*/ - -// if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently. if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion. this is effectively left-branching binarization, of course. - -template -struct fsa_map_type { - typedef std::map type; // change to HASH_MAP ? -}; -//template typedef - and macro to make it less painful -#define FSA_MAP(k,v) fsa_map_type >::type - -struct PrefixTrieNode; -typedef PrefixTrieNode *NodeP; -typedef PrefixTrieNode const *NodePc; - -// for debugging prints only -struct TrieBackP { - WordID w; - NodePc from; - TrieBackP(WordID w=0,NodePc from=0) : w(w),from(from) { } -}; - -FsaFeatureFunction const* print_fsa=0; -CFG const* print_cfg=0; -inline ostream& print_cfg_rhs(std::ostream &o,WordID w,CFG const*pcfg=print_cfg) { - if (pcfg) - pcfg->print_rhs_name(o,w); - else - CFG::static_print_rhs_name(o,w); - return o; -} - -inline std::string nt_name(WordID n,CFG const*pcfg=print_cfg) { - if (pcfg) return pcfg->nt_name(n); - return CFG::static_nt_name(n); -} - -template -ostream& print_by_nt(std::ostream &o,V const& v,CFG const*pcfg=print_cfg,char const* header="\nNT -> X\n") { - o< "< -ostream& print_map_by_nt(std::ostream &o,V const& v,CFG const*pcfg=print_cfg,char const* header="\nNT -> X\n") { - o<first,pcfg) << " -> "<second<<"\n"; - } - return o; -} - -struct PrefixTrieEdge { - PrefixTrieEdge() - // : dest(0),w(TD::max_wordid) - {} - PrefixTrieEdge(WordID w,NodeP dest) - : dest(dest),w(w) - {} -// explicit PrefixTrieEdge(best_t p) : p(p),dest(0) { } - - best_t p;// viterbi additional prob, i.e. product over path incl. p_final = total rule prob. note: for final edge, set this. - //DPFSA() - // we can probably just store deltas, but for debugging remember the full p - // best_t delta; // - NodeP dest; - bool is_final() const { return dest==0; } - best_t p_dest() const; - WordID w; // for root and and is_final(), this will be (negated) NTHandle. 
- - // for sorting most probable first in adj; actually >(p) - inline bool operator <(PrefixTrieEdge const& o) const { - return o.p"< BPs; - void back_vec(BPs &ns) const { - IF_PRINT_PREFIX(if(backp.from) { ns.push_back(backp); backp.from->back_vec(ns); }) - } - - BPs back_vec() const { - BPs ret; - back_vec(ret); - return ret; - } - - unsigned size() const { - unsigned a=adj.size(); - unsigned e=edge_for.size(); - return a>e?a:e; - } - - void print_back_str(std::ostream &o) const { - BPs back=back_vec(); - unsigned i=back.size(); - if (!i) { - o<<"PrefixTrieNode@"<<(uintptr_t)this; - return; - } - bool first=true; - while (i--<=0) { - if (!first) o<<','; - first=false; - WordID w=back[i].w; - print_cfg_rhs(o,w); - } - } - std::string back_str() const { - std::ostringstream o; - print_back_str(o); - return o.str(); - } - -// best_t p_final; // additional prob beyond what we already paid. while building, this is the total prob -// instead of storing final, we'll say that an edge with a NULL dest is a final edge. this way it gets sorted into the list of adj. - - // instead of completed map, we have trie start w/ lhs. - NTHandle lhs; // nonneg. - instead of storing this in Item. - IF_PRINT_PREFIX(BP backp;) - - enum { ROOT=-1 }; - explicit PrefixTrieNode(NTHandle lhs=ROOT,best_t p=1) : p(p),lhs(lhs),IF_PRINT_PREFIX(backp()) { - //final=false; - } - bool is_root() const { return lhs==ROOT; } // means adj are the nonneg lhs indices, and we have the index edge_for still available - - // outgoing edges will be ordered highest p to worst p - - typedef FSA_MAP(WordID,PrefixTrieEdge) PrefixTrieEdgeFor; -public: - PrefixTrieEdgeFor edge_for; //TODO: move builder elsewhere? then need 2nd hash or edge include pointer to builder. just clear this later - bool have_adj() const { - return adj.size()>=edge_for.size(); - } - bool no_adj() const { - return adj.empty(); - } - - void index_adj() { - index_adj(edge_for); - } - template - void index_adj(M &m) { - assert(have_adj()); - m.clear(); - for (int i=0;i - void index_lhs(PV &v) { - for (int i=0,e=adj.size();i!=e;++i) { - PrefixTrieEdge const& edge=adj[i]; - // assert(edge.p.is_1()); // actually, after done_building, e will have telescoped dest->p/p. - NTHandle n=-edge.w; - assert(n>=0); -// SHOWM3(DPFSA,"index_lhs",i,edge,n); - v[n]=edge.dest; - } - } - - template - void done_root(PV &v) { - assert(is_root()); - SHOWM1(DBUILDTRIE,"done_root",OSTRF1(print_map_by_nt,edge_for)); - done_building_r(); //sets adj - SHOWM1(DBUILDTRIE,"done_root",OSTRF1(print_by_nt,adj)); -// SHOWM1(DBUILDTRIE,done_root,adj); -// index_adj(); // we want an index for the root node?. don't think so - index_lhs handles it. also we stopped clearing edge_for. - index_lhs(v); // uses adj - } - - // call only once. - void done_building_r() { - done_building(); - for (int i=0;idone_building_r(); - } - - // for done_building; compute incremental (telescoped) edge p - PrefixTrieEdge /*const&*/ operator()(PrefixTrieEdgeFor::value_type & pair) const { - PrefixTrieEdge &e=pair.second;//const_cast(pair.second); - e.p=e.p_dest()/p; - return e; - } - - // call only once. 
- void done_building() { - SHOWM3(DBUILDTRIE,"done_building",edge_for.size(),adj.size(),1); -#if 1 - adj.reinit_map(edge_for,*this); -#else - adj.reinit(edge_for.size()); - SHOWM3(DBUILDTRIE,"done_building_reinit",edge_for.size(),adj.size(),2); - Adj::iterator o=adj.begin(); - for (PrefixTrieEdgeFor::iterator i=edge_for.begin(),e=edge_for.end();i!=e;++i) { - SHOWM3(DBUILDTRIE,"edge_for",o-adj.begin(),i->first,i->second); - PrefixTrieEdge &edge=i->second; - edge.p=(edge.dest->p)/p; - *o++=edge; -// (*this)(*i); - } -#endif - SHOWM1(DBUILDTRIE,"done building adj",prange(adj.begin(),adj.end(),true)); - assert(adj.size()==edge_for.size()); -// if (final) p_final/=p; - std::sort(adj.begin(),adj.end()); - //TODO: store adjacent differences on edges (compared to - } - - typedef ValueArray Adj; -// typedef vector Adj; - Adj adj; - - typedef WordID W; - - // let's compute p_min so that every rule reachable from the created node has p at least this low. - NodeP improve_edge(PrefixTrieEdge const& e,best_t rulep) { - NodeP d=e.dest; - maybe_improve(d->p,rulep); - return d; - } - - inline NodeP build(W w,best_t rulep) { - return build(lhs,w,rulep); - } - inline NodeP build_lhs(NTHandle n,best_t rulep) { - return build(n,-n,rulep); - } - - NodeP build(NTHandle lhs_,W w,best_t rulep) { - PrefixTrieEdgeFor::iterator i=edge_for.find(w); - if (i!=edge_for.end()) - return improve_edge(i->second,rulep); - NodeP r=new PrefixTrieNode(lhs_,rulep); - IF_PRINT_PREFIX(r->backp=BP(w,this)); -// edge_for.insert(i,PrefixTrieEdgeFor::value_type(w,PrefixTrieEdge(w,r))); - add(edge_for,w,PrefixTrieEdge(w,r)); - SHOWM4(DBUILDTRIE,"built node",this,w,*r,r); - return r; - } - - void set_final(NTHandle lhs_,best_t pf) { - assert(no_adj()); -// final=true; - PrefixTrieEdge &e=edge_for[null_wordid]; - e.p=pf; - e.dest=0; - e.w=lhs_; - maybe_improve(p,pf); - } - -private: - void destroy_children() { - assert(adj.size()>=edge_for.size()); - for (int i=0,e=adj.size();i" << p; - o << ',' << size() << ','; - print_back_str(o); - } - PRINT_SELF(PrefixTrieNode) -}; - -inline best_t PrefixTrieEdge::p_dest() const { - return dest ? dest->p : p; // for final edge, p was set (no sentinel node) -} - - -//Trie starts with lhs (nonneg index), then continues w/ rhs (mixed >0 word, else NT) -// trie ends with final edge, which points to a per-lhs prefix node -struct PrefixTrie { - void print(std::ostream &o) const { - o << cfgp << ' ' << root; - } - PRINT_SELF(PrefixTrie); - CFG *cfgp; - Rules const* rulesp; - Rules const& rules() const { return *rulesp; } - CFG const& cfg() const { return *cfgp; } - PrefixTrieNode root; - typedef std::vector LhsToTrie; // will have to check lhs2[lhs].p for best cost of some rule with that lhs, then use edge deltas after? they're just caching a very cheap computation, really - LhsToTrie lhs2; // no reason to use a map or hash table; every NT in the CFG will have some rule rhses. lhs_to_trie[i]=root.edge_for[i], i.e. we still have a root trie node conceptually, we just access through this since it's faster. - typedef LhsToTrie LhsToComplete; - LhsToComplete lhs2complete; // the sentinel "we're completing" node (dot at end) for that lhs. 
special case of suffix-set=same trie minimization (aka right branching binarization) // these will be used to track kbest completions, along with a l state (r state will be in the list) - PrefixTrie(CFG &cfg) : cfgp(&cfg),rulesp(&cfg.rules),lhs2(cfg.nts.size(),0),lhs2complete(cfg.nts.size()) { -// cfg.SortLocalBestFirst(); // instead we'll sort in done_building_r - print_cfg=cfgp; - SHOWM2(DBUILDTRIE,"PrefixTrie()",rulesp->size(),lhs2.size()); - cfg.VisitRuleIds(*this); - root.done_root(lhs2); - SHOWM3(DBUILDTRIE,"done w/ PrefixTrie: ",root,root.adj.size(),lhs2.size()); - DBUILDTRIE(print_by_nt(cerr,lhs2,cfgp)); - SHOWM1(DBUILDTRIE,"lhs2",OSTRF2(print_by_nt,lhs2,cfgp)); - } - - void operator()(int ri) { - Rule const& r=rules()[ri]; - NTHandle lhs=r.lhs; - best_t p=r.p; -// NodeP n=const_cast(root).build_lhs(lhs,p); - NodeP n=root.build_lhs(lhs,p); - SHOWM4(DBUILDTRIE,"Prefixtrie rule id, root",ri,root,p,*n); - for (RHS::const_iterator i=r.rhs.begin(),e=r.rhs.end();;++i) { - SHOWM2(DBUILDTRIE,"PrefixTrie build or final",i-r.rhs.begin(),*n); - if (i==e) { - n->set_final(lhs,p); - break; - } - n=n->build(*i,p); - SHOWM2(DBUILDTRIE,"PrefixTrie built",*i,*n); - } -// root.build(lhs,r.p)->build(r.rhs,r.p); - } - inline NodeP lhs2_ex(NTHandle n) const { - NodeP r=lhs2[n]; - if (!r) throw std::runtime_error("PrefixTrie: no CFG rule w/ lhs "+cfgp->nt_name(n)); - return r; - } -private: - PrefixTrie(PrefixTrie const& o); -}; - - - -typedef std::size_t ItemHash; - - -struct ItemKey { - explicit ItemKey(NodeP start,Bytes const& start_state) : dot(start),q(start_state),r(start_state) { } - explicit ItemKey(NodeP dot) : dot(dot) { } - NodeP dot; // dot is a function of the stuff already recognized, and gives a set of suffixes y to complete to finish a rhs for lhs() -> dot y. for a lhs A -> . *, this will point to lh2[A] - Bytes q,r; // (q->r are the fsa states; if r is empty it means - bool operator==(ItemKey const& o) const { - return dot==o.dot && q==o.q && r==o.r; - } - inline ItemHash hash() const { - ItemHash h=GOLDEN_MEAN_FRACTION*(ItemHash)(dot-NULL); // i.e. lower order bits of ptr are nonrandom - using namespace boost; - hash_combine(h,q); - hash_combine(h,r); - return h; - } - template - void print(O &o) const { - o<<"lhs="<print_back_str(o); - if (print_fsa) { - o<<'/'; - print_fsa->print_state(o,&q[0]); - o<<"->"; - print_fsa->print_state(o,&r[0]); - } - } - NTHandle lhs() const { return dot->lhs; } - PRINT_SELF(ItemKey) -}; -inline ItemHash hash_value(ItemKey const& x) { - return x.hash(); -} -ItemKey null_item((PrefixTrieNode*)0); - -struct Item; -typedef Item *ItemP; - -/* we use a single type of item so it can live in a single best-first queue. we hold them by pointer so they can have mutable state, e.g. priority/location, but also lists of predictions and kbest completions (i.e. completions[L,r] = L -> * (r,s), by 1best for each possible s. we may discover more s later. we could use different subtypes since we hold by pointer, but for now everything will be packed as variants of Item */ -#undef INIT_LOCATION -#if D_ARY_TRACK_OUT_OF_HEAP -# define INIT_LOCATION , location(D_ARY_HEAP_NULL_INDEX) -#elif !defined(NDEBUG) || SAFE_VALGRIND - // avoid spurious valgrind warning - FIXME: still complains??? 
-# define INIT_LOCATION , location() -#else -# define INIT_LOCATION -#endif - -// these should go in a global best-first queue -struct ItemPrio { - // NOTE: sum = viterbi (max) - ItemPrio() : priority(init_0()),inner(init_0()) { } - explicit ItemPrio(best_t priority) : priority(priority),inner(init_0()) { } - best_t priority; // includes inner prob. (forward) - /* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all - constrained paths of length i that end in state X[k]->x.y*/ - best_t inner; - /* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all - paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */ - template - void print(O &o) const { - o<=0; - } - explicit Item(FFState const& state,NodeP dot,best_t prio,int next=0) : ItemPrio(prio),ItemKey(dot,state),trienext(next),from(0) - INIT_LOCATION - { -// t=ADJ; -// if (dot->adj.size()) - dot->p_delta(next,priority); -// SHOWM1(DFSA,"Item(state,dot,prio)",prio); - } - typedef std::queue Predicted; -// Predicted predicted; // this is empty, unless this is a predicted L -> .asdf item, or a to-complete L -> asdf . - int trienext; // index of dot->adj to complete (if dest==0), or predict (if NT), or scan (if word). note: we could store pointer inside adj since it and trie are @ fixed addrs. less pointer arith, more space. - ItemP from; //backpointer - 0 for L -> . asdf for the rest; L -> a .sdf, it's the L -> .asdf item. - ItemP predicted_from() const { - ItemP p=(ItemP)this; - while(p->from) p=p->from; - return p; - } - template - void print(O &o) const { - o<< '['; - o< -struct ApplyFsa { - ApplyFsa(HgCFG &i, - const SentenceMetadata& smeta, - const FsaFeatureFunction& fsa, - DenseWeightVector const& weights, - ApplyFsaBy const& by, - Hypergraph* oh - ) - :hgcfg(i),smeta(smeta),fsa(fsa),weights(weights),by(by),oh(oh) - { - stateless=!fsa.state_bytes(); - } - void Compute() { - if (by.IsBottomUp() || stateless) - ApplyBottomUp(); - else - ApplyEarley(); - } - void ApplyBottomUp(); - void ApplyEarley(); - CFG const& GetCFG(); -private: - CFG cfg; - HgCFG &hgcfg; - SentenceMetadata const& smeta; - FsaFF const& fsa; -// WeightVector weight_vector; - DenseWeightVector weights; - ApplyFsaBy by; - Hypergraph* oh; - std::string cfg_out; - bool stateless; -}; - -template -void ApplyFsa::ApplyBottomUp() -{ - assert(by.IsBottomUp()); - FeatureFunctionFromFsa buff(&fsa); - buff.Init(); // mandatory to call this (normally factory would do it) - vector ffs(1,&buff); - ModelSet models(weights, ffs); - IntersectionConfiguration i(stateless ? BU_FULL : by.BottomUpAlgorithm(),by.pop_limit); - ApplyModelSet(hgcfg.ih,smeta,models,i,oh); -} - -template -void ApplyFsa::ApplyEarley() -{ - hgcfg.GiveCFG(cfg); - print_cfg=&cfg; - print_fsa=&fsa; - Chart chart(cfg,smeta,fsa); - // don't need to uniq - option to do that already exists in cfg_options - //TODO: - chart.best_first(); - *oh=hgcfg.ih; -} - - -void ApplyFsaModels(HgCFG &i, - const SentenceMetadata& smeta, - const FsaFeatureFunction& fsa, - DenseWeightVector const& weight_vector, - ApplyFsaBy const& by, - Hypergraph* oh) -{ - ApplyFsa a(i,smeta,fsa,weight_vector,by,oh); - a.Compute(); -} - -/* -namespace { -char const* anames[]={ - "BU_CUBE", - "BU_FULL", - "EARLEY", - 0 -}; -} -*/ - -//TODO: named enum type in boost? 
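// Illustrative aside (not from the original source): the agenda discipline the
// Item/ItemPrio definitions above imply -- items are popped best-first by
// priority (the forward prob, which already folds in the inner prob). A
// hypothetical reduction to std::priority_queue, assuming best_t orders with <:
#include <queue>
#include <vector>
struct SketchBetter {
  bool operator()(Item const* a, Item const* b) const {
    return a->priority < b->priority;  // max-heap: most probable item popped first
  }
};
typedef std::priority_queue<Item*, std::vector<Item*>, SketchBetter> SketchAgenda;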
- -std::string ApplyFsaBy::name() const { -// return anames[algorithm]; - return GetName(algorithm); -} - -std::string ApplyFsaBy::all_names() { - return FsaByNames(" "); - /* - std::ostringstream o; - for (int i=0;i=N_ALGORITHMS) - throw std::runtime_error("Unknown ApplyFsaBy type id: "+itos(i)+" - legal types: "+all_names()); -*/ - GetName(i); // checks validity - algorithm=i; -} - -int ApplyFsaBy::BottomUpAlgorithm() const { - assert(IsBottomUp()); - return algorithm==BU_CUBE ? - IntersectionConfiguration::CUBE - :IntersectionConfiguration::FULL; -} - -void ApplyFsaModels(Hypergraph const& ih, - const SentenceMetadata& smeta, - const FsaFeatureFunction& fsa, - DenseWeightVector const& weights, // pre: in is weighted by these (except with fsa featval=0 before this) - ApplyFsaBy const& cfg, - Hypergraph* out) -{ - HgCFG i(ih); - ApplyFsaModels(i,smeta,fsa,weights,cfg,out); -} diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 69f40c93..4ce5749e 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -12,8 +12,6 @@ #include "ff_rules.h" #include "ff_ruleshape.h" #include "ff_bleu.h" -#include "ff_lm_fsa.h" -#include "ff_sample_fsa.h" #include "ff_source_syntax.h" #include "ff_register.h" #include "ff_charset.h" @@ -31,15 +29,6 @@ void register_feature_functions() { } registered = true; - //TODO: these are worthless example target FSA ffs. remove later - RegisterFsaImpl(true); - RegisterFsaImpl(true); - RegisterFsaImpl(true); -// ff_registry.Register("LanguageModelFsaDynamic",new FFFactory > >); // to test correctness of FsaFeatureFunctionDynamic erasure - RegisterFsaDynToFF(); - RegisterFsaImpl(true); // same as LM but using fsa wrapper - RegisterFsaDynToFF(); - RegisterFF(); RegisterFF(); @@ -47,8 +36,6 @@ void register_feature_functions() { RegisterFF(); RegisterFF(); - ff_registry.Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa - //TODO: use for all features the new Register which requires static FF::usage(false,false) give name #ifdef HAVE_RANDLM ff_registry.Register("RandLM", new FFFactory); diff --git a/decoder/feature_accum.h b/decoder/feature_accum.h deleted file mode 100755 index 4b8338eb..00000000 --- a/decoder/feature_accum.h +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef FEATURE_ACCUM_H -#define FEATURE_ACCUM_H - -#include "ff.h" -#include "sparse_vector.h" -#include "value_array.h" - -struct SparseFeatureAccumulator : public FeatureVector { - typedef FeatureVector State; - SparseFeatureAccumulator() { assert(!"this code is disabled"); } - template - FeatureVector const& describe(FF const& ) { return *this; } - void Store(FeatureVector *fv) const { -//NO fv->set_from(*this); - } - template - void Store(FF const& /* ff */,FeatureVector *fv) const { -//NO fv->set_from(*this); - } - template - void Add(FF const& /* ff */,FeatureVector const& fv) { - (*this)+=fv; - } - void Add(FeatureVector const& fv) { - (*this)+=fv; - } - /* - SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {} - FeatureAccumulator(Features const& fids) {} - FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {} - void Add(Features const& fids,FeatureVector const& fv) { - *this += fv; - } - */ - void Add(int i,Featval v) { -//NO (*this)[i]+=v; - } - void Add(Features const& fids,int i,Featval v) { -//NO (*this)[i]+=v; - } -}; - -struct SingleFeatureAccumulator { - typedef Featval State; - typedef SingleFeatureAccumulator Self; - State v; - /* - void operator +=(State const& o) { - v+=o; - } - */ - void operator +=(Self const& s) { - 
v+=s.v; - } - SingleFeatureAccumulator() : v() {} - template - State const& describe(FF const& ) const { return v; } - - template - void Store(FF const& ff,FeatureVector *fv) const { - fv->set_value(ff.fid_,v); - } - void Store(Features const& fids,FeatureVector *fv) const { - assert(fids.size()==1); - fv->set_value(fids[0],v); - } - /* - SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); } - SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv) - { - assert(fids.size()==1); - v=fv.get_singleton(); - } - */ - - template - void Add(FF const& ff,FeatureVector const& fv) { - v+=fv.get(ff.fid_); - } - void Add(FeatureVector const& fv) { - v+=fv.get_singleton(); - } - - void Add(Features const& fids,FeatureVector const& fv) { - v += fv.get(fids[0]); - } - void Add(Featval dv) { - v+=dv; - } - void Add(int,Featval dv) { - v+=dv; - } - void Add(FeatureVector const& fids,int i,Featval dv) { - assert(fids.size()==1 && i==0); - v+=dv; - } -}; - - -#if 0 -// omitting this so we can default construct an accum. might be worth resurrecting in the future -struct ArrayFeatureAccumulator : public ValueArray { - typedef ValueArray State; - template - ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { } - ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } - ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } - ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) { - for (int i=0,e=iset_value(fids[i],(*this)[i]); - } - void Add(Features const& fids,FeatureVector const& fv) { - for (int i=0,e=i -#include "ff_fsa_dynamic.h" - class FeatureFunction; class FsaFeatureFunction; diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h deleted file mode 100755 index f8d79e03..00000000 --- a/decoder/ff_from_fsa.h +++ /dev/null @@ -1,304 +0,0 @@ -#ifndef FF_FROM_FSA_H -#define FF_FROM_FSA_H - -#include "ff_fsa.h" - -#ifndef TD__none -// replacing dependency on SRILM -#define TD__none -1 -#endif - -#ifndef FSA_FF_DEBUG -# define FSA_FF_DEBUG 0 -#endif -#if FSA_FF_DEBUG -# define FSAFFDBG(e,x) FSADBGif(debug(),e,x) -# define FSAFFDBGnl(e) FSADBGif_nl(debug(),e) -#else -# define FSAFFDBG(e,x) -# define FSAFFDBGnl(e) -#endif - -/* regular bottom up scorer from Fsa feature - uses guarantee about markov order=N to score ASAP - encoding of state: if less than N-1 (ctxlen) words - - usage: - typedef FeatureFunctionFromFsa LanguageModelFromFsa; -*/ - -template -class FeatureFunctionFromFsa : public FeatureFunction { - typedef void const* SP; - typedef WordID *W; - typedef WordID const* WP; -public: - template - FeatureFunctionFromFsa(I const& param) : ff(param) { - debug_=true; // because factory won't set until after we construct. - } - template - FeatureFunctionFromFsa(I & param) : ff(param) { - debug_=true; // because factory won't set until after we construct. 
- } - - static std::string usage(bool args,bool verbose) { - return Impl::usage(args,verbose); - } - void init_name_debug(std::string const& n,bool debug) { - FeatureFunction::init_name_debug(n,debug); - ff.init_name_debug(n,debug); - } - - // this should override - Features features() const { - DBGINIT("FeatureFunctionFromFsa features() name="<=1) - for (int j=0,ee=e.size();;++j) { // items in target side of rule - for(;;++j) { - if (j>=ee) goto rhs_done; // j may go 1 past ee due to k possibly getting to end - if (RHS_WORD(j)) break; - } - // word @j - int k=j; - while(k{"<") - FSAFFDBG(edge," end="<{"< -# define FSADBG(e,x) FSADBGif(d().debug(),e,x) -# define FSADBGnl(e) FSADBGif_nl(d().debug(),e,x) -#else -# define FSADBG(e,x) -# define FSADBGnl(e) -#endif - -#include "fast_lexical_cast.hpp" -#include -#include -#include "ff.h" -#include "sparse_vector.h" -#include "tdict.h" -#include "hg.h" -#include "ff_fsa_data.h" - -/* -usage: see ff_sample_fsa.h or ff_lm_fsa.h - - then, to decode, see ff_from_fsa.h (or TODO: left->right target-earley style rescoring) - - */ - - -template -struct FsaFeatureFunctionBase : public FsaFeatureFunctionData { - Impl const& d() const { return static_cast(*this); } - Impl & d() { return static_cast(*this); } - - // this will get called by factory - override if you have multiple or dynamically named features. note: may be called repeatedly - void Init() { - Init(name()); - DBGINIT("base (single feature) FsaFeatureFunctionBase::Init name="<set_value(fid,val) possibly with duplicates. state and next_state will never be the same memory. - //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support. however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping) - -protected: - // overrides have different name because of inheritance method hiding; - - // simple/common case; 1 fid. these need not be overriden if you have multiple feature ids - Featval Scan1(WordID w,void const* state,void *next_state) const { - assert(0); - return 0; - } - Featval Scan1Meta(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */, - WordID w,void const* state,void *next_state) const { - return d().Scan1(w,state,next_state); - } -public: - - // must override this or Scan1Meta or Scan1 - template - inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID w,void const* state,void *next_state,Accum *a) const { - Add(d().Scan1Meta(smeta,edge,w,state,next_state),a); - } - - // bounce back and forth between two state vars starting at cs, returning end state location. if we required src=dest addr safe state updating, this concept wouldn't need to exist. - // required that you override this if you score phrases differently than word-by-word, however, you can just use the SCAN_PHRASE_ACCUM_OVERRIDE macro to do that in terms of ScanPhraseAccum - template - void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { - // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. 
- if (!ssz) { - for (;io - odd: - d().ScanAccum(smeta,edge,i[0],os,es,accum); // o->e - } - return es; - } - - - static const bool simple_phrase_score=true; // if d().simple_phrase_score_, then you should expect different Phrase scores for phrase length > M. so, set this false if you provide ScanPhraseAccum (SCAN_PHRASE_ACCUM_OVERRIDE macro does this) - - // override this (and use SCAN_PHRASE_ACCUM_OVERRIDE ) if you want e.g. maximum possible order ngram scores with markov_order < n-1. in the future SparseFeatureAccumulator will probably be the only option for type-erased FSA ffs. - // note you'll still have to override ScanAccum - template - void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge, - WordID const* i, WordID const* end, - void const* state,void *next_state,Accum *accum) const { - if (!ssz) { - for (;i \ - void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { \ - ScanPhraseAccum(smeta,edge,i,end,cs,ns,accum); \ - return ns; \ - } \ - template \ - void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, \ - WordID const* i, WordID const* end, \ - void const* state,Accum *accum) const { \ - char s2[ssz]; ScanPhraseAccum(smeta,edge,i,end,state,(void*)s2,accum); \ - } - - // override this or bounce along with above. note: you can just call ScanPhraseAccum - // doesn't set state (for heuristic in ff_from_fsa) - template - void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID const* i, WordID const* end, - void const* state,Accum *accum) const { - char s1[ssz]; - char s2[ssz]; - state_copy(s1,state); - d().ScanPhraseAccumBounce(smeta,edge,i,end,(void*)s1,(void*)s2,accum); - } - - // for single-feat only. but will work for different accums - template - inline void Add(Featval v,Accum *a) const { - a->Add(fid_,v); - } - inline void set_feat(FeatureVector *features,Featval v) const { - features->set_value(fid_,v); - } - - // don't set state-bytes etc. in ctor because it may depend on parsing param string - FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) - : FsaFeatureFunctionData(statesz,end_sentence_phrase) - { - name_=name(); // should allow FsaDynamic wrapper to get name copied to it with sync - } - -}; - -template -struct MultipleFeatureFsa : public FsaFeatureFunctionBase { - typedef SparseFeatureAccumulator Accum; -}; - - - - -// if State is pod. sets state size and allocs start, h_start -// usage: -// struct ShorterThanPrev : public FsaTypedBase -// i.e. 
Impl is a CRTP -template -struct FsaTypedBase : public FsaFeatureFunctionBase { - Impl const& d() const { return static_cast(*this); } - Impl & d() { return static_cast(*this); } -protected: - typedef FsaFeatureFunctionBase Base; - typedef St State; - static inline State & state(void *state) { - return *(State*)state; - } - static inline State const& state(void const* state) { - return *(State const*)state; - } - void set_starts(State const& s,State const& heuristic_s) { - if (0) { // already in ctor - Base::start.resize(sizeof(State)); - Base::h_start.resize(sizeof(State)); - } - assert(Base::start.size()==sizeof(State)); - assert(Base::h_start.size()==sizeof(State)); - state(Base::start.begin())=s; - state(Base::h_start.begin())=heuristic_s; - } - FsaTypedBase(St const& start_st=St() - ,St const& h_start_st=St() - ,Sentence const& end_sentence_phrase=Sentence()) - : Base(sizeof(State),end_sentence_phrase) { - set_starts(start_st,h_start_st); - } -public: - void print_state(std::ostream &o,void const*st) const { - o< - inline void ScanT(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,St const& prev_st,St &new_st,Accum *a) const { - Add(d().ScanT1(smeta,edge,w,prev_st,new_st),a); - } - - // note: you're on your own when it comes to Phrase overrides. see FsaFeatureFunctionBase. sorry. - - template - inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,void const* st,void *next_state,Accum *a) const { - Impl const& im=d(); - FSADBG(edge,"Scan "<describe(im)<<" "<"< -struct FsaScanner { -// enum {ALIGN=8}; - static const int ALIGN=8; - FF const& ff; - SentenceMetadata const& smeta; - int ssz; - Bytes states; // first is at begin, second is at (char*)begin+stride - void *st0; // states - void *st1; // states+stride - void *cs; // initially st0, alternates between st0 and st1 - inline void *nexts() const { - return (cs==st0)?st1:st0; - } - Hypergraph::Edge const& edge; - FsaScanner(FF const& ff,SentenceMetadata const& smeta,Hypergraph::Edge const& edge) : ff(ff),smeta(smeta),edge(edge) - { - ssz=ff.state_bytes(); - int stride=((ssz+ALIGN-1)/ALIGN)*ALIGN; // round up to multiple of ALIGN - states.resize(stride+ssz); - st0=states.begin(); - st1=(char*)st0+stride; -// for (int i=0;i<2;++i) st[i]=cs+(i*stride); - } - void reset(void const* state) { - cs=st0; - std::memcpy(st0,state,ssz); - } - template - void scan(WordID w,Accum *a) { - void *ns=nexts(); - ff.ScanAccum(smeta,edge,w,cs,ns,a); - cs=ns; - } - template - void scan(WordID const* i,WordID const* end,Accum *a) { - // faster. and allows greater-order excursions - cs=ff.ScanPhraseAccumBounce(smeta,edge,i,end,cs,nexts(),a); - } -}; - - -//TODO: combine 2 FsaFeatures typelist style (can recurse for more) - - - - -#endif diff --git a/decoder/ff_fsa_data.h b/decoder/ff_fsa_data.h deleted file mode 100755 index d215e940..00000000 --- a/decoder/ff_fsa_data.h +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef FF_FSA_DATA_H -#define FF_FSA_DATA_H - -#include //C99 -#include -#include "sentences.h" -#include "feature_accum.h" -#include "value_array.h" -#include "ff.h" //debug -typedef ValueArray Bytes; - -// stuff I see no reason to have virtual. but because it's impossible (w/o virtual inheritance to have dynamic fsa ff know where the impl's data starts, implemented a sync (copy) method that needs to be called. init_name_debug was already necessary to keep state in sync between ff and ff_from_fsa, so no sync should be needed after it. 
supposing all modifications were through setters, then no explicit sync call would ever be needed; updates could be mirrored. -struct FsaFeatureFunctionData -{ - void init_name_debug(std::string const& n,bool debug) { - name_=n; - debug_=debug; - } - //HACK for diamond inheritance (w/o costing performance) - FsaFeatureFunctionData *sync_to_; - - void sync() const { // call this if you modify any fields after your constructor is done - if (sync_to_) { - DBGINIT("sync to "<<*sync_to_); - *sync_to_=*this; - DBGINIT("synced result="<<*sync_to_<< " from this="<<*this); - } else { - DBGINIT("nobody to sync to - from FeatureFunctionData this="<<*this); - } - } - - friend std::ostream &operator<<(std::ostream &o,FsaFeatureFunctionData const& d) { - o << "[FSA "< - static inline T* state_as(void *p) { return (T*)p; } - template - static inline T const* state_as(void const* p) { return (T*)p; } - std::string describe_features(FeatureVector const& feats) { - std::ostringstream o; - o<" for lm. -protected: - int ssz; // don't forget to set this. default 0 (it may depend on params of course) - // this can be called instead or after constructor (also set bytes and end_phrase_) - void set_state_bytes(int sb=0) { - if (start.size()!=sb) start.resize(sb); - if (h_start.size()!=sb) h_start.resize(sb); - ssz=sb; - } - void set_end_phrase(WordID single) { - end_phrase_=singleton_sentence(single); - } - - inline void static to_state(void *state,char const* begin,char const* end) { - std::memcpy(state,begin,end-begin); - } - inline void static to_state(void *state,char const* begin,int n) { - std::memcpy(state,begin,n); - } - template - inline void static to_state(void *state,T const* begin,int n=1) { - to_state(state,(char const*)begin,n*sizeof(T)); - } - template - inline void static to_state(void *state,T const* begin,T const* end) { - to_state(state,(char const*)begin,(char const*)end); - } - inline static char hexdigit(int i) { - int j=i-10; - return j>=0?'a'+j:'0'+i; - } - inline static void print_hex_byte(std::ostream &o,unsigned c) { - o<>4); - o<Add(v); - } - -}; - -#endif diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h deleted file mode 100755 index 6f75bbe5..00000000 --- a/decoder/ff_fsa_dynamic.h +++ /dev/null @@ -1,208 +0,0 @@ -#ifndef FF_FSA_DYNAMIC_H -#define FF_FSA_DYNAMIC_H - -struct SentenceMetadata; - -#include "ff_fsa_data.h" -#include "hg.h" // can't forward declare nested Hypergraph::Edge class -#include - -// the type-erased interface - -//FIXME: diamond inheritance problem. make a copy of the fixed data? 
-// or else make the dynamic version not wrap but rather be templated CRTP base (yuck)
-struct FsaFeatureFunction : public FsaFeatureFunctionData {
-  static const bool simple_phrase_score=false;
-  virtual int markov_order() const = 0;
-
-  // see ff_fsa.h - FsaFeatureFunctionBase gives you reasonable impls of these if you override just ScanAccum
-  virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                         WordID w,void const* state,void *next_state,Accum *a) const = 0;
-  virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                               WordID const* i, WordID const* end,
-                               void const* state,void *next_state,Accum *accum) const = 0;
-  virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                                   WordID const* i, WordID const* end,
-                                   void const* state,Accum *accum) const = 0;
-  virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const = 0;
-
-  virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { return 0; }
-  // called after constructor, before use
-  virtual void Init() = 0;
-  virtual std::string usage_v(bool param,bool verbose) const {
-    return FeatureFunction::usage_helper("unnamed_dynamic_fsa_feature","","",param,verbose);
-  }
-  virtual void init_name_debug(std::string const& n,bool debug) {
-    FsaFeatureFunctionData::init_name_debug(n,debug);
-  }
-
-  virtual void print_state(std::ostream &o,void const*state) const {
-    FsaFeatureFunctionData::print_state(o,state);
-  }
-  virtual std::string describe() const { return "[FSA unnamed_dynamic_fsa_feature]"; }
-
-  //end_phrase()
-  virtual ~FsaFeatureFunction() {}
-
-  // no need to override:
-  std::string describe_state(void const* state) const {
-    std::ostringstream o;
-    print_state(o,state);
-    return o.str();
-  }
-};
-
-// conforming to above interface, type erases FsaImpl
-// you might be wondering: why do this?
-// answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch
-// usage: typedef FsaFeatureFunctionDynamic MyFsaDyn;
-template <class Impl>
-struct FsaFeatureFunctionDynamic : public FsaFeatureFunction {
-  static const bool simple_phrase_score=Impl::simple_phrase_score;
-  Impl& d() { return impl;//static_cast<Impl&>(*this);
-  }
-  Impl const& d() const { return impl;
-    //static_cast<Impl const&>(*this);
-  }
-  int markov_order() const { return d().markov_order(); }
-
-  std::string describe() const {
-    return d().describe();
-  }
-
-  virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                         WordID w,void const* state,void *next_state,Accum *a) const {
-    return d().ScanAccum(smeta,edge,w,state,next_state,a);
-  }
-
-  virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                               WordID const* i, WordID const* end,
-                               void const* state,void *next_state,Accum *a) const {
-    return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a);
-  }
-
-  virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                                   WordID const* i, WordID const* end,
-                                   void const* state,Accum *a) const {
-    return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a);
-  }
-
-  virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const {
-    return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a);
-  }
-
-  virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const {
-    return d().early_score_words(smeta,edge,i,end,accum);
-  }
-
-  static std::string usage(bool param,bool verbose) {
-    return Impl::usage(param,verbose);
-  }
-
-  std::string usage_v(bool param,bool verbose) const {
-    return Impl::usage(param,verbose);
-  }
-
-  virtual void print_state(std::ostream &o,void const*state) const {
-    return d().print_state(o,state);
-  }
-
-  void init_name_debug(std::string const& n,bool debug) {
-    FsaFeatureFunction::init_name_debug(n,debug);
-    d().init_name_debug(n,debug);
-  }
-
-  virtual void Init() {
-    d().sync_to_=(FsaFeatureFunctionData*)this;
-    d().Init();
-    d().sync();
-  }
-
-  template <class I>
-  FsaFeatureFunctionDynamic(I const& param) : impl(param) {
-    Init();
-  }
-private:
-  Impl impl;
-};
-
-// constructor takes ptr or shared_ptr to Impl, otherwise same as above - note: not virtual
-template <class Impl>
-struct FsaFeatureFunctionPimpl : public FsaFeatureFunctionData {
-  typedef boost::shared_ptr<Impl const> Pimpl;
-  static const bool simple_phrase_score=Impl::simple_phrase_score;
-  Impl const& d() const { return *p_; }
-  int markov_order() const { return d().markov_order(); }
-
-  std::string describe() const {
-    return d().describe();
-  }
-
-  void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                 WordID w,void const* state,void *next_state,Accum *a) const {
-    return d().ScanAccum(smeta,edge,w,state,next_state,a);
-  }
-
-  void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                       WordID const* i, WordID const* end,
-                       void const* state,void *next_state,Accum *a) const {
-    return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a);
-  }
-
-  void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                           WordID const* i, WordID const* end,
-                           void const* state,Accum *a) const {
-    return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a);
-  }
-
-  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const {
-    return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a);
-  }
-
-  int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const {
-    return d().early_score_words(smeta,edge,i,end,accum);
-  }
-
-  static std::string usage(bool param,bool verbose) {
-    return Impl::usage(param,verbose);
-  }
-
-  std::string usage_v(bool param,bool verbose) const {
-    return Impl::usage(param,verbose);
-  }
-
-  void print_state(std::ostream &o,void const*state) const {
-    return d().print_state(o,state);
-  }
-
-#if 0
-  // this and Init() don't touch p_ because we want to leave the original alone.
-  void init_name_debug(std::string const& n,bool debug) {
-    FsaFeatureFunctionData::init_name_debug(n,debug);
-  }
-#endif
-  void Init() {
-    p_=hold_pimpl_.get();
-#if 0
-    d().sync_to_=static_cast<FsaFeatureFunctionData*>(this);
-    d().Init();
-#endif
-    *static_cast<FsaFeatureFunctionData*>(this)=d();
-  }
-
-  FsaFeatureFunctionPimpl(Impl const* const p) : hold_pimpl_(p,null_deleter()) {
-    Init();
-  }
-  FsaFeatureFunctionPimpl(Pimpl const& p) : hold_pimpl_(p) {
-    Init();
-  }
-private:
-  Impl const* p_;
-  Pimpl hold_pimpl_;
-};
-
-typedef FsaFeatureFunctionPimpl<FsaFeatureFunction> FsaFeatureFunctionFwd; // allow ff_from_fsa for an existing dynamic-type ff (as opposed to the usual registering of a wrapped known-type FSA in ff_register, which is more efficient)
-//typedef FsaFeatureFunctionDynamic DynamicFsaFeatureFunctionFwd; //if you really need to have a dynamic fsa facade that's also a dynamic fsa
-
-//TODO: combine 2 (or N) FsaFeatureFunction (type erased)
-
-
-#endif
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index afa36b96..5e16d4e3 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -46,7 +46,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 #endif
 
 #include "ff_lm.h"
-#include "ff_lm_fsa.h"
 
 #include
 #include
@@ -69,10 +68,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 
 using namespace std;
 
-string LanguageModelFsa::usage(bool param,bool verbose) {
-  return FeatureFunction::usage_helper("LanguageModelFsa",usage_short,usage_verbose,param,verbose);
-}
-
 string LanguageModel::usage(bool param,bool verbose) {
   return FeatureFunction::usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
 }
@@ -524,49 +519,6 @@ LanguageModel::LanguageModel(const string& param) {
   SetStateSize(LanguageModelImpl::OrderToStateSize(order));
 }
 
-//TODO: decide whether to waste a word of space so states are always none-terminated for SRILM.  otherwise we have to copy
-void LanguageModelFsa::set_ngram_order(int i) {
-  assert(i>0);
-  ngram_order_=i;
-  ctxlen_=i-1;
-  set_state_bytes(ctxlen_*sizeof(WordID));
-  WordID *ss=(WordID*)start.begin();
-  WordID *hs=(WordID*)h_start.begin();
-  if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
-    set_end_phrase(TD::Convert("</s>"));
-// se is pretty boring in unigram case, just adds constant prob.  check that this is what we want
-    ss[0]=TD::Convert("<s>"); // start-sentence context (length 1)
-    hs[0]=0; // empty context
-    for (int i=1;i<ctxlen_;++i)
-      ss[i]=hs[i]=0; // rest is empty
-  }
-}
-
-LanguageModelFsa::LanguageModelFsa(string const& param) {
-  int lmorder;
-  pimpl_ = make_lm_impl(param,&lmorder,&fid_);
-  Init();
-  floor_=pimpl_->floor_;
-  set_ngram_order(lmorder);
-}
-
-void LanguageModelFsa::print_state(ostream &o,void const* st) const {
-  WordID const *wst=(WordID const*)st;
-  o<<'[';
-  bool sp=false;
-  for (int i=ctxlen_;i>0;sp=true) {
-    --i;
-    WordID w=wst[i];
-    if (w==0) continue;
-    if (sp) o<<' ';
-    o << TD::Convert(w);
-  }
-  o<<']';
-}
-
 Features LanguageModel::features() const {
   return single_feature(fid_);
 }
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
deleted file mode 100755
index 85b7ef44..00000000
--- a/decoder/ff_lm_fsa.h
+++ /dev/null
@@ -1,140 +0,0 @@
-#ifndef FF_LM_FSA_H
-#define FF_LM_FSA_H
-
-//FIXME: when FSA_LM_PHRASE 1, 3gram fsa has differences, especially with unk words, in about the 4th decimal digit (about .05%), compared to regular ff_lm.  this is USUALLY a bug (there's way more actual precision in there).  this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that).  also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
-
-// enabling for now - retest unigram+ more, solve above puzzle
-
-// some impls in ff_lm.cc
-
-#define FSA_LM_PHRASE 1
-
-#define FSA_LM_DEBUG 0
-#if FSA_LM_DEBUG
-# define FSALMDBG(e,x) FSADBGif(debug(),e,x)
-# define FSALMDBGnl(e) FSADBGif_nl(debug(),e)
-#else
-# define FSALMDBG(e,x)
-# define FSALMDBGnl(e)
-#endif
-
-#include "ff_fsa.h"
-#include "ff_lm.h"
-
-#ifndef TD__none
-// replacing dependency on SRILM
-#define TD__none -1
-#endif
-
-namespace {
-WordID empty_context=TD__none;
-}
-
-struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
-  typedef WordID * W;
-  typedef WordID const* WP;
-
-  // overrides; implementations in ff_lm.cc
-  typedef SingleFeatureAccumulator Accum;
-  static std::string usage(bool,bool);
-  LanguageModelFsa(std::string const& param);
-  int markov_order() const { return ctxlen_; }
-  void print_state(std::ostream &,void const *) const;
-  inline Featval floored(Featval p) const {
-    return p<floor_?floor_:p;
-  }
-  static inline WordID const* left_end(WordID const* left,WordID const* e) {
-    for (;e>left;--e)
-      if (e[-1]!=TD__none) break;
-    //post: [left,e] are the seen left words
-    return e;
-  }
-
-  template <class Accum>
-  void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const {
-#if USE_INFO_EDGE
-    Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
-#endif
-    if (!ctxlen_) {
-      Add(floored(pimpl_->WordProb(w,&empty_context)),a);
-    } else {
-      WordID ctx[ngram_order_]; //alloca if you don't have C99
-      state_copy(ctx,old_st);
-      ctx[ctxlen_]=TD__none;
-      Featval p=floored(pimpl_->WordProb(w,ctx));
-      FSALMDBG(de,"p("<<TD::Convert(w)<<"|...)="<<p);FSALMDBGnl(de);
-      WordID *nst=(WordID *)new_st;
-      nst[0]=w; // new most recent word
-      to_state(nst+1,ctx,ctxlen_-1); // shift old words over
-#if LM_FSA_SHORTEN_CONTEXT
-      p+=pimpl_->ShortenContext(nst,ctxlen_);
-#endif
-      Add(p,a);
-    }
-  }
-
-#if FSA_LM_PHRASE
-  //FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?)
-  template <class Accum>
-  void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
-    Hypergraph::Edge &de=(Hypergraph::Edge &)edge;(void)de;
-    if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
-    /* // this is forcing unigram prob always.  we will instead build the phrase
-    if (!ctxlen_) {
-      Featval p=0;
-      for (;i<end;++i)
-        p+=floored(pimpl_->WordProb(*i,&empty_context));
-      Add(p,a);
-      return;
-    } */
-    int nw=end-begin;
-    WP st=(WP)old_st;
-    WP st_end=st+ctxlen_; // may include some null already (or none if full)
-    int nboth=nw+ctxlen_;
-    WordID ctx[nboth+1];
-    ctx[nboth]=TD__none;
-    // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0]
-    W ctx_score_end=wordcpy_reverse(ctx,begin,end);
-    wordcpy(ctx_score_end,st,st_end); // st already reversed.
-    assert(ctx_score_end==ctx+nw);
-    // we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones).  most contexts are full except for the shortest source spans.
-    FSALMDBG(de," scan.r->l(...)");
-    Featval p=0;
-    for (;ctx_score_end>ctx;--ctx_score_end)
-      p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end));
-    //TODO: look for score discrepancy -
-
-    // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that.  perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
-#if LM_FSA_SHORTEN_CONTEXT
-    p+=pimpl_->ShortenContext(ctx,nboth<ctxlen_?nboth:ctxlen_);
-#endif
-    Add(p,a);
-  }
-#endif
-};
-
-#endif
diff --git a/decoder/ff_register.h b/decoder/ff_register.h
--- a/decoder/ff_register.h
+++ b/decoder/ff_register.h
-// need to use factory rather than ctor.
-#if 0
-template <class FsaImpl>
-inline void RegisterFsa(bool ff_also=true,bool fsa_prefix_ff=true) {
-  assert(!ff_also);
-//  global_fsa_ff_registry->RegisterFsa<FsaImpl>();
-//if (ff_also) ff_registry.RegisterFF<FeatureFunctionFromFsa<FsaImpl> >(prefix_fsa(DynFsa::usage(false,false)),fsa_prefix_ff);
-}
-#endif
-
-//TODO: ff from fsa that uses pointer to fsa impl? e.g. in LanguageModel we share underlying lm file by recognizing same param, but without that effort, otherwise stateful ff may duplicate state if we enable both fsa and ff_from_fsa
-template <class FsaImpl>
-inline void RegisterFsaImpl(bool ff_also=true,bool fsa_prefix_ff=false) {
-  typedef FsaFeatureFunctionDynamic<FsaImpl> DynFsa;
-  typedef FeatureFunctionFromFsa<FsaImpl> FFFrom;
-  std::string name=FsaImpl::usage(false,false);
-  fsa_ff_registry.Register(new FsaFactory<DynFsa>);
-  if (ff_also)
-    ff_registry.Register(prefix_fsa(name,fsa_prefix_ff),new FFFactory<FFFrom>);
-}
 
 template <class FF>
 inline void RegisterFF() { ff_registry.Register(new FFFactory<FF>); }
 
-template <class FsaImpl>
-inline void RegisterFsaDynToFF(std::string name,bool prefix=true) {
-  typedef FsaFeatureFunctionDynamic<FsaImpl> DynFsa;
-  ff_registry.Register(prefix?"DynamicFsa"+name:name,new FFFactory<FeatureFunctionFromFsa<DynFsa> >);
-}
-
-template <class FsaImpl>
-inline void RegisterFsaDynToFF(bool prefix=true) {
-  RegisterFsaDynToFF<FsaImpl>(FsaImpl::usage(false,false),prefix);
-}
-
 void register_feature_functions();
 
 #endif
diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc
index 3be5b82d..5d1910fb 100644
--- a/decoder/hg_test.cc
+++ b/decoder/hg_test.cc
@@ -57,7 +57,7 @@ TEST_F(HGTest,Union) {
   c3 = ViterbiESentence(hg1, &t3);
   int l3 = ViterbiPathLength(hg1);
   cerr << c3 << "\t" << TD::GetString(t3) << endl;
-  EXPECT_FLOAT_EQ(c2, c3);
+  EXPECT_FLOAT_EQ(c2.as_float(), c3.as_float());
   EXPECT_EQ(TD::GetString(t2), TD::GetString(t3));
   EXPECT_EQ(l2, l3);
 
@@ -117,7 +117,7 @@ TEST_F(HGTest,InsideScore) {
   cerr << "cost: " << cost << "\n";
   hg.PrintGraphviz();
   prob_t inside = Inside(hg);
-  EXPECT_FLOAT_EQ(1.7934048, inside); // computed by hand
+  EXPECT_FLOAT_EQ(1.7934048, inside.as_float()); // computed by hand
   vector<prob_t> post;
   inside = hg.ComputeBestPathThroughEdges(&post);
   EXPECT_FLOAT_EQ(-0.3, log(inside)); // computed by hand
@@ -282,13 +282,13 @@ TEST_F(HGTest, TestGenericInside) {
   hg.Reweight(wts);
   vector<prob_t> inside;
   prob_t ins = Inside(hg, &inside);
-  EXPECT_FLOAT_EQ(1.7934048, ins); // computed by hand
+  EXPECT_FLOAT_EQ(1.7934048, ins.as_float()); // computed by hand
   vector<prob_t> outside;
   Outside(hg, inside, &outside);
   EXPECT_EQ(3, outside.size());
-  EXPECT_FLOAT_EQ(1.7934048, outside[0]);
-  EXPECT_FLOAT_EQ(1.3114071, outside[1]);
-  EXPECT_FLOAT_EQ(1.0, outside[2]);
+  EXPECT_FLOAT_EQ(1.7934048, outside[0].as_float());
+  EXPECT_FLOAT_EQ(1.3114071, outside[1].as_float());
+  EXPECT_FLOAT_EQ(1.0, outside[2].as_float());
 }
 
 TEST_F(HGTest,TestGenericInside2) {
@@ -327,8 +327,8 @@ TEST_F(HGTest,TestAddExpectations) {
   SparseVector<prob_t> feat_exps;
   prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(hg, &feat_exps);
-  EXPECT_FLOAT_EQ(-2.5439765, feat_exps.value(FD::Convert("f1")) / z);
-  EXPECT_FLOAT_EQ(-2.6357865, feat_exps.value(FD::Convert("f2")) / z);
+  EXPECT_FLOAT_EQ(-2.5439765, (feat_exps.value(FD::Convert("f1")) / z).as_float());
+  EXPECT_FLOAT_EQ(-2.6357865, (feat_exps.value(FD::Convert("f2")) / z).as_float());
   cerr << feat_exps << endl;
   cerr << "Z=" << z << endl;
 }
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index f87b7274..993627f0 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -9,6 +9,7 @@
 #include
 #include
+#include "stringlib.h"
 #include "verbose.h"
 #include "hg.h"
 #include "prob.h"
@@ -204,6 +205,7 @@ bool LoadAgenda(const string& file, vector<pair<string, int> >* a) {
 }
 
 int main(int argc, char** argv) {
+  cerr << "THIS SOFTWARE IS DEPRECATED YOU SHOULD USE mpi_flex_optimize\n";
 #ifdef HAVE_MPI
   mpi::environment env(argc, argv);
   mpi::communicator world;
-- 
cgit v1.2.3
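
Notes on the removed FSA framework. The sketches below illustrate the idioms used by the code deleted in the patches above; every name, signature, and constant in them is an illustrative assumption, not a cdec API.

The removed FsaTypedBase relies on the curiously recurring template pattern (CRTP): the base class is parameterized on its own derived class and downcasts through d(), so per-word calls such as ScanT1 bind at compile time rather than through a vtable. A minimal sketch of the idiom:

    #include <iostream>

    // Base is parameterized on the derived class; d() downcasts, so the
    // ScanWord call below resolves statically and can be inlined.
    template <class Impl>
    struct ScanBase {
      Impl const& d() const { return static_cast<Impl const&>(*this); }
      template <class It>
      double ScanPhrase(It i, It end) const {
        double p = 0;
        for (; i != end; ++i) p += d().ScanWord(*i); // no virtual dispatch
        return p;
      }
    };

    struct WordPenalty : ScanBase<WordPenalty> {
      double ScanWord(int /*w*/) const { return -1.0; } // constant cost per word
    };

    int main() {
      int sent[] = {3, 7, 9};
      std::cout << WordPenalty().ScanPhrase(sent, sent + 3) << "\n"; // prints -3
    }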
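
The deleted FsaScanner's st0/st1 pair is double buffering: a left-to-right scan only ever needs the current state and its successor, so two aligned slots in a single allocation suffice, and advancing is a pointer swap rather than a memcpy per word. A sketch of the same mechanism; the FF type and its ScanAccum signature here are placeholders:

    #include <cstring>
    #include <vector>

    struct Scanner {
      static const int ALIGN = 8;
      std::vector<char> states; // both buffers live in this one allocation
      void *st0, *st1, *cs;     // cs flips between st0 and st1
      int ssz;                  // state size in bytes
      explicit Scanner(int state_bytes) : ssz(state_bytes) {
        int stride = ((ssz + ALIGN - 1) / ALIGN) * ALIGN; // round up to ALIGN
        states.resize(stride + ssz);
        st0 = &states[0];
        st1 = &states[0] + stride;
        cs = st0;
      }
      void* nexts() const { return cs == st0 ? st1 : st0; }
      void reset(const void* state) { cs = st0; std::memcpy(st0, state, ssz); }
      template <class FF, class Accum>
      void scan(const FF& ff, int w, Accum* a) {
        void* ns = nexts();
        ff.ScanAccum(w, cs, ns, a); // reads cs, writes the successor into ns
        cs = ns;                    // bounce: no copy of the new state
      }
    };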
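
FsaFeatureFunctionDynamic is type erasure: one template adapter implements the virtual interface by forwarding to a concrete Impl held by value, so callers pay exactly one virtual hop while everything inside Impl stays statically dispatched. That is the point of the "doesn't go through multiple layers of dynamic dispatch" remark in the deleted header. A sketch with invented names, kept C++03-style to match the era of this code:

    // The only interface callers see.
    struct DynFsa {
      virtual double Scan(int w, const void* st, void* next_st) const = 0;
      virtual ~DynFsa() {}
    };

    // One adapter erases each concrete Impl: one virtual call here, then
    // static (inlinable) calls inside Impl.
    template <class Impl>
    struct DynFsaOf : public DynFsa {
      Impl impl;
      template <class Param>
      explicit DynFsaOf(const Param& p) : impl(p) {}
      virtual double Scan(int w, const void* st, void* next_st) const {
        return impl.Scan(w, st, next_st);
      }
    };

    template <class Impl, class Param>
    DynFsa* make_dyn_fsa(const Param& p) { // caller owns the result
      return new DynFsaOf<Impl>(p);
    }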
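
Finally, the deleted LanguageModelFsa::ScanPhraseAccum scores a whole phrase against a single buffer: the phrase is copied in reverse (most recent word first, the SRILM context convention), the old state is appended after it, and each word is then scored against the suffix to its right. A simplified sketch; lm_logp and the -1 sentinel (standing in for TD__none) are assumptions:

    #include <vector>

    // old_ctx is most-recent-first, like the deleted code's state layout.
    double ScorePhrase(const std::vector<int>& phrase,
                       const std::vector<int>& old_ctx,
                       double (*lm_logp)(int w, const int* ctx)) {
      // Buffer layout: reversed phrase, then old context, then sentinel.
      std::vector<int> ctx(phrase.rbegin(), phrase.rend());
      ctx.insert(ctx.end(), old_ctx.begin(), old_ctx.end());
      ctx.push_back(-1); // terminator, as TD__none terminates real states
      double p = 0;
      // Word k of the phrase sits at ctx[phrase.size()-1-k]; its n-gram
      // context is everything after it, so score right-to-left in one pass.
      for (int i = (int)phrase.size(); i > 0; --i)
        p += lm_logp(ctx[i - 1], &ctx[i]);
      return p;
    }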