-rwxr-xr-x  decoder/apply_fsa_models.cc |  40
-rwxr-xr-x  decoder/cfg.cc              | 160
-rwxr-xr-x  decoder/cfg.h               | 149
-rwxr-xr-x  decoder/cfg_binarize.h      |   6
-rw-r--r--  decoder/exp_semiring.h      |  10
-rw-r--r--  decoder/hg.h                |   2
-rw-r--r--  decoder/inside_outside.h    |   6
-rw-r--r--  decoder/viterbi.h           |   2
-rwxr-xr-x  utils/hash.h                |  27
-rwxr-xr-x  utils/indices_after.h       |  61
-rwxr-xr-x  utils/named_enum.h          |   5
-rw-r--r--  utils/small_vector.h        |  24
12 files changed, 453 insertions(+), 39 deletions(-)
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc
index 2854b28b..c9bda68b 100755
--- a/decoder/apply_fsa_models.cc
+++ b/decoder/apply_fsa_models.cc
@@ -13,6 +13,46 @@
using namespace std;
+//impl details (not exported). flat namespace for my ease.
+
+typedef CFG::BinRhs BinRhs;
+typedef CFG::NTs NTs;
+typedef CFG::NT NT;
+typedef CFG::NTHandle NTHandle;
+typedef CFG::Rules Rules;
+typedef CFG::Rule Rule;
+typedef CFG::RuleHandle RuleHandle;
+
+namespace {
+
+// if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently. if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion. this is effectively left-branching binarization, of course.
+template <class K,class V>
+struct prefix_map_type {
+ typedef std::map<K,V> type;
+};
+//template typedef
+#define PREFIX_MAP(k,v) prefix_map_type<k,v>::type
+typedef NTHandle LHS;
+struct PrefixTrieNode {
+ prob_t backward; // (viterbi) backward prob (for cost pushing)
+ typedef PREFIX_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS
+ Completed completed;
+ typedef PREFIX_MAP(WordID,PrefixTrieNode *) Adj;
+ Adj adj;
+ //TODO:
+};
+
+struct PrefixTrie {
+ CFG const* cfgp;
+ CFG const& cfg() const { return *cfgp; }
+ PrefixTrie(CFG const& cfg) : cfgp(&cfg) {
+//TODO:
+ }
+};
+
+}//anon ns
+
+
DEFINE_NAMED_ENUM(FSA_BY)
struct ApplyFsa {
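
[editor's note] A rough illustration of the prefix-trie plan sketched in the comment above (sketch only, not code from this commit): insert each rule's rhs left to right, cost-pushing the rule's viterbi prob so every node on the path knows the best completion beneath it. Assumes prob_t() is the semiring zero and that Rule exposes lhs/rhs/p as in cfg.h.

    // hypothetical helper -- not part of this commit
    void AddRule(PrefixTrieNode &root, Rule const& r, RuleHandle h) {
      PrefixTrieNode *n = &root;
      for (CFG::RHS::const_iterator i = r.rhs.begin(), e = r.rhs.end(); i != e; ++i) {
        if (n->backward < r.p) n->backward = r.p; // push best (viterbi) prob toward root
        PrefixTrieNode *&c = n->adj[*i];          // WordID -> child; NTs are encoded <= 0
        if (!c) c = new PrefixTrieNode();
        n = c;
      }
      if (n->backward < r.p) n->backward = r.p;
      n->completed[r.lhs] = h; // one rule per (prefix, lhs) signature
    }
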
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 8fc5b10f..a279c74a 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -6,24 +6,156 @@
#include "batched_append.h"
#include <limits>
#include "fast_lexical_cast.hpp"
+//#include "indices_after.h"
using namespace std;
+typedef CFG::Rule Rule;
+typedef CFG::NTOrder NTOrder;
+typedef CFG::RHS RHS;
+
+/////index ruleids:
+void CFG::UnindexRules() {
+ for (NTs::iterator n=nts.begin(),nn=nts.end();n!=nn;++n)
+ n->ruleids.clear();
+}
+
+void CFG::ReindexRules() {
+ UnindexRules();
+ for (int i=0,e=rules.size();i<e;++i)
+ if (!rules[i].is_null())
+ nts[rules[i].lhs].ruleids.push_back(i);
+}
+
+//////topo order:
+namespace {
+typedef std::vector<char> Seen; // 0 = unseen, 1 = seen+finished, 2 = open (for cycle detection; seen but not finished)
+enum { UNSEEN=0,SEEN,OPEN };
+
+
+// bottom -> top topo order (rev head->tails topo)
+template <class OutOrder>
+struct CFGTopo {
+// (negligible) efficiency alternative: close over all the args except ni, so they're effectively passed as a single pointer. also makes visiting tail_nts simpler.
+ CFG const& cfg;
+ OutOrder outorder;
+ std::ostream *cerrp;
+ CFGTopo(CFG const& cfg,OutOrder const& outorder,std::ostream *cerrp=&std::cerr)
+ : cfg(cfg),outorder(outorder),cerrp(cerrp) // closure over args
+ , seen(cfg.nts.size()) { }
+
+ Seen seen;
+ void operator()(CFG::NTHandle ni) {
+ char &seenthis=seen[ni];
+ if (seenthis==UNSEEN) {
+ seenthis=OPEN;
+
+ CFG::NT const& nt=cfg.nts[ni];
+ for (CFG::Ruleids::const_iterator i=nt.ruleids.begin(),e=nt.ruleids.end();i!=e;++i) {
+ Rule const& r=cfg.rules[*i];
+ r.visit_rhs_nts(*this); // recurse.
+ }
+
+ *outorder++=ni; // dfs finishing time order = reverse topo.
+ seenthis=SEEN;
+ } else if (cerrp && seenthis==OPEN) {
+ std::ostream &cerr=*cerrp;
+ cerr<<"WARNING: CFG Topo order attempt failed: NT ";
+ cfg.print_nt_name(cerr,ni);
+ cerr<<" already reached from goal(top) ";
+ cfg.print_nt_name(cerr,cfg.goal_nt);
+ cerr<<". Continuing to reorder, but it's not fully topological.\n";
+ }
+ }
+
+};
+
+template <class O>
+void DoCFGTopo(CFG const& cfg,CFG::NTHandle goal,O const& o,std::ostream *w=0) {
+ CFGTopo<O> ct(cfg,o,w);
+ ct(goal);
+}
+
+}//ns
+
+// you would need to do this only if you didn't build from hg, or if you Binarize without the bin_topo option. note: this doesn't sort the list of rules; it's assumed that if you care about the topo order you'll iterate over nodes.
+void CFG::OrderNTsTopo(NTOrder *o_,std::ostream *cycle_complain) {
+ NTOrder &o=*o_;
+ o.resize(nts.size());
+ DoCFGTopo(*this,goal_nt,o.begin(),cycle_complain);
+}
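
[editor's note] For illustration (sketch using only APIs added in this commit), the order computed here plugs straight into ReorderNTs; the no-argument OrderNTsTopo() convenience in cfg.h does exactly this pairing:

    CFG::NTOrder order;
    cfg.OrderNTsTopo(&order, &std::cerr); // def-before-use order; warn on back edges
    cfg.ReorderNTs(order);                // physically remap nts, goal_nt, and rule rhs
    // equivalently: cfg.OrderNTsTopo(&std::cerr);
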
+
+
+/////sort/uniq:
namespace {
-CFG::BinRhs nullrhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
+RHS null_rhs(1,std::numeric_limits<int>::min());
+
+//sort
+struct ruleid_best_first {
+ CFG::Rules const* rulesp;
+ bool operator()(int a,int b) const { // true if a >(prob for ruleid) b
+ return (*rulesp)[b].p < (*rulesp)[a].p;
+ }
+};
+
+//uniq
+struct prob_pos {
+ prob_pos() {}
+ prob_pos(prob_t prob,int pos) : prob(prob),pos(pos) {}
+ prob_t prob;
+ int pos;
+ bool operator <(prob_pos const& o) const { return prob<o.prob; }
+};
+}//ns
+
+void CFG::UniqRules(NTHandle ni) {
+ typedef HASH_MAP<RHS,prob_pos> BestRHS; // faster to use trie? maybe.
+ BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position).
+ HASH_MAP_EMPTY(bestp,null_rhs);
+ Ruleids &adj=nts[ni].ruleids;
+ Ruleids oldadj=adj;
+ int oi=0;
+ for (int i=0,e=oldadj.size();i!=e;++i) { // this beautiful complexity is to ensure that adj' is a subsequence of adj (without duplicates)
+ int ri=oldadj[i];
+ Rule const& r=rules[ri];
+ prob_pos pi(r.p,oi);
+ prob_pos &oldpi=get_default(bestp,r.rhs,pi);
+ if (oldpi.pos==oi) {// newly inserted
+ adj[oi++]=ri;
+ } else if (oldpi.prob<pi.prob) { // we improve prev. best (overwrite it @old pos)
+ oldpi.prob=pi.prob;
+ adj[oldpi.pos]=ri; // replace worse rule w/ better
+ }
+ }
+ // post: oi = number of new adj
+ adj.resize(oi);
+}
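
[editor's note] The loop above is an instance of a general pattern: deduplicate by key while keeping the output a subsequence of the input, and let a later, better-scoring duplicate overwrite its earlier representative in place. A standalone sketch of the same pattern on plain std:: containers:

    #include <map>
    #include <vector>
    #include <utility>
    typedef std::pair<int,double> KV; // (key, score)
    std::vector<KV> uniq_best(std::vector<KV> const& in) {
      std::vector<KV> out;
      std::map<int,size_t> pos; // key -> index of its representative in out
      for (size_t i = 0; i < in.size(); ++i) {
        std::map<int,size_t>::iterator f = pos.find(in[i].first);
        if (f == pos.end()) {           // newly seen key: append, remember slot
          pos[in[i].first] = out.size();
          out.push_back(in[i]);
        } else if (out[f->second].second < in[i].second)
          out[f->second] = in[i];       // better duplicate overwrites in place
      }
      return out;
    }
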
+
+void CFG::SortLocalBestFirst(NTHandle ni) {
+ ruleid_best_first r;
+ r.rulesp=&rules;
+ Ruleids &adj=nts[ni].ruleids;
+ std::stable_sort(adj.begin(),adj.end(),r);
+}
+
+
+/////binarization:
+namespace {
+
+CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
// index i >= N.size()? then it's in M[i-N.size()]
WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
{
int nn=N.size();
ostringstream o;
-#define BinNameOWORD(w) \
- do { \
- int n=w; if (n>0) o << TD::Convert(n); \
- else { \
- int i=-n; \
- if (i<nn) o<<N[i].from<<i; else o<<M[i-nn].from; \
- } \
+#define BinNameOWORD(w) \
+ do { \
+ int n=w; if (n>0) o << TD::Convert(n); \
+ else { \
+ int i=-n; \
+ if (i<nn) o<<N[i].from<<i; else o<<M[i-nn].from; \
+ } \
} while(0)
BinNameOWORD(b.first);
@@ -33,9 +165,7 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
return TD::Convert(o.str());
}
-}
-
-
+}//ns
void CFG::Binarize(CFGBinarize const& b) {
if (!b.Binarizing()) return;
@@ -43,12 +173,12 @@ void CFG::Binarize(CFGBinarize const& b) {
assert(b.bin_l2r);
return;
}
- // l2r only so far:
+ //TODO: l2r only so far:
cerr << "Binarizing "<<b<<endl;
HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.
- HASH_MAP_EMPTY(bin2lhs,nullrhs);
+ HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);
int rhsmin=b.bin_unary?0:1;
- // iterate using indices and not iterators because we'll be adding to both nodes and edge list. we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones.
+ // iterate using indices and not iterators because we'll be adding to both the nts and rules lists. we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones.
NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation
Rules new_rules;
//TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target)
@@ -77,6 +207,8 @@ void CFG::Binarize(CFGBinarize const& b) {
}
batched_append_swap(nts,new_nts);
batched_append_swap(rules,new_rules);
+ if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order
+ OrderNTsTopo();
}
namespace {
diff --git a/decoder/cfg.h b/decoder/cfg.h
index e1f818e8..b214d8dc 100755
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
@@ -5,6 +5,11 @@
#ifndef CFG_DEBUG
# define CFG_DEBUG 0
#endif
+#if CFG_DEBUG
+# define IF_CFG_DEBUG(x) x
+#else
+# define IF_CFG_DEBUG(x)
+#endif
/* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg. this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector). but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph. so hg pointer may be null? multiple types of CFG? always copy the feature vector? especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg
@@ -28,6 +33,7 @@
#include "small_vector.h"
#include "nt_span.h"
#include <algorithm>
+#include "indices_after.h"
class Hypergraph;
class CFGFormat; // #include "cfg_format.h"
@@ -47,7 +53,10 @@ struct CFG {
struct Rule {
// for binarizing - no costs/probs
- Rule() { }
+ Rule() : lhs(-1) { }
+ bool is_null() const { return lhs<0; }
+ void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_DEBUG(rule.reset();) }
+
Rule(int lhs,BinRhs const& binrhs) : lhs(lhs),rhs(2),p(1) {
rhs[0]=binrhs.first;
rhs[1]=binrhs.second;
@@ -57,18 +66,59 @@ struct CFG {
RHS rhs;
prob_t p; // h unused for now (there's nothing admissible, and p is already using 1st pass inside as pushed toward top)
FeatureVector f; // may be empty, unless copy_features on Init
-#if CFG_DEBUG
- TRulePtr rule; // normally no use for this (waste of space)
-#endif
+ IF_CFG_DEBUG(TRulePtr rule;)
void Swap(Rule &o) {
using namespace std;
swap(lhs,o.lhs);
swap(rhs,o.rhs);
swap(p,o.p);
swap(f,o.f);
-#if CFG_DEBUG
- swap(rule,o.rule);
-#endif
+ IF_CFG_DEBUG(swap(rule,o.rule);)
+ }
+ template<class V>
+ void visit_rhs_nts(V &v) const {
+ for (RHS::const_iterator i=rhs.begin(),e=rhs.end();i!=e;++i) {
+ WordID w=*i;
+ if (w<=0)
+ v(-w);
+ }
+ }
+ template<class V>
+ void visit_rhs_nts(V const& v) const {
+ for (RHS::const_iterator i=rhs.begin(),e=rhs.end();i!=e;++i) {
+ WordID w=*i;
+ if (w<=0)
+ v(-w);
+ }
+ }
+
+ template<class V>
+ void visit_rhs(V &v) const {
+ for (RHS::const_iterator i=rhs.begin(),e=rhs.end();i!=e;++i) {
+ WordID w=*i;
+ if (w<=0)
+ v.visit_nt(-w);
+ else
+ v.visit_t(w);
+ }
+ }
+
+ // returns true iff the rule survives (0 or 1 non-null rules remain; false means an rhs NT was removed, so the rule was set null).
+ template <class O>
+ bool reorder_from(O &order,NTHandle removed=-1) {
+ for (RHS::iterator i=rhs.begin(),e=rhs.end();i!=e;++i) {
+ WordID &w=*i;
+ if (w<=0) {
+ int oldnt=-w;
+ NTHandle newnt=(NTHandle)order[oldnt]; // e.g. unsigned to int (-1) conversion should be ok
+ if (newnt==removed) {
+ set_null();
+ return false;
+ }
+ w=-newnt;
+ }
+ }
+ return true;
}
};
@@ -91,6 +141,17 @@ struct CFG {
bool Uninitialized() const { return uninit; }
void Clear();
bool Empty() const { return nts.empty(); }
+ void UnindexRules(); // save some space?
+ void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules)
+ void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids.
+ inline void UniqRules() {
+ for (int i=0,e=nts.size();i!=e;++i) UniqRules(i);
+ }
+
+ void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest. when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this. stable in case of ties
+ inline void SortLocalBestFirst() {
+ for (int i=0,e=nts.size();i!=e;++i) SortLocalBestFirst(i);
+ }
void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true);
void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h
void PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& format) const;
@@ -104,7 +165,79 @@ struct CFG {
swap(nts,o.nts);
swap(goal_nt,o.goal_nt);
}
- void Binarize(CFGBinarize const& binarize_options);
+
+ typedef std::vector<NTHandle> NTOrder; // a list of nts, in definition-before-use order.
+
+ //perhaps: give up on templated Order and move the below to .cc (NTOrder should be fine)
+
+ // post: iterating nts 0,1... the same as order[0],order[1],... ; return number of non-null rules (these aren't actually deleted)
+ // pre: order is (without duplicates) a range of NTHandle
+ template <class Order>
+ int ReorderNTs(Order const& order) {
+ using namespace std;
+ int nn=nts.size();
+#if 0
+ NTs newnts(order.size()); // because the (sub)permutation order may have e.g. 1<->4
+ int ni=0;
+ for (typename Order::const_iterator i=order.begin(),e=order.end();i!=e;++i) {
+ assert(*i<nn);
+ swap(newnts[ni++],nts[*i]);
+ }
+ swap(newnts,nts);
+#endif
+ indices_after remap_nti;
+ remap_nti.init_inverse_order(nn,order);
+ remap_nti.do_moves_swap(nts); // equally efficient (or more) than the disabled nt swapping above
+ goal_nt=remap_nti.map[goal_nt]; // remap goal, of course
+ // fix rule ids
+ return RemapRules(remap_nti.map,(NTHandle)indices_after::REMOVED);
+ }
+
+ // return # of kept rules (not null)
+ template <class NTHandleRemap>
+ int RemapRules(NTHandleRemap const& remap_nti,NTHandle removed=-1) {
+ int n_non_null=0;
+ for (int i=0,e=rules.size();i<e;++i)
+ n_non_null+=rules[i].reorder_from(remap_nti,removed);
+ return n_non_null;
+ }
+
+ // call after rules are indexed.
+ template <class V>
+ void VisitRuleIds(V &v) {
+ for (int i=0,e=nts.size();i<e;++i)
+ for (Ruleids::const_iterator j=nts[i].ruleids.begin(),jj=nts[i].ruleids.end();j!=jj;++j)
+ v(*j);
+ }
+ template <class V>
+ void VisitRuleIds(V const& v) {
+ for (int i=0,e=nts.size();i<e;++i)
+ for (Ruleids::const_iterator j=nts[i].ruleids.begin(),jj=nts[i].ruleids.end();j!=jj;++j)
+ v(*j);
+ }
+
+ // no index needed
+ template <class V>
+ void VisitRulesUnindexed(V const &v) {
+ for (int i=0,e=rules.size();i<e;++i)
+ if (!rules[i].is_null())
+ v(i,rules[i]);
+ }
+
+
+
+ void OrderNTsTopo(NTOrder *o,std::ostream *cycle_complain=0); // places NTs in defined (completely) bottom-up before use order. this is actually reverse topo order considering edges from lhs->rhs.
+ // you would need to do this only if you didn't build from hg, or if you Binarize without the bin_topo option.
+ // note: this doesn't sort the list of rules; it's assumed that if you care about the topo order you'll iterate over nodes.
+ // cycle_complain means to warn in case of back edges. it's not necessary to prevent inf. loops. you get some order that's not topo if there are loops. starts from goal_nt, of course.
+
+ void OrderNTsTopo(std::ostream *cycle_complain=0) {
+ NTOrder o;
+ OrderNTsTopo(&o,cycle_complain);
+ ReorderNTs(o);
+ }
+
+ void Binarize(CFGBinarize const& binarize_options); // see cfg_binarize.h for docs
typedef std::vector<NT> NTs;
NTs nts;
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index f76619d2..c5303622 100755
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
@@ -18,6 +18,8 @@ struct CFGBinarize {
bool bin_l2r;
bool bin_unary;
bool bin_name_nts;
+ bool bin_uniq;
+ bool bin_topo;
template <class Opts> // template to support both printable_opts and boost nonprintable
void AddOptions(Opts *opts) {
opts->add_options()
@@ -25,6 +27,8 @@ struct CFGBinarize {
("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")
("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")
("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")
+ ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob")
+ ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order). otherwise the virtual NTs will all appear after the regular NTs")
;
}
void Validate() {
@@ -40,6 +44,8 @@ struct CFGBinarize {
return bin_l2r || bin_at>0;
}
void set_defaults() {
+ bin_topo=false;
+ bin_uniq=true;
bin_at=0;
bin_unary=false;
bin_name_nts=true;
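
[editor's note] Taken together, a caller might drive the new flags like this (sketch; cfg is a CFG as declared in decoder/cfg.h):

    CFGBinarize b;
    b.set_defaults();  // bin_uniq=true, bin_topo=false, bin_name_nts=true, ...
    b.bin_l2r = true;  // force (a (b (c d))) left-to-right binarization
    b.bin_topo = true; // restore definition-before-use NT order afterwards
    if (b.Binarizing())
      cfg.Binarize(b); // with bin_topo set, this ends by calling OrderNTsTopo()
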
diff --git a/decoder/exp_semiring.h b/decoder/exp_semiring.h
index f91beee4..111eaaf1 100644
--- a/decoder/exp_semiring.h
+++ b/decoder/exp_semiring.h
@@ -14,7 +14,7 @@
// PType scalar, RType vector
// BAD examples:
// PType vector, RType scalar
-template <typename PType, typename RType>
+template <class PType, class RType>
struct PRPair {
PRPair() : p(), r() {}
// Inside algorithm requires that T(0) and T(1)
@@ -35,26 +35,26 @@ struct PRPair {
RType r;
};
-template <typename P, typename R>
+template <class P, class R>
std::ostream& operator<<(std::ostream& o, const PRPair<P,R>& x) {
return o << '<' << x.p << ", " << x.r << '>';
}
-template <typename P, typename R>
+template <class P, class R>
const PRPair<P,R> operator+(const PRPair<P,R>& a, const PRPair<P,R>& b) {
PRPair<P,R> result = a;
result += b;
return result;
}
-template <typename P, typename R>
+template <class P, class R>
const PRPair<P,R> operator*(const PRPair<P,R>& a, const PRPair<P,R>& b) {
PRPair<P,R> result = a;
result *= b;
return result;
}
-template <typename P, typename PWeightFunction, typename R, typename RWeightFunction>
+template <class P, class PWeightFunction, class R, class RWeightFunction>
struct PRWeightFunction {
explicit PRWeightFunction(const PWeightFunction& pwf = PWeightFunction(),
const RWeightFunction& rwf = RWeightFunction()) :
diff --git a/decoder/hg.h b/decoder/hg.h
index e9510997..03221c74 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -392,7 +392,7 @@ public:
// multiple the weights vector by the edge feature vector
// (inner product) to set the edge probabilities
- template <typename V>
+ template <class V>
void Reweight(const V& weights) {
for (int i = 0; i < edges_.size(); ++i) {
Edge& e = edges_[i];
diff --git a/decoder/inside_outside.h b/decoder/inside_outside.h
index e3bd4592..73d4ec6a 100644
--- a/decoder/inside_outside.h
+++ b/decoder/inside_outside.h
@@ -27,7 +27,7 @@ struct Boolean {
// score for each node
// NOTE: WeightType() must construct the semiring's additive identity
// WeightType(1) must construct the semiring's multiplicative identity
-template<typename WeightType, typename WeightFunction>
+template<class WeightType, class WeightFunction>
WeightType Inside(const Hypergraph& hg,
std::vector<WeightType>* result = NULL,
const WeightFunction& weight = WeightFunction()) {
@@ -58,7 +58,7 @@ WeightType Inside(const Hypergraph& hg,
return inside_score.back();
}
-template<typename WeightType, typename WeightFunction>
+template<class WeightType, class WeightFunction>
void Outside(const Hypergraph& hg,
std::vector<WeightType>& inside_score,
std::vector<WeightType>* result,
@@ -179,7 +179,7 @@ struct InsideOutsides {
// NOTE: XType * KType must be valid (and yield XType)
// NOTE: This may do things slightly differently than you are used to, please
// read the description in Li and Eisner (2009) carefully!
-template<typename KType, typename KWeightFunction, typename XType, typename XWeightFunction>
+template<class KType, class KWeightFunction, class XType, class XWeightFunction>
KType InsideOutside(const Hypergraph& hg,
XType* result_x,
const KWeightFunction& kwf = KWeightFunction(),
diff --git a/decoder/viterbi.h b/decoder/viterbi.h
index ea2bc6c2..e78cd157 100644
--- a/decoder/viterbi.h
+++ b/decoder/viterbi.h
@@ -16,7 +16,7 @@ std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest"
// WeightFunction must implement:
// typedef prob_t Weight;
// Weight operator()(Hypergraph::Edge const& e) const;
-template<typename Traversal,typename WeightFunction>
+template<class Traversal,class WeightFunction>
typename WeightFunction::Weight Viterbi(const Hypergraph& hg,
typename Traversal::Result* result,
const Traversal& traverse,
diff --git a/utils/hash.h b/utils/hash.h
index e89b1863..b0b1c43e 100755
--- a/utils/hash.h
+++ b/utils/hash.h
@@ -58,4 +58,31 @@ typename H::mapped_type & get_default(H &ht,K const& k,typename H::mapped_type c
return const_cast<typename H::mapped_type &>(ht.insert(typename H::value_type(k,v)).first->second);
}
+// the below could also return a ref to the mapped max/min. as written they have the advantage of not falsely claiming an improvement when an equal value already existed; otherwise you could just modify get_default and assume any equal value is new.
+template <class H,class K>
+bool improve_mapped_max(H &ht,K const& k,typename H::mapped_type const& v) {
+ std::pair<typename H::iterator,bool> inew=ht.insert(typename H::value_type(k,v));
+ if (inew.second) return true;
+ typedef typename H::mapped_type V;
+ V &oldv=const_cast<V&>(inew.first->second);
+ if (oldv<v) {
+ oldv=v;
+ return true;
+ }
+ return false;
+}
+
+template <class H,class K>
+bool improve_mapped_min(H &ht,K const& k,typename H::mapped_type const& v) {
+ std::pair<typename H::iterator,bool> inew=ht.insert(typename H::value_type(k,v));
+ if (inew.second) return true;
+ typedef typename H::mapped_type V;
+ V &oldv=const_cast<V&>(inew.first->second);
+ if (v<oldv) { // the only difference from above
+ oldv=v;
+ return true;
+ }
+ return false;
+}
+
#endif
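
[editor's note] A usage sketch (hypothetical key and scores): unlike the get_default idiom above, the return value distinguishes "new or strictly better" from "no improvement":

    #include <map>
    #include <string>
    std::map<std::string,double> best;
    bool a = improve_mapped_max(best, std::string("rule42"), 0.5); // true: newly inserted
    bool b = improve_mapped_max(best, std::string("rule42"), 0.5); // false: equal, not better
    bool c = improve_mapped_max(best, std::string("rule42"), 0.9); // true: strict improvement
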
diff --git a/utils/indices_after.h b/utils/indices_after.h
index 2b662d5c..2891563c 100755
--- a/utils/indices_after.h
+++ b/utils/indices_after.h
@@ -1,6 +1,9 @@
#ifndef INDICES_AFTER_REMOVING_H
#define INDICES_AFTER_REMOVING_H
+//TODO: instead of making REMOVED a constant, make it class state (defaults to -1 as before, of course).
+
+#include <assert.h>
#include <boost/config.hpp> // STATIC_CONSTANT
#include <algorithm> //swap
#include <iterator>
@@ -48,19 +51,50 @@ unsigned new_indices(KEEP keep,O out) {
return new_indices(keep.begin(),keep.end(),out);
}
+template <class V,class Out,class Permi>
+void copy_perm_to(Out o,V const& from,Permi i,Permi e) {
+ for (;i!=e;++i)
+ *o++=from[*i];
+}
+
+//to cannot be same as from, for most permutations. for to==from, use indices_after::init_inverse_order instead.
+template <class Vto,class Vfrom,class Perm>
+void remap_perm_to(Vto &to,Vfrom const& from,Perm const& p) {
+ to.resize(p.size());
+ copy_perm_to(to.begin(),from,p.begin(),p.end());
+}
+
// given a vector and a parallel sequence of bools where true means keep, keep only the marked elements while maintaining order.
// this is done with a parallel sequence to the input, marked with positions the kept items would map into in a destination array, with removed items marked with the index -1. the reverse would be more compact (parallel to destination array, index of input item that goes into it) but would require the input sequence be random access.
struct indices_after
{
BOOST_STATIC_CONSTANT(unsigned,REMOVED=(unsigned)-1);
unsigned *map; // map[i] == REMOVED if i is deleted
- unsigned n_kept;
+ unsigned n_kept; // important to init this.
unsigned n_mapped;
template <class AB,class ABe>
indices_after(AB i, ABe end) {
init(i,end);
}
+ template <class Order>
+ void init_inverse_order(unsigned from_sz,Order const& order) {
+ init_inverse_order(from_sz,order.begin(),order.end());
+ }
+ template <class OrderI>
+ void init_inverse_order(unsigned from_sz,OrderI i,OrderI end) {
+ init_alloc(from_sz);
+ unsigned d=0;
+ n_kept=0;
+ for(;i!=end;++i) {
+ assert(d<from_sz);
+ map[d++]=*i;
+ ++n_kept;
+ }
+ for(;d<from_sz;++d)
+ map[d]=REMOVED;
+ }
+
template <class Vec,class R>
void init_keep_if(Vec v,R const& r)
{
@@ -69,6 +103,25 @@ struct indices_after
map=(unsigned *)::operator new(sizeof(unsigned)*n_mapped);
n_kept=new_indices_keep_if_n(n_mapped,r,map);
}
+ // contents uninit.
+ void init_alloc(unsigned n) {
+ free();
+ n_mapped=n;
+ map=n_mapped>0 ?
+ (unsigned *)::operator new(sizeof(unsigned)*n_mapped)
+ : 0;
+ }
+ void init_const(unsigned n,unsigned map_all_to) {
+ init_alloc(n);
+ for (unsigned i=0;i<n;++i)
+ map[i]=map_all_to;
+ n_kept=(map_all_to==REMOVED)?0:n;
+ }
+
+ void init_keep_none(unsigned n) {
+ init_const(n,REMOVED);
+ n_kept=0;
+ }
template <class AB,class ABe>
void init(AB i, ABe end) {
@@ -93,9 +146,14 @@ struct indices_after
indices_after() : n_mapped(0) {}
~indices_after()
{
+ free();
+ }
+ void free() {
if (n_mapped)
::operator delete((void*)map);
+ n_mapped=0;
}
+
bool removing(unsigned i) const
{
return map[i] == REMOVED;
@@ -127,7 +185,6 @@ struct indices_after
{
using std::swap;
assert(v.size()==n_mapped);
- unsigned r=n_mapped;
unsigned i=0;
for (;i<n_mapped&&keeping(i);++i) ;
for(;i<n_mapped;++i)
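
[editor's note] A sketch of the keep-mask entry point (init_inverse_order, as used by CFG::ReorderNTs, is the other way to build the map): mark a parallel bool sequence, compact in place, then consult map for old->new indices:

    std::vector<int> v;                 // filled elsewhere
    std::vector<bool> keep(v.size());
    for (unsigned i = 0; i < v.size(); ++i)
      keep[i] = (v[i] >= 0);            // keep the nonnegative entries
    indices_after ia(keep.begin(), keep.end());
    ia.do_moves_swap(v);                // compact v in place, preserving order
    // ia.map[old_i] is the new index, or indices_after::REMOVED if dropped
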
diff --git a/utils/named_enum.h b/utils/named_enum.h
index 447dff78..675ec868 100755
--- a/utils/named_enum.h
+++ b/utils/named_enum.h
@@ -2,13 +2,16 @@
#define NAMED_ENUM_H
#ifndef NAMED_ENUM_USE_OPTIONAL
-# define NAMED_ENUM_USE_OPTIONAL 1
+# define NAMED_ENUM_USE_OPTIONAL 0
#endif
//TODO: efficiency - supply map type (e.g. std::map or tr1::unordered_map) for string->int (int->string already fast w/ switch) - then implement iterators that don't assume contiguous ids.
//TODO: hidden (can't convert string->id, but can do reverse) sentinel values. XX (hidden) and XY (can convert to)
//TODO: bitfield "A|B" strings - note: slightly complicates int->string, as well.
//TODO: option for case-insensitive compare (ctype tolower?)
+//TODO: program_options validate method so you can declare po::value<MyEnum> instead of po::value<string>?
+//TODO: cout << MyEnum ?
+//impossible: (without wrapping in struct) MyEnum(string)
/* named enum (string<->int). note: inefficient linear search for string->int
diff --git a/utils/small_vector.h b/utils/small_vector.h
index 077a524a..8c1c3bfe 100644
--- a/utils/small_vector.h
+++ b/utils/small_vector.h
@@ -15,6 +15,8 @@
#include <new>
#include <stdint.h>
#include "swap_pod.h"
+#include <boost/functional/hash.hpp>
+
//sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1
template <class T,int SV_MAX=2>
@@ -250,6 +252,15 @@ public:
swap_pod(*this,o);
}
+ inline std::size_t hash() const {
+ using namespace boost;
+ if (size_==0) return 0;
+ if (size_==1) return hash_value(data_.vals[0]);
+ if (size_<=SV_MAX)
+ return hash_range(data_.vals,data_.vals+size_);
+ return hash_range(data_.ptr,data_.ptr+size_);
+ }
+
private:
union StorageType {
T vals[SV_MAX];
@@ -260,15 +271,20 @@ public:
uint16_t capacity_; // only defined when size_ > __SV_MAX_STATIC
};
-template <class T,int SV_MAX>
-inline void swap(SmallVector<T,SV_MAX> &a,SmallVector<T,SV_MAX> &b) {
+template <class T,int M>
+std::size_t hash_value(SmallVector<T,M> const& x) {
+ return x.hash();
+}
+
+template <class T,int M>
+inline void swap(SmallVector<T,M> &a,SmallVector<T,M> &b) {
a.swap(b);
}
typedef SmallVector<int,2> SmallVectorInt;
-template <class T,int N>
-void memcpy(void *out,SmallVector<T,N> const& v) {
+template <class T,int M>
+void memcpy(void *out,SmallVector<T,M> const& v) {
std::memcpy(out,v.begin(),v.size()*sizeof(T));
}
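
[editor's note] Why the hash matters upstream: cfg.cc now keys a HASH_MAP on RHS, which is a SmallVector of WordIDs, and boost-style containers find this hash_value overload by argument-dependent lookup. A sketch (assuming SmallVector's usual push_back and operator==):

    #include <boost/unordered_map.hpp>
    boost::unordered_map<SmallVectorInt,double> m; // default hasher boost::hash<> calls hash_value
    SmallVectorInt k;
    k.push_back(3);
    k.push_back(-1);
    m[k] = 0.5; // hashes inline storage now; heap storage once size_ > SV_MAX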