From e10981b37ecc42cafca3e6d05e1eb44602b213b3 Mon Sep 17 00:00:00 2001
From: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Thu, 19 Aug 2010 04:46:18 +0000
Subject: ValueArray instead of string for state is 10% faster

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@599 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 decoder/apply_fsa_models.cc | 157 +++++++++++++++++++++++++++++++++++++-------
 decoder/cdec.cc             |   2 +
 decoder/cfg.cc              |   4 +-
 decoder/decode.sh           |   2 +-
 decoder/ff.h                |   5 +-
 utils/hash.h                |  28 ++++++++
 utils/maybe_update_bound.h  |  17 +++++
 7 files changed, 189 insertions(+), 26 deletions(-)
 create mode 100755 utils/maybe_update_bound.h
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc
index b43c02a4..7cd5fc6d 100755
--- a/decoder/apply_fsa_models.cc
+++ b/decoder/apply_fsa_models.cc
@@ -1,3 +1,4 @@
+#include "maybe_update_bound.h"
 #include "apply_fsa_models.h"
 #include "hg.h"
 #include "ff_fsa_dynamic.h"
@@ -20,6 +21,7 @@ using namespace std;
 
 //impl details (not exported).  flat namespace for my ease.
 
+typedef CFG::RHS RHS;
 typedef CFG::BinRhs BinRhs;
 typedef CFG::NTs NTs;
 typedef CFG::NT NT;
@@ -50,7 +52,7 @@ with 3, we predict all sorts of useless items - that won't give us our goal A an
 */
 
 #define TRIE_START_LHS 1
-// option 1) above.  0 would be option 3), which is dumb
+// 1 is option 1) above.  0 would be option 3), which is dumb
 
 // if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently.  if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion.  this is effectively left-branching binarization, of course.
 template <class K,class V,class Hash>
@@ -61,58 +63,169 @@ struct fsa_map_type {
 #define FSA_MAP(k,v) fsa_map_type<k,v,boost::hash<k> >::type
 typedef WordID LHS; // so this will be -NTHandle.
 
+struct get_second {
+  template <class P>
+  typename P::second_type const& operator()(P const& p) const {
+    return p.second;
+  }
+};
+
+struct PrefixTrieNode;
+struct PrefixTrieEdge {
+  prob_t p;// viterbi additional prob, i.e. product over path incl. p_final = total rule prob
+  //DPFSA()
+  // we can probably just store deltas, but for debugging remember the full p
+  //    prob_t delta; //
+  PrefixTrieNode *dest;
+  WordID w; // for lhs, this will be positive NTHandle instead
+
+  // for sorting most probable first in adj; actually >(p)
+  inline bool operator <(PrefixTrieEdge const& o) const {
+    return o.p<p;
+  }
+};
 
 struct PrefixTrieNode {
-  prob_t backward; // (viterbi) backward prob (for cost pushing)
-#
+  prob_t p; // viterbi (max prob) of rule this node leads to - when building.  telescope later onto edges for best-first.
 #if TRIE_START_LHS
+  bool final; // may also have successors, of course
+  prob_t p_final; // additional prob beyond what we already paid. while building, this is the total prob
+  // instead of completed map, we have trie start w/ lhs.
+  NTHandle lhs; // instead of storing this in Item.
+#else
   typedef FSA_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG).  but there may be multiple rules, with different LHS
   Completed completed;
-#else
-  bool complete; // may also have successors, of course
 #endif
-  // instead of completed map, we have trie start w/ lhs.?
+  explicit PrefixTrieNode(prob_t p=1) : p(p),final(false) {  }
 
   // outgoing edges will be ordered highest p to worst p
-  struct Edge {
-    DPFSA(prob_t p;) // we can probably just store deltas, but for debugging remember the full p
-    prob_t delta; // p[i]=delta*p[i-1], with p(-1)=1
-    PrefixTrieNode *dest;
-    WordID w;
-  };
-  typedef FSA_MAP(WordID,Edge) BuildAdj;
-  BuildAdj build_adj; //TODO: move builder elsewhere?
-  typedef ValueArray<Edge>  Adj;
-//  typedef vector<Edge> Adj;
+
+  typedef FSA_MAP(WordID,PrefixTrieEdge) PrefixTrieEdgeFor;
+public:
+  PrefixTrieEdgeFor edge_for; //TODO: move builder elsewhere?  then need 2nd hash or edge include pointer to builder.  just clear this later
+  void index_adj() {
+    index_adj(edge_for);
+  }
+
+  template <class M>
+  void index_adj(M &m) {
+    m.clear();
+    for (int i=0;i<adj.size();++i) {
+      PrefixTrieEdge const& e=adj[i];
+      m[e.w]=e;
+    }
+  }
+
+  // call only once.
+  void done_building_r() {
+    done_building();
+    for (int i=0;i<adj.size();++i)
+      adj[i].dest->done_building_r();
+  }
+
+  // for done_building; compute incremental (telescoped) edge p
+  PrefixTrieEdge const& operator()(PrefixTrieEdgeFor::value_type &pair) const {
+    PrefixTrieEdge &e=pair.second;
+    e.p=(e.dest->p)/p;
+    return e;
+  }
+
+  // call only once.
+  void done_building() {
+    adj.reinit_map(edge_for.begin(),edge_for.end(),*this);
+    if (final)
+      p_final/=p;
+    std::sort(adj.begin(),adj.end());
+  }
+
+  typedef ValueArray<PrefixTrieEdge>  Adj;
+//  typedef vector<PrefixTrieEdge> Adj;
   Adj adj;
-  //TODO:
+
+  typedef WordID W;
+  typedef NTHandle N; // not negative
+  typedef W const* RI;
+
+  // let's compute p_min so that every rule reachable from the created node has p at least this low.
+  PrefixTrieNode *improve_edge(PrefixTrieEdge const& e,prob_t rulep) {
+    PrefixTrieNode *d=e.dest;
+    maybe_increase_max(d->p,rulep);
+    return d;
+  }
+
+  PrefixTrieNode *build(W w,prob_t rulep) {
+    PrefixTrieEdgeFor::iterator i=edge_for.find(w);
+    if (i!=edge_for.end())
+      return improve_edge(i->second,rulep);
+    PrefixTrieEdge &e=edge_for[w];
+    return e.dest=new PrefixTrieNode(rulep);
+  }
+
+  void set_final(prob_t pf) {
+    final=true;p_final=pf;
+  }
+
+#ifdef HAVE_TAIL_RECURSE
+  // add string i...end
+  void build(RI i,RI end, prob_t rulep) {
+    if (i==end) {
+      set_final(rulep);
+    } else
+    // tail recursion:
+      build(*i)->build(i+1,end,rulep);
+  }
+#endif
 };
 
 #if TRIE_START_LHS
-//Trie starts with lhs, then continues w/ rhs
+//Trie starts with lhs (nonneg index), then continues w/ rhs (mixed >0 word, else NT)
 #else
 // just rhs.  i think item names should exclude lhs if possible (most sharing).  get prefix cost w/ forward = viterbi (global best-first admissable h only) and it should be ok?
 #endif
 
 // costs are pushed.
 struct PrefixTrie {
-  CFG const* cfgp;
+  CFG *cfgp;
+  Rules const* rulesp;
+  Rules const& rules() const { return *rulesp; }
   CFG const& cfg() const { return *cfgp; }
-  PrefixTrie(CFG const& cfg) : cfgp(&cfg) {
+  PrefixTrieNode root;
 
+  PrefixTrie(CFG &cfg) : cfgp(&cfg),rulesp(&cfg.rules) {
+//    cfg.SortLocalBestFirst(); // instead we'll sort in done_building_r
+    cfg.VisitRuleIds(*this);
+    root.done_building_r();
+    root.index_adj(); // maybe the index we use for building trie should be replaced by a much larger/faster table since we look up by lhs many times in parsing?
 //TODO:
   }
+  void operator()(int ri) const {
+    Rule const& r=rules()[ri];
+    prob_t p=r.p;
+    PrefixTrieNode *n=const_cast<PrefixTrieNode&>(root).build(r.lhs,p);
+    for (RHS::const_iterator i=r.rhs.begin(),e=r.rhs.end();;++i) {
+      if (i==e) {
+        n->set_final(p);
+        break;
+      }
+      n=n->build(*i,p);
+    }
+#ifdef HAVE_TAIL_RECURSE
+    root.build(r.lhs,r.p)->build(r.rhs,r.p);
+#endif
+  }
 };
 
 // these should go in a global best-first queue
 struct Item {
   prob_t forward;
+  // NOTE: sum = viterbi (max)
   /* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all
      constrained paths of length that end in state X[k]->x.y*/
   prob_t inner;
   /* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all
      paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */
-
+  PrefixTrieNode *dot; // dot is a function of the stuff already recognized, and gives a set of suffixes y to complete to finish a rhs for lhs() -> dot y
+  NTHandle lhs() const { return dot->lhs; }
 };
 
 }//anon ns
@@ -166,7 +279,7 @@ void ApplyFsa::ApplyBottomUp()
 void ApplyFsa::ApplyEarley()
 {
   hgcfg.GiveCFG(cfg);
-  cfg.SortLocalBestFirst();
+  PrefixTrie rt(cfg);
   // don't need to uniq - option to do that already exists in cfg_options
   //TODO:
 }
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index a4c3613b..ca6284f6 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -881,5 +881,7 @@ int main(int argc, char** argv) {
       cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush;
     }
   }
+  exit(0); // maybe this will save some destruction overhead.  or g++ without cxx_atexit needed?
+  return 0;
 }
 
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index c02f46ec..b2219193 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -308,7 +308,7 @@ struct add_virtual_rules {
     //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key).
     int n=rhs.size();
     if (n<=2) return 0;
-    int longest1=1;
+    int longest1=1; // all this other stuff is not uninitialized when used, based on checking this and other things (it's complicated, learn to prove theorems, gcc)
     int mid=n/2;
     int best_k;
     enum {HAVE_L=-1,HAVE_NONE=0,HAVE_R=1};
@@ -445,7 +445,7 @@ void CFG::BinarizeSplit(CFGBinarize const& b) {
     Rule &r=newr[i];expr;  } // NOTE: must use indices since we'll be adding rules as we iterate.
 
   int n_changed_total=0;
-  int n_changed;
+  int n_changed=0; // quiets a warning
 #define CFG_SPLIT_PASS(N,free,just1) \
   for (int pass=0;pass<b.N;++pass) { \
     n_changed=0; \
diff --git a/decoder/decode.sh b/decoder/decode.sh
index 13cc6620..677e64ad 100755
--- a/decoder/decode.sh
+++ b/decoder/decode.sh
@@ -5,6 +5,6 @@ if [ "$lm" ] ; then
     lmargs1="LanguageModel lm.gz -n LM"
 fi
 set -x
-$gdb $d/cdec -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@"
+$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@"
 set +x
 }
diff --git a/decoder/ff.h b/decoder/ff.h
index 726845c4..80a880d8 100644
--- a/decoder/ff.h
+++ b/decoder/ff.h
@@ -243,7 +243,10 @@ void show_all_features(std::vector<FFp> const& models_,DenseWeightVector &weight
   return show_features(all_features(models_,weights_,&warn,warn_fid_0),weights_,out,warn,warn_zero_wt);
 }
 
-typedef std::string FFState; //FIXME: only context.data() is required to be contiguous, and it becomes invalid after next string operation.  use ValueArray instead? (higher performance perhaps, save a word due to fixed size)
+typedef ValueArray<char> FFState; // this is about 10% faster than string.
+//typedef std::string FFState;
+
+//FIXME: only context.data() is required to be contiguous, and it becomes invalid after next string operation.  use ValueArray instead? (higher performance perhaps, save a word due to fixed size)
 typedef std::vector<FFState> FFStates;
 
 // this class is a set of FeatureFunctions that can be used to score, rescore,
diff --git a/utils/hash.h b/utils/hash.h
index fbe10b4e..7e38bb2c 100755
--- a/utils/hash.h
+++ b/utils/hash.h
@@ -60,6 +60,34 @@ typename H::mapped_type & get_default(H &ht,K const& k,typename H::mapped_type c
   return const_cast<typename H::mapped_type &>(ht.insert(typename H::value_type(k,v)).first->second);
 }
 
+// get_or_construct w/ no arg: just use ht[k]
+template <class H,class K,class C0>
+typename H::mapped_type & get_or_construct(H &ht,K const& k,C0 const& c0) {
+  typedef typename H::mapped_type V;
+  typedef typename H::value_type KV;
+  typename H::iterator_type i=ht.find(k);
+  if (i==ht.end()) {
+    return const_cast<V &>(ht.insert(KV(k,V(c0))).first->second);
+  } else {
+    return i->second;
+  }
+}
+
+
+// get_or_call (0 arg)
+template <class H,class K,class F>
+typename H::mapped_type & get_or_call(H &ht,K const& k,F const& f) {
+  typedef typename H::mapped_type V;
+  typedef typename H::value_type KV;
+  typename H::iterator_type i=ht.find(k);
+  if (i==ht.end()) {
+    return const_cast<V &>(ht.insert(KV(k,f())).first->second);
+  } else {
+    return i->second;
+  }
+}
+
+
 // the below could also return a ref to the mapped max/min.  they have the advantage of not falsely claiming an improvement when an equal value already existed.  otherwise you could just modify the get_default and if equal assume new.
 template <class H,class K>
 bool improve_mapped_max(H &ht,K const& k,typename H::mapped_type const& v) {
diff --git a/utils/maybe_update_bound.h b/utils/maybe_update_bound.h
new file mode 100755
index 00000000..d57215d0
--- /dev/null
+++ b/utils/maybe_update_bound.h
@@ -0,0 +1,17 @@
+#ifndef MAYBE_UPDATE_BOUND_H
+#define MAYBE_UPDATE_BOUND_H
+
+template <class To,class From>
+inline void maybe_increase_max(To &to,const From &from) {
+    if (to<from)
+        to=from;
+}
+
+template <class To,class From>
+inline void maybe_decrease_min(To &to,const From &from) {
+    if (from<to)
+        to=from;
+}
+
+
+#endif
-- 
cgit v1.2.3