From 1fc4b6a45c18b52ab2c27eb9a825ad14f31d803b Mon Sep 17 00:00:00 2001
From: "graehl@gmail.com"
 <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Mon, 2 Aug 2010 07:57:23 +0000
Subject: fake tdict names for non-ids, push viterbi cost to root in hg, store
 as feature.  type erased fsa feature via virtual interface.  made
 lexical_cast assume C locale for speed.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@465 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 decoder/apply_fsa_models.h    |  15 ++++++
 decoder/bottom_up_parser.cc   |   2 +
 decoder/earley_composer.cc    |  10 ++--
 decoder/fast_lexical_cast.hpp |  10 ++++
 decoder/ff.cc                 |   2 +-
 decoder/ff_bleu.cc            |   2 +-
 decoder/ff_fsa.h              |  89 ++++-------------------------------
 decoder/ff_fsa_data.h         | 105 ++++++++++++++++++++++++++++++++++++++++++
 decoder/ff_fsa_dynamic.h      |  77 ++++++++++++++++++++++++++++---
 decoder/ff_lm.cc              |   2 +-
 decoder/forest_writer.cc      |   2 +-
 decoder/grammar.h             |  11 +++--
 decoder/hg.cc                 |  24 ++++++++++
 decoder/hg.h                  |  43 ++++++++++++-----
 decoder/hg_intersect.cc       |   4 +-
 decoder/hg_io.cc              |   2 +-
 decoder/hg_io.h               |   2 +-
 decoder/indices_after.h       |   2 +-
 decoder/inside_outside.h      |  26 +++++++----
 decoder/static_utoa.h         |  63 +++++++++++++++++++++++++
 decoder/tdict.cc              |  26 ++++++++++-
 decoder/tdict.h               |   2 +-
 decoder/threadlocal.h         |  71 ++++++++++++++++++++++++++++
 decoder/tromble_loss.cc       |  10 ++--
 decoder/viterbi.cc            |   2 +-
 25 files changed, 468 insertions(+), 136 deletions(-)
 create mode 100755 decoder/apply_fsa_models.h
 create mode 100755 decoder/fast_lexical_cast.hpp
 create mode 100755 decoder/ff_fsa_data.h
 create mode 100755 decoder/static_utoa.h
 create mode 100755 decoder/threadlocal.h

(limited to 'decoder')
diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h
new file mode 100755
index 00000000..d22397e3
--- /dev/null
+++ b/decoder/apply_fsa_models.h
@@ -0,0 +1,15 @@
+#ifndef _APPLY_FSA_MODELS_H_
+#define _APPLY_FSA_MODELS_H_
+
+//#include "ff_fsa_dynamic.h"
+
+struct FsaFeatureFunction;
+struct Hypergraph;
+struct SentenceMetadata;
+
+void ApplyFsaModels(const Hypergraph& in,
+                    const SentenceMetadata& smeta,
+                    const FsaFeatureFunction& fsa,
+                    Hypergraph* out);
+
+#endif
diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc
index 94f209b5..88514b82 100644
--- a/decoder/bottom_up_parser.cc
+++ b/decoder/bottom_up_parser.cc
@@ -1,5 +1,7 @@
 //TODO: when using many nonterminals, group passive edges for a span (treat all as a single X for the active items).
 
+//TODO: figure out what cdyer was talking about when he said that having unary rules A->B and B->A, doesn't make cycles appear in result provided rules are sorted in some way (that they typically are)
+
 #include "bottom_up_parser.h"
 
 #include <iostream>
diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc
index f6a01e52..48e94a31 100644
--- a/decoder/earley_composer.cc
+++ b/decoder/earley_composer.cc
@@ -9,7 +9,7 @@
 #include <boost/shared_ptr.hpp>
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 
 #include "phrasetable_fst.h"
 #include "sparse_vector.h"
@@ -206,15 +206,15 @@ ostream& operator<<(ostream& os, const Edge& e) {
   if (e.IsCreatedByScan())
     type = "SCAN";
   else if (e.IsCreatedByComplete())
-    type = "COMPLETE"; 
+    type = "COMPLETE";
   os << "["
 #ifdef DEBUG_CHART_PARSER
      << '(' << e.id << ") "
 #else
      << '(' << &e << ") "
 #endif
-     << "q=" << e.q << ", r=" << e.r 
-     << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" 
+     << "q=" << e.q << ", r=" << e.r
+     << ", cat="<< TD::Convert(e.cat*-1) << ", dot="
      << e.dot
 #ifdef DEBUG_CHART_PARSER
      << e.dot->hint
@@ -588,7 +588,7 @@ class EarleyComposerImpl {
           assert(!"self-loop found!");
         }
 #endif
-    Hypergraph::Edge* hg_edge = NULL; 
+    Hypergraph::Edge* hg_edge = NULL;
     if (tail.size() == 0) {
       hg_edge = hg->AddEdge(kEPSRule, tail);
     } else if (tail.size() == 1) {
diff --git a/decoder/fast_lexical_cast.hpp b/decoder/fast_lexical_cast.hpp
new file mode 100755
index 00000000..c1b15042
--- /dev/null
+++ b/decoder/fast_lexical_cast.hpp
@@ -0,0 +1,10 @@
+#ifndef FAST_LEXICAL_CAST_HPP
+#define FAST_LEXICAL_CAST_HPP
+
+#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
+
+#include <boost/lexical_cast.hpp>
+
+using boost::lexical_cast;
+
+#endif
diff --git a/decoder/ff.cc b/decoder/ff.cc
index d21bf3fe..28620bab 100644
--- a/decoder/ff.cc
+++ b/decoder/ff.cc
@@ -2,7 +2,7 @@
 
 //TODO: actually score rule_feature()==true features once only, hash keyed on rule or modify TRule directly?  need to keep clear in forest which features come from models vs. rules; then rescoring could drop all the old models features at once
 
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 #include <stdexcept>
 #include "ff.h"
 
diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc
index 12c29d32..77989331 100644
--- a/decoder/ff_bleu.cc
+++ b/decoder/ff_bleu.cc
@@ -7,7 +7,7 @@ char const* bleu_usage_verbose="Uses feature id 0!  Make sure there are no other
 
 #include <sstream>
 #include <unistd.h>
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 
 #include <boost/shared_ptr.hpp>
 
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index de777fd5..6c1294f8 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -47,19 +47,14 @@
 # define FSADBGnl(e)
 #endif
 
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 #include <sstream>
-#include <stdint.h> //C99
 #include <string>
 #include "ff.h"
 #include "sparse_vector.h"
-#include "value_array.h" // used to hold state
 #include "tdict.h"
 #include "hg.h"
-#include "sentences.h"
-#include "feature_accum.h"
-
-typedef ValueArray<uint8_t> Bytes;
+#include "ff_fsa_data.h"
 
 /*
 usage: see ff_sample_fsa.h or ff_lm_fsa.h
@@ -68,8 +63,9 @@ usage: see ff_sample_fsa.h or ff_lm_fsa.h
 
  */
 
+
 template <class Impl>
-struct FsaFeatureFunctionBase {
+struct FsaFeatureFunctionBase : public FsaFeatureFunctionData {
   // CALL 1 of these MANUALLY  (because feature name(s) may depend on param, it's not done in ctor)
   void Init(std::string const& fname="") {
     fid_=FD::Convert(fname.empty()?name():fname);
@@ -82,54 +78,9 @@ struct FsaFeatureFunctionBase {
 
   Impl const& d() const { return static_cast<Impl const&>(*this); }
   Impl & d()  { return static_cast<Impl &>(*this); }
-protected:
-  int ssz; // don't forget to set this. default 0 (it may depend on params of course)
-  Bytes start,h_start; // start state and estimated-features (heuristic) start state.  set these.  default empty.
-  Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm.
-  void set_state_bytes(int sb=0) {
-    if (start.size()!=sb) start.resize(sb);
-    if (h_start.size()!=sb) h_start.resize(sb);
-    ssz=sb;
-  }
-  void set_end_phrase(WordID single) {
-    end_phrase_=singleton_sentence(single);
-  }
-
-  inline void static to_state(void *state,char const* begin,char const* end) {
-    std::memcpy(state,begin,end-begin);
-  }
-  inline void static to_state(void *state,char const* begin,int n) {
-    std::memcpy(state,begin,n);
-  }
-  template <class T>
-  inline void static to_state(void *state,T const* begin,int n=1) {
-    to_state(state,(char const*)begin,n*sizeof(T));
-  }
-  template <class T>
-  inline void static to_state(void *state,T const* begin,T const* end) {
-    to_state(state,(char const*)begin,(char const*)end);
-  }
-
-  inline static char hexdigit(int i) {
-    int j=i-10;
-    return j>=0?'a'+j:'0'+i;
-  }
-  inline static void print_hex_byte(std::ostream &o,unsigned c) {
-      o<<hexdigit(c>>4);
-      o<<hexdigit(c&0x0f);
-  }
-
 public:
   int fid_; // you can have more than 1 feature of course.
 
-  void state_copy(void *to,void const*from) const {
-    if (ssz)
-      std::memcpy(to,from,ssz);
-  }
-  void state_zero(void *st) const { // you should call this if you don't know the state yet and want it to be hashed/compared properly
-    std::memset(st,0,ssz);
-  }
-
   // can override to different return type, e.g. just return feats:
   Featval describe_features(FeatureVector const& feats) const {
     return feats.get(fid_);
@@ -170,26 +121,12 @@ public:
 
   // this isn't currently used at all.  this left-shortening is not recommended (wasn't worth the computation expense for ngram): specifically for bottom up scoring (ff_from_fsa), you can return a shorter left-words context - but this means e.g. for ngram tracking that a backoff occurred where the final BO cost isn't yet known.  you would also have to remember any necessary info in your own state - in the future, ff_from_fsa on a list of fsa features would only shorten it to the max
 
-  Features features() const {
-    return features_;
-  }
-
-  int n_features() const {
-    return features_.size();
-  }
 
   // override this (static)
   static std::string usage(bool param,bool verbose) {
     return FeatureFunction::usage_helper("unnamed_fsa_feature","","",param,verbose);
   }
-  int state_bytes() const { return ssz; } // or override this
-  void const* start_state() const {
-    return start.begin();
-  }
-  void const * heuristic_start_state() const {
-    return h_start.begin();
-  }
-  Sentence const& end_phrase() const { return end_phrase_; }
+
   // move from state to next_state after seeing word x, while emitting features->set_value(fid,val) possibly with duplicates.  state and next_state will never be the same memory.
   //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support.  however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping)
 
@@ -206,10 +143,6 @@ protected:
     return d().Scan1(w,state,next_state);
   }
 public:
-  template <class T>
-  static inline T* state_as(void *p) { return (T*)p; }
-  template <class T>
-  static inline T const* state_as(void const* p) { return (T*)p; }
 
   // must override this or Scan1Meta or Scan1
   template <class Accum>
@@ -307,21 +240,18 @@ public:
     d().ScanPhraseAccumBounce(smeta,edge,i,end,(void*)s1,(void*)s2,accum);
   }
 
+  // for single-feat only.  but will work for different accums
   template <class Accum>
-  inline void Add(Featval v,Accum *a) const { // for single-feat only.  but will work for different accums
+  inline void Add(Featval v,Accum *a) const {
     a->Add(fid_,v);
   }
-  inline void Add(Featval v,SingleFeatureAccumulator *a) const {
-    a->Add(v);
-  }
-
-
   inline void set_feat(FeatureVector *features,Featval v) const {
     features->set_value(fid_,v);
   }
 
   // don't set state-bytes etc. in ctor because it may depend on parsing param string
-  FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : ssz(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {}
+  FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) :
+    FsaFeatureFunctionData(statesz,end_sentence_phrase) {  }
 
 };
 
@@ -398,7 +328,6 @@ public:
     FSADBG(edge,state(next_state)<<" = "<<a->describe(im));
     FSADBGnl(edge);
   }
-
 };
 
 
diff --git a/decoder/ff_fsa_data.h b/decoder/ff_fsa_data.h
new file mode 100755
index 00000000..66d2cca8
--- /dev/null
+++ b/decoder/ff_fsa_data.h
@@ -0,0 +1,105 @@
+#ifndef FF_FSA_DATA_H
+#define FF_FSA_DATA_H
+
+#include <stdint.h> //C99
+#include <sstream>
+#include "sentences.h"
+#include "feature_accum.h"
+#include "value_array.h"
+
+typedef ValueArray<uint8_t> Bytes;
+
+// stuff I see no reason to have virtual.
+struct FsaFeatureFunctionData
+{
+  FsaFeatureFunctionData(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : ssz(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {
+    debug_=true;
+  }
+
+  std::string name_;
+  std::string name() const {
+    return name_;
+  }
+  typedef SparseFeatureAccumulator Accum;
+  bool debug_;
+  bool debug() const { return debug_; }
+  void state_copy(void *to,void const*from) const {
+    if (ssz)
+      std::memcpy(to,from,ssz);
+  }
+  void state_zero(void *st) const { // you should call this if you don't know the state yet and want it to be hashed/compared properly
+    std::memset(st,0,ssz);
+  }
+  Features features() const {
+    return features_;
+  }
+  int n_features() const {
+    return features_.size();
+  }
+  int state_bytes() const { return ssz; } // or override this
+  void const* start_state() const {
+    return start.begin();
+  }
+  void const * heuristic_start_state() const {
+    return h_start.begin();
+  }
+  Sentence const& end_phrase() const { return end_phrase_; }
+  template <class T>
+  static inline T* state_as(void *p) { return (T*)p; }
+  template <class T>
+  static inline T const* state_as(void const* p) { return (T*)p; }
+  std::string describe_features(FeatureVector const& feats) {
+    std::ostringstream o;
+    o<<feats;
+    return o.str();
+  }
+  void print_state(std::ostream &o,void const*state) const {
+    char const* i=(char const*)state;
+    char const* e=i+ssz;
+    for (;i!=e;++i)
+      print_hex_byte(o,*i);
+  }
+
+protected:
+  Features features_;
+  int ssz; // don't forget to set this. default 0 (it may depend on params of course)
+  Bytes start,h_start; // start state and estimated-features (heuristic) start state.  set these.  default empty.
+  Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm.
+  void set_state_bytes(int sb=0) {
+    if (start.size()!=sb) start.resize(sb);
+    if (h_start.size()!=sb) h_start.resize(sb);
+    ssz=sb;
+  }
+  void set_end_phrase(WordID single) {
+    end_phrase_=singleton_sentence(single);
+  }
+
+  inline void static to_state(void *state,char const* begin,char const* end) {
+    std::memcpy(state,begin,end-begin);
+  }
+  inline void static to_state(void *state,char const* begin,int n) {
+    std::memcpy(state,begin,n);
+  }
+  template <class T>
+  inline void static to_state(void *state,T const* begin,int n=1) {
+    to_state(state,(char const*)begin,n*sizeof(T));
+  }
+  template <class T>
+  inline void static to_state(void *state,T const* begin,T const* end) {
+    to_state(state,(char const*)begin,(char const*)end);
+  }
+  inline static char hexdigit(int i) {
+    int j=i-10;
+    return j>=0?'a'+j:'0'+i;
+  }
+  inline static void print_hex_byte(std::ostream &o,unsigned c) {
+    o<<hexdigit(c>>4);
+    o<<hexdigit(c&0x0f);
+  }
+  inline static void Add(Featval v,SingleFeatureAccumulator *a) {
+    a->Add(v);
+  }
+
+};
+
+#endif
diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h
index 79672bdc..2703b305 100755
--- a/decoder/ff_fsa_dynamic.h
+++ b/decoder/ff_fsa_dynamic.h
@@ -1,29 +1,92 @@
 #ifndef FF_FSA_DYNAMIC_H
 #define FF_FSA_DYNAMIC_H
 
-#include "ff_fsa.h"
+struct SentenceMetadata;
+#include "ff_fsa_data.h"
+#include "hg.h" // can't forward declare nested Hypergraph::Edge class
+#include <sstream>
+
 
 // the type-erased interface
-/*
-struct FsaFeatureFunction {
+
+struct FsaFeatureFunction : public FsaFeatureFunctionData {
+  static const bool simple_phrase_score=false;
   virtual int markov_order() const = 0;
-  virtual ~FsaFeatureFunction();
 
+  // see ff_fsa.h - FsaFeatureFunctionBase<Impl> gives you reasonable impls of these if you override just ScanAccum
+  virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
+                        WordID w,void const* state,void *next_state,Accum *a) const = 0;
+  virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
+                              WordID const* i, WordID const* end,
+                              void const* state,void *next_state,Accum *accum) const = 0;
+  virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
+                           WordID const* i, WordID const* end,
+                           void const* state,Accum *accum) const = 0;
+  virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const = 0;
+
+  virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { return 0; }
+  virtual std::string usage(bool param,bool verbose) const {
+    return FeatureFunction::usage_helper("unnamed_dynamic_fsa_feature","","",param,verbose);
+  }
+
+  virtual void print_state(std::ostream &o,void const*state) const {
+    FsaFeatureFunctionData::print_state(o,state);
+  }
+  //end_phrase()
+  virtual ~FsaFeatureFunction() {}
+
+  // no need to override:
+  std::string describe_state(void const* state) const {
+    std::ostringstream o;
+    print_state(o,state);
+    return o.str();
+  }
 };
 
 // conforming to above interface, type erases FsaImpl
 // you might be wondering: why do this?  answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch
+// usage: struct My : public FsaFeatureFunctionDynamic<My>
 template <class Impl>
 struct FsaFeatureFunctionDynamic : public FsaFeatureFunction {
+  static const bool simple_phrase_score=Impl::simple_phrase_score;
   Impl& d() { return static_cast<Impl&>(*this); }
   Impl const& d() { return static_cast<Impl const&>(*this); }
   int markov_order() const { return d().markov_order(); }
+
+  virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
+                        WordID w,void const* state,void *next_state,Accum *a) const {
+    return d().ScanAccum(smeta,edge,w,state,next_state,a);
+  }
+
+  virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
+                              WordID const* i, WordID const* end,
+                              void const* state,void *next_state,Accum *a) const {
+    return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a);
+  }
+
+  virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
+                           WordID const* i, WordID const* end,
+                           void const* state,Accum *a) const {
+    return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a);
+
+  virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const {
+    return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a);
+  }
+
+  virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const {
+    return d().early_score_words(smeta,edge,i,end,accum);
+  }
+
+  virtual std::string usage(bool param,bool verbose) const {
+    return Impl::usage(param,verbose);
+  }
+
+  virtual void print_state(std::ostream &o,void const*state) const {
+    return d().print_state(o,state);
+  }
 };
 
-//TODO: wrap every method in concrete fsaff and declare in interface above.
 //TODO: combine 2 (or N) FsaFeatureFunction (type erased)
 
-*/
-
 
 #endif
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index e502ce93..61d845d6 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -56,7 +56,7 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 #include <netdb.h>
 
 #include <boost/shared_ptr.hpp>
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 
 #include "tdict.h"
 #include "Vocab.h"
diff --git a/decoder/forest_writer.cc b/decoder/forest_writer.cc
index a9117d18..6e4cccb3 100644
--- a/decoder/forest_writer.cc
+++ b/decoder/forest_writer.cc
@@ -2,7 +2,7 @@
 
 #include <iostream>
 
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 
 #include "filelib.h"
 #include "hg_io.h"
diff --git a/decoder/grammar.h b/decoder/grammar.h
index b26eb912..1173e3cd 100644
--- a/decoder/grammar.h
+++ b/decoder/grammar.h
@@ -25,9 +25,10 @@ struct GrammarIter {
 };
 
 struct Grammar {
+  //TODO: HASH_MAP?
   typedef std::map<WordID, std::vector<TRulePtr> > Cat2Rules;
   static const std::vector<TRulePtr> NO_RULES;
-  
+
   Grammar(): ctf_levels_(0) {}
   virtual ~Grammar();
   virtual const GrammarIter* GetRoot() const = 0;
@@ -52,7 +53,7 @@ struct Grammar {
  protected:
   Cat2Rules rhs2unaries_;     // these must be filled in by subclasses!
   std::vector<TRulePtr> unaries_;
-  std::string grammar_name_; 
+  std::string grammar_name_;
   unsigned int ctf_levels_;
 };
 
@@ -63,17 +64,17 @@ struct TextGrammar : public Grammar {
   TextGrammar();
   TextGrammar(const std::string& file);
   void SetMaxSpan(int m) { max_span_ = m; }
-  
+
   virtual const GrammarIter* GetRoot() const;
   void AddRule(const TRulePtr& rule, const unsigned int ctf_level=0, const TRulePtr& coarse_parent=TRulePtr());
   void ReadFromFile(const std::string& filename);
   virtual bool HasRuleForSpan(int i, int j, int distance) const;
   const std::vector<TRulePtr>& GetUnaryRules(const WordID& cat) const;
-  
+
  private:
   int max_span_;
   boost::shared_ptr<TGImpl> pimpl_;
-  
+
 };
 
 struct GlueGrammar : public TextGrammar {
diff --git a/decoder/hg.cc b/decoder/hg.cc
index 88e95337..f0238fe3 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -61,6 +61,7 @@ struct TropicalValue {
   }
   TropicalValue(unsigned v) : v_(v) {}
   TropicalValue(const prob_t& v) : v_(v) {}
+//  operator prob_t() const { return v_; }
   inline TropicalValue& operator+=(const TropicalValue& o) {
     if (v_ < o.v_) v_ = o.v_;
     return *this;
@@ -90,6 +91,7 @@ struct ViterbiTransitionEventWeightFunction {
 };
 
 
+//TODO: both Compute* methods build sparse vectors with size = whole subhypergraph, for every node.  there's no need for that.
 prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector<prob_t>* posts) const {
   const ScaledEdgeProb weight(scale);
   const ScaledTransitionEventWeightFunction w2(scale);
@@ -131,6 +133,27 @@ void Hypergraph::PushWeightsToSource(double scale) {
   }
 }
 
+namespace {
+struct vpusher : public vector<TropicalValue> {
+  int fid;
+  vpusher(int fid=0) : fid(fid) {  }
+  void operator()(int n,int /*ei*/,Hypergraph::Edge &e) const {
+    Hypergraph::TailNodeVector const& t=e.tail_nodes_;
+    prob_t p=e.edge_prob_;
+    for (int i=0;i<t.size();++i)
+      p*=(*this)[t[i]].v_;
+    e.feature_values_.set_value(fid,log(e.edge_prob_=p/(*this)[n].v_));
+  }
+};
+}
+
+void Hypergraph::PushViterbiWeightsToGoal(int fid) {
+  vpusher vi(fid);
+  Inside(*this,&vi,ViterbiWeightFunction());
+  visit_edges(vi);
+}
+
+
 void Hypergraph::PushWeightsToGoal(double scale) {
   vector<prob_t> posts;
   ComputeEdgePosteriors(scale, &posts);
@@ -425,6 +448,7 @@ struct IdCompare {
 //TODO: if you had parallel arrays associating data w/ each node or edge, you'd want access to reloc_node and reloc_edge - expose in stateful object?
 void Hypergraph::TopologicallySortNodesAndEdges(int goal_index,
                                                 const vector<bool>* prune_edges) {
+  edges_topo_=true;
   // figure out which nodes are reachable from the goal
   vector<int> reloc_node(nodes_.size(), -1);
   vector<int> reloc_edge(edges_.size(), -1);
diff --git a/decoder/hg.h b/decoder/hg.h
index bef3bebd..08199eb4 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -53,7 +53,7 @@ public:
     WordID cat_;  // non-terminal category if <0, 0 if not set
     EdgesVector in_edges_;   // contents refer to positions in edges_
     EdgesVector out_edges_;  // contents refer to positions in edges_
-    double promise; // set in global pruning; in [0,infty) so that mean is 1.  use: e.g. scale cube poplimit
+    double promise; // set in global pruning; in [0,infty) so that mean is 1.  use: e.g. scale cube poplimit.  //TODO: appears to be useless, compile without this?  on the other hand, pretty cheap.
     void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting
       cat_=o.cat_;
       promise=o.promise;
@@ -64,7 +64,6 @@ public:
       e2.reindex_push_back(o.in_edges_,in_edges_);
       e2.reindex_push_back(o.out_edges_,out_edges_);
     }
-
   };
 
   // TODO get rid of edge_prob_? (can be computed on the fly as the dot
@@ -82,10 +81,10 @@ public:
     prob_t edge_prob_;             // dot product of weights and feat_values
     int id_;   // equal to this object's position in the edges_ vector
 
-    // span info. typically, i_ and j_ refer to indices in the source sentence
-    // if a synchronous parse has been executed i_ and j_ will refer to indices
-    // in the target sentence / lattice and prev_i_ prev_j_ will refer to
-    // positions in the source.  Note: it is up to the translator implementation
+    // span info. typically, i_ and j_ refer to indices in the source sentence.
+    // In synchronous parsing, i_ and j_ will refer to target sentence/lattice indices
+    // while prev_i_ prev_j_ will refer to positions in the source.
+    // Note: it is up to the translator implementation
     // to properly set these values.  For some models (like the Forest-input
     // phrase based model) it may not be straightforward to do.  if these values
     // are not properly set, most things will work but alignment and any features
@@ -333,8 +332,7 @@ public:
   // edge in the hypergraph
   // alpha->size = edges_.size = beta->size
   // returns inside prob of goal node
-  prob_t ComputeEdgePosteriors(double scale,
-                               EdgeProbs* posts) const;
+  prob_t ComputeEdgePosteriors(double scale,EdgeProbs* posts) const;
 
   // find the score of the very best path passing through each edge
   prob_t ComputeBestPathThroughEdges(EdgeProbs* posts) const;
@@ -377,6 +375,10 @@ public:
   // not just lattices
   void PushWeightsToGoal(double scale = 1.0);
 
+  // contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid.  note that p_viterbi becomes 1; k*p_viterbi becomes k.  also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight)
+  // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after)
+  void PushViterbiWeightsToGoal(int fid=0);
+
 //  void SortInEdgesByEdgeWeights(); // local sort only - pretty useless
 
   void PruneUnreachable(int goal_node_id); // DEPRECATED
@@ -393,7 +395,7 @@ public:
   typedef EdgeProbs NodeProbs;
   void SetPromise(NodeProbs const& inside,NodeProbs const& outside, double power=1, bool normalize=true);
 
-  //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another.  at least make that an option.  i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs.  this would also make me more comfortable about allocating promise
+  //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another.  at least make that an option.  i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs.  this would also make me more comfortable about allocating Node.promise
 
   // beam_alpha=0 means don't beam prune, otherwise drop things that are e^beam_alpha times worse than best -   // prunes any edge whose prob_t on the best path taking that edge is more than e^alpha times
   //density=0 means don't density prune:   // for density>=1.0, keep this many times the edges needed for the 1best derivation
@@ -437,17 +439,34 @@ public:
   // edges_ is not guaranteed to be in any particular order
   typedef std::vector<Edge> Edges;
   Edges edges_;
+  bool edges_topo_; // TODO: this is always true right now - should reflect whether edges_ are ordered.  typically, you can just iterate over nodes (which are in topo order) and use in_edges to get the edges in topo order
+
+  template <class V>
+  void visit_edges_topo(V &v) {
+    for (int i = 0; i < nodes_.size(); ++i) {
+      EdgesVector const& in=nodes_[i].in_edges_;
+      for (int j=0;j<in.size();++j) {
+        int e=in[j];
+        v(i,e,edges_[e]);
+      }
+    }
+  }
+
+  template <class V>
+  void visit_edges(V &v) {
+    for (int i=0;i<edges_.size();++i)
+      v(edges_[i].head_node_,i,edges_[i]);
+  }
 
   // reorder nodes_ so they are in topological order
   // source nodes at 0 sink nodes at size-1
-  void TopologicallySortNodesAndEdges(int goal_idx,
-                                      const EdgeMask* prune_edges = NULL);
+  void TopologicallySortNodesAndEdges(int goal_idx, const EdgeMask* prune_edges = NULL);
 
   void set_ids(); // resync edge,node .id_
   void check_ids() const; // assert that .id_ have been kept in sync
 
 private:
-  Hypergraph(int num_nodes, int num_edges, bool is_lc) : is_linear_chain_(is_lc), nodes_(num_nodes), edges_(num_edges) {}
+  Hypergraph(int num_nodes, int num_edges, bool is_lc) : is_linear_chain_(is_lc), nodes_(num_nodes), edges_(num_edges),edges_topo_(true) {}
 
   static TRulePtr kEPSRule;
   static TRulePtr kUnaryRule;
diff --git a/decoder/hg_intersect.cc b/decoder/hg_intersect.cc
index 02ff752e..a3a2cbd9 100644
--- a/decoder/hg_intersect.cc
+++ b/decoder/hg_intersect.cc
@@ -2,7 +2,7 @@
 
 #include <vector>
 #include <tr1/unordered_map>
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 #include <boost/functional/hash.hpp>
 
 #include "tdict.h"
@@ -143,7 +143,7 @@ bool HG::Intersect(const Lattice& target, Hypergraph* hg) {
     rule->parent_rule_ = edge.rule_;
     rule->ComputeArity();
     //cerr << "ADD: " << rule->AsString() << endl;
-    
+
     g->AddRule(rule);
   }
   g->SetMaxSpan(target.size() + 1);
diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
index 65c2c382..52a8565a 100644
--- a/decoder/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -3,7 +3,7 @@
 #include <sstream>
 #include <iostream>
 
-#include <boost/lexical_cast.hpp>
+#include "fast_lexical_cast.hpp"
 
 #include "tdict.h"
 #include "json_parse.h"
diff --git a/decoder/hg_io.h b/decoder/hg_io.h
index 7162106e..b6a176ab 100644
--- a/decoder/hg_io.h
+++ b/decoder/hg_io.h
@@ -2,8 +2,8 @@
 #define _HG_IO_H_
 
 #include <iostream>
-
 #include "lattice.h"
+
 class Hypergraph;
 
 struct HypergraphIO {
diff --git a/decoder/indices_after.h b/decoder/indices_after.h
index dec94cc0..2b662d5c 100755
--- a/decoder/indices_after.h
+++ b/decoder/indices_after.h
@@ -160,7 +160,7 @@ struct indices_after
   }
 
 private:
-  indices_after(indices_after const& o)
+  indices_after(indices_after const&)
   {
     map=NULL;
   }
diff --git a/decoder/inside_outside.h b/decoder/inside_outside.h
index 128d89da..48e31bb3 100644
--- a/decoder/inside_outside.h
+++ b/decoder/inside_outside.h
@@ -40,13 +40,14 @@ WeightType Inside(const Hypergraph& hg,
   for (int i = 0; i < num_nodes; ++i) {
     const Hypergraph::Node& cur_node = hg.nodes_[i];
     WeightType* const cur_node_inside_score = &inside_score[i];
-    const int num_in_edges = cur_node.in_edges_.size();
+    Hypergraph::EdgesVector const& in=cur_node.in_edges_;
+    const int num_in_edges = in.size();
     if (num_in_edges == 0) {
-      *cur_node_inside_score = WeightType(1);
+      *cur_node_inside_score = WeightType(1); //FIXME: why not call weight(edge) instead?
       continue;
     }
     for (int j = 0; j < num_in_edges; ++j) {
-      const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+      const Hypergraph::Edge& edge = hg.edges_[in[j]];
       WeightType score = weight(edge);
       for (int k = 0; k < edge.tail_nodes_.size(); ++k) {
         const int tail_node_index = edge.tail_nodes_[k];
@@ -76,9 +77,10 @@ void Outside(const Hypergraph& hg,
   for (int i = num_nodes - 1; i >= 0; --i) {
     const Hypergraph::Node& cur_node = hg.nodes_[i];
     const WeightType& head_node_outside_score = outside_score[i];
-    const int num_in_edges = cur_node.in_edges_.size();
+    Hypergraph::EdgesVector const& in=cur_node.in_edges_;
+    const int num_in_edges = in.size();
     for (int j = 0; j < num_in_edges; ++j) {
-      const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+      const Hypergraph::Edge& edge = hg.edges_[in[j]];
       WeightType head_and_edge_weight = weight(edge);
       head_and_edge_weight *= head_node_outside_score;
       const int num_tail_nodes = edge.tail_nodes_.size();
@@ -122,6 +124,10 @@ struct InsideOutsides {
   KType compute(Hypergraph const& hg,KWeightFunction const& kwf=KWeightFunction()) {
     return compute(hg,Outside1<KType>(),kwf);
   }
+  template <class KWeightFunction>
+  KType compute_inside(Hypergraph const& hg,KWeightFunction const& kwf=KWeightFunction()) {
+  }
+
   template <class KWeightFunction,class O1>
   KType compute(Hypergraph const& hg,O1 outside1,KWeightFunction const& kwf=KWeightFunction()) {
     typedef typename KWeightFunction::Weight KType2;
@@ -139,9 +145,10 @@ struct InsideOutsides {
     typename XWeightFunction::Result x;      // default constructor is semiring 0
     for (int i = 0,num_nodes=hg.nodes_.size(); i < num_nodes; ++i) {
       const Hypergraph::Node& cur_node = hg.nodes_[i];
-      const int num_in_edges = cur_node.in_edges_.size();
+      Hypergraph::EdgesVector const& in=cur_node.in_edges_;
+      const int num_in_edges = in.size();
       for (int j = 0; j < num_in_edges; ++j) {
-        const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+        const Hypergraph::Edge& edge = hg.edges_[in[j]];
         KType kbar_e = outside[i];
         const int num_tail_nodes = edge.tail_nodes_.size();
         for (int k = 0; k < num_tail_nodes; ++k)
@@ -156,9 +163,10 @@ struct InsideOutsides {
     vs.resize(hg.edges_.size());
     for (int i = 0,num_nodes=hg.nodes_.size(); i < num_nodes; ++i) {
       const Hypergraph::Node& cur_node = hg.nodes_[i];
-      const int num_in_edges = cur_node.in_edges_.size();
+      Hypergraph::EdgesVector const& in=cur_node.in_edges_;
+      const int num_in_edges = in.size();
       for (int j = 0; j < num_in_edges; ++j) {
-        int edgei=cur_node.in_edges_[j];
+        int edgei=in[j];
         const Hypergraph::Edge& edge = hg.edges_[edgei];
         V x=weight(edge)*outside[i];
         const int num_tail_nodes = edge.tail_nodes_.size();
diff --git a/decoder/static_utoa.h b/decoder/static_utoa.h
new file mode 100755
index 00000000..0dbe111f
--- /dev/null
+++ b/decoder/static_utoa.h
@@ -0,0 +1,63 @@
+#ifndef STATIC_UTOA_H
+#define STATIC_UTOA_H
+
+#include "threadlocal.h"
+#include <cstring>
+
+#define DIGIT_LOOKUP_TABLE 0
+
+namespace {
+THREADLOCAL char utoa_buf[] = "01234567890123456789"; // to put end of string character at buf[20]
+const unsigned utoa_bufsize=sizeof(utoa_buf);
+const unsigned utoa_bufsizem1=utoa_bufsize-1;
+#ifdef DIGIT_LOOKUP_TABLE
+char digits[] = "0123456789";
+#endif
+}
+
+inline char digit_to_char(int d) {
+  return
+#ifdef DIGIT_LOOKUP_TABLE
+    digits[d];
+#else
+    '0'+d;
+#endif
+}
+
+
+// returns n in string [return,num); *num=0 yourself calling if you want a c_str
+inline char *utoa(char *num,unsigned n) {
+  if ( !n ) {
+    *--num='0';
+  } else {
+    unsigned rem;
+    // 3digit lookup table, divide by 1000 faster?
+    while ( n ) {
+#if 1
+      rem = n;
+      n /= 10;
+      rem -= 10*n;		// maybe this is faster than mod because we are already dividing
+#else
+      rem = n%10; // would optimizer combine these together?
+      n   = n/10;
+#endif
+      *--num = digit_to_char(rem);
+    }
+  }
+  return num;
+}
+
+inline char *static_utoa(unsigned n) {
+  return utoa(utoa_buf+utoa_bufsizem1,n);
+}
+
+//returns position of '\0' terminating number written starting at to
+inline char* append_utoa(char *to,unsigned n) {
+  char *s=static_utoa(n);
+  int ns=(utoa_buf+utoa_bufsize)-s;
+  std::memcpy(to,s,ns);
+  return to+ns;
+}
+
+
+#endif
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 7b56d259..f0588cfc 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -1,13 +1,19 @@
+#define TD_ALLOW_UNDEFINED_WORDIDS 1
+
+// if 1, word ids that are >= end() will give a numeric token name (single per-thread shared buffer), which of course won't be Convert-able back to the id, because it's not added to the dict.  This is a convenience for logging fake token indices.  Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent.
+
+#include <stdlib.h>
+#include <cstring>
 #include <sstream>
 #include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
 #include "Vocab.h"
 #include "stringlib.h"
+#include "threadlocal.h"
 
 using namespace std;
 
-//FIXME: valgrind errors (static init order?)
 Vocab TD::dict_(0,TD::max_wordid);
 WordID TD::ss=dict_.ssIndex();
 WordID TD::se=dict_.seIndex();
@@ -65,7 +71,23 @@ WordID TD::Convert(char const* s) {
   return dict_.addWord((VocabString)s);
 }
 
-const char* TD::Convert(const WordID& w) {
+
+#if TD_ALLOW_UNDEFINED_WORDIDS
+# include "static_utoa.h"
+char undef_prefix[]="UNDEF_";
+static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]);
+THREADLOCAL char undef_buf[]="UNDEF_________________";
+inline char const* undef_token(WordID w)
+{
+  append_utoa(undef_buf+undefpre_n,w);
+  return undef_buf;
+}
+#endif
+
+const char* TD::Convert(WordID w) {
+#if TD_ALLOW_UNDEFINED_WORDIDS
+  if (w>=dict_.highIndex()) return undef_token(w);
+#endif
   return dict_.getWord((VocabIndex)w);
 }
 
diff --git a/decoder/tdict.h b/decoder/tdict.h
index cb030dc6..a7b3ee1c 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -36,7 +36,7 @@ struct TD {
   static unsigned int NumWords();
   static WordID Convert(const std::string& s);
   static WordID Convert(char const* s);
-  static const char* Convert(const WordID& w);
+  static const char* Convert(WordID w);
 };
 
 struct ToTD {
diff --git a/decoder/threadlocal.h b/decoder/threadlocal.h
new file mode 100755
index 00000000..d79f5d9d
--- /dev/null
+++ b/decoder/threadlocal.h
@@ -0,0 +1,71 @@
+#ifndef THREADLOCAL_H
+#define THREADLOCAL_H
+
+#ifndef SETLOCAL_SWAP
+# define SETLOCAL_SWAP 0
+#endif
+
+#ifdef BOOST_NO_MT
+
+# define THREADLOCAL
+
+#else
+
+#ifdef _MSC_VER
+
+//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html)
+# define THREADLOCAL __declspec(thread)
+
+#else
+
+# define THREADLOCAL __thread
+
+#endif
+
+#endif
+
+#include <algorithm> //swap
+
+// naturally, the below are only thread-safe if value is THREADLOCAL
+template <class D>
+struct SaveLocal {
+    D &value;
+    D old_value;
+    SaveLocal(D& val) : value(val), old_value(val) {}
+    ~SaveLocal() {
+#if SETLOCAL_SWAP
+      swap(value,old_value);
+#else
+      value=old_value;
+#endif
+    }
+};
+
+template <class D>
+struct SetLocal {
+    D &value;
+    D old_value;
+    SetLocal(D& val,const D &new_value) : value(val), old_value(
+#if SETLOCAL_SWAP
+      new_value
+#else
+      val
+#endif
+      ) {
+#if SETLOCAL_SWAP
+      swap(value,old_value);
+#else
+      value=new_value;
+#endif
+    }
+    ~SetLocal() {
+#if SETLOCAL_SWAP
+      swap(value,old_value);
+#else
+      value=old_value;
+#endif
+    }
+};
+
+
+#endif
diff --git a/decoder/tromble_loss.cc b/decoder/tromble_loss.cc
index 9ebd8ab1..24cfef5f 100644
--- a/decoder/tromble_loss.cc
+++ b/decoder/tromble_loss.cc
@@ -1,9 +1,9 @@
 #include "tromble_loss.h"
+#include "fast_lexical_cast.hpp"
 
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/circular_buffer.hpp>
 #include <boost/functional/hash.hpp>
-#include <boost/lexical_cast.hpp>
 #include <boost/range/iterator_range.hpp>
 #include <boost/tokenizer.hpp>
 #include <boost/unordered_map.hpp>
@@ -170,7 +170,7 @@ class TrombleLossComputerImpl {
 
   size_t StateSize() const {
     // n-1 boundary words plus counts for n-grams currently rendered as bytes even though most would fit in bits.
-    // Also, this is cached by higher up classes so no need to cache here.  
+    // Also, this is cached by higher up classes so no need to cache here.
     return MutableState::Size(thetas_.size(), bound_ngram_id_);
   }
 
@@ -179,8 +179,8 @@ class TrombleLossComputerImpl {
       const TRule &rule,
       const vector<const void*> &ant_contexts,
       void *out_context) const {
-    // TODO: get refs from sentence metadata.  
-    // This will require resizable features.  
+    // TODO: get refs from sentence metadata.
+    // This will require resizable features.
     if (smeta.GetSentenceID() >= ref_ids_.size()) {
       std::cerr << "Sentence ID " << smeta.GetSentenceID() << " doesn't have references; there are only " << ref_ids_.size() << " references." << std::endl;
       exit(1);
@@ -216,7 +216,7 @@ class TrombleLossComputerImpl {
           if (++pushed == keep) {
             std::copy(history.begin(), history.end(), out_state.left);
           }
-          // Now i is the length of the history coming from this constituent.  So it needs at least i+1 words to have a cross-child add.  
+          // Now i is the length of the history coming from this constituent.  So it needs at least i+1 words to have a cross-child add.
           AddWord(history, i + 1, ngrams, out_state.counts);
         }
         // If the consituent is shorter than thetas_.size(), then the
diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc
index 46b6a884..b21139df 100644
--- a/decoder/viterbi.cc
+++ b/decoder/viterbi.cc
@@ -1,3 +1,4 @@
+#include "fast_lexical_cast.hpp"
 #include "viterbi.h"
 
 #include <sstream>
@@ -114,7 +115,6 @@ Both relationships are commutative but are not transitive. The relationship defi
 */
 #include <cmath>
 #include <stdexcept>
-#include <boost/lexical_cast.hpp>
 inline bool close_enough(double a,double b,double epsilon)
 {
     using std::fabs;
-- 
cgit v1.2.3