sparse_vector use google::dense_hash_map, fsa scan logging

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@383 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 19:17:22 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 19:17:22 +0000
commit: 00be683d744ea87e67dd13db08e4a17531bfb1f3 (patch)
tree: 7caccb91b31a1ffa1930a29ebf6f30b27e6f5f69
parent: 8d222e20d8f253aa2c73d139d8ae6cc69483d071 (diff)
10 files changed, 196 insertions, 85 deletions
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index 6fb6d5a1..f366a08f 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -749,7 +749,7 @@ int main(int argc, char** argv) {
         }
 
         if (output_training_vector) {
-          acc_vec.clear_value(0);
+          acc_vec.erase(0);
           ++g_count;
           if (g_count % combine_size == 0) {
             if (encode_b64) {
diff --git a/decoder/ff.h b/decoder/ff.h
index e54ac149..0bfc8582 100644
--- a/decoder/ff.h
+++ b/decoder/ff.h
@@ -175,7 +175,7 @@ class ModelSet {
 
   // sets edge->feature_values_ and edge->edge_prob_
   // NOTE: edge must not necessarily be in hg.edges_ but its TAIL nodes
-  // must be.
+  // must be.  edge features are supposed to be overwritten, not added to (possibly because rule features aren't in ModelSet so need to be left alone
   void AddFeaturesToEdge(const SentenceMetadata& smeta,
                          const Hypergraph& hg,
                          const std::vector<std::string>& node_states,
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
index 6f2e27f0..f84bda31 100755
--- a/decoder/ff_from_fsa.h
+++ b/decoder/ff_from_fsa.h
@@ -39,6 +39,7 @@ public:
                              FeatureVector* estimated_features,
                              void* out_state) const
   {
+    ff.init_features(features); // estimated_features is fresh
     if (!ssz) {
       TRule const& rule=*edge.rule_;
       Sentence const& e = rule.e();
@@ -112,11 +113,11 @@ public:
                                       FeatureVector* final_features) const
   {
     Sentence const& ends=ff.end_phrase();
-    SP ss=ff.start_state();
     if (!ssz) {
-      AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,ss);
+      AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,0);
       return;
     }
+    SP ss=ff.start_state();
     WP l=(WP)residual_state,lend=left_end(residual_state);
     SP rst=fsa_state(residual_state);
     if (lend==rst) { // implying we have an fsa state
@@ -137,6 +138,15 @@ public:
     return StateSize()==0; // Fsa features don't get info about span
   }
 
+  static void test() {
+    WordID w1[1],w1b[1],w2[2];
+    w1[0]=w2[0]=TD::Convert("hi");
+    w2[1]=w1b[0]=TD::none;
+    assert(left_end(w1,w1+1)==w1+1);
+    assert(left_end(w1b,w1b+1)==w1b);
+    assert(left_end(w2,w2+2)==w2+1);
+  }
+
 private:
   Impl ff;
   void Init() {
@@ -147,8 +157,8 @@ private:
     SetStateSize(ff.state_bytes()+state_offset);
   }
   int M; // markov order (ctx len)
-  FeatureFunctionFromFsa() {  }
-  // call this explicitly in constructor body:
+  FeatureFunctionFromFsa(); // not allowed.
+
   int state_offset; // store left-words first, then fsa state
   int ssz; // bytes in fsa state
   /*
@@ -183,10 +193,17 @@ private:
   inline void fstatecpy(void *dest,void const* src) const {
     std::memcpy(dest,src,ssz);
   }
-
-
 };
 
 
+#ifdef TEST_FSA
+# include "tdict.cc"
+# include "ff_sample_fsa.h"
+int main() {
+  std::cerr<<"Testing left_end...\n";
+  WordPenaltyFromFsa::test();
+  return 0;
+}
+#endif
 
 #endif
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index 3a9478e2..e359cfd9 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -3,7 +3,13 @@
 
 //SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
 
-//TODO: actually compile this; probably full of syntax errors.
+#define FSA_DEBUG
+#ifdef FSA_DEBUG
+# include <iostream>
+# define FSADBG(x) do { std::cerr << x; } while(0)
+#else
+# define FSADBG(x)
+#endif
 
 #include <stdint.h> //C99
 #include <string>
@@ -34,7 +40,6 @@ protected:
     if (h_start.size()!=sb) h_start.resize(sb);
     state_bytes_=sb;
   }
-
   int fid_; // you can have more than 1 feature of course.
   void init_fid(std::string const& name) { // call this, though, if you have a single feature
     fid_=FD::Convert(name);
@@ -54,6 +59,11 @@ protected:
     to_state(state,(char const*)begin,(char const*)end);
   }
 public:
+  //edges may have old features on them.  override if you have more than 1 fid.  we need to call this explicitly because edges may have old feature values already, and I chose to use add_value (+=) to simplify scanning a phrase, rather than set_value (=) for fsa ffs.  could revisit this and use set_value and therefore sum
+  void init_features(FeatureVector *fv) const {
+    fv->set_value(fid_,0);
+    //erase(fid_)
+  }
   // return m: all strings x with the same final m+1 letters must end in this state
   /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */
   int markov_order() const { return 0; } // override if you use state.  order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though)
@@ -87,6 +97,69 @@ public:
 
 };
 
+// init state is in cs; overwrite cs, ns repeatedly (alternatively).  return resulting state
+template <class FsaFF>
+void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *features, void *cs,void *ns) {
+  // extra code - IT'S FOR EFFICIENCY, MAN!  IT'S OK!  definitely no bugs here.
+  void *os,*es;
+  WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking
+  if ((end-i)&1) { // odd # of words
+    os=cs;
+    es=ns;
+    i-=1;
+    goto odd;
+  } else {
+    es=cs;
+    os=ns;
+  }
+  for (;i<e;i+=2) {
+    ff.Scan(smeta,*i,es,os,features); // e->o
+  odd:
+    ff.Scan(smeta,*(i+1),os,es,features); // o->e
+  }
+  return es;
+}
+
+// if you have a more efficient implementation for scanning a phrase than one word at a time (e.g. LM context using sliding window buffer rather than rotating through a fixed state size), you can override this
+template <class FsaFF>
+void Scan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i,WordID const* e,void const* state,void *next_state,FeatureVector *features) {
+  int ssz=ff.state_bytes();
+  if (!ssz) {
+    for (;i<e;++i)
+      ff.Scan(smeta,*i,0,0,features);
+    return;
+  }
+  Bytes tstate(ssz);
+  void *tst=tstate.begin();
+  bool odd=(e-i)&1;
+  void *cs,*ns;
+  if (odd) {
+    cs=tst;
+    ns=next_state;
+  } else {
+    cs=next_state;
+    ns=tst;
+  }
+  std::memcpy(cs,state,ssz);
+  void *est=FsaScan(ff,smeta,i,end,features,cs,ns);
+  assert(est==next_state);
+}
+
+
+// like above Scan, but don't bother storing final state (for FinalTraversalFeatures)
+template <class FF>
+void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) {
+  int ssz=ff.state_bytes();
+  if (ssz) {
+    Bytes state(ssz),state2(ssz);
+    void *cs=state.begin(),*ns=state2.begin();
+    memcpy(cs,start_state,ff.state_bytes());
+    FsaScan(ff,smeta,i,end,h_features,cs,ns);
+  } else
+    for (;i<end;++i)
+      ff.Scan(smeta,*i,0,0,h_features);
+}
+
 // if State is pod.  sets state size and allocs start, h_start
 template <class St>
 struct FsaTypedBase : public FsaFeatureFunctionBase {
@@ -114,41 +187,20 @@ public:
   }
 };
 
+
 // usage (if you're lazy):
 // struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev>
-template <class Impl>
-struct FsaTypedScan  {
+template <class St,class Impl>
+struct FsaTypedScan : public FsaTypedBase<St> {
   void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
     Impl const* impl=static_cast<Impl const*>(this);
-    impl->Scan(smeta,w,impl->state(st),impl->state(next_state),features);
+    impl->ScanTyped(smeta,w,impl->state(st),impl->state(next_state),features);
+    FSADBG("Scan "<<impl->state(st)<<" ->"<<TD::Convert(w)<<" "<<impl->state(next_state)<<" = "<<(*features)[impl->fid_]<<std::endl);
   }
 };
 
 
 
-// init state is in cs; overwrite cs, ns repeatedly (alternatively).  return resulting state
-template <class FsaFF>
-void *FsaScan(FsaFF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features, void *cs,void *ns) {
-  // extra code - IT'S FOR EFFICIENCY, MAN!  IT'S OK!  definitely no bugs here.
-  void *os,*es;
-  WordID const* e=end-1; // boundcheck 1 earlier because in loop below we use i+1 before rechecking
-  if ((end-i)&1) { // odd # of words
-    os=cs;
-    es=ns;
-    i-=1;
-    goto odd;
-  } else {
-    es=cs;
-    os=ns;
-  }
-  for (;i<e;i+=2) {
-    ff.Scan(smeta,*i,es,os,h_features); // e->o
-  odd:
-    ff.Scan(smeta,*(i+1),os,es,h_features); // o->e
-  }
-  return es;
-}
-
 // do not use if state size is 0, please.
 const bool optimize_FsaScanner_zerostate=false;
 
@@ -205,20 +257,6 @@ struct FsaScanner {
 };
 
 
-template <class FF>
-void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, WordID const* end,FeatureVector *h_features,void const* start_state) {
-  int ssz=ff.state_bytes();
-  if (ssz) {
-    Bytes state(ssz),state2(ssz);
-    void *cs=state.begin(),*ns=state2.begin();
-    memcpy(cs,start_state,ff.state_bytes());
-    FsaScan(ff,smeta,i,end,h_features,cs,ns);
-  } else
-    for (;i<end;++i)
-      ff.Scan(smeta,*i,0,0,h_features);
-}
-
-
 //TODO: combine 2 FsaFeatures typelist style (can recurse for more)
 
 
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 13a25387..947ad21c 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -57,7 +57,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase {
 //    int ss=-1;
 //    wordcpy((WordID*)start.begin(),&ss,&ss+1);
     //to_state(start.begin(),&ss,1);
-    wordlen(start.begin())=-1; // same as above.
+    wordlen(start.begin())=-1;
     wordlen(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
   }
 
@@ -73,8 +73,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase {
 };
 
 // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State)
-struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev> {
-  typedef int State; // defines # of bytes in state and return type of state(void *)
+struct ShorterThanPrev : FsaTypedScan<int,ShorterThanPrev> {
   static std::string usage(bool param,bool verbose) {
     return FeatureFunction::usage_helper(
       "ShorterThanPrev",
@@ -88,22 +87,22 @@ struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev>
   }
   ShorterThanPrev(std::string const& param) {
     init_fid(usage(false,false));
-    end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing
+//    end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing
     set_starts(-1,4); // estimate: anything <4 chars is usually shorter than previous
   }
 
   static const float val_per_target_word=-1;
   // evil anti-google int & len out-param:
-  void Scan(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const {
+  void ScanTyped(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const {
     len=wordlen(w);
     if (len<prevlen)
       features->add_value(fid_,val_per_target_word);
   }
 
   // already provided by FsaTypedScan<ShorterThanPrev>
-  void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
-    Scan(smeta,w,state(st),state(next_state),features);
-  }
+/*  void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
+    ScanTyped(smeta,w,state(st),state(next_state),features);
+    } */
 
 };
 
diff --git a/decoder/hg.cc b/decoder/hg.cc
index f0d6b3d5..5a33fc97 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -54,12 +54,13 @@ struct ScaledTransitionEventWeightFunction {
 // safe to reinterpret a vector of these as a vector of prob_t (plain old data)
 struct TropicalValue {
   TropicalValue() : v_() {}
-  explicit TropicalValue(int v) {
+  TropicalValue(int v) {
     if (v == 0) v_ = prob_t::Zero();
     else if (v == 1) v_ = prob_t::One();
     else { cerr << "Bad value in TropicalValue(int).\n"; abort(); }
   }
-  explicit TropicalValue(const prob_t& v) : v_(v) {}
+  TropicalValue(unsigned v) : v_(v) {}
+  TropicalValue(const prob_t& v) : v_(v) {}
   inline TropicalValue& operator+=(const TropicalValue& o) {
     if (v_ < o.v_) v_ = o.v_;
     return *this;
diff --git a/decoder/logval.h b/decoder/logval.h
index c8c342a3..37f14ae5 100644
--- a/decoder/logval.h
+++ b/decoder/logval.h
@@ -13,6 +13,8 @@ class LogVal {
  public:
   LogVal() : s_(), v_(-std::numeric_limits<T>::infinity()) {}
   explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {}
+  LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
+  LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
   LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {}
   static LogVal<T> exp(T lnx) { return LogVal(lnx,false); }
 
diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h
index b42e001a..0f3724f0 100644
--- a/decoder/sparse_vector.h
+++ b/decoder/sparse_vector.h
@@ -1,6 +1,17 @@
 #ifndef _SPARSE_VECTOR_H_
 #define _SPARSE_VECTOR_H_
-/* TODO: use dense_hash_map for sparsevector
+
+#define SPARSE_VECTOR_HASH
+
+#ifdef SPARSE_VECTOR_HASH
+#include "hash.h"
+# define SPARSE_VECTOR_MAP HASH_MAP
+# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted)
+#else
+# define SPARSE_VECTOR_MAP std::map
+# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted)
+#endif
+/*
    use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list)
 */
 /* hack: index 0 never gets printed because cdyer is creative and efficient. features which have no weight got feature dict id 0, see, and the models all clobered that value.  nobody wants to see it.  except that vlad is also creative and efficient and stored the oracle bleu there. */
@@ -27,21 +38,28 @@ inline T & extend_vector(std::vector<T> &v,int i) {
 
 template <typename T>
 class SparseVector {
+  void init_reserved() {
+    SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2);
+  }
 public:
-  typedef std::map<int, T> MapType;
-  typedef typename std::map<int, T>::const_iterator const_iterator;
-  SparseVector() {}
+  typedef SparseVector<T> Self;
+  typedef SPARSE_VECTOR_MAP<int, T> MapType;
+  typedef typename MapType::const_iterator const_iterator;
+  SparseVector() {
+    init_reserved();
+  }
   explicit SparseVector(std::vector<T> const& v) {
+    init_reserved();
     typename MapType::iterator p=values_.begin();
-    const T z=T(0);
+    const T z=0;
     for (unsigned i=0;i<v.size();++i) {
       T const& t=v[i];
       if (t!=z)
         p=values_.insert(p,typename MapType::value_type(i,t)); //hint makes insertion faster
     }
-
   }
 
+
   void init_vector(std::vector<T> *vp) const {
     init_vector(*vp);
   }
@@ -52,7 +70,6 @@ public:
       extend_vector(v,i->first)=i->second;
   }
 
-
   void set_new_value(int index, T const& val) {
     assert(values_.find(index)==values_.end());
     values_[index]=val;
@@ -65,10 +82,10 @@ public:
   }
 
 
-  const T operator[](int index) const {
+  T operator[](int index) const {
     typename MapType::const_iterator found = values_.find(index);
     if (found == values_.end())
-      return T(0);
+      return 0;
     else
       return found->second;
   }
@@ -77,8 +94,11 @@ public:
     values_[index] = value;
   }
 
-    T add_value(int index, const T &value) {
-        return values_[index] += value;
+    T const& add_value(int index, const T &value) {
+      std::pair<typename MapType::iterator,bool> art=values_.insert(std::make_pair(index,value));
+      T &val=art.first->second;
+      if (!art.second) val += value; // already existed
+      return val;
     }
 
     T value(int index) const {
@@ -86,7 +106,7 @@ public:
         if (found != values_.end())
             return found->second;
         else
-            return T(0);
+            return 0;
     }
 
     void store(std::valarray<T>* target) const {
@@ -184,13 +204,20 @@ public:
       return sqrt(l2norm_sq());
     }
 
+  void erase(int key) {
+    values_.erase(key);
+/*    typename MapType::iterator found = values_.find(key);
+    if (found!=values_end())
+    values_.erase(found);*/
+  }
+
     SparseVector<T> &operator+=(const SparseVector<T> &other) {
         for (typename MapType::const_iterator
                 it = other.values_.begin(); it != other.values_.end(); ++it)
         {
-            T v = (values_[it->first] += it->second);
-            if (v == T())
-                values_.erase(it->first);
+//            T v =
+              (values_[it->first] += it->second);
+//            if (v == 0) values_.erase(it->first);
         }
         return *this;
     }
@@ -199,9 +226,9 @@ public:
         for (typename MapType::const_iterator
                 it = other.values_.begin(); it != other.values_.end(); ++it)
         {
-            T v = (values_[it->first] -= it->second);
-            if (v == T(0))
-                values_.erase(it->first);
+//            T v =
+          (values_[it->first] -= it->second);
+//            if (v == 0) values_.erase(it->first);
         }
         return *this;
     }
@@ -282,6 +309,35 @@ public:
         }
     }
 
+  bool operator==(Self const & other) const {
+    return size()==other.size() && contains_keys_of(other) && other.contains_i(*this);
+  }
+
+  bool contains(Self const &o) const {
+    return size()>o.size() && contains(o);
+  }
+
+  bool at_equals(int i,T const& val) const {
+    const_iterator it=values_.find(i);
+    if (it==values_.end()) return val==0;
+    return it->second==val;
+  }
+
+  bool contains_i(Self const& o) const {
+    for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i)
+      if (!at_equals(i->first,i->second))
+        return false;
+    return true;
+  }
+
+  bool contains_keys_of(Self const& o) const {
+    for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i)
+      if (values_.find(i)==values_.end())
+        return false;
+    return true;
+  }
+
+#ifndef SPARSE_VECTOR_HASH
     bool operator<(const SparseVector<T> &other) const {
         typename MapType::const_iterator it = values_.begin();
         typename MapType::const_iterator other_it = other.values_.begin();
@@ -295,6 +351,7 @@ public:
         }
         return values_.size() < other.values_.size();
     }
+#endif
 
   int size() const { return values_.size(); }
 
@@ -307,9 +364,6 @@ public:
     void clear() {
         values_.clear();
     }
-    void clear_value(int index) {
-      values_.erase(index);
-    }
 
     void swap(SparseVector<T>& other) {
       values_.swap(other.values_);
@@ -328,7 +382,7 @@ class SparseVectorList {
   SparseVectorList() {  }
   template <class I>
   SparseVectorList(I i,I const& end) {
-    const T z=T(0);
+    const T z=0;
     int c=0;
     for (;i<end;++i,++c) {
       if (*i!=z)
@@ -337,7 +391,7 @@ class SparseVectorList {
     p.compact();
   }
   explicit SparseVectorList(std::vector<T> const& v) {
-    const T z=T(0);
+    const T z=0;
     for (unsigned i=0;i<v.size();++i) {
       T const& t=v[i];
       if (t!=z)
diff --git a/decoder/value_array.h b/decoder/value_array.h
index 0cb5c3d6..6fa221ee 100755
--- a/decoder/value_array.h
+++ b/decoder/value_array.h
@@ -61,7 +61,7 @@ public:
 protected:
   inline void init(size_type s, const_reference t = T()) {
     sz=s;
-    array=A::allocate(s);
+    array=s ? A::allocate(s) : 0;
     for (size_type i = 0; i != sz; ++i) { A::construct(array + i,t); }
   }
 public:
diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc
index 8332a106..0be1834d 100644
--- a/extools/mr_stripe_rule_reduce.cc
+++ b/extools/mr_stripe_rule_reduce.cc
@@ -87,7 +87,7 @@ void DoPhraseMarginals(const vector<WordID>& key, const bool bidir, ID2RuleStati
     it->second.counts.set_value(cur_marginal_id, tot);
 
     // prevent double counting of the joint
-    if (cur_marginal_id == kCE) it->second.counts.clear_value(kCFE);
+    if (cur_marginal_id == kCE) it->second.counts.erase(kCFE);
   }
 }
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 19:17:22 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 19:17:22 +0000
commit	00be683d744ea87e67dd13db08e4a17531bfb1f3 (patch)
tree	7caccb91b31a1ffa1930a29ebf6f30b27e6f5f69
parent	8d222e20d8f253aa2c73d139d8ae6cc69483d071 (diff)