7 files changed, 493 insertions, 34 deletions
diff --git a/utils/indices_after.h b/utils/indices_after.h
index 2891563c..62683f39 100755
--- a/utils/indices_after.h
+++ b/utils/indices_after.h
@@ -47,8 +47,8 @@ unsigned new_indices_keep_if_n(unsigned n,It i,KeepIf const& r,O out)
 }
 
 template <class KEEP,class O>
-unsigned new_indices(KEEP keep,O out) {
-  return new_indices(keep.begin(),keep.end(),out);
+unsigned new_indices_keep(KEEP keep,O out) {
+  return new_indices_keep(keep.begin(),keep.end(),out);
 }
 
 template <class V,class Out,class Permi>
@@ -129,8 +129,10 @@ struct indices_after
     if (n_mapped>0) {
       map=(unsigned *)::operator new(sizeof(unsigned)*n_mapped);
       n_kept=new_indices_keep(i,end,map);
-    } else
+    } else {
+      n_kept=0;
       map=NULL;
+    }
   }
   template <class A>
   void init(A const& a)
@@ -139,7 +141,7 @@ struct indices_after
   }
 
   template <class A>
-  indices_after(A const& a)
+  explicit indices_after(A const& a)
   {
     init(a.begin(),a.end());
   }
diff --git a/utils/logval.h b/utils/logval.h
index 37f14ae5..b337cf0e 100644
--- a/utils/logval.h
+++ b/utils/logval.h
@@ -3,12 +3,13 @@
 
 #define LOGVAL_CHECK_NEG false
 
+#include <boost/functional/hash.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cmath>
 #include <limits>
 
-template <typename T>
+template <class T>
 class LogVal {
  public:
   LogVal() : s_(), v_(-std::numeric_limits<T>::infinity()) {}
@@ -23,6 +24,11 @@ class LogVal {
   static LogVal<T> e() { return LogVal(1,false); }
   void logeq(const T& v) { s_ = false; v_ = v; }
 
+  std::size_t hash_impl() const {
+    using namespace boost;
+    return hash_value(v_)+s_;
+  }
+
   LogVal& operator+=(const LogVal& a) {
     if (a.v_ == -std::numeric_limits<T>::infinity()) return *this;
     if (a.s_ == s_) {
@@ -98,31 +104,31 @@ class LogVal {
 };
 
 // copy elision - as opposed to explicit copy of LogVal<T> const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r.  todo: return std::move(o1) - C++0x
-template<typename T>
+template<class T>
 LogVal<T> operator+(LogVal<T> o1, const LogVal<T>& o2) {
   o1 += o2;
   return o1;
 }
 
-template<typename T>
+template<class T>
 LogVal<T> operator*(LogVal<T> o1, const LogVal<T>& o2) {
   o1 *= o2;
   return o1;
 }
 
-template<typename T>
+template<class T>
 LogVal<T> operator/(LogVal<T> o1, const LogVal<T>& o2) {
   o1 /= o2;
   return o1;
 }
 
-template<typename T>
+template<class T>
 LogVal<T> operator-(LogVal<T> o1, const LogVal<T>& o2) {
   o1 -= o2;
   return o1;
 }
 
-template<typename T>
+template<class T>
 T log(const LogVal<T>& o) {
 #ifdef LOGVAL_CHECK_NEG
   if (o.s_) return log(-1.0);
@@ -130,12 +136,12 @@ T log(const LogVal<T>& o) {
   return o.v_;
 }
 
-template <typename T>
+template <class T>
 LogVal<T> pow(const LogVal<T>& b, const T& e) {
   return b.pow(e);
 }
 
-template <typename T>
+template <class T>
 bool operator<(const LogVal<T>& lhs, const LogVal<T>& rhs) {
   if (lhs.s_ == rhs.s_) {
     return (lhs.v_ < rhs.v_);
@@ -145,28 +151,32 @@ bool operator<(const LogVal<T>& lhs, const LogVal<T>& rhs) {
 }
 
 #if 0
-template <typename T>
+template <class T>
 bool operator<=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
   return (lhs.v_ <= rhs.v_);
 }
 
-template <typename T>
+template <class T>
 bool operator>(const LogVal<T>& lhs, const LogVal<T>& rhs) {
   return (lhs.v_ > rhs.v_);
 }
 
-template <typename T>
+template <class T>
 bool operator>=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
   return (lhs.v_ >= rhs.v_);
 }
 #endif
 
-template <typename T>
+
+template <class T>
+std::size_t hash_value(const LogVal<T>& x) { return x.hash_impl(); }
+
+template <class T>
 bool operator==(const LogVal<T>& lhs, const LogVal<T>& rhs) {
   return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_);
 }
 
-template <typename T>
+template <class T>
 bool operator!=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
   return !(lhs == rhs);
 }
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index e8e9c2f7..7ac85d1d 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -1,16 +1,13 @@
 #ifndef _SPARSE_VECTOR_H_
 #define _SPARSE_VECTOR_H_
 
+/*
+TODO: specialize for int value types, where it probably makes sense to check if adding/subtracting brings a value to 0, and remove it from the map (e.g. in a gibbs sampler).  or add a separate policy argument for that.
+ */
+
 //#define SPARSE_VECTOR_HASH
+// if defined, use hash_map rather than map.  map is probably faster/smaller for small vectors
 
-#ifdef SPARSE_VECTOR_HASH
-#include "hash.h"
-# define SPARSE_VECTOR_MAP HASH_MAP
-# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted)
-#else
-# define SPARSE_VECTOR_MAP std::map
-# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted)
-#endif
 /*
    use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list)
 */
@@ -38,6 +35,17 @@
 // this is a modified version of code originally written
 // by Phil Blunsom
 
+#include <boost/functional/hash.hpp>
+#include <stdexcept>
+#ifdef SPARSE_VECTOR_HASH
+#include "hash.h"
+# define SPARSE_VECTOR_MAP HASH_MAP
+# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted)
+#else
+# define SPARSE_VECTOR_MAP std::map
+# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted)
+#endif
+
 #include <iostream>
 #include <map>
 #include <tr1/unordered_map>
@@ -46,6 +54,7 @@
 
 #include "fdict.h"
 #include "small_vector.h"
+#include "string_to.h"
 
 template <class T>
 inline T & extend_vector(std::vector<T> &v,int i) {
@@ -54,7 +63,7 @@ inline T & extend_vector(std::vector<T> &v,int i) {
   return v[i];
 }
 
-template <typename T>
+template <class T>
 class SparseVector {
   void init_reserved() {
     SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2);
@@ -71,17 +80,97 @@ public:
   SparseVector() {
     init_reserved();
   }
+  typedef typename MapType::value_type value_type;
+  typedef typename MapType::iterator iterator;
   explicit SparseVector(std::vector<T> const& v) {
     init_reserved();
-    typename MapType::iterator p=values_.begin();
+    iterator p=values_.begin();
     const T z=0;
     for (unsigned i=0;i<v.size();++i) {
       T const& t=v[i];
       if (t!=z)
-        p=values_.insert(p,typename MapType::value_type(i,t)); //hint makes insertion faster
+        p=values_.insert(p,value_type(i,t)); //hint makes insertion faster
+    }
+  }
+
+  typedef char const* Str;
+  template <class O>
+  void print(O &o,Str pre="",Str post="",Str kvsep="=",Str pairsep=" ") const {
+    o << pre;
+    bool first=true;
+    for (const_iterator i=values_.begin(),e=values_.end();i!=e;++i) {
+      if (first)
+        first=false;
+      else
+        o<<pairsep;
+      o<<FD::Convert(i->first)<<kvsep<<i->second;
     }
+    o << post;
   }
 
+  static void error(std::string const& msg) {
+    throw std::runtime_error("SparseVector: "+msg);
+  }
+
+  enum DupPolicy {
+    NO_DUPS,
+    KEEP_FIRST,
+    KEEP_LAST,
+    SUM
+  };
+
+  // either key val alternating whitespace sep, or key=val (kvsep char is '=').  end at eof or terminator (non-ws) char
+  template <class S>
+  void read(S &s,DupPolicy dp=NO_DUPS,bool use_kvsep=true,char kvsep='=',bool stop_at_terminator=false,char terminator=')') {
+    values_.clear();
+    std::string id;
+    WordID k;
+    T v;
+#undef SPARSE_MUST_READ
+#define SPARSE_MUST_READ(x) if (!(x)) error(#x);
+    int ki;
+    while (s) {
+      if (stop_at_terminator) {
+        char c;
+        if (!(s>>c)) goto eof;
+        s.unget();
+        if (c==terminator) return;
+      }
+      if (!(s>>id)) goto eof;
+      if (use_kvsep && (ki=id.find(kvsep))!=std::string::npos) {
+        k=FD::Convert(std::string(id,0,ki));
+        string_into(id.c_str()+ki+1,v);
+      } else {
+        k=FD::Convert(id);
+        if (!(s>>v)) error("reading value failed");
+      }
+      std::pair<iterator,bool> vi=values_.insert(value_type(k,v));
+      if (vi.second) {
+        T &oldv=vi.first->second;
+        switch(dp) {
+        case NO_DUPS: error("read duplicate key with NO_DUPS.  key="
+                            +FD::Convert(k)+" val="+to_string(v)+" old-val="+to_string(oldv));
+          break;
+        case KEEP_FIRST: break;
+        case KEEP_LAST: oldv=v; break;
+        case SUM: oldv+=v; break;
+        }
+      }
+    }
+    return;
+  eof:
+    if (!s.eof()) error("reading key failed (before EOF)");
+  }
+
+  friend inline std::ostream & operator<<(std::ostream &o,Self const& s) {
+    s.print(o);
+    return o;
+  }
+
+  friend inline std::istream & operator>>(std::istream &o,Self & s) {
+    s.read(o);
+    return o;
+  }
 
   void init_vector(std::vector<T> *vp) const {
     init_vector(*vp);
@@ -118,6 +207,10 @@ public:
     return values_[index];
   }
 
+  inline void maybe_set_value(int index, const T &value) {
+    if (value) values_[index] = value;
+  }
+
   inline void set_value(int index, const T &value) {
     values_[index] = value;
   }
@@ -352,6 +445,10 @@ public:
     return size()==other.size() && contains_keys_of(other) && other.contains_i(*this);
   }
 
+  std::size_t hash_impl() const {
+    return boost::hash_range(begin(),end());
+  }
+
   bool contains(Self const &o) const {
     return size()>o.size() && contains(o);
   }
@@ -371,7 +468,7 @@ public:
 
   bool contains_keys_of(Self const& o) const {
     for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i)
-      if (values_.find(i)==values_.end())
+      if (values_.find(i->first)==values_.end())
         return false;
     return true;
   }
@@ -478,31 +575,36 @@ private:
   List p;
 };
 
-template <typename T>
+template <class T>
+std::size_t hash_value(SparseVector<T> const& x) {
+  return x.hash_impl();
+}
+
+template <class T>
 SparseVector<T> operator+(const SparseVector<T>& a, const SparseVector<T>& b) {
   SparseVector<T> result = a;
   return result += b;
 }
 
-template <typename T>
+template <class T>
 SparseVector<T> operator*(const SparseVector<T>& a, const double& b) {
   SparseVector<T> result = a;
   return result *= b;
 }
 
-template <typename T>
+template <class T>
 SparseVector<T> operator*(const SparseVector<T>& a, const T& b) {
   SparseVector<T> result = a;
   return result *= b;
 }
 
-template <typename T>
+template <class T>
 SparseVector<T> operator*(const double& a, const SparseVector<T>& b) {
   SparseVector<T> result = b;
   return result *= a;
 }
 
-template <typename T>
+template <class T>
 std::ostream &operator<<(std::ostream &out, const SparseVector<T> &vec)
 {
     return vec.operator<<(out);
diff --git a/utils/string_to.h b/utils/string_to.h
new file mode 100755
index 00000000..c78a5394
--- /dev/null
+++ b/utils/string_to.h
@@ -0,0 +1,314 @@
+#ifndef STRING_TO_H
+#define STRING_TO_H
+
+/*
+   may not be any faster than boost::lexical_cast in later incarnations (see http://accu.org/index.php/journals/1375)
+   but is slightly simpler.  no wide char or locale.
+
+   X string_to<X>(string);
+   string to_string(X);
+   X& string_into(string,X &); // note: returns the same ref you passed in, for convenience of use
+
+   default implementation via stringstreams (quite slow, I'm sure)
+
+   fast implementation for string, int<->string, unsigned<->string, float<->string, double<->string
+
+*/
+
+#ifndef USE_FTOA
+#define USE_FTOA 1
+#endif
+#ifndef HAVE_STRTOUL
+# define HAVE_STRTOUL 1
+#endif
+
+#include <string>
+#include <sstream>
+#include <stdexcept>
+#include <cstdlib>
+
+#include "have_64_bits.h"
+#include "utoa.h"
+#if USE_FTOA
+# include "ftoa.h"
+#endif
+
+namespace {
+// for faster numeric to/from string.  TODO: separate into optional header
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h> // access to evil (fast) C isspace etc.
+#include <limits.h> //strtoul
+}
+
+inline void throw_string_to(std::string const& msg,char const* prefix="string_to: ") {
+  throw std::runtime_error(prefix+msg);
+}
+
+template <class I,class To>
+bool try_stream_into(I & i,To &to,bool complete=true)
+{
+    i >> to;
+    if (i.fail()) return false;
+    if (complete) {
+        char c;
+        return !(i >> c);
+    }
+    return true;
+}
+
+template <class Str,class To>
+bool try_string_into(Str const& str,To &to,bool complete=true)
+{
+    std::istringstream i(str);
+    return try_stream_into(i,to,complete);
+}
+
+template <class Str,class Data> inline
+Data & string_into(const Str &str,Data &data)
+{
+    if (!try_string_into(str,data))
+        throw std::runtime_error(std::string("Couldn't convert (string_into): ")+str);
+    return data;
+}
+
+
+template <class Data,class Str> inline
+Data string_to(const Str &str)
+{
+    Data ret;
+    string_into(str,ret);
+    return ret;
+}
+
+template <class D> inline
+std::string to_string(D const &d)
+{
+    std::ostringstream o;
+    o << d;
+    return o.str();
+}
+
+inline std::string to_string(unsigned x) {
+  return utos(x);
+}
+
+inline std::string to_string(int x) {
+  return itos(x);
+}
+
+inline long strtol_complete(char const* s,int base=10) {
+  char *e;
+  if (*s) {
+    long r=strtol(s,&e,base);
+    char c=*e;
+    if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos.  TODO: loop consuming spaces until end?
+      return r;
+  }
+  throw_string_to(s,"Couldn't convert to integer: ");
+}
+
+// returns -INT_MAX or INT_MAX if number is too large/small
+inline int strtoi_complete_bounded(char const* s,int base=10) {
+  long l=strtol_complete(s,base);
+  if (l<std::numeric_limits<int>::min())
+    return std::numeric_limits<int>::min();
+  if (l>std::numeric_limits<int>::max())
+    return std::numeric_limits<int>::max();
+  return l;
+}
+#define RANGE_STR(x) #x
+#ifdef INT_MIN
+# define INTRANGE_STR "[" RANGE_STR(INT_MIN) "," RANGE_STR(INT_MAX) "]"
+#else
+# define INTRANGE_STR "[-2137483648,2147483647]"
+#endif
+
+  // throw if out of int range
+inline int strtoi_complete_exact(char const* s,int base=10) {
+  long l=strtol_complete(s,base);
+  if (l<std::numeric_limits<int>::min() || l>std::numeric_limits<int>::max())
+    throw_string_to(s,"Out of range for int " INTRANGE_STR ": ");
+  return l;
+}
+
+#if HAVE_LONGER_LONG
+inline int& string_into(std::string const& s,int &x) {
+  x=strtoi_complete_exact(s.c_str());
+  return x;
+}
+inline int& string_into(char const* s,int &x) {
+  x=strtoi_complete_exact(s);
+  return x;
+}
+#endif
+
+inline long& string_into(std::string const& s,long &x) {
+  x=strtol_complete(s.c_str());
+  return x;
+}
+inline long& string_into(char const* s,long &x) {
+  x=strtol_complete(s);
+  return x;
+}
+
+
+//FIXME: preprocessor separation for tokens int<->unsigned int, long<->unsigned long, strtol<->strtoul ?  massive code duplication
+inline unsigned long strtoul_complete(char const* s,int base=10) {
+  char *e;
+  if (*s) {
+#if HAVE_STRTOUL
+    unsigned long r=strtoul(s,&e,base);
+#else
+//    unsigned long r=strtol(s,&e,base); //FIXME: not usually safe
+    unsigned long r;
+    sscanf(s,"%ul",&r);
+#endif
+    char c=*e;
+    if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos.  TODO: loop consuming spaces until end?
+      return r;
+  }
+  throw_string_to(s,"Couldn't convert to integer: ");
+}
+
+inline unsigned strtou_complete_bounded(char const* s,int base=10) {
+  unsigned long l=strtoul_complete(s,base);
+  if (l<std::numeric_limits<unsigned>::min())
+    return std::numeric_limits<unsigned>::min();
+  if (l>std::numeric_limits<unsigned>::max())
+    return std::numeric_limits<unsigned>::max();
+  return l;
+}
+
+#ifdef UINT_MIN
+# define UINTRANGE_STR "[" RANGE_STR(UINT_MIN) "," RANGE_STR(UINT_MAX) "]"
+#else
+# define UINTRANGE_STR "[0,4,294,967,295]"
+#endif
+
+  // throw if out of int range
+inline unsigned strtou_complete_exact(char const* s,int base=10) {
+  unsigned long l=strtoul_complete(s,base);
+  if (l<std::numeric_limits<unsigned>::min() || l>std::numeric_limits<unsigned>::max())
+    throw_string_to(s,"Out of range for uint " UINTRANGE_STR ": ");
+  return l;
+}
+
+#if HAVE_LONGER_LONG
+inline unsigned& string_into(std::string const& s,unsigned &x) {
+  x=strtou_complete_exact(s.c_str());
+  return x;
+}
+inline unsigned& string_into(char const* s,unsigned &x) {
+  x=strtou_complete_exact(s);
+  return x;
+}
+#endif
+
+inline unsigned long& string_into(std::string const& s,unsigned long &x) {
+  x=strtoul_complete(s.c_str());
+  return x;
+}
+inline unsigned long& string_into(char const* s,unsigned long &x) {
+  x=strtoul_complete(s);
+  return x;
+}
+
+//FIXME: end code duplication
+
+
+/* 9 decimal places needed to avoid rounding error in float->string->float.  17 for double->string->double
+   in terms of usable decimal places, there are 6 for float and 15 for double
+ */
+inline std::string to_string_roundtrip(float x) {
+  char buf[17];
+  return std::string(buf,buf+sprintf(buf,"%.9g",x));
+}
+inline std::string to_string(float x) {
+#if USE_FTOA
+  return ftos(x);
+#else
+  char buf[15];
+  return std::string(buf,buf+sprintf(buf,"%.7g",x));
+#endif
+}
+inline std::string to_string_roundtrip(double x) {
+  char buf[32];
+  return std::string(buf,buf+sprintf(buf,"%.17g",x));
+}
+inline std::string to_string(double x) {
+#if USE_FTOA
+  return ftos(x);
+#else
+  char buf[30];
+  return std::string(buf,buf+sprintf(buf,"%.15g",x));
+#endif
+}
+
+inline double& string_into(char const* s,double &x) {
+  x=std::atof(s);
+  return x;
+}
+inline float& string_into(char const* s,float &x) {
+  x=std::atof(s);
+  return x;
+}
+
+inline double& string_into(std::string const& s,double &x) {
+  x=std::atof(s.c_str());
+  return x;
+}
+inline float& string_into(std::string const& s,float &x) {
+  x=std::atof(s.c_str());
+  return x;
+}
+
+
+template <class Str>
+bool try_string_into(Str const& str,Str &to,bool complete=true)
+{
+    str=to;
+    return true;
+}
+
+inline std::string const& to_string(std::string const& d)
+{
+    return d;
+}
+
+template <class Str>
+Str const& string_to(Str const &s)
+{
+    return s;
+}
+
+template <class Str>
+Str & string_into(Str const &s,Str &d)
+{
+    return d=s;
+}
+
+/*
+
+template <class Str,class Data,class size_type> inline
+void substring_into(const Str &str,size_type pos,size_type n,Data &data)
+{
+//    std::istringstream i(str,pos,n); // doesn't exist!
+    std::istringstream i(str.substr(pos,n));
+    if (!(i>>*data))
+        throw std::runtime_error("Couldn't convert (string_into): "+str);
+}
+
+template <class Data,class Str,class size_type> inline
+Data string_to(const Str &str,size_type pos,size_type n)
+{
+    Data ret;
+    substring_into(str,pos,n,ret);
+    return ret;
+}
+
+*/
+
+
+
+#endif
diff --git a/utils/warning_compiler.h b/utils/warning_compiler.h
new file mode 100644
index 00000000..2052cff3
--- /dev/null
+++ b/utils/warning_compiler.h
@@ -0,0 +1,15 @@
+#ifndef WARNING_COMPILER_HPP
+#define WARNING_COMPILER_HPP
+
+#ifndef HAVE_GCC_4_4
+#undef HAVE_DIAGNOSTIC_PUSH
+#if __GNUC__ > 4 || __GNUC__==4 && __GNUC_MINOR__ > 3
+# define HAVE_GCC_4_4 1
+# define HAVE_DIAGNOSTIC_PUSH 1
+#else
+# define HAVE_GCC_4_4 0
+# define HAVE_DIAGNOSTIC_PUSH 0
+#endif
+#endif
+
+#endif
diff --git a/utils/warning_pop.h b/utils/warning_pop.h
new file mode 100644
index 00000000..2be5d9ed
--- /dev/null
+++ b/utils/warning_pop.h
@@ -0,0 +1,8 @@
+#ifdef _MSC_VER
+#pragma warning( pop )
+#else
+#include "warning_compiler.h"
+#if HAVE_DIAGNOSTIC_PUSH
+# pragma GCC diagnostic pop
+#endif
+#endif
diff --git a/utils/warning_push.h b/utils/warning_push.h
new file mode 100644
index 00000000..086fd524
--- /dev/null
+++ b/utils/warning_push.h
@@ -0,0 +1,8 @@
+#ifdef _MSC_VER
+#pragma warning( pop )
+#else
+#include "warning_compiler.h"
+#if HAVE_DIAGNOSTIC_PUSH
+# pragma GCC diagnostic push
+#endif
+#endif