34 files changed, 701 insertions, 595 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index fabb4454..dd74ddc0 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -22,6 +22,7 @@ libutils_a_SOURCES = \
   alias_sampler.h \
   alignment_io.h \
   array2d.h \
+  b64featvector.h \
   b64tools.h \
   batched_append.h \
   city.h \
@@ -38,11 +39,8 @@ libutils_a_SOURCES = \
   have_64_bits.h \
   indices_after.h \
   kernel_string_subseq.h \
-  lbfgs.h \
-  lbfgs.cpp \
   logval.h \
   m.h \
-  mathvec.h \
   maxent.h \
   maxent.cpp \
   murmur_hash3.h \
@@ -50,8 +48,6 @@ libutils_a_SOURCES = \
   named_enum.h \
   null_deleter.h \
   null_traits.h \
-  owlqn.cpp \
-  sgd.cpp \
   perfect_hash.h \
   prob.h \
   sampler.h \
@@ -77,6 +73,7 @@ libutils_a_SOURCES = \
   fast_lexical_cast.hpp \
   intrusive_refcount.hpp \
   alignment_io.cc \
+  b64featvector.cc \
   b64tools.cc \
   corpus_tools.cc \
   dict.cc \
diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h
index 81541f7a..0f9d3f6d 100644
--- a/utils/alias_sampler.h
+++ b/utils/alias_sampler.h
@@ -1,5 +1,5 @@
-#ifndef _ALIAS_SAMPLER_H_
-#define _ALIAS_SAMPLER_H_
+#ifndef ALIAS_SAMPLER_H_
+#define ALIAS_SAMPLER_H_
 
 #include <vector>
 #include <limits>
diff --git a/utils/alignment_io.h b/utils/alignment_io.h
index 63fb916b..ec70688e 100644
--- a/utils/alignment_io.h
+++ b/utils/alignment_io.h
@@ -1,5 +1,5 @@
-#ifndef _ALIGNMENT_IO_H_
-#define _ALIGNMENT_IO_H_
+#ifndef ALIGNMENT_IO_H_
+#define ALIGNMENT_IO_H_
 
 #include <string>
 #include <iostream>
diff --git a/utils/b64featvector.cc b/utils/b64featvector.cc
new file mode 100644
index 00000000..c7d08b29
--- /dev/null
+++ b/utils/b64featvector.cc
@@ -0,0 +1,55 @@
+#include "b64featvector.h"
+
+#include <sstream>
+#include <boost/scoped_array.hpp>
+#include "b64tools.h"
+#include "fdict.h"
+
+using namespace std;
+
+static inline void EncodeFeatureWeight(const string &featname, weight_t weight,
+                                       ostream *output) {
+  output->write(featname.data(), featname.size() + 1);
+  output->write(reinterpret_cast<char *>(&weight), sizeof(weight_t));
+}
+
+string EncodeFeatureVector(const SparseVector<weight_t> &vec) {
+  string b64;
+  {
+    ostringstream base64_strm;
+    {
+      ostringstream strm;
+      for (SparseVector<weight_t>::const_iterator it = vec.begin();
+           it != vec.end(); ++it)
+        if (it->second != 0)
+          EncodeFeatureWeight(FD::Convert(it->first), it->second, &strm);
+      string data(strm.str());
+      B64::b64encode(data.data(), data.size(), &base64_strm);
+    }
+    b64 = base64_strm.str();
+  }
+  return b64;
+}
+
+void DecodeFeatureVector(const string &data, SparseVector<weight_t> *vec) {
+  vec->clear();
+  if (data.empty()) return;
+  // Decode data
+  size_t b64_len = data.size(), len = b64_len / 4 * 3;
+  boost::scoped_array<char> buf(new char[len]);
+  bool res =
+      B64::b64decode(reinterpret_cast<const unsigned char *>(data.data()),
+                     b64_len, buf.get(), len);
+  assert(res);
+  // Apply updates
+  size_t cur = 0;
+  while (cur < len) {
+    string feat_name(buf.get() + cur);
+    if (feat_name.empty()) break;  // Encountered trailing \0
+    int feat_id = FD::Convert(feat_name);
+    weight_t feat_delta =
+        *reinterpret_cast<weight_t *>(buf.get() + cur + feat_name.size() + 1);
+    (*vec)[feat_id] = feat_delta;
+    cur += feat_name.size() + 1 + sizeof(weight_t);
+  }
+}
diff --git a/utils/b64featvector.h b/utils/b64featvector.h
new file mode 100644
index 00000000..6ac04d44
--- /dev/null
+++ b/utils/b64featvector.h
@@ -0,0 +1,12 @@
+#ifndef _B64FEATVECTOR_H_
+#define _B64FEATVECTOR_H_
+
+#include <string>
+
+#include "sparse_vector.h"
+#include "weights.h"
+
+std::string EncodeFeatureVector(const SparseVector<weight_t> &);
+void DecodeFeatureVector(const std::string &, SparseVector<weight_t> *);
+
+#endif  // _B64FEATVECTOR_H_
diff --git a/utils/b64tools.h b/utils/b64tools.h
index c821fc8f..130a9102 100644
--- a/utils/b64tools.h
+++ b/utils/b64tools.h
@@ -1,5 +1,5 @@
-#ifndef _B64_TOOLS_H_
-#define _B64_TOOLS_H_
+#ifndef B64_TOOLS_H_
+#define B64_TOOLS_H_
 
 namespace B64 {
   bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize);
diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h
index f6699d87..3ccaf6ef 100644
--- a/utils/corpus_tools.h
+++ b/utils/corpus_tools.h
@@ -1,5 +1,5 @@
-#ifndef _CORPUS_TOOLS_H_
-#define _CORPUS_TOOLS_H_
+#ifndef CORPUS_TOOLS_H_
+#define CORPUS_TOOLS_H_
 
 #include <string>
 #include <set>
diff --git a/utils/exp_semiring.h b/utils/exp_semiring.h
index 26a22071..164286e3 100644
--- a/utils/exp_semiring.h
+++ b/utils/exp_semiring.h
@@ -1,5 +1,5 @@
-#ifndef _EXP_SEMIRING_H_
-#define _EXP_SEMIRING_H_
+#ifndef EXP_SEMIRING_H_
+#define EXP_SEMIRING_H_
 
 #include <iostream>
 #include "star.h"
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 6e2a77cd..1e0ab428 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -1,5 +1,5 @@
-#ifndef _FAST_SPARSE_VECTOR_H_
-#define _FAST_SPARSE_VECTOR_H_
+#ifndef FAST_SPARSE_VECTOR_H_
+#define FAST_SPARSE_VECTOR_H_
 
 // FastSparseVector<T> is a integer indexed unordered map that supports very fast
 // (mathematical) vector operations when the sizes are very small, and reasonably
diff --git a/utils/fdict.h b/utils/fdict.h
index eb853fb2..94763890 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -1,5 +1,5 @@
-#ifndef _FDICT_H_
-#define _FDICT_H_
+#ifndef FDICT_H_
+#define FDICT_H_
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
diff --git a/utils/feature_vector.h b/utils/feature_vector.h
index a7b61a66..bf77b5ac 100644
--- a/utils/feature_vector.h
+++ b/utils/feature_vector.h
@@ -1,5 +1,5 @@
-#ifndef _FEATURE_VECTOR_H_
-#define _FEATURE_VECTOR_H_
+#ifndef FEATURE_VECTOR_H_
+#define FEATURE_VECTOR_H_
 
 #include <vector>
 #include "sparse_vector.h"
diff --git a/utils/filelib.h b/utils/filelib.h
index 4fa69760..90620d05 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -1,5 +1,5 @@
-#ifndef _FILELIB_H_
-#define _FILELIB_H_
+#ifndef FILELIB_H_
+#define FILELIB_H_
 
 #include <cassert>
 #include <string>
diff --git a/utils/kernel_string_subseq.h b/utils/kernel_string_subseq.h
index 516e8b89..00ee7da7 100644
--- a/utils/kernel_string_subseq.h
+++ b/utils/kernel_string_subseq.h
@@ -1,5 +1,5 @@
-#ifndef _KERNEL_STRING_SUBSEQ_H_
-#define _KERNEL_STRING_SUBSEQ_H_
+#ifndef KERNEL_STRING_SUBSEQ_H_
+#define KERNEL_STRING_SUBSEQ_H_
 
 #include <vector>
 #include <cmath>
diff --git a/utils/lbfgs.cpp b/utils/lbfgs.cpp
deleted file mode 100644
index bd26f048..00000000
--- a/utils/lbfgs.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-#include <vector>
-#include <iostream>
-#include <cmath>
-#include <stdio.h>
-#include "mathvec.h"
-#include "lbfgs.h"
-#include "maxent.h"
-
-using namespace std;
-
-const static int M = LBFGS_M;
-const static double LINE_SEARCH_ALPHA = 0.1;
-const static double LINE_SEARCH_BETA = 0.5;
-
-// stopping criteria
-int LBFGS_MAX_ITER = 300;
-const static double MIN_GRAD_NORM = 0.0001;
-
-double ME_Model::backtracking_line_search(const Vec& x0, const Vec& grad0,
-                                          const double f0, const Vec& dx,
-                                          Vec& x, Vec& grad1) {
-  double t = 1.0 / LINE_SEARCH_BETA;
-
-  double f;
-  do {
-    t *= LINE_SEARCH_BETA;
-    x = x0 + t * dx;
-    f = FunctionGradient(x.STLVec(), grad1.STLVec());
-    //        cout << "*";
-  } while (f > f0 + LINE_SEARCH_ALPHA * t * dot_product(dx, grad0));
-
-  return f;
-}
-
-//
-// Jorge Nocedal, "Updating Quasi-Newton Matrices With Limited Storage",
-// Mathematics of Computation, Vol. 35, No. 151, pp. 773-782, 1980.
-//
-Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[],
-                   const Vec y[], const double z[]) {
-  int offset, bound;
-  if (iter <= M) {
-    offset = 0;
-    bound = iter;
-  } else {
-    offset = iter - M;
-    bound = M;
-  }
-
-  Vec q = grad;
-  double alpha[M], beta[M];
-  for (int i = bound - 1; i >= 0; i--) {
-    const int j = (i + offset) % M;
-    alpha[i] = z[j] * dot_product(s[j], q);
-    q += -alpha[i] * y[j];
-  }
-  if (iter > 0) {
-    const int j = (iter - 1) % M;
-    const double gamma = ((1.0 / z[j]) / dot_product(y[j], y[j]));
-    //    static double gamma;
-    //    if (gamma == 0) gamma = ((1.0 / z[j]) / dot_product(y[j], y[j]));
-    q *= gamma;
-  }
-  for (int i = 0; i <= bound - 1; i++) {
-    const int j = (i + offset) % M;
-    beta[i] = z[j] * dot_product(y[j], q);
-    q += s[j] * (alpha[i] - beta[i]);
-  }
-
-  return q;
-}
-
-vector<double> ME_Model::perform_LBFGS(const vector<double>& x0) {
-  const size_t dim = x0.size();
-  Vec x = x0;
-
-  Vec grad(dim), dx(dim);
-  double f = FunctionGradient(x.STLVec(), grad.STLVec());
-
-  Vec s[M], y[M];
-  double z[M];  // rho
-
-  for (int iter = 0; iter < LBFGS_MAX_ITER; iter++) {
-
-    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);
-    if (_nheldout > 0) {
-      const double heldout_logl = heldout_likelihood();
-      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
-              _heldout_error);
-    }
-    fprintf(stderr, "\n");
-
-    if (sqrt(dot_product(grad, grad)) < MIN_GRAD_NORM) break;
-
-    dx = -1 * approximate_Hg(iter, grad, s, y, z);
-
-    Vec x1(dim), grad1(dim);
-    f = backtracking_line_search(x, grad, f, dx, x1, grad1);
-
-    s[iter % M] = x1 - x;
-    y[iter % M] = grad1 - grad;
-    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]);
-    x = x1;
-    grad = grad1;
-  }
-
-  return x.STLVec();
-}
diff --git a/utils/lbfgs.h b/utils/lbfgs.h
deleted file mode 100644
index 4d706f7a..00000000
--- a/utils/lbfgs.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _LBFGS_H_
-#define _LBFGS_H_
-
-#include <vector>
-
-// template<class FuncGrad>
-// std::vector<double>
-// perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0);
-
-std::vector<double> perform_LBFGS(
-    double (*func_grad)(const std::vector<double> &, std::vector<double> &),
-    const std::vector<double> &x0);
-
-std::vector<double> perform_OWLQN(
-    double (*func_grad)(const std::vector<double> &, std::vector<double> &),
-    const std::vector<double> &x0, const double C);
-
-const int LBFGS_M = 10;
-
-#endif
diff --git a/utils/m.h b/utils/m.h
index dc881b36..bd82c305 100644
--- a/utils/m.h
+++ b/utils/m.h
@@ -1,5 +1,5 @@
-#ifndef _M_H_
-#define _M_H_
+#ifndef M_H_HEADER_
+#define M_H_HEADER_
 
 #include <cassert>
 #include <cmath>
diff --git a/utils/mathvec.h b/utils/mathvec.h
deleted file mode 100644
index f8c60e5d..00000000
--- a/utils/mathvec.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef _MATH_VECTOR_H_
-#define _MATH_VECTOR_H_
-
-#include <vector>
-#include <iostream>
-#include <cassert>
-
-class Vec {
- private:
-  std::vector<double> _v;
-
- public:
-  Vec(const size_t n = 0, const double val = 0) { _v.resize(n, val); }
-  Vec(const std::vector<double>& v) : _v(v) {}
-  const std::vector<double>& STLVec() const { return _v; }
-  std::vector<double>& STLVec() { return _v; }
-  size_t Size() const { return _v.size(); }
-  double& operator[](int i) { return _v[i]; }
-  const double& operator[](int i) const { return _v[i]; }
-  Vec& operator+=(const Vec& b) {
-    assert(b.Size() == _v.size());
-    for (size_t i = 0; i < _v.size(); i++) {
-      _v[i] += b[i];
-    }
-    return *this;
-  }
-  Vec& operator*=(const double c) {
-    for (size_t i = 0; i < _v.size(); i++) {
-      _v[i] *= c;
-    }
-    return *this;
-  }
-  void Project(const Vec& y) {
-    for (size_t i = 0; i < _v.size(); i++) {
-      //      if (sign(_v[i]) != sign(y[i])) _v[i] = 0;
-      if (_v[i] * y[i] <= 0) _v[i] = 0;
-    }
-  }
-};
-
-inline double dot_product(const Vec& a, const Vec& b) {
-  double sum = 0;
-  for (size_t i = 0; i < a.Size(); i++) {
-    sum += a[i] * b[i];
-  }
-  return sum;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Vec& a) {
-  s << "(";
-  for (size_t i = 0; i < a.Size(); i++) {
-    if (i != 0) s << ", ";
-    s << a[i];
-  }
-  s << ")";
-  return s;
-}
-
-inline const Vec operator+(const Vec& a, const Vec& b) {
-  Vec v(a.Size());
-  assert(a.Size() == b.Size());
-  for (size_t i = 0; i < a.Size(); i++) {
-    v[i] = a[i] + b[i];
-  }
-  return v;
-}
-
-inline const Vec operator-(const Vec& a, const Vec& b) {
-  Vec v(a.Size());
-  assert(a.Size() == b.Size());
-  for (size_t i = 0; i < a.Size(); i++) {
-    v[i] = a[i] - b[i];
-  }
-  return v;
-}
-
-inline const Vec operator*(const Vec& a, const double c) {
-  Vec v(a.Size());
-  for (size_t i = 0; i < a.Size(); i++) {
-    v[i] = a[i] * c;
-  }
-  return v;
-}
-
-inline const Vec operator*(const double c, const Vec& a) { return a * c; }
-
-#endif
diff --git a/utils/maxent.cpp b/utils/maxent.cpp
index 0f49ee9d..fd772e08 100644
--- a/utils/maxent.cpp
+++ b/utils/maxent.cpp
@@ -3,12 +3,15 @@
  */
 
 #include "maxent.h"
+
+#include <vector>
+#include <iostream>
 #include <cmath>
 #include <cstdio>
-#include "lbfgs.h"
 
 using namespace std;
 
+namespace maxent {
 double ME_Model::FunctionGradient(const vector<double>& x,
                                   vector<double>& grad) {
   assert((int)_fb.Size() == x.size());
@@ -601,6 +604,428 @@ vector<double> ME_Model::classify(ME_Sample& mes) const {
   return vp;
 }
 
+// template<class FuncGrad>
+// std::vector<double>
+// perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0);
+
+std::vector<double> perform_LBFGS(
+    double (*func_grad)(const std::vector<double> &, std::vector<double> &),
+    const std::vector<double> &x0);
+
+std::vector<double> perform_OWLQN(
+    double (*func_grad)(const std::vector<double> &, std::vector<double> &),
+    const std::vector<double> &x0, const double C);
+
+const int LBFGS_M = 10;
+
+const static int M = LBFGS_M;
+const static double LINE_SEARCH_ALPHA = 0.1;
+const static double LINE_SEARCH_BETA = 0.5;
+
+// stopping criteria
+int LBFGS_MAX_ITER = 300;
+const static double MIN_GRAD_NORM = 0.0001;
+
+// LBFGS
+
+double ME_Model::backtracking_line_search(const Vec& x0, const Vec& grad0,
+                                          const double f0, const Vec& dx,
+                                          Vec& x, Vec& grad1) {
+  double t = 1.0 / LINE_SEARCH_BETA;
+
+  double f;
+  do {
+    t *= LINE_SEARCH_BETA;
+    x = x0 + t * dx;
+    f = FunctionGradient(x.STLVec(), grad1.STLVec());
+    //        cout << "*";
+  } while (f > f0 + LINE_SEARCH_ALPHA * t * dot_product(dx, grad0));
+
+  return f;
+}
+
+//
+// Jorge Nocedal, "Updating Quasi-Newton Matrices With Limited Storage",
+// Mathematics of Computation, Vol. 35, No. 151, pp. 773-782, 1980.
+//
+Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[],
+                   const Vec y[], const double z[]) {
+  int offset, bound;
+  if (iter <= M) {
+    offset = 0;
+    bound = iter;
+  } else {
+    offset = iter - M;
+    bound = M;
+  }
+
+  Vec q = grad;
+  double alpha[M], beta[M];
+  for (int i = bound - 1; i >= 0; i--) {
+    const int j = (i + offset) % M;
+    alpha[i] = z[j] * dot_product(s[j], q);
+    q += -alpha[i] * y[j];
+  }
+  if (iter > 0) {
+    const int j = (iter - 1) % M;
+    const double gamma = ((1.0 / z[j]) / dot_product(y[j], y[j]));
+    //    static double gamma;
+    //    if (gamma == 0) gamma = ((1.0 / z[j]) / dot_product(y[j], y[j]));
+    q *= gamma;
+  }
+  for (int i = 0; i <= bound - 1; i++) {
+    const int j = (i + offset) % M;
+    beta[i] = z[j] * dot_product(y[j], q);
+    q += s[j] * (alpha[i] - beta[i]);
+  }
+
+  return q;
+}
+
+vector<double> ME_Model::perform_LBFGS(const vector<double>& x0) {
+  const size_t dim = x0.size();
+  Vec x = x0;
+
+  Vec grad(dim), dx(dim);
+  double f = FunctionGradient(x.STLVec(), grad.STLVec());
+
+  Vec s[M], y[M];
+  double z[M];  // rho
+
+  for (int iter = 0; iter < LBFGS_MAX_ITER; iter++) {
+
+    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);
+    if (_nheldout > 0) {
+      const double heldout_logl = heldout_likelihood();
+      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
+              _heldout_error);
+    }
+    fprintf(stderr, "\n");
+
+    if (sqrt(dot_product(grad, grad)) < MIN_GRAD_NORM) break;
+
+    dx = -1 * approximate_Hg(iter, grad, s, y, z);
+
+    Vec x1(dim), grad1(dim);
+    f = backtracking_line_search(x, grad, f, dx, x1, grad1);
+
+    s[iter % M] = x1 - x;
+    y[iter % M] = grad1 - grad;
+    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]);
+    x = x1;
+    grad = grad1;
+  }
+
+  return x.STLVec();
+}
+
+// OWLQN
+
+// stopping criteria
+int OWLQN_MAX_ITER = 300;
+
+Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[],
+                   const Vec y[], const double z[]);
+
+inline int sign(double x) {
+  if (x > 0) return 1;
+  if (x < 0) return -1;
+  return 0;
+};
+
+static Vec pseudo_gradient(const Vec& x, const Vec& grad0, const double C) {
+  Vec grad = grad0;
+  for (size_t i = 0; i < x.Size(); i++) {
+    if (x[i] != 0) {
+      grad[i] += C * sign(x[i]);
+      continue;
+    }
+    const double gm = grad0[i] - C;
+    if (gm > 0) {
+      grad[i] = gm;
+      continue;
+    }
+    const double gp = grad0[i] + C;
+    if (gp < 0) {
+      grad[i] = gp;
+      continue;
+    }
+    grad[i] = 0;
+  }
+
+  return grad;
+}
+
+double ME_Model::regularized_func_grad(const double C, const Vec& x,
+                                       Vec& grad) {
+  double f = FunctionGradient(x.STLVec(), grad.STLVec());
+  for (size_t i = 0; i < x.Size(); i++) {
+    f += C * fabs(x[i]);
+  }
+
+  return f;
+}
+
+double ME_Model::constrained_line_search(double C, const Vec& x0,
+                                         const Vec& grad0, const double f0,
+                                         const Vec& dx, Vec& x, Vec& grad1) {
+  // compute the orthant to explore
+  Vec orthant = x0;
+  for (size_t i = 0; i < orthant.Size(); i++) {
+    if (orthant[i] == 0) orthant[i] = -grad0[i];
+  }
+
+  double t = 1.0 / LINE_SEARCH_BETA;
+
+  double f;
+  do {
+    t *= LINE_SEARCH_BETA;
+    x = x0 + t * dx;
+    x.Project(orthant);
+    //    for (size_t i = 0; i < x.Size(); i++) {
+    //      if (x0[i] != 0 && sign(x[i]) != sign(x0[i])) x[i] = 0;
+    //    }
+
+    f = regularized_func_grad(C, x, grad1);
+    //        cout << "*";
+  } while (f > f0 + LINE_SEARCH_ALPHA * dot_product(x - x0, grad0));
+
+  return f;
+}
+
+vector<double> ME_Model::perform_OWLQN(const vector<double>& x0,
+                                       const double C) {
+  const size_t dim = x0.size();
+  Vec x = x0;
+
+  Vec grad(dim), dx(dim);
+  double f = regularized_func_grad(C, x, grad);
+
+  Vec s[M], y[M];
+  double z[M];  // rho
+
+  for (int iter = 0; iter < OWLQN_MAX_ITER; iter++) {
+    Vec pg = pseudo_gradient(x, grad, C);
+
+    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);
+    if (_nheldout > 0) {
+      const double heldout_logl = heldout_likelihood();
+      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
+              _heldout_error);
+    }
+    fprintf(stderr, "\n");
+
+    if (sqrt(dot_product(pg, pg)) < MIN_GRAD_NORM) break;
+
+    dx = -1 * approximate_Hg(iter, pg, s, y, z);
+    if (dot_product(dx, pg) >= 0) dx.Project(-1 * pg);
+
+    Vec x1(dim), grad1(dim);
+    f = constrained_line_search(C, x, pg, f, dx, x1, grad1);
+
+    s[iter % M] = x1 - x;
+    y[iter % M] = grad1 - grad;
+    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]);
+
+    x = x1;
+    grad = grad1;
+  }
+
+  return x.STLVec();
+}
+
+// SGD
+
+// const double SGD_ETA0 = 1;
+// const double SGD_ITER = 30;
+// const double SGD_ALPHA = 0.85;
+
+//#define FOLOS_NAIVE
+//#define FOLOS_LAZY
+#define SGD_CP
+
+inline void apply_l1_penalty(const int i, const double u, vector<double>& _vl,
+                             vector<double>& q) {
+  double& w = _vl[i];
+  const double z = w;
+  double& qi = q[i];
+  if (w > 0) {
+    w = max(0.0, w - (u + qi));
+  } else if (w < 0) {
+    w = min(0.0, w + (u - qi));
+  }
+  qi += w - z;
+}
+
+static double l1norm(const vector<double>& v) {
+  double sum = 0;
+  for (size_t i = 0; i < v.size(); i++) sum += abs(v[i]);
+  return sum;
+}
+
+inline void update_folos_lazy(const int iter_sample, const int k,
+                              vector<double>& _vl,
+                              const vector<double>& sum_eta,
+                              vector<int>& last_updated) {
+  const double penalty = sum_eta[iter_sample] - sum_eta[last_updated[k]];
+  double& x = _vl[k];
+  if (x > 0)
+    x = max(0.0, x - penalty);
+  else
+    x = min(0.0, x + penalty);
+  last_updated[k] = iter_sample;
+}
+
+int ME_Model::perform_SGD() {
+  if (_l2reg > 0) {
+    cerr << "error: L2 regularization is currently not supported in SGD mode."
+         << endl;
+    exit(1);
+  }
+
+  cerr << "performing SGD" << endl;
+
+  const double l1param = _l1reg;
+
+  const int d = _fb.Size();
+
+  vector<int> ri(_vs.size());
+  for (size_t i = 0; i < ri.size(); i++) ri[i] = i;
+
+  vector<double> grad(d);
+  int iter_sample = 0;
+  const double eta0 = SGD_ETA0;
+
+  //  cerr << "l1param = " << l1param << endl;
+  cerr << "eta0 = " << eta0 << " alpha = " << SGD_ALPHA << endl;
+
+  double u = 0;
+  vector<double> q(d, 0);
+  vector<int> last_updated(d, 0);
+  vector<double> sum_eta;
+  sum_eta.push_back(0);
+
+  for (int iter = 0; iter < SGD_ITER; iter++) {
+
+    random_shuffle(ri.begin(), ri.end());
+
+    double logl = 0;
+    int ncorrect = 0, ntotal = 0;
+    for (size_t i = 0; i < _vs.size(); i++, ntotal++, iter_sample++) {
+      const Sample& s = _vs[ri[i]];
+
+#ifdef FOLOS_LAZY
+      for (vector<int>::const_iterator j = s.positive_features.begin();
+           j != s.positive_features.end(); j++) {
+        for (vector<int>::const_iterator k = _feature2mef[*j].begin();
+             k != _feature2mef[*j].end(); k++) {
+          update_folos_lazy(iter_sample, *k, _vl, sum_eta, last_updated);
+        }
+      }
+#endif
+
+      vector<double> membp(_num_classes);
+      const int max_label = conditional_probability(s, membp);
+
+      const double eta =
+          eta0 * pow(SGD_ALPHA,
+                     (double)iter_sample / _vs.size());  // exponential decay
+      //      const double eta = eta0 / (1.0 + (double)iter_sample /
+      // _vs.size());
+
+      //      if (iter_sample % _vs.size() == 0) cerr << "eta = " << eta <<
+      // endl;
+      u += eta * l1param;
+
+      sum_eta.push_back(sum_eta.back() + eta * l1param);
+
+      logl += log(membp[s.label]);
+      if (max_label == s.label) ncorrect++;
+
+      // binary features
+      for (vector<int>::const_iterator j = s.positive_features.begin();
+           j != s.positive_features.end(); j++) {
+        for (vector<int>::const_iterator k = _feature2mef[*j].begin();
+             k != _feature2mef[*j].end(); k++) {
+          const double me = membp[_fb.Feature(*k).label()];
+          const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0);
+          const double grad = (me - ee);
+          _vl[*k] -= eta * grad;
+#ifdef SGD_CP
+          apply_l1_penalty(*k, u, _vl, q);
+#endif
+        }
+      }
+      // real-valued features
+      for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin();
+           j != s.rvfeatures.end(); j++) {
+        for (vector<int>::const_iterator k = _feature2mef[j->first].begin();
+             k != _feature2mef[j->first].end(); k++) {
+          const double me = membp[_fb.Feature(*k).label()];
+          const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0);
+          const double grad = (me - ee) * j->second;
+          _vl[*k] -= eta * grad;
+#ifdef SGD_CP
+          apply_l1_penalty(*k, u, _vl, q);
+#endif
+        }
+      }
+
+#ifdef FOLOS_NAIVE
+      for (size_t j = 0; j < d; j++) {
+        double& x = _vl[j];
+        if (x > 0)
+          x = max(0.0, x - eta * l1param);
+        else
+          x = min(0.0, x + eta * l1param);
+      }
+#endif
+    }
+    logl /= _vs.size();
+//    fprintf(stderr, "%4d logl = %8.3f acc = %6.4f ", iter, logl,
+// (double)ncorrect / ntotal);
+
+#ifdef FOLOS_LAZY
+    if (l1param > 0) {
+      for (size_t j = 0; j < d; j++)
+        update_folos_lazy(iter_sample, j, _vl, sum_eta, last_updated);
+    }
+#endif
+
+    double f = logl;
+    if (l1param > 0) {
+      const double l1 =
+          l1norm(_vl);  // this is not accurate when lazy update is used
+      //      cerr << "f0 = " <<  update_model_expectation() - l1param * l1 << "
+      // ";
+      f -= l1param * l1;
+      int nonzero = 0;
+      for (int j = 0; j < d; j++)
+        if (_vl[j] != 0) nonzero++;
+      //      cerr << " f = " << f << " l1 = " << l1 << " nonzero_features = "
+      // << nonzero << endl;
+    }
+    //    fprintf(stderr, "%4d  obj = %7.3f acc = %6.4f", iter+1, f,
+    // (double)ncorrect/ntotal);
+    //    fprintf(stderr, "%4d  obj = %f", iter+1, f);
+    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, f,
+            1 - (double)ncorrect / ntotal);
+
+    if (_nheldout > 0) {
+      double heldout_logl = heldout_likelihood();
+      //      fprintf(stderr, "  heldout_logl = %f  acc = %6.4f\n",
+      // heldout_logl, 1 - _heldout_error);
+      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
+              _heldout_error);
+    }
+    fprintf(stderr, "\n");
+  }
+
+  return 0;
+}
+
+}  // namespace maxent
+
 /*
  * $Log: maxent.cpp,v $
  * Revision 1.1.1.1  2007/05/15 08:30:35  kyoshida
diff --git a/utils/maxent.h b/utils/maxent.h
index b1efd88e..74d13a6f 100644
--- a/utils/maxent.h
+++ b/utils/maxent.h
@@ -5,21 +5,95 @@
 #ifndef __MAXENT_H_
 #define __MAXENT_H_
 
-#include <string>
-#include <vector>
-#include <list>
-#include <map>
 #include <algorithm>
 #include <iostream>
+#include <list>
+#include <map>
 #include <string>
+#include <unordered_map>
+#include <vector>
+
 #include <cassert>
-#include "mathvec.h"
 
-#define USE_HASH_MAP  // if you encounter errors with hash, try commenting out
-                      // this line. (the program will be a bit slower, though)
-#ifdef USE_HASH_MAP
-#include <unordered_map>
-#endif
+namespace maxent {
+class Vec {
+ private:
+  std::vector<double> _v;
+
+ public:
+  Vec(const size_t n = 0, const double val = 0) { _v.resize(n, val); }
+  Vec(const std::vector<double>& v) : _v(v) {}
+  const std::vector<double>& STLVec() const { return _v; }
+  std::vector<double>& STLVec() { return _v; }
+  size_t Size() const { return _v.size(); }
+  double& operator[](int i) { return _v[i]; }
+  const double& operator[](int i) const { return _v[i]; }
+  Vec& operator+=(const Vec& b) {
+    assert(b.Size() == _v.size());
+    for (size_t i = 0; i < _v.size(); i++) {
+      _v[i] += b[i];
+    }
+    return *this;
+  }
+  Vec& operator*=(const double c) {
+    for (size_t i = 0; i < _v.size(); i++) {
+      _v[i] *= c;
+    }
+    return *this;
+  }
+  void Project(const Vec& y) {
+    for (size_t i = 0; i < _v.size(); i++) {
+      //      if (sign(_v[i]) != sign(y[i])) _v[i] = 0;
+      if (_v[i] * y[i] <= 0) _v[i] = 0;
+    }
+  }
+};
+
+inline double dot_product(const Vec& a, const Vec& b) {
+  double sum = 0;
+  for (size_t i = 0; i < a.Size(); i++) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+inline std::ostream& operator<<(std::ostream& s, const Vec& a) {
+  s << "(";
+  for (size_t i = 0; i < a.Size(); i++) {
+    if (i != 0) s << ", ";
+    s << a[i];
+  }
+  s << ")";
+  return s;
+}
+
+inline const Vec operator+(const Vec& a, const Vec& b) {
+  Vec v(a.Size());
+  assert(a.Size() == b.Size());
+  for (size_t i = 0; i < a.Size(); i++) {
+    v[i] = a[i] + b[i];
+  }
+  return v;
+}
+
+inline const Vec operator-(const Vec& a, const Vec& b) {
+  Vec v(a.Size());
+  assert(a.Size() == b.Size());
+  for (size_t i = 0; i < a.Size(); i++) {
+    v[i] = a[i] - b[i];
+  }
+  return v;
+}
+
+inline const Vec operator*(const Vec& a, const double c) {
+  Vec v(a.Size());
+  for (size_t i = 0; i < a.Size(); i++) {
+    v[i] = a[i] * c;
+  }
+  return v;
+}
+
+inline const Vec operator*(const double c, const Vec& a) { return a * c; }
 
 //
 // data format for each sample for training/testing
@@ -309,6 +383,7 @@ class ME_Model {
   static double FunctionGradientWrapper(const std::vector<double>& x,
                                         std::vector<double>& grad);
 };
+}  // namespace maxent
 
 #endif
 
diff --git a/utils/murmur_hash3.h b/utils/murmur_hash3.h
index a125d775..e8a8b10b 100644
--- a/utils/murmur_hash3.h
+++ b/utils/murmur_hash3.h
@@ -2,8 +2,8 @@
 // MurmurHash3 was written by Austin Appleby, and is placed in the public
 // domain. The author hereby disclaims copyright to this source code.
 
-#ifndef _MURMURHASH3_H_
-#define _MURMURHASH3_H_
+#ifndef MURMURHASH3_H_
+#define MURMURHASH3_H_
 
 //-----------------------------------------------------------------------------
 // Platform-specific functions and macros
diff --git a/utils/owlqn.cpp b/utils/owlqn.cpp
deleted file mode 100644
index c3a0f0da..00000000
--- a/utils/owlqn.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-#include <vector>
-#include <iostream>
-#include <cmath>
-#include <stdio.h>
-#include "mathvec.h"
-#include "lbfgs.h"
-#include "maxent.h"
-
-using namespace std;
-
-const static int M = LBFGS_M;
-const static double LINE_SEARCH_ALPHA = 0.1;
-const static double LINE_SEARCH_BETA = 0.5;
-
-// stopping criteria
-int OWLQN_MAX_ITER = 300;
-const static double MIN_GRAD_NORM = 0.0001;
-
-Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[],
-                   const Vec y[], const double z[]);
-
-inline int sign(double x) {
-  if (x > 0) return 1;
-  if (x < 0) return -1;
-  return 0;
-};
-
-static Vec pseudo_gradient(const Vec& x, const Vec& grad0, const double C) {
-  Vec grad = grad0;
-  for (size_t i = 0; i < x.Size(); i++) {
-    if (x[i] != 0) {
-      grad[i] += C * sign(x[i]);
-      continue;
-    }
-    const double gm = grad0[i] - C;
-    if (gm > 0) {
-      grad[i] = gm;
-      continue;
-    }
-    const double gp = grad0[i] + C;
-    if (gp < 0) {
-      grad[i] = gp;
-      continue;
-    }
-    grad[i] = 0;
-  }
-
-  return grad;
-}
-
-double ME_Model::regularized_func_grad(const double C, const Vec& x,
-                                       Vec& grad) {
-  double f = FunctionGradient(x.STLVec(), grad.STLVec());
-  for (size_t i = 0; i < x.Size(); i++) {
-    f += C * fabs(x[i]);
-  }
-
-  return f;
-}
-
-double ME_Model::constrained_line_search(double C, const Vec& x0,
-                                         const Vec& grad0, const double f0,
-                                         const Vec& dx, Vec& x, Vec& grad1) {
-  // compute the orthant to explore
-  Vec orthant = x0;
-  for (size_t i = 0; i < orthant.Size(); i++) {
-    if (orthant[i] == 0) orthant[i] = -grad0[i];
-  }
-
-  double t = 1.0 / LINE_SEARCH_BETA;
-
-  double f;
-  do {
-    t *= LINE_SEARCH_BETA;
-    x = x0 + t * dx;
-    x.Project(orthant);
-    //    for (size_t i = 0; i < x.Size(); i++) {
-    //      if (x0[i] != 0 && sign(x[i]) != sign(x0[i])) x[i] = 0;
-    //    }
-
-    f = regularized_func_grad(C, x, grad1);
-    //        cout << "*";
-  } while (f > f0 + LINE_SEARCH_ALPHA * dot_product(x - x0, grad0));
-
-  return f;
-}
-
-vector<double> ME_Model::perform_OWLQN(const vector<double>& x0,
-                                       const double C) {
-  const size_t dim = x0.size();
-  Vec x = x0;
-
-  Vec grad(dim), dx(dim);
-  double f = regularized_func_grad(C, x, grad);
-
-  Vec s[M], y[M];
-  double z[M];  // rho
-
-  for (int iter = 0; iter < OWLQN_MAX_ITER; iter++) {
-    Vec pg = pseudo_gradient(x, grad, C);
-
-    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);
-    if (_nheldout > 0) {
-      const double heldout_logl = heldout_likelihood();
-      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
-              _heldout_error);
-    }
-    fprintf(stderr, "\n");
-
-    if (sqrt(dot_product(pg, pg)) < MIN_GRAD_NORM) break;
-
-    dx = -1 * approximate_Hg(iter, pg, s, y, z);
-    if (dot_product(dx, pg) >= 0) dx.Project(-1 * pg);
-
-    Vec x1(dim), grad1(dim);
-    f = constrained_line_search(C, x, pg, f, dx, x1, grad1);
-
-    s[iter % M] = x1 - x;
-    y[iter % M] = grad1 - grad;
-    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]);
-
-    x = x1;
-    grad = grad1;
-  }
-
-  return x.STLVec();
-}
diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h
index 29ea48a9..8c12c9f0 100644
--- a/utils/perfect_hash.h
+++ b/utils/perfect_hash.h
@@ -1,5 +1,5 @@
-#ifndef _PERFECT_HASH_MAP_H_
-#define _PERFECT_HASH_MAP_H_
+#ifndef PERFECT_HASH_MAP_H_
+#define PERFECT_HASH_MAP_H_
 
 #include <vector>
 #include <boost/utility.hpp>
diff --git a/utils/prob.h b/utils/prob.h
index bc297870..32ba9a86 100644
--- a/utils/prob.h
+++ b/utils/prob.h
@@ -1,5 +1,5 @@
-#ifndef _PROB_H_
-#define _PROB_H_
+#ifndef PROB_H_
+#define PROB_H_
 
 #include "logval.h"
 
diff --git a/utils/sgd.cpp b/utils/sgd.cpp
deleted file mode 100644
index 8613edca..00000000
--- a/utils/sgd.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "maxent.h"
-#include <cmath>
-#include <stdio.h>
-
-using namespace std;
-
-// const double SGD_ETA0 = 1;
-// const double SGD_ITER = 30;
-// const double SGD_ALPHA = 0.85;
-
-//#define FOLOS_NAIVE
-//#define FOLOS_LAZY
-#define SGD_CP
-
-inline void apply_l1_penalty(const int i, const double u, vector<double>& _vl,
-                             vector<double>& q) {
-  double& w = _vl[i];
-  const double z = w;
-  double& qi = q[i];
-  if (w > 0) {
-    w = max(0.0, w - (u + qi));
-  } else if (w < 0) {
-    w = min(0.0, w + (u - qi));
-  }
-  qi += w - z;
-}
-
-static double l1norm(const vector<double>& v) {
-  double sum = 0;
-  for (size_t i = 0; i < v.size(); i++) sum += abs(v[i]);
-  return sum;
-}
-
-inline void update_folos_lazy(const int iter_sample, const int k,
-                              vector<double>& _vl,
-                              const vector<double>& sum_eta,
-                              vector<int>& last_updated) {
-  const double penalty = sum_eta[iter_sample] - sum_eta[last_updated[k]];
-  double& x = _vl[k];
-  if (x > 0)
-    x = max(0.0, x - penalty);
-  else
-    x = min(0.0, x + penalty);
-  last_updated[k] = iter_sample;
-}
-
-int ME_Model::perform_SGD() {
-  if (_l2reg > 0) {
-    cerr << "error: L2 regularization is currently not supported in SGD mode."
-         << endl;
-    exit(1);
-  }
-
-  cerr << "performing SGD" << endl;
-
-  const double l1param = _l1reg;
-
-  const int d = _fb.Size();
-
-  vector<int> ri(_vs.size());
-  for (size_t i = 0; i < ri.size(); i++) ri[i] = i;
-
-  vector<double> grad(d);
-  int iter_sample = 0;
-  const double eta0 = SGD_ETA0;
-
-  //  cerr << "l1param = " << l1param << endl;
-  cerr << "eta0 = " << eta0 << " alpha = " << SGD_ALPHA << endl;
-
-  double u = 0;
-  vector<double> q(d, 0);
-  vector<int> last_updated(d, 0);
-  vector<double> sum_eta;
-  sum_eta.push_back(0);
-
-  for (int iter = 0; iter < SGD_ITER; iter++) {
-
-    random_shuffle(ri.begin(), ri.end());
-
-    double logl = 0;
-    int ncorrect = 0, ntotal = 0;
-    for (size_t i = 0; i < _vs.size(); i++, ntotal++, iter_sample++) {
-      const Sample& s = _vs[ri[i]];
-
-#ifdef FOLOS_LAZY
-      for (vector<int>::const_iterator j = s.positive_features.begin();
-           j != s.positive_features.end(); j++) {
-        for (vector<int>::const_iterator k = _feature2mef[*j].begin();
-             k != _feature2mef[*j].end(); k++) {
-          update_folos_lazy(iter_sample, *k, _vl, sum_eta, last_updated);
-        }
-      }
-#endif
-
-      vector<double> membp(_num_classes);
-      const int max_label = conditional_probability(s, membp);
-
-      const double eta =
-          eta0 * pow(SGD_ALPHA,
-                     (double)iter_sample / _vs.size());  // exponential decay
-      //      const double eta = eta0 / (1.0 + (double)iter_sample /
-      // _vs.size());
-
-      //      if (iter_sample % _vs.size() == 0) cerr << "eta = " << eta <<
-      // endl;
-      u += eta * l1param;
-
-      sum_eta.push_back(sum_eta.back() + eta * l1param);
-
-      logl += log(membp[s.label]);
-      if (max_label == s.label) ncorrect++;
-
-      // binary features
-      for (vector<int>::const_iterator j = s.positive_features.begin();
-           j != s.positive_features.end(); j++) {
-        for (vector<int>::const_iterator k = _feature2mef[*j].begin();
-             k != _feature2mef[*j].end(); k++) {
-          const double me = membp[_fb.Feature(*k).label()];
-          const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0);
-          const double grad = (me - ee);
-          _vl[*k] -= eta * grad;
-#ifdef SGD_CP
-          apply_l1_penalty(*k, u, _vl, q);
-#endif
-        }
-      }
-      // real-valued features
-      for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin();
-           j != s.rvfeatures.end(); j++) {
-        for (vector<int>::const_iterator k = _feature2mef[j->first].begin();
-             k != _feature2mef[j->first].end(); k++) {
-          const double me = membp[_fb.Feature(*k).label()];
-          const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0);
-          const double grad = (me - ee) * j->second;
-          _vl[*k] -= eta * grad;
-#ifdef SGD_CP
-          apply_l1_penalty(*k, u, _vl, q);
-#endif
-        }
-      }
-
-#ifdef FOLOS_NAIVE
-      for (size_t j = 0; j < d; j++) {
-        double& x = _vl[j];
-        if (x > 0)
-          x = max(0.0, x - eta * l1param);
-        else
-          x = min(0.0, x + eta * l1param);
-      }
-#endif
-    }
-    logl /= _vs.size();
-//    fprintf(stderr, "%4d logl = %8.3f acc = %6.4f ", iter, logl,
-// (double)ncorrect / ntotal);
-
-#ifdef FOLOS_LAZY
-    if (l1param > 0) {
-      for (size_t j = 0; j < d; j++)
-        update_folos_lazy(iter_sample, j, _vl, sum_eta, last_updated);
-    }
-#endif
-
-    double f = logl;
-    if (l1param > 0) {
-      const double l1 =
-          l1norm(_vl);  // this is not accurate when lazy update is used
-      //      cerr << "f0 = " <<  update_model_expectation() - l1param * l1 << "
-      // ";
-      f -= l1param * l1;
-      int nonzero = 0;
-      for (int j = 0; j < d; j++)
-        if (_vl[j] != 0) nonzero++;
-      //      cerr << " f = " << f << " l1 = " << l1 << " nonzero_features = "
-      // << nonzero << endl;
-    }
-    //    fprintf(stderr, "%4d  obj = %7.3f acc = %6.4f", iter+1, f,
-    // (double)ncorrect/ntotal);
-    //    fprintf(stderr, "%4d  obj = %f", iter+1, f);
-    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, f,
-            1 - (double)ncorrect / ntotal);
-
-    if (_nheldout > 0) {
-      double heldout_logl = heldout_likelihood();
-      //      fprintf(stderr, "  heldout_logl = %f  acc = %6.4f\n",
-      // heldout_logl, 1 - _heldout_error);
-      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
-              _heldout_error);
-    }
-    fprintf(stderr, "\n");
-  }
-
-  return 0;
-}
diff --git a/utils/small_vector.h b/utils/small_vector.h
index 280ab72c..f16bc898 100644
--- a/utils/small_vector.h
+++ b/utils/small_vector.h
@@ -1,5 +1,5 @@
-#ifndef _SMALL_VECTOR_H_
-#define _SMALL_VECTOR_H_
+#ifndef SMALL_VECTOR_H_
+#define SMALL_VECTOR_H_
 
 /* REQUIRES that T is POD (can be memcpy).  won't work (yet) due to union with SMALL_VECTOR_POD==0 - may be possible to handle movable types that have ctor/dtor, by using  explicit allocation, ctor/dtor calls.  but for now JUST USE THIS FOR no-meaningful ctor/dtor POD types.
 
@@ -15,6 +15,7 @@
 #include <new>
 #include <stdint.h>
 #include <boost/functional/hash.hpp>
+#include <boost/serialization/map.hpp>
 
 //sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1
 
@@ -297,6 +298,21 @@ public:
     return hash_range(data_.ptr,data_.ptr+size_);
   }
 
+  template<class Archive>
+  void save(Archive & ar, const unsigned int) const {
+    ar & size_;
+    for (unsigned i = 0; i < size_; ++i)
+      ar & (*this)[i];
+  }
+  template<class Archive>
+  void load(Archive & ar, const unsigned int) {
+    uint16_t s;
+    ar & s;
+    this->resize(s);
+    for (unsigned i = 0; i < size_; ++i)
+      ar & (*this)[i];
+  }
+  BOOST_SERIALIZATION_SPLIT_MEMBER()
  private:
   union StorageType {
     T vals[SV_MAX];
diff --git a/utils/small_vector_test.cc b/utils/small_vector_test.cc
index a4eb89ae..9e1a148d 100644
--- a/utils/small_vector_test.cc
+++ b/utils/small_vector_test.cc
@@ -3,6 +3,10 @@
 #define BOOST_TEST_MODULE svTest
 #include <boost/test/unit_test.hpp>
 #include <boost/test/floating_point_comparison.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <string>
+#include <sstream>
 #include <iostream>
 #include <vector>
 
@@ -128,3 +132,29 @@ BOOST_AUTO_TEST_CASE(Small) {
   cerr << sizeof(SmallVectorInt) << endl;
   cerr << sizeof(vector<int>) << endl;
 }
+
+BOOST_AUTO_TEST_CASE(Serialize) {
+  std::string in;
+  {
+    SmallVectorInt v;
+    v.push_back(0);
+    v.push_back(1);
+    v.push_back(-2);
+    ostringstream os;
+    boost::archive::text_oarchive oa(os);
+    oa << v;
+    in = os.str();
+    cerr << in;
+  }
+  {
+    istringstream is(in);
+    boost::archive::text_iarchive ia(is);
+    SmallVectorInt v;
+    ia >> v;
+    BOOST_CHECK_EQUAL(v.size(), 3);
+    BOOST_CHECK_EQUAL(v[0], 0);
+    BOOST_CHECK_EQUAL(v[1], 1);
+    BOOST_CHECK_EQUAL(v[2], -2);
+  }
+}
+
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index 049151f7..13601376 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -1,5 +1,5 @@
-#ifndef _SPARSE_VECTOR_H_
-#define _SPARSE_VECTOR_H_
+#ifndef SPARSE_VECTOR_H_
+#define SPARSE_VECTOR_H_
 
 #include "fast_sparse_vector.h"
 #define SparseVector FastSparseVector
diff --git a/utils/star.h b/utils/star.h
index 21977dc9..01433d12 100644
--- a/utils/star.h
+++ b/utils/star.h
@@ -1,5 +1,5 @@
-#ifndef _STAR_H_
-#define _STAR_H_
+#ifndef STAR_H_
+#define STAR_H_
 
 // star(x) computes the infinite sum x^0 + x^1 + x^2 + ...
 
diff --git a/utils/sv_test.cc b/utils/sv_test.cc
index 67df8c57..b006e66d 100644
--- a/utils/sv_test.cc
+++ b/utils/sv_test.cc
@@ -1,7 +1,12 @@
 #define BOOST_TEST_MODULE WeightsTest
 #include <boost/test/unit_test.hpp>
 #include <boost/test/floating_point_comparison.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <sstream>
+#include <string>
 #include "sparse_vector.h"
+#include "fdict.h"
 
 using namespace std;
 
@@ -33,3 +38,29 @@ BOOST_AUTO_TEST_CASE(Division) {
   x /= -1;
   BOOST_CHECK(x == y);
 }
+
+BOOST_AUTO_TEST_CASE(Serialization) {
+  string arc;
+  FD::dict_.clear();
+  {
+    SparseVector<double> x;
+    x.set_value(FD::Convert("Feature1"), 1.0);
+    x.set_value(FD::Convert("Pi"), 3.14);
+    ostringstream os;
+    boost::archive::text_oarchive oa(os);
+    oa << x;
+    arc = os.str();
+  }
+  FD::dict_.clear();
+  FD::Convert("SomeNewString");
+  {
+    SparseVector<double> x;
+    istringstream is(arc);
+    boost::archive::text_iarchive ia(is);
+    ia >> x;
+    cerr << x << endl;
+    BOOST_CHECK_CLOSE(x.get(FD::Convert("Pi")), 3.14, 1e-9);
+    BOOST_CHECK_CLOSE(x.get(FD::Convert("Feature1")), 1.0, 1e-9);
+  }
+}
+
diff --git a/utils/tdict.h b/utils/tdict.h
index bb19ecd5..eed33c3a 100644
--- a/utils/tdict.h
+++ b/utils/tdict.h
@@ -1,5 +1,5 @@
-#ifndef _TDICT_H_
-#define _TDICT_H_
+#ifndef TDICT_H_
+#define TDICT_H_
 
 #include <string>
 #include <vector>
diff --git a/utils/timing_stats.h b/utils/timing_stats.h
index 0a9f7656..69a1cf4b 100644
--- a/utils/timing_stats.h
+++ b/utils/timing_stats.h
@@ -1,5 +1,5 @@
-#ifndef _TIMING_STATS_H_
-#define _TIMING_STATS_H_
+#ifndef TIMING_STATS_H_
+#define TIMING_STATS_H_
 
 #include <string>
 #include <map>
diff --git a/utils/verbose.h b/utils/verbose.h
index 73476383..e39e23cb 100644
--- a/utils/verbose.h
+++ b/utils/verbose.h
@@ -1,5 +1,5 @@
-#ifndef _VERBOSE_H_
-#define _VERBOSE_H_
+#ifndef VERBOSE_H_
+#define VERBOSE_H_
 
 extern bool SILENT;
 
diff --git a/utils/weights.h b/utils/weights.h
index 920fdd75..0bd4c2d9 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -1,5 +1,5 @@
-#ifndef _WEIGHTS_H_
-#define _WEIGHTS_H_
+#ifndef WEIGHTS_H_
+#define WEIGHTS_H_
 
 #include <string>
 #include <vector>
diff --git a/utils/wordid.h b/utils/wordid.h
index 714dcd0b..3aa6cc23 100644
--- a/utils/wordid.h
+++ b/utils/wordid.h
@@ -1,5 +1,5 @@
-#ifndef _WORD_ID_H_
-#define _WORD_ID_H_
+#ifndef WORD_ID_H_
+#define WORD_ID_H_
 
 #include <limits>