finalized merge

author: Patrick Simianer <p@simianer.de> 2011-10-20 02:31:25 +0200
committer: Patrick Simianer <p@simianer.de> 2011-10-20 02:31:25 +0200
commit: a5a92ebe23c5819ed104313426012011e32539da (patch)
tree: 3416818c758d5ece4e71fe522c571e75ea04f100 /utils
parent: b88332caac2cbe737c99b8098813f868ca876d8b (diff)
parent: 78baccbb4231bb84a456702d4f574f8e601a8182 (diff)
24 files changed, 1205 insertions, 157 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 94f9be30..df667655 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,5 +1,8 @@
-noinst_PROGRAMS = ts
-TESTS = ts
+
+bin_PROGRAMS = reconstruct_weights
+
+noinst_PROGRAMS = ts phmt
+TESTS = ts phmt
 
 if HAVE_GTEST
 noinst_PROGRAMS += \
@@ -11,6 +14,8 @@ noinst_PROGRAMS += \
 TESTS += small_vector_test logval_test weights_test dict_test
 endif
 
+reconstruct_weights_SOURCES = reconstruct_weights.cc
+
 noinst_LIBRARIES = libutils.a
 
 libutils_a_SOURCES = \
@@ -27,6 +32,11 @@ libutils_a_SOURCES = \
   verbose.cc \
   weights.cc
 
+if HAVE_CMPH
+  libutils_a_SOURCES += perfect_hash.cc
+endif
+
+phmt_SOURCES = phmt.cc
 ts_SOURCES = ts.cc
 dict_test_SOURCES = dict_test.cc
 dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h
new file mode 100644
index 00000000..63b6f4c2
--- /dev/null
+++ b/utils/ccrp_nt.h
@@ -0,0 +1,169 @@
+#ifndef _CCRP_NT_H_
+#define _CCRP_NT_H_
+
+#include <numeric>
+#include <cassert>
+#include <cmath>
+#include <list>
+#include <iostream>
+#include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include "sampler.h"
+#include "slice_sampler.h"
+
+// Chinese restaurant process (1 parameter)
+template <typename Dish, typename DishHash = boost::hash<Dish> >
+class CCRP_NoTable {
+ public:
+  explicit CCRP_NoTable(double conc) :
+    num_customers_(),
+    concentration_(conc),
+    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+
+  CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) :
+    num_customers_(),
+    concentration_(c),
+    concentration_prior_shape_(c_shape),
+    concentration_prior_rate_(c_rate) {}
+
+  double concentration() const { return concentration_; }
+
+  bool has_concentration_prior() const {
+    return !std::isnan(concentration_prior_shape_);
+  }
+
+  void clear() {
+    num_customers_ = 0;
+    custs_.clear();
+  }
+
+  unsigned num_customers() const {
+    return num_customers_;
+  }
+
+  unsigned num_customers(const Dish& dish) const {
+    const typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.find(dish);
+    if (it == custs_.end()) return 0;
+    return it->second;
+  }
+
+  int increment(const Dish& dish) {
+    int table_diff = 0;
+    if (++custs_[dish] == 1)
+      table_diff = 1;
+    ++num_customers_;
+    return table_diff;
+  }
+
+  int decrement(const Dish& dish) {
+    int table_diff = 0;
+    int nc = --custs_[dish];
+    if (nc == 0) {
+      custs_.erase(dish);
+      table_diff = -1;
+    } else if (nc < 0) {
+      std::cerr << "Dish counts dropped below zero for: " << dish << std::endl;
+      abort();
+    }
+    --num_customers_;
+    return table_diff;
+  }
+
+  double prob(const Dish& dish, const double& p0) const {
+    const unsigned at_table = num_customers(dish);
+    return (at_table + p0 * concentration_) / (num_customers_ + concentration_);
+  }
+
+  double logprob(const Dish& dish, const double& logp0) const {
+    const unsigned at_table = num_customers(dish);
+    return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_);
+  }
+
+  double log_crp_prob() const {
+    return log_crp_prob(concentration_);
+  }
+
+  // Log of the gamma density at x; note that `rate` enters as log(rate) and x/rate.
+  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
+    assert(x >= 0.0);
+    assert(shape > 0.0);
+    assert(rate > 0.0);
+    return (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
+  }
+
+  // Log probability of the current customer counts under the one-parameter CRP (formula from
+  // http://en.wikipedia.org/wiki/Chinese_restaurant_process), plus the log gamma prior on `concentration` when one is set; P_0 terms are not included.
+  double log_crp_prob(const double& concentration) const {
+    double lp = 0.0;
+    if (has_concentration_prior())
+      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    // lp is a log *density* at this point and may be positive, so its sign is deliberately not asserted.
+    if (num_customers_) {
+      lp += lgamma(concentration) - lgamma(concentration + num_customers_) +
+        custs_.size() * log(concentration);
+      assert(std::isfinite(lp));
+      for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
+             it != custs_.end(); ++it) {
+          lp += lgamma(it->second);
+      }
+    }
+    assert(std::isfinite(lp));
+    return lp;
+  }
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {  // nloop slice-sampling updates of concentration_
+    assert(has_concentration_prior());  // resampling needs the gamma prior on the concentration
+    ConcentrationResampler cr(*this);  // functor returning log_crp_prob() at a proposed concentration
+    for (unsigned iter = 0; iter < nloop; ++iter) {  // unsigned, like nloop: no signed/unsigned comparison
+        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    }
+  }
+
+  struct ConcentrationResampler {
+    ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {}
+    const CCRP_NoTable& crp_;
+    double operator()(const double& proposed_concentration) const {
+      return crp_.log_crp_prob(proposed_concentration);
+    }
+  };
+
+  void Print(std::ostream* out) const {
+    (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    int cc = 0;
+    for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
+         it != custs_.end(); ++it) {
+      (*out) << " " << it->first << "(" << it->second << " eating)";
+      ++cc;
+      if (cc > 10) { (*out) << " ..."; break; }
+    }
+    (*out) << std::endl;
+  }
+
+  unsigned num_customers_;
+  std::tr1::unordered_map<Dish, unsigned, DishHash> custs_;
+
+  typedef typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator const_iterator;
+  const_iterator begin() const {
+    return custs_.begin();
+  }
+  const_iterator end() const {
+    return custs_.end();
+  }
+
+  double concentration_;
+
+  // optional gamma prior on concentration_ (NaN if no prior)
+  double concentration_prior_shape_;
+  double concentration_prior_rate_;
+};
+
+template <typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const CCRP_NoTable<T,H>& c) {
+  c.Print(&o);
+  return o;
+}
+
+#endif
diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h
new file mode 100644
index 00000000..a868af9a
--- /dev/null
+++ b/utils/ccrp_onetable.h
@@ -0,0 +1,241 @@
+#ifndef _CCRP_ONETABLE_H_
+#define _CCRP_ONETABLE_H_
+
+#include <numeric>
+#include <cassert>
+#include <cmath>
+#include <list>
+#include <iostream>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include "sampler.h"
+#include "slice_sampler.h"
+
+// Chinese restaurant process (Pitman-Yor parameters) with one table approximation
+
+template <typename Dish, typename DishHash = boost::hash<Dish> >
+class CCRP_OneTable {
+  typedef std::tr1::unordered_map<Dish, unsigned, DishHash> DishMapType;
+ public:
+  CCRP_OneTable(double disc, double conc) :
+    num_tables_(),
+    num_customers_(),
+    discount_(disc),
+    concentration_(conc),
+    discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
+    discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
+    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+
+  CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
+    num_tables_(),
+    num_customers_(),
+    discount_(d),
+    concentration_(c),
+    discount_prior_alpha_(d_alpha),
+    discount_prior_beta_(d_beta),
+    concentration_prior_shape_(c_shape),
+    concentration_prior_rate_(c_rate) {}
+
+  double discount() const { return discount_; }
+  double concentration() const { return concentration_; }
+  void set_concentration(double c) { concentration_ = c; }
+  void set_discount(double d) { discount_ = d; }
+
+  bool has_discount_prior() const {
+    return !std::isnan(discount_prior_alpha_);
+  }
+
+  bool has_concentration_prior() const {
+    return !std::isnan(concentration_prior_shape_);
+  }
+
+  void clear() {
+    num_tables_ = 0;
+    num_customers_ = 0;
+    dish_counts_.clear();
+  }
+
+  unsigned num_tables() const {
+    return num_tables_;
+  }
+
+  unsigned num_tables(const Dish& dish) const {
+    const typename DishMapType::const_iterator it = dish_counts_.find(dish);
+    if (it == dish_counts_.end()) return 0;
+    return 1;
+  }
+
+  unsigned num_customers() const {
+    return num_customers_;
+  }
+
+  unsigned num_customers(const Dish& dish) const {
+    const typename DishMapType::const_iterator it = dish_counts_.find(dish);
+    if (it == dish_counts_.end()) return 0;
+    return it->second;
+  }
+
+  // returns +1 or 0 indicating whether a new table was opened
+  int increment(const Dish& dish) {
+    unsigned& dc = dish_counts_[dish];
+    ++dc;
+    ++num_customers_;
+    if (dc == 1) {
+      ++num_tables_;
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  // Removes one customer of `dish`; returns -1 if that closed the dish's table, 0 otherwise.
+  // An unseen dish trips the assert; with NDEBUG it is a no-op instead of inserting a wrapped-around count.
+  int decrement(const Dish& dish) {
+    const typename DishMapType::iterator it = dish_counts_.find(dish);
+    assert(it != dish_counts_.end());
+    if (it == dish_counts_.end()) return 0;
+    assert(it->second > 0);
+    --num_customers_;
+    if (it->second == 1) {
+      dish_counts_.erase(it);
+      --num_tables_;
+      return -1;
+    }
+    --it->second;
+    return 0;
+  }
+
+  double prob(const Dish& dish, const double& p0) const {
+    const typename DishMapType::const_iterator it = dish_counts_.find(dish);
+    const double r = num_tables_ * discount_ + concentration_;
+    if (it == dish_counts_.end()) {
+      return r * p0 / (num_customers_ + concentration_);
+    } else {
+      return (it->second - discount_ + r * p0) /
+               (num_customers_ + concentration_);
+    }
+  }
+
+  double log_crp_prob() const {
+    return log_crp_prob(discount_, concentration_);
+  }
+
+  static double log_beta_density(const double& x, const double& alpha, const double& beta) {
+    assert(x > 0.0);
+    assert(x < 1.0);
+    assert(alpha > 0.0);
+    assert(beta > 0.0);
+    const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta);
+    return lp;
+  }
+
+  // Log of the gamma density at x; note that `rate` enters as log(rate) and x/rate.
+  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
+    assert(x >= 0.0);
+    assert(shape > 0.0);
+    assert(rate > 0.0);
+    return (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
+  }
+
+  // Log probability of the current seating under the Pitman-Yor CRP with one table per dish (formula from
+  // http://en.wikipedia.org/wiki/Chinese_restaurant_process), plus the log of any configured priors; P_0 terms are not included.
+  double log_crp_prob(const double& discount, const double& concentration) const {
+    double lp = 0.0;
+    if (has_discount_prior())
+      lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
+    if (has_concentration_prior())
+      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    // lp holds log *densities* at this point, which may be positive, so its sign is deliberately not asserted.
+    if (num_customers_) {
+      if (discount > 0.0) {
+        const double r = lgamma(1.0 - discount);
+        lp += lgamma(concentration) - lgamma(concentration + num_customers_)
+             + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_)
+             - lgamma(concentration / discount);
+        assert(std::isfinite(lp));
+        for (typename DishMapType::const_iterator it = dish_counts_.begin();
+             it != dish_counts_.end(); ++it) {
+          const unsigned& cur = it->second;
+          lp += lgamma(cur - discount) - r;
+        }
+      } else {
+        assert(!"not implemented yet");
+      }
+    }
+    assert(std::isfinite(lp));
+    return lp;
+  }
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {  // slice-sample the hyperparameters that have priors
+    assert(has_discount_prior() || has_concentration_prior());  // at least one prior is required
+    DiscountResampler dr(*this);  // log_crp_prob as a function of the discount
+    ConcentrationResampler cr(*this);  // log_crp_prob as a function of the concentration
+    for (unsigned iter = 0; iter < nloop; ++iter) {  // unsigned, like nloop: no signed/unsigned comparison
+      if (has_concentration_prior()) {
+        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      }
+      if (has_discount_prior()) {
+        discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(),
+                               1.0, 0.0, niterations, 100*niterations);
+      }
+    }
+    concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,  // NOTE(review): runs even when no concentration prior is set - confirm intended
+                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {}
+    const CCRP_OneTable& crp_;
+    double operator()(const double& proposed_discount) const {
+      return crp_.log_crp_prob(proposed_discount, crp_.concentration_);
+    }
+  };
+
+  struct ConcentrationResampler {
+    ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {}
+    const CCRP_OneTable& crp_;
+    double operator()(const double& proposed_concentration) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_concentration);
+    }
+  };
+
+  void Print(std::ostream* out) const {
+    (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) {
+      (*out) << "  " << it->first << " = " << it->second << std::endl;
+    }
+  }
+
+  typedef typename DishMapType::const_iterator const_iterator;
+  const_iterator begin() const {
+    return dish_counts_.begin();
+  }
+  const_iterator end() const {
+    return dish_counts_.end();
+  }
+
+  unsigned num_tables_;
+  unsigned num_customers_;
+  DishMapType dish_counts_;
+
+  double discount_;
+  double concentration_;
+
+  // optional beta prior on discount_ (NaN if no prior)
+  double discount_prior_alpha_;
+  double discount_prior_beta_;
+
+  // optional gamma prior on concentration_ (NaN if no prior)
+  double concentration_prior_shape_;
+  double concentration_prior_rate_;
+};
+
+template <typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const CCRP_OneTable<T,H>& c) {
+  c.Print(&o);
+  return o;
+}
+
+#endif
diff --git a/utils/dict.h b/utils/dict.h
index 33cca6cf..a3400868 100644
--- a/utils/dict.h
+++ b/utils/dict.h
@@ -1,7 +1,6 @@
 #ifndef DICT_H_
 #define DICT_H_
 
-#include <iostream>
 #include <cassert>
 #include <cstring>
 
@@ -73,8 +72,8 @@ class Dict {
 
   inline const std::string& Convert(const WordID& id) const {
     if (id == 0) return b0_;
-    //assert(id <= (int)words_.size());
-    if (id < 0 || id > (int)words_.size()) return b0_;
+    assert(id > 0 && id <= (int)words_.size());  // 0 is handled above; a negative id would index before words_[0]
+    //if (id < 0 || id > (int)words_.size()) return b0_;
     return words_[id-1];
   }
 
diff --git a/utils/fdict.cc b/utils/fdict.cc
index baa0b552..676c951c 100644
--- a/utils/fdict.cc
+++ b/utils/fdict.cc
@@ -9,6 +9,10 @@ using namespace std;
 Dict FD::dict_;
 bool FD::frozen_ = false;
 
+#ifdef HAVE_CMPH
+PerfectHashFunction* FD::hash_ = NULL;
+#endif
+
 std::string FD::Convert(std::vector<WordID> const& v) {
     return Convert(&*v.begin(),&*v.end());
 }
diff --git a/utils/fdict.h b/utils/fdict.h
index 70315a38..f0871b9a 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -1,27 +1,58 @@
 #ifndef _FDICT_H_
 #define _FDICT_H_
 
+#include "config.h"
+
+#include <iostream>
 #include <string>
 #include <vector>
 #include "dict.h"
 
+#ifdef HAVE_CMPH
+#include "perfect_hash.h"
+#include "string_to.h"
+#endif
+
 struct FD {
   // once the FD is frozen, new features not already in the
   // dictionary will return 0
   static void Freeze() {
     frozen_ = true;
   }
-  static void UnFreeze() {
-    frozen_ = false;
+  static bool UsingPerfectHashFunction() {
+#ifdef HAVE_CMPH
+    return hash_;
+#else
+    return false;
+#endif
+  }
+  static void EnableHash(const std::string& cmph_file) {
+#ifdef HAVE_CMPH
+    assert(dict_.max() == 0);  // dictionary must not have
+                               // been added to
+    hash_ = new PerfectHashFunction(cmph_file);
+#endif
   }
-
   static inline int NumFeats() {
+#ifdef HAVE_CMPH
+    if (hash_) return hash_->number_of_keys();
+#endif
     return dict_.max() + 1;
   }
   static inline WordID Convert(const std::string& s) {
+#ifdef HAVE_CMPH
+    if (hash_) return (*hash_)(s);
+#endif
     return dict_.Convert(s, frozen_);
   }
   static inline const std::string& Convert(const WordID& w) {
+#ifdef HAVE_CMPH
+    if (hash_) {  // perfect-hash mode: the name is produced by to_string(w), not looked up in dict_
+      static std::string tls;  // function-local static: every call overwrites the string earlier results refer to
+      tls = to_string(w);  // NOTE(review): plain static despite the name "tls", so not thread-local - confirm callers are single-threaded
+      return tls;
+    }
+#endif
     return dict_.Convert(w);
   }
   static std::string Convert(WordID const *i,WordID const* e);
@@ -33,6 +64,9 @@ struct FD {
   static Dict dict_;
  private:
   static bool frozen_;
+#ifdef HAVE_CMPH
+  static PerfectHashFunction* hash_;
+#endif
 };
 
 #endif
diff --git a/utils/feature_vector.h b/utils/feature_vector.h
index 733aa99e..a7b61a66 100755
--- a/utils/feature_vector.h
+++ b/utils/feature_vector.h
@@ -3,9 +3,9 @@
 
 #include <vector>
 #include "sparse_vector.h"
-#include "fdict.h"
+#include "weights.h"
 
-typedef double Featval;
+typedef weight_t Featval;
 typedef SparseVector<Featval> FeatureVector;
 typedef SparseVector<Featval> WeightVector;
 typedef std::vector<Featval> DenseWeightVector;
diff --git a/utils/filelib.cc b/utils/filelib.cc
index 79ad2847..d206fc19 100644
--- a/utils/filelib.cc
+++ b/utils/filelib.cc
@@ -2,6 +2,12 @@
 
 #include <unistd.h>
 #include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <cstdlib>
+#include <cstdio>
+#include <sys/stat.h>
+#include <sys/types.h>
 
 using namespace std;
 
@@ -20,3 +26,28 @@ bool DirectoryExists(const string& dir) {
   return false;
 }
 
+void MkDirP(const string& dir) {  // create `dir` unless DirectoryExists(dir); perror + abort() on failure
+  if (DirectoryExists(dir)) return;
+  if (mkdir(dir.c_str(), 0777)) {  // a single mkdir call: missing parent directories are not created here
+    perror(dir.c_str());
+    abort();
+  }
+  if (chmod(dir.c_str(), 07777)) {  // NOTE(review): 07777 also sets the setuid/setgid/sticky bits - confirm this is intended rather than 0777
+    perror(dir.c_str());
+    abort();
+  }
+}
+
+#if 0
+void CopyFile(const string& inf, const string& outf) {
+  WriteFile w(outf);
+  CopyFile(inf,*w);
+}
+#else
+void CopyFile(const string& inf, const string& outf) {  // binary copy of file `inf` into file `outf`
+  ofstream dst(outf.c_str(), fstream::trunc|fstream::binary);  // destination is opened (and truncated) first
+  ifstream src(inf.c_str(), fstream::binary);
+  dst << src.rdbuf();  // stream the whole source buffer across in one go
+}
+#endif
+
diff --git a/utils/filelib.h b/utils/filelib.h
index dda98671..bb6e7415 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -12,6 +12,7 @@
 
 bool FileExists(const std::string& file_name);
 bool DirectoryExists(const std::string& dir_name);
+void MkDirP(const std::string& dir_name);
 
 // reads from standard in if filename is -
 // uncompresses if file ends with .gz
@@ -112,9 +113,6 @@ inline void CopyFile(std::string const& inf,std::ostream &out) {
   CopyFile(*r,out);
 }
 
-inline void CopyFile(std::string const& inf,std::string const& outf) {
-  WriteFile w(outf);
-  CopyFile(inf,*w);
-}
+void CopyFile(std::string const& inf,std::string const& outf);
 
 #endif
diff --git a/utils/logval.h b/utils/logval.h
index 6fdc2c42..8a59d0b1 100644
--- a/utils/logval.h
+++ b/utils/logval.h
@@ -25,12 +25,13 @@ class LogVal {
   typedef LogVal<T> Self;
 
   LogVal() : s_(), v_(LOGVAL_LOG0) {}
-  explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {}
+  LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {}
+  const Self& operator=(double x) { s_ = std::signbit(x); v_ = s_ ? std::log(-x) : std::log(x); return *this; }
   LogVal(init_minus_1) : s_(true),v_(0) {  }
   LogVal(init_1) : s_(),v_(0) {  }
   LogVal(init_0) : s_(),v_(LOGVAL_LOG0) {  }
-  LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
-  LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
+  explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
+  explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
   LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {}
   LogVal(double lnx,init_lnx) : s_(),v_(lnx) {}
   static Self exp(T lnx) { return Self(lnx,false); }
@@ -141,9 +142,6 @@ class LogVal {
     return pow(1/root);
   }
 
-  operator T() const {
-    if (s_) return -std::exp(v_); else return std::exp(v_);
-  }
   T as_float() const {
     if (s_) return -std::exp(v_); else return std::exp(v_);
   }
diff --git a/utils/logval_test.cc b/utils/logval_test.cc
index 4aa452f2..6133f5ce 100644
--- a/utils/logval_test.cc
+++ b/utils/logval_test.cc
@@ -30,13 +30,13 @@ TEST_F(LogValTest,Negate) {
   LogVal<double> x(-2.4);
   LogVal<double> y(2.4);
   y.negate();
-  EXPECT_FLOAT_EQ(x,y);
+  EXPECT_FLOAT_EQ(x.as_float(),y.as_float());
 }
 
 TEST_F(LogValTest,Inverse) {
   LogVal<double> x(1/2.4);
   LogVal<double> y(2.4);
-  EXPECT_FLOAT_EQ(x,y.inverse());
+  EXPECT_FLOAT_EQ(x.as_float(),y.inverse().as_float());
 }
 
 TEST_F(LogValTest,Minus) {
@@ -45,9 +45,9 @@ TEST_F(LogValTest,Minus) {
   LogVal<double> z1 = x - y;
   LogVal<double> z2 = x;
   z2 -= y;
-  EXPECT_FLOAT_EQ(z1, z2);
-  EXPECT_FLOAT_EQ(z1, 10.0);
-  EXPECT_FLOAT_EQ(y - x, -10.0);
+  EXPECT_FLOAT_EQ(z1.as_float(), z2.as_float());
+  EXPECT_FLOAT_EQ(z1.as_float(), 10.0);
+  EXPECT_FLOAT_EQ((y - x).as_float(), -10.0);
 }
 
 TEST_F(LogValTest,TestOps) {
@@ -62,8 +62,8 @@ TEST_F(LogValTest,TestOps) {
   LogVal<double> bb(-0.3);
   cerr << (aa + bb) << endl;
   cerr << (bb + aa) << endl;
-  EXPECT_FLOAT_EQ((aa + bb), (bb + aa));
-  EXPECT_FLOAT_EQ((aa + bb), -0.1);
+  EXPECT_FLOAT_EQ((aa + bb).as_float(), (bb + aa).as_float());
+  EXPECT_FLOAT_EQ((aa + bb).as_float(), -0.1);
 }
 
 TEST_F(LogValTest,TestSizes) {
diff --git a/utils/perfect_hash.cc b/utils/perfect_hash.cc
new file mode 100644
index 00000000..706e2741
--- /dev/null
+++ b/utils/perfect_hash.cc
@@ -0,0 +1,37 @@
+#include "config.h"
+
+#ifdef HAVE_CMPH
+
+#include "perfect_hash.h"
+
+#include <cstdio>
+#include <iostream>
+
+using namespace std;
+
+PerfectHashFunction::~PerfectHashFunction() {
+  cmph_destroy(mphf_);
+}
+
+// Loads a cmph perfect hash function from file `fname`; aborts if the file cannot be opened or loaded.
+PerfectHashFunction::PerfectHashFunction(const string& fname) {
+  FILE* f = fopen(fname.c_str(), "rb");  // "rb": the dump is binary data, not text
+  if (!f) {
+    cerr << "Failed to open file " << fname << " for reading: cannot load hash function.\n"; abort();
+  }
+  mphf_ = cmph_load(f);
+  fclose(f);  // the handle used to leak; it is only needed while cmph_load reads the file
+  if (!mphf_) {
+    cerr << "cmph_load failed on " << fname << "!\n"; abort();
+  }
+}
+
+size_t PerfectHashFunction::operator()(const string& key) const {
+  return cmph_search(mphf_, &key[0], key.size());
+}
+
+size_t PerfectHashFunction::number_of_keys() const {
+  return cmph_size(mphf_);
+}
+
+#endif
diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h
new file mode 100644
index 00000000..8ac11f18
--- /dev/null
+++ b/utils/perfect_hash.h
@@ -0,0 +1,24 @@
+#ifndef _PERFECT_HASH_MAP_H_
+#define _PERFECT_HASH_MAP_H_
+
+#include "config.h"
+
+#ifndef HAVE_CMPH
+#error libcmph is required to use PerfectHashFunction
+#endif
+
+#include <vector>
+#include <boost/utility.hpp>
+#include "cmph.h"
+
+class PerfectHashFunction : boost::noncopyable {
+ public:
+  explicit PerfectHashFunction(const std::string& fname);
+  ~PerfectHashFunction();
+  size_t operator()(const std::string& key) const;
+  size_t number_of_keys() const;
+ private:
+  cmph_t *mphf_;
+};
+
+#endif
diff --git a/utils/phmt.cc b/utils/phmt.cc
new file mode 100644
index 00000000..48d9f093
--- /dev/null
+++ b/utils/phmt.cc
@@ -0,0 +1,40 @@
+#include "config.h"
+
+#ifndef HAVE_CMPH
+int main() {
+  return 0;
+}
+#else
+
+#include <iostream>
+#include "weights.h"
+#include "fdict.h"
+
+using namespace std;
+
+int main(int argc, char** argv) {  // writes two weights to weights.bin under hashed feature ids and reads them back
+  if (argc != 2) { cerr << "Usage: " << argv[0] << " file.mphf\n"; return 1; }
+  FD::EnableHash(argv[1]);
+  cerr << "Number of keys: " << FD::NumFeats() << endl;
+  cerr << "LexFE = " << FD::Convert("LexFE") << endl;
+  cerr << "LexEF = " << FD::Convert("LexEF") << endl;
+  {
+    vector<weight_t> v(FD::NumFeats());
+    v[FD::Convert("LexFE")] = 1.0;
+    v[FD::Convert("LexEF")] = 0.5;
+    cerr << "Writing...\n";
+    Weights::WriteToFile("weights.bin", v);
+    cerr << "Done.\n";
+  }
+  {
+    vector<weight_t> v(FD::NumFeats());
+    cerr << "Reading...\n";
+    Weights::InitFromFile("weights.bin", &v);
+    cerr << "Done.\n";
+    if (v[FD::Convert("LexFE")] != 1.0) { cerr << "LexFE weight did not round-trip\n"; return 1; }  // explicit check: still runs with NDEBUG
+    if (v[FD::Convert("LexEF")] != 0.5) { cerr << "LexEF weight did not round-trip\n"; return 1; }
+  }
+}
+
+#endif
+
diff --git a/utils/reconstruct_weights.cc b/utils/reconstruct_weights.cc
new file mode 100644
index 00000000..d32e4f67
--- /dev/null
+++ b/utils/reconstruct_weights.cc
@@ -0,0 +1,68 @@
+#include <iostream>
+#include <vector>
+#include <cassert>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w",po::value<string>(),"Input feature weights file")
+        ("keys,k",po::value<string>(),"Keys file (list of features with dummy value at start)")
+        ("cmph_perfect_hash_file,h",po::value<string>(),"cmph perfect hash function file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,?", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("cmph_perfect_hash_file") || !conf->count("weights") || !conf->count("keys")) {
+    cerr << "Generate a text format weights file. Options -w -k and -h are required.\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+int main(int argc, char** argv) {  // prints "key weight" for each line of the keys file after the first
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;  // was `return false`, i.e. exit status 0 (success) after a usage error
+
+  FD::EnableHash(conf["cmph_perfect_hash_file"].as<string>());
+
+  // load weights
+  vector<weight_t> weights;
+  Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+
+  ReadFile rf(conf["keys"].as<string>());
+  istream& in = *rf.stream();
+  string key;
+  size_t lc = 0;
+  while(getline(in, key)) {
+    ++lc;
+    if (lc == 1) continue;  // the keys file starts with a dummy entry (see the --keys help text)
+    if (lc > weights.size()) { cerr << "Keys file has more entries than the weights file (line " << lc << ")\n"; return 1; }  // runtime check: an assert vanishes with NDEBUG
+    cout << key << " " << weights[lc - 1] << endl;
+  }
+
+  return 0;
+}
+
diff --git a/utils/sampler.h b/utils/sampler.h
index 8567e922..cae660d2 100644
--- a/utils/sampler.h
+++ b/utils/sampler.h
@@ -105,7 +105,7 @@ class SampleSet {
   const F& operator[](int i) const { return m_scores[i]; }
   F& operator[](int i) { return m_scores[i]; }
   bool empty() const { return m_scores.empty(); }
-  void add(const prob_t& s) { m_scores.push_back(s); }
+  void add(const F& s) { m_scores.push_back(s); }
   void clear() { m_scores.clear(); }
   size_t size() const { return m_scores.size(); }
   void resize(int size) { m_scores.resize(size); }
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index a55436fb..049151f7 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -1,44 +1,6 @@
 #ifndef _SPARSE_VECTOR_H_
 #define _SPARSE_VECTOR_H_
 
-#if 0
-
-#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
-  friend class boost::serialization::access;
-  template<class Archive>
-  void save(Archive & ar, const unsigned int version) const {
-    (void) version;
-    int eff_size = values_.size();
-    const_iterator it = this->begin();
-    if (values_.find(0) != values_.end()) { ++it; --eff_size; }
-    ar & eff_size;
-    while (it != this->end()) {
-      const std::pair<const std::string&, const T&> wire_pair(FD::Convert(it->first), it->second);
-      ar & wire_pair;
-      ++it;
-    }
-  }
-  template<class Archive>
-  void load(Archive & ar, const unsigned int version) {
-    (void) version;
-    this->clear();
-    int sz; ar & sz;
-    for (int i = 0; i < sz; ++i) {
-      std::pair<std::string, T> wire_pair;
-      ar & wire_pair;
-      this->set_value(FD::Convert(wire_pair.first), wire_pair.second);
-    }
-  }
-  BOOST_SERIALIZATION_SPLIT_MEMBER()
-#endif
-};
-
-#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
-BOOST_CLASS_TRACKING(SparseVector<double>,track_never)
-#endif
-
-#endif /// FIX
-
 #include "fast_sparse_vector.h"
 #define SparseVector FastSparseVector
 
diff --git a/utils/stringlib.cc b/utils/stringlib.cc
index 7aaee9f0..1a152985 100644
--- a/utils/stringlib.cc
+++ b/utils/stringlib.cc
@@ -2,6 +2,7 @@
 
 #include <cstring>
 #include <cstdlib>
+#include <cstdio>
 #include <cassert>
 #include <iostream>
 #include <map>
@@ -32,7 +33,12 @@ void ParseTranslatorInput(const string& line, string* input, string* ref) {
 void ProcessAndStripSGML(string* pline, map<string, string>* out) {
   map<string, string>& meta = *out;
   string& line = *pline;
-  string lline = LowercaseString(line);
+  string lline = *pline;
+  if (lline.find("<SEG")==0 || lline.find("<Seg")==0) {
+    cerr << "Segment tags <seg> must be lowercase!\n";
+    cerr << "  " << *pline << endl;
+    abort();
+  } 
   if (lline.find("<seg")!=0) return;
   size_t close = lline.find(">");
   if (close == string::npos) return; // error
@@ -85,3 +91,365 @@ void ProcessAndStripSGML(string* pline, map<string, string>* out) {
   }
 }
 
+string SGMLOpenSegTag(const map<string, string>& attr) {
+  ostringstream os;
+  os << "<seg";
+  for (map<string,string>::const_iterator it = attr.begin(); it != attr.end(); ++it)
+    os << ' ' << it->first << '=' << '"' << it->second << '"';
+  os << '>';
+  return os.str();
+}
+
+class MD5 {
+public:
+  typedef unsigned int size_type; // must be 32bit
+
+  MD5();
+  MD5(const string& text);
+  void update(const unsigned char *buf, size_type length);
+  void update(const char *buf, size_type length);
+  MD5& finalize();
+  string hexdigest() const;
+
+private:
+  void init();
+  typedef unsigned char uint1; //  8bit
+  typedef unsigned int uint4;  // 32bit
+  enum {blocksize = 64}; // VC6 won't eat a const static int here
+
+  void transform(const uint1 block[blocksize]);
+  static void decode(uint4 output[], const uint1 input[], size_type len);
+  static void encode(uint1 output[], const uint4 input[], size_type len);
+
+  bool finalized;
+  uint1 buffer[blocksize]; // bytes that didn't fit in last 64 byte chunk
+  uint4 count[2];   // 64bit counter for number of bits (lo, hi)
+  uint4 state[4];   // digest so far
+  uint1 digest[16]; // the result
+
+  // low level logic operations
+  static inline uint4 F(uint4 x, uint4 y, uint4 z);
+  static inline uint4 G(uint4 x, uint4 y, uint4 z);
+  static inline uint4 H(uint4 x, uint4 y, uint4 z);
+  static inline uint4 I(uint4 x, uint4 y, uint4 z);
+  static inline uint4 rotate_left(uint4 x, int n);
+  static inline void FF(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac);
+  static inline void GG(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac);
+  static inline void HH(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac);
+  static inline void II(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac);
+};
+
+// Constants for MD5Transform routine.
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+///////////////////////////////////////////////
+
+// F, G, H and I are basic MD5 functions.
+inline MD5::uint4 MD5::F(uint4 x, uint4 y, uint4 z) {
+  return (x&y) | (~x&z);
+}
+
+inline MD5::uint4 MD5::G(uint4 x, uint4 y, uint4 z) {
+  return (x&z) | (y&~z);
+}
+
+inline MD5::uint4 MD5::H(uint4 x, uint4 y, uint4 z) {
+  return x^y^z;
+}
+
+inline MD5::uint4 MD5::I(uint4 x, uint4 y, uint4 z) {
+  return y ^ (x | ~z);
+}
+
+// rotate_left rotates x left n bits.
+inline MD5::uint4 MD5::rotate_left(uint4 x, int n) {
+  return (x << n) | (x >> (32-n));
+}
+
+// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+// Rotation is separate from addition to prevent recomputation.
+inline void MD5::FF(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) {
+  a = rotate_left(a+ F(b,c,d) + x + ac, s) + b;
+}
+
+inline void MD5::GG(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) {
+  a = rotate_left(a + G(b,c,d) + x + ac, s) + b;
+}
+
+inline void MD5::HH(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) {
+  a = rotate_left(a + H(b,c,d) + x + ac, s) + b;
+}
+
+inline void MD5::II(uint4 &a, uint4 b, uint4 c, uint4 d, uint4 x, uint4 s, uint4 ac) {
+  a = rotate_left(a + I(b,c,d) + x + ac, s) + b;
+}
+
+//////////////////////////////////////////////
+
+// default ctor, just initialize
+MD5::MD5()
+{
+  init();
+}
+
+//////////////////////////////////////////////
+
+// nifty shortcut ctor, compute MD5 for string and finalize it right away
+MD5::MD5(const string &text)
+{
+  init();
+  update(text.c_str(), text.length());
+  finalize();
+}
+
+//////////////////////////////
+
+void MD5::init()
+{
+  finalized=false;
+
+  count[0] = 0;
+  count[1] = 0;
+
+  // load magic initialization constants.
+  state[0] = 0x67452301;
+  state[1] = 0xefcdab89;
+  state[2] = 0x98badcfe;
+  state[3] = 0x10325476;
+}
+
+//////////////////////////////
+
+// decodes input (unsigned char) into output (uint4). Assumes len is a multiple of 4.
+void MD5::decode(uint4 output[], const uint1 input[], size_type len)
+{
+  for (unsigned int i = 0, j = 0; j < len; i++, j += 4)
+    output[i] = ((uint4)input[j]) | (((uint4)input[j+1]) << 8) |
+      (((uint4)input[j+2]) << 16) | (((uint4)input[j+3]) << 24);
+}
+
+//////////////////////////////
+
+// encodes input (uint4) into output (unsigned char). Assumes len is
+// a multiple of 4.
+void MD5::encode(uint1 output[], const uint4 input[], size_type len)
+{
+  for (size_type i = 0, j = 0; j < len; i++, j += 4) {
+    output[j] = input[i] & 0xff;
+    output[j+1] = (input[i] >> 8) & 0xff;
+    output[j+2] = (input[i] >> 16) & 0xff;
+    output[j+3] = (input[i] >> 24) & 0xff;
+  }
+}
+
+//////////////////////////////
+
+// apply MD5 algo on a block
+void MD5::transform(const uint1 block[blocksize])
+{
+  uint4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+  decode (x, block, blocksize);
+
+  /* Round 1 */
+  FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+  FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+  FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+  FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+  FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+  FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+  FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+  FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+  FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+  FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+  FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+  FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+  FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+  FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+  FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+  FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+  /* Round 2 */
+  GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+  GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+  GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+  GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+  GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+  GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
+  GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+  GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+  GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+  GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+  GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+  GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+  GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+  GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+  GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+  GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+  /* Round 3 */
+  HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+  HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+  HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+  HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+  HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+  HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+  HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+  HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+  HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+  HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+  HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+  HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
+  HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+  HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+  HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+  HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+  /* Round 4 */
+  II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+  II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+  II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+  II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+  II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+  II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+  II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+  II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+  II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+  II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+  II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+  II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+  II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+  II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+  II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+  II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+  state[0] += a;
+  state[1] += b;
+  state[2] += c;
+  state[3] += d;
+
+  // Zeroize sensitive information.
+  memset(x, 0, sizeof x);
+}
+
+//////////////////////////////
+
+// MD5 block update operation. Continues an MD5 message-digest
+// operation, processing another message block
+void MD5::update(const unsigned char input[], size_type length)
+{
+  // compute number of bytes mod 64
+  size_type index = count[0] / 8 % blocksize;
+
+  // Update number of bits
+  if ((count[0] += (length << 3)) < (length << 3))
+    count[1]++;
+  count[1] += (length >> 29);
+
+  // number of bytes we need to fill in buffer
+  size_type firstpart = 64 - index;
+
+  size_type i;
+
+  // transform as many times as possible.
+  if (length >= firstpart)
+  {
+    // fill buffer first, transform
+    memcpy(&buffer[index], input, firstpart);
+    transform(buffer);
+
+    // transform chunks of blocksize (64 bytes)
+    for (i = firstpart; i + blocksize <= length; i += blocksize)
+      transform(&input[i]);
+
+    index = 0;
+  }
+  else
+    i = 0;
+
+  // buffer remaining input
+  memcpy(&buffer[index], &input[i], length-i);
+}
+
+//////////////////////////////
+
+// for convenience provide a version taking plain char
+void MD5::update(const char input[], size_type length)
+{
+  update((const unsigned char*)input, length);
+}
+
+//////////////////////////////
+
+// MD5 finalization. Ends an MD5 message-digest operation, writing the
+// message digest and zeroizing the context.
+MD5& MD5::finalize()
+{
+  static unsigned char padding[64] = {
+    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  if (!finalized) {
+    // Save number of bits
+    unsigned char bits[8];
+    encode(bits, count, 8);
+
+    // pad out to 56 mod 64.
+    size_type index = count[0] / 8 % 64;
+    size_type padLen = (index < 56) ? (56 - index) : (120 - index);
+    update(padding, padLen);
+
+    // Append length (before padding)
+    update(bits, 8);
+
+    // Store state in digest
+    encode(digest, state, 16);
+
+    // Zeroize sensitive information.
+    memset(buffer, 0, sizeof buffer);
+    memset(count, 0, sizeof count);
+
+    finalized=true;
+  }
+
+  return *this;
+}
+
+//////////////////////////////
+
+// return hex representation of digest as string
+string MD5::hexdigest() const {
+  if (!finalized)
+    return "";
+
+  char buf[33];
+  for (int i=0; i<16; i++)
+    sprintf(buf+i*2, "%02x", digest[i]);
+  buf[32]=0;
+
+  return string(buf);
+}
+
+//////////////////////////////
+
+string md5(const string& str) {
+    MD5 md5 = MD5(str);
+    return md5.hexdigest();
+}
+
diff --git a/utils/stringlib.h b/utils/stringlib.h
index 8022bb88..cafbdac3 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -249,6 +249,7 @@ inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::s
 }
 
 void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
+std::string SGMLOpenSegTag(const std::map<std::string, std::string>& attr);
 
 // given the first character of a UTF8 block, find out how wide it is
 // see http://en.wikipedia.org/wiki/UTF-8 for more info
@@ -260,4 +261,6 @@ inline unsigned int UTF8Len(unsigned char x) {
   else return 0;
 }
 
+std::string md5(const std::string& in);
+
 #endif
diff --git a/utils/tdict.cc b/utils/tdict.cc
index c21b2b48..de234323 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -13,6 +13,10 @@ using namespace std;
 
 Dict TD::dict_;
 
+unsigned int TD::NumWords() {
+  return dict_.max();
+}
+
 WordID TD::Convert(const std::string& s) {
   return dict_.Convert(s);
 }
diff --git a/utils/ts.cc b/utils/ts.cc
index 3694e076..bf4f8f69 100644
--- a/utils/ts.cc
+++ b/utils/ts.cc
@@ -7,6 +7,7 @@
 #include "prob.h"
 #include "sparse_vector.h"
 #include "fast_sparse_vector.h"
+#include "stringlib.h"
 
 using namespace std;
 
@@ -79,6 +80,11 @@ int main() {
     y -= y;
   }
   cerr << "Counted " << c << " times\n";
+
+  cerr << md5("this is a test") << endl;
+  cerr << md5("some other ||| string is") << endl;
+  map<string,string> x; x["id"] = "12"; x["grammar"] = "/path/to/grammar.gz";
+  cerr << SGMLOpenSegTag(x) << endl;
   return 0;
 }
 
diff --git a/utils/weights.cc b/utils/weights.cc
index 6b7e58ed..ac407dfb 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -8,101 +8,150 @@
 
 using namespace std;
 
-void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+void Weights::InitFromFile(const string& filename,
+                           vector<weight_t>* pweights,
+                           vector<string>* feature_list) {
+  vector<weight_t>& weights = *pweights;
   if (!SILENT) cerr << "Reading weights from " << filename << endl;
   ReadFile in_file(filename);
   istream& in = *in_file.stream();
   assert(in);
-  int weight_count = 0;
-  bool fl = false;
-  string buf;
-  double val = 0;
-  while (in) {
-    getline(in, buf);
-    if (buf.size() == 0) continue;
-    if (buf[0] == '#') continue;
-    for (int i = 0; i < buf.size(); ++i)
-      if (buf[i] == '=') buf[i] = ' ';
-    int start = 0;
-    while(start < buf.size() && buf[start] == ' ') ++start;
-    int end = 0;
-    while(end < buf.size() && buf[end] != ' ') ++end;
-    const int fid = FD::Convert(buf.substr(start, end - start));
-    while(end < buf.size() && buf[end] == ' ') ++end;
-    val = strtod(&buf.c_str()[end], NULL);
-    if (isnan(val)) {
-      cerr << FD::Convert(fid) << " has weight NaN!\n";
-     abort();
+  
+  bool read_text = true;
+  if (1) {
+    ReadFile hdrrf(filename);
+    istream& hi = *hdrrf.stream();
+    assert(hi);
+    char buf[10];
+    hi.read(buf, 5);
+    assert(hi.good());
+    if (strncmp(buf, "_PHWf", 5) == 0) {
+      read_text = false;
+    }
+  }
+
+  if (read_text) {
+    int weight_count = 0;
+    bool fl = false;
+    string buf;
+    weight_t val = 0;
+    while (in) {
+      getline(in, buf);
+      if (buf.size() == 0) continue;
+      if (buf[0] == '#') continue;
+      if (buf[0] == ' ') {
+        cerr << "Weights file lines may not start with whitespace.\n" << buf << endl;
+        abort();
+      }
+      for (int i = buf.size() - 1; i > 0; --i)
+        if (buf[i] == '=' || buf[i] == '\t') { buf[i] = ' '; break; }
+      int start = 0;
+      while(start < buf.size() && buf[start] == ' ') ++start;
+      int end = 0;
+      while(end < buf.size() && buf[end] != ' ') ++end;
+      const int fid = FD::Convert(buf.substr(start, end - start));
+      if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); }
+      while(end < buf.size() && buf[end] == ' ') ++end;
+      val = strtod(&buf.c_str()[end], NULL);
+      if (isnan(val)) {
+        cerr << FD::Convert(fid) << " has weight NaN!\n";
+        abort();
+      }
+      if (weights.size() <= fid)
+        weights.resize(fid + 1);
+      weights[fid] = val;
+      ++weight_count;
+      if (!SILENT) {
+        if (weight_count %   50000 == 0) { cerr << '.' << flush; fl = true; }
+        if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; }
+      }
     }
-    if (wv_.size() <= fid)
-      wv_.resize(fid + 1);
-    wv_[fid] = val;
-    if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
-    ++weight_count;
     if (!SILENT) {
-      if (weight_count %   50000 == 0) { cerr << '.' << flush; fl = true; }
-      if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; }
+      if (fl) { cerr << endl; }
+      cerr << "Loaded " << weight_count << " feature weights\n";
+    }
+  } else {   // !read_text
+    char buf[6];
+    in.read(buf, 5);
+    size_t num_keys;
+    in.read(reinterpret_cast<char*>(&num_keys), sizeof(size_t));
+    if (num_keys != FD::NumFeats()) {
+      cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys << endl;
+      abort();
+    }
+    weights.resize(num_keys);
+    in.read(reinterpret_cast<char*>(&weights.front()), num_keys * sizeof(weight_t));
+    if (!in.good()) {
+      cerr << "Error loading weights!\n";
+      abort();
+    } else {
+      cerr << "  Successfully loaded " << (num_keys * sizeof(weight_t)) << " bytes\n";
     }
-  }
-  if (!SILENT) {
-    if (fl) { cerr << endl; }
-    cerr << "Loaded " << weight_count << " feature weights\n";
   }
 }
 
-void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const {
+void Weights::WriteToFile(const string& fname,
+                          const vector<weight_t>& weights,
+                          bool hide_zero_value_features,
+                          const string* extra) {
   WriteFile out(fname);
   ostream& o = *out.stream();
   assert(o);
-  if (extra) { o << "# " << *extra << endl; }
-  o.precision(17);
-  const int num_feats = FD::NumFeats();
-  for (int i = 1; i < num_feats; ++i) {
-    const double val = (i < wv_.size() ? wv_[i] : 0.0);
-    if (hide_zero_value_features && val == 0.0) continue;
-    o << FD::Convert(i) << ' ' << val << endl;
-  }
-}
+  bool write_text = !FD::UsingPerfectHashFunction();
 
-void Weights::InitVector(std::vector<double>* w) const {
-  *w = wv_;
-}
-
-void Weights::InitSparseVector(SparseVector<double>* w) const {
-  for (int i = 1; i < wv_.size(); ++i) {
-    const double& weight = wv_[i];
-    if (weight) w->set_value(i, weight);
+  if (write_text) {
+    if (extra) { o << "# " << *extra << endl; }
+    o.precision(17);
+    const int num_feats = FD::NumFeats();
+    for (int i = 1; i < num_feats; ++i) {
+      const weight_t val = (i < weights.size() ? weights[i] : 0.0);
+      if (hide_zero_value_features && val == 0.0) continue;
+      o << FD::Convert(i) << ' ' << val << endl;
+    }
+  } else {
+    o.write("_PHWf", 5);
+    const size_t keys = FD::NumFeats();
+    assert(keys <= weights.size());
+    o.write(reinterpret_cast<const char*>(&keys), sizeof(keys));
+    o.write(reinterpret_cast<const char*>(&weights[0]), keys * sizeof(weight_t));
   }
 }
 
-void Weights::InitFromVector(const std::vector<double>& w) {
-  wv_ = w;
-  if (wv_.size() > FD::NumFeats())
-    cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
-  wv_.resize(FD::NumFeats(), 0);
+void Weights::InitSparseVector(const vector<weight_t>& dv,
+                               SparseVector<weight_t>* sv) {
+  sv->clear();
+  for (unsigned i = 1; i < dv.size(); ++i) {
+    if (dv[i]) sv->set_value(i, dv[i]);
+  }
 }
 
-void Weights::InitFromVector(const SparseVector<double>& w) {
-  wv_.clear();
-  wv_.resize(FD::NumFeats(), 0.0);
-  for (int i = 1; i < FD::NumFeats(); ++i)
-    wv_[i] = w.value(i);
+void Weights::SanityCheck(const vector<weight_t>& w) {
+  for (int i = 0; i < w.size(); ++i) {
+    assert(!isnan(w[i]));
+    assert(!isinf(w[i]));
+  }
 }
 
-void Weights::SetWeight(SparseVector<double>* v, const string fname, const double w) {
-  WordID fid = FD::Convert(fname);
-  cout << "fid " << fid << endl;
-  SetWeight(v, fid, w);
-}
+struct FComp {
+  const vector<weight_t>& w_;
+  FComp(const vector<weight_t>& w) : w_(w) {}
+  bool operator()(int a, int b) const {
+    return fabs(w_[a]) > fabs(w_[b]);
+  }
+};
 
-void Weights::SetWeight(SparseVector<double>* v, const WordID fid, const double w) {
-  wv_.resize(FD::NumFeats(), 0.0);
-  wv_[fid] = w;
-  //v->set_value(fid, w); 
+void Weights::ShowLargestFeatures(const vector<weight_t>& w) {
+  vector<int> fnums(w.size());
+  for (int i = 0; i < w.size(); ++i)
+    fnums[i] = i;
+  vector<int>::iterator mid = fnums.begin();
+  mid += (w.size() > 10 ? 10 : w.size());
+  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
+  cerr << "TOP FEATURES:";
+  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
+    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
+  }
+  cerr << endl;
 }
 
-void Weights::sz()
-{
-  cout << "wv_.size() " << wv_.size() << endl;
-}
 
diff --git a/utils/weights.h b/utils/weights.h
index 86701add..30f71db0 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -2,25 +2,29 @@
 #define _WEIGHTS_H_
 
 #include <string>
-#include <map>
 #include <vector>
 #include "sparse_vector.h"
 
+// warning: in the future this will become float
+typedef double weight_t;
+
 class Weights {
  public:
-  Weights() {}
-  void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
-  void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const;
-  void InitVector(std::vector<double>* w) const;
-  void InitSparseVector(SparseVector<double>* w) const;
-  void InitFromVector(const std::vector<double>& w);
-  void InitFromVector(const SparseVector<double>& w);
-  void SetWeight(SparseVector<double>* v, const std::string f, const double w);
-  void SetWeight(SparseVector<double>* v, const WordID fid, const double w);
-  std::vector<double>* getw() { return &wv_; }; // probably a hack
-  void sz();
+  static void InitFromFile(const std::string& fname,
+                           std::vector<weight_t>* weights,
+                           std::vector<std::string>* feature_list = NULL);
+  static void WriteToFile(const std::string& fname,
+                          const std::vector<weight_t>& weights,
+                          bool hide_zero_value_features = true,
+                          const std::string* extra = NULL);
+  static void InitSparseVector(const std::vector<weight_t>& dv,
+                               SparseVector<weight_t>* sv);
+  // check for infinities, NaNs, etc
+  static void SanityCheck(const std::vector<weight_t>& w);
+  // write weights with largest magnitude to cerr
+  static void ShowLargestFeatures(const std::vector<weight_t>& w);
  private:
-  std::vector<double> wv_;
+  Weights();
 };
 
 #endif
diff --git a/utils/weights_test.cc b/utils/weights_test.cc
index 8a4c26ef..938b311f 100644
--- a/utils/weights_test.cc
+++ b/utils/weights_test.cc
@@ -14,11 +14,10 @@ class WeightsTest : public testing::Test {
   virtual void TearDown() { }
 };
        
-
 TEST_F(WeightsTest,Load) {
-  Weights w;
-  w.InitFromFile("test_data/weights");
-  w.WriteToFile("-");
+  vector<weight_t> v;
+  Weights::InitFromFile("test_data/weights", &v);
+  Weights::WriteToFile("-", v);
 }
 
 int main(int argc, char **argv) {
author	Patrick Simianer <p@simianer.de>	2011-10-20 02:31:25 +0200
committer	Patrick Simianer <p@simianer.de>	2011-10-20 02:31:25 +0200
commit	a5a92ebe23c5819ed104313426012011e32539da (patch)
tree	3416818c758d5ece4e71fe522c571e75ea04f100 /utils
parent	b88332caac2cbe737c99b8098813f868ca876d8b (diff)
parent	78baccbb4231bb84a456702d4f574f8e601a8182 (diff)