From 1cdb086a30abc0ea2f616ddbb008cda215f6f256 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cab.ark.cs.cmu.edu>
Date: Sun, 12 Aug 2012 21:56:58 -0400
Subject: possible errors with google hashes

---
 gi/pf/pyp_lm.cc            | 78 ++++++++++++++++++++++++++++++++++++++++++----
 utils/fast_sparse_vector.h |  8 ++++-
 utils/hash.h               |  6 ++--
 3 files changed, 83 insertions(+), 9 deletions(-)
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 7cec437a..605d8206 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -6,6 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "gamma_poisson.h"
 #include "corpus_tools.h"
 #include "m.h"
 #include "tdict.h"
@@ -59,11 +60,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-template <unsigned N> struct PYPLM;
-
-// uniform base distribution (0-gram model)
-template<> struct PYPLM<0> {
-  PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
+// uniform distribution over a fixed vocabulary
+struct UniformVocabulary {
+  UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
   void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
   void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
   double prob(WordID, const vector<WordID>&) const { return p0; }
@@ -73,6 +72,73 @@ template<> struct PYPLM<0> {
   int draws;
 };
 
+// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS.
+// Journal of Statistical Planning and Inference 14 (1986) 311-322
+struct PoissonLengthUniformCharWordModel {
+  explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {}
+  void increment(WordID w, const vector<WordID>& v, MT19937*) {
+    llh += log(prob(w, v)); // this isn't quite right
+    plen.increment(TD::Convert(w).size() - 1);
+  }
+  void decrement(WordID w, const vector<WordID>& v, MT19937*) {
+    plen.decrement(TD::Convert(w).size() - 1);
+    llh -= log(prob(w, v)); // this isn't quite right
+  }
+  double prob(WordID w, const vector<WordID>&) const {
+    const unsigned len = TD::Convert(w).size();
+    return plen.prob(len - 1) * exp(uc * len);
+  }
+  double log_likelihood() const { return llh; }
+  void resample_hyperparameters(MT19937*) {}
+  GammaPoisson plen;
+  const double uc;
+  double llh;
+};
+
+struct PYPAdaptedPoissonLengthUniformCharWordModel {
+  explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) :
+    base(vocab_size,1,1,1,1),
+    crp(1,1,1,1) {}
+  void increment(WordID w, const vector<WordID>& v, MT19937* rng) {
+    double p0 = base.prob(w, v);
+    if (crp.increment(w, p0, rng))
+      base.increment(w, v, rng);
+  }
+  void decrement(WordID w, const vector<WordID>& v, MT19937* rng) {
+    if (crp.decrement(w, rng))
+      base.decrement(w, v, rng);
+  }
+  double prob(WordID w, const vector<WordID>& v) const {
+    double p0 = base.prob(w, v);
+    return crp.prob(w, p0);
+  }
+  double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); }
+  void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); }
+  PoissonLengthUniformCharWordModel base;
+  CCRP<WordID> crp;
+};
+
+template <unsigned N> struct PYPLM;
+
+#if 1
+template<> struct PYPLM<0> : public UniformVocabulary {
+  PYPLM(unsigned vs, double a, double b, double c, double d) :
+    UniformVocabulary(vs, a, b, c, d) {}
+};
+#else
+#if 0
+template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel {
+  PYPLM(unsigned vs, double a, double b, double c, double d) :
+    PoissonLengthUniformCharWordModel(vs, a, b, c, d) {}
+};
+#else
+template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel {
+  PYPLM(unsigned vs, double a, double b, double c, double d) :
+    PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {}
+};
+#endif
+#endif
+
 // represents an N-gram LM
 template <unsigned N> struct PYPLM {
   PYPLM(unsigned vs, double da, double db, double ss, double sr) :
@@ -170,7 +236,7 @@ int main(int argc, char** argv) {
     }
     if (SS % 10 == 9) {
       cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
-      if (SS % 20 == 19) lm.resample_hyperparameters(&rng);
+      if (SS % 30 == 29) lm.resample_hyperparameters(&rng);
     } else { cerr << '.' << flush; }
   }
   double llh = 0;
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 5647a2a9..9fe00459 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -13,6 +13,7 @@
 #include <map>
 #include <cassert>
 #include <vector>
+#include <limits>
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -192,6 +193,7 @@ class FastSparseVector {
     } else {
       is_remote_ = true;
       data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(first, last);
+      HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits<unsigned>::max());
     }
   }
   void erase(unsigned k) {
@@ -213,8 +215,11 @@ class FastSparseVector {
     if (&other == this) return *this;
     clear();
     std::memcpy(this, &other, sizeof(FastSparseVector));
-    if (is_remote_)
+    if (is_remote_) {
       data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(*data_.rbmap);
+      // TODO: do i need to set_deleted on a copy?
+      HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits<unsigned>::max());
+    }
     return *this;
   }
   T const& get_singleton() const {
@@ -445,6 +450,7 @@ class FastSparseVector {
       SPARSE_HASH_MAP<unsigned, T>* m = new SPARSE_HASH_MAP<unsigned, T>(
          reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[0]),
          reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[local_size_]), local_size_ * 1.5 + 1);
+      HASH_MAP_DELETED(*m, std::numeric_limits<unsigned>::max());
       data_.rbmap = m;
       is_remote_ = true;
     }
diff --git a/utils/hash.h b/utils/hash.h
index 6d992086..189ed1ae 100644
--- a/utils/hash.h
+++ b/utils/hash.h
@@ -16,14 +16,16 @@
 # define SPARSE_HASH_MAP google::sparse_hash_map
 # define HASH_MAP google::dense_hash_map
 # define HASH_SET google::dense_hash_set
-# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0)
-# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0)
+# define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0)
+# define HASH_MAP_RESERVED(h,empty,deleted) do { (h).set_empty_key(empty); (h).set_deleted_key(deleted); } while(0)
+# define HASH_MAP_EMPTY(h,empty) do { (h).set_empty_key(empty); } while(0)
 #else
 # include <tr1/unordered_map>
 # include <tr1/unordered_set>
 # define SPARSE_HASH_MAP std::tr1::unordered_map
 # define HASH_MAP std::tr1::unordered_map
 # define HASH_SET std::tr1::unordered_set
+# define HASH_MAP_DELETED(h,deleted)
 # define HASH_MAP_RESERVED(h,empty,deleted)
 # define HASH_MAP_EMPTY(h,empty)
 #endif
-- 
cgit v1.2.3