author     Chris Dyer <cdyer@cab.ark.cs.cmu.edu>   2012-08-12 21:56:58 -0400
committer  Chris Dyer <cdyer@cab.ark.cs.cmu.edu>   2012-08-12 21:56:58 -0400
commit     2c4453984060dd039b8a99e5a8d98dbc107588b9 (patch)
tree       022960fd3edc1b97260cfef67b804879ecb6c1f4
parent     08030aee204975c2d187c5a8d0dabbc67799a104 (diff)
possible errors with google hashes
-rw-r--r--  gi/pf/pyp_lm.cc              78
-rw-r--r--  utils/fast_sparse_vector.h    8
-rw-r--r--  utils/hash.h                  6
3 files changed, 83 insertions, 9 deletions
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 7cec437a..605d8206 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -6,6 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "gamma_poisson.h"
#include "corpus_tools.h"
#include "m.h"
#include "tdict.h"
@@ -59,11 +60,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
}
-template <unsigned N> struct PYPLM;
-
-// uniform base distribution (0-gram model)
-template<> struct PYPLM<0> {
- PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
+// uniform distribution over a fixed vocabulary
+struct UniformVocabulary {
+ UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {}
void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
double prob(WordID, const vector<WordID>&) const { return p0; }
@@ -73,6 +72,73 @@ template<> struct PYPLM<0> {
int draws;
};
+// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS.
+// Journal of Statistical Planning and Inference 14 (1986) 311-322
+struct PoissonLengthUniformCharWordModel {
+ explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {}
+ void increment(WordID w, const vector<WordID>& v, MT19937*) {
+ llh += log(prob(w, v)); // this isn't quite right
+ plen.increment(TD::Convert(w).size() - 1);
+ }
+ void decrement(WordID w, const vector<WordID>& v, MT19937*) {
+ plen.decrement(TD::Convert(w).size() - 1);
+ llh -= log(prob(w, v)); // this isn't quite right
+ }
+ double prob(WordID w, const vector<WordID>&) const {
+ const unsigned len = TD::Convert(w).size();
+ return plen.prob(len - 1) * exp(uc * len);
+ }
+ double log_likelihood() const { return llh; }
+ void resample_hyperparameters(MT19937*) {}
+ GammaPoisson plen;
+ const double uc;
+ double llh;
+};
+
+struct PYPAdaptedPoissonLengthUniformCharWordModel {
+ explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) :
+ base(vocab_size,1,1,1,1),
+ crp(1,1,1,1) {}
+ void increment(WordID w, const vector<WordID>& v, MT19937* rng) {
+ double p0 = base.prob(w, v);
+ if (crp.increment(w, p0, rng))
+ base.increment(w, v, rng);
+ }
+ void decrement(WordID w, const vector<WordID>& v, MT19937* rng) {
+ if (crp.decrement(w, rng))
+ base.decrement(w, v, rng);
+ }
+ double prob(WordID w, const vector<WordID>& v) const {
+ double p0 = base.prob(w, v);
+ return crp.prob(w, p0);
+ }
+ double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); }
+ void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); }
+ PoissonLengthUniformCharWordModel base;
+ CCRP<WordID> crp;
+};
+
+template <unsigned N> struct PYPLM;
+
+#if 1
+template<> struct PYPLM<0> : public UniformVocabulary {
+ PYPLM(unsigned vs, double a, double b, double c, double d) :
+ UniformVocabulary(vs, a, b, c, d) {}
+};
+#else
+#if 0
+template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel {
+ PYPLM(unsigned vs, double a, double b, double c, double d) :
+ PoissonLengthUniformCharWordModel(vs, a, b, c, d) {}
+};
+#else
+template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel {
+ PYPLM(unsigned vs, double a, double b, double c, double d) :
+ PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {}
+};
+#endif
+#endif
+
// represents an N-gram LM
template <unsigned N> struct PYPLM {
PYPLM(unsigned vs, double da, double db, double ss, double sr) :
@@ -170,7 +236,7 @@ int main(int argc, char** argv) {
}
if (SS % 10 == 9) {
cerr << " [LLH=" << lm.log_likelihood() << "]" << endl;
- if (SS % 20 == 19) lm.resample_hyperparameters(&rng);
+ if (SS % 30 == 29) lm.resample_hyperparameters(&rng);
} else { cerr << '.' << flush; }
}
double llh = 0;
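
Note on the new base distribution above: PoissonLengthUniformCharWordModel scores a word as a length prior times a uniform per-character model, i.e. plen.prob(|w| - 1) * exp(uc * |w|) with uc = -log(95), following Rothschild's observation that dictionary word lengths are roughly Poisson (the 95 is presumably the printable ASCII characters). Below is a minimal standalone sketch of that factorization only, with a fixed Poisson rate standing in for the collapsed Gamma-Poisson posterior; the rate 5.0 is just an illustrative stand-in for the plen(5,5) prior, not the commit's actual code.

#include <cmath>
#include <cstdio>
#include <string>

// Poisson log-pmf: log P(k | lambda) = k*log(lambda) - lambda - log(k!)
double log_poisson(unsigned k, double lambda) {
  return k * std::log(lambda) - lambda - std::lgamma(k + 1.0);
}

// p(w) = Poisson(|w| - 1 | lambda) * (1/95)^|w|
// The (1/95)^|w| factor is a uniform model over 95 characters, matching the
// uc(-log(95)) constant in the diff; lambda is a fixed stand-in for the
// Gamma-Poisson length model.
double word_log_prob(const std::string& w, double lambda = 5.0) {
  const unsigned len = w.size();
  const double uc = -std::log(95.0);   // log(1/95) per character
  return log_poisson(len - 1, lambda) + len * uc;
}

int main() {
  std::printf("p(the)      = %g\n", std::exp(word_log_prob("the")));
  std::printf("p(language) = %g\n", std::exp(word_log_prob("language")));
  return 0;
}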
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 5647a2a9..9fe00459 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -13,6 +13,7 @@
#include <map>
#include <cassert>
#include <vector>
+#include <limits>
#ifdef HAVE_CONFIG_H
#include "config.h"
@@ -192,6 +193,7 @@ class FastSparseVector {
} else {
is_remote_ = true;
data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(first, last);
+ HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits<unsigned>::max());
}
}
void erase(unsigned k) {
@@ -213,8 +215,11 @@ class FastSparseVector {
if (&other == this) return *this;
clear();
std::memcpy(this, &other, sizeof(FastSparseVector));
- if (is_remote_)
+ if (is_remote_) {
data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(*data_.rbmap);
+ // TODO: do i need to set_deleted on a copy?
+ HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits<unsigned>::max());
+ }
return *this;
}
T const& get_singleton() const {
@@ -445,6 +450,7 @@ class FastSparseVector {
SPARSE_HASH_MAP<unsigned, T>* m = new SPARSE_HASH_MAP<unsigned, T>(
reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[0]),
reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[local_size_]), local_size_ * 1.5 + 1);
+ HASH_MAP_DELETED(*m, std::numeric_limits<unsigned>::max());
data_.rbmap = m;
is_remote_ = true;
}
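
Note on the hunks above: google::sparse_hash_map requires a deleted key to be registered before erase() is ever called, which is why HASH_MAP_DELETED is now invoked after every construction of data_.rbmap, including the copy made in operator= (hence the TODO about copies). A standalone sketch of how one might check that question empirically, assuming Google sparsehash is installed; the <google/...> include path is the older layout, newer releases ship <sparsehash/...>.

#include <cstdio>
#include <limits>
#include <google/sparse_hash_map>

int main() {
  google::sparse_hash_map<unsigned, double> a;
  a.set_deleted_key(std::numeric_limits<unsigned>::max());
  a[1] = 1.0;
  a[2] = 2.0;

  // Copy-construct, roughly what FastSparseVector's operator= does with
  // *data_.rbmap after the memcpy.
  google::sparse_hash_map<unsigned, double> b(a);

  // If the deleted key were lost in the copy, sparsehash would trip its
  // internal "set_deleted_key() before erase()" check on this call.
  b.erase(1);
  std::printf("copy holds %lu entries after erase\n", (unsigned long)b.size());
  return 0;
}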
diff --git a/utils/hash.h b/utils/hash.h
index 6d992086..189ed1ae 100644
--- a/utils/hash.h
+++ b/utils/hash.h
@@ -16,14 +16,16 @@
# define SPARSE_HASH_MAP google::sparse_hash_map
# define HASH_MAP google::dense_hash_map
# define HASH_SET google::dense_hash_set
-# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0)
-# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0)
+# define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0)
+# define HASH_MAP_RESERVED(h,empty,deleted) do { (h).set_empty_key(empty); (h).set_deleted_key(deleted); } while(0)
+# define HASH_MAP_EMPTY(h,empty) do { (h).set_empty_key(empty); } while(0)
#else
# include <tr1/unordered_map>
# include <tr1/unordered_set>
# define SPARSE_HASH_MAP std::tr1::unordered_map
# define HASH_MAP std::tr1::unordered_map
# define HASH_SET std::tr1::unordered_set
+# define HASH_MAP_DELETED(h,deleted)
# define HASH_MAP_RESERVED(h,empty,deleted)
# define HASH_MAP_EMPTY(h,empty)
#endif
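
Note on the macros above: the Google containers need reserved empty/deleted keys before use (dense_hash_map will not accept inserts until set_empty_key is called, and neither container allows erase() without set_deleted_key), while the tr1 fallback needs neither, so the macros expand to no-ops there. A minimal sketch of the same pattern outside cdec; the HAVE_SPARSEHASH guard and the MY_* names are placeholders for illustration, not the project's actual hash.h.

#include <cstdio>
#include <limits>

#ifdef HAVE_SPARSEHASH
# include <google/dense_hash_map>
# define MY_HASH_MAP google::dense_hash_map
# define MY_HASH_MAP_RESERVED(h,empty,deleted) \
    do { (h).set_empty_key(empty); (h).set_deleted_key(deleted); } while(0)
#else
# include <tr1/unordered_map>
# define MY_HASH_MAP std::tr1::unordered_map
# define MY_HASH_MAP_RESERVED(h,empty,deleted)
#endif

int main() {
  MY_HASH_MAP<unsigned, int> counts;
  // Mandatory for dense_hash_map before insert/erase (empty and deleted keys
  // must differ); expands to nothing for the tr1 fallback.
  MY_HASH_MAP_RESERVED(counts,
                       std::numeric_limits<unsigned>::max(),
                       std::numeric_limits<unsigned>::max() - 1);
  counts[42] = 7;
  counts.erase(42);
  std::printf("size=%lu\n", (unsigned long)counts.size());
  return 0;
}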