From 2c4453984060dd039b8a99e5a8d98dbc107588b9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 12 Aug 2012 21:56:58 -0400 Subject: possible errors with google hashes --- gi/pf/pyp_lm.cc | 78 ++++++++++++++++++++++++++++++++++++++++++---- utils/fast_sparse_vector.h | 8 ++++- utils/hash.h | 6 ++-- 3 files changed, 83 insertions(+), 9 deletions(-) diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 7cec437a..605d8206 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -6,6 +6,7 @@ #include #include +#include "gamma_poisson.h" #include "corpus_tools.h" #include "m.h" #include "tdict.h" @@ -59,11 +60,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -template struct PYPLM; - -// uniform base distribution (0-gram model) -template<> struct PYPLM<0> { - PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} +// uniform distribution over a fixed vocabulary +struct UniformVocabulary { + UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} void increment(WordID, const vector&, MT19937*) { ++draws; } void decrement(WordID, const vector&, MT19937*) { --draws; assert(draws >= 0); } double prob(WordID, const vector&) const { return p0; } @@ -73,6 +72,73 @@ template<> struct PYPLM<0> { int draws; }; +// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS. +// Journal of Statistical Planning and Inference 14 (1986) 311-322 +struct PoissonLengthUniformCharWordModel { + explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {} + void increment(WordID w, const vector& v, MT19937*) { + llh += log(prob(w, v)); // this isn't quite right + plen.increment(TD::Convert(w).size() - 1); + } + void decrement(WordID w, const vector& v, MT19937*) { + plen.decrement(TD::Convert(w).size() - 1); + llh -= log(prob(w, v)); // this isn't quite right + } + double prob(WordID w, const vector&) const { + const unsigned len = TD::Convert(w).size(); + return plen.prob(len - 1) * exp(uc * len); + } + double log_likelihood() const { return llh; } + void resample_hyperparameters(MT19937*) {} + GammaPoisson plen; + const double uc; + double llh; +}; + +struct PYPAdaptedPoissonLengthUniformCharWordModel { + explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : + base(vocab_size,1,1,1,1), + crp(1,1,1,1) {} + void increment(WordID w, const vector& v, MT19937* rng) { + double p0 = base.prob(w, v); + if (crp.increment(w, p0, rng)) + base.increment(w, v, rng); + } + void decrement(WordID w, const vector& v, MT19937* rng) { + if (crp.decrement(w, rng)) + base.decrement(w, v, rng); + } + double prob(WordID w, const vector& v) const { + double p0 = base.prob(w, v); + return crp.prob(w, p0); + } + double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); } + void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); } + PoissonLengthUniformCharWordModel base; + CCRP crp; +}; + +template struct PYPLM; + +#if 1 +template<> struct PYPLM<0> : public UniformVocabulary { + PYPLM(unsigned vs, double a, double b, double c, double d) : + UniformVocabulary(vs, a, b, c, d) {} +}; +#else +#if 0 +template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel { + PYPLM(unsigned vs, double a, double b, double c, double d) : + PoissonLengthUniformCharWordModel(vs, a, b, c, d) {} +}; +#else +template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel { + PYPLM(unsigned vs, double a, double b, double c, double d) : + PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {} +}; +#endif +#endif + // represents an N-gram LM template struct PYPLM { PYPLM(unsigned vs, double da, double db, double ss, double sr) : @@ -170,7 +236,7 @@ int main(int argc, char** argv) { } if (SS % 10 == 9) { cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; - if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + if (SS % 30 == 29) lm.resample_hyperparameters(&rng); } else { cerr << '.' << flush; } } double llh = 0; diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 5647a2a9..9fe00459 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef HAVE_CONFIG_H #include "config.h" @@ -192,6 +193,7 @@ class FastSparseVector { } else { is_remote_ = true; data_.rbmap = new SPARSE_HASH_MAP(first, last); + HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits::max()); } } void erase(unsigned k) { @@ -213,8 +215,11 @@ class FastSparseVector { if (&other == this) return *this; clear(); std::memcpy(this, &other, sizeof(FastSparseVector)); - if (is_remote_) + if (is_remote_) { data_.rbmap = new SPARSE_HASH_MAP(*data_.rbmap); + // TODO: do i need to set_deleted on a copy? + HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits::max()); + } return *this; } T const& get_singleton() const { @@ -445,6 +450,7 @@ class FastSparseVector { SPARSE_HASH_MAP* m = new SPARSE_HASH_MAP( reinterpret_cast*>(&data_.local[0]), reinterpret_cast*>(&data_.local[local_size_]), local_size_ * 1.5 + 1); + HASH_MAP_DELETED(*m, std::numeric_limits::max()); data_.rbmap = m; is_remote_ = true; } diff --git a/utils/hash.h b/utils/hash.h index 6d992086..189ed1ae 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -16,14 +16,16 @@ # define SPARSE_HASH_MAP google::sparse_hash_map # define HASH_MAP google::dense_hash_map # define HASH_SET google::dense_hash_set -# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0) -# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0) +# define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0) +# define HASH_MAP_RESERVED(h,empty,deleted) do { (h).set_empty_key(empty); (h).set_deleted_key(deleted); } while(0) +# define HASH_MAP_EMPTY(h,empty) do { (h).set_empty_key(empty); } while(0) #else # include # include # define SPARSE_HASH_MAP std::tr1::unordered_map # define HASH_MAP std::tr1::unordered_map # define HASH_SET std::tr1::unordered_set +# define HASH_MAP_DELETED(h,deleted) # define HASH_MAP_RESERVED(h,empty,deleted) # define HASH_MAP_EMPTY(h,empty) #endif -- cgit v1.2.3