diff options
| author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-08-09 01:18:32 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-08-09 01:18:32 -0400 | 
| commit | 2a9c9a414abc074ec4ea8a5494e8dd50e1f94d70 (patch) | |
| tree | f0bab000a53595e2de5b138accac10b90322c6fe | |
| parent | bc2992ba96cd7af83da8522bdeb6e5dd94a5a11b (diff) | |
gamma-poisson word length model
| -rw-r--r-- | utils/Makefile.am | 5 | ||||
| -rw-r--r-- | utils/gamma_poisson.h | 33 | ||||
| -rw-r--r-- | utils/tdict.cc | 19 | ||||
| -rw-r--r-- | utils/tdict.h | 21 | ||||
| -rw-r--r-- | utils/unigram_pyp_lm.cc | 90 | 
5 files changed, 121 insertions, 47 deletions
| diff --git a/utils/Makefile.am b/utils/Makefile.am index 386344dd..799ec879 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -9,7 +9,8 @@ noinst_PROGRAMS = \    m_test \    weights_test \    logval_test \ -  small_vector_test +  small_vector_test \ +  unigram_pyp_lm  TESTS = ts mfcr_test crp_test small_vector_test logval_test weights_test dict_test m_test @@ -57,6 +58,8 @@ logval_test_SOURCES = logval_test.cc  logval_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz  small_vector_test_SOURCES = small_vector_test.cc  small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz +unigram_pyp_lm_SOURCES = unigram_pyp_lm.cc +unigram_pyp_lm_LDADD = libutils.a -lz  ################################################################  # do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### diff --git a/utils/gamma_poisson.h b/utils/gamma_poisson.h new file mode 100644 index 00000000..fec763f6 --- /dev/null +++ b/utils/gamma_poisson.h @@ -0,0 +1,33 @@ +#ifndef _GAMMA_POISSON_H_ +#define _GAMMA_POISSON_H_ + +#include <m.h> + +// http://en.wikipedia.org/wiki/Conjugate_prior +struct GammaPoisson { +  GammaPoisson(double shape, double rate) : +    a(shape), b(rate), n(), marginal() {} + +  double prob(unsigned x) const { +    return exp(Md::log_negative_binom(x, a + marginal, 1.0 - (b + n) / (1 + b + n))); +  } + +  void increment(unsigned x) { +    ++n; +    marginal += x; +  } + +  void decrement(unsigned x) { +    --n; +    marginal -= x; +  } + +  double log_likelihood() const { +    return 0; +  } + +  double a, b; +  int n, marginal; +}; + +#endif diff --git a/utils/tdict.cc b/utils/tdict.cc index f33bd576..fd2b76cb 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -13,22 +13,6 @@ using namespace std;  Dict TD::dict_; -unsigned int TD::NumWords() { -  return dict_.max(); -} - -WordID TD::Convert(const std::string& s) { -  return dict_.Convert(s); -} - -WordID TD::Convert(char const* s) { -  return dict_.Convert(string(s)); -} - -const char* TD::Convert(WordID w) { -  return dict_.Convert(w).c_str(); -} -  void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {    ids->clear();    for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i) @@ -57,7 +41,8 @@ std::string TD::GetString(WordID const* i,WordID const* e) {  int TD::AppendString(const WordID& w, int pos, int bufsize, char* buffer)  { -  const char* word = TD::Convert(w); +  const string& s = TD::Convert(w); +  const char* word = s.c_str();    const char* const end_buf = buffer + bufsize;    char* dest = buffer + pos;    while(dest < end_buf && *word) { diff --git a/utils/tdict.h b/utils/tdict.h index 393146fa..03afc2e6 100644 --- a/utils/tdict.h +++ b/utils/tdict.h @@ -3,10 +3,9 @@  #include <string>  #include <vector> +#include <cassert>  #include "wordid.h" -#include <assert.h> - -class Dict; +#include "dict.h"  struct TD {    static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far @@ -15,10 +14,18 @@ struct TD {    static std::string GetString(const std::vector<WordID>& str);    static std::string GetString(WordID const* i,WordID const* e);    static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); -  static unsigned int NumWords(); -  static WordID Convert(const std::string& s); -  static WordID Convert(char const* s); -  static const char* Convert(WordID w); +  static unsigned int NumWords() { +    return dict_.max(); +  } +  static WordID Convert(const std::string& s) { +    return dict_.Convert(s); +  } +  static WordID Convert(char const* s) { +    return dict_.Convert(std::string(s)); +  } +  static const std::string& Convert(WordID w) { +    return dict_.Convert(w); +  }   private:    static Dict dict_;  }; diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc index 510e8839..30b9fde1 100644 --- a/utils/unigram_pyp_lm.cc +++ b/utils/unigram_pyp_lm.cc @@ -11,6 +11,7 @@  #include "tdict.h"  #include "sampler.h"  #include "ccrp.h" +#include "gamma_poisson.h"  // A not very memory-efficient implementation of an 1-gram LM based on PYPs  // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model @@ -54,49 +55,90 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {    }  } +struct Histogram { +  void increment(unsigned bin, unsigned delta = 1u) { +    data[bin] += delta; +  } +  void decrement(unsigned bin, unsigned delta = 1u) { +    data[bin] -= delta; +  } +  void move(unsigned from_bin, unsigned to_bin, unsigned delta = 1u) { +    decrement(from_bin, delta); +    increment(to_bin, delta); +  } +  map<unsigned, unsigned> data; +  // SparseVector<unsigned> data; +}; + +// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS. +// Journal of Statistical Planning and Inference 14 (1986) 311-322 +struct PoissonLengthUniformCharWordModel { +  explicit PoissonLengthUniformCharWordModel(unsigned vocab_size) : plen(5,5), uc(-log(50)), llh() {} +  void increment(WordID w, MT19937*) { +    llh += log(prob(w)); // this isn't quite right +    plen.increment(TD::Convert(w).size() - 1); +  } +  void decrement(WordID w, MT19937*) { +    plen.decrement(TD::Convert(w).size() - 1); +    llh -= log(prob(w)); // this isn't quite right +  } +  double prob(WordID w) const { +    size_t len = TD::Convert(w).size(); +    return plen.prob(len - 1) * exp(uc * len); +  } +  double log_likelihood() const { return llh; } +  void resample_hyperparameters(MT19937*) {} +  GammaPoisson plen; +  const double uc; +  double llh; +}; +  // uniform base distribution (0-gram model)  struct UniformWordModel {    explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {} -  void increment() { ++draws; } -  void decrement() { --draws; assert(draws >= 0); } +  void increment(WordID, MT19937*) { ++draws; } +  void decrement(WordID, MT19937*) { --draws; assert(draws >= 0); }    double prob(WordID) const { return p0; } // all words have equal prob    double log_likelihood() const { return draws * log(p0); } +  void resample_hyperparameters(MT19937*) {}    const double p0;    int draws;  };  // represents an Unigram LM +template <class BaseGenerator>  struct UnigramLM {    UnigramLM(unsigned vs, double da, double db, double ss, double sr) : -      uniform_vocab(vs), +      base(vs),        crp(da, db, ss, sr, 0.8, 1.0) {}    void increment(WordID w, MT19937* rng) { -    const double backoff = uniform_vocab.prob(w); +    const double backoff = base.prob(w);      if (crp.increment(w, backoff, rng)) -      uniform_vocab.increment(); +      base.increment(w, rng);    }    void decrement(WordID w, MT19937* rng) {      if (crp.decrement(w, rng)) -      uniform_vocab.decrement(); +      base.decrement(w, rng);    }    double prob(WordID w) const { -    const double backoff = uniform_vocab.prob(w); +    const double backoff = base.prob(w);      return crp.prob(w, backoff);    }    double log_likelihood() const { -    double llh = uniform_vocab.log_likelihood(); +    double llh = base.log_likelihood();      llh += crp.log_crp_prob();      return llh;    }    void resample_hyperparameters(MT19937* rng) {      crp.resample_hyperparameters(rng); +    base.resample_hyperparameters(rng);    }    double discount_a, discount_b, strength_s, strength_r;    double d, strength; -  UniformWordModel uniform_vocab; +  BaseGenerator base;    CCRP<WordID> crp;  }; @@ -121,15 +163,19 @@ int main(int argc, char** argv) {      CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);    else      test = corpuse; -  UnigramLM lm(vocabe.size(), -               conf["discount_prior_a"].as<double>(), -               conf["discount_prior_b"].as<double>(), -               conf["strength_prior_s"].as<double>(), -               conf["strength_prior_r"].as<double>()); -  for (int SS=0; SS < samples; ++SS) { -    for (int ci = 0; ci < corpuse.size(); ++ci) { +#if 1 +  UnigramLM<PoissonLengthUniformCharWordModel> lm(vocabe.size(), +#else +  UnigramLM<UniformWordModel> lm(vocabe.size(), +#endif +                                 conf["discount_prior_a"].as<double>(), +                                 conf["discount_prior_b"].as<double>(), +                                 conf["strength_prior_s"].as<double>(), +                                 conf["strength_prior_r"].as<double>()); +  for (unsigned SS=0; SS < samples; ++SS) { +    for (unsigned ci = 0; ci < corpuse.size(); ++ci) {        const vector<WordID>& s = corpuse[ci]; -      for (int i = 0; i <= s.size(); ++i) { +      for (unsigned i = 0; i <= s.size(); ++i) {          WordID w = (i < s.size() ? s[i] : kEOS);          if (SS > 0) lm.decrement(w, &rng);          lm.increment(w, &rng); @@ -137,21 +183,21 @@ int main(int argc, char** argv) {        if (SS > 0) lm.decrement(kEOS, &rng);        lm.increment(kEOS, &rng);      } -    cerr << "LLH=" << lm.log_likelihood() << endl; -    //if (SS % 10 == 9) lm.resample_hyperparameters(&rng); +    cerr << "LLH=" << lm.log_likelihood() << "\t tables=" << lm.crp.num_tables() << " " << endl; +    if (SS % 10 == 9) lm.resample_hyperparameters(&rng);    }    double llh = 0;    unsigned cnt = 0;    unsigned oovs = 0; -  for (int ci = 0; ci < test.size(); ++ci) { +  for (unsigned ci = 0; ci < test.size(); ++ci) {      const vector<WordID>& s = test[ci]; -    for (int i = 0; i <= s.size(); ++i) { +    for (unsigned i = 0; i <= s.size(); ++i) {        WordID w = (i < s.size() ? s[i] : kEOS);        double lp = log(lm.prob(w)) / log(2);        if (i < s.size() && vocabe.count(w) == 0) {          cerr << "**OOV ";          ++oovs; -        lp = 0; +        //lp = 0;        }        cerr << "p(" << TD::Convert(w) << ") = " << lp << endl;        llh -= lp; | 
