diff options
Diffstat (limited to 'gi/pyp-topics')
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 13 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.hh | 23 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp.hh | 32 | ||||
| -rw-r--r-- | gi/pyp-topics/src/train-contexts.cc | 8 | ||||
| -rw-r--r-- | gi/pyp-topics/src/train.cc | 8 | 
5 files changed, 57 insertions, 27 deletions
| diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index 2ad9d080..2b96816e 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -5,7 +5,6 @@  #endif  #include "pyp-topics.hh" -//#include "mt19937ar.h"  #include <boost/date_time/posix_time/posix_time_types.hpp>  #include <time.h> @@ -46,13 +45,13 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,    {      m_word_pyps.at(i).reserve(m_num_topics);      for (int j=0; j<m_num_topics; ++j) -      m_word_pyps.at(i).push_back(new PYP<int>(0.5, 1.0)); +      m_word_pyps.at(i).push_back(new PYP<int>(0.5, 1.0, m_seed));    }    std::cerr << std::endl;    m_document_pyps.reserve(corpus.num_documents());    for (int j=0; j<corpus.num_documents(); ++j) -    m_document_pyps.push_back(new PYP<int>(0.5, 1.0)); +    m_document_pyps.push_back(new PYP<int>(0.5, 1.0, m_seed));    m_topic_p0 = 1.0/m_num_topics;    m_term_p0 = 1.0/corpus.num_types(); @@ -118,8 +117,10 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,      int tmp;      for (int i = corpus.num_documents()-1; i > 0; --i)      { -    	int j = (int)(mt_genrand_real1() * i); -    	tmp = randomDocIndices[i]; +        //i+1 since j \in [0,i] but rnd() \in [0,1) +    	int j = (int)(rnd() * (i+1)); +      assert(j >= 0 && j <= i); +     	tmp = randomDocIndices[i];      	randomDocIndices[i] = randomDocIndices[j];      	randomDocIndices[j] = tmp;      } @@ -258,7 +259,7 @@ int PYPTopics::sample(const DocumentId& doc, const Term& term) {      sums.push_back(sum);    }    // Second pass: sample a topic -  F cutoff = mt_genrand_res53() * sum; +  F cutoff = rnd() * sum;    for (int k=0; k<m_num_topics; ++k) {      if (cutoff <= sums[k])        return k; diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh index 996ef4dd..9da49267 100644 --- a/gi/pyp-topics/src/pyp-topics.hh +++ b/gi/pyp-topics/src/pyp-topics.hh @@ -4,6 +4,11 @@  #include <vector>  #include <iostream>  #include <boost/ptr_container/ptr_vector.hpp> + +#include <boost/random/uniform_real.hpp> +#include <boost/random/variate_generator.hpp> +#include <boost/random/mersenne_twister.hpp> +  #include "pyp.hh"  #include "corpus.hh" @@ -15,9 +20,12 @@ public:    typedef double F;  public: -  PYPTopics(int num_topics, bool use_topic_pyp=false)  +  PYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0)       : m_num_topics(num_topics), m_word_pyps(1),  -    m_topic_pyp(0.5,1.0), m_use_topic_pyp(use_topic_pyp) {} +    m_topic_pyp(0.5,1.0,seed), m_use_topic_pyp(use_topic_pyp), +    m_seed(seed), +    uni_dist(0,1), rng(seed == 0 ? (unsigned long)this : seed),  +    rnd(rng, uni_dist) {}    void sample_corpus(const Corpus& corpus, int samples,                       int freq_cutoff_start=0, int freq_cutoff_end=0,  @@ -60,6 +68,17 @@ private:    PYP<int> m_topic_pyp;    bool m_use_topic_pyp; +  unsigned long m_seed; + +  typedef boost::mt19937 base_generator_type; +  typedef boost::uniform_real<> uni_dist_type; +  typedef boost::variate_generator<base_generator_type&, uni_dist_type> gen_type; + +  uni_dist_type uni_dist; +  base_generator_type rng; //this gets the seed +  gen_type rnd; //instantiate: rnd(rng, uni_dist) +                //call: rnd() generates uniform on [0,1) +    TermBackoffPtr m_backoff;  }; diff --git a/gi/pyp-topics/src/pyp.hh b/gi/pyp-topics/src/pyp.hh index 80c79fe1..64fb5b58 100644 --- a/gi/pyp-topics/src/pyp.hh +++ b/gi/pyp-topics/src/pyp.hh @@ -5,10 +5,13 @@  #include <map>  #include <tr1/unordered_map> +#include <boost/random/uniform_real.hpp> +#include <boost/random/variate_generator.hpp> +#include <boost/random/mersenne_twister.hpp> +  #include "log_add.h"  #include "gammadist.h"  #include "slice-sampler.h" -#include "mt19937ar.h"  //  // Pitman-Yor process with customer and table tracking @@ -23,7 +26,7 @@ public:    using std::tr1::unordered_map<Dish,int>::begin;    using std::tr1::unordered_map<Dish,int>::end; -  PYP(double a, double b, Hash hash=Hash()); +  PYP(double a, double b, unsigned long seed = 0, Hash hash=Hash());    int increment(Dish d, double p0);    int decrement(Dish d); @@ -80,6 +83,16 @@ private:    DishTableType _dish_tables;    int _total_customers, _total_tables; +  typedef boost::mt19937 base_generator_type; +  typedef boost::uniform_real<> uni_dist_type; +  typedef boost::variate_generator<base_generator_type&, uni_dist_type> gen_type; + +  uni_dist_type uni_dist; +  base_generator_type rng; //this gets the seed +  gen_type rnd; //instantiate: rnd(rng, uni_dist) +                //call: rnd() generates uniform on [0,1) +  +    // Function objects for calculating the parts of the log_prob for     // the parameters a and b    struct resample_a_type { @@ -122,11 +135,12 @@ private:  };  template <typename Dish, typename Hash> -PYP<Dish,Hash>::PYP(double a, double b, Hash) +PYP<Dish,Hash>::PYP(double a, double b, unsigned long seed, Hash)  : std::tr1::unordered_map<Dish, int, Hash>(), _a(a), _b(b),     _a_beta_a(1), _a_beta_b(1), _b_gamma_s(1), _b_gamma_c(1),    //_a_beta_a(1), _a_beta_b(1), _b_gamma_s(10), _b_gamma_c(0.1), -  _total_customers(0), _total_tables(0) +  _total_customers(0), _total_tables(0), +  uni_dist(0,1), rng(seed == 0 ? (unsigned long)this : seed), rnd(rng, uni_dist)  {  //  std::cerr << "\t##PYP<Dish,Hash>::PYP(a=" << _a << ",b=" << _b << ")" << std::endl;  } @@ -211,7 +225,7 @@ PYP<Dish,Hash>::increment(Dish dish, double p0) {    assert (pshare >= 0.0);    //assert (pnew > 0.0); -  if (mt_genrand_res53() < pnew / (pshare + pnew)) { +  if (rnd() < pnew / (pshare + pnew)) {      // assign to a new table      tc.tables += 1;      tc.table_histogram[1] += 1; @@ -221,7 +235,7 @@ PYP<Dish,Hash>::increment(Dish dish, double p0) {    else {      // randomly assign to an existing table      // remove constant denominator from inner loop -    double r = mt_genrand_res53() * (c - _a*t); +    double r = rnd() * (c - _a*t);      for (std::map<int,int>::iterator           hit = tc.table_histogram.begin();           hit != tc.table_histogram.end(); ++hit) { @@ -283,7 +297,7 @@ PYP<Dish,Hash>::decrement(Dish dish)    //std::cerr << "count: " << count(dish) << " ";    //std::cerr << "tables: " << tc.tables << "\n"; -  double r = mt_genrand_res53() * count(dish); +  double r = rnd() * count(dish);    for (std::map<int,int>::iterator hit = tc.table_histogram.begin();         hit != tc.table_histogram.end(); ++hit)    { @@ -467,7 +481,7 @@ PYP<Dish,Hash>::resample_prior_b() {    int niterations = 10;   // number of resampling iterations    //std::cerr << "\n## resample_prior_b(), initial a = " << _a << ", b = " << _b << std::endl;    resample_b_type b_log_prob(_total_customers, _total_tables, _a, _b_gamma_c, _b_gamma_s); -  _b = slice_sampler1d(b_log_prob, _b, mt_genrand_res53, (double) 0.0, std::numeric_limits<double>::infinity(),  +  _b = slice_sampler1d(b_log_prob, _b, rnd, (double) 0.0, std::numeric_limits<double>::infinity(),                          (double) 0.0, niterations, 100*niterations);    //std::cerr << "\n## resample_prior_b(), final a = " << _a << ", b = " << _b << std::endl;  } @@ -481,7 +495,7 @@ PYP<Dish,Hash>::resample_prior_a() {    int niterations = 10;    //std::cerr << "\n## Initial a = " << _a << ", b = " << _b << std::endl;    resample_a_type a_log_prob(_total_customers, _total_tables, _b, _a_beta_a, _a_beta_b, _dish_tables); -  _a = slice_sampler1d(a_log_prob, _a, mt_genrand_res53, std::numeric_limits<double>::min(),  +  _a = slice_sampler1d(a_log_prob, _a, rnd, std::numeric_limits<double>::min(),                          (double) 1.0, (double) 0.0, niterations, 100*niterations);  } diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc index 481f8926..8a0c8949 100644 --- a/gi/pyp-topics/src/train-contexts.cc +++ b/gi/pyp-topics/src/train-contexts.cc @@ -14,7 +14,6 @@  #include "corpus.hh"  #include "contexts_corpus.hh"  #include "gzstream.hh" -#include "mt19937ar.h"  static const char *REVISION = "$Rev$"; @@ -78,10 +77,9 @@ int main(int argc, char **argv)      return 1;    } -  // seed the random number generator -  //mt_init_genrand(time(0)); - -  PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics")); +  // seed the random number generator: 0 = automatic, specify value otherwise +  unsigned long seed = 0;  +  PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed);    // read the data    BackoffGenerator* backoff_gen=0; diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc index c94010f2..3462f26c 100644 --- a/gi/pyp-topics/src/train.cc +++ b/gi/pyp-topics/src/train.cc @@ -12,7 +12,6 @@  #include "corpus.hh"  #include "contexts_corpus.hh"  #include "gzstream.hh" -#include "mt19937ar.h"  static const char *REVISION = "$Rev$"; @@ -69,10 +68,9 @@ int main(int argc, char **argv)      return 1;     } -  // seed the random number generator -  //mt_init_genrand(time(0)); - -  PYPTopics model(vm["topics"].as<int>()); +  // seed the random number generator: 0 = automatic, specify value otherwise +  unsigned long seed = 0;  +  PYPTopics model(vm["topics"].as<int>(), false, seed);    // read the data    Corpus corpus; | 
