From f5dbba7e5e17ccb0b1f00e44e8262d8eed43647a Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 25 Aug 2010 23:43:30 +0000 Subject: waste of time. priors on PYP hyperparameters with MJ's slice sampler. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@622 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/clda/src/ccrp.h | 119 +++++++++++++++++++++++++-- gi/clda/src/clda.cc | 12 ++- gi/clda/src/crp_test.cc | 6 ++ gi/clda/src/slice_sampler.h | 191 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 319 insertions(+), 9 deletions(-) create mode 100644 gi/clda/src/slice_sampler.h (limited to 'gi/clda/src') diff --git a/gi/clda/src/ccrp.h b/gi/clda/src/ccrp.h index eeccce1a..74d5be29 100644 --- a/gi/clda/src/ccrp.h +++ b/gi/clda/src/ccrp.h @@ -1,6 +1,7 @@ #ifndef _CCRP_H_ #define _CCRP_H_ +#include #include #include #include @@ -9,6 +10,7 @@ #include #include #include "sampler.h" +#include "slice_sampler.h" // Chinese restaurant process (Pitman-Yor parameters) with explicit table // tracking. @@ -16,7 +18,36 @@ template > class CCRP { public: - CCRP(double disc, double conc) : num_tables_(), num_customers_(), discount_(disc), concentration_(conc) {} + CCRP(double disc, double conc) : + num_tables_(), + num_customers_(), + discount_(disc), + concentration_(conc), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), + concentration_prior_shape_(std::numeric_limits::quiet_NaN()), + concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) : + num_tables_(), + num_customers_(), + discount_(d), + concentration_(c), + discount_prior_alpha_(d_alpha), + discount_prior_beta_(d_beta), + concentration_prior_shape_(c_shape), + concentration_prior_rate_(c_rate) {} + + double discount() const { return discount_; } + double concentration() const { return concentration_; } + + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); + } + + bool has_concentration_prior() const { + return !std::isnan(concentration_prior_shape_); + } void clear() { num_tables_ = 0; @@ -115,24 +146,90 @@ class CCRP { } } + double log_crp_prob() const { + return log_crp_prob(discount_, concentration_); + } + + static double log_beta_density(const double& x, const double& alpha, const double& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); + return lp; + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob() const { + double log_crp_prob(const double& discount, const double& concentration) const { double lp = 0.0; + if (has_discount_prior()) + lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_concentration_prior()) + lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + assert(lp <= 0.0); if (num_customers_) { - const double r = lgamma(1.0 - discount_); - lp = lgamma(concentration_) - lgamma(concentration_ + num_customers_) - + num_tables_ * discount_ + lgamma(concentration_ / discount_ + num_tables_) - - lgamma(concentration_ / discount_); + const double r = lgamma(1.0 - discount); + lp += lgamma(concentration) - lgamma(concentration + num_customers_) + + num_tables_ * discount + lgamma(concentration / discount + num_tables_) + - lgamma(concentration / discount); + assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { const DishLocations& cur = it->second; - lp += lgamma(cur.total_dish_count_ - discount_) - r; + for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { + lp += lgamma(*ti - discount) - r; + } } } + assert(std::isfinite(lp)); return lp; } + void resample_hyperparameters(MT19937* rng) { + assert(has_discount_prior() || has_concentration_prior()); + DiscountResampler dr(*this); + ConcentrationResampler cr(*this); + const int niterations = 10; + double gamma_upper = std::numeric_limits::infinity(); + for (int iter = 0; iter < 5; ++iter) { + if (has_concentration_prior()) { + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + gamma_upper, 0.0, niterations, 100*niterations); + } + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + } + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + gamma_upper, 0.0, niterations, 100*niterations); + } + + struct DiscountResampler { + DiscountResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_discount) const { + return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + } + }; + + struct ConcentrationResampler { + ConcentrationResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_concentration) const { + return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + } + }; + struct DishLocations { DishLocations() : total_dish_count_() {} unsigned total_dish_count_; // customers at all tables with this dish @@ -166,6 +263,14 @@ class CCRP { double discount_; double concentration_; + + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; + + // optional gamma prior on concentration_ (NaN if no prior) + double concentration_prior_shape_; + double concentration_prior_rate_; }; template diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc index 10056bc9..f548997f 100644 --- a/gi/clda/src/clda.cc +++ b/gi/clda/src/clda.cc @@ -61,8 +61,8 @@ int main(int argc, char** argv) { double alpha = 50.0; const double uniform_topic = 1.0 / num_classes; const double uniform_word = 1.0 / TD::NumWords(); - vector > dr(zji.size(), CCRP(disc, beta)); // dr[i] describes the probability of using a topic in document i - vector > wr(num_classes, CCRP(disc, alpha)); // wr[k] describes the probability of generating a word in topic k + vector > dr(zji.size(), CCRP(1,1,1,1,disc, beta)); // dr[i] describes the probability of using a topic in document i + vector > wr(num_classes, CCRP(1,1,1,1,disc, alpha)); // wr[k] describes the probability of generating a word in topic k for (int j = 0; j < zji.size(); ++j) { const size_t num_words = wji[j].size(); vector& zj = zji[j]; @@ -89,6 +89,13 @@ int main(int argc, char** argv) { total_time += timer.Elapsed(); timer.Reset(); double llh = 0; +#if 1 + for (int j = 0; j < dr.size(); ++j) + dr[j].resample_hyperparameters(&rng); + for (int j = 0; j < wr.size(); ++j) + wr[j].resample_hyperparameters(&rng); +#endif + for (int j = 0; j < dr.size(); ++j) llh += dr[j].log_crp_prob(); for (int j = 0; j < wr.size(); ++j) @@ -120,6 +127,7 @@ int main(int argc, char** argv) { } for (int i = 0; i < num_classes; ++i) { cerr << "---------------------------------\n"; + cerr << " final PYP(" << wr[i].discount() << "," << wr[i].concentration() << ")\n"; ShowTopWordsForTopic(t2w[i]); } cerr << "-------------\n"; diff --git a/gi/clda/src/crp_test.cc b/gi/clda/src/crp_test.cc index ed384f81..561cd4dd 100644 --- a/gi/clda/src/crp_test.cc +++ b/gi/clda/src/crp_test.cc @@ -90,6 +90,12 @@ TEST_F(CRPTest, Exchangability) { cerr << i << ' ' << (hist[i]) << endl; } +TEST_F(CRPTest, LP) { + CCRP crp(1,1,1,1,0.1,50.0); + crp.increment("foo", 1.0, &rng); + cerr << crp.log_crp_prob() << endl; +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/gi/clda/src/slice_sampler.h b/gi/clda/src/slice_sampler.h new file mode 100644 index 00000000..aa48a169 --- /dev/null +++ b/gi/clda/src/slice_sampler.h @@ -0,0 +1,191 @@ +//! slice-sampler.h is an MCMC slice sampler +//! +//! Mark Johnson, 1st August 2008 + +#ifndef SLICE_SAMPLER_H +#define SLICE_SAMPLER_H + +#include +#include +#include +#include +#include + +//! slice_sampler_rfc_type{} returns the value of a user-specified +//! function if the argument is within range, or - infinity otherwise +// +template +struct slice_sampler_rfc_type { + F min_x, max_x; + const Fn& f; + U max_nfeval, nfeval; + slice_sampler_rfc_type(F min_x, F max_x, const Fn& f, U max_nfeval) + : min_x(min_x), max_x(max_x), f(f), max_nfeval(max_nfeval), nfeval(0) { } + + F operator() (F x) { + if (min_x < x && x < max_x) { + assert(++nfeval <= max_nfeval); + F fx = f(x); + assert(std::isfinite(fx)); + return fx; + } + return -std::numeric_limits::infinity(); + } +}; // slice_sampler_rfc_type{} + +//! slice_sampler1d() implements the univariate "range doubling" slice sampler +//! described in Neal (2003) "Slice Sampling", The Annals of Statistics 31(3), 705-767. +// +template +F slice_sampler1d(const LogF& logF0, //!< log of function to sample + F x, //!< starting point + Uniform01& u01, //!< uniform [0,1) random number generator + F min_x = -std::numeric_limits::infinity(), //!< minimum value of support + F max_x = std::numeric_limits::infinity(), //!< maximum value of support + F w = 0.0, //!< guess at initial width + unsigned nsamples=1, //!< number of samples to draw + unsigned max_nfeval=200) //!< max number of function evaluations +{ + typedef unsigned U; + slice_sampler_rfc_type logF(min_x, max_x, logF0, max_nfeval); + + assert(std::isfinite(x)); + + if (w <= 0.0) { // set w to a default width + if (min_x > -std::numeric_limits::infinity() && max_x < std::numeric_limits::infinity()) + w = (max_x - min_x)/4; + else + w = std::max(((x < 0.0) ? -x : x)/4, (F) 0.1); + } + assert(std::isfinite(w)); + + F logFx = logF(x); + for (U sample = 0; sample < nsamples; ++sample) { + F logY = logFx + log(u01()+1e-100); //! slice logFx at this value + assert(std::isfinite(logY)); + + F xl = x - w*u01(); //! lower bound on slice interval + F logFxl = logF(xl); + F xr = xl + w; //! upper bound on slice interval + F logFxr = logF(xr); + + while (logY < logFxl || logY < logFxr) // doubling procedure + if (u01() < 0.5) + logFxl = logF(xl -= xr - xl); + else + logFxr = logF(xr += xr - xl); + + F xl1 = xl; + F xr1 = xr; + while (true) { // shrinking procedure + F x1 = xl1 + u01()*(xr1 - xl1); + if (logY < logF(x1)) { + F xl2 = xl; // acceptance procedure + F xr2 = xr; + bool d = false; + while (xr2 - xl2 > 1.1*w) { + F xm = (xl2 + xr2)/2; + if ((x < xm && x1 >= xm) || (x >= xm && x1 < xm)) + d = true; + if (x1 < xm) + xr2 = xm; + else + xl2 = xm; + if (d && logY >= logF(xl2) && logY >= logF(xr2)) + goto unacceptable; + } + x = x1; + goto acceptable; + } + goto acceptable; + unacceptable: + if (x1 < x) // rest of shrinking procedure + xl1 = x1; + else + xr1 = x1; + } + acceptable: + w = (4*w + (xr1 - xl1))/5; // update width estimate + } + return x; +} + +/* +//! slice_sampler1d() implements a 1-d MCMC slice sampler. +//! It should be correct for unimodal distributions, but +//! not for multimodal ones. +// +template +F slice_sampler1d(const LogP& logP, //!< log of distribution to sample + F x, //!< initial sample + Uniform01& u01, //!< uniform random number generator + F min_x = -std::numeric_limits::infinity(), //!< minimum value of support + F max_x = std::numeric_limits::infinity(), //!< maximum value of support + F w = 0.0, //!< guess at initial width + unsigned nsamples=1, //!< number of samples to draw + unsigned max_nfeval=200) //!< max number of function evaluations +{ + typedef unsigned U; + assert(std::isfinite(x)); + if (w <= 0.0) { + if (min_x > -std::numeric_limits::infinity() && max_x < std::numeric_limits::infinity()) + w = (max_x - min_x)/4; + else + w = std::max(((x < 0.0) ? -x : x)/4, 0.1); + } + // TRACE4(x, min_x, max_x, w); + F logPx = logP(x); + assert(std::isfinite(logPx)); + U nfeval = 1; + for (U sample = 0; sample < nsamples; ++sample) { + F x0 = x; + F logU = logPx + log(u01()+1e-100); + assert(std::isfinite(logU)); + F r = u01(); + F xl = std::max(min_x, x - r*w); + F xr = std::min(max_x, x + (1-r)*w); + // TRACE3(x, logPx, logU); + while (xl > min_x && logP(xl) > logU) { + xl -= w; + w *= 2; + ++nfeval; + if (nfeval >= max_nfeval) + std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << std::endl; + assert(nfeval < max_nfeval); + } + xl = std::max(xl, min_x); + while (xr < max_x && logP(xr) > logU) { + xr += w; + w *= 2; + ++nfeval; + if (nfeval >= max_nfeval) + std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xr = " << xr << std::endl; + assert(nfeval < max_nfeval); + } + xr = std::min(xr, max_x); + while (true) { + r = u01(); + x = r*xl + (1-r)*xr; + assert(std::isfinite(x)); + logPx = logP(x); + // TRACE4(logPx, x, xl, xr); + assert(std::isfinite(logPx)); + ++nfeval; + if (nfeval >= max_nfeval) + std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << ", xr = " << xr << ", x = " << x << std::endl; + assert(nfeval < max_nfeval); + if (logPx > logU) + break; + else if (x > x0) + xr = x; + else + xl = x; + } + // w = (4*w + (xr-xl))/5; // gradually adjust w + } + // TRACE2(logPx, x); + return x; +} // slice_sampler1d() +*/ + +#endif // SLICE_SAMPLER_H -- cgit v1.2.3