diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/Jamfile | 32 | ||||
-rw-r--r-- | utils/Makefile.am | 8 | ||||
-rw-r--r-- | utils/b64tools.cc | 5 | ||||
-rw-r--r-- | utils/ccrp.h | 317 | ||||
-rw-r--r-- | utils/ccrp_nt.h | 164 | ||||
-rw-r--r-- | utils/ccrp_onetable.h | 253 | ||||
-rw-r--r-- | utils/crp_test.cc | 91 | ||||
-rw-r--r-- | utils/fast_sparse_vector.h | 17 | ||||
-rw-r--r-- | utils/hash.h | 6 | ||||
-rw-r--r-- | utils/mfcr.h | 370 | ||||
-rw-r--r-- | utils/mfcr_test.cc | 72 | ||||
-rw-r--r-- | utils/sampler.h | 2 | ||||
-rw-r--r-- | utils/slice_sampler.h | 191 | ||||
-rw-r--r-- | utils/small_vector.h | 2 | ||||
-rw-r--r-- | utils/stringlib.h | 2 | ||||
-rw-r--r-- | utils/tdict.cc | 19 | ||||
-rw-r--r-- | utils/tdict.h | 21 | ||||
-rw-r--r-- | utils/unigram_pyp_lm.cc | 168 | ||||
-rw-r--r-- | utils/weights.cc | 8 |
19 files changed, 43 insertions, 1705 deletions
diff --git a/utils/Jamfile b/utils/Jamfile deleted file mode 100644 index 4444b25f..00000000 --- a/utils/Jamfile +++ /dev/null @@ -1,32 +0,0 @@ -import testing ; -import option ; - -additional = ; -if [ option.get with-cmph ] { - additional += perfect_hash.cc ; -} - -lib utils : - alignment_io.cc - b64tools.cc - corpus_tools.cc - dict.cc - tdict.cc - fdict.cc - gzstream.cc - filelib.cc - stringlib.cc - sparse_vector.cc - timing_stats.cc - verbose.cc - weights.cc - $(additional) - ..//z - : <include>.. <include>. : : <include>.. <include>. ; - -exe atools : atools.cc utils ..//boost_program_options ; -exe reconstruct_weights : reconstruct_weights.cc utils ..//boost_program_options ; - -alias programs : reconstruct_weights atools ; - -all_tests [ glob *_test.cc phmt.cc ts.cc ] : utils : <testing.arg>$(TOP)/utils/test_data ; diff --git a/utils/Makefile.am b/utils/Makefile.am index 386344dd..3ad9d69e 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -3,15 +3,13 @@ bin_PROGRAMS = reconstruct_weights atools noinst_PROGRAMS = \ ts \ phmt \ - mfcr_test \ - crp_test \ dict_test \ m_test \ weights_test \ logval_test \ small_vector_test -TESTS = ts mfcr_test crp_test small_vector_test logval_test weights_test dict_test m_test +TESTS = ts small_vector_test logval_test weights_test dict_test m_test noinst_LIBRARIES = libutils.a @@ -47,12 +45,8 @@ m_test_SOURCES = m_test.cc m_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz dict_test_SOURCES = dict_test.cc dict_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz -mfcr_test_SOURCES = mfcr_test.cc -mfcr_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz weights_test_SOURCES = weights_test.cc weights_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz -crp_test_SOURCES = crp_test.cc -crp_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz logval_test_SOURCES = logval_test.cc logval_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz small_vector_test_SOURCES = small_vector_test.cc diff --git a/utils/b64tools.cc b/utils/b64tools.cc index 5512f975..78c1cb4e 100644 --- a/utils/b64tools.cc +++ b/utils/b64tools.cc @@ -10,9 +10,10 @@ static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$ static void encodeblock(const unsigned char* in, ostream* os, int len) { char out[4]; + // cerr << len << endl; out[0] = cb64[ in[0] >> 2 ]; - out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; - out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); + out[1] = cb64[ ((in[0] & 0x03) << 4) | (len > 1 ? ((in[1] & 0xf0) >> 4) : static_cast<unsigned char>(0))]; + out[2] = (len > 2 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); os->write(out, 4); } diff --git a/utils/ccrp.h b/utils/ccrp.h deleted file mode 100644 index 1d41a3ef..00000000 --- a/utils/ccrp.h +++ /dev/null @@ -1,317 +0,0 @@ -#ifndef _CCRP_H_ -#define _CCRP_H_ - -#include <numeric> -#include <cassert> -#include <cmath> -#include <list> -#include <iostream> -#include <vector> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include "sampler.h" -#include "slice_sampler.h" -#include "m.h" - -// Chinese restaurant process (Pitman-Yor parameters) with table tracking. - -template <typename Dish, typename DishHash = boost::hash<Dish> > -class CCRP { - public: - CCRP(double disc, double strength) : - num_tables_(), - num_customers_(), - discount_(disc), - strength_(strength), - discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()), - discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), - strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { - check_hyperparameters(); - } - - CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : - num_tables_(), - num_customers_(), - discount_(d), - strength_(c), - discount_prior_strength_(d_strength), - discount_prior_beta_(d_beta), - strength_prior_shape_(c_shape), - strength_prior_rate_(c_rate) { - check_hyperparameters(); - } - - void check_hyperparameters() { - if (discount_ < 0.0 || discount_ >= 1.0) { - std::cerr << "Bad discount: " << discount_ << std::endl; - abort(); - } - if (strength_ <= -discount_) { - std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; - abort(); - } - } - - double discount() const { return discount_; } - double strength() const { return strength_; } - void set_hyperparameters(double d, double s) { - discount_ = d; strength_ = s; - check_hyperparameters(); - } - void set_discount(double d) { discount_ = d; check_hyperparameters(); } - void set_strength(double a) { strength_ = a; check_hyperparameters(); } - - bool has_discount_prior() const { - return !std::isnan(discount_prior_strength_); - } - - bool has_strength_prior() const { - return !std::isnan(strength_prior_shape_); - } - - void clear() { - num_tables_ = 0; - num_customers_ = 0; - dish_locs_.clear(); - } - - unsigned num_tables() const { - return num_tables_; - } - - unsigned num_tables(const Dish& dish) const { - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->second.table_counts_.size(); - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->total_dish_count_; - } - - // returns +1 or 0 indicating whether a new table was opened - // p = probability with which the particular table was selected - // excluding p0 - template <typename T> - int increment(const Dish& dish, const T& p0, MT19937* rng, T* p = NULL) { - DishLocations& loc = dish_locs_[dish]; - bool share_table = false; - if (loc.total_dish_count_) { - const T p_empty = T(strength_ + num_tables_ * discount_) * p0; - const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= (*ti - discount_); - if (r <= 0.0) { - if (p) { *p = T(*ti - discount_) / T(strength_ + num_customers_); } - ++(*ti); - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { - loc.table_counts_.push_back(1u); - if (p) { *p = T(strength_ + discount_ * num_tables_) / T(strength_ + num_customers_); } - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - - // returns -1 or 0, indicating whether a table was closed - int decrement(const Dish& dish, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - assert(loc.total_dish_count_); - if (loc.total_dish_count_ == 1) { - dish_locs_.erase(dish); - --num_tables_; - --num_customers_; - return -1; - } else { - int delta = 0; - // sample customer to remove UNIFORMLY. that is, do NOT use the discount - // here. if you do, it will introduce (unwanted) bias! - double r = rng->next() * loc.total_dish_count_; - --loc.total_dish_count_; - for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= *ti; - if (r <= 0.0) { - if ((--(*ti)) == 0) { - --num_tables_; - delta = -1; - loc.table_counts_.erase(ti); - } - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - --num_customers_; - return delta; - } - } - - template <typename T> - T prob(const Dish& dish, const T& p0) const { - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - const T r = T(num_tables_ * discount_ + strength_); - if (it == dish_locs_.end()) { - return r * p0 / T(num_customers_ + strength_); - } else { - return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / - T(num_customers_ + strength_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, strength_); - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& discount, const double& strength) const { - double lp = 0.0; - if (has_discount_prior()) - lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); - if (has_strength_prior()) - lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - if (strength) - lp += lgamma(strength) - lgamma(strength / discount); - lp += - lgamma(strength + num_customers_) - + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - for (std::list<unsigned>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(*ti - discount) - r; - } - } - } else if (!discount) { // discount == 0.0 - lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - lp += lgamma(cur.table_counts_.size()); - } - } else { - assert(!"discount less than 0 detected!"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_strength_prior()); - if (num_customers() == 0) return; - DiscountResampler dr(*this); - StrengthResampler sr(*this); - for (unsigned iter = 0; iter < nloop; ++iter) { - if (has_strength_prior()) { - strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(), - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - double min_discount = std::numeric_limits<double>::min(); - if (strength_ < 0.0) min_discount -= strength_; - discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, - 1.0, 0.0, niterations, 100*niterations); - } - } - strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - DiscountResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.strength_); - } - }; - - struct StrengthResampler { - StrengthResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_strength) const { - return crp_.log_crp_prob(crp_.discount_, proposed_strength); - } - }; - - struct DishLocations { - DishLocations() : total_dish_count_() {} - unsigned total_dish_count_; // customers at all tables with this dish - std::list<unsigned> table_counts_; // list<> gives O(1) deletion and insertion, which we want - // .size() is the number of tables for this dish - }; - - void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl; - for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; - for (typename std::list<unsigned>::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - (*out) << " " << *i; - } - (*out) << std::endl; - } - } - - typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator; - const_iterator begin() const { - return dish_locs_.begin(); - } - const_iterator end() const { - return dish_locs_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_; - - double discount_; - double strength_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_strength_; - double discount_prior_beta_; - - // optional gamma prior on strength_ (NaN if no prior) - double strength_prior_shape_; - double strength_prior_rate_; -}; - -template <typename T,typename H> -std::ostream& operator<<(std::ostream& o, const CCRP<T,H>& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h deleted file mode 100644 index 724b11bd..00000000 --- a/utils/ccrp_nt.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef _CCRP_NT_H_ -#define _CCRP_NT_H_ - -#include <numeric> -#include <cassert> -#include <cmath> -#include <list> -#include <iostream> -#include <vector> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include "sampler.h" -#include "slice_sampler.h" -#include "m.h" - -// Chinese restaurant process (1 parameter) -template <typename Dish, typename DishHash = boost::hash<Dish> > -class CCRP_NoTable { - public: - explicit CCRP_NoTable(double conc) : - num_customers_(), - alpha_(conc), - alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} - - CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : - num_customers_(), - alpha_(c), - alpha_prior_shape_(c_shape), - alpha_prior_rate_(c_rate) {} - - double alpha() const { return alpha_; } - void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); } - - bool has_alpha_prior() const { - return !std::isnan(alpha_prior_shape_); - } - - void clear() { - num_customers_ = 0; - custs_.clear(); - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.find(dish); - if (it == custs_.end()) return 0; - return it->second; - } - - int increment(const Dish& dish) { - int table_diff = 0; - if (++custs_[dish] == 1) - table_diff = 1; - ++num_customers_; - return table_diff; - } - - int decrement(const Dish& dish) { - int table_diff = 0; - int nc = --custs_[dish]; - if (nc == 0) { - custs_.erase(dish); - table_diff = -1; - } else if (nc < 0) { - std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; - abort(); - } - --num_customers_; - return table_diff; - } - - template <typename F> - F prob(const Dish& dish, const F& p0) const { - const unsigned at_table = num_customers(dish); - return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_); - } - - double logprob(const Dish& dish, const double& logp0) const { - const unsigned at_table = num_customers(dish); - return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_); - } - - double log_crp_prob() const { - return log_crp_prob(alpha_); - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& alpha) const { - double lp = 0.0; - if (has_alpha_prior()) - lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - lp += lgamma(alpha) - lgamma(alpha + num_customers_) + - custs_.size() * log(alpha); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); - it != custs_.end(); ++it) { - lp += lgamma(it->second); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_alpha_prior()); - ConcentrationResampler cr(*this); - for (unsigned iter = 0; iter < nloop; ++iter) { - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - } - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} - const CCRP_NoTable& crp_; - double operator()(const double& proposed_alpha) const { - return crp_.log_crp_prob(proposed_alpha); - } - }; - - void Print(std::ostream* out) const { - (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; - int cc = 0; - for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); - it != custs_.end(); ++it) { - (*out) << " " << it->first << "(" << it->second << " eating)"; - ++cc; - if (cc > 10) { (*out) << " ..."; break; } - } - (*out) << std::endl; - } - - unsigned num_customers_; - std::tr1::unordered_map<Dish, unsigned, DishHash> custs_; - - typedef typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator const_iterator; - const_iterator begin() const { - return custs_.begin(); - } - const_iterator end() const { - return custs_.end(); - } - - double alpha_; - - // optional gamma prior on alpha_ (NaN if no prior) - double alpha_prior_shape_; - double alpha_prior_rate_; -}; - -template <typename T,typename H> -std::ostream& operator<<(std::ostream& o, const CCRP_NoTable<T,H>& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h deleted file mode 100644 index abe399ea..00000000 --- a/utils/ccrp_onetable.h +++ /dev/null @@ -1,253 +0,0 @@ -#ifndef _CCRP_ONETABLE_H_ -#define _CCRP_ONETABLE_H_ - -#include <numeric> -#include <cassert> -#include <cmath> -#include <list> -#include <iostream> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include "sampler.h" -#include "slice_sampler.h" - -// Chinese restaurant process (Pitman-Yor parameters) with one table approximation - -template <typename Dish, typename DishHash = boost::hash<Dish> > -class CCRP_OneTable { - typedef std::tr1::unordered_map<Dish, unsigned, DishHash> DishMapType; - public: - CCRP_OneTable(double disc, double conc) : - num_tables_(), - num_customers_(), - discount_(disc), - alpha_(conc), - discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()), - discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), - alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} - - CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : - num_tables_(), - num_customers_(), - discount_(d), - alpha_(c), - discount_prior_alpha_(d_alpha), - discount_prior_beta_(d_beta), - alpha_prior_shape_(c_shape), - alpha_prior_rate_(c_rate) {} - - double discount() const { return discount_; } - double alpha() const { return alpha_; } - void set_alpha(double c) { alpha_ = c; } - void set_discount(double d) { discount_ = d; } - - bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); - } - - bool has_alpha_prior() const { - return !std::isnan(alpha_prior_shape_); - } - - void clear() { - num_tables_ = 0; - num_customers_ = 0; - dish_counts_.clear(); - } - - unsigned num_tables() const { - return num_tables_; - } - - unsigned num_tables(const Dish& dish) const { - const typename DishMapType::const_iterator it = dish_counts_.find(dish); - if (it == dish_counts_.end()) return 0; - return 1; - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename DishMapType::const_iterator it = dish_counts_.find(dish); - if (it == dish_counts_.end()) return 0; - return it->second; - } - - // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish) { - unsigned& dc = dish_counts_[dish]; - ++dc; - ++num_customers_; - if (dc == 1) { - ++num_tables_; - return 1; - } else { - return 0; - } - } - - // returns -1 or 0, indicating whether a table was closed - int decrement(const Dish& dish) { - unsigned& dc = dish_counts_[dish]; - assert(dc > 0); - if (dc == 1) { - dish_counts_.erase(dish); - --num_tables_; - --num_customers_; - return -1; - } else { - assert(dc > 1); - --dc; - --num_customers_; - return 0; - } - } - - double prob(const Dish& dish, const double& p0) const { - const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const double r = num_tables_ * discount_ + alpha_; - if (it == dish_counts_.end()) { - return r * p0 / (num_customers_ + alpha_); - } else { - return (it->second - discount_ + r * p0) / - (num_customers_ + alpha_); - } - } - - template <typename T> - T probT(const Dish& dish, const T& p0) const { - const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const T r(num_tables_ * discount_ + alpha_); - if (it == dish_counts_.end()) { - return r * p0 / T(num_customers_ + alpha_); - } else { - return (T(it->second - discount_) + r * p0) / - T(num_customers_ + alpha_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, alpha_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& discount, const double& alpha) const { - double lp = 0.0; - if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_alpha_prior()) - lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - lp += lgamma(alpha) - lgamma(alpha + num_customers_) - + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) - - lgamma(alpha / discount); - assert(std::isfinite(lp)); - for (typename DishMapType::const_iterator it = dish_counts_.begin(); - it != dish_counts_.end(); ++it) { - const unsigned& cur = it->second; - lp += lgamma(cur - discount) - r; - } - } else { - assert(!"not implemented yet"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_alpha_prior()); - DiscountResampler dr(*this); - ConcentrationResampler cr(*this); - for (unsigned iter = 0; iter < nloop; ++iter) { - if (has_alpha_prior()) { - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(), - 1.0, 0.0, niterations, 100*niterations); - } - } - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {} - const CCRP_OneTable& crp_; - double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.alpha_); - } - }; - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {} - const CCRP_OneTable& crp_; - double operator()(const double& proposed_alpha) const { - return crp_.log_crp_prob(crp_.discount_, proposed_alpha); - } - }; - - void Print(std::ostream* out) const { - (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; - for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { - (*out) << " " << it->first << " = " << it->second << std::endl; - } - } - - typedef typename DishMapType::const_iterator const_iterator; - const_iterator begin() const { - return dish_counts_.begin(); - } - const_iterator end() const { - return dish_counts_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - DishMapType dish_counts_; - - double discount_; - double alpha_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; - double discount_prior_beta_; - - // optional gamma prior on alpha_ (NaN if no prior) - double alpha_prior_shape_; - double alpha_prior_rate_; -}; - -template <typename T,typename H> -std::ostream& operator<<(std::ostream& o, const CCRP_OneTable<T,H>& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/crp_test.cc b/utils/crp_test.cc deleted file mode 100644 index 0cdb7afd..00000000 --- a/utils/crp_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include <iostream> -#include <vector> -#include <string> - -#define BOOST_TEST_MODULE CrpTest -#include <boost/test/unit_test.hpp> -#include <boost/test/floating_point_comparison.hpp> - -#include "ccrp.h" -#include "sampler.h" - -using namespace std; - -MT19937 rng; - -BOOST_AUTO_TEST_CASE(Dist) { - CCRP<string> crp(0.1, 5); - double un = 0.25; - int tt = 0; - tt += crp.increment("hi", un, &rng); - tt += crp.increment("foo", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - cout << "tt=" << tt << endl; - cout << crp << endl; - cout << " P(bar)=" << crp.prob("bar", un) << endl; - cout << " P(hi)=" << crp.prob("hi", un) << endl; - cout << " P(baz)=" << crp.prob("baz", un) << endl; - cout << " P(foo)=" << crp.prob("foo", un) << endl; - double x = crp.prob("bar", un) + crp.prob("hi", un) + crp.prob("baz", un) + crp.prob("foo", un); - cout << " tot=" << x << endl; - BOOST_CHECK_CLOSE(1.0, x, 1e-6); - tt += crp.decrement("hi", &rng); - tt += crp.decrement("bar", &rng); - cout << crp << endl; - tt += crp.decrement("bar", &rng); - cout << crp << endl; - cout << "tt=" << tt << endl; -} - -BOOST_AUTO_TEST_CASE(Exchangability) { - double tot = 0; - double xt = 0; - CCRP<int> crp(0.5, 1.0); - int cust = 10; - vector<int> hist(cust + 1, 0); - for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); } - const int samples = 100000; - const bool simulate = true; - for (int k = 0; k < samples; ++k) { - if (!simulate) { - crp.clear(); - for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); } - } else { - int da = rng.next() * cust; - bool a = rng.next() < 0.5; - if (a) { - for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); } - for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); } - xt += 1.0; - } else { - for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); } - for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); } - } - } - int c = crp.num_tables(1); - ++hist[c]; - tot += c; - } - BOOST_CHECK_EQUAL(cust, crp.num_customers()); - cerr << "P(a) = " << (xt / samples) << endl; - cerr << "E[num tables] = " << (tot / samples) << endl; - double error = fabs((tot / samples) - 5.4); - cerr << " error = " << error << endl; - BOOST_CHECK_MESSAGE(error < 0.1, "error is too big = " << error); // it's possible for this to fail, but - // very, very unlikely - for (int i = 1; i <= cust; ++i) - cerr << i << ' ' << (hist[i]) << endl; -} - -BOOST_AUTO_TEST_CASE(LP) { - CCRP<string> crp(1,1,1,1,0.1,50.0); - crp.increment("foo", 1.0, &rng); - cerr << crp.log_crp_prob() << endl; -} - diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 433a5cc5..590a60c4 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -13,6 +13,7 @@ #include <map> #include <cassert> #include <vector> +#include <limits> #ifdef HAVE_CONFIG_H #include "config.h" @@ -192,28 +193,33 @@ class FastSparseVector { } else { is_remote_ = true; data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(first, last); + HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits<unsigned>::max()); } } - void erase(int k) { + void erase(unsigned k) { if (is_remote_) { data_.rbmap->erase(k); } else { - for (int i = 0; i < local_size_; ++i) { + for (unsigned i = 0; i < local_size_; ++i) { if (data_.local[i].first() == k) { - for (int j = i+1; j < local_size_; ++j) { + for (unsigned j = i+1; j < local_size_; ++j) { data_.local[j-1].first() = data_.local[j].first(); data_.local[j-1].second() = data_.local[j].second(); } } } + --local_size_; } } const FastSparseVector<T>& operator=(const FastSparseVector<T>& other) { if (&other == this) return *this; clear(); std::memcpy(this, &other, sizeof(FastSparseVector)); - if (is_remote_) + if (is_remote_) { data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(*data_.rbmap); + // TODO: do i need to set_deleted on a copy? + HASH_MAP_DELETED(*data_.rbmap, std::numeric_limits<unsigned>::max()); + } return *this; } T const& get_singleton() const { @@ -444,6 +450,7 @@ class FastSparseVector { SPARSE_HASH_MAP<unsigned, T>* m = new SPARSE_HASH_MAP<unsigned, T>( reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[0]), reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[local_size_]), local_size_ * 1.5 + 1); + HASH_MAP_DELETED(*m, std::numeric_limits<unsigned>::max()); data_.rbmap = m; is_remote_ = true; } @@ -515,7 +522,7 @@ const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSpar } template <class T> -std::size_t hash_value(FastSparseVector<T> const& x) { +std::size_t hash_value(FastSparseVector<T> const&) { assert(!"not implemented"); return 0; } diff --git a/utils/hash.h b/utils/hash.h index 6d992086..189ed1ae 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -16,14 +16,16 @@ # define SPARSE_HASH_MAP google::sparse_hash_map # define HASH_MAP google::dense_hash_map # define HASH_SET google::dense_hash_set -# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0) -# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0) +# define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0) +# define HASH_MAP_RESERVED(h,empty,deleted) do { (h).set_empty_key(empty); (h).set_deleted_key(deleted); } while(0) +# define HASH_MAP_EMPTY(h,empty) do { (h).set_empty_key(empty); } while(0) #else # include <tr1/unordered_map> # include <tr1/unordered_set> # define SPARSE_HASH_MAP std::tr1::unordered_map # define HASH_MAP std::tr1::unordered_map # define HASH_SET std::tr1::unordered_set +# define HASH_MAP_DELETED(h,deleted) # define HASH_MAP_RESERVED(h,empty,deleted) # define HASH_MAP_EMPTY(h,empty) #endif diff --git a/utils/mfcr.h b/utils/mfcr.h deleted file mode 100644 index 4aacb567..00000000 --- a/utils/mfcr.h +++ /dev/null @@ -1,370 +0,0 @@ -#ifndef _MFCR_H_ -#define _MFCR_H_ - -#include <algorithm> -#include <numeric> -#include <cassert> -#include <cmath> -#include <list> -#include <iostream> -#include <vector> -#include <iterator> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include "sampler.h" -#include "slice_sampler.h" -#include "m.h" - -struct TableCount { - TableCount() : count(), floor() {} - TableCount(int c, int f) : count(c), floor(f) { - assert(f >= 0); - } - int count; // count or delta (may be 0, <0, or >0) - unsigned char floor; // from which floor? -}; - -std::ostream& operator<<(std::ostream& o, const TableCount& tc) { - return o << "[c=" << tc.count << " floor=" << static_cast<unsigned int>(tc.floor) << ']'; -} - -// Multi-Floor Chinese Restaurant as proposed by Wood & Teh (AISTATS, 2009) to simulate -// graphical Pitman-Yor processes. -// http://jmlr.csail.mit.edu/proceedings/papers/v5/wood09a/wood09a.pdf -// -// Implementation is based on Blunsom, Cohn, Goldwater, & Johnson (ACL 2009) and code -// referenced therein. -// http://www.aclweb.org/anthology/P/P09/P09-2085.pdf -// -template <unsigned Floors, typename Dish, typename DishHash = boost::hash<Dish> > -class MFCR { - public: - - MFCR(double d, double strength) : - num_tables_(), - num_customers_(), - discount_(d), - strength_(strength), - discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()), - discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), - strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { check_hyperparameters(); } - - MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : - num_tables_(), - num_customers_(), - discount_(d), - strength_(strength), - discount_prior_strength_(discount_strength), - discount_prior_beta_(discount_beta), - strength_prior_shape_(strength_shape), - strength_prior_rate_(strength_rate) { check_hyperparameters(); } - - void check_hyperparameters() { - if (discount_ < 0.0 || discount_ >= 1.0) { - std::cerr << "Bad discount: " << discount_ << std::endl; - abort(); - } - if (strength_ <= -discount_) { - std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; - abort(); - } - } - - double discount() const { return discount_; } - double strength() const { return strength_; } - void set_hyperparameters(double d, double s) { - discount_ = d; strength_ = s; - check_hyperparameters(); - } - void set_discount(double d) { discount_ = d; check_hyperparameters(); } - void set_strength(double a) { strength_ = a; check_hyperparameters(); } - - bool has_discount_prior() const { - return !std::isnan(discount_prior_strength_); - } - - bool has_strength_prior() const { - return !std::isnan(strength_prior_shape_); - } - - void clear() { - num_tables_ = 0; - num_customers_ = 0; - dish_locs_.clear(); - } - - unsigned num_tables() const { - return num_tables_; - } - - unsigned num_tables(const Dish& dish) const { - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->second.table_counts_.size(); - } - - // this is not terribly efficient but it should not typically be necessary to execute this query - unsigned num_tables(const Dish& dish, const unsigned floor) const { - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - unsigned c = 0; - for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - if (i->floor == floor) ++c; - } - return c; - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->total_dish_count_; - } - - // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor - template <class InputIterator, class InputIterator2> - TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - // marg_p0 = marginal probability of opening a new table on any floor with label dish - typedef typename std::iterator_traits<InputIterator>::value_type F; - const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); - assert(marg_p0 <= F(1.0001)); - int floor = -1; - bool share_table = false; - if (loc.total_dish_count_) { - const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0; - const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - // this can be done with doubles since P0 (which may be tiny) is not involved - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= ti->count - discount_; - if (r <= 0.0) { - ++ti->count; - floor = ti->floor; - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { // sit at currently empty table -- must sample what floor - if (Floors == 1) { - floor = 0; - } else { - F r = F(rng->next()) * marg_p0; - for (unsigned i = 0; i < Floors; ++i) { - r -= (*p0s) * (*lambdas); - ++p0s; - ++lambdas; - if (r <= F(0.0)) { - floor = i; - break; - } - } - } - assert(floor >= 0); - loc.table_counts_.push_back(TableCount(1, floor)); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? TableCount(0, floor) : TableCount(1, floor)); - } - - // returns first = -1 or 0, indicating whether a table was closed, and on what floor (second) - TableCount decrement(const Dish& dish, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - assert(loc.total_dish_count_); - int floor = -1; - int delta = 0; - if (loc.total_dish_count_ == 1) { - floor = loc.table_counts_.front().floor; - dish_locs_.erase(dish); - --num_tables_; - --num_customers_; - delta = -1; - } else { - // sample customer to remove UNIFORMLY. that is, do NOT use the d - // here. if you do, it will introduce (unwanted) bias! - double r = rng->next() * loc.total_dish_count_; - --loc.total_dish_count_; - --num_customers_; - for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= ti->count; - if (r <= 0.0) { - floor = ti->floor; - if ((--ti->count) == 0) { - --num_tables_; - delta = -1; - loc.table_counts_.erase(ti); - } - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } - return TableCount(delta, floor); - } - - template <class InputIterator, class InputIterator2> - typename std::iterator_traits<InputIterator>::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const { - typedef typename std::iterator_traits<InputIterator>::value_type F; - const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); - assert(marg_p0 <= F(1.0001)); - const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - const F r = F(num_tables_ * discount_ + strength_); - if (it == dish_locs_.end()) { - return r * marg_p0 / F(num_customers_ + strength_); - } else { - return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) / - F(num_customers_ + strength_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, strength_); - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include draws from G_w's - double log_crp_prob(const double& discount, const double& strength) const { - double lp = 0.0; - if (has_discount_prior()) - lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); - if (has_strength_prior()) - lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - if (strength) - lp += lgamma(strength) - lgamma(strength / discount); - lp += - lgamma(strength + num_customers_) - + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - for (std::list<TableCount>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(ti->count - discount) - r; - } - } - } else if (!discount) { // discount == 0.0 - lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - lp += lgamma(cur.table_counts_.size()); - } - } else { - assert(!"discount less than 0 detected!"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_strength_prior()); - DiscountResampler dr(*this); - StrengthResampler sr(*this); - for (int iter = 0; iter < nloop; ++iter) { - if (has_strength_prior()) { - strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - double min_discount = std::numeric_limits<double>::min(); - if (strength_ < 0.0) min_discount -= strength_; - discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, - 1.0, 0.0, niterations, 100*niterations); - } - } - strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - DiscountResampler(const MFCR& crp) : crp_(crp) {} - const MFCR& crp_; - double operator()(const double& proposed_d) const { - return crp_.log_crp_prob(proposed_d, crp_.strength_); - } - }; - - struct StrengthResampler { - StrengthResampler(const MFCR& crp) : crp_(crp) {} - const MFCR& crp_; - double operator()(const double& proposediscount_strength) const { - return crp_.log_crp_prob(crp_.discount_, proposediscount_strength); - } - }; - - struct DishLocations { - DishLocations() : total_dish_count_() {} - unsigned total_dish_count_; // customers at all tables with this dish - std::list<TableCount> table_counts_; // list<> gives O(1) deletion and insertion, which we want - // .size() is the number of tables for this dish - }; - - void Print(std::ostream* out) const { - (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; - for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; - for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - (*out) << " " << *i; - } - (*out) << std::endl; - } - } - - typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator; - const_iterator begin() const { - return dish_locs_.begin(); - } - const_iterator end() const { - return dish_locs_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_; - - double discount_; - double strength_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_strength_; - double discount_prior_beta_; - - // optional gamma prior on strength_ (NaN if no prior) - double strength_prior_shape_; - double strength_prior_rate_; -}; - -template <unsigned N,typename T,typename H> -std::ostream& operator<<(std::ostream& o, const MFCR<N,T,H>& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc deleted file mode 100644 index 29a1a2ce..00000000 --- a/utils/mfcr_test.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include "mfcr.h" - -#include <iostream> -#include <cassert> -#include <cmath> - -#define BOOST_TEST_MODULE MFCRTest -#include <boost/test/unit_test.hpp> -#include <boost/test/floating_point_comparison.hpp> - -#include "sampler.h" - -using namespace std; - -BOOST_AUTO_TEST_CASE(Exchangability) { - MT19937 r; - MT19937* rng = &r; - MFCR<2, int> crp(0.5, 3.0); - vector<double> lambdas(2); - vector<double> p0s(2); - lambdas[0] = 0.2; - lambdas[1] = 0.8; - p0s[0] = 1.0; - p0s[1] = 1.0; - - double tot = 0; - double tot2 = 0; - double xt = 0; - int cust = 10; - vector<int> hist(cust + 1, 0), hist2(cust + 1, 0); - for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } - const int samples = 100000; - const bool simulate = true; - for (int k = 0; k < samples; ++k) { - if (!simulate) { - crp.clear(); - for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } - } else { - int da = rng->next() * cust; - bool a = rng->next() < 0.45; - if (a) { - for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } - for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } - xt += 1.0; - } else { - for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } - for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } - } - } - int c = crp.num_tables(1); - ++hist[c]; - tot += c; - int c2 = crp.num_tables(1,0); // tables on floor 0 with dish 1 - ++hist2[c2]; - tot2 += c2; - } - cerr << cust << " = " << crp.num_customers() << endl; - cerr << "P(a) = " << (xt / samples) << endl; - cerr << "E[num tables] = " << (tot / samples) << endl; - double error = fabs((tot / samples) - 6.894); - cerr << " error = " << error << endl; - for (int i = 1; i <= cust; ++i) - cerr << i << ' ' << (hist[i]) << endl; - cerr << "E[num tables on floor 0] = " << (tot2 / samples) << endl; - double error2 = fabs((tot2 / samples) - 1.379); - cerr << " error2 = " << error2 << endl; - for (int i = 1; i <= cust; ++i) - cerr << i << ' ' << (hist2[i]) << endl; - assert(error < 0.05); // these can fail with very low probability - assert(error2 < 0.05); -}; - diff --git a/utils/sampler.h b/utils/sampler.h index 3e4a4086..88e1856c 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -19,7 +19,7 @@ #include "prob.h" -template <typename F> struct SampleSet; +template <typename F> class SampleSet; template <typename RNG> struct RandomNumberGenerator { diff --git a/utils/slice_sampler.h b/utils/slice_sampler.h deleted file mode 100644 index aa48a169..00000000 --- a/utils/slice_sampler.h +++ /dev/null @@ -1,191 +0,0 @@ -//! slice-sampler.h is an MCMC slice sampler -//! -//! Mark Johnson, 1st August 2008 - -#ifndef SLICE_SAMPLER_H -#define SLICE_SAMPLER_H - -#include <algorithm> -#include <cassert> -#include <cmath> -#include <iostream> -#include <limits> - -//! slice_sampler_rfc_type{} returns the value of a user-specified -//! function if the argument is within range, or - infinity otherwise -// -template <typename F, typename Fn, typename U> -struct slice_sampler_rfc_type { - F min_x, max_x; - const Fn& f; - U max_nfeval, nfeval; - slice_sampler_rfc_type(F min_x, F max_x, const Fn& f, U max_nfeval) - : min_x(min_x), max_x(max_x), f(f), max_nfeval(max_nfeval), nfeval(0) { } - - F operator() (F x) { - if (min_x < x && x < max_x) { - assert(++nfeval <= max_nfeval); - F fx = f(x); - assert(std::isfinite(fx)); - return fx; - } - return -std::numeric_limits<F>::infinity(); - } -}; // slice_sampler_rfc_type{} - -//! slice_sampler1d() implements the univariate "range doubling" slice sampler -//! described in Neal (2003) "Slice Sampling", The Annals of Statistics 31(3), 705-767. -// -template <typename F, typename LogF, typename Uniform01> -F slice_sampler1d(const LogF& logF0, //!< log of function to sample - F x, //!< starting point - Uniform01& u01, //!< uniform [0,1) random number generator - F min_x = -std::numeric_limits<F>::infinity(), //!< minimum value of support - F max_x = std::numeric_limits<F>::infinity(), //!< maximum value of support - F w = 0.0, //!< guess at initial width - unsigned nsamples=1, //!< number of samples to draw - unsigned max_nfeval=200) //!< max number of function evaluations -{ - typedef unsigned U; - slice_sampler_rfc_type<F,LogF,U> logF(min_x, max_x, logF0, max_nfeval); - - assert(std::isfinite(x)); - - if (w <= 0.0) { // set w to a default width - if (min_x > -std::numeric_limits<F>::infinity() && max_x < std::numeric_limits<F>::infinity()) - w = (max_x - min_x)/4; - else - w = std::max(((x < 0.0) ? -x : x)/4, (F) 0.1); - } - assert(std::isfinite(w)); - - F logFx = logF(x); - for (U sample = 0; sample < nsamples; ++sample) { - F logY = logFx + log(u01()+1e-100); //! slice logFx at this value - assert(std::isfinite(logY)); - - F xl = x - w*u01(); //! lower bound on slice interval - F logFxl = logF(xl); - F xr = xl + w; //! upper bound on slice interval - F logFxr = logF(xr); - - while (logY < logFxl || logY < logFxr) // doubling procedure - if (u01() < 0.5) - logFxl = logF(xl -= xr - xl); - else - logFxr = logF(xr += xr - xl); - - F xl1 = xl; - F xr1 = xr; - while (true) { // shrinking procedure - F x1 = xl1 + u01()*(xr1 - xl1); - if (logY < logF(x1)) { - F xl2 = xl; // acceptance procedure - F xr2 = xr; - bool d = false; - while (xr2 - xl2 > 1.1*w) { - F xm = (xl2 + xr2)/2; - if ((x < xm && x1 >= xm) || (x >= xm && x1 < xm)) - d = true; - if (x1 < xm) - xr2 = xm; - else - xl2 = xm; - if (d && logY >= logF(xl2) && logY >= logF(xr2)) - goto unacceptable; - } - x = x1; - goto acceptable; - } - goto acceptable; - unacceptable: - if (x1 < x) // rest of shrinking procedure - xl1 = x1; - else - xr1 = x1; - } - acceptable: - w = (4*w + (xr1 - xl1))/5; // update width estimate - } - return x; -} - -/* -//! slice_sampler1d() implements a 1-d MCMC slice sampler. -//! It should be correct for unimodal distributions, but -//! not for multimodal ones. -// -template <typename F, typename LogP, typename Uniform01> -F slice_sampler1d(const LogP& logP, //!< log of distribution to sample - F x, //!< initial sample - Uniform01& u01, //!< uniform random number generator - F min_x = -std::numeric_limits<F>::infinity(), //!< minimum value of support - F max_x = std::numeric_limits<F>::infinity(), //!< maximum value of support - F w = 0.0, //!< guess at initial width - unsigned nsamples=1, //!< number of samples to draw - unsigned max_nfeval=200) //!< max number of function evaluations -{ - typedef unsigned U; - assert(std::isfinite(x)); - if (w <= 0.0) { - if (min_x > -std::numeric_limits<F>::infinity() && max_x < std::numeric_limits<F>::infinity()) - w = (max_x - min_x)/4; - else - w = std::max(((x < 0.0) ? -x : x)/4, 0.1); - } - // TRACE4(x, min_x, max_x, w); - F logPx = logP(x); - assert(std::isfinite(logPx)); - U nfeval = 1; - for (U sample = 0; sample < nsamples; ++sample) { - F x0 = x; - F logU = logPx + log(u01()+1e-100); - assert(std::isfinite(logU)); - F r = u01(); - F xl = std::max(min_x, x - r*w); - F xr = std::min(max_x, x + (1-r)*w); - // TRACE3(x, logPx, logU); - while (xl > min_x && logP(xl) > logU) { - xl -= w; - w *= 2; - ++nfeval; - if (nfeval >= max_nfeval) - std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << std::endl; - assert(nfeval < max_nfeval); - } - xl = std::max(xl, min_x); - while (xr < max_x && logP(xr) > logU) { - xr += w; - w *= 2; - ++nfeval; - if (nfeval >= max_nfeval) - std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xr = " << xr << std::endl; - assert(nfeval < max_nfeval); - } - xr = std::min(xr, max_x); - while (true) { - r = u01(); - x = r*xl + (1-r)*xr; - assert(std::isfinite(x)); - logPx = logP(x); - // TRACE4(logPx, x, xl, xr); - assert(std::isfinite(logPx)); - ++nfeval; - if (nfeval >= max_nfeval) - std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << ", xr = " << xr << ", x = " << x << std::endl; - assert(nfeval < max_nfeval); - if (logPx > logU) - break; - else if (x > x0) - xr = x; - else - xl = x; - } - // w = (4*w + (xr-xl))/5; // gradually adjust w - } - // TRACE2(logPx, x); - return x; -} // slice_sampler1d() -*/ - -#endif // SLICE_SAMPLER_H diff --git a/utils/small_vector.h b/utils/small_vector.h index 894b1b32..c8a69927 100644 --- a/utils/small_vector.h +++ b/utils/small_vector.h @@ -66,7 +66,7 @@ class SmallVector { //TODO: figure out iterator traits to allow this to be selcted for any iterator range template <class I> SmallVector(I const* begin,I const* end) { - int s=end-begin; + unsigned s=end-begin; Alloc(s); if (s <= SV_MAX) { for (unsigned i = 0; i < s; ++i,++begin) data_.vals[i] = *begin; diff --git a/utils/stringlib.h b/utils/stringlib.h index 75772c4d..ff5dc89d 100644 --- a/utils/stringlib.h +++ b/utils/stringlib.h @@ -86,7 +86,7 @@ bool match_begin(Str const& str,Prefix const& prefix) // source will be returned as a string, target must be a sentence or // a lattice (in PLF format) and will be returned as a Lattice object void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); -struct Lattice; +class Lattice; void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") { diff --git a/utils/tdict.cc b/utils/tdict.cc index f33bd576..fd2b76cb 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -13,22 +13,6 @@ using namespace std; Dict TD::dict_; -unsigned int TD::NumWords() { - return dict_.max(); -} - -WordID TD::Convert(const std::string& s) { - return dict_.Convert(s); -} - -WordID TD::Convert(char const* s) { - return dict_.Convert(string(s)); -} - -const char* TD::Convert(WordID w) { - return dict_.Convert(w).c_str(); -} - void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) { ids->clear(); for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i) @@ -57,7 +41,8 @@ std::string TD::GetString(WordID const* i,WordID const* e) { int TD::AppendString(const WordID& w, int pos, int bufsize, char* buffer) { - const char* word = TD::Convert(w); + const string& s = TD::Convert(w); + const char* word = s.c_str(); const char* const end_buf = buffer + bufsize; char* dest = buffer + pos; while(dest < end_buf && *word) { diff --git a/utils/tdict.h b/utils/tdict.h index 393146fa..03afc2e6 100644 --- a/utils/tdict.h +++ b/utils/tdict.h @@ -3,10 +3,9 @@ #include <string> #include <vector> +#include <cassert> #include "wordid.h" -#include <assert.h> - -class Dict; +#include "dict.h" struct TD { static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far @@ -15,10 +14,18 @@ struct TD { static std::string GetString(const std::vector<WordID>& str); static std::string GetString(WordID const* i,WordID const* e); static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); - static unsigned int NumWords(); - static WordID Convert(const std::string& s); - static WordID Convert(char const* s); - static const char* Convert(WordID w); + static unsigned int NumWords() { + return dict_.max(); + } + static WordID Convert(const std::string& s) { + return dict_.Convert(s); + } + static WordID Convert(char const* s) { + return dict_.Convert(std::string(s)); + } + static const std::string& Convert(WordID w) { + return dict_.Convert(w); + } private: static Dict dict_; }; diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc deleted file mode 100644 index 510e8839..00000000 --- a/utils/unigram_pyp_lm.cc +++ /dev/null @@ -1,168 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "corpus_tools.h" -#include "m.h" -#include "tdict.h" -#include "sampler.h" -#include "ccrp.h" - -// A not very memory-efficient implementation of an 1-gram LM based on PYPs -// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model -// based on Pitman-Yor Processes. In Proc. ACL. - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,n",po::value<unsigned>()->default_value(50),"Number of samples") - ("train,i",po::value<string>(),"Training data file") - ("test,T",po::value<string>(),"Test data file") - ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this") - ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this") - ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this") - ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this") - ("random_seed,S",po::value<uint32_t>(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("train") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -// uniform base distribution (0-gram model) -struct UniformWordModel { - explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {} - void increment() { ++draws; } - void decrement() { --draws; assert(draws >= 0); } - double prob(WordID) const { return p0; } // all words have equal prob - double log_likelihood() const { return draws * log(p0); } - const double p0; - int draws; -}; - -// represents an Unigram LM -struct UnigramLM { - UnigramLM(unsigned vs, double da, double db, double ss, double sr) : - uniform_vocab(vs), - crp(da, db, ss, sr, 0.8, 1.0) {} - void increment(WordID w, MT19937* rng) { - const double backoff = uniform_vocab.prob(w); - if (crp.increment(w, backoff, rng)) - uniform_vocab.increment(); - } - void decrement(WordID w, MT19937* rng) { - if (crp.decrement(w, rng)) - uniform_vocab.decrement(); - } - double prob(WordID w) const { - const double backoff = uniform_vocab.prob(w); - return crp.prob(w, backoff); - } - - double log_likelihood() const { - double llh = uniform_vocab.log_likelihood(); - llh += crp.log_crp_prob(); - return llh; - } - - void resample_hyperparameters(MT19937* rng) { - crp.resample_hyperparameters(rng); - } - - double discount_a, discount_b, strength_s, strength_r; - double d, strength; - UniformWordModel uniform_vocab; - CCRP<WordID> crp; -}; - -int main(int argc, char** argv) { - po::variables_map conf; - - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as<unsigned>(); - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - vector<vector<WordID> > corpuse; - set<WordID> vocabe; - const WordID kEOS = TD::Convert("</s>"); - cerr << "Reading corpus...\n"; - CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - vector<vector<WordID> > test; - if (conf.count("test")) - CorpusTools::ReadFromFile(conf["test"].as<string>(), &test); - else - test = corpuse; - UnigramLM lm(vocabe.size(), - conf["discount_prior_a"].as<double>(), - conf["discount_prior_b"].as<double>(), - conf["strength_prior_s"].as<double>(), - conf["strength_prior_r"].as<double>()); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { - const vector<WordID>& s = corpuse[ci]; - for (int i = 0; i <= s.size(); ++i) { - WordID w = (i < s.size() ? s[i] : kEOS); - if (SS > 0) lm.decrement(w, &rng); - lm.increment(w, &rng); - } - if (SS > 0) lm.decrement(kEOS, &rng); - lm.increment(kEOS, &rng); - } - cerr << "LLH=" << lm.log_likelihood() << endl; - //if (SS % 10 == 9) lm.resample_hyperparameters(&rng); - } - double llh = 0; - unsigned cnt = 0; - unsigned oovs = 0; - for (int ci = 0; ci < test.size(); ++ci) { - const vector<WordID>& s = test[ci]; - for (int i = 0; i <= s.size(); ++i) { - WordID w = (i < s.size() ? s[i] : kEOS); - double lp = log(lm.prob(w)) / log(2); - if (i < s.size() && vocabe.count(w) == 0) { - cerr << "**OOV "; - ++oovs; - lp = 0; - } - cerr << "p(" << TD::Convert(w) << ") = " << lp << endl; - llh -= lp; - cnt++; - } - } - cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; - cerr << " Count: " << cnt << endl; - cerr << " OOVs: " << oovs << endl; - cerr << "Cross-entropy: " << (llh / cnt) << endl; - cerr << " Perplexity: " << pow(2, llh / cnt) << endl; - return 0; -} - diff --git a/utils/weights.cc b/utils/weights.cc index f56e2a20..575877b6 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -34,7 +34,7 @@ void Weights::InitFromFile(const string& filename, int weight_count = 0; bool fl = false; string buf; - weight_t val = 0; + double val = 0; while (in) { getline(in, buf); if (buf.size() == 0) continue; @@ -53,7 +53,7 @@ void Weights::InitFromFile(const string& filename, if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); } while(end < buf.size() && buf[end] == ' ') ++end; val = strtod(&buf.c_str()[end], NULL); - if (isnan(val)) { + if (std::isnan(val)) { cerr << FD::Convert(fid) << " has weight NaN!\n"; abort(); } @@ -127,8 +127,8 @@ void Weights::InitSparseVector(const vector<weight_t>& dv, void Weights::SanityCheck(const vector<weight_t>& w) { for (unsigned i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); + assert(!std::isnan(w[i])); + assert(!std::isinf(w[i])); } } |