summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-10-11 14:06:32 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2012-10-11 14:06:32 -0400
commit07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch)
tree644496a1690d84d82a396bbc1e39160788beb2cd /utils
parent37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff)
parenta36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff)
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'utils')
-rw-r--r--utils/Jamfile32
-rw-r--r--utils/Makefile.am13
-rw-r--r--utils/ccrp.h270
-rw-r--r--utils/ccrp_nt.h164
-rw-r--r--utils/ccrp_onetable.h253
-rw-r--r--utils/crp_table_manager.h114
-rw-r--r--utils/crp_test.cc91
-rw-r--r--utils/fast_sparse_vector.h2
-rw-r--r--utils/gamma_poisson.h33
-rw-r--r--utils/mfcr.h370
-rw-r--r--utils/mfcr_test.cc72
-rw-r--r--utils/sampler.h2
-rw-r--r--utils/slice_sampler.h191
-rw-r--r--utils/small_vector.h2
-rw-r--r--utils/stringlib.h2
-rw-r--r--utils/unigram_pyp_lm.cc214
16 files changed, 6 insertions, 1819 deletions
diff --git a/utils/Jamfile b/utils/Jamfile
deleted file mode 100644
index 4444b25f..00000000
--- a/utils/Jamfile
+++ /dev/null
@@ -1,32 +0,0 @@
-import testing ;
-import option ;
-
-additional = ;
-if [ option.get with-cmph ] {
- additional += perfect_hash.cc ;
-}
-
-lib utils :
- alignment_io.cc
- b64tools.cc
- corpus_tools.cc
- dict.cc
- tdict.cc
- fdict.cc
- gzstream.cc
- filelib.cc
- stringlib.cc
- sparse_vector.cc
- timing_stats.cc
- verbose.cc
- weights.cc
- $(additional)
- ..//z
- : <include>.. <include>. : : <include>.. <include>. ;
-
-exe atools : atools.cc utils ..//boost_program_options ;
-exe reconstruct_weights : reconstruct_weights.cc utils ..//boost_program_options ;
-
-alias programs : reconstruct_weights atools ;
-
-all_tests [ glob *_test.cc phmt.cc ts.cc ] : utils : <testing.arg>$(TOP)/utils/test_data ;
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 799ec879..3ad9d69e 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -3,16 +3,13 @@ bin_PROGRAMS = reconstruct_weights atools
noinst_PROGRAMS = \
ts \
phmt \
- mfcr_test \
- crp_test \
dict_test \
m_test \
weights_test \
logval_test \
- small_vector_test \
- unigram_pyp_lm
+ small_vector_test
-TESTS = ts mfcr_test crp_test small_vector_test logval_test weights_test dict_test m_test
+TESTS = ts small_vector_test logval_test weights_test dict_test m_test
noinst_LIBRARIES = libutils.a
@@ -48,18 +45,12 @@ m_test_SOURCES = m_test.cc
m_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
dict_test_SOURCES = dict_test.cc
dict_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
-mfcr_test_SOURCES = mfcr_test.cc
-mfcr_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
weights_test_SOURCES = weights_test.cc
weights_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
-crp_test_SOURCES = crp_test.cc
-crp_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
logval_test_SOURCES = logval_test.cc
logval_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
small_vector_test_SOURCES = small_vector_test.cc
small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
-unigram_pyp_lm_SOURCES = unigram_pyp_lm.cc
-unigram_pyp_lm_LDADD = libutils.a -lz
################################################################
# do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
diff --git a/utils/ccrp.h b/utils/ccrp.h
deleted file mode 100644
index f5d3fc78..00000000
--- a/utils/ccrp.h
+++ /dev/null
@@ -1,270 +0,0 @@
-#ifndef _CCRP_H_
-#define _CCRP_H_
-
-#include <numeric>
-#include <cassert>
-#include <cmath>
-#include <list>
-#include <iostream>
-#include <vector>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include "sampler.h"
-#include "slice_sampler.h"
-#include "crp_table_manager.h"
-#include "m.h"
-
-// Chinese restaurant process (Pitman-Yor parameters) with table tracking.
-
-template <typename Dish, typename DishHash = boost::hash<Dish> >
-class CCRP {
- public:
- CCRP(double disc, double strength) :
- num_tables_(),
- num_customers_(),
- discount_(disc),
- strength_(strength),
- discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
- discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
- strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
- strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {
- check_hyperparameters();
- }
-
- CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
- num_tables_(),
- num_customers_(),
- discount_(d),
- strength_(c),
- discount_prior_strength_(d_strength),
- discount_prior_beta_(d_beta),
- strength_prior_shape_(c_shape),
- strength_prior_rate_(c_rate) {
- check_hyperparameters();
- }
-
- void check_hyperparameters() {
- if (discount_ < 0.0 || discount_ >= 1.0) {
- std::cerr << "Bad discount: " << discount_ << std::endl;
- abort();
- }
- if (strength_ <= -discount_) {
- std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
- abort();
- }
- }
-
- double discount() const { return discount_; }
- double strength() const { return strength_; }
- void set_hyperparameters(double d, double s) {
- discount_ = d; strength_ = s;
- check_hyperparameters();
- }
- void set_discount(double d) { discount_ = d; check_hyperparameters(); }
- void set_strength(double a) { strength_ = a; check_hyperparameters(); }
-
- bool has_discount_prior() const {
- return !std::isnan(discount_prior_strength_);
- }
-
- bool has_strength_prior() const {
- return !std::isnan(strength_prior_shape_);
- }
-
- void clear() {
- num_tables_ = 0;
- num_customers_ = 0;
- dish_locs_.clear();
- }
-
- unsigned num_tables() const {
- return num_tables_;
- }
-
- unsigned num_tables(const Dish& dish) const {
- const typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator it = dish_locs_.find(dish);
- if (it == dish_locs_.end()) return 0;
- return it->second.num_tables();
- }
-
- unsigned num_customers() const {
- return num_customers_;
- }
-
- unsigned num_customers(const Dish& dish) const {
- const typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator it = dish_locs_.find(dish);
- if (it == dish_locs_.end()) return 0;
- return it->num_customers();
- }
-
- // returns +1 or 0 indicating whether a new table was opened
- // p = probability with which the particular table was selected
- // excluding p0
- template <typename T>
- int increment(const Dish& dish, const T& p0, MT19937* rng, T* p = NULL) {
- CRPTableManager& loc = dish_locs_[dish];
- bool share_table = false;
- if (loc.num_customers()) {
- const T p_empty = T(strength_ + num_tables_ * discount_) * p0;
- const T p_share = T(loc.num_customers() - loc.num_tables() * discount_);
- share_table = rng->SelectSample(p_empty, p_share);
- }
- if (share_table) {
- loc.share_table(discount_, rng);
- } else {
- loc.create_table();
- ++num_tables_;
- }
- ++num_customers_;
- return (share_table ? 0 : 1);
- }
-
- // returns -1 or 0, indicating whether a table was closed
- int decrement(const Dish& dish, MT19937* rng) {
- CRPTableManager& loc = dish_locs_[dish];
- assert(loc.num_customers());
- if (loc.num_customers() == 1) {
- dish_locs_.erase(dish);
- --num_tables_;
- --num_customers_;
- return -1;
- } else {
- int delta = loc.remove_customer(rng);
- --num_customers_;
- if (delta) --num_tables_;
- return delta;
- }
- }
-
- template <typename T>
- T prob(const Dish& dish, const T& p0) const {
- const typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator it = dish_locs_.find(dish);
- const T r = T(num_tables_ * discount_ + strength_);
- if (it == dish_locs_.end()) {
- return r * p0 / T(num_customers_ + strength_);
- } else {
- return (T(it->second.num_customers() - discount_ * it->second.num_tables()) + r * p0) /
- T(num_customers_ + strength_);
- }
- }
-
- double log_crp_prob() const {
- return log_crp_prob(discount_, strength_);
- }
-
- // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
- // does not include P_0's
- double log_crp_prob(const double& discount, const double& strength) const {
- double lp = 0.0;
- if (has_discount_prior())
- lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_);
- if (has_strength_prior())
- lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_);
- assert(lp <= 0.0);
- if (num_customers_) {
- if (discount > 0.0) {
- const double r = lgamma(1.0 - discount);
- if (strength)
- lp += lgamma(strength) - lgamma(strength / discount);
- lp += - lgamma(strength + num_customers_)
- + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_);
- assert(std::isfinite(lp));
- for (typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator it = dish_locs_.begin();
- it != dish_locs_.end(); ++it) {
- const CRPTableManager& cur = it->second; // TODO check
- for (CRPTableManager::const_iterator ti = cur.begin(); ti != cur.end(); ++ti) {
- lp += (lgamma(ti->first - discount) - r) * ti->second;
- }
- }
- } else if (!discount) { // discount == 0.0
- lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_);
- assert(std::isfinite(lp));
- for (typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator it = dish_locs_.begin();
- it != dish_locs_.end(); ++it) {
- const CRPTableManager& cur = it->second;
- lp += lgamma(cur.num_tables());
- }
- } else {
- assert(!"discount less than 0 detected!");
- }
- }
- assert(std::isfinite(lp));
- return lp;
- }
-
- void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
- assert(has_discount_prior() || has_strength_prior());
- if (num_customers() == 0) return;
- DiscountResampler dr(*this);
- StrengthResampler sr(*this);
- for (unsigned iter = 0; iter < nloop; ++iter) {
- if (has_strength_prior()) {
- strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(),
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
- if (has_discount_prior()) {
- double min_discount = std::numeric_limits<double>::min();
- if (strength_ < 0.0) min_discount -= strength_;
- discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
- 1.0, 0.0, niterations, 100*niterations);
- }
- }
- strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
-
- struct DiscountResampler {
- DiscountResampler(const CCRP& crp) : crp_(crp) {}
- const CCRP& crp_;
- double operator()(const double& proposed_discount) const {
- return crp_.log_crp_prob(proposed_discount, crp_.strength_);
- }
- };
-
- struct StrengthResampler {
- StrengthResampler(const CCRP& crp) : crp_(crp) {}
- const CCRP& crp_;
- double operator()(const double& proposed_strength) const {
- return crp_.log_crp_prob(crp_.discount_, proposed_strength);
- }
- };
-
- void Print(std::ostream* out) const {
- std::cerr << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl;
- for (typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator it = dish_locs_.begin();
- it != dish_locs_.end(); ++it) {
- (*out) << it->first << " : " << it->second << std::endl;
- }
- }
-
- typedef typename std::tr1::unordered_map<Dish, CRPTableManager, DishHash>::const_iterator const_iterator;
- const_iterator begin() const {
- return dish_locs_.begin();
- }
- const_iterator end() const {
- return dish_locs_.end();
- }
-
- unsigned num_tables_;
- unsigned num_customers_;
- std::tr1::unordered_map<Dish, CRPTableManager, DishHash> dish_locs_;
-
- double discount_;
- double strength_;
-
- // optional beta prior on discount_ (NaN if no prior)
- double discount_prior_strength_;
- double discount_prior_beta_;
-
- // optional gamma prior on strength_ (NaN if no prior)
- double strength_prior_shape_;
- double strength_prior_rate_;
-};
-
-template <typename T,typename H>
-std::ostream& operator<<(std::ostream& o, const CCRP<T,H>& c) {
- c.Print(&o);
- return o;
-}
-
-#endif
diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h
deleted file mode 100644
index 724b11bd..00000000
--- a/utils/ccrp_nt.h
+++ /dev/null
@@ -1,164 +0,0 @@
-#ifndef _CCRP_NT_H_
-#define _CCRP_NT_H_
-
-#include <numeric>
-#include <cassert>
-#include <cmath>
-#include <list>
-#include <iostream>
-#include <vector>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include "sampler.h"
-#include "slice_sampler.h"
-#include "m.h"
-
-// Chinese restaurant process (1 parameter)
-template <typename Dish, typename DishHash = boost::hash<Dish> >
-class CCRP_NoTable {
- public:
- explicit CCRP_NoTable(double conc) :
- num_customers_(),
- alpha_(conc),
- alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
- alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
-
- CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) :
- num_customers_(),
- alpha_(c),
- alpha_prior_shape_(c_shape),
- alpha_prior_rate_(c_rate) {}
-
- double alpha() const { return alpha_; }
- void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); }
-
- bool has_alpha_prior() const {
- return !std::isnan(alpha_prior_shape_);
- }
-
- void clear() {
- num_customers_ = 0;
- custs_.clear();
- }
-
- unsigned num_customers() const {
- return num_customers_;
- }
-
- unsigned num_customers(const Dish& dish) const {
- const typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.find(dish);
- if (it == custs_.end()) return 0;
- return it->second;
- }
-
- int increment(const Dish& dish) {
- int table_diff = 0;
- if (++custs_[dish] == 1)
- table_diff = 1;
- ++num_customers_;
- return table_diff;
- }
-
- int decrement(const Dish& dish) {
- int table_diff = 0;
- int nc = --custs_[dish];
- if (nc == 0) {
- custs_.erase(dish);
- table_diff = -1;
- } else if (nc < 0) {
- std::cerr << "Dish counts dropped below zero for: " << dish << std::endl;
- abort();
- }
- --num_customers_;
- return table_diff;
- }
-
- template <typename F>
- F prob(const Dish& dish, const F& p0) const {
- const unsigned at_table = num_customers(dish);
- return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_);
- }
-
- double logprob(const Dish& dish, const double& logp0) const {
- const unsigned at_table = num_customers(dish);
- return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_);
- }
-
- double log_crp_prob() const {
- return log_crp_prob(alpha_);
- }
-
- // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
- // does not include P_0's
- double log_crp_prob(const double& alpha) const {
- double lp = 0.0;
- if (has_alpha_prior())
- lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
- assert(lp <= 0.0);
- if (num_customers_) {
- lp += lgamma(alpha) - lgamma(alpha + num_customers_) +
- custs_.size() * log(alpha);
- assert(std::isfinite(lp));
- for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
- it != custs_.end(); ++it) {
- lp += lgamma(it->second);
- }
- }
- assert(std::isfinite(lp));
- return lp;
- }
-
- void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
- assert(has_alpha_prior());
- ConcentrationResampler cr(*this);
- for (unsigned iter = 0; iter < nloop; ++iter) {
- alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
- }
-
- struct ConcentrationResampler {
- ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {}
- const CCRP_NoTable& crp_;
- double operator()(const double& proposed_alpha) const {
- return crp_.log_crp_prob(proposed_alpha);
- }
- };
-
- void Print(std::ostream* out) const {
- (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl;
- int cc = 0;
- for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
- it != custs_.end(); ++it) {
- (*out) << " " << it->first << "(" << it->second << " eating)";
- ++cc;
- if (cc > 10) { (*out) << " ..."; break; }
- }
- (*out) << std::endl;
- }
-
- unsigned num_customers_;
- std::tr1::unordered_map<Dish, unsigned, DishHash> custs_;
-
- typedef typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator const_iterator;
- const_iterator begin() const {
- return custs_.begin();
- }
- const_iterator end() const {
- return custs_.end();
- }
-
- double alpha_;
-
- // optional gamma prior on alpha_ (NaN if no prior)
- double alpha_prior_shape_;
- double alpha_prior_rate_;
-};
-
-template <typename T,typename H>
-std::ostream& operator<<(std::ostream& o, const CCRP_NoTable<T,H>& c) {
- c.Print(&o);
- return o;
-}
-
-#endif
diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h
deleted file mode 100644
index abe399ea..00000000
--- a/utils/ccrp_onetable.h
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef _CCRP_ONETABLE_H_
-#define _CCRP_ONETABLE_H_
-
-#include <numeric>
-#include <cassert>
-#include <cmath>
-#include <list>
-#include <iostream>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include "sampler.h"
-#include "slice_sampler.h"
-
-// Chinese restaurant process (Pitman-Yor parameters) with one table approximation
-
-template <typename Dish, typename DishHash = boost::hash<Dish> >
-class CCRP_OneTable {
- typedef std::tr1::unordered_map<Dish, unsigned, DishHash> DishMapType;
- public:
- CCRP_OneTable(double disc, double conc) :
- num_tables_(),
- num_customers_(),
- discount_(disc),
- alpha_(conc),
- discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
- discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
- alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
- alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
-
- CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
- num_tables_(),
- num_customers_(),
- discount_(d),
- alpha_(c),
- discount_prior_alpha_(d_alpha),
- discount_prior_beta_(d_beta),
- alpha_prior_shape_(c_shape),
- alpha_prior_rate_(c_rate) {}
-
- double discount() const { return discount_; }
- double alpha() const { return alpha_; }
- void set_alpha(double c) { alpha_ = c; }
- void set_discount(double d) { discount_ = d; }
-
- bool has_discount_prior() const {
- return !std::isnan(discount_prior_alpha_);
- }
-
- bool has_alpha_prior() const {
- return !std::isnan(alpha_prior_shape_);
- }
-
- void clear() {
- num_tables_ = 0;
- num_customers_ = 0;
- dish_counts_.clear();
- }
-
- unsigned num_tables() const {
- return num_tables_;
- }
-
- unsigned num_tables(const Dish& dish) const {
- const typename DishMapType::const_iterator it = dish_counts_.find(dish);
- if (it == dish_counts_.end()) return 0;
- return 1;
- }
-
- unsigned num_customers() const {
- return num_customers_;
- }
-
- unsigned num_customers(const Dish& dish) const {
- const typename DishMapType::const_iterator it = dish_counts_.find(dish);
- if (it == dish_counts_.end()) return 0;
- return it->second;
- }
-
- // returns +1 or 0 indicating whether a new table was opened
- int increment(const Dish& dish) {
- unsigned& dc = dish_counts_[dish];
- ++dc;
- ++num_customers_;
- if (dc == 1) {
- ++num_tables_;
- return 1;
- } else {
- return 0;
- }
- }
-
- // returns -1 or 0, indicating whether a table was closed
- int decrement(const Dish& dish) {
- unsigned& dc = dish_counts_[dish];
- assert(dc > 0);
- if (dc == 1) {
- dish_counts_.erase(dish);
- --num_tables_;
- --num_customers_;
- return -1;
- } else {
- assert(dc > 1);
- --dc;
- --num_customers_;
- return 0;
- }
- }
-
- double prob(const Dish& dish, const double& p0) const {
- const typename DishMapType::const_iterator it = dish_counts_.find(dish);
- const double r = num_tables_ * discount_ + alpha_;
- if (it == dish_counts_.end()) {
- return r * p0 / (num_customers_ + alpha_);
- } else {
- return (it->second - discount_ + r * p0) /
- (num_customers_ + alpha_);
- }
- }
-
- template <typename T>
- T probT(const Dish& dish, const T& p0) const {
- const typename DishMapType::const_iterator it = dish_counts_.find(dish);
- const T r(num_tables_ * discount_ + alpha_);
- if (it == dish_counts_.end()) {
- return r * p0 / T(num_customers_ + alpha_);
- } else {
- return (T(it->second - discount_) + r * p0) /
- T(num_customers_ + alpha_);
- }
- }
-
- double log_crp_prob() const {
- return log_crp_prob(discount_, alpha_);
- }
-
- static double log_beta_density(const double& x, const double& alpha, const double& beta) {
- assert(x > 0.0);
- assert(x < 1.0);
- assert(alpha > 0.0);
- assert(beta > 0.0);
- const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta);
- return lp;
- }
-
- static double log_gamma_density(const double& x, const double& shape, const double& rate) {
- assert(x >= 0.0);
- assert(shape > 0.0);
- assert(rate > 0.0);
- const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
- return lp;
- }
-
- // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
- // does not include P_0's
- double log_crp_prob(const double& discount, const double& alpha) const {
- double lp = 0.0;
- if (has_discount_prior())
- lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
- if (has_alpha_prior())
- lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
- assert(lp <= 0.0);
- if (num_customers_) {
- if (discount > 0.0) {
- const double r = lgamma(1.0 - discount);
- lp += lgamma(alpha) - lgamma(alpha + num_customers_)
- + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_)
- - lgamma(alpha / discount);
- assert(std::isfinite(lp));
- for (typename DishMapType::const_iterator it = dish_counts_.begin();
- it != dish_counts_.end(); ++it) {
- const unsigned& cur = it->second;
- lp += lgamma(cur - discount) - r;
- }
- } else {
- assert(!"not implemented yet");
- }
- }
- assert(std::isfinite(lp));
- return lp;
- }
-
- void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
- assert(has_discount_prior() || has_alpha_prior());
- DiscountResampler dr(*this);
- ConcentrationResampler cr(*this);
- for (unsigned iter = 0; iter < nloop; ++iter) {
- if (has_alpha_prior()) {
- alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
- if (has_discount_prior()) {
- discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(),
- 1.0, 0.0, niterations, 100*niterations);
- }
- }
- alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
-
- struct DiscountResampler {
- DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {}
- const CCRP_OneTable& crp_;
- double operator()(const double& proposed_discount) const {
- return crp_.log_crp_prob(proposed_discount, crp_.alpha_);
- }
- };
-
- struct ConcentrationResampler {
- ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {}
- const CCRP_OneTable& crp_;
- double operator()(const double& proposed_alpha) const {
- return crp_.log_crp_prob(crp_.discount_, proposed_alpha);
- }
- };
-
- void Print(std::ostream* out) const {
- (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl;
- for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) {
- (*out) << " " << it->first << " = " << it->second << std::endl;
- }
- }
-
- typedef typename DishMapType::const_iterator const_iterator;
- const_iterator begin() const {
- return dish_counts_.begin();
- }
- const_iterator end() const {
- return dish_counts_.end();
- }
-
- unsigned num_tables_;
- unsigned num_customers_;
- DishMapType dish_counts_;
-
- double discount_;
- double alpha_;
-
- // optional beta prior on discount_ (NaN if no prior)
- double discount_prior_alpha_;
- double discount_prior_beta_;
-
- // optional gamma prior on alpha_ (NaN if no prior)
- double alpha_prior_shape_;
- double alpha_prior_rate_;
-};
-
-template <typename T,typename H>
-std::ostream& operator<<(std::ostream& o, const CCRP_OneTable<T,H>& c) {
- c.Print(&o);
- return o;
-}
-
-#endif
diff --git a/utils/crp_table_manager.h b/utils/crp_table_manager.h
deleted file mode 100644
index 753e721f..00000000
--- a/utils/crp_table_manager.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef _CRP_TABLE_MANAGER_H_
-#define _CRP_TABLE_MANAGER_H_
-
-#include <iostream>
-#include "sparse_vector.h"
-#include "sampler.h"
-
-// these are helper classes for implementing token-based CRP samplers
-// basically the data structures recommended by Blunsom et al. in the Note.
-
-struct CRPHistogram {
- //typedef std::map<unsigned, unsigned> MAPTYPE;
- typedef SparseVector<unsigned> MAPTYPE;
- typedef MAPTYPE::const_iterator const_iterator;
-
- inline void increment(unsigned bin, unsigned delta = 1u) {
- data[bin] += delta;
- }
- inline void decrement(unsigned bin, unsigned delta = 1u) {
- unsigned r = data[bin] -= delta;
- if (!r) data.erase(bin);
- }
- inline void move(unsigned from_bin, unsigned to_bin, unsigned delta = 1u) {
- decrement(from_bin, delta);
- increment(to_bin, delta);
- }
- inline const_iterator begin() const { return data.begin(); }
- inline const_iterator end() const { return data.end(); }
-
- private:
- MAPTYPE data;
-};
-
-// A CRPTableManager tracks statistics about all customers
-// and tables serving some dish in a CRP and can correctly sample what
-// table to remove a customer from and what table to join
-struct CRPTableManager {
- CRPTableManager() : customers(), tables() {}
-
- inline unsigned num_tables() const {
- return tables;
- }
-
- inline unsigned num_customers() const {
- return customers;
- }
-
- inline void create_table() {
- h.increment(1);
- ++tables;
- ++customers;
- }
-
- // seat a customer at a table proportional to the number of customers seated at a table, less the discount
- // *new tables are never created by this function!
- inline void share_table(const double discount, MT19937* rng) {
- const double z = customers - discount * num_tables();
- double r = z * rng->next();
- const CRPHistogram::const_iterator end = h.end();
- CRPHistogram::const_iterator it = h.begin();
- for (; it != end; ++it) {
- // it->first = number of customers at table
- // it->second = number of such tables
- double thresh = (it->first - discount) * it->second;
- if (thresh > r) break;
- r -= thresh;
- }
- h.move(it->first, it->first + 1);
- ++customers;
- }
-
- // randomly sample a customer
- // *tables may be removed
- // returns -1 if a table is removed, 0 otherwise
- inline int remove_customer(MT19937* rng) {
- int r = rng->next() * num_customers();
- const CRPHistogram::const_iterator end = h.end();
- CRPHistogram::const_iterator it = h.begin();
- for (; it != end; ++it) {
- int thresh = it->first * it->second;
- if (thresh > r) break;
- r -= thresh;
- }
- --customers;
- const unsigned tc = it->first;
- if (tc == 1) {
- h.decrement(1);
- --tables;
- return -1;
- } else {
- h.move(tc, tc - 1);
- return 0;
- }
- }
-
- typedef CRPHistogram::const_iterator const_iterator;
- const_iterator begin() const { return h.begin(); }
- const_iterator end() const { return h.end(); }
-
- unsigned customers;
- unsigned tables;
- CRPHistogram h;
-};
-
-std::ostream& operator<<(std::ostream& os, const CRPTableManager& tm) {
- os << '[' << tm.num_customers() << " total customers at " << tm.num_tables() << " tables ||| ";
- for (CRPHistogram::const_iterator it = tm.begin(); it != tm.end(); ++it) {
- if (it != tm.h.begin()) os << " -- ";
- os << '(' << it->first << ") x " << it->second;
- }
- return os << ']';
-}
-
-#endif
diff --git a/utils/crp_test.cc b/utils/crp_test.cc
deleted file mode 100644
index 0cdb7afd..00000000
--- a/utils/crp_test.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <string>
-
-#define BOOST_TEST_MODULE CrpTest
-#include <boost/test/unit_test.hpp>
-#include <boost/test/floating_point_comparison.hpp>
-
-#include "ccrp.h"
-#include "sampler.h"
-
-using namespace std;
-
-MT19937 rng;
-
-BOOST_AUTO_TEST_CASE(Dist) {
- CCRP<string> crp(0.1, 5);
- double un = 0.25;
- int tt = 0;
- tt += crp.increment("hi", un, &rng);
- tt += crp.increment("foo", un, &rng);
- tt += crp.increment("bar", un, &rng);
- tt += crp.increment("bar", un, &rng);
- tt += crp.increment("bar", un, &rng);
- tt += crp.increment("bar", un, &rng);
- tt += crp.increment("bar", un, &rng);
- tt += crp.increment("bar", un, &rng);
- tt += crp.increment("bar", un, &rng);
- cout << "tt=" << tt << endl;
- cout << crp << endl;
- cout << " P(bar)=" << crp.prob("bar", un) << endl;
- cout << " P(hi)=" << crp.prob("hi", un) << endl;
- cout << " P(baz)=" << crp.prob("baz", un) << endl;
- cout << " P(foo)=" << crp.prob("foo", un) << endl;
- double x = crp.prob("bar", un) + crp.prob("hi", un) + crp.prob("baz", un) + crp.prob("foo", un);
- cout << " tot=" << x << endl;
- BOOST_CHECK_CLOSE(1.0, x, 1e-6);
- tt += crp.decrement("hi", &rng);
- tt += crp.decrement("bar", &rng);
- cout << crp << endl;
- tt += crp.decrement("bar", &rng);
- cout << crp << endl;
- cout << "tt=" << tt << endl;
-}
-
-BOOST_AUTO_TEST_CASE(Exchangability) {
- double tot = 0;
- double xt = 0;
- CCRP<int> crp(0.5, 1.0);
- int cust = 10;
- vector<int> hist(cust + 1, 0);
- for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); }
- const int samples = 100000;
- const bool simulate = true;
- for (int k = 0; k < samples; ++k) {
- if (!simulate) {
- crp.clear();
- for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); }
- } else {
- int da = rng.next() * cust;
- bool a = rng.next() < 0.5;
- if (a) {
- for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); }
- for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); }
- xt += 1.0;
- } else {
- for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); }
- for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); }
- }
- }
- int c = crp.num_tables(1);
- ++hist[c];
- tot += c;
- }
- BOOST_CHECK_EQUAL(cust, crp.num_customers());
- cerr << "P(a) = " << (xt / samples) << endl;
- cerr << "E[num tables] = " << (tot / samples) << endl;
- double error = fabs((tot / samples) - 5.4);
- cerr << " error = " << error << endl;
- BOOST_CHECK_MESSAGE(error < 0.1, "error is too big = " << error); // it's possible for this to fail, but
- // very, very unlikely
- for (int i = 1; i <= cust; ++i)
- cerr << i << ' ' << (hist[i]) << endl;
-}
-
-BOOST_AUTO_TEST_CASE(LP) {
- CCRP<string> crp(1,1,1,1,0.1,50.0);
- crp.increment("foo", 1.0, &rng);
- cerr << crp.log_crp_prob() << endl;
-}
-
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 9fe00459..590a60c4 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -522,7 +522,7 @@ const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSpar
}
template <class T>
-std::size_t hash_value(FastSparseVector<T> const& x) {
+std::size_t hash_value(FastSparseVector<T> const&) {
assert(!"not implemented");
return 0;
}
diff --git a/utils/gamma_poisson.h b/utils/gamma_poisson.h
deleted file mode 100644
index fec763f6..00000000
--- a/utils/gamma_poisson.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _GAMMA_POISSON_H_
-#define _GAMMA_POISSON_H_
-
-#include <m.h>
-
-// http://en.wikipedia.org/wiki/Conjugate_prior
-struct GammaPoisson {
- GammaPoisson(double shape, double rate) :
- a(shape), b(rate), n(), marginal() {}
-
- double prob(unsigned x) const {
- return exp(Md::log_negative_binom(x, a + marginal, 1.0 - (b + n) / (1 + b + n)));
- }
-
- void increment(unsigned x) {
- ++n;
- marginal += x;
- }
-
- void decrement(unsigned x) {
- --n;
- marginal -= x;
- }
-
- double log_likelihood() const {
- return 0;
- }
-
- double a, b;
- int n, marginal;
-};
-
-#endif
diff --git a/utils/mfcr.h b/utils/mfcr.h
deleted file mode 100644
index 4aacb567..00000000
--- a/utils/mfcr.h
+++ /dev/null
@@ -1,370 +0,0 @@
-#ifndef _MFCR_H_
-#define _MFCR_H_
-
-#include <algorithm>
-#include <numeric>
-#include <cassert>
-#include <cmath>
-#include <list>
-#include <iostream>
-#include <vector>
-#include <iterator>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include "sampler.h"
-#include "slice_sampler.h"
-#include "m.h"
-
-struct TableCount {
- TableCount() : count(), floor() {}
- TableCount(int c, int f) : count(c), floor(f) {
- assert(f >= 0);
- }
- int count; // count or delta (may be 0, <0, or >0)
- unsigned char floor; // from which floor?
-};
-
-std::ostream& operator<<(std::ostream& o, const TableCount& tc) {
- return o << "[c=" << tc.count << " floor=" << static_cast<unsigned int>(tc.floor) << ']';
-}
-
-// Multi-Floor Chinese Restaurant as proposed by Wood & Teh (AISTATS, 2009) to simulate
-// graphical Pitman-Yor processes.
-// http://jmlr.csail.mit.edu/proceedings/papers/v5/wood09a/wood09a.pdf
-//
-// Implementation is based on Blunsom, Cohn, Goldwater, & Johnson (ACL 2009) and code
-// referenced therein.
-// http://www.aclweb.org/anthology/P/P09/P09-2085.pdf
-//
-template <unsigned Floors, typename Dish, typename DishHash = boost::hash<Dish> >
-class MFCR {
- public:
-
- MFCR(double d, double strength) :
- num_tables_(),
- num_customers_(),
- discount_(d),
- strength_(strength),
- discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
- discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
- strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
- strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { check_hyperparameters(); }
-
- MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
- num_tables_(),
- num_customers_(),
- discount_(d),
- strength_(strength),
- discount_prior_strength_(discount_strength),
- discount_prior_beta_(discount_beta),
- strength_prior_shape_(strength_shape),
- strength_prior_rate_(strength_rate) { check_hyperparameters(); }
-
- void check_hyperparameters() {
- if (discount_ < 0.0 || discount_ >= 1.0) {
- std::cerr << "Bad discount: " << discount_ << std::endl;
- abort();
- }
- if (strength_ <= -discount_) {
- std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
- abort();
- }
- }
-
- double discount() const { return discount_; }
- double strength() const { return strength_; }
- void set_hyperparameters(double d, double s) {
- discount_ = d; strength_ = s;
- check_hyperparameters();
- }
- void set_discount(double d) { discount_ = d; check_hyperparameters(); }
- void set_strength(double a) { strength_ = a; check_hyperparameters(); }
-
- bool has_discount_prior() const {
- return !std::isnan(discount_prior_strength_);
- }
-
- bool has_strength_prior() const {
- return !std::isnan(strength_prior_shape_);
- }
-
- void clear() {
- num_tables_ = 0;
- num_customers_ = 0;
- dish_locs_.clear();
- }
-
- unsigned num_tables() const {
- return num_tables_;
- }
-
- unsigned num_tables(const Dish& dish) const {
- const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
- if (it == dish_locs_.end()) return 0;
- return it->second.table_counts_.size();
- }
-
- // this is not terribly efficient but it should not typically be necessary to execute this query
- unsigned num_tables(const Dish& dish, const unsigned floor) const {
- const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
- if (it == dish_locs_.end()) return 0;
- unsigned c = 0;
- for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin();
- i != it->second.table_counts_.end(); ++i) {
- if (i->floor == floor) ++c;
- }
- return c;
- }
-
- unsigned num_customers() const {
- return num_customers_;
- }
-
- unsigned num_customers(const Dish& dish) const {
- const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
- if (it == dish_locs_.end()) return 0;
- return it->total_dish_count_;
- }
-
- // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor
- template <class InputIterator, class InputIterator2>
- TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) {
- DishLocations& loc = dish_locs_[dish];
- // marg_p0 = marginal probability of opening a new table on any floor with label dish
- typedef typename std::iterator_traits<InputIterator>::value_type F;
- const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0));
- assert(marg_p0 <= F(1.0001));
- int floor = -1;
- bool share_table = false;
- if (loc.total_dish_count_) {
- const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0;
- const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
- share_table = rng->SelectSample(p_empty, p_share);
- }
- if (share_table) {
- // this can be done with doubles since P0 (which may be tiny) is not involved
- double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
- for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin();
- ti != loc.table_counts_.end(); ++ti) {
- r -= ti->count - discount_;
- if (r <= 0.0) {
- ++ti->count;
- floor = ti->floor;
- break;
- }
- }
- if (r > 0.0) {
- std::cerr << "Serious error: r=" << r << std::endl;
- Print(&std::cerr);
- assert(r <= 0.0);
- }
- } else { // sit at currently empty table -- must sample what floor
- if (Floors == 1) {
- floor = 0;
- } else {
- F r = F(rng->next()) * marg_p0;
- for (unsigned i = 0; i < Floors; ++i) {
- r -= (*p0s) * (*lambdas);
- ++p0s;
- ++lambdas;
- if (r <= F(0.0)) {
- floor = i;
- break;
- }
- }
- }
- assert(floor >= 0);
- loc.table_counts_.push_back(TableCount(1, floor));
- ++num_tables_;
- }
- ++loc.total_dish_count_;
- ++num_customers_;
- return (share_table ? TableCount(0, floor) : TableCount(1, floor));
- }
-
- // returns first = -1 or 0, indicating whether a table was closed, and on what floor (second)
- TableCount decrement(const Dish& dish, MT19937* rng) {
- DishLocations& loc = dish_locs_[dish];
- assert(loc.total_dish_count_);
- int floor = -1;
- int delta = 0;
- if (loc.total_dish_count_ == 1) {
- floor = loc.table_counts_.front().floor;
- dish_locs_.erase(dish);
- --num_tables_;
- --num_customers_;
- delta = -1;
- } else {
- // sample customer to remove UNIFORMLY. that is, do NOT use the d
- // here. if you do, it will introduce (unwanted) bias!
- double r = rng->next() * loc.total_dish_count_;
- --loc.total_dish_count_;
- --num_customers_;
- for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin();
- ti != loc.table_counts_.end(); ++ti) {
- r -= ti->count;
- if (r <= 0.0) {
- floor = ti->floor;
- if ((--ti->count) == 0) {
- --num_tables_;
- delta = -1;
- loc.table_counts_.erase(ti);
- }
- break;
- }
- }
- if (r > 0.0) {
- std::cerr << "Serious error: r=" << r << std::endl;
- Print(&std::cerr);
- assert(r <= 0.0);
- }
- }
- return TableCount(delta, floor);
- }
-
- template <class InputIterator, class InputIterator2>
- typename std::iterator_traits<InputIterator>::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const {
- typedef typename std::iterator_traits<InputIterator>::value_type F;
- const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0));
- assert(marg_p0 <= F(1.0001));
- const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
- const F r = F(num_tables_ * discount_ + strength_);
- if (it == dish_locs_.end()) {
- return r * marg_p0 / F(num_customers_ + strength_);
- } else {
- return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) /
- F(num_customers_ + strength_);
- }
- }
-
- double log_crp_prob() const {
- return log_crp_prob(discount_, strength_);
- }
-
- // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
- // does not include draws from G_w's
- double log_crp_prob(const double& discount, const double& strength) const {
- double lp = 0.0;
- if (has_discount_prior())
- lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_);
- if (has_strength_prior())
- lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_);
- assert(lp <= 0.0);
- if (num_customers_) {
- if (discount > 0.0) {
- const double r = lgamma(1.0 - discount);
- if (strength)
- lp += lgamma(strength) - lgamma(strength / discount);
- lp += - lgamma(strength + num_customers_)
- + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_);
- assert(std::isfinite(lp));
- for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
- it != dish_locs_.end(); ++it) {
- const DishLocations& cur = it->second;
- for (std::list<TableCount>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) {
- lp += lgamma(ti->count - discount) - r;
- }
- }
- } else if (!discount) { // discount == 0.0
- lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_);
- assert(std::isfinite(lp));
- for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
- it != dish_locs_.end(); ++it) {
- const DishLocations& cur = it->second;
- lp += lgamma(cur.table_counts_.size());
- }
- } else {
- assert(!"discount less than 0 detected!");
- }
- }
- assert(std::isfinite(lp));
- return lp;
- }
-
- void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
- assert(has_discount_prior() || has_strength_prior());
- DiscountResampler dr(*this);
- StrengthResampler sr(*this);
- for (int iter = 0; iter < nloop; ++iter) {
- if (has_strength_prior()) {
- strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
- if (has_discount_prior()) {
- double min_discount = std::numeric_limits<double>::min();
- if (strength_ < 0.0) min_discount -= strength_;
- discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
- 1.0, 0.0, niterations, 100*niterations);
- }
- }
- strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- }
-
- struct DiscountResampler {
- DiscountResampler(const MFCR& crp) : crp_(crp) {}
- const MFCR& crp_;
- double operator()(const double& proposed_d) const {
- return crp_.log_crp_prob(proposed_d, crp_.strength_);
- }
- };
-
- struct StrengthResampler {
- StrengthResampler(const MFCR& crp) : crp_(crp) {}
- const MFCR& crp_;
- double operator()(const double& proposediscount_strength) const {
- return crp_.log_crp_prob(crp_.discount_, proposediscount_strength);
- }
- };
-
- struct DishLocations {
- DishLocations() : total_dish_count_() {}
- unsigned total_dish_count_; // customers at all tables with this dish
- std::list<TableCount> table_counts_; // list<> gives O(1) deletion and insertion, which we want
- // .size() is the number of tables for this dish
- };
-
- void Print(std::ostream* out) const {
- (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl;
- for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
- it != dish_locs_.end(); ++it) {
- (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
- for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin();
- i != it->second.table_counts_.end(); ++i) {
- (*out) << " " << *i;
- }
- (*out) << std::endl;
- }
- }
-
- typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator;
- const_iterator begin() const {
- return dish_locs_.begin();
- }
- const_iterator end() const {
- return dish_locs_.end();
- }
-
- unsigned num_tables_;
- unsigned num_customers_;
- std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
-
- double discount_;
- double strength_;
-
- // optional beta prior on discount_ (NaN if no prior)
- double discount_prior_strength_;
- double discount_prior_beta_;
-
- // optional gamma prior on strength_ (NaN if no prior)
- double strength_prior_shape_;
- double strength_prior_rate_;
-};
-
-template <unsigned N,typename T,typename H>
-std::ostream& operator<<(std::ostream& o, const MFCR<N,T,H>& c) {
- c.Print(&o);
- return o;
-}
-
-#endif
diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc
deleted file mode 100644
index 29a1a2ce..00000000
--- a/utils/mfcr_test.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-#include "mfcr.h"
-
-#include <iostream>
-#include <cassert>
-#include <cmath>
-
-#define BOOST_TEST_MODULE MFCRTest
-#include <boost/test/unit_test.hpp>
-#include <boost/test/floating_point_comparison.hpp>
-
-#include "sampler.h"
-
-using namespace std;
-
-BOOST_AUTO_TEST_CASE(Exchangability) {
- MT19937 r;
- MT19937* rng = &r;
- MFCR<2, int> crp(0.5, 3.0);
- vector<double> lambdas(2);
- vector<double> p0s(2);
- lambdas[0] = 0.2;
- lambdas[1] = 0.8;
- p0s[0] = 1.0;
- p0s[1] = 1.0;
-
- double tot = 0;
- double tot2 = 0;
- double xt = 0;
- int cust = 10;
- vector<int> hist(cust + 1, 0), hist2(cust + 1, 0);
- for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
- const int samples = 100000;
- const bool simulate = true;
- for (int k = 0; k < samples; ++k) {
- if (!simulate) {
- crp.clear();
- for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
- } else {
- int da = rng->next() * cust;
- bool a = rng->next() < 0.45;
- if (a) {
- for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
- for (int i = 0; i < da; ++i) { crp.decrement(1, rng); }
- xt += 1.0;
- } else {
- for (int i = 0; i < da; ++i) { crp.decrement(1, rng); }
- for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
- }
- }
- int c = crp.num_tables(1);
- ++hist[c];
- tot += c;
- int c2 = crp.num_tables(1,0); // tables on floor 0 with dish 1
- ++hist2[c2];
- tot2 += c2;
- }
- cerr << cust << " = " << crp.num_customers() << endl;
- cerr << "P(a) = " << (xt / samples) << endl;
- cerr << "E[num tables] = " << (tot / samples) << endl;
- double error = fabs((tot / samples) - 6.894);
- cerr << " error = " << error << endl;
- for (int i = 1; i <= cust; ++i)
- cerr << i << ' ' << (hist[i]) << endl;
- cerr << "E[num tables on floor 0] = " << (tot2 / samples) << endl;
- double error2 = fabs((tot2 / samples) - 1.379);
- cerr << " error2 = " << error2 << endl;
- for (int i = 1; i <= cust; ++i)
- cerr << i << ' ' << (hist2[i]) << endl;
- assert(error < 0.05); // these can fail with very low probability
- assert(error2 < 0.05);
-};
-
diff --git a/utils/sampler.h b/utils/sampler.h
index 3e4a4086..88e1856c 100644
--- a/utils/sampler.h
+++ b/utils/sampler.h
@@ -19,7 +19,7 @@
#include "prob.h"
-template <typename F> struct SampleSet;
+template <typename F> class SampleSet;
template <typename RNG>
struct RandomNumberGenerator {
diff --git a/utils/slice_sampler.h b/utils/slice_sampler.h
deleted file mode 100644
index aa48a169..00000000
--- a/utils/slice_sampler.h
+++ /dev/null
@@ -1,191 +0,0 @@
-//! slice-sampler.h is an MCMC slice sampler
-//!
-//! Mark Johnson, 1st August 2008
-
-#ifndef SLICE_SAMPLER_H
-#define SLICE_SAMPLER_H
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <iostream>
-#include <limits>
-
-//! slice_sampler_rfc_type{} returns the value of a user-specified
-//! function if the argument is within range, or - infinity otherwise
-//
-template <typename F, typename Fn, typename U>
-struct slice_sampler_rfc_type {
- F min_x, max_x;
- const Fn& f;
- U max_nfeval, nfeval;
- slice_sampler_rfc_type(F min_x, F max_x, const Fn& f, U max_nfeval)
- : min_x(min_x), max_x(max_x), f(f), max_nfeval(max_nfeval), nfeval(0) { }
-
- F operator() (F x) {
- if (min_x < x && x < max_x) {
- assert(++nfeval <= max_nfeval);
- F fx = f(x);
- assert(std::isfinite(fx));
- return fx;
- }
- return -std::numeric_limits<F>::infinity();
- }
-}; // slice_sampler_rfc_type{}
-
-//! slice_sampler1d() implements the univariate "range doubling" slice sampler
-//! described in Neal (2003) "Slice Sampling", The Annals of Statistics 31(3), 705-767.
-//
-template <typename F, typename LogF, typename Uniform01>
-F slice_sampler1d(const LogF& logF0, //!< log of function to sample
- F x, //!< starting point
- Uniform01& u01, //!< uniform [0,1) random number generator
- F min_x = -std::numeric_limits<F>::infinity(), //!< minimum value of support
- F max_x = std::numeric_limits<F>::infinity(), //!< maximum value of support
- F w = 0.0, //!< guess at initial width
- unsigned nsamples=1, //!< number of samples to draw
- unsigned max_nfeval=200) //!< max number of function evaluations
-{
- typedef unsigned U;
- slice_sampler_rfc_type<F,LogF,U> logF(min_x, max_x, logF0, max_nfeval);
-
- assert(std::isfinite(x));
-
- if (w <= 0.0) { // set w to a default width
- if (min_x > -std::numeric_limits<F>::infinity() && max_x < std::numeric_limits<F>::infinity())
- w = (max_x - min_x)/4;
- else
- w = std::max(((x < 0.0) ? -x : x)/4, (F) 0.1);
- }
- assert(std::isfinite(w));
-
- F logFx = logF(x);
- for (U sample = 0; sample < nsamples; ++sample) {
- F logY = logFx + log(u01()+1e-100); //! slice logFx at this value
- assert(std::isfinite(logY));
-
- F xl = x - w*u01(); //! lower bound on slice interval
- F logFxl = logF(xl);
- F xr = xl + w; //! upper bound on slice interval
- F logFxr = logF(xr);
-
- while (logY < logFxl || logY < logFxr) // doubling procedure
- if (u01() < 0.5)
- logFxl = logF(xl -= xr - xl);
- else
- logFxr = logF(xr += xr - xl);
-
- F xl1 = xl;
- F xr1 = xr;
- while (true) { // shrinking procedure
- F x1 = xl1 + u01()*(xr1 - xl1);
- if (logY < logF(x1)) {
- F xl2 = xl; // acceptance procedure
- F xr2 = xr;
- bool d = false;
- while (xr2 - xl2 > 1.1*w) {
- F xm = (xl2 + xr2)/2;
- if ((x < xm && x1 >= xm) || (x >= xm && x1 < xm))
- d = true;
- if (x1 < xm)
- xr2 = xm;
- else
- xl2 = xm;
- if (d && logY >= logF(xl2) && logY >= logF(xr2))
- goto unacceptable;
- }
- x = x1;
- goto acceptable;
- }
- goto acceptable;
- unacceptable:
- if (x1 < x) // rest of shrinking procedure
- xl1 = x1;
- else
- xr1 = x1;
- }
- acceptable:
- w = (4*w + (xr1 - xl1))/5; // update width estimate
- }
- return x;
-}
-
-/*
-//! slice_sampler1d() implements a 1-d MCMC slice sampler.
-//! It should be correct for unimodal distributions, but
-//! not for multimodal ones.
-//
-template <typename F, typename LogP, typename Uniform01>
-F slice_sampler1d(const LogP& logP, //!< log of distribution to sample
- F x, //!< initial sample
- Uniform01& u01, //!< uniform random number generator
- F min_x = -std::numeric_limits<F>::infinity(), //!< minimum value of support
- F max_x = std::numeric_limits<F>::infinity(), //!< maximum value of support
- F w = 0.0, //!< guess at initial width
- unsigned nsamples=1, //!< number of samples to draw
- unsigned max_nfeval=200) //!< max number of function evaluations
-{
- typedef unsigned U;
- assert(std::isfinite(x));
- if (w <= 0.0) {
- if (min_x > -std::numeric_limits<F>::infinity() && max_x < std::numeric_limits<F>::infinity())
- w = (max_x - min_x)/4;
- else
- w = std::max(((x < 0.0) ? -x : x)/4, 0.1);
- }
- // TRACE4(x, min_x, max_x, w);
- F logPx = logP(x);
- assert(std::isfinite(logPx));
- U nfeval = 1;
- for (U sample = 0; sample < nsamples; ++sample) {
- F x0 = x;
- F logU = logPx + log(u01()+1e-100);
- assert(std::isfinite(logU));
- F r = u01();
- F xl = std::max(min_x, x - r*w);
- F xr = std::min(max_x, x + (1-r)*w);
- // TRACE3(x, logPx, logU);
- while (xl > min_x && logP(xl) > logU) {
- xl -= w;
- w *= 2;
- ++nfeval;
- if (nfeval >= max_nfeval)
- std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << std::endl;
- assert(nfeval < max_nfeval);
- }
- xl = std::max(xl, min_x);
- while (xr < max_x && logP(xr) > logU) {
- xr += w;
- w *= 2;
- ++nfeval;
- if (nfeval >= max_nfeval)
- std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xr = " << xr << std::endl;
- assert(nfeval < max_nfeval);
- }
- xr = std::min(xr, max_x);
- while (true) {
- r = u01();
- x = r*xl + (1-r)*xr;
- assert(std::isfinite(x));
- logPx = logP(x);
- // TRACE4(logPx, x, xl, xr);
- assert(std::isfinite(logPx));
- ++nfeval;
- if (nfeval >= max_nfeval)
- std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << ", xr = " << xr << ", x = " << x << std::endl;
- assert(nfeval < max_nfeval);
- if (logPx > logU)
- break;
- else if (x > x0)
- xr = x;
- else
- xl = x;
- }
- // w = (4*w + (xr-xl))/5; // gradually adjust w
- }
- // TRACE2(logPx, x);
- return x;
-} // slice_sampler1d()
-*/
-
-#endif // SLICE_SAMPLER_H
diff --git a/utils/small_vector.h b/utils/small_vector.h
index 894b1b32..c8a69927 100644
--- a/utils/small_vector.h
+++ b/utils/small_vector.h
@@ -66,7 +66,7 @@ class SmallVector {
//TODO: figure out iterator traits to allow this to be selcted for any iterator range
template <class I>
SmallVector(I const* begin,I const* end) {
- int s=end-begin;
+ unsigned s=end-begin;
Alloc(s);
if (s <= SV_MAX) {
for (unsigned i = 0; i < s; ++i,++begin) data_.vals[i] = *begin;
diff --git a/utils/stringlib.h b/utils/stringlib.h
index 75772c4d..ff5dc89d 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -86,7 +86,7 @@ bool match_begin(Str const& str,Prefix const& prefix)
// source will be returned as a string, target must be a sentence or
// a lattice (in PLF format) and will be returned as a Lattice object
void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
-struct Lattice;
+class Lattice;
void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);
inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc
deleted file mode 100644
index 30b9fde1..00000000
--- a/utils/unigram_pyp_lm.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "corpus_tools.h"
-#include "m.h"
-#include "tdict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "gamma_poisson.h"
-
-// A not very memory-efficient implementation of an 1-gram LM based on PYPs
-// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
-// based on Pitman-Yor Processes. In Proc. ACL.
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-boost::shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("samples,n",po::value<unsigned>()->default_value(50),"Number of samples")
- ("train,i",po::value<string>(),"Training data file")
- ("test,T",po::value<string>(),"Test data file")
- ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
- ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this")
- ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this")
- ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this")
- ("random_seed,S",po::value<uint32_t>(), "Random seed");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("train") == 0)) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct Histogram {
- void increment(unsigned bin, unsigned delta = 1u) {
- data[bin] += delta;
- }
- void decrement(unsigned bin, unsigned delta = 1u) {
- data[bin] -= delta;
- }
- void move(unsigned from_bin, unsigned to_bin, unsigned delta = 1u) {
- decrement(from_bin, delta);
- increment(to_bin, delta);
- }
- map<unsigned, unsigned> data;
- // SparseVector<unsigned> data;
-};
-
-// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS.
-// Journal of Statistical Planning and Inference 14 (1986) 311-322
-struct PoissonLengthUniformCharWordModel {
- explicit PoissonLengthUniformCharWordModel(unsigned vocab_size) : plen(5,5), uc(-log(50)), llh() {}
- void increment(WordID w, MT19937*) {
- llh += log(prob(w)); // this isn't quite right
- plen.increment(TD::Convert(w).size() - 1);
- }
- void decrement(WordID w, MT19937*) {
- plen.decrement(TD::Convert(w).size() - 1);
- llh -= log(prob(w)); // this isn't quite right
- }
- double prob(WordID w) const {
- size_t len = TD::Convert(w).size();
- return plen.prob(len - 1) * exp(uc * len);
- }
- double log_likelihood() const { return llh; }
- void resample_hyperparameters(MT19937*) {}
- GammaPoisson plen;
- const double uc;
- double llh;
-};
-
-// uniform base distribution (0-gram model)
-struct UniformWordModel {
- explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {}
- void increment(WordID, MT19937*) { ++draws; }
- void decrement(WordID, MT19937*) { --draws; assert(draws >= 0); }
- double prob(WordID) const { return p0; } // all words have equal prob
- double log_likelihood() const { return draws * log(p0); }
- void resample_hyperparameters(MT19937*) {}
- const double p0;
- int draws;
-};
-
-// represents an Unigram LM
-template <class BaseGenerator>
-struct UnigramLM {
- UnigramLM(unsigned vs, double da, double db, double ss, double sr) :
- base(vs),
- crp(da, db, ss, sr, 0.8, 1.0) {}
- void increment(WordID w, MT19937* rng) {
- const double backoff = base.prob(w);
- if (crp.increment(w, backoff, rng))
- base.increment(w, rng);
- }
- void decrement(WordID w, MT19937* rng) {
- if (crp.decrement(w, rng))
- base.decrement(w, rng);
- }
- double prob(WordID w) const {
- const double backoff = base.prob(w);
- return crp.prob(w, backoff);
- }
-
- double log_likelihood() const {
- double llh = base.log_likelihood();
- llh += crp.log_crp_prob();
- return llh;
- }
-
- void resample_hyperparameters(MT19937* rng) {
- crp.resample_hyperparameters(rng);
- base.resample_hyperparameters(rng);
- }
-
- double discount_a, discount_b, strength_s, strength_r;
- double d, strength;
- BaseGenerator base;
- CCRP<WordID> crp;
-};
-
-int main(int argc, char** argv) {
- po::variables_map conf;
-
- InitCommandLine(argc, argv, &conf);
- const unsigned samples = conf["samples"].as<unsigned>();
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
- else
- prng.reset(new MT19937);
- MT19937& rng = *prng;
- vector<vector<WordID> > corpuse;
- set<WordID> vocabe;
- const WordID kEOS = TD::Convert("</s>");
- cerr << "Reading corpus...\n";
- CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe);
- cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
- vector<vector<WordID> > test;
- if (conf.count("test"))
- CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
- else
- test = corpuse;
-#if 1
- UnigramLM<PoissonLengthUniformCharWordModel> lm(vocabe.size(),
-#else
- UnigramLM<UniformWordModel> lm(vocabe.size(),
-#endif
- conf["discount_prior_a"].as<double>(),
- conf["discount_prior_b"].as<double>(),
- conf["strength_prior_s"].as<double>(),
- conf["strength_prior_r"].as<double>());
- for (unsigned SS=0; SS < samples; ++SS) {
- for (unsigned ci = 0; ci < corpuse.size(); ++ci) {
- const vector<WordID>& s = corpuse[ci];
- for (unsigned i = 0; i <= s.size(); ++i) {
- WordID w = (i < s.size() ? s[i] : kEOS);
- if (SS > 0) lm.decrement(w, &rng);
- lm.increment(w, &rng);
- }
- if (SS > 0) lm.decrement(kEOS, &rng);
- lm.increment(kEOS, &rng);
- }
- cerr << "LLH=" << lm.log_likelihood() << "\t tables=" << lm.crp.num_tables() << " " << endl;
- if (SS % 10 == 9) lm.resample_hyperparameters(&rng);
- }
- double llh = 0;
- unsigned cnt = 0;
- unsigned oovs = 0;
- for (unsigned ci = 0; ci < test.size(); ++ci) {
- const vector<WordID>& s = test[ci];
- for (unsigned i = 0; i <= s.size(); ++i) {
- WordID w = (i < s.size() ? s[i] : kEOS);
- double lp = log(lm.prob(w)) / log(2);
- if (i < s.size() && vocabe.count(w) == 0) {
- cerr << "**OOV ";
- ++oovs;
- //lp = 0;
- }
- cerr << "p(" << TD::Convert(w) << ") = " << lp << endl;
- llh -= lp;
- cnt++;
- }
- }
- cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl;
- cerr << " Count: " << cnt << endl;
- cerr << " OOVs: " << oovs << endl;
- cerr << "Cross-entropy: " << (llh / cnt) << endl;
- cerr << " Perplexity: " << pow(2, llh / cnt) << endl;
- return 0;
-}
-