diff options
author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-05-31 13:57:24 +0200 |
---|---|---|
committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-05-31 13:57:24 +0200 |
commit | 6f6601111710aa67eee5169e5b7d89102cc33bb8 (patch) | |
tree | 0872544abd6bc76162f3f80eb3920999afbf2c34 /phrasinator | |
parent | 8cee8b565a9c56a7732365e9563f52ff3c4ff7fd (diff) | |
parent | 090a64e73f94a6a35e5364a9d416dcf75c0a2938 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'phrasinator')
-rw-r--r-- | phrasinator/Makefile.am | 2 | ||||
-rw-r--r-- | phrasinator/ccrp_nt.h | 170 | ||||
-rw-r--r-- | phrasinator/gibbs_train_plm.cc | 18 | ||||
-rw-r--r-- | phrasinator/gibbs_train_plm.notables.cc | 24 |
4 files changed, 21 insertions, 193 deletions
diff --git a/phrasinator/Makefile.am b/phrasinator/Makefile.am index aba98601..3ddd1934 100644 --- a/phrasinator/Makefile.am +++ b/phrasinator/Makefile.am @@ -11,4 +11,4 @@ gibbs_train_plm_LDADD = $(top_srcdir)/utils/libutils.a -lz #head_bigram_model_SOURCES = head_bigram_model.cc #head_bigram_model_LDADD = $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -funroll-loops -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval +AM_CPPFLAGS = -funroll-loops -ffast-math -W -Wall -I$(top_srcdir)/utils diff --git a/phrasinator/ccrp_nt.h b/phrasinator/ccrp_nt.h deleted file mode 100644 index 811bce73..00000000 --- a/phrasinator/ccrp_nt.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _CCRP_NT_H_ -#define _CCRP_NT_H_ - -#include <numeric> -#include <cassert> -#include <cmath> -#include <list> -#include <iostream> -#include <vector> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include "sampler.h" -#include "slice_sampler.h" - -// Chinese restaurant process (Pitman-Yor parameters) with table tracking. - -template <typename Dish, typename DishHash = boost::hash<Dish> > -class CCRP_NoTable { - public: - explicit CCRP_NoTable(double conc) : - num_customers_(), - concentration_(conc), - concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} - - CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : - num_customers_(), - concentration_(c), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} - - double concentration() const { return concentration_; } - - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); - } - - void clear() { - num_customers_ = 0; - custs_.clear(); - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.find(dish); - if (it == custs_.end()) return 0; - return it->second; - } - - int increment(const Dish& dish) { - int table_diff = 0; - if (++custs_[dish] == 1) - table_diff = 1; - ++num_customers_; - return table_diff; - } - - int decrement(const Dish& dish) { - int table_diff = 0; - int nc = --custs_[dish]; - if (nc == 0) { - custs_.erase(dish); - table_diff = -1; - } else if (nc < 0) { - std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; - abort(); - } - --num_customers_; - return table_diff; - } - - double prob(const Dish& dish, const double& p0) const { - const unsigned at_table = num_customers(dish); - return (at_table + p0 * concentration_) / (num_customers_ + concentration_); - } - - double logprob(const Dish& dish, const double& logp0) const { - const unsigned at_table = num_customers(dish); - return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); - } - - double log_crp_prob() const { - return log_crp_prob(concentration_); - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& concentration) const { - double lp = 0.0; - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - lp += lgamma(concentration) - lgamma(concentration + num_customers_) + - custs_.size() * log(concentration); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); - it != custs_.end(); ++it) { - lp += lgamma(it->second); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_concentration_prior()); - ConcentrationResampler cr(*this); - for (int iter = 0; iter < nloop; ++iter) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); - } - } - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} - const CCRP_NoTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(proposed_concentration); - } - }; - - void Print(std::ostream* out) const { - (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl; - int cc = 0; - for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); - it != custs_.end(); ++it) { - (*out) << " " << it->first << "(" << it->second << " eating)"; - ++cc; - if (cc > 10) { (*out) << " ..."; break; } - } - (*out) << std::endl; - } - - unsigned num_customers_; - std::tr1::unordered_map<Dish, unsigned, DishHash> custs_; - - typedef typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator const_iterator; - const_iterator begin() const { - return custs_.begin(); - } - const_iterator end() const { - return custs_.end(); - } - - double concentration_; - - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; -}; - -template <typename T,typename H> -std::ostream& operator<<(std::ostream& o, const CCRP_NoTable<T,H>& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 86fd7865..7847a460 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -18,7 +18,7 @@ Dict d; // global dictionary string Join(char joiner, const vector<int>& phrase) { ostringstream os; - for (int i = 0; i < phrase.size(); ++i) { + for (unsigned i = 0; i < phrase.size(); ++i) { if (i > 0) os << joiner; os << d.Convert(phrase[i]); } @@ -26,7 +26,7 @@ string Join(char joiner, const vector<int>& phrase) { } ostream& operator<<(ostream& os, const vector<int>& phrase) { - for (int i = 0; i < phrase.size(); ++i) + for (unsigned i = 0; i < phrase.size(); ++i) os << (i == 0 ? "" : " ") << d.Convert(phrase[i]); return os; } @@ -37,7 +37,7 @@ struct UnigramLM { assert(in); } - double logprob(int word) const { + double logprob(unsigned word) const { assert(word < freqs_.size()); return freqs_[word]; } @@ -91,7 +91,7 @@ void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab c->push_back(vector<int>()); vector<int>& v = c->back(); d.ConvertWhitespaceDelimitedLine(line, &v); - for (int i = 0; i < v.size(); ++i) vocab->insert(v[i]); + for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]); } if (in != &cin) delete in; } @@ -151,7 +151,7 @@ struct UniphraseLM { cerr << "Initializing...\n"; z_.resize(corpus_.size()); int tc = 0; - for (int i = 0; i < corpus_.size(); ++i) { + for (unsigned i = 0; i < corpus_.size(); ++i) { const vector<int>& line = corpus_[i]; const int ls = line.size(); const int last_pos = ls - 1; @@ -177,7 +177,7 @@ struct UniphraseLM { cerr << "Initial LLH: " << llh() << endl; cerr << "Sampling...\n"; cerr << gen_ << endl; - for (int s = 1; s < samples; ++s) { + for (unsigned s = 1; s < samples; ++s) { cerr << '.'; if (s % 10 == 0) { cerr << " [" << s; @@ -187,7 +187,7 @@ struct UniphraseLM { //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j]; //SegCorpus::Write(corpus_[0], z, d); } - for (int i = 0; i < corpus_.size(); ++i) { + for (unsigned i = 0; i < corpus_.size(); ++i) { const vector<int>& line = corpus_[i]; const int ls = line.size(); const int last_pos = ls - 1; @@ -286,7 +286,7 @@ int main(int argc, char** argv) { ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng); cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl; - for (int i = 0; i < corpus.size(); ++i) + for (unsigned i = 0; i < corpus.size(); ++i) // SegCorpus::Write(corpus[i], shmmlm.z_[i], d); ; if (conf.count("write_cdec_grammar")) { @@ -304,8 +304,6 @@ int main(int argc, char** argv) { os << "# make C smaller to use more phrases\nP 1\nPassThrough " << ulm.OOVUnigramLogProb() << "\nC -3\n"; } - - return 0; } diff --git a/phrasinator/gibbs_train_plm.notables.cc b/phrasinator/gibbs_train_plm.notables.cc index 9dca9e8d..4526eaa6 100644 --- a/phrasinator/gibbs_train_plm.notables.cc +++ b/phrasinator/gibbs_train_plm.notables.cc @@ -18,7 +18,7 @@ Dict d; // global dictionary string Join(char joiner, const vector<int>& phrase) { ostringstream os; - for (int i = 0; i < phrase.size(); ++i) { + for (unsigned i = 0; i < phrase.size(); ++i) { if (i > 0) os << joiner; os << d.Convert(phrase[i]); } @@ -29,13 +29,13 @@ template <typename BType> void WriteSeg(const vector<int>& line, const vector<BType>& label, const Dict& d) { assert(line.size() == label.size()); assert(label.back()); - int prev = 0; - int cur = 0; + unsigned prev = 0; + unsigned cur = 0; while (cur < line.size()) { if (label[cur]) { if (prev) cout << ' '; cout << "{{"; - for (int i = prev; i <= cur; ++i) + for (unsigned i = prev; i <= cur; ++i) cout << (i == prev ? "" : " ") << d.Convert(line[i]); cout << "}}:" << label[cur]; prev = cur + 1; @@ -46,7 +46,7 @@ void WriteSeg(const vector<int>& line, const vector<BType>& label, const Dict& d } ostream& operator<<(ostream& os, const vector<int>& phrase) { - for (int i = 0; i < phrase.size(); ++i) + for (unsigned i = 0; i < phrase.size(); ++i) os << (i == 0 ? "" : " ") << d.Convert(phrase[i]); return os; } @@ -57,7 +57,7 @@ struct UnigramLM { assert(in); } - double logprob(int word) const { + double logprob(unsigned word) const { assert(word < freqs_.size()); return freqs_[word]; } @@ -111,7 +111,7 @@ void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab c->push_back(vector<int>()); vector<int>& v = c->back(); d.ConvertWhitespaceDelimitedLine(line, &v); - for (int i = 0; i < v.size(); ++i) vocab->insert(v[i]); + for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]); } if (in != &cin) delete in; } @@ -175,7 +175,7 @@ struct UniphraseLM { cerr << "Initializing...\n"; z_.resize(corpus_.size()); int tc = 0; - for (int i = 0; i < corpus_.size(); ++i) { + for (unsigned i = 0; i < corpus_.size(); ++i) { const vector<int>& line = corpus_[i]; const int ls = line.size(); const int last_pos = ls - 1; @@ -201,7 +201,7 @@ struct UniphraseLM { cerr << "Initial LLH: " << llh() << endl; cerr << "Sampling...\n"; cerr << gen_ << endl; - for (int s = 1; s < samples; ++s) { + for (unsigned s = 1; s < samples; ++s) { cerr << '.'; if (s % 10 == 0) { cerr << " [" << s; @@ -211,7 +211,7 @@ struct UniphraseLM { //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j]; //SegCorpus::Write(corpus_[0], z, d); } - for (int i = 0; i < corpus_.size(); ++i) { + for (unsigned i = 0; i < corpus_.size(); ++i) { const vector<int>& line = corpus_[i]; const int ls = line.size(); const int last_pos = ls - 1; @@ -276,7 +276,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable<vector<int> > phrases_; @@ -310,7 +310,7 @@ int main(int argc, char** argv) { ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng); cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl; - for (int i = 0; i < corpus.size(); ++i) + for (unsigned i = 0; i < corpus.size(); ++i) WriteSeg(corpus[i], ulm.z_[i], d); if (conf.count("write_cdec_grammar")) { |