diff options
Diffstat (limited to 'phrasinator')
| -rw-r--r-- | phrasinator/Makefile.am | 2 | ||||
| -rw-r--r-- | phrasinator/ccrp_nt.h | 170 | ||||
| -rw-r--r-- | phrasinator/gibbs_train_plm.cc | 18 | ||||
| -rw-r--r-- | phrasinator/gibbs_train_plm.notables.cc | 24 | 
4 files changed, 21 insertions, 193 deletions
diff --git a/phrasinator/Makefile.am b/phrasinator/Makefile.am index aba98601..3ddd1934 100644 --- a/phrasinator/Makefile.am +++ b/phrasinator/Makefile.am @@ -11,4 +11,4 @@ gibbs_train_plm_LDADD = $(top_srcdir)/utils/libutils.a -lz  #head_bigram_model_SOURCES = head_bigram_model.cc  #head_bigram_model_LDADD = $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -funroll-loops -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval +AM_CPPFLAGS = -funroll-loops -ffast-math -W -Wall -I$(top_srcdir)/utils diff --git a/phrasinator/ccrp_nt.h b/phrasinator/ccrp_nt.h deleted file mode 100644 index 811bce73..00000000 --- a/phrasinator/ccrp_nt.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _CCRP_NT_H_ -#define _CCRP_NT_H_ - -#include <numeric> -#include <cassert> -#include <cmath> -#include <list> -#include <iostream> -#include <vector> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include "sampler.h" -#include "slice_sampler.h" - -// Chinese restaurant process (Pitman-Yor parameters) with table tracking. - -template <typename Dish, typename DishHash = boost::hash<Dish> > -class CCRP_NoTable { - public: -  explicit CCRP_NoTable(double conc) : -    num_customers_(), -    concentration_(conc), -    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()), -    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} - -  CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : -    num_customers_(), -    concentration_(c), -    concentration_prior_shape_(c_shape), -    concentration_prior_rate_(c_rate) {} - -  double concentration() const { return concentration_; } - -  bool has_concentration_prior() const { -    return !std::isnan(concentration_prior_shape_); -  } - -  void clear() { -    num_customers_ = 0; -    custs_.clear(); -  } - -  unsigned num_customers() const { -    return num_customers_; -  } - -  unsigned num_customers(const Dish& dish) const { -    const typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.find(dish); -    if (it == custs_.end()) return 0; -    return it->second; -  } - -  int increment(const Dish& dish) { -    int table_diff = 0; -    if (++custs_[dish] == 1) -      table_diff = 1; -    ++num_customers_; -    return table_diff; -  } - -  int decrement(const Dish& dish) { -    int table_diff = 0; -    int nc = --custs_[dish]; -    if (nc == 0) { -      custs_.erase(dish); -      table_diff = -1; -    } else if (nc < 0) { -      std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; -      abort(); -    } -    --num_customers_; -    return table_diff; -  } - -  double prob(const Dish& dish, const double& p0) const { -    const unsigned at_table = num_customers(dish); -    return (at_table + p0 * concentration_) / (num_customers_ + concentration_); -  } - -  double logprob(const Dish& dish, const double& logp0) const { -    const unsigned at_table = num_customers(dish); -    return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); -  } - -  double log_crp_prob() const { -    return log_crp_prob(concentration_); -  } - -  static double log_gamma_density(const double& x, const double& shape, const double& rate) { -    assert(x >= 0.0); -    assert(shape > 0.0); -    assert(rate > 0.0); -    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); -    return lp; -  } - -  // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process -  // does not include P_0's -  double log_crp_prob(const double& concentration) const { -    double lp = 0.0; -    if (has_concentration_prior()) -      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); -    assert(lp <= 0.0); -    if (num_customers_) { -      lp += lgamma(concentration) - lgamma(concentration + num_customers_) + -        custs_.size() * log(concentration); -      assert(std::isfinite(lp)); -      for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); -             it != custs_.end(); ++it) { -          lp += lgamma(it->second); -      } -    } -    assert(std::isfinite(lp)); -    return lp; -  } - -  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { -    assert(has_concentration_prior()); -    ConcentrationResampler cr(*this); -    for (int iter = 0; iter < nloop; ++iter) { -        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, -                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -    } -  } - -  struct ConcentrationResampler { -    ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} -    const CCRP_NoTable& crp_; -    double operator()(const double& proposed_concentration) const { -      return crp_.log_crp_prob(proposed_concentration); -    } -  }; - -  void Print(std::ostream* out) const { -    (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl; -    int cc = 0; -    for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); -         it != custs_.end(); ++it) { -      (*out) << " " << it->first << "(" << it->second << " eating)"; -      ++cc; -      if (cc > 10) { (*out) << " ..."; break; } -    } -    (*out) << std::endl; -  } - -  unsigned num_customers_; -  std::tr1::unordered_map<Dish, unsigned, DishHash> custs_; - -  typedef typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator const_iterator; -  const_iterator begin() const { -    return custs_.begin(); -  } -  const_iterator end() const { -    return custs_.end(); -  } - -  double concentration_; - -  // optional gamma prior on concentration_ (NaN if no prior) -  double concentration_prior_shape_; -  double concentration_prior_rate_; -}; - -template <typename T,typename H> -std::ostream& operator<<(std::ostream& o, const CCRP_NoTable<T,H>& c) { -  c.Print(&o); -  return o; -} - -#endif diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 86fd7865..7847a460 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -18,7 +18,7 @@ Dict d; // global dictionary  string Join(char joiner, const vector<int>& phrase) {    ostringstream os; -  for (int i = 0; i < phrase.size(); ++i) { +  for (unsigned i = 0; i < phrase.size(); ++i) {      if (i > 0) os << joiner;      os << d.Convert(phrase[i]);    } @@ -26,7 +26,7 @@ string Join(char joiner, const vector<int>& phrase) {  }  ostream& operator<<(ostream& os, const vector<int>& phrase) { -  for (int i = 0; i < phrase.size(); ++i) +  for (unsigned i = 0; i < phrase.size(); ++i)      os << (i == 0 ? "" : " ") << d.Convert(phrase[i]);    return os;  } @@ -37,7 +37,7 @@ struct UnigramLM {      assert(in);    } -  double logprob(int word) const { +  double logprob(unsigned word) const {      assert(word < freqs_.size());      return freqs_[word];    } @@ -91,7 +91,7 @@ void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab      c->push_back(vector<int>());      vector<int>& v = c->back();      d.ConvertWhitespaceDelimitedLine(line, &v); -    for (int i = 0; i < v.size(); ++i) vocab->insert(v[i]); +    for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]);    }    if (in != &cin) delete in;  } @@ -151,7 +151,7 @@ struct UniphraseLM {      cerr << "Initializing...\n";      z_.resize(corpus_.size());      int tc = 0; -    for (int i = 0; i < corpus_.size(); ++i) { +    for (unsigned i = 0; i < corpus_.size(); ++i) {        const vector<int>& line = corpus_[i];        const int ls = line.size();        const int last_pos = ls - 1; @@ -177,7 +177,7 @@ struct UniphraseLM {      cerr << "Initial LLH: " << llh() << endl;      cerr << "Sampling...\n";      cerr << gen_ << endl; -    for (int s = 1; s < samples; ++s) { +    for (unsigned s = 1; s < samples; ++s) {        cerr << '.';        if (s % 10 == 0) {          cerr << " [" << s; @@ -187,7 +187,7 @@ struct UniphraseLM {          //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j];          //SegCorpus::Write(corpus_[0], z, d);        } -      for (int i = 0; i < corpus_.size(); ++i) { +      for (unsigned i = 0; i < corpus_.size(); ++i) {          const vector<int>& line = corpus_[i];          const int ls = line.size();          const int last_pos = ls - 1; @@ -286,7 +286,7 @@ int main(int argc, char** argv) {    ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng);    cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl; -  for (int i = 0; i < corpus.size(); ++i) +  for (unsigned i = 0; i < corpus.size(); ++i)  //    SegCorpus::Write(corpus[i], shmmlm.z_[i], d);   ;    if (conf.count("write_cdec_grammar")) { @@ -304,8 +304,6 @@ int main(int argc, char** argv) {      os << "# make C smaller to use more phrases\nP 1\nPassThrough " << ulm.OOVUnigramLogProb() << "\nC -3\n";    } -   -    return 0;  } diff --git a/phrasinator/gibbs_train_plm.notables.cc b/phrasinator/gibbs_train_plm.notables.cc index 9dca9e8d..4526eaa6 100644 --- a/phrasinator/gibbs_train_plm.notables.cc +++ b/phrasinator/gibbs_train_plm.notables.cc @@ -18,7 +18,7 @@ Dict d; // global dictionary  string Join(char joiner, const vector<int>& phrase) {    ostringstream os; -  for (int i = 0; i < phrase.size(); ++i) { +  for (unsigned i = 0; i < phrase.size(); ++i) {      if (i > 0) os << joiner;      os << d.Convert(phrase[i]);    } @@ -29,13 +29,13 @@ template <typename BType>  void WriteSeg(const vector<int>& line, const vector<BType>& label, const Dict& d) {    assert(line.size() == label.size());    assert(label.back()); -  int prev = 0; -  int cur = 0; +  unsigned prev = 0; +  unsigned cur = 0;    while (cur < line.size()) {      if (label[cur]) {        if (prev) cout << ' ';        cout << "{{"; -      for (int i = prev; i <= cur; ++i) +      for (unsigned i = prev; i <= cur; ++i)          cout << (i == prev ? "" : " ") << d.Convert(line[i]);        cout << "}}:" << label[cur];        prev = cur + 1; @@ -46,7 +46,7 @@ void WriteSeg(const vector<int>& line, const vector<BType>& label, const Dict& d  }  ostream& operator<<(ostream& os, const vector<int>& phrase) { -  for (int i = 0; i < phrase.size(); ++i) +  for (unsigned i = 0; i < phrase.size(); ++i)      os << (i == 0 ? "" : " ") << d.Convert(phrase[i]);    return os;  } @@ -57,7 +57,7 @@ struct UnigramLM {      assert(in);    } -  double logprob(int word) const { +  double logprob(unsigned word) const {      assert(word < freqs_.size());      return freqs_[word];    } @@ -111,7 +111,7 @@ void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab      c->push_back(vector<int>());      vector<int>& v = c->back();      d.ConvertWhitespaceDelimitedLine(line, &v); -    for (int i = 0; i < v.size(); ++i) vocab->insert(v[i]); +    for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]);    }    if (in != &cin) delete in;  } @@ -175,7 +175,7 @@ struct UniphraseLM {      cerr << "Initializing...\n";      z_.resize(corpus_.size());      int tc = 0; -    for (int i = 0; i < corpus_.size(); ++i) { +    for (unsigned i = 0; i < corpus_.size(); ++i) {        const vector<int>& line = corpus_[i];        const int ls = line.size();        const int last_pos = ls - 1; @@ -201,7 +201,7 @@ struct UniphraseLM {      cerr << "Initial LLH: " << llh() << endl;      cerr << "Sampling...\n";      cerr << gen_ << endl; -    for (int s = 1; s < samples; ++s) { +    for (unsigned s = 1; s < samples; ++s) {        cerr << '.';        if (s % 10 == 0) {          cerr << " [" << s; @@ -211,7 +211,7 @@ struct UniphraseLM {          //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j];          //SegCorpus::Write(corpus_[0], z, d);        } -      for (int i = 0; i < corpus_.size(); ++i) { +      for (unsigned i = 0; i < corpus_.size(); ++i) {          const vector<int>& line = corpus_[i];          const int ls = line.size();          const int last_pos = ls - 1; @@ -276,7 +276,7 @@ struct UniphraseLM {    void ResampleHyperparameters(MT19937* rng) {      phrases_.resample_hyperparameters(rng);      gen_.resample_hyperparameters(rng); -    cerr << " " << phrases_.concentration(); +    cerr << " " << phrases_.alpha();    }    CCRP_NoTable<vector<int> > phrases_; @@ -310,7 +310,7 @@ int main(int argc, char** argv) {    ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng);    cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl; -  for (int i = 0; i < corpus.size(); ++i) +  for (unsigned i = 0; i < corpus.size(); ++i)      WriteSeg(corpus[i], ulm.z_[i], d);    if (conf.count("write_cdec_grammar")) {  | 
