diff options
Diffstat (limited to 'gi/clda/src')
-rw-r--r-- | gi/clda/src/Makefile.am | 4 | ||||
-rw-r--r-- | gi/clda/src/clda.cc | 24 | ||||
-rw-r--r-- | gi/clda/src/dict.h | 43 | ||||
-rw-r--r-- | gi/clda/src/logval.h | 157 | ||||
-rw-r--r-- | gi/clda/src/prob.h | 8 | ||||
-rw-r--r-- | gi/clda/src/sampler.h | 138 | ||||
-rw-r--r-- | gi/clda/src/tdict.h | 49 | ||||
-rw-r--r-- | gi/clda/src/wordid.h | 6 |
8 files changed, 20 insertions, 409 deletions
diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am index ebb016db..688746bb 100644 --- a/gi/clda/src/Makefile.am +++ b/gi/clda/src/Makefile.am @@ -2,5 +2,5 @@ bin_PROGRAMS = clda clda_SOURCES = clda.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -AM_LDFLAGS = -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/decoder +AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a -lz diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc index 482a1c4c..574fa038 100644 --- a/gi/clda/src/clda.cc +++ b/gi/clda/src/clda.cc @@ -6,9 +6,6 @@ #include "crp.h" #include "sampler.h" #include "tdict.h" -Dict TD::dict_; -std::string TD::empty = ""; -std::string TD::space = " "; const size_t MAX_DOC_LEN_CHARS = 1000000; using namespace std; @@ -57,8 +54,8 @@ int main(int argc, char** argv) { MT19937 rng; cerr << "INITIALIZING RANDOM TOPIC ASSIGNMENTS\n"; zji.resize(wji.size()); - double beta = 0.01; - double alpha = 0.001; + double beta = 0.1; + double alpha = 50.0 / num_classes; vector<CRP<int> > dr(zji.size(), CRP<int>(beta)); // dr[i] describes the probability of using a topic in document i vector<CRP<int> > wr(num_classes, CRP<int>(alpha)); // wr[k] describes the probability of generating a word in topic k int random_topic = rng.next() * num_classes; @@ -79,9 +76,11 @@ int main(int argc, char** argv) { vector<map<WordID, int> > t2w(num_classes); Timer timer; SampleSet ss; - const int num_types = TD::dict_.max(); + const int num_types = TD::NumWords(); const prob_t class_p0(1.0 / num_classes); const prob_t word_p0(1.0 / num_types); + cerr << "CLASS PRIOR PROB: " << class_p0 << endl; + cerr << " WORD PRIOR LOGPROB: " << log(word_p0) << endl; ss.resize(num_classes); double total_time = 0; for (int iter = 0; iter < num_iterations; ++iter) { @@ -131,6 +130,19 @@ int main(int argc, char** argv) { cerr << "---------------------------------\n"; ShowTopWordsForTopic(t2w[i]); } + cerr << "-------------\n"; +#if 0 + for (int j = 0; j < zji.size(); ++j) { + const size_t num_words = wji[j].size(); + vector<int>& zj = zji[j]; + const vector<int>& wj = wji[j]; + zj.resize(num_words); + for (int i = 0; i < num_words; ++i) { + cerr << TD::Convert(wji[j][i]) << '(' << zj[i] << ") "; + } + cerr << endl; + } +#endif return 0; } diff --git a/gi/clda/src/dict.h b/gi/clda/src/dict.h deleted file mode 100644 index 72e82e6d..00000000 --- a/gi/clda/src/dict.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef DICT_H_ -#define DICT_H_ - -#include <cassert> -#include <cstring> -#include <tr1/unordered_map> -#include <string> -#include <vector> - -#include <boost/functional/hash.hpp> - -#include "wordid.h" - -class Dict { - typedef std::tr1::unordered_map<std::string, WordID, boost::hash<std::string> > Map; - public: - Dict() : b0_("<bad0>") { words_.reserve(1000); } - inline int max() const { return words_.size(); } - inline WordID Convert(const std::string& word, bool frozen = false) { - Map::iterator i = d_.find(word); - if (i == d_.end()) { - if (frozen) - return 0; - words_.push_back(word); - d_[word] = words_.size(); - return words_.size(); - } else { - return i->second; - } - } - inline const std::string& Convert(const WordID& id) const { - if (id == 0) return b0_; - assert(id <= words_.size()); - return words_[id-1]; - } - void clear() { words_.clear(); d_.clear(); } - private: - const std::string b0_; - std::vector<std::string> words_; - Map d_; -}; - -#endif diff --git a/gi/clda/src/logval.h b/gi/clda/src/logval.h deleted file mode 100644 index 7099b9be..00000000 --- a/gi/clda/src/logval.h +++ /dev/null @@ -1,157 +0,0 @@ -#ifndef LOGVAL_H_ -#define LOGVAL_H_ - -#include <iostream> -#include <cstdlib> -#include <cmath> -#include <limits> - -template <typename T> -class LogVal { - public: - LogVal() : s_(), v_(-std::numeric_limits<T>::infinity()) {} - explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} - static LogVal<T> One() { return LogVal(1); } - static LogVal<T> Zero() { return LogVal(); } - - void logeq(const T& v) { s_ = false; v_ = v; } - - LogVal& operator+=(const LogVal& a) { - if (a.v_ == -std::numeric_limits<T>::infinity()) return *this; - if (a.s_ == s_) { - if (a.v_ < v_) { - v_ = v_ + log1p(std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(std::exp(v_ - a.v_)); - } - } else { - if (a.v_ < v_) { - v_ = v_ + log1p(-std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(-std::exp(v_ - a.v_)); - s_ = !s_; - } - } - return *this; - } - - LogVal& operator*=(const LogVal& a) { - s_ = (s_ != a.s_); - v_ += a.v_; - return *this; - } - - LogVal& operator/=(const LogVal& a) { - s_ = (s_ != a.s_); - v_ -= a.v_; - return *this; - } - - LogVal& operator-=(const LogVal& a) { - LogVal b = a; - b.invert(); - return *this += b; - } - - LogVal& poweq(const T& power) { - if (s_) { - std::cerr << "poweq(T) not implemented when s_ is true\n"; - std::abort(); - } else { - v_ *= power; - } - return *this; - } - - void invert() { s_ = !s_; } - - LogVal pow(const T& power) const { - LogVal res = *this; - res.poweq(power); - return res; - } - - operator T() const { - if (s_) return -std::exp(v_); else return std::exp(v_); - } - - bool s_; - T v_; -}; - -template<typename T> -LogVal<T> operator+(const LogVal<T>& o1, const LogVal<T>& o2) { - LogVal<T> res(o1); - res += o2; - return res; -} - -template<typename T> -LogVal<T> operator*(const LogVal<T>& o1, const LogVal<T>& o2) { - LogVal<T> res(o1); - res *= o2; - return res; -} - -template<typename T> -LogVal<T> operator/(const LogVal<T>& o1, const LogVal<T>& o2) { - LogVal<T> res(o1); - res /= o2; - return res; -} - -template<typename T> -LogVal<T> operator-(const LogVal<T>& o1, const LogVal<T>& o2) { - LogVal<T> res(o1); - res -= o2; - return res; -} - -template<typename T> -T log(const LogVal<T>& o) { - if (o.s_) return log(-1.0); - return o.v_; -} - -template <typename T> -LogVal<T> pow(const LogVal<T>& b, const T& e) { - return b.pow(e); -} - -template <typename T> -bool operator<(const LogVal<T>& lhs, const LogVal<T>& rhs) { - if (lhs.s_ == rhs.s_) { - return (lhs.v_ < rhs.v_); - } else { - return lhs.s_ > rhs.s_; - } -} - -#if 0 -template <typename T> -bool operator<=(const LogVal<T>& lhs, const LogVal<T>& rhs) { - return (lhs.v_ <= rhs.v_); -} - -template <typename T> -bool operator>(const LogVal<T>& lhs, const LogVal<T>& rhs) { - return (lhs.v_ > rhs.v_); -} - -template <typename T> -bool operator>=(const LogVal<T>& lhs, const LogVal<T>& rhs) { - return (lhs.v_ >= rhs.v_); -} -#endif - -template <typename T> -bool operator==(const LogVal<T>& lhs, const LogVal<T>& rhs) { - return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_); -} - -template <typename T> -bool operator!=(const LogVal<T>& lhs, const LogVal<T>& rhs) { - return !(lhs == rhs); -} - -#endif diff --git a/gi/clda/src/prob.h b/gi/clda/src/prob.h deleted file mode 100644 index bc297870..00000000 --- a/gi/clda/src/prob.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _PROB_H_ -#define _PROB_H_ - -#include "logval.h" - -typedef LogVal<double> prob_t; - -#endif diff --git a/gi/clda/src/sampler.h b/gi/clda/src/sampler.h deleted file mode 100644 index 4d0b2e64..00000000 --- a/gi/clda/src/sampler.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef SAMPLER_H_ -#define SAMPLER_H_ - -#include <algorithm> -#include <functional> -#include <numeric> -#include <iostream> -#include <fstream> -#include <vector> - -#include <boost/random/mersenne_twister.hpp> -#include <boost/random/uniform_real.hpp> -#include <boost/random/variate_generator.hpp> -#include <boost/random/normal_distribution.hpp> -#include <boost/random/poisson_distribution.hpp> - -#include "prob.h" - -struct SampleSet; - -template <typename RNG> -struct RandomNumberGenerator { - static uint32_t GetTrulyRandomSeed() { - uint32_t seed; - std::ifstream r("/dev/urandom"); - if (r) { - r.read((char*)&seed,sizeof(uint32_t)); - } - if (r.fail() || !r) { - std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl; - seed = time(NULL); - } - std::cerr << "Seeding random number sequence to " << seed << std::endl; - return seed; - } - - RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { - uint32_t seed = GetTrulyRandomSeed(); - m_generator.seed(seed); - } - explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { - if (!seed) seed = GetTrulyRandomSeed(); - m_generator.seed(seed); - } - - size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) { - if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; - } else { - assert(!"not implemented"); - } - } - - // T is the annealing temperature, if desired - size_t SelectSample(const SampleSet& ss, double T = 1.0); - - // draw a value from U(0,1) - double next() {return m_random();} - - // draw a value from N(mean,var) - double NextNormal(double mean, double var) { - return boost::normal_distribution<double>(mean, var)(m_random); - } - - // draw a value from a Poisson distribution - // lambda must be greater than 0 - int NextPoisson(int lambda) { - return boost::poisson_distribution<int>(lambda)(m_random); - } - - bool AcceptMetropolisHastings(const prob_t& p_cur, - const prob_t& p_prev, - const prob_t& q_cur, - const prob_t& q_prev) { - const prob_t a = (p_cur / p_prev) * (q_prev / q_cur); - if (log(a) >= 0.0) return true; - return (prob_t(this->next()) < a); - } - - private: - boost::uniform_real<> m_dist; - RNG m_generator; - boost::variate_generator<RNG&, boost::uniform_real<> > m_random; -}; - -typedef RandomNumberGenerator<boost::mt19937> MT19937; - -class SampleSet { - public: - const prob_t& operator[](int i) const { return m_scores[i]; } - prob_t& operator[](int i) { return m_scores[i]; } - bool empty() const { return m_scores.empty(); } - void add(const prob_t& s) { m_scores.push_back(s); } - void clear() { m_scores.clear(); } - size_t size() const { return m_scores.size(); } - void resize(int size) { m_scores.resize(size); } - std::vector<prob_t> m_scores; -}; - -template <typename RNG> -size_t RandomNumberGenerator<RNG>::SelectSample(const SampleSet& ss, double T) { - assert(T > 0.0); - assert(ss.m_scores.size() > 0); - if (ss.m_scores.size() == 1) return 0; - const prob_t annealing_factor(1.0 / T); - const bool anneal = (annealing_factor != prob_t::One()); - prob_t sum = prob_t::Zero(); - if (anneal) { - for (int i = 0; i < ss.m_scores.size(); ++i) - sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T) - } else { - sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); - } - //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; - //std::cerr << std::endl; - - prob_t random(this->next()); // random number between 0 and 1 - random *= sum; // scale with normalization factor - //std::cerr << "Random number " << random << std::endl; - - //now figure out which sample - size_t position = 1; - sum = ss.m_scores[0]; - if (anneal) { - sum.poweq(annealing_factor); - for (; position < ss.m_scores.size() && sum < random; ++position) - sum += ss.m_scores[position].pow(annealing_factor); - } else { - for (; position < ss.m_scores.size() && sum < random; ++position) - sum += ss.m_scores[position]; - } - //std::cout << "random: " << random << " sample: " << position << std::endl; - //std::cerr << "Sample: " << position-1 << std::endl; - //exit(1); - return position-1; -} - -#endif diff --git a/gi/clda/src/tdict.h b/gi/clda/src/tdict.h deleted file mode 100644 index 97f145a1..00000000 --- a/gi/clda/src/tdict.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _TDICT_H_ -#define _TDICT_H_ - -#include <string> -#include <vector> -#include "wordid.h" -#include "dict.h" - -class Vocab; - -struct TD { - - static Dict dict_; - static std::string empty; - static std::string space; - - static std::string GetString(const std::vector<WordID>& str) { - std::string res; - for (std::vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i) - res += (i == str.begin() ? empty : space) + TD::Convert(*i); - return res; - } - - static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids) { - std::string s = sent; - int last = 0; - ids->clear(); - for (int i=0; i < s.size(); ++i) - if (s[i] == 32 || s[i] == '\t') { - s[i]=0; - if (last != i) { - ids->push_back(Convert(&s[last])); - } - last = i + 1; - } - if (last != s.size()) - ids->push_back(Convert(&s[last])); - } - - static WordID Convert(const std::string& s) { - return dict_.Convert(s); - } - - static const std::string& Convert(const WordID& w) { - return dict_.Convert(w); - } -}; - -#endif diff --git a/gi/clda/src/wordid.h b/gi/clda/src/wordid.h deleted file mode 100644 index fb50bcc1..00000000 --- a/gi/clda/src/wordid.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _WORD_ID_H_ -#define _WORD_ID_H_ - -typedef int WordID; - -#endif |