summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-23 18:29:55 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-23 18:29:55 +0000
commit 3f6de721b089823ac15f3d1ad786e6b479dee4d0 (patch)
tree 98edfe38e297fddcbad2624384d174436f2001b4
parent 70ad159e22fc6ea12a5e7b468ab38a93c3ed111f (diff)
use centralized make
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@11 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- Makefile.am 2
-rw-r--r-- configure.ac 3
-rw-r--r-- decoder/sampler.h 2
-rw-r--r-- decoder/tdict.cc 4
-rw-r--r-- decoder/tdict.h 1
-rw-r--r-- gi/clda/Makefile.am 5
-rw-r--r-- gi/clda/configure.ac 17
-rw-r--r-- gi/clda/src/Makefile.am 4
-rw-r--r-- gi/clda/src/clda.cc 24
-rw-r--r-- gi/clda/src/dict.h 43
-rw-r--r-- gi/clda/src/logval.h 157
-rw-r--r-- gi/clda/src/prob.h 8
-rw-r--r-- gi/clda/src/sampler.h 138
-rw-r--r-- gi/clda/src/tdict.h 49
-rw-r--r-- gi/clda/src/wordid.h 6
-rw-r--r-- gi/pyp-topics/src/Makefile 30
16 files changed, 29 insertions, 464 deletions
diff --git a/Makefile.am b/Makefile.am
index d4e054a9..a355de1f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = decoder training vest extools
+SUBDIRS = decoder training vest extools gi/pyp-topics/src gi/clda/src
AUTOMAKE_OPTIONS = foreign
ACLOCAL_AMFLAGS = -I m4
diff --git a/configure.ac b/configure.ac
index f371b3c3..a7f4cfe7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,5 +69,4 @@ then
AM_CONDITIONAL([RAND_LM], true)
fi
-AC_OUTPUT(Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile)
-
+AC_OUTPUT(Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
diff --git a/decoder/sampler.h b/decoder/sampler.h
index e5840f41..4d0b2e64 100644
--- a/decoder/sampler.h
+++ b/decoder/sampler.h
@@ -88,10 +88,12 @@ typedef RandomNumberGenerator<boost::mt19937> MT19937;
class SampleSet {
public:
const prob_t& operator[](int i) const { return m_scores[i]; }
+ prob_t& operator[](int i) { return m_scores[i]; }
bool empty() const { return m_scores.empty(); }
void add(const prob_t& s) { m_scores.push_back(s); }
void clear() { m_scores.clear(); }
size_t size() const { return m_scores.size(); }
+ void resize(int size) { m_scores.resize(size); }
std::vector<prob_t> m_scores;
};
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index c00d20b8..ac590bd8 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -10,6 +10,10 @@ Vocab* TD::dict_ = new Vocab;
static const string empty;
static const string space = " ";
+unsigned int TD::NumWords() {
+ return dict_->numWords();
+}
+
WordID TD::Convert(const std::string& s) {
return dict_->addWord((VocabString)s.c_str());
}
diff --git a/decoder/tdict.h b/decoder/tdict.h
index 31f66367..fd77543d 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -23,6 +23,7 @@ struct TD {
}
return (dest - buffer);
}
+ static unsigned int NumWords();
static WordID Convert(const std::string& s);
static const char* Convert(const WordID& w);
};
diff --git a/gi/clda/Makefile.am b/gi/clda/Makefile.am
deleted file mode 100644
index 936b6ae3..00000000
--- a/gi/clda/Makefile.am
+++ /dev/null
@@ -1,5 +0,0 @@
-SUBDIRS = src
-AUTOMAKE_OPTIONS = foreign
-
-ACLOCAL_AMFLAGS = -I m4
-
diff --git a/gi/clda/configure.ac b/gi/clda/configure.ac
deleted file mode 100644
index 8469ee09..00000000
--- a/gi/clda/configure.ac
+++ /dev/null
@@ -1,17 +0,0 @@
-AC_INIT
-AM_INIT_AUTOMAKE(cdec,0.1)
-AC_CONFIG_HEADERS(config.h)
-AC_PROG_LIBTOOL
-AC_PROG_CC
-AC_PROG_CXX
-AC_LANG_CPLUSPLUS
-BOOST_REQUIRE
-BOOST_PROGRAM_OPTIONS
-CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
-LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS"
-
-AC_PROG_INSTALL
-
-AC_OUTPUT(Makefile src/Makefile)
-
diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am
index ebb016db..688746bb 100644
--- a/gi/clda/src/Makefile.am
+++ b/gi/clda/src/Makefile.am
@@ -2,5 +2,5 @@ bin_PROGRAMS = clda
clda_SOURCES = clda.cc
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS)
-AM_LDFLAGS = -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/decoder
+AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a -lz
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc
index 482a1c4c..574fa038 100644
--- a/gi/clda/src/clda.cc
+++ b/gi/clda/src/clda.cc
@@ -6,9 +6,6 @@
#include "crp.h"
#include "sampler.h"
#include "tdict.h"
-Dict TD::dict_;
-std::string TD::empty = "";
-std::string TD::space = " ";
const size_t MAX_DOC_LEN_CHARS = 1000000;
using namespace std;
@@ -57,8 +54,8 @@ int main(int argc, char** argv) {
MT19937 rng;
cerr << "INITIALIZING RANDOM TOPIC ASSIGNMENTS\n";
zji.resize(wji.size());
- double beta = 0.01;
- double alpha = 0.001;
+ double beta = 0.1;
+ double alpha = 50.0 / num_classes;
vector<CRP<int> > dr(zji.size(), CRP<int>(beta)); // dr[i] describes the probability of using a topic in document i
vector<CRP<int> > wr(num_classes, CRP<int>(alpha)); // wr[k] describes the probability of generating a word in topic k
int random_topic = rng.next() * num_classes;
@@ -79,9 +76,11 @@ int main(int argc, char** argv) {
vector<map<WordID, int> > t2w(num_classes);
Timer timer;
SampleSet ss;
- const int num_types = TD::dict_.max();
+ const int num_types = TD::NumWords();
const prob_t class_p0(1.0 / num_classes);
const prob_t word_p0(1.0 / num_types);
+ cerr << "CLASS PRIOR PROB: " << class_p0 << endl;
+ cerr << " WORD PRIOR LOGPROB: " << log(word_p0) << endl;
ss.resize(num_classes);
double total_time = 0;
for (int iter = 0; iter < num_iterations; ++iter) {
@@ -131,6 +130,19 @@ int main(int argc, char** argv) {
cerr << "---------------------------------\n";
ShowTopWordsForTopic(t2w[i]);
}
+ cerr << "-------------\n";
+#if 0
+ for (int j = 0; j < zji.size(); ++j) {
+ const size_t num_words = wji[j].size();
+ vector<int>& zj = zji[j];
+ const vector<int>& wj = wji[j];
+ zj.resize(num_words);
+ for (int i = 0; i < num_words; ++i) {
+ cerr << TD::Convert(wji[j][i]) << '(' << zj[i] << ") ";
+ }
+ cerr << endl;
+ }
+#endif
return 0;
}
diff --git a/gi/clda/src/dict.h b/gi/clda/src/dict.h
deleted file mode 100644
index 72e82e6d..00000000
--- a/gi/clda/src/dict.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef DICT_H_
-#define DICT_H_
-
-#include <cassert>
-#include <cstring>
-#include <tr1/unordered_map>
-#include <string>
-#include <vector>
-
-#include <boost/functional/hash.hpp>
-
-#include "wordid.h"
-
-class Dict {
- typedef std::tr1::unordered_map<std::string, WordID, boost::hash<std::string> > Map;
- public:
- Dict() : b0_("<bad0>") { words_.reserve(1000); }
- inline int max() const { return words_.size(); }
- inline WordID Convert(const std::string& word, bool frozen = false) {
- Map::iterator i = d_.find(word);
- if (i == d_.end()) {
- if (frozen)
- return 0;
- words_.push_back(word);
- d_[word] = words_.size();
- return words_.size();
- } else {
- return i->second;
- }
- }
- inline const std::string& Convert(const WordID& id) const {
- if (id == 0) return b0_;
- assert(id <= words_.size());
- return words_[id-1];
- }
- void clear() { words_.clear(); d_.clear(); }
- private:
- const std::string b0_;
- std::vector<std::string> words_;
- Map d_;
-};
-
-#endif
diff --git a/gi/clda/src/logval.h b/gi/clda/src/logval.h
deleted file mode 100644
index 7099b9be..00000000
--- a/gi/clda/src/logval.h
+++ /dev/null
@@ -1,157 +0,0 @@
-#ifndef LOGVAL_H_
-#define LOGVAL_H_
-
-#include <iostream>
-#include <cstdlib>
-#include <cmath>
-#include <limits>
-
-template <typename T>
-class LogVal {
- public:
- LogVal() : s_(), v_(-std::numeric_limits<T>::infinity()) {}
- explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {}
- static LogVal<T> One() { return LogVal(1); }
- static LogVal<T> Zero() { return LogVal(); }
-
- void logeq(const T& v) { s_ = false; v_ = v; }
-
- LogVal& operator+=(const LogVal& a) {
- if (a.v_ == -std::numeric_limits<T>::infinity()) return *this;
- if (a.s_ == s_) {
- if (a.v_ < v_) {
- v_ = v_ + log1p(std::exp(a.v_ - v_));
- } else {
- v_ = a.v_ + log1p(std::exp(v_ - a.v_));
- }
- } else {
- if (a.v_ < v_) {
- v_ = v_ + log1p(-std::exp(a.v_ - v_));
- } else {
- v_ = a.v_ + log1p(-std::exp(v_ - a.v_));
- s_ = !s_;
- }
- }
- return *this;
- }
-
- LogVal& operator*=(const LogVal& a) {
- s_ = (s_ != a.s_);
- v_ += a.v_;
- return *this;
- }
-
- LogVal& operator/=(const LogVal& a) {
- s_ = (s_ != a.s_);
- v_ -= a.v_;
- return *this;
- }
-
- LogVal& operator-=(const LogVal& a) {
- LogVal b = a;
- b.invert();
- return *this += b;
- }
-
- LogVal& poweq(const T& power) {
- if (s_) {
- std::cerr << "poweq(T) not implemented when s_ is true\n";
- std::abort();
- } else {
- v_ *= power;
- }
- return *this;
- }
-
- void invert() { s_ = !s_; }
-
- LogVal pow(const T& power) const {
- LogVal res = *this;
- res.poweq(power);
- return res;
- }
-
- operator T() const {
- if (s_) return -std::exp(v_); else return std::exp(v_);
- }
-
- bool s_;
- T v_;
-};
-
-template<typename T>
-LogVal<T> operator+(const LogVal<T>& o1, const LogVal<T>& o2) {
- LogVal<T> res(o1);
- res += o2;
- return res;
-}
-
-template<typename T>
-LogVal<T> operator*(const LogVal<T>& o1, const LogVal<T>& o2) {
- LogVal<T> res(o1);
- res *= o2;
- return res;
-}
-
-template<typename T>
-LogVal<T> operator/(const LogVal<T>& o1, const LogVal<T>& o2) {
- LogVal<T> res(o1);
- res /= o2;
- return res;
-}
-
-template<typename T>
-LogVal<T> operator-(const LogVal<T>& o1, const LogVal<T>& o2) {
- LogVal<T> res(o1);
- res -= o2;
- return res;
-}
-
-template<typename T>
-T log(const LogVal<T>& o) {
- if (o.s_) return log(-1.0);
- return o.v_;
-}
-
-template <typename T>
-LogVal<T> pow(const LogVal<T>& b, const T& e) {
- return b.pow(e);
-}
-
-template <typename T>
-bool operator<(const LogVal<T>& lhs, const LogVal<T>& rhs) {
- if (lhs.s_ == rhs.s_) {
- return (lhs.v_ < rhs.v_);
- } else {
- return lhs.s_ > rhs.s_;
- }
-}
-
-#if 0
-template <typename T>
-bool operator<=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
- return (lhs.v_ <= rhs.v_);
-}
-
-template <typename T>
-bool operator>(const LogVal<T>& lhs, const LogVal<T>& rhs) {
- return (lhs.v_ > rhs.v_);
-}
-
-template <typename T>
-bool operator>=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
- return (lhs.v_ >= rhs.v_);
-}
-#endif
-
-template <typename T>
-bool operator==(const LogVal<T>& lhs, const LogVal<T>& rhs) {
- return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_);
-}
-
-template <typename T>
-bool operator!=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
- return !(lhs == rhs);
-}
-
-#endif
diff --git a/gi/clda/src/prob.h b/gi/clda/src/prob.h
deleted file mode 100644
index bc297870..00000000
--- a/gi/clda/src/prob.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _PROB_H_
-#define _PROB_H_
-
-#include "logval.h"
-
-typedef LogVal<double> prob_t;
-
-#endif
diff --git a/gi/clda/src/sampler.h b/gi/clda/src/sampler.h
deleted file mode 100644
index 4d0b2e64..00000000
--- a/gi/clda/src/sampler.h
+++ /dev/null
@@ -1,138 +0,0 @@
-#ifndef SAMPLER_H_
-#define SAMPLER_H_
-
-#include <algorithm>
-#include <functional>
-#include <numeric>
-#include <iostream>
-#include <fstream>
-#include <vector>
-
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/random/uniform_real.hpp>
-#include <boost/random/variate_generator.hpp>
-#include <boost/random/normal_distribution.hpp>
-#include <boost/random/poisson_distribution.hpp>
-
-#include "prob.h"
-
-struct SampleSet;
-
-template <typename RNG>
-struct RandomNumberGenerator {
- static uint32_t GetTrulyRandomSeed() {
- uint32_t seed;
- std::ifstream r("/dev/urandom");
- if (r) {
- r.read((char*)&seed,sizeof(uint32_t));
- }
- if (r.fail() || !r) {
- std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl;
- seed = time(NULL);
- }
- std::cerr << "Seeding random number sequence to " << seed << std::endl;
- return seed;
- }
-
- RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) {
- uint32_t seed = GetTrulyRandomSeed();
- m_generator.seed(seed);
- }
- explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) {
- if (!seed) seed = GetTrulyRandomSeed();
- m_generator.seed(seed);
- }
-
- size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) {
- if (T == 1.0) {
- if (this->next() > (a / (a + b))) return 1; else return 0;
- } else {
- assert(!"not implemented");
- }
- }
-
- // T is the annealing temperature, if desired
- size_t SelectSample(const SampleSet& ss, double T = 1.0);
-
- // draw a value from U(0,1)
- double next() {return m_random();}
-
- // draw a value from N(mean,var)
- double NextNormal(double mean, double var) {
- return boost::normal_distribution<double>(mean, var)(m_random);
- }
-
- // draw a value from a Poisson distribution
- // lambda must be greater than 0
- int NextPoisson(int lambda) {
- return boost::poisson_distribution<int>(lambda)(m_random);
- }
-
- bool AcceptMetropolisHastings(const prob_t& p_cur,
- const prob_t& p_prev,
- const prob_t& q_cur,
- const prob_t& q_prev) {
- const prob_t a = (p_cur / p_prev) * (q_prev / q_cur);
- if (log(a) >= 0.0) return true;
- return (prob_t(this->next()) < a);
- }
-
- private:
- boost::uniform_real<> m_dist;
- RNG m_generator;
- boost::variate_generator<RNG&, boost::uniform_real<> > m_random;
-};
-
-typedef RandomNumberGenerator<boost::mt19937> MT19937;
-
-class SampleSet {
- public:
- const prob_t& operator[](int i) const { return m_scores[i]; }
- prob_t& operator[](int i) { return m_scores[i]; }
- bool empty() const { return m_scores.empty(); }
- void add(const prob_t& s) { m_scores.push_back(s); }
- void clear() { m_scores.clear(); }
- size_t size() const { return m_scores.size(); }
- void resize(int size) { m_scores.resize(size); }
- std::vector<prob_t> m_scores;
-};
-
-template <typename RNG>
-size_t RandomNumberGenerator<RNG>::SelectSample(const SampleSet& ss, double T) {
- assert(T > 0.0);
- assert(ss.m_scores.size() > 0);
- if (ss.m_scores.size() == 1) return 0;
- const prob_t annealing_factor(1.0 / T);
- const bool anneal = (annealing_factor != prob_t::One());
- prob_t sum = prob_t::Zero();
- if (anneal) {
- for (int i = 0; i < ss.m_scores.size(); ++i)
- sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T)
- } else {
- sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero());
- }
- //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ",";
- //std::cerr << std::endl;
-
- prob_t random(this->next()); // random number between 0 and 1
- random *= sum; // scale with normalization factor
- //std::cerr << "Random number " << random << std::endl;
-
- //now figure out which sample
- size_t position = 1;
- sum = ss.m_scores[0];
- if (anneal) {
- sum.poweq(annealing_factor);
- for (; position < ss.m_scores.size() && sum < random; ++position)
- sum += ss.m_scores[position].pow(annealing_factor);
- } else {
- for (; position < ss.m_scores.size() && sum < random; ++position)
- sum += ss.m_scores[position];
- }
- //std::cout << "random: " << random << " sample: " << position << std::endl;
- //std::cerr << "Sample: " << position-1 << std::endl;
- //exit(1);
- return position-1;
-}
-
-#endif
diff --git a/gi/clda/src/tdict.h b/gi/clda/src/tdict.h
deleted file mode 100644
index 97f145a1..00000000
--- a/gi/clda/src/tdict.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _TDICT_H_
-#define _TDICT_H_
-
-#include <string>
-#include <vector>
-#include "wordid.h"
-#include "dict.h"
-
-class Vocab;
-
-struct TD {
-
- static Dict dict_;
- static std::string empty;
- static std::string space;
-
- static std::string GetString(const std::vector<WordID>& str) {
- std::string res;
- for (std::vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i)
- res += (i == str.begin() ? empty : space) + TD::Convert(*i);
- return res;
- }
-
- static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
- std::string s = sent;
- int last = 0;
- ids->clear();
- for (int i=0; i < s.size(); ++i)
- if (s[i] == 32 || s[i] == '\t') {
- s[i]=0;
- if (last != i) {
- ids->push_back(Convert(&s[last]));
- }
- last = i + 1;
- }
- if (last != s.size())
- ids->push_back(Convert(&s[last]));
- }
-
- static WordID Convert(const std::string& s) {
- return dict_.Convert(s);
- }
-
- static const std::string& Convert(const WordID& w) {
- return dict_.Convert(w);
- }
-};
-
-#endif
diff --git a/gi/clda/src/wordid.h b/gi/clda/src/wordid.h
deleted file mode 100644
index fb50bcc1..00000000
--- a/gi/clda/src/wordid.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _WORD_ID_H_
-#define _WORD_ID_H_
-
-typedef int WordID;
-
-#endif
diff --git a/gi/pyp-topics/src/Makefile b/gi/pyp-topics/src/Makefile
deleted file mode 100644
index 1d1391ae..00000000
--- a/gi/pyp-topics/src/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
--include makefile.darwin
-
-local_objs = ../obj/corpus.o ../obj/gzstream.o ../obj/mt19937ar.o ../obj/pyp-topics.o ../obj/gammadist.o
-
-all: ../bin/pyp-topics-train
-
--include makefile.depend
-
-#-----------------------#
-# Local stuff
-#-----------------------#
-
-../bin/pyp-topics-train: ../obj/train.o $(local_objs)
- $(CXX) -o $@ $^ $(LDFLAGS)
-
-../obj/%.o: %.cc
- ${CXX} $(CXXFLAGS) -c $< -o $@
-
-../obj/%.o: %.c
- ${CC} $(CFLAGS) -c $< -o $@
-
-.PHONY: depend
-depend:
- $(CXX) -MM $(CXXFLAGS) *.cc | sed 's/^\(.*\.o:\)/obj\/\1/' > makefile.depend
-
-clean:
- rm -f ../obj/*.o
-
-#clobber: clean
-# rm makefile.depend ../bin/${ARCH}/*