diff options
author | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-14 22:42:35 +0000 |
---|---|---|
committer | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-14 22:42:35 +0000 |
commit | 8f97e6b03114761870f0c72f18f0928fac28d0f9 (patch) | |
tree | bcee285ff2ee1368f7408647a37ac6f9620c1ca7 /gi/pyp-topics/src/mpi-pyp-topics.hh | |
parent | 047b6eae4e5224d55c43d0994f2691692517f28d (diff) |
starting an mpi version.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@253 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/mpi-pyp-topics.hh')
-rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.hh | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh new file mode 100644 index 00000000..d978c7a1 --- /dev/null +++ b/gi/pyp-topics/src/mpi-pyp-topics.hh @@ -0,0 +1,97 @@ +#ifndef PYP_TOPICS_HH +#define PYP_TOPICS_HH + +#include <vector> +#include <iostream> +#include <boost/ptr_container/ptr_vector.hpp> + +#include <boost/random/uniform_real.hpp> +#include <boost/random/variate_generator.hpp> +#include <boost/random/mersenne_twister.hpp> + +#include "mpi-pyp.hh" +#include "corpus.hh" +#include "workers.hh" + +class PYPTopics { +public: + typedef std::vector<int> DocumentTopics; + typedef std::vector<DocumentTopics> CorpusTopics; + typedef double F; + +public: + PYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0, + int max_threads = 1) + : m_num_topics(num_topics), m_word_pyps(1), + m_topic_pyp(0.5,1.0,seed), m_use_topic_pyp(use_topic_pyp), + m_seed(seed), + uni_dist(0,1), rng(seed == 0 ? (unsigned long)this : seed), + rnd(rng, uni_dist), max_threads(max_threads) {} + + void sample_corpus(const Corpus& corpus, int samples, + int freq_cutoff_start=0, int freq_cutoff_end=0, + int freq_cutoff_interval=0, + int max_contexts_per_document=0); + + int sample(const DocumentId& doc, const Term& term); + int max(const DocumentId& doc, const Term& term) const; + int max(const DocumentId& doc) const; + int max_topic() const; + + void set_backoff(const std::string& filename) { + m_backoff.reset(new TermBackoff); + m_backoff->read(filename); + m_word_pyps.clear(); + m_word_pyps.resize(m_backoff->order(), PYPs()); + } + void set_backoff(TermBackoffPtr backoff) { + m_backoff = backoff; + m_word_pyps.clear(); + m_word_pyps.resize(m_backoff->order(), PYPs()); + } + + F prob(const Term& term, int topic, int level=0) const; + void decrement(const Term& term, int topic, int level=0); + void increment(const Term& term, int topic, int level=0); + + std::ostream& print_document_topics(std::ostream& out) const; + std::ostream& print_topic_terms(std::ostream& out) const; + +private: + F word_pyps_p0(const Term& term, int topic, int level) const; + + int m_num_topics; + F m_term_p0, m_topic_p0, m_backoff_p0; + + CorpusTopics m_corpus_topics; + typedef boost::ptr_vector< PYP<int> > PYPs; + PYPs m_document_pyps; + std::vector<PYPs> m_word_pyps; + PYP<int> m_topic_pyp; + bool m_use_topic_pyp; + + unsigned long m_seed; + + typedef boost::mt19937 base_generator_type; + typedef boost::uniform_real<> uni_dist_type; + typedef boost::variate_generator<base_generator_type&, uni_dist_type> gen_type; + + uni_dist_type uni_dist; + base_generator_type rng; //this gets the seed + gen_type rnd; //instantiate: rnd(rng, uni_dist) + //call: rnd() generates uniform on [0,1) + + typedef boost::function<F()> JobReturnsF; + typedef SimpleWorker<JobReturnsF, F> SimpleResampleWorker; + typedef boost::ptr_vector<SimpleResampleWorker> WorkerPtrVect; + + F hresample_docs(int num_threads, int thread_id); + +// F hresample_topics(); + + int max_threads; + + TermBackoffPtr m_backoff; +}; + +#endif // PYP_TOPICS_HH |