From e0bca5fea3b0267819186d0fc34c036e6b77679c Mon Sep 17 00:00:00 2001 From: philblunsom Date: Fri, 23 Jul 2010 16:04:32 +0000 Subject: Changed the initialisation of the sampler, hopefully this will work better. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@376 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pyp-topics/src/contexts_corpus.hh | 8 ++++---- gi/pyp-topics/src/corpus.hh | 2 +- gi/pyp-topics/src/makefile.depend | 13 +++++++++---- gi/pyp-topics/src/mpi-pyp-topics.cc | 26 +++++++++++++++++--------- gi/pyp-topics/src/mpi-pyp-topics.hh | 8 ++++---- gi/pyp-topics/src/mpi-train-contexts.cc | 20 ++++++++++++++------ gi/pyp-topics/src/pyp-topics.cc | 6 ++++-- 7 files changed, 53 insertions(+), 30 deletions(-) (limited to 'gi/pyp-topics/src') diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index f3c25454..f7dad21f 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -53,9 +53,9 @@ public: ContextsCorpus() : m_backoff(new TermBackoff) {} virtual ~ContextsCorpus() {} - unsigned read_contexts(const std::string &filename, - BackoffGenerator* backoff_gen=0, - bool filter_singeltons=false); + virtual unsigned read_contexts(const std::string &filename, + BackoffGenerator* backoff_gen=0, + bool filter_singeltons=false); TermBackoffPtr backoff_index() { return m_backoff; @@ -77,7 +77,7 @@ public: return m_keys.at(i); } -private: +protected: TermBackoffPtr m_backoff; Dict m_dict; std::vector m_keys; diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh index 24981946..2aa03527 100644 --- a/gi/pyp-topics/src/corpus.hh +++ b/gi/pyp-topics/src/corpus.hh @@ -25,7 +25,7 @@ public: Corpus(); virtual ~Corpus() {} - unsigned read(const std::string &filename); + virtual unsigned read(const std::string &filename); const_iterator begin() const { return m_documents.begin(); } const_iterator end() const { return m_documents.end(); } diff --git a/gi/pyp-topics/src/makefile.depend b/gi/pyp-topics/src/makefile.depend index d8ef78d8..9b8e306c 100644 --- a/gi/pyp-topics/src/makefile.depend +++ b/gi/pyp-topics/src/makefile.depend @@ -1442,6 +1442,9 @@ mpi-pyp-topics.o: mpi-pyp-topics.cc \ /home/pblunsom/packages/include/boost/random/linear_congruential.hpp \ /home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ /home/pblunsom/packages/include/boost/random/detail/seed.hpp \ + /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \ + /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \ + /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \ /home/pblunsom/packages/include/boost/mpi/environment.hpp mpi-pyp.hh \ /home/pblunsom/packages/include/boost/tuple/tuple.hpp \ /home/pblunsom/packages/include/boost/ref.hpp \ @@ -2151,7 +2154,10 @@ mpi-train-contexts.o: mpi-train-contexts.cc \ /home/pblunsom/packages/include/boost/random/mersenne_twister.hpp \ /home/pblunsom/packages/include/boost/random/linear_congruential.hpp \ /home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ - /home/pblunsom/packages/include/boost/random/detail/seed.hpp mpi-pyp.hh \ + /home/pblunsom/packages/include/boost/random/detail/seed.hpp \ + /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \ + /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \ + /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp mpi-pyp.hh \ /home/pblunsom/packages/include/boost/tuple/tuple.hpp \ /home/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \ /home/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \ @@ -2239,14 +2245,13 @@ mpi-train-contexts.o: mpi-train-contexts.cc \ /home/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \ /home/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \ /home/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \ - slice-sampler.h log_add.h mt19937ar.h corpus.hh contexts_corpus.hh \ - contexts_lexer.h ../../../decoder/dict.h \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh mpi-corpus.hh \ + contexts_corpus.hh contexts_lexer.h ../../../decoder/dict.h \ /home/pblunsom/packages/include/boost/functional/hash.hpp \ /home/pblunsom/packages/include/boost/functional/hash/hash.hpp \ /home/pblunsom/packages/include/boost/functional/hash/hash_fwd.hpp \ /home/pblunsom/packages/include/boost/functional/hash/detail/hash_float.hpp \ /home/pblunsom/packages/include/boost/functional/hash/detail/float_functions.hpp \ - /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \ /home/pblunsom/packages/include/boost/functional/hash/detail/limits.hpp \ /home/pblunsom/packages/include/boost/integer/static_log2.hpp \ /home/pblunsom/packages/include/boost/functional/hash/detail/hash_float_generic.hpp \ diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc index fa951156..967c3a77 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.cc +++ b/gi/pyp-topics/src/mpi-pyp-topics.cc @@ -4,13 +4,14 @@ #include "mpi-pyp-topics.hh" //#include -void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, +void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples, int freq_cutoff_start, int freq_cutoff_end, int freq_cutoff_interval, int max_contexts_per_document) { Timer timer; - int documents = corpus.num_documents(); + //int documents = corpus.num_documents(); + /* m_mpi_start = 0; m_mpi_end = documents; if (m_size != 1) { @@ -19,6 +20,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, if (m_rank == m_size-1) m_mpi_end = documents; else m_mpi_end = (documents / m_size)*(m_rank+1); } + */ + corpus.bounds(&m_mpi_start, &m_mpi_end); int local_documents = m_mpi_end - m_mpi_start; if (!m_backoff.get()) { @@ -74,7 +77,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, int new_topic = -1; if (freq > frequency_cutoff && (!max_contexts_per_document || term_index < max_contexts_per_document)) { - new_topic = document_id % m_num_topics; + new_topic = sample(document_id, term); + //new_topic = document_id % m_num_topics; // add the new topic to the PYPs increment(term, new_topic); @@ -336,7 +340,8 @@ MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int leve Term backoff_term = (*m_backoff)[term]; if (!m_backoff->is_null(backoff_term)) { assert (level < m_backoff->order()); - p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); + //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); + p0 = prob(backoff_term, topic, level+1); } else p0 = m_term_p0; @@ -373,10 +378,11 @@ int MPIPYPTopics::max_topic() const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return current_max; } -int MPIPYPTopics::max(const DocumentId& true_doc) const { +std::pair MPIPYPTopics::max(const DocumentId& true_doc) const { //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; @@ -399,10 +405,11 @@ int MPIPYPTopics::max(const DocumentId& true_doc) const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return std::make_pair(current_topic, current_max); } -int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { +std::pair MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; @@ -426,7 +433,8 @@ int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return std::make_pair(current_topic, current_max); } std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const { diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh index 4a4433e6..d96bc4e5 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.hh +++ b/gi/pyp-topics/src/mpi-pyp-topics.hh @@ -16,7 +16,7 @@ #include "mpi-pyp.hh" -#include "corpus.hh" +#include "mpi-corpus.hh" class MPIPYPTopics { public: @@ -37,14 +37,14 @@ public: m_am_root = (m_rank == 0); } - void sample_corpus(const Corpus& corpus, int samples, + void sample_corpus(const MPICorpus& corpus, int samples, int freq_cutoff_start=0, int freq_cutoff_end=0, int freq_cutoff_interval=0, int max_contexts_per_document=0); int sample(const DocumentId& doc, const Term& term); - int max(const DocumentId& doc, const Term& term) const; - int max(const DocumentId& doc) const; + std::pair max(const DocumentId& doc, const Term& term) const; + std::pair max(const DocumentId& doc) const; int max_topic() const; void set_backoff(const std::string& filename) { diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc index 4f966a65..7bb890d2 100644 --- a/gi/pyp-topics/src/mpi-train-contexts.cc +++ b/gi/pyp-topics/src/mpi-train-contexts.cc @@ -15,7 +15,7 @@ // Local #include "mpi-pyp-topics.hh" #include "corpus.hh" -#include "contexts_corpus.hh" +#include "mpi-corpus.hh" #include "gzstream.hh" static const char *REVISION = "$Rev: 170 $"; @@ -105,8 +105,13 @@ int main(int argc, char **argv) } } - ContextsCorpus contexts_corpus; + //ContextsCorpus contexts_corpus; + MPICorpus contexts_corpus; contexts_corpus.read_contexts(vm["data"].as(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false); + int mpi_start = 0, mpi_end = 0; + contexts_corpus.bounds(&mpi_start, &mpi_end); + std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." << std::endl; + model.set_backoff(contexts_corpus.backoff_index()); if (backoff_gen) @@ -121,13 +126,15 @@ int main(int argc, char **argv) if (vm.count("document-topics-out")) { std::ofstream documents_out((vm["document-topics-out"].as() + ".pyp-process-" + boost::lexical_cast(rank)).c_str()); - int documents = contexts_corpus.num_documents(); + //int documents = contexts_corpus.num_documents(); + /* int mpi_start = 0, mpi_end = documents; if (world.size() != 1) { mpi_start = (documents / world.size()) * rank; if (rank == world.size()-1) mpi_end = documents; else mpi_end = (documents / world.size())*(rank+1); } + */ map all_terms; for (int document_id=mpi_start; document_id::const_iterator termIt=unique_terms.begin(); termIt != unique_terms.end(); ++termIt) { if (termIt != unique_terms.begin()) documents_out << " ||| "; vector strings = contexts_corpus.context2string(*termIt); copy(strings.begin(), strings.end(),ostream_iterator(documents_out, " ")); - documents_out << "||| C=" << model.max(document_id, *termIt); + std::pair maxinfo = model.max(document_id, *termIt); + documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second; } documents_out <::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) { vector strings = contexts_corpus.context2string(termIt->first); - default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| "; + default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| "; copy(strings.begin(), strings.end(),ostream_iterator(default_topics, " ")); default_topics < frequency_cutoff && (!max_contexts_per_document || term_index < max_contexts_per_document)) { - new_topic = document_id % m_num_topics; + new_topic = sample(document_id, term); + //new_topic = document_id % m_num_topics; // add the new topic to the PYPs increment(term, new_topic); @@ -314,7 +315,8 @@ PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) con Term backoff_term = (*m_backoff)[term]; if (!m_backoff->is_null(backoff_term)) { assert (level < m_backoff->order()); - p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); + //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); + p0 = prob(backoff_term, topic, level+1); } else p0 = m_term_p0; -- cgit v1.2.3