From e0bca5fea3b0267819186d0fc34c036e6b77679c Mon Sep 17 00:00:00 2001 From: philblunsom Date: Fri, 23 Jul 2010 16:04:32 +0000 Subject: Changed the initialisation of the sampler, hopefully this will work better. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@376 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pyp-topics/src/mpi-pyp-topics.cc | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'gi/pyp-topics/src/mpi-pyp-topics.cc') diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc index fa951156..967c3a77 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.cc +++ b/gi/pyp-topics/src/mpi-pyp-topics.cc @@ -4,13 +4,14 @@ #include "mpi-pyp-topics.hh" //#include -void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, +void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples, int freq_cutoff_start, int freq_cutoff_end, int freq_cutoff_interval, int max_contexts_per_document) { Timer timer; - int documents = corpus.num_documents(); + //int documents = corpus.num_documents(); + /* m_mpi_start = 0; m_mpi_end = documents; if (m_size != 1) { @@ -19,6 +20,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, if (m_rank == m_size-1) m_mpi_end = documents; else m_mpi_end = (documents / m_size)*(m_rank+1); } + */ + corpus.bounds(&m_mpi_start, &m_mpi_end); int local_documents = m_mpi_end - m_mpi_start; if (!m_backoff.get()) { @@ -74,7 +77,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, int new_topic = -1; if (freq > frequency_cutoff && (!max_contexts_per_document || term_index < max_contexts_per_document)) { - new_topic = document_id % m_num_topics; + new_topic = sample(document_id, term); + //new_topic = document_id % m_num_topics; // add the new topic to the PYPs increment(term, new_topic); @@ -336,7 +340,8 @@ MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int leve Term backoff_term = (*m_backoff)[term]; if 
(!m_backoff->is_null(backoff_term)) { assert (level < m_backoff->order()); - p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); + //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); + p0 = prob(backoff_term, topic, level+1); } else p0 = m_term_p0; @@ -373,10 +378,11 @@ int MPIPYPTopics::max_topic() const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return current_max; } -int MPIPYPTopics::max(const DocumentId& true_doc) const { +std::pair<int, MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc) const { //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; @@ -399,10 +405,11 @@ int MPIPYPTopics::max(const DocumentId& true_doc) const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return std::make_pair(current_topic, current_max); } -int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { +std::pair<int, MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; @@ -426,7 +433,8 @@ int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return std::make_pair(current_topic, current_max); } std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const { -- cgit v1.2.3