-rw-r--r--   gi/pyp-topics/src/contexts_corpus.cc    | 18
-rw-r--r--   gi/pyp-topics/src/contexts_corpus.hh    |  3
-rw-r--r--   gi/pyp-topics/src/mpi-corpus.hh         | 69
-rw-r--r--   gi/pyp-topics/src/mpi-pyp-topics.cc     |  2
-rw-r--r--   gi/pyp-topics/src/mpi-train-contexts.cc |  3
5 files changed, 89 insertions(+), 6 deletions(-)
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
index 1cf69429..92b1b34c 100644
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -15,6 +15,8 @@ using namespace std;
 // ContextsCorpus
 //////////////////////////////////////////////////
 
+bool read_callback_binary_contexts = false;
+
 void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
   assert(new_contexts.contexts.size() == new_contexts.counts.size());
 
@@ -50,9 +52,15 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*
 
     //int count = new_contexts.counts[i];
     int count = new_contexts.counts.at(i).second;
-    for (int j=0; j<count; ++j)
+    if (read_callback_binary_contexts) {
       doc->push_back(id);
-    corpus_ptr->m_num_terms += count;
+      corpus_ptr->m_num_terms++;
+    }
+    else {
+      for (int j=0; j<count; ++j)
+        doc->push_back(id);
+      corpus_ptr->m_num_terms += count;
+    }
 
     // generate the backoff map
     if (backoff_gen) {
@@ -104,6 +112,7 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void
   for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
     int context_index = new_contexts.counts.at(i).first;
     int count = new_contexts.counts.at(i).second;
+    //if (read_callback_binary_contexts) count = 1;
     //int count = new_contexts.counts[i];
     pair<map<string,int>::iterator,bool> result
       = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count));
@@ -116,7 +125,10 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void
 
 unsigned ContextsCorpus::read_contexts(const string &filename,
                                        BackoffGenerator* backoff_gen_ptr,
-                                       bool /*filter_singeltons*/) {
+                                       bool /*filter_singeltons*/,
+                                       bool binary_contexts) {
+  read_callback_binary_contexts = binary_contexts;
+
   map<string,int> counts;
   //if (filter_singeltons) {
 
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index f7dad21f..dd721361 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -55,7 +55,8 @@ public:
 
   virtual unsigned read_contexts(const std::string &filename,
                                  BackoffGenerator* backoff_gen=0,
-                                 bool filter_singeltons=false);
+                                 bool filter_singeltons=false,
+                                 bool binary_contexts=false);
 
   TermBackoffPtr backoff_index() {
     return m_backoff;
diff --git a/gi/pyp-topics/src/mpi-corpus.hh b/gi/pyp-topics/src/mpi-corpus.hh
new file mode 100644
index 00000000..f5c478a9
--- /dev/null
+++ b/gi/pyp-topics/src/mpi-corpus.hh
@@ -0,0 +1,69 @@
+#ifndef _MPI_CORPUS_HH
+#define _MPI_CORPUS_HH
+
+#include <vector>
+#include <string>
+#include <map>
+#include <tr1/unordered_map>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+#include <boost/mpi/environment.hpp>
+#include <boost/mpi/communicator.hpp>
+
+#include "contexts_corpus.hh"
+
+
+////////////////////////////////////////////////////////////////
+// MPICorpus
+////////////////////////////////////////////////////////////////
+
+class MPICorpus : public ContextsCorpus {
+public:
+  MPICorpus() : ContextsCorpus() {
+    boost::mpi::communicator world;
+    m_rank = world.rank();
+    m_size = world.size();
+    m_start = -1;
+    m_end = -1;
+  }
+  virtual ~MPICorpus() {}
+
+  virtual unsigned read_contexts(const std::string &filename,
+                                 BackoffGenerator* backoff_gen=0,
+                                 bool filter_singeltons=false,
+                                 bool binary_contexts=false) {
+    unsigned result = ContextsCorpus::read_contexts(filename, backoff_gen, filter_singeltons, binary_contexts);
+
+    if (m_rank == 0) std::cerr << "\tLoad balancing terms per mpi segment:" << std::endl;
+    float segment_size = num_terms() / m_size;
+    float term_threshold = segment_size;
+    int seen_terms = 0;
+    std::vector<int> end_points;
+    for (int i=0; i < num_documents(); ++i) {
+      seen_terms += m_documents.at(i).size();
+      if (seen_terms >= term_threshold) {
+        end_points.push_back(i+1);
+        term_threshold += segment_size;
+        if (m_rank == 0) std::cerr << "\t\t" << i+1 << ": " << seen_terms << " terms, " << 100*seen_terms / (float)num_terms() << "%" << std::endl;
+      }
+    }
+    m_start = (m_rank == 0 ? 0 : end_points.at(m_rank-1));
+    m_end = (m_rank == m_size-1 ? num_documents() : end_points.at(m_rank));
+
+    return result;
+  }
+
+  void
+  bounds(int* start, int* end) const {
+    *start = m_start;
+    *end = m_end;
+  }
+
+
+
+protected:
+  int m_rank, m_size;
+  int m_start, m_end;
+};
+
+#endif // _MPI_CORPUS_HH
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc
index 967c3a77..d6e22af6 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.cc
+++ b/gi/pyp-topics/src/mpi-pyp-topics.cc
@@ -77,7 +77,7 @@ void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples,
       int new_topic = -1;
       if (freq > frequency_cutoff
           && (!max_contexts_per_document || term_index < max_contexts_per_document)) {
-        new_topic = sample(document_id, term);
+        new_topic = sample(i, term);
         //new_topic = document_id % m_num_topics;
 
         // add the new topic to the PYPs
diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc
index 7bb890d2..e05e0eac 100644
--- a/gi/pyp-topics/src/mpi-train-contexts.cc
+++ b/gi/pyp-topics/src/mpi-train-contexts.cc
@@ -58,6 +58,7 @@ int main(int argc, char **argv)
     ("backoff-type", value<string>(), "backoff type: none|simple")
 //    ("filter-singleton-contexts", "filter singleton contexts")
     ("hierarchical-topics", "Use a backoff hierarchical PYP as the P0 for the document topics distribution.")
+    ("binary-counts,b", "Use binary rather than integer counts for contexts.")
     ("freq-cutoff-start", value<int>()->default_value(0), "initial frequency cutoff.")
     ("freq-cutoff-end", value<int>()->default_value(0), "final frequency cutoff.")
     ("freq-cutoff-interval", value<int>()->default_value(0), "number of iterations between frequency decrement.")
@@ -107,7 +108,7 @@ int main(int argc, char **argv)
 
   //ContextsCorpus contexts_corpus;
   MPICorpus contexts_corpus;
-  contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false);
+  contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false, vm.count("binary-counts"));
   int mpi_start = 0, mpi_end = 0;
   contexts_corpus.bounds(&mpi_start, &mpi_end);
   std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." << std::endl;
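For reference, the cut-point logic that MPICorpus::read_contexts uses to balance work across MPI ranks can be exercised on its own. The sketch below is a standalone approximation, not code from the patch: the helper partition_documents, the toy per-document term counts, and the floating-point division for the segment size are illustrative choices. It walks the documents in order, cuts a new segment each time the running term total crosses the next multiple of (total terms / number of ranks), and then derives each rank's half-open document range the same way bounds() does.

// Standalone sketch of the greedy load-balancing cut points used in
// MPICorpus::read_contexts.  partition_documents() and the toy document
// sizes are illustrative only; they do not appear in the patch.
#include <iostream>
#include <utility>
#include <vector>

// Returns, for each rank, the half-open document range [start, end).
std::vector<std::pair<int,int> >
partition_documents(const std::vector<int>& doc_sizes, int num_ranks) {
  int num_terms = 0;
  for (int i = 0; i < (int)doc_sizes.size(); ++i)
    num_terms += doc_sizes[i];

  // Walk documents in order; record a cut point each time the running
  // term count crosses the next multiple of the target segment size.
  float segment_size = num_terms / (float)num_ranks;
  float term_threshold = segment_size;
  int seen_terms = 0;
  std::vector<int> end_points;
  for (int i = 0; i < (int)doc_sizes.size(); ++i) {
    seen_terms += doc_sizes[i];
    if (seen_terms >= term_threshold) {
      end_points.push_back(i + 1);
      term_threshold += segment_size;
    }
  }

  // Same convention as MPICorpus::bounds(): rank 0 starts at document 0,
  // the last rank ends at the total number of documents.
  std::vector<std::pair<int,int> > bounds(num_ranks);
  for (int rank = 0; rank < num_ranks; ++rank) {
    bounds[rank].first  = (rank == 0 ? 0 : end_points.at(rank - 1));
    bounds[rank].second = (rank == num_ranks - 1 ? (int)doc_sizes.size()
                                                 : end_points.at(rank));
  }
  return bounds;
}

int main() {
  // Toy corpus: the number of terms in each document (phrase contexts here).
  std::vector<int> doc_sizes;
  doc_sizes.push_back(10); doc_sizes.push_back(3);  doc_sizes.push_back(7);
  doc_sizes.push_back(12); doc_sizes.push_back(8);  doc_sizes.push_back(5);

  std::vector<std::pair<int,int> > bounds = partition_documents(doc_sizes, 3);
  for (int rank = 0; rank < (int)bounds.size(); ++rank)
    std::cout << "rank " << rank << ": documents ["
              << bounds[rank].first << ", " << bounds[rank].second << ")\n";
  return 0;
}

Note that the patch balances segments by term count rather than by document count, which is why its diagnostic output reports the cumulative term percentage at each cut point.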