diff options
author | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-25 02:11:28 +0000 |
---|---|---|
committer | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-25 02:11:28 +0000 |
commit | 87001f279f6685be18a12fc153714c36fe19067d (patch) | |
tree | 3cd8bcb4f743918f6ad42b504c577b409b2b43e6 /gi/pyp-topics/src/mpi-corpus.hh | |
parent | 04d4aa3bdbd3366e1b140d6f0783c38b4e469c02 (diff) |
added missing file.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@402 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/mpi-corpus.hh')
-rw-r--r-- | gi/pyp-topics/src/mpi-corpus.hh | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/mpi-corpus.hh b/gi/pyp-topics/src/mpi-corpus.hh new file mode 100644 index 00000000..f5c478a9 --- /dev/null +++ b/gi/pyp-topics/src/mpi-corpus.hh @@ -0,0 +1,69 @@ +#ifndef _MPI_CORPUS_HH +#define _MPI_CORPUS_HH + +#include <vector> +#include <string> +#include <map> +#include <tr1/unordered_map> + +#include <boost/ptr_container/ptr_vector.hpp> +#include <boost/mpi/environment.hpp> +#include <boost/mpi/communicator.hpp> + +#include "contexts_corpus.hh" + + +//////////////////////////////////////////////////////////////// +// MPICorpus +//////////////////////////////////////////////////////////////// + +class MPICorpus : public ContextsCorpus { +public: + MPICorpus() : ContextsCorpus() { + boost::mpi::communicator world; + m_rank = world.rank(); + m_size = world.size(); + m_start = -1; + m_end = -1; + } + virtual ~MPICorpus() {} + + virtual unsigned read_contexts(const std::string &filename, + BackoffGenerator* backoff_gen=0, + bool filter_singeltons=false, + bool binary_contexts=false) { + unsigned result = ContextsCorpus::read_contexts(filename, backoff_gen, filter_singeltons, binary_contexts); + + if (m_rank == 0) std::cerr << "\tLoad balancing terms per mpi segment:" << std::endl; + float segment_size = num_terms() / m_size; + float term_threshold = segment_size; + int seen_terms = 0; + std::vector<int> end_points; + for (int i=0; i < num_documents(); ++i) { + seen_terms += m_documents.at(i).size(); + if (seen_terms >= term_threshold) { + end_points.push_back(i+1); + term_threshold += segment_size; + if (m_rank == 0) std::cerr << "\t\t" << i+1 << ": " << seen_terms << " terms, " << 100*seen_terms / (float)num_terms() << "%" << std::endl; + } + } + m_start = (m_rank == 0 ? 0 : end_points.at(m_rank-1)); + m_end = (m_rank == m_size-1 ? num_documents() : end_points.at(m_rank)); + + return result; + } + + void + bounds(int* start, int* end) const { + *start = m_start; + *end = m_end; + } + + + +protected: + int m_rank, m_size; + int m_start, m_end; +}; + +#endif // _MPI_CORPUS_HH |