summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/mpi-corpus.hh
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
committerChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
commite26434979adc33bd949566ba7bf02dff64e80a3e (patch)
treed1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pyp-topics/src/mpi-corpus.hh
parent0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pyp-topics/src/mpi-corpus.hh')
-rw-r--r--gi/pyp-topics/src/mpi-corpus.hh69
1 files changed, 0 insertions, 69 deletions
diff --git a/gi/pyp-topics/src/mpi-corpus.hh b/gi/pyp-topics/src/mpi-corpus.hh
deleted file mode 100644
index f5c478a9..00000000
--- a/gi/pyp-topics/src/mpi-corpus.hh
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef _MPI_CORPUS_HH
-#define _MPI_CORPUS_HH
-
-#include <vector>
-#include <string>
-#include <map>
-#include <tr1/unordered_map>
-
-#include <boost/ptr_container/ptr_vector.hpp>
-#include <boost/mpi/environment.hpp>
-#include <boost/mpi/communicator.hpp>
-
-#include "contexts_corpus.hh"
-
-
-////////////////////////////////////////////////////////////////
-// MPICorpus
-////////////////////////////////////////////////////////////////
-
-class MPICorpus : public ContextsCorpus {
-public:
- MPICorpus() : ContextsCorpus() {
- boost::mpi::communicator world;
- m_rank = world.rank();
- m_size = world.size();
- m_start = -1;
- m_end = -1;
- }
- virtual ~MPICorpus() {}
-
- virtual unsigned read_contexts(const std::string &filename,
- BackoffGenerator* backoff_gen=0,
- bool filter_singeltons=false,
- bool binary_contexts=false) {
- unsigned result = ContextsCorpus::read_contexts(filename, backoff_gen, filter_singeltons, binary_contexts);
-
- if (m_rank == 0) std::cerr << "\tLoad balancing terms per mpi segment:" << std::endl;
- float segment_size = num_terms() / m_size;
- float term_threshold = segment_size;
- int seen_terms = 0;
- std::vector<int> end_points;
- for (int i=0; i < num_documents(); ++i) {
- seen_terms += m_documents.at(i).size();
- if (seen_terms >= term_threshold) {
- end_points.push_back(i+1);
- term_threshold += segment_size;
- if (m_rank == 0) std::cerr << "\t\t" << i+1 << ": " << seen_terms << " terms, " << 100*seen_terms / (float)num_terms() << "%" << std::endl;
- }
- }
- m_start = (m_rank == 0 ? 0 : end_points.at(m_rank-1));
- m_end = (m_rank == m_size-1 ? num_documents() : end_points.at(m_rank));
-
- return result;
- }
-
- void
- bounds(int* start, int* end) const {
- *start = m_start;
- *end = m_end;
- }
-
-
-
-protected:
- int m_rank, m_size;
- int m_start, m_end;
-};
-
-#endif // _MPI_CORPUS_HH