diff options
Diffstat (limited to 'gi/pyp-topics/src')
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.cc | 18 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 3 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-corpus.hh | 69 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.cc | 2 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-train-contexts.cc | 3 | 
5 files changed, 89 insertions, 6 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc index 1cf69429..92b1b34c 100644 --- a/gi/pyp-topics/src/contexts_corpus.cc +++ b/gi/pyp-topics/src/contexts_corpus.cc @@ -15,6 +15,8 @@ using namespace std;  // ContextsCorpus  ////////////////////////////////////////////////// +bool read_callback_binary_contexts = false; +  void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {    assert(new_contexts.contexts.size() == new_contexts.counts.size()); @@ -50,9 +52,15 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*      //int count = new_contexts.counts[i];      int count = new_contexts.counts.at(i).second; -    for (int j=0; j<count; ++j) +    if (read_callback_binary_contexts) {        doc->push_back(id); -    corpus_ptr->m_num_terms += count; +      corpus_ptr->m_num_terms++; +    } +    else { +      for (int j=0; j<count; ++j) +        doc->push_back(id); +      corpus_ptr->m_num_terms += count; +    }      // generate the backoff map      if (backoff_gen) { @@ -104,6 +112,7 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void    for (int i=0; i < (int)new_contexts.counts.size(); ++i) {      int context_index = new_contexts.counts.at(i).first;      int count = new_contexts.counts.at(i).second; +    //if (read_callback_binary_contexts) count = 1;      //int count = new_contexts.counts[i];      pair<map<string,int>::iterator,bool> result         = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count)); @@ -116,7 +125,10 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void  unsigned ContextsCorpus::read_contexts(const string &filename,                                          BackoffGenerator* backoff_gen_ptr, -                                       bool /*filter_singeltons*/) { +                                       bool /*filter_singeltons*/, +                                       bool binary_contexts) { +  read_callback_binary_contexts = binary_contexts; +    map<string,int> counts;    //if (filter_singeltons)     { diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index f7dad21f..dd721361 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -55,7 +55,8 @@ public:      virtual unsigned read_contexts(const std::string &filename,                                      BackoffGenerator* backoff_gen=0, -                                   bool filter_singeltons=false); +                                   bool filter_singeltons=false, +                                   bool binary_contexts=false);      TermBackoffPtr backoff_index() {        return m_backoff; diff --git a/gi/pyp-topics/src/mpi-corpus.hh b/gi/pyp-topics/src/mpi-corpus.hh new file mode 100644 index 00000000..f5c478a9 --- /dev/null +++ b/gi/pyp-topics/src/mpi-corpus.hh @@ -0,0 +1,69 @@ +#ifndef _MPI_CORPUS_HH +#define _MPI_CORPUS_HH + +#include <vector> +#include <string> +#include <map> +#include <tr1/unordered_map> + +#include <boost/ptr_container/ptr_vector.hpp> +#include <boost/mpi/environment.hpp> +#include <boost/mpi/communicator.hpp> + +#include "contexts_corpus.hh" + + +//////////////////////////////////////////////////////////////// +// MPICorpus +//////////////////////////////////////////////////////////////// + +class MPICorpus : public ContextsCorpus { +public: +  MPICorpus() : ContextsCorpus() { +    boost::mpi::communicator world; +    m_rank = world.rank(); +    m_size = world.size(); +    m_start = -1; +    m_end = -1; +  } +  virtual ~MPICorpus() {} + +  virtual unsigned read_contexts(const std::string &filename,  +                                 BackoffGenerator* backoff_gen=0, +                                 bool filter_singeltons=false, +                                 bool binary_contexts=false) { +    unsigned result = ContextsCorpus::read_contexts(filename, backoff_gen, filter_singeltons, binary_contexts); + +    if (m_rank == 0) std::cerr << "\tLoad balancing terms per mpi segment:" << std::endl; +    float segment_size = num_terms() / m_size; +    float term_threshold = segment_size; +    int seen_terms = 0; +    std::vector<int> end_points; +    for (int i=0; i < num_documents(); ++i) { +      seen_terms += m_documents.at(i).size(); +      if (seen_terms >= term_threshold) { +        end_points.push_back(i+1); +        term_threshold += segment_size; +        if (m_rank == 0) std::cerr << "\t\t" << i+1 << ": " <<  seen_terms << " terms, " << 100*seen_terms / (float)num_terms() << "%" << std::endl; +      } +    } +    m_start = (m_rank == 0 ? 0 : end_points.at(m_rank-1)); +    m_end = (m_rank == m_size-1 ? num_documents() : end_points.at(m_rank)); + +    return result; +  } + +  void +  bounds(int* start, int* end) const { +    *start = m_start; +    *end = m_end; +  } + + + +protected: +  int m_rank, m_size; +  int m_start, m_end; +}; + +#endif // _MPI_CORPUS_HH diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc index 967c3a77..d6e22af6 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.cc +++ b/gi/pyp-topics/src/mpi-pyp-topics.cc @@ -77,7 +77,7 @@ void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples,        int new_topic = -1;        if (freq > frequency_cutoff            && (!max_contexts_per_document || term_index < max_contexts_per_document)) { -        new_topic = sample(document_id, term); +        new_topic = sample(i, term);          //new_topic = document_id % m_num_topics;          // add the new topic to the PYPs diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc index 7bb890d2..e05e0eac 100644 --- a/gi/pyp-topics/src/mpi-train-contexts.cc +++ b/gi/pyp-topics/src/mpi-train-contexts.cc @@ -58,6 +58,7 @@ int main(int argc, char **argv)        ("backoff-type", value<string>(), "backoff type: none|simple")  //      ("filter-singleton-contexts", "filter singleton contexts")        ("hierarchical-topics", "Use a backoff hierarchical PYP as the P0 for the document topics distribution.") +      ("binary-counts,b", "Use binary rather than integer counts for contexts.")        ("freq-cutoff-start", value<int>()->default_value(0), "initial frequency cutoff.")        ("freq-cutoff-end", value<int>()->default_value(0), "final frequency cutoff.")        ("freq-cutoff-interval", value<int>()->default_value(0), "number of iterations between frequency decrement.") @@ -107,7 +108,7 @@ int main(int argc, char **argv)    //ContextsCorpus contexts_corpus;    MPICorpus contexts_corpus; -  contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false); +  contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false, vm.count("binary-counts"));    int mpi_start = 0, mpi_end = 0;    contexts_corpus.bounds(&mpi_start, &mpi_end);    std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." << std::endl;  | 
