summary refs log tree commit diff
path: root/gi/pyp-topics
diff options
context:
space:
mode:
author philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-25 02:11:28 +0000
committer philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-25 02:11:28 +0000
commit abf2311a2665097de2fd27fb83e1acbbe2a26f59 (patch)
tree 5680454bf719ccda38d3cb440b310d525ff70555 /gi/pyp-topics
parent 8760b7b41970bfbea6ba124a63633c139331b512 (diff)
added missing file.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@402 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics')
-rw-r--r-- gi/pyp-topics/src/contexts_corpus.cc 18
-rw-r--r-- gi/pyp-topics/src/contexts_corpus.hh 3
-rw-r--r-- gi/pyp-topics/src/mpi-corpus.hh 69
-rw-r--r-- gi/pyp-topics/src/mpi-pyp-topics.cc 2
-rw-r--r-- gi/pyp-topics/src/mpi-train-contexts.cc 3
5 files changed, 89 insertions, 6 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
index 1cf69429..92b1b34c 100644
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -15,6 +15,8 @@ using namespace std;
// ContextsCorpus
//////////////////////////////////////////////////
+bool read_callback_binary_contexts = false;
+
void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
assert(new_contexts.contexts.size() == new_contexts.counts.size());
@@ -50,9 +52,15 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*
//int count = new_contexts.counts[i];
int count = new_contexts.counts.at(i).second;
- for (int j=0; j<count; ++j)
+ if (read_callback_binary_contexts) {
doc->push_back(id);
- corpus_ptr->m_num_terms += count;
+ corpus_ptr->m_num_terms++;
+ }
+ else {
+ for (int j=0; j<count; ++j)
+ doc->push_back(id);
+ corpus_ptr->m_num_terms += count;
+ }
// generate the backoff map
if (backoff_gen) {
@@ -104,6 +112,7 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void
for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
int context_index = new_contexts.counts.at(i).first;
int count = new_contexts.counts.at(i).second;
+ //if (read_callback_binary_contexts) count = 1;
//int count = new_contexts.counts[i];
pair<map<string,int>::iterator,bool> result
= context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count));
@@ -116,7 +125,10 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void
unsigned ContextsCorpus::read_contexts(const string &filename,
BackoffGenerator* backoff_gen_ptr,
- bool /*filter_singeltons*/) {
+ bool /*filter_singeltons*/,
+ bool binary_contexts) {
+ read_callback_binary_contexts = binary_contexts;
+
map<string,int> counts;
//if (filter_singeltons)
{
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index f7dad21f..dd721361 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -55,7 +55,8 @@ public:
virtual unsigned read_contexts(const std::string &filename,
BackoffGenerator* backoff_gen=0,
- bool filter_singeltons=false);
+ bool filter_singeltons=false,
+ bool binary_contexts=false);
TermBackoffPtr backoff_index() {
return m_backoff;
diff --git a/gi/pyp-topics/src/mpi-corpus.hh b/gi/pyp-topics/src/mpi-corpus.hh
new file mode 100644
index 00000000..f5c478a9
--- /dev/null
+++ b/gi/pyp-topics/src/mpi-corpus.hh
@@ -0,0 +1,69 @@
+#ifndef _MPI_CORPUS_HH
+#define _MPI_CORPUS_HH
+
+#include <vector>
+#include <string>
+#include <map>
+#include <tr1/unordered_map>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+#include <boost/mpi/environment.hpp>
+#include <boost/mpi/communicator.hpp>
+
+#include "contexts_corpus.hh"
+
+
+////////////////////////////////////////////////////////////////
+// MPICorpus
+////////////////////////////////////////////////////////////////
+
+class MPICorpus : public ContextsCorpus {
+public:
+ MPICorpus() : ContextsCorpus() {
+ boost::mpi::communicator world;
+ m_rank = world.rank();
+ m_size = world.size();
+ m_start = -1;
+ m_end = -1;
+ }
+ virtual ~MPICorpus() {}
+
+ virtual unsigned read_contexts(const std::string &filename,
+ BackoffGenerator* backoff_gen=0,
+ bool filter_singeltons=false,
+ bool binary_contexts=false) {
+ unsigned result = ContextsCorpus::read_contexts(filename, backoff_gen, filter_singeltons, binary_contexts);
+
+ if (m_rank == 0) std::cerr << "\tLoad balancing terms per mpi segment:" << std::endl;
+ float segment_size = num_terms() / m_size;
+ float term_threshold = segment_size;
+ int seen_terms = 0;
+ std::vector<int> end_points;
+ for (int i=0; i < num_documents(); ++i) {
+ seen_terms += m_documents.at(i).size();
+ if (seen_terms >= term_threshold) {
+ end_points.push_back(i+1);
+ term_threshold += segment_size;
+ if (m_rank == 0) std::cerr << "\t\t" << i+1 << ": " << seen_terms << " terms, " << 100*seen_terms / (float)num_terms() << "%" << std::endl;
+ }
+ }
+ m_start = (m_rank == 0 ? 0 : end_points.at(m_rank-1));
+ m_end = (m_rank == m_size-1 ? num_documents() : end_points.at(m_rank));
+
+ return result;
+ }
+
+ void
+ bounds(int* start, int* end) const {
+ *start = m_start;
+ *end = m_end;
+ }
+
+
+
+protected:
+ int m_rank, m_size;
+ int m_start, m_end;
+};
+
+#endif // _MPI_CORPUS_HH
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc
index 967c3a77..d6e22af6 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.cc
+++ b/gi/pyp-topics/src/mpi-pyp-topics.cc
@@ -77,7 +77,7 @@ void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples,
int new_topic = -1;
if (freq > frequency_cutoff
&& (!max_contexts_per_document || term_index < max_contexts_per_document)) {
- new_topic = sample(document_id, term);
+ new_topic = sample(i, term);
//new_topic = document_id % m_num_topics;
// add the new topic to the PYPs
diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc
index 7bb890d2..e05e0eac 100644
--- a/gi/pyp-topics/src/mpi-train-contexts.cc
+++ b/gi/pyp-topics/src/mpi-train-contexts.cc
@@ -58,6 +58,7 @@ int main(int argc, char **argv)
("backoff-type", value<string>(), "backoff type: none|simple")
// ("filter-singleton-contexts", "filter singleton contexts")
("hierarchical-topics", "Use a backoff hierarchical PYP as the P0 for the document topics distribution.")
+ ("binary-counts,b", "Use binary rather than integer counts for contexts.")
("freq-cutoff-start", value<int>()->default_value(0), "initial frequency cutoff.")
("freq-cutoff-end", value<int>()->default_value(0), "final frequency cutoff.")
("freq-cutoff-interval", value<int>()->default_value(0), "number of iterations between frequency decrement.")
@@ -107,7 +108,7 @@ int main(int argc, char **argv)
//ContextsCorpus contexts_corpus;
MPICorpus contexts_corpus;
- contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false);
+ contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false, vm.count("binary-counts"));
int mpi_start = 0, mpi_end = 0;
contexts_corpus.bounds(&mpi_start, &mpi_end);
std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." << std::endl;