summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- gi/pyp-topics/src/contexts_corpus.hh 8
-rw-r--r-- gi/pyp-topics/src/corpus.hh 2
-rw-r--r-- gi/pyp-topics/src/makefile.depend 13
-rw-r--r-- gi/pyp-topics/src/mpi-pyp-topics.cc 26
-rw-r--r-- gi/pyp-topics/src/mpi-pyp-topics.hh 8
-rw-r--r-- gi/pyp-topics/src/mpi-train-contexts.cc 20
-rw-r--r-- gi/pyp-topics/src/pyp-topics.cc 6
7 files changed, 53 insertions, 30 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index f3c25454..f7dad21f 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -53,9 +53,9 @@ public:
ContextsCorpus() : m_backoff(new TermBackoff) {}
virtual ~ContextsCorpus() {}
- unsigned read_contexts(const std::string &filename,
- BackoffGenerator* backoff_gen=0,
- bool filter_singeltons=false);
+ virtual unsigned read_contexts(const std::string &filename,
+ BackoffGenerator* backoff_gen=0,
+ bool filter_singeltons=false);
TermBackoffPtr backoff_index() {
return m_backoff;
@@ -77,7 +77,7 @@ public:
return m_keys.at(i);
}
-private:
+protected:
TermBackoffPtr m_backoff;
Dict m_dict;
std::vector<std::string> m_keys;
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index 24981946..2aa03527 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -25,7 +25,7 @@ public:
Corpus();
virtual ~Corpus() {}
- unsigned read(const std::string &filename);
+ virtual unsigned read(const std::string &filename);
const_iterator begin() const { return m_documents.begin(); }
const_iterator end() const { return m_documents.end(); }
diff --git a/gi/pyp-topics/src/makefile.depend b/gi/pyp-topics/src/makefile.depend
index d8ef78d8..9b8e306c 100644
--- a/gi/pyp-topics/src/makefile.depend
+++ b/gi/pyp-topics/src/makefile.depend
@@ -1442,6 +1442,9 @@ mpi-pyp-topics.o: mpi-pyp-topics.cc \
/home/pblunsom/packages/include/boost/random/linear_congruential.hpp \
/home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \
/home/pblunsom/packages/include/boost/random/detail/seed.hpp \
+ /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \
+ /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \
+ /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \
/home/pblunsom/packages/include/boost/mpi/environment.hpp mpi-pyp.hh \
/home/pblunsom/packages/include/boost/tuple/tuple.hpp \
/home/pblunsom/packages/include/boost/ref.hpp \
@@ -2151,7 +2154,10 @@ mpi-train-contexts.o: mpi-train-contexts.cc \
/home/pblunsom/packages/include/boost/random/mersenne_twister.hpp \
/home/pblunsom/packages/include/boost/random/linear_congruential.hpp \
/home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \
- /home/pblunsom/packages/include/boost/random/detail/seed.hpp mpi-pyp.hh \
+ /home/pblunsom/packages/include/boost/random/detail/seed.hpp \
+ /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \
+ /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \
+ /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp mpi-pyp.hh \
/home/pblunsom/packages/include/boost/tuple/tuple.hpp \
/home/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \
/home/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \
@@ -2239,14 +2245,13 @@ mpi-train-contexts.o: mpi-train-contexts.cc \
/home/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \
/home/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \
/home/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \
- slice-sampler.h log_add.h mt19937ar.h corpus.hh contexts_corpus.hh \
- contexts_lexer.h ../../../decoder/dict.h \
+ slice-sampler.h log_add.h mt19937ar.h corpus.hh mpi-corpus.hh \
+ contexts_corpus.hh contexts_lexer.h ../../../decoder/dict.h \
/home/pblunsom/packages/include/boost/functional/hash.hpp \
/home/pblunsom/packages/include/boost/functional/hash/hash.hpp \
/home/pblunsom/packages/include/boost/functional/hash/hash_fwd.hpp \
/home/pblunsom/packages/include/boost/functional/hash/detail/hash_float.hpp \
/home/pblunsom/packages/include/boost/functional/hash/detail/float_functions.hpp \
- /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \
/home/pblunsom/packages/include/boost/functional/hash/detail/limits.hpp \
/home/pblunsom/packages/include/boost/integer/static_log2.hpp \
/home/pblunsom/packages/include/boost/functional/hash/detail/hash_float_generic.hpp \
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc
index fa951156..967c3a77 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.cc
+++ b/gi/pyp-topics/src/mpi-pyp-topics.cc
@@ -4,13 +4,14 @@
#include "mpi-pyp-topics.hh"
//#include <boost/date_time/posix_time/posix_time_types.hpp>
-void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,
+void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples,
int freq_cutoff_start, int freq_cutoff_end,
int freq_cutoff_interval,
int max_contexts_per_document) {
Timer timer;
- int documents = corpus.num_documents();
+ //int documents = corpus.num_documents();
+ /*
m_mpi_start = 0;
m_mpi_end = documents;
if (m_size != 1) {
@@ -19,6 +20,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,
if (m_rank == m_size-1) m_mpi_end = documents;
else m_mpi_end = (documents / m_size)*(m_rank+1);
}
+ */
+ corpus.bounds(&m_mpi_start, &m_mpi_end);
int local_documents = m_mpi_end - m_mpi_start;
if (!m_backoff.get()) {
@@ -74,7 +77,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,
int new_topic = -1;
if (freq > frequency_cutoff
&& (!max_contexts_per_document || term_index < max_contexts_per_document)) {
- new_topic = document_id % m_num_topics;
+ new_topic = sample(document_id, term);
+ //new_topic = document_id % m_num_topics;
// add the new topic to the PYPs
increment(term, new_topic);
@@ -336,7 +340,8 @@ MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int leve
Term backoff_term = (*m_backoff)[term];
if (!m_backoff->is_null(backoff_term)) {
assert (level < m_backoff->order());
- p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+ //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+ p0 = prob(backoff_term, topic, level+1);
}
else
p0 = m_term_p0;
@@ -373,10 +378,11 @@ int MPIPYPTopics::max_topic() const {
}
}
assert(current_topic >= 0);
- return current_topic;
+ assert(current_max >= 0);
+ return current_max;
}
-int MPIPYPTopics::max(const DocumentId& true_doc) const {
+std::pair<int,MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc) const {
//std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;
// collect probs
F current_max=0.0;
@@ -399,10 +405,11 @@ int MPIPYPTopics::max(const DocumentId& true_doc) const {
}
}
assert(current_topic >= 0);
- return current_topic;
+ assert(current_max >= 0);
+ return std::make_pair(current_topic, current_max);
}
-int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {
+std::pair<int,MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {
//std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;
// collect probs
F current_max=0.0;
@@ -426,7 +433,8 @@ int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {
}
}
assert(current_topic >= 0);
- return current_topic;
+ assert(current_max >= 0);
+ return std::make_pair(current_topic, current_max);
}
std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const {
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh
index 4a4433e6..d96bc4e5 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.hh
+++ b/gi/pyp-topics/src/mpi-pyp-topics.hh
@@ -16,7 +16,7 @@
#include "mpi-pyp.hh"
-#include "corpus.hh"
+#include "mpi-corpus.hh"
class MPIPYPTopics {
public:
@@ -37,14 +37,14 @@ public:
m_am_root = (m_rank == 0);
}
- void sample_corpus(const Corpus& corpus, int samples,
+ void sample_corpus(const MPICorpus& corpus, int samples,
int freq_cutoff_start=0, int freq_cutoff_end=0,
int freq_cutoff_interval=0,
int max_contexts_per_document=0);
int sample(const DocumentId& doc, const Term& term);
- int max(const DocumentId& doc, const Term& term) const;
- int max(const DocumentId& doc) const;
+ std::pair<int,F> max(const DocumentId& doc, const Term& term) const;
+ std::pair<int,F> max(const DocumentId& doc) const;
int max_topic() const;
void set_backoff(const std::string& filename) {
diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc
index 4f966a65..7bb890d2 100644
--- a/gi/pyp-topics/src/mpi-train-contexts.cc
+++ b/gi/pyp-topics/src/mpi-train-contexts.cc
@@ -15,7 +15,7 @@
// Local
#include "mpi-pyp-topics.hh"
#include "corpus.hh"
-#include "contexts_corpus.hh"
+#include "mpi-corpus.hh"
#include "gzstream.hh"
static const char *REVISION = "$Rev: 170 $";
@@ -105,8 +105,13 @@ int main(int argc, char **argv)
}
}
- ContextsCorpus contexts_corpus;
+ //ContextsCorpus contexts_corpus;
+ MPICorpus contexts_corpus;
contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false);
+ int mpi_start = 0, mpi_end = 0;
+ contexts_corpus.bounds(&mpi_start, &mpi_end);
+ std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." << std::endl;
+
model.set_backoff(contexts_corpus.backoff_index());
if (backoff_gen)
@@ -121,13 +126,15 @@ int main(int argc, char **argv)
if (vm.count("document-topics-out")) {
std::ofstream documents_out((vm["document-topics-out"].as<string>() + ".pyp-process-" + boost::lexical_cast<std::string>(rank)).c_str());
- int documents = contexts_corpus.num_documents();
+ //int documents = contexts_corpus.num_documents();
+ /*
int mpi_start = 0, mpi_end = documents;
if (world.size() != 1) {
mpi_start = (documents / world.size()) * rank;
if (rank == world.size()-1) mpi_end = documents;
else mpi_end = (documents / world.size())*(rank+1);
}
+ */
map<int,int> all_terms;
for (int document_id=mpi_start; document_id<mpi_end; ++document_id) {
@@ -143,13 +150,14 @@ int main(int argc, char **argv)
all_terms[*docIt] = all_terms[*docIt] + 1;
}
documents_out << contexts_corpus.key(document_id) << '\t';
- documents_out << model.max(document_id) << " " << doc.size() << " ||| ";
+ documents_out << model.max(document_id).first << " " << doc.size() << " ||| ";
for (std::vector<int>::const_iterator termIt=unique_terms.begin(); termIt != unique_terms.end(); ++termIt) {
if (termIt != unique_terms.begin())
documents_out << " ||| ";
vector<std::string> strings = contexts_corpus.context2string(*termIt);
copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " "));
- documents_out << "||| C=" << model.max(document_id, *termIt);
+ std::pair<int,MPIPYPTopics::F> maxinfo = model.max(document_id, *termIt);
+ documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second;
}
documents_out <<endl;
}
@@ -173,7 +181,7 @@ int main(int argc, char **argv)
default_topics << model.max_topic() <<endl;
for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
vector<std::string> strings = contexts_corpus.context2string(termIt->first);
- default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| ";
+ default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| ";
copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));
default_topics <<endl;
}
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index 06499291..4c777f0c 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -58,7 +58,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
int new_topic = -1;
if (freq > frequency_cutoff
&& (!max_contexts_per_document || term_index < max_contexts_per_document)) {
- new_topic = document_id % m_num_topics;
+ new_topic = sample(document_id, term);
+ //new_topic = document_id % m_num_topics;
// add the new topic to the PYPs
increment(term, new_topic);
@@ -314,7 +315,8 @@ PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) con
Term backoff_term = (*m_backoff)[term];
if (!m_backoff->is_null(backoff_term)) {
assert (level < m_backoff->order());
- p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+ //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+ p0 = prob(backoff_term, topic, level+1);
}
else
p0 = m_term_p0;