summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/pyp-topics.cc
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pyp-topics/src/pyp-topics.cc')
-rw-r--r--gi/pyp-topics/src/pyp-topics.cc24
1 files changed, 17 insertions, 7 deletions
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index 3614fb22..2cc1fc79 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -4,7 +4,8 @@
//#include <boost/date_time/posix_time/posix_time_types.hpp>
void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
int freq_cutoff_start, int freq_cutoff_end,
- int freq_cutoff_interval) {
+ int freq_cutoff_interval,
+ int max_contexts_per_document) {
Timer timer;
if (!m_backoff.get()) {
@@ -54,11 +55,12 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
// sample a new_topic
//int new_topic = (topic_counter % m_num_topics);
int freq = corpus.context_count(term);
- int new_topic = (freq > frequency_cutoff ? (document_id % m_num_topics) : -1);
+ int new_topic = -1;
+ if (freq > frequency_cutoff
+ && (!max_contexts_per_document || term_index < max_contexts_per_document)) {
+ new_topic = document_id % m_num_topics;
- // add the new topic to the PYPs
- m_corpus_topics[document_id][term_index] = new_topic;
- if (freq > frequency_cutoff) {
+ // add the new topic to the PYPs
increment(term, new_topic);
if (m_use_topic_pyp) {
@@ -69,6 +71,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
}
else m_document_pyps[document_id].increment(new_topic, m_topic_p0);
}
+
+ m_corpus_topics[document_id][term_index] = new_topic;
}
}
std::cerr << " Initialized in " << timer.Elapsed() << " seconds\n";
@@ -94,6 +98,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
// Randomize the corpus indexing array
int tmp;
+ int processed_terms=0;
for (int i = corpus.num_documents()-1; i > 0; --i)
{
//i+1 since j \in [0,i] but rnd() \in [0,1)
@@ -106,8 +111,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
// for each document in the corpus
int document_id;
- for (int i=0; i<corpus.num_documents(); ++i)
- {
+ for (int i=0; i<corpus.num_documents(); ++i) {
document_id = randomDocIndices[i];
// for each term in the document
@@ -115,11 +119,16 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
Document::const_iterator docEnd = corpus.at(document_id).end();
for (Document::const_iterator docIt=corpus.at(document_id).begin();
docIt != docEnd; ++docIt, ++term_index) {
+ if (max_contexts_per_document && term_index > max_contexts_per_document)
+ break;
+
Term term = *docIt;
int freq = corpus.context_count(term);
if (freq < frequency_cutoff)
continue;
+ processed_terms++;
+
// remove the previous topic from the PYPs
int current_topic = m_corpus_topics[document_id][term_index];
// a negative label means that the term hasn't been sampled yet
@@ -150,6 +159,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
std::cerr << "."; std::cerr.flush();
}
}
+ std::cerr << " ||| sampled " << processed_terms << " terms.";
if (curr_sample != 0 && curr_sample % 10 == 0) {
std::cerr << " ||| time=" << (timer.Elapsed() / 10.0) << " sec/sample" << std::endl;