diff options
Diffstat (limited to 'gi/pyp-topics/src/pyp-topics.cc')
-rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 24 |
1 files changed, 17 insertions, 7 deletions
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index 3614fb22..2cc1fc79 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -4,7 +4,8 @@ //#include <boost/date_time/posix_time/posix_time_types.hpp> void PYPTopics::sample_corpus(const Corpus& corpus, int samples, int freq_cutoff_start, int freq_cutoff_end, - int freq_cutoff_interval) { + int freq_cutoff_interval, + int max_contexts_per_document) { Timer timer; if (!m_backoff.get()) { @@ -54,11 +55,12 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, // sample a new_topic //int new_topic = (topic_counter % m_num_topics); int freq = corpus.context_count(term); - int new_topic = (freq > frequency_cutoff ? (document_id % m_num_topics) : -1); + int new_topic = -1; + if (freq > frequency_cutoff + && (!max_contexts_per_document || term_index < max_contexts_per_document)) { + new_topic = document_id % m_num_topics; - // add the new topic to the PYPs - m_corpus_topics[document_id][term_index] = new_topic; - if (freq > frequency_cutoff) { + // add the new topic to the PYPs increment(term, new_topic); if (m_use_topic_pyp) { @@ -69,6 +71,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, } else m_document_pyps[document_id].increment(new_topic, m_topic_p0); } + + m_corpus_topics[document_id][term_index] = new_topic; } } std::cerr << " Initialized in " << timer.Elapsed() << " seconds\n"; @@ -94,6 +98,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, // Randomize the corpus indexing array int tmp; + int processed_terms=0; for (int i = corpus.num_documents()-1; i > 0; --i) { //i+1 since j \in [0,i] but rnd() \in [0,1) @@ -106,8 +111,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, // for each document in the corpus int document_id; - for (int i=0; i<corpus.num_documents(); ++i) - { + for (int i=0; i<corpus.num_documents(); ++i) { document_id = randomDocIndices[i]; // for each term in the document @@ -115,11 +119,16 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, Document::const_iterator docEnd = corpus.at(document_id).end(); for (Document::const_iterator docIt=corpus.at(document_id).begin(); docIt != docEnd; ++docIt, ++term_index) { + if (max_contexts_per_document && term_index > max_contexts_per_document) + break; + Term term = *docIt; int freq = corpus.context_count(term); if (freq < frequency_cutoff) continue; + processed_terms++; + // remove the prevous topic from the PYPs int current_topic = m_corpus_topics[document_id][term_index]; // a negative label mean that term hasn't been sampled yet @@ -150,6 +159,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, std::cerr << "."; std::cerr.flush(); } } + std::cerr << " ||| sampled " << processed_terms << " terms."; if (curr_sample != 0 && curr_sample % 10 == 0) { std::cerr << " ||| time=" << (timer.Elapsed() / 10.0) << " sec/sample" << std::endl; |