diff options
Diffstat (limited to 'gi/pyp-topics/src')
| -rw-r--r-- | gi/pyp-topics/src/corpus.hh | 2 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 29 | 
2 files changed, 26 insertions, 5 deletions
| diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh index 3dd17cf9..c1b0ceda 100644 --- a/gi/pyp-topics/src/corpus.hh +++ b/gi/pyp-topics/src/corpus.hh @@ -28,6 +28,8 @@ public:      const_iterator begin() const { return m_documents.begin(); }      const_iterator end() const { return m_documents.end(); } +    const Document& at(size_t i) const { return m_documents.at(i); } +      int num_documents() const { return m_documents.size(); }      int num_terms() const { return m_num_terms; }      int num_types() const { return m_num_types; } diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index 9b43d6d1..f3369f2e 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -1,4 +1,5 @@  #include "pyp-topics.hh" +//#include "mt19937ar.h"  void PYPTopics::sample(const Corpus& corpus, int samples) {    if (!m_backoff.get()) { @@ -45,18 +46,35 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {      }    } +  int* randomDocIndices = new int[corpus.num_documents()]; +  for (int i = 0; i < corpus.num_documents(); ++i) +	  randomDocIndices[i] = i; +    // Sampling phase    for (int curr_sample=0; curr_sample < samples; ++curr_sample) {      std::cerr << "\n  -- Sample " << curr_sample << " "; std::cerr.flush(); +    // Randomize the corpus indexing array +    int tmp; +    for (int i = corpus.num_documents()-1; i > 0; --i) +    { +    	int j = (int)(mt_genrand_real1() * i); +    	tmp = randomDocIndices[i]; +    	randomDocIndices[i] = randomDocIndices[j]; +    	randomDocIndices[j] = tmp; +    } +      // for each document in the corpus -    int document_id=0; -    for (Corpus::const_iterator corpusIt=corpus.begin();  -         corpusIt != corpus.end(); ++corpusIt, ++document_id) { +    int document_id; +    for (int i=0; i<corpus.num_documents(); ++i) +    { +    	document_id = randomDocIndices[i]; +        // for each term in the document        int term_index=0; -      for (Document::const_iterator docIt=corpusIt->begin(); -           docIt != corpusIt->end(); ++docIt, ++term_index) { +      Document::const_iterator docEnd = corpus.at(document_id).end(); +      for (Document::const_iterator docIt=corpus.at(document_id).begin(); +           docIt != docEnd; ++docIt, ++term_index) {          Term term = *docIt;          // remove the prevous topic from the PYPs @@ -101,6 +119,7 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {        std::cerr << " ||| LLH=" << log_p << std::endl;      }    } +  delete [] randomDocIndices;  }  void PYPTopics::decrement(const Term& term, int topic, int level) { | 
