pyp-sampler: added randomization to sequence in which corpus is sampled

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@35 ec762483-ff6d-05da-a07a-a48fb63a330f
author: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 14:16:49 +0000
committer: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 14:16:49 +0000
commit: f7e25929adebd260f7e7d0bf12e05a37abbe1779 (patch)
tree: a76d5ef9412f4058ef632512b351c6ea03492e1b /gi/pyp-topics/src
parent: d3bd46ad52c26949d7523b135c22462108c2b297 (diff)
2 files changed, 26 insertions, 5 deletions
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index 3dd17cf9..c1b0ceda 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -28,6 +28,8 @@ public:
     const_iterator begin() const { return m_documents.begin(); }
     const_iterator end() const { return m_documents.end(); }
 
+    const Document& at(size_t i) const { return m_documents.at(i); }  // Read-only access to the i-th document; forwards to m_documents.at(i).
+
     int num_documents() const { return m_documents.size(); }
     int num_terms() const { return m_num_terms; }
     int num_types() const { return m_num_types; }
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index 9b43d6d1..f3369f2e 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -1,4 +1,5 @@
 #include "pyp-topics.hh"
+//#include "mt19937ar.h"
 
 void PYPTopics::sample(const Corpus& corpus, int samples) {
   if (!m_backoff.get()) {
@@ -45,18 +46,35 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {
     }
   }
 
+  int* randomDocIndices = new int[corpus.num_documents()];
+  for (int i = 0; i < corpus.num_documents(); ++i)
+	  randomDocIndices[i] = i;
+
   // Sampling phase
   for (int curr_sample=0; curr_sample < samples; ++curr_sample) {
     std::cerr << "\n  -- Sample " << curr_sample << " "; std::cerr.flush();
 
+    // Randomize the corpus indexing array
+    int tmp;
+    for (int i = corpus.num_documents()-1; i > 0; --i)
+    {
+    	int j = (int)(mt_genrand_real1() * i);
+    	tmp = randomDocIndices[i];
+    	randomDocIndices[i] = randomDocIndices[j];
+    	randomDocIndices[j] = tmp;
+    }
+
     // for each document in the corpus
-    int document_id=0;
-    for (Corpus::const_iterator corpusIt=corpus.begin(); 
-         corpusIt != corpus.end(); ++corpusIt, ++document_id) {
+    int document_id;
+    for (int i=0; i<corpus.num_documents(); ++i)
+    {
+    	document_id = randomDocIndices[i];
+
       // for each term in the document
       int term_index=0;
-      for (Document::const_iterator docIt=corpusIt->begin();
-           docIt != corpusIt->end(); ++docIt, ++term_index) {
+      Document::const_iterator docEnd = corpus.at(document_id).end();
+      for (Document::const_iterator docIt=corpus.at(document_id).begin();
+           docIt != docEnd; ++docIt, ++term_index) {
         Term term = *docIt;
 
         // remove the prevous topic from the PYPs
@@ -101,6 +119,7 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {
       std::cerr << " ||| LLH=" << log_p << std::endl;
     }
   }
+  delete [] randomDocIndices;
 }
 
 void PYPTopics::decrement(const Term& term, int topic, int level) {
author	bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-28 14:16:49 +0000
committer	bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-28 14:16:49 +0000
commit	f7e25929adebd260f7e7d0bf12e05a37abbe1779 (patch)
tree	a76d5ef9412f4058ef632512b351c6ea03492e1b /gi/pyp-topics/src
parent	d3bd46ad52c26949d7523b135c22462108c2b297 (diff)