summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/src
diff options
context:
space:
mode:
author bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 14:16:49 +0000
committer bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 14:16:49 +0000
commit 088725c4708e83343154d1bed9dee18286446eaf (patch)
tree f66981687b7279805314ff7cd683175a53ebe478 /gi/pyp-topics/src
parent 3e292f48ae09fef3ba31bd2e5082997b15385fcc (diff)
pyp-sampler: added randomization to sequence in which corpus is sampled
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@35 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src')
-rw-r--r-- gi/pyp-topics/src/corpus.hh 2
-rw-r--r-- gi/pyp-topics/src/pyp-topics.cc 29
2 files changed, 26 insertions, 5 deletions
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index 3dd17cf9..c1b0ceda 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -28,6 +28,8 @@ public:
const_iterator begin() const { return m_documents.begin(); }
const_iterator end() const { return m_documents.end(); }
+ const Document& at(size_t i) const { return m_documents.at(i); }
+
int num_documents() const { return m_documents.size(); }
int num_terms() const { return m_num_terms; }
int num_types() const { return m_num_types; }
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index 9b43d6d1..f3369f2e 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -1,4 +1,5 @@
#include "pyp-topics.hh"
+//#include "mt19937ar.h"
void PYPTopics::sample(const Corpus& corpus, int samples) {
if (!m_backoff.get()) {
@@ -45,18 +46,35 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {
}
}
+ int* randomDocIndices = new int[corpus.num_documents()];
+ for (int i = 0; i < corpus.num_documents(); ++i)
+ randomDocIndices[i] = i;
+
// Sampling phase
for (int curr_sample=0; curr_sample < samples; ++curr_sample) {
std::cerr << "\n -- Sample " << curr_sample << " "; std::cerr.flush();
+ // Randomize the corpus indexing array
+ int tmp;
+ for (int i = corpus.num_documents()-1; i > 0; --i)
+ {
+ int j = (int)(mt_genrand_real1() * i);
+ tmp = randomDocIndices[i];
+ randomDocIndices[i] = randomDocIndices[j];
+ randomDocIndices[j] = tmp;
+ }
+
// for each document in the corpus
- int document_id=0;
- for (Corpus::const_iterator corpusIt=corpus.begin();
- corpusIt != corpus.end(); ++corpusIt, ++document_id) {
+ int document_id;
+ for (int i=0; i<corpus.num_documents(); ++i)
+ {
+ document_id = randomDocIndices[i];
+
// for each term in the document
int term_index=0;
- for (Document::const_iterator docIt=corpusIt->begin();
- docIt != corpusIt->end(); ++docIt, ++term_index) {
+ Document::const_iterator docEnd = corpus.at(document_id).end();
+ for (Document::const_iterator docIt=corpus.at(document_id).begin();
+ docIt != docEnd; ++docIt, ++term_index) {
Term term = *docIt;
// remove the previous topic from the PYPs
@@ -101,6 +119,7 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {
std::cerr << " ||| LLH=" << log_p << std::endl;
}
}
+ delete [] randomDocIndices;
}
void PYPTopics::decrement(const Term& term, int topic, int level) {