From 088725c4708e83343154d1bed9dee18286446eaf Mon Sep 17 00:00:00 2001 From: bothameister Date: Mon, 28 Jun 2010 14:16:49 +0000 Subject: pyp-sampler: added randomization to sequence in which corpus is sampled git-svn-id: https://ws10smt.googlecode.com/svn/trunk@35 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pyp-topics/src/corpus.hh | 2 ++ gi/pyp-topics/src/pyp-topics.cc | 29 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh index 3dd17cf9..c1b0ceda 100644 --- a/gi/pyp-topics/src/corpus.hh +++ b/gi/pyp-topics/src/corpus.hh @@ -28,6 +28,8 @@ public: const_iterator begin() const { return m_documents.begin(); } const_iterator end() const { return m_documents.end(); } + const Document& at(size_t i) const { return m_documents.at(i); } + int num_documents() const { return m_documents.size(); } int num_terms() const { return m_num_terms; } int num_types() const { return m_num_types; } diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index 9b43d6d1..f3369f2e 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -1,4 +1,5 @@ #include "pyp-topics.hh" +//#include "mt19937ar.h" void PYPTopics::sample(const Corpus& corpus, int samples) { if (!m_backoff.get()) { @@ -45,18 +46,35 @@ void PYPTopics::sample(const Corpus& corpus, int samples) { } } + int* randomDocIndices = new int[corpus.num_documents()]; + for (int i = 0; i < corpus.num_documents(); ++i) + randomDocIndices[i] = i; + // Sampling phase for (int curr_sample=0; curr_sample < samples; ++curr_sample) { std::cerr << "\n -- Sample " << curr_sample << " "; std::cerr.flush(); + // Randomize the corpus indexing array + int tmp; + for (int i = corpus.num_documents()-1; i > 0; --i) + { + int j = (int)(mt_genrand_real1() * i); + tmp = randomDocIndices[i]; + randomDocIndices[i] = randomDocIndices[j]; + randomDocIndices[j] = tmp; + } + // for each document in the corpus 
- int document_id=0; - for (Corpus::const_iterator corpusIt=corpus.begin(); - corpusIt != corpus.end(); ++corpusIt, ++document_id) { + int document_id; + for (int i=0; i<corpus.num_documents(); ++i) { + document_id = randomDocIndices[i]; + // for each term in the document int term_index=0; - for (Document::const_iterator docIt=corpusIt->begin(); - docIt != corpusIt->end(); ++docIt, ++term_index) { + Document::const_iterator docEnd = corpus.at(document_id).end(); + for (Document::const_iterator docIt=corpus.at(document_id).begin(); + docIt != docEnd; ++docIt, ++term_index) { Term term = *docIt; // remove the prevous topic from the PYPs @@ -101,6 +119,7 @@ void PYPTopics::sample(const Corpus& corpus, int samples) { std::cerr << " ||| LLH=" << log_p << std::endl; } } + delete [] randomDocIndices; } void PYPTopics::decrement(const Term& term, int topic, int level) { -- cgit v1.2.3