diff options
author | bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 14:16:49 +0000 |
---|---|---|
committer | bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 14:16:49 +0000 |
commit | f7e25929adebd260f7e7d0bf12e05a37abbe1779 (patch) | |
tree | a76d5ef9412f4058ef632512b351c6ea03492e1b /gi | |
parent | d3bd46ad52c26949d7523b135c22462108c2b297 (diff) |
pyp-sampler: added randomization to sequence in which corpus is sampled
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@35 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rw-r--r-- | gi/pyp-topics/src/corpus.hh | 2 | ||||
-rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 29 |
2 files changed, 26 insertions, 5 deletions
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh index 3dd17cf9..c1b0ceda 100644 --- a/gi/pyp-topics/src/corpus.hh +++ b/gi/pyp-topics/src/corpus.hh @@ -28,6 +28,8 @@ public: const_iterator begin() const { return m_documents.begin(); } const_iterator end() const { return m_documents.end(); } + const Document& at(size_t i) const { return m_documents.at(i); } + int num_documents() const { return m_documents.size(); } int num_terms() const { return m_num_terms; } int num_types() const { return m_num_types; } diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index 9b43d6d1..f3369f2e 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -1,4 +1,5 @@ #include "pyp-topics.hh" +//#include "mt19937ar.h" void PYPTopics::sample(const Corpus& corpus, int samples) { if (!m_backoff.get()) { @@ -45,18 +46,35 @@ void PYPTopics::sample(const Corpus& corpus, int samples) { } } + int* randomDocIndices = new int[corpus.num_documents()]; + for (int i = 0; i < corpus.num_documents(); ++i) + randomDocIndices[i] = i; + // Sampling phase for (int curr_sample=0; curr_sample < samples; ++curr_sample) { std::cerr << "\n -- Sample " << curr_sample << " "; std::cerr.flush(); + // Randomize the corpus indexing array + int tmp; + for (int i = corpus.num_documents()-1; i > 0; --i) + { + int j = (int)(mt_genrand_real1() * i); + tmp = randomDocIndices[i]; + randomDocIndices[i] = randomDocIndices[j]; + randomDocIndices[j] = tmp; + } + // for each document in the corpus - int document_id=0; - for (Corpus::const_iterator corpusIt=corpus.begin(); - corpusIt != corpus.end(); ++corpusIt, ++document_id) { + int document_id; + for (int i=0; i<corpus.num_documents(); ++i) + { + document_id = randomDocIndices[i]; + // for each term in the document int term_index=0; - for (Document::const_iterator docIt=corpusIt->begin(); - docIt != corpusIt->end(); ++docIt, ++term_index) { + Document::const_iterator docEnd = corpus.at(document_id).end(); + for (Document::const_iterator docIt=corpus.at(document_id).begin(); + docIt != docEnd; ++docIt, ++term_index) { Term term = *docIt; // remove the prevous topic from the PYPs @@ -101,6 +119,7 @@ void PYPTopics::sample(const Corpus& corpus, int samples) { std::cerr << " ||| LLH=" << log_p << std::endl; } } + delete [] randomDocIndices; } void PYPTopics::decrement(const Term& term, int topic, int level) { |