summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-02 17:14:55 +0000
committer: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-02 17:14:55 +0000
commit: c0d43ad4fd094b5eeac37fc3e79d806aa928dc71 (patch)
tree: ce35f92a137c7ef247daaa9be1b9f25f1bd8d639
parent: edd233c2030e53cbfa4f6817be14d559db70f094 (diff)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@115 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- gi/pyp-topics/src/pyp-topics.cc 28
-rw-r--r-- gi/pyp-topics/src/pyp-topics.hh 3
-rw-r--r-- gi/pyp-topics/src/train-contexts.cc 3
3 files changed, 31 insertions, 3 deletions
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index 56d49928..186267d3 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -180,6 +180,7 @@ void PYPTopics::sample(const Corpus& corpus, int samples) {
<< pypIt->num_types() << "," << m_topic_pyp.prob(k, m_topic_p0) << "> ";
if (k % 5 == 0) std::cerr << std::endl << '\t';
}
+ std::cerr.precision(4);
std::cerr << std::endl;
}
}
@@ -283,7 +284,32 @@ int PYPTopics::max_topic() const {
return current_topic;
}
-int PYPTopics::max(const DocumentId& doc, const Term& term) {
+int PYPTopics::max(const DocumentId& doc) const {
+ // Returns the topic index k in [0, m_num_topics) with the highest probability for document `doc`; no term is involved.
+ // Ties keep the lowest k because the comparison below is strict.
+ F current_max=0.0;
+ int current_topic=-1; // -1 marks "no topic selected yet"
+ for (int k=0; k<m_num_topics; ++k) {
+ // No per-term probability is used in this overload.
+ // Base probability of topic k: m_topic_p0, replaced by the topic PYP's value when m_use_topic_pyp is set.
+ F topic_prob = m_topic_p0;
+ if (m_use_topic_pyp)
+ topic_prob = m_topic_pyp.prob(k, m_topic_p0);
+ // A negative doc uses the base topic probability alone; otherwise the document's PYP is queried with topic_prob as its second argument.
+ F prob = 0;
+ if (doc < 0) prob = topic_prob;
+ else prob = m_document_pyps[doc].prob(k, topic_prob);
+ // Strict '>' against the initial 0.0: only a strictly positive prob can be selected.
+ if (prob > current_max) {
+ current_max = prob;
+ current_topic = k;
+ }
+ }
+ assert(current_topic >= 0); // requires at least one topic with prob > 0
+ return current_topic;
+}
+
+int PYPTopics::max(const DocumentId& doc, const Term& term) const {
//std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
// collect probs
F current_max=0.0;
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index 7e003228..c35645aa 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -21,7 +21,8 @@ public:
void sample(const Corpus& corpus, int samples);
int sample(const DocumentId& doc, const Term& term);
- int max(const DocumentId& doc, const Term& term);
+ int max(const DocumentId& doc, const Term& term) const;
+ int max(const DocumentId& doc) const;
int max_topic() const;
void set_backoff(const std::string& filename) {
diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc
index 12e7baec..0a2f1959 100644
--- a/gi/pyp-topics/src/train-contexts.cc
+++ b/gi/pyp-topics/src/train-contexts.cc
@@ -99,7 +99,7 @@ int main(int argc, char **argv)
map<int,int> all_terms;
for (Corpus::const_iterator corpusIt=contexts_corpus.begin();
corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) {
- vector<int> unique_terms;
+ vector<int> unique_terms;
for (Document::const_iterator docIt=corpusIt->begin();
docIt != corpusIt->end(); ++docIt) {
if (unique_terms.empty() || *docIt != unique_terms.back())
@@ -111,6 +111,7 @@ int main(int argc, char **argv)
//insert_result.first++;
}
documents_out << contexts_corpus.key(document_id) << '\t';
+ documents_out << model.max(document_id) << " ||| ";
for (std::vector<int>::const_iterator termIt=unique_terms.begin();
termIt != unique_terms.end(); ++termIt) {
if (termIt != unique_terms.begin())