From 595dd3f0e577f522d32318acb2ad0fe288e0b00f Mon Sep 17 00:00:00 2001
From: "philblunsom@gmail.com"
Date: Fri, 2 Jul 2010 14:31:13 +0000
Subject: Changed timer to be mac compatible.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@103 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pyp-topics/src/pyp-topics.cc     | 57 ++++++++++++++++++++++++++++++-------
 gi/pyp-topics/src/pyp-topics.hh     | 1 +
 gi/pyp-topics/src/train-contexts.cc | 12 ++++++++
 3 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index b727458b..796ab7af 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -3,24 +3,40 @@
 #include
 #include
 
+#include
+
 struct Timer {
   Timer() { Reset(); }
   void Reset() {
-    clock_gettime(CLOCK_MONOTONIC, &start_t);
+    //clock_gettime(CLOCK_MONOTONIC, &start_t);
+    start_t = mach_absolute_time();
   }
   double Elapsed() const {
-    timespec end_t;
-
-    clock_gettime(CLOCK_MONOTONIC, &end_t);
-
-    const double elapsed = (end_t.tv_sec - start_t.tv_sec)
-                + (end_t.tv_nsec - start_t.tv_nsec) / 1000000000.0;
+    //timespec end_t;
+    timespec tp;
+    uint64_t end_t = mach_absolute_time();
+    mach_absolute_difference(end_t, start_t, &tp);
+    //clock_gettime(CLOCK_MONOTONIC, &end_t);
+    //const double elapsed = (end_t.tv_sec - start_t.tv_sec)
+    //            + (end_t.tv_nsec - start_t.tv_nsec) / 1000000000.0;
+    const double elapsed = tp.tv_sec + tp.tv_nsec / 1000000000.0;
     return elapsed;
   }
 private:
-  timespec start_t;
+  void mach_absolute_difference(uint64_t end, uint64_t start, struct timespec *tp) const {
+    uint64_t difference = end - start;
+    static mach_timebase_info_data_t info = {0,0};
+
+    if (info.denom == 0)
+      mach_timebase_info(&info);
+    uint64_t elapsednano = difference * (info.numer / info.denom);
+    tp->tv_sec = elapsednano * 1e-9;
+    tp->tv_nsec = elapsednano - (tp->tv_sec * 1e9);
+  }
+  //timespec start_t;
+  uint64_t start_t;
 };
 
 void PYPTopics::sample(const Corpus& corpus, int samples) {
@@ -265,6 +281,23 @@ PYPTopics::F PYPTopics::prob(const Term& term, int topic, int level) const {
   return p_w_k;
 }
 
+int PYPTopics::max_topic() const {
+  if (!m_use_topic_pyp)
+    return -1;
+
+  F current_max=0.0;
+  int current_topic=-1;
+  for (int k=0; k current_max) {
+      current_max = prob;
+      current_topic = k;
+    }
+  }
+  assert(current_topic >= 0);
+  return current_topic;
+}
+
 int PYPTopics::max(const DocumentId& doc, const Term& term) {
   //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
   // collect probs
@@ -274,8 +307,12 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) {
     F p_w_k = prob(term, k);
     F topic_prob = m_topic_p0;
-    if (m_use_topic_pyp) topic_prob = m_topic_pyp.prob(k, m_topic_p0);
-    F p_k_d = m_document_pyps[doc].prob(k, topic_prob);
+    if (m_use_topic_pyp)
+      topic_prob = m_topic_pyp.prob(k, m_topic_p0);
+
+    F p_k_d = 0;
+    if (doc < 0) p_k_d = topic_prob;
+    else p_k_d = m_document_pyps[doc].prob(k, topic_prob);
     F prob = (p_w_k*p_k_d);
     if (prob > current_max) {
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index db0f7468..7e003228 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -22,6 +22,7 @@ public:
   void sample(const Corpus& corpus, int samples);
   int sample(const DocumentId& doc, const Term& term);
   int max(const DocumentId& doc, const Term& term);
+  int max_topic() const;
 
   void set_backoff(const std::string& filename) {
     m_backoff.reset(new TermBackoff);
diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc
index 02bb7b76..c58474da 100644
--- a/gi/pyp-topics/src/train-contexts.cc
+++ b/gi/pyp-topics/src/train-contexts.cc
@@ -40,6 +40,7 @@ int main(int argc, char **argv)
       ("data,d", value(), "file containing the documents and context terms")
       ("topics,t", value()->default_value(50), "number of topics")
       ("document-topics-out,o", value(), "file to write the document topics to")
+      ("default-topics-out", value(), "file to write default term topic assignments.")
       ("topic-words-out,w", value(), "file to write the topic word distribution to")
       ("samples,s", value()->default_value(10), "number of sampling passes through the data")
       ("backoff-type", value(), "backoff type: none|simple")
@@ -95,6 +96,7 @@ int main(int argc, char **argv)
     ogzstream documents_out(vm["document-topics-out"].as().c_str());
     int document_id=0;
+    std::set all_terms;
     for (Corpus::const_iterator corpusIt=contexts_corpus.begin(); corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) {
       std::vector unique_terms;
@@ -111,10 +113,20 @@ int main(int argc, char **argv)
         std::vector strings = contexts_corpus.context2string(*termIt);
         std::copy(strings.begin(), strings.end(), std::ostream_iterator(documents_out, " "));
         documents_out << "||| C=" << model.max(document_id, *termIt);
+
+        all_terms.insert(*termIt);
       }
       documents_out << std::endl;
     }
     documents_out.close();
+
+    std::ofstream default_topics(vm["default-topics-out"].as().c_str());
+    default_topics << model.max_topic() << std::endl;
+    for (std::set::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
+      std::vector strings = contexts_corpus.context2string(*termIt);
+      std::copy(strings.begin(), strings.end(), std::ostream_iterator(documents_out, " "));
+      default_topics << model.max(-1, *termIt) << std::endl;
+    }
   }
 
   if (vm.count("topic-words-out")) {
--
cgit v1.2.3