-rw-r--r--  gi/pyp-topics/src/pyp-topics.cc     | 57
-rw-r--r--  gi/pyp-topics/src/pyp-topics.hh     |  1
-rw-r--r--  gi/pyp-topics/src/train-contexts.cc | 12
3 files changed, 60 insertions, 10 deletions
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index b727458b..796ab7af 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -3,24 +3,40 @@
 #include <boost/date_time/posix_time/posix_time_types.hpp>
 #include <sys/time.h>
+#include <mach/mach_time.h>
 
+
 struct Timer {
   Timer() { Reset(); }
   void Reset()
   {
-   clock_gettime(CLOCK_MONOTONIC, &start_t);
+    //clock_gettime(CLOCK_MONOTONIC, &start_t);
+    start_t = mach_absolute_time();
   }
   double Elapsed() const {
-    timespec end_t;
-
-    clock_gettime(CLOCK_MONOTONIC, &end_t);
-
-    const double elapsed = (end_t.tv_sec - start_t.tv_sec)
-                + (end_t.tv_nsec - start_t.tv_nsec) / 1000000000.0;
+    //timespec end_t;
+    timespec tp;
+    uint64_t end_t = mach_absolute_time();
+    mach_absolute_difference(end_t, start_t, &tp);
+    //clock_gettime(CLOCK_MONOTONIC, &end_t);
+    //const double elapsed = (end_t.tv_sec - start_t.tv_sec)
+    //            + (end_t.tv_nsec - start_t.tv_nsec) / 1000000000.0;
+    const double elapsed = tp.tv_sec + tp.tv_nsec / 1000000000.0;
     return elapsed;
   }
 private:
-  timespec start_t;
+  void mach_absolute_difference(uint64_t end, uint64_t start, struct timespec *tp) const {
+    uint64_t difference = end - start;
+    static mach_timebase_info_data_t info = {0,0};
+
+    if (info.denom == 0)
+      mach_timebase_info(&info);
+    uint64_t elapsednano = difference * (info.numer / info.denom);
+    tp->tv_sec = elapsednano * 1e-9;
+    tp->tv_nsec = elapsednano - (tp->tv_sec * 1e9);
+  }
+  //timespec start_t;
+  uint64_t start_t;
 };
 
 void PYPTopics::sample(const Corpus& corpus, int samples) {
@@ -265,6 +281,23 @@ PYPTopics::F PYPTopics::prob(const Term& term, int topic, int level) const {
   return p_w_k;
 }
 
+int PYPTopics::max_topic() const {
+  if (!m_use_topic_pyp)
+    return -1;
+
+  F current_max=0.0;
+  int current_topic=-1;
+  for (int k=0; k<m_num_topics; ++k) {
+    F prob = m_topic_pyp.prob(k, m_topic_p0);
+    if (prob > current_max) {
+      current_max = prob;
+      current_topic = k;
+    }
+  }
+  assert(current_topic >= 0);
+  return current_topic;
+}
+
 int PYPTopics::max(const DocumentId& doc, const Term& term) {
   //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
   // collect probs
@@ -274,8 +307,12 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) {
     F p_w_k = prob(term, k);
 
     F topic_prob = m_topic_p0;
-    if (m_use_topic_pyp) topic_prob = m_topic_pyp.prob(k, m_topic_p0);
-    F p_k_d = m_document_pyps[doc].prob(k, topic_prob);
+    if (m_use_topic_pyp)
+      topic_prob = m_topic_pyp.prob(k, m_topic_p0);
+
+    F p_k_d = 0;
+    if (doc < 0) p_k_d = topic_prob;
+    else         p_k_d = m_document_pyps[doc].prob(k, topic_prob);
 
     F prob = (p_w_k*p_k_d);
     if (prob > current_max) {
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index db0f7468..7e003228 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -22,6 +22,7 @@ public:
   void sample(const Corpus& corpus, int samples);
   int sample(const DocumentId& doc, const Term& term);
   int max(const DocumentId& doc, const Term& term);
+  int max_topic() const;
 
   void set_backoff(const std::string& filename) {
     m_backoff.reset(new TermBackoff);
diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc
index 02bb7b76..c58474da 100644
--- a/gi/pyp-topics/src/train-contexts.cc
+++ b/gi/pyp-topics/src/train-contexts.cc
@@ -40,6 +40,7 @@ int main(int argc, char **argv)
       ("data,d", value<string>(), "file containing the documents and context terms")
       ("topics,t", value<int>()->default_value(50), "number of topics")
       ("document-topics-out,o", value<string>(), "file to write the document topics to")
+      ("default-topics-out", value<string>(), "file to write default term topic assignments.")
       ("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
       ("samples,s", value<int>()->default_value(10), "number of sampling passes through the data")
       ("backoff-type", value<string>(), "backoff type: none|simple")
@@ -95,6 +96,7 @@ int main(int argc, char **argv)
     ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
 
     int document_id=0;
+    std::set<int> all_terms;
     for (Corpus::const_iterator corpusIt=contexts_corpus.begin();
          corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) {
       std::vector<int> unique_terms;
@@ -111,10 +113,20 @@ int main(int argc, char **argv)
         std::vector<std::string> strings = contexts_corpus.context2string(*termIt);
         std::copy(strings.begin(), strings.end(), std::ostream_iterator<std::string>(documents_out, " "));
         documents_out << "||| C=" << model.max(document_id, *termIt);
+
+        all_terms.insert(*termIt);
       }
       documents_out << std::endl;
     }
     documents_out.close();
+
+    std::ofstream default_topics(vm["default-topics-out"].as<string>().c_str());
+    default_topics << model.max_topic() << std::endl;
+    for (std::set<int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
+      std::vector<std::string> strings = contexts_corpus.context2string(*termIt);
+      std::copy(strings.begin(), strings.end(), std::ostream_iterator<std::string>(documents_out, " "));
+      default_topics << model.max(-1, *termIt) << std::endl;
+    }
   }
 
   if (vm.count("topic-words-out")) {
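For reference, a minimal self-contained sketch (not part of this commit, macOS only) of the mach_absolute_time()/mach_timebase_info() pattern that the patched Timer relies on. It performs the tick-to-nanosecond conversion in double precision rather than dividing info.numer by info.denom as integers; the loop being timed is an arbitrary workload chosen only for illustration.

#include <cstdint>
#include <iostream>
#include <mach/mach_time.h>

int main() {
  // Query the timebase once: numer/denom is the ticks -> nanoseconds ratio.
  mach_timebase_info_data_t info = {0, 0};
  mach_timebase_info(&info);

  uint64_t start = mach_absolute_time();
  volatile double x = 0.0;
  for (int i = 0; i < 10000000; ++i) x += i * 1e-7;  // arbitrary work to time
  uint64_t end = mach_absolute_time();

  // Convert in floating point: multiply by numer before dividing by denom.
  double ns = static_cast<double>(end - start) * info.numer / info.denom;
  std::cout << "elapsed: " << ns / 1e9 << " s" << std::endl;
  return 0;
}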

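The new default-topics output in train-contexts.cc assigns each term a topic via model.max(-1, *termIt); with a negative document id, the new branch in PYPTopics::max() falls back to the topic prior, so the rule reduces to argmax over k of p(term|k) * p(k). Below is a small illustrative sketch of that selection rule with made-up probabilities (standalone code, not taken from the project).

#include <cstddef>
#include <iostream>
#include <vector>

// Pick the topic maximising p(term|k) * p(k); returns -1 if every product is zero.
int default_topic(const std::vector<double>& p_term_given_k,
                  const std::vector<double>& p_k) {
  int best = -1;
  double best_prob = 0.0;
  for (std::size_t k = 0; k < p_term_given_k.size(); ++k) {
    double prob = p_term_given_k[k] * p_k[k];
    if (prob > best_prob) {
      best_prob = prob;
      best = static_cast<int>(k);
    }
  }
  return best;
}

int main() {
  const double w[] = {0.01, 0.20, 0.05};  // made-up p(term|k) for three topics
  const double t[] = {0.50, 0.30, 0.20};  // made-up topic priors p(k)
  std::vector<double> p_w_k(w, w + 3), p_k(t, t + 3);
  std::cout << "default topic: " << default_topic(p_w_k, p_k) << std::endl;  // prints 1
  return 0;
}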