#include "pyp-topics.hh"
//#include "mt19937ar.h"

// NOTE(review): the header names of the next three directives are missing from
// the text under review. Timer below needs the declarations of
// mach_absolute_time(), mach_timebase_info() and timespec -- presumably
// <mach/mach_time.h> and <time.h>; restore the exact names from version control.
#include
#include
#include

// Stopwatch built on mach_absolute_time().
// Reset() records the current tick count; Elapsed() returns the time since the
// last Reset() (or since construction) in seconds.
struct Timer {
  Timer() { Reset(); }

  // Restart the stopwatch from "now".
  void Reset() {
    //clock_gettime(CLOCK_MONOTONIC, &start_t);
    start_t = mach_absolute_time();
  }

  // Seconds elapsed since the last Reset(), as a double.
  double Elapsed() const {
    //timespec end_t;
    timespec tp;
    uint64_t end_t = mach_absolute_time();
    mach_absolute_difference(end_t, start_t, &tp);
    //clock_gettime(CLOCK_MONOTONIC, &end_t);
    //const double elapsed = (end_t.tv_sec - start_t.tv_sec)
    //  + (end_t.tv_nsec - start_t.tv_nsec) / 1000000000.0;
    const double elapsed = tp.tv_sec + tp.tv_nsec / 1000000000.0;
    return elapsed;
  }

private:
  // Converts the tick interval [start, end] into seconds + nanoseconds in *tp,
  // using the numer/denom ratio reported by mach_timebase_info().
  void mach_absolute_difference(uint64_t end, uint64_t start, struct timespec *tp) const {
    const uint64_t nanos_per_second = 1000000000;
    uint64_t difference = end - start;

    // The ratio is queried once and cached for later calls.
    static mach_timebase_info_data_t info = {0,0};
    if (info.denom == 0)
      mach_timebase_info(&info);

    // Multiply before dividing. The previous form, (info.numer / info.denom),
    // was evaluated in integer arithmetic first, which truncates the ratio
    // (125/3 becomes 41) and yields 0 whenever numer < denom.
    uint64_t elapsednano = difference * info.numer / info.denom;

    // Exact integer split; avoids the rounding of a round trip through double.
    tp->tv_sec = elapsednano / nanos_per_second;
    tp->tv_nsec = elapsednano % nanos_per_second;
  }

  //timespec start_t;
  uint64_t start_t;  // tick count captured by the last Reset()
};

// Trains the topic model on `corpus`, running `samples` sweeps.
//
// Visible steps:
//  1. If no backoff model is set, m_word_pyps is reset to a single level.
//  2. Every word-level and document-level PYP is resized/constructed with
//     PYP(0.5, 1.0) and the base probabilities are set to uniform values.
//  3. Initialisation pass: every term of document d is assigned topic
//     d % m_num_topics and the corresponding PYPs are incremented.
//  4. Sampling phase: per sweep the document order is shuffled; the remainder
//     of the sweep is partly missing (see NOTE(review) below).
// Progress and timing information is written to std::cerr.
void PYPTopics::sample(const Corpus& corpus, int samples) {
  Timer timer;

  if (!m_backoff.get()) {
    m_word_pyps.clear();
    m_word_pyps.push_back(PYPs());
  }

  std::cerr << " Training with " << m_word_pyps.size()-1 << " backoff level"
    << (m_word_pyps.size()==2 ? ":" : "s:") << std::endl;

  for (int i=0; i<(int)m_word_pyps.size(); ++i)
    m_word_pyps.at(i).resize(m_num_topics, PYP(0.5, 1.0));
  std::cerr << std::endl;

  m_document_pyps.resize(corpus.num_documents(), PYP(0.5, 1.0));

  // Uniform base distributions over topics, term types and documents.
  m_topic_p0 = 1.0/m_num_topics;
  m_term_p0 = 1.0/corpus.num_types();
  m_backoff_p0 = 1.0/corpus.num_documents();

  std::cerr << " Documents: " << corpus.num_documents() << " Terms: "
    << corpus.num_types() << std::endl;

  timer.Reset();
  // Initialisation pass
  int document_id=0, topic_counter=0;
  for (Corpus::const_iterator corpusIt=corpus.begin();
       corpusIt != corpus.end(); ++corpusIt, ++document_id) {
    m_corpus_topics.push_back(DocumentTopics(corpusIt->size(), 0));

    int term_index=0;
    for (Document::const_iterator docIt=corpusIt->begin();
         docIt != corpusIt->end(); ++docIt, ++term_index) {
      topic_counter++;
      Term term = *docIt;

      // sample a new_topic
      //int new_topic = (topic_counter % m_num_topics);
      int new_topic = (document_id % m_num_topics);

      // add the new topic to the PYPs
      m_corpus_topics[document_id][term_index] = new_topic;
      increment(term, new_topic);

      if (m_use_topic_pyp) {
        // The document PYP draws from the shared topic PYP; the shared PYP is
        // only incremented when the document-level increment reports a
        // non-zero table delta.
        F p0 = m_topic_pyp.prob(new_topic, m_topic_p0);
        int table_delta = m_document_pyps[document_id].increment(new_topic, p0);
        if (table_delta)
          m_topic_pyp.increment(new_topic, m_topic_p0);
      }
      else m_document_pyps[document_id].increment(new_topic, m_topic_p0);
    }
  }
  std::cerr << " Initialized in " << timer.Elapsed() << " seconds\n";

  // Identity permutation of document ids; reshuffled at the start of each sweep.
  int* randomDocIndices = new int[corpus.num_documents()];
  for (int i = 0; i < corpus.num_documents(); ++i)
    randomDocIndices[i] = i;

  // Sampling phase
  for (int curr_sample=0; curr_sample < samples; ++curr_sample) {
    std::cerr << "\n -- Sample " << curr_sample << " "; std::cerr.flush();

    // Randomize the corpus indexing array
    // NOTE(review): j is computed as (int)(r * i); unless mt_genrand_real1()
    // can return exactly 1, position i is never swapped with itself -- confirm
    // whether that is intended.
    int tmp;
    for (int i = corpus.num_documents()-1; i > 0; --i) {
      int j = (int)(mt_genrand_real1() * i);
      tmp = randomDocIndices[i];
      randomDocIndices[i] = randomDocIndices[j];
      randomDocIndices[j] = tmp;
    }

    // for each document in the corpus
    int document_id;
    // NOTE(review): source text is missing between `i` and `::iterator` on the
    // next line (everything from a '<' up to a later '>' was dropped before
    // review). The body of the per-document loop and the declarations of
    // `log_p` and `resample_counter` are not visible. The tokens are kept
    // exactly as found; restore the lost text from version control.
    for (int i=0; i::iterator levelIt=m_word_pyps.begin();
           levelIt != m_word_pyps.end(); ++levelIt) {
        for (PYPs::iterator pypIt=levelIt->begin();
             pypIt != levelIt->end(); ++pypIt) {
          pypIt->resample_prior();
          log_p += pypIt->log_restaurant_prob();
        }
      }

      resample_counter=0;
      for (PYPs::iterator pypIt=m_document_pyps.begin();
           pypIt != m_document_pyps.end(); ++pypIt, ++resample_counter) {
        pypIt->resample_prior();
        log_p += pypIt->log_restaurant_prob();
        // The counter is advanced by the loop header only. It used to be
        // post-incremented here as well, so it moved two steps per document
        // and the progress dot appeared every 5000 documents, not every 10000.
        if (resample_counter % 10000 == 0) {
          std::cerr << "."; std::cerr.flush();
        }
      }

      if (m_use_topic_pyp) {
        m_topic_pyp.resample_prior();
        log_p += m_topic_pyp.log_restaurant_prob();
      }

      std::cerr << " ||| LLH=" << log_p << " ||| resampling time=" << timer.Elapsed() << " sec" << std::endl;
      timer.Reset();

      // One <index:customers,types,topic-probability> entry per PYP of the
      // first word level.
      int k=0;
      std::cerr << "Topics distribution: ";
      std::cerr.precision(2);
      for (PYPs::iterator pypIt=m_word_pyps.front().begin();
           pypIt != m_word_pyps.front().end(); ++pypIt, ++k) {
        std::cerr << "<" << k << ":" << pypIt->num_customers() << ","
          << pypIt->num_types() << "," << m_topic_pyp.prob(k, m_topic_p0) << "> ";
        if (k % 5 == 0) std::cerr << std::endl << '\t';
      }
      std::cerr << std::endl;
    }  // NOTE(review): the matching '{' lies in the missing text above
  }
  delete [] randomDocIndices;
}

// Removes one observation of `term` under `topic` from the word PYP at `level`.
// When a backoff model is set and it maps `term` to a non-null backoff term,
// the same removal is applied recursively at level+1.
void PYPTopics::decrement(const Term& term, int topic, int level) {
  //std::cerr << "PYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl;
  m_word_pyps.at(level).at(topic).decrement(term);
  if (m_backoff.get()) {
    Term backoff_term = (*m_backoff)[term];
    if (!m_backoff->is_null(backoff_term))
      decrement(backoff_term, topic, level+1);
  }
}

// Adds one observation of `term` under `topic` to the word PYP at `level`,
// using word_pyps_p0(term, topic, level) as the base probability.
// When a backoff model is set and it maps `term` to a non-null backoff term,
// the same addition is applied recursively at level+1.
void PYPTopics::increment(const Term& term, int topic, int level) {
  //std::cerr << "PYPTopics::increment(" << term << "," << topic << "," << level << ")" << std::endl;
  m_word_pyps.at(level).at(topic).increment(term, word_pyps_p0(term, topic, level));

  if (m_backoff.get()) {
    Term backoff_term = (*m_backoff)[term];
    if (!m_backoff->is_null(backoff_term))
      increment(backoff_term, topic, level+1);
  }
}

// NOTE(review): from here to the end of the next NOTE(review) region the source
// text is damaged: template arguments and whole stretches of code (everything
// from a '<' up to a later '>') are missing, so the tokens below belong to more
// than one definition. They are kept exactly as found; restore the lost text
// from version control before building.
int PYPTopics::sample(const DocumentId& doc, const Term& term) {
  // First pass: collect probs
  F sum=0.0;
  std::vector sums;
  for (int k=0; kis_null(backoff_term)) {
      assert (level < m_backoff->order());
      p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
    }
    else
      p0 = m_term_p0;
  }
  //for (int i=0; i
  // NOTE(review): text is missing between the comment fragment above and the
  // condition fragment below.
 current_max) {
      current_max = prob;
      current_topic = k;
    }
  }
  assert(current_topic >= 0);
  return current_topic;
}
// NOTE(review): end of the damaged region.

// Writes the current topic assignments to `out`: one line per document in
// m_corpus_topics, the per-term topic ids separated by single spaces.
// Returns `out`.
std::ostream& PYPTopics::print_document_topics(std::ostream& out) const {
  for (CorpusTopics::const_iterator corpusIt=m_corpus_topics.begin();
       corpusIt != m_corpus_topics.end(); ++corpusIt) {
    int term_index=0;
    for (DocumentTopics::const_iterator docIt=corpusIt->begin();
         docIt != corpusIt->end(); ++docIt, ++term_index) {
      if (term_index) out << " ";  // separator before every entry but the first
      out << *docIt;
    }
    out << std::endl;
  }
  return out;
}

// Writes the contents of the first word level to `out`: one line per PYP in
// m_word_pyps.front(), each entry printed as "first:second" and separated by
// single spaces. Returns `out`.
std::ostream& PYPTopics::print_topic_terms(std::ostream& out) const {
  for (PYPs::const_iterator pypsIt=m_word_pyps.front().begin();
       pypsIt != m_word_pyps.front().end(); ++pypsIt) {
    int term_index=0;
    for (PYP::const_iterator termIt=pypsIt->begin();
         termIt != pypsIt->end(); ++termIt, ++term_index) {
      if (term_index) out << " ";  // separator before every entry but the first
      out << termIt->first << ":" << termIt->second;
    }
    out << std::endl;
  }
  return out;
}