| field | value | date |
|---|---|---|
| author | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400 |
| committer | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400 |
| commit | e26434979adc33bd949566ba7bf02dff64e80a3e (patch) | |
| tree | d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pyp-topics/src/pyp-topics.cc | |
| parent | 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff) | |
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pyp-topics/src/pyp-topics.cc')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 499 |

1 file changed, 0 insertions, 499 deletions
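The deleted file implemented collapsed Gibbs sampling for a hierarchical Pitman-Yor process (PYP) topic model over contexts. Each topic-word and document-topic distribution is a PYP "restaurant", constructed in the code as `PYP<int>(0.01, 1.0, m_seed)` (discount 0.01, strength 1.0). The `PYP` class itself is not part of this diff, so the following is only a minimal, hypothetical sketch of the standard two-parameter Chinese-restaurant-process predictive probability such a restaurant computes, not cdec's actual implementation:

```cpp
#include <cstdio>

// Hypothetical sketch (not cdec's PYP class): the standard two-parameter
// Chinese-restaurant-process predictive probability for a dish w.
//   c_w: customers eating dish w     t_w: tables serving dish w
//   c:   total customers             t:   total tables
//   p0:  base-distribution probability of w
double pyp_prob(int c_w, int t_w, int c, int t,
                double discount, double strength, double p0) {
  return (c_w - discount * t_w + (strength + discount * t) * p0)
         / (strength + c);
}

int main() {
  // Toy restaurant: 10 customers at 4 tables, 3 of them eating w at 2 tables,
  // with the discount/strength values the deleted code used (0.01, 1.0).
  std::printf("p(w) = %f\n", pyp_prob(3, 2, 10, 4, 0.01, 1.0, 0.001));
  return 0;
}
```

This is the quantity the deleted `prob()` and `increment()`/`decrement()` routines query and update, with `p0` supplied by the backoff hierarchy. The full deletion follows.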
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
deleted file mode 100644
index 4de52fd7..00000000
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ /dev/null
@@ -1,499 +0,0 @@
-#include "timing.h"
-#include "pyp-topics.hh"
-#include "contexts_corpus.hh"
-
-//Dict const *dict;
-
-//#include <boost/date_time/posix_time/posix_time_types.hpp>
-void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
-                              int freq_cutoff_start, int freq_cutoff_end,
-                              int freq_cutoff_interval,
-                              int max_contexts_per_document,
-                              F temp_start, F temp_end) {
-  Timer timer;
-  //dict = &((ContextsCorpus*) &corpus)->dict();
-
-  if (!m_backoff.get()) {
-    m_word_pyps.clear();
-    m_word_pyps.push_back(PYPs());
-  }
-
-  std::cerr << "\n Training with " << m_word_pyps.size()-1 << " backoff level"
-    << (m_word_pyps.size()==2 ? ":" : "s:") << std::endl;
-
-
-  for (int i=0; i<(int)m_word_pyps.size(); ++i)
-  {
-    m_word_pyps.at(i).reserve(m_num_topics);
-    for (int j=0; j<m_num_topics; ++j)
-      m_word_pyps.at(i).push_back(new PYP<int>(0.01, 1.0, m_seed));
-  }
-  std::cerr << std::endl;
-
-  m_document_pyps.reserve(corpus.num_documents());
-  for (int j=0; j<corpus.num_documents(); ++j)
-    m_document_pyps.push_back(new PYP<int>(0.01, 1.0, m_seed));
-
-  m_topic_p0 = 1.0/m_num_topics;
-  m_term_p0 = 1.0/(F)m_backoff->terms_at_level(m_word_pyps.size()-1);
-  //m_term_p0 = 1.0/corpus.num_types();
-  m_backoff_p0 = 1.0/corpus.num_documents();
-
-  std::cerr << " Documents: " << corpus.num_documents() << " Terms: "
-    << corpus.num_types() << std::endl;
-
-  int frequency_cutoff = freq_cutoff_start;
-  std::cerr << " Context frequency cutoff set to " << frequency_cutoff << std::endl;
-
-  timer.Reset();
-  // Initialisation pass
-  int document_id=0, topic_counter=0;
-  for (Corpus::const_iterator corpusIt=corpus.begin();
-       corpusIt != corpus.end(); ++corpusIt, ++document_id) {
-    m_corpus_topics.push_back(DocumentTopics(corpusIt->size(), 0));
-
-    int term_index=0;
-    for (Document::const_iterator docIt=corpusIt->begin();
-         docIt != corpusIt->end(); ++docIt, ++term_index) {
-      topic_counter++;
-      Term term = *docIt;
-
-      // sample a new_topic
-      //int new_topic = (topic_counter % m_num_topics);
-      int freq = corpus.context_count(term);
-      int new_topic = -1;
-      if (freq > frequency_cutoff
-          && (!max_contexts_per_document || term_index < max_contexts_per_document)) {
-        //new_topic = sample(document_id, term);
-        //new_topic = document_id % m_num_topics;
-        new_topic = (int) (rnd() * m_num_topics);
-
-        // add the new topic to the PYPs
-        increment(term, new_topic);
-
-        if (m_use_topic_pyp) {
-          F p0 = m_topic_pyp.prob(new_topic, m_topic_p0);
-          int table_delta = m_document_pyps[document_id].increment(new_topic, p0);
-          if (table_delta)
-            m_topic_pyp.increment(new_topic, m_topic_p0);
-        }
-        else m_document_pyps[document_id].increment(new_topic, m_topic_p0);
-      }
-
-      m_corpus_topics[document_id][term_index] = new_topic;
-    }
-  }
-  std::cerr << "  Initialized in " << timer.Elapsed() << " seconds\n";
-
-  int* randomDocIndices = new int[corpus.num_documents()];
-  for (int i = 0; i < corpus.num_documents(); ++i)
-    randomDocIndices[i] = i;
-
-  if (num_jobs < max_threads)
-    num_jobs = max_threads;
-  int job_incr = (int) ( (float)m_document_pyps.size() / float(num_jobs) );
-
-  // Sampling phase
-  for (int curr_sample=0; curr_sample < samples; ++curr_sample) {
-    if (freq_cutoff_interval > 0 && curr_sample != 1
-        && curr_sample % freq_cutoff_interval == 1
-        && frequency_cutoff > freq_cutoff_end) {
-      frequency_cutoff--;
-      std::cerr << "\n Context frequency cutoff set to " << frequency_cutoff << std::endl;
-    }
-
-    F temp = 1.0 / (temp_start - curr_sample*(temp_start-temp_end)/samples);
-    std::cerr << "\n  -- Sample " << curr_sample << " (T=" << temp << ") "; std::cerr.flush();
-
-    // Randomize the corpus indexing array
-    int tmp;
-    int processed_terms=0;
-    /*
-    for (int i = corpus.num_documents()-1; i > 0; --i)
-    {
-        //i+1 since j \in [0,i] but rnd() \in [0,1)
-      int j = (int)(rnd() * (i+1));
-      assert(j >= 0 && j <= i);
-      tmp = randomDocIndices[i];
-      randomDocIndices[i] = randomDocIndices[j];
-      randomDocIndices[j] = tmp;
-    }
-    */
-
-    // for each document in the corpus
-    int document_id;
-    for (int i=0; i<corpus.num_documents(); ++i) {
-      document_id = randomDocIndices[i];
-
-      // for each term in the document
-      int term_index=0;
-      Document::const_iterator docEnd = corpus.at(document_id).end();
-      for (Document::const_iterator docIt=corpus.at(document_id).begin();
-           docIt != docEnd; ++docIt, ++term_index) {
-        if (max_contexts_per_document && term_index > max_contexts_per_document)
-          break;
-
-        Term term = *docIt;
-
-        int freq = corpus.context_count(term);
-        if (freq < frequency_cutoff)
-          continue;
-
-        processed_terms++;
-
-        // remove the prevous topic from the PYPs
-        int current_topic = m_corpus_topics[document_id][term_index];
-        // a negative label mean that term hasn't been sampled yet
-        if (current_topic >= 0) {
-          decrement(term, current_topic);
-
-          int table_delta = m_document_pyps[document_id].decrement(current_topic);
-          if (m_use_topic_pyp && table_delta < 0)
-            m_topic_pyp.decrement(current_topic);
-        }
-
-        // sample a new_topic
-        int new_topic = sample(document_id, term, temp);
-        //std::cerr << "TERM: " << dict->Convert(term) << " (" << term << ") " << " Old Topic: "
-        //  << current_topic << " New Topic: " << new_topic << "\n" << std::endl;
-
-        // add the new topic to the PYPs
-        m_corpus_topics[document_id][term_index] = new_topic;
-        increment(term, new_topic);
-
-        if (m_use_topic_pyp) {
-          F p0 = m_topic_pyp.prob(new_topic, m_topic_p0);
-          int table_delta = m_document_pyps[document_id].increment(new_topic, p0);
-          if (table_delta)
-            m_topic_pyp.increment(new_topic, m_topic_p0);
-        }
-        else m_document_pyps[document_id].increment(new_topic, m_topic_p0);
-      }
-      if (document_id && document_id % 10000 == 0) {
-        std::cerr << "."; std::cerr.flush();
-      }
-    }
-    std::cerr << " ||| LLH= " << log_likelihood();
-
-    if (curr_sample != 0 && curr_sample % 10 == 0) {
-    //if (true) {
-      std::cerr << " ||| time=" << (timer.Elapsed() / 10.0) << " sec/sample" << std::endl;
-      timer.Reset();
-      std::cerr << "     ... Resampling hyperparameters (";
-
-      // resample the hyperparamters
-      F log_p=0.0;
-      if (max_threads == 1)
-      {
-        std::cerr << "1 thread)" << std::endl; std::cerr.flush();
-        log_p += hresample_topics();
-        log_p += hresample_docs(0, m_document_pyps.size());
-      }
-      else
-      { //parallelize
-        std::cerr << max_threads << " threads, " << num_jobs << " jobs)" << std::endl; std::cerr.flush();
-
-        WorkerPool<JobReturnsF, F> pool(max_threads);
-        int i=0, sz = m_document_pyps.size();
-        //documents...
-        while (i <= sz - 2*job_incr)
-        {
-          JobReturnsF job = boost::bind(&PYPTopics::hresample_docs, this, i, i+job_incr);
-          pool.addJob(job);
-          i += job_incr;
-        }
-        //  do all remaining documents
-        JobReturnsF job = boost::bind(&PYPTopics::hresample_docs, this, i,sz);
-        pool.addJob(job);
-
-        //topics...
-        JobReturnsF topics_job = boost::bind(&PYPTopics::hresample_topics, this);
-        pool.addJob(topics_job);
-
-        log_p += pool.get_result(); //blocks
-
-      }
-
-      if (m_use_topic_pyp) {
-        m_topic_pyp.resample_prior(rnd);
-        log_p += m_topic_pyp.log_restaurant_prob();
-      }
-
-      std::cerr.precision(10);
-      std::cerr << " ||| LLH=" << log_likelihood() << " ||| resampling time=" << timer.Elapsed() << " sec" << std::endl;
-      timer.Reset();
-
-      int k=0;
-      std::cerr << "Topics distribution: ";
-      std::cerr.precision(2);
-      for (PYPs::iterator pypIt=m_word_pyps.front().begin();
-           pypIt != m_word_pyps.front().end(); ++pypIt, ++k) {
-        if (k % 5 == 0) std::cerr << std::endl << '\t';
-        std::cerr << "<" << k << ":" << pypIt->num_customers() << ","
-          << pypIt->num_types() << "," << m_topic_pyp.prob(k, m_topic_p0) << "> ";
-      }
-      std::cerr.precision(10);
-      std::cerr << std::endl;
-    }
-  }
-  delete [] randomDocIndices;
-}
-
-PYPTopics::F PYPTopics::hresample_docs(int start, int end)
-{
-  int resample_counter=0;
-  F log_p = 0.0;
-  assert(start >= 0);
-  assert(end >= 0);
-  assert(start <= end);
-  for (int i=start; i < end; ++i)
-  {
-    m_document_pyps[i].resample_prior(rnd);
-    log_p += m_document_pyps[i].log_restaurant_prob();
-    if (resample_counter++ % 5000 == 0) {
-      std::cerr << "."; std::cerr.flush();
-    }
-  }
-  return log_p;
-}
-
-PYPTopics::F PYPTopics::hresample_topics()
-{
-  F log_p = 0.0;
-  for (std::vector<PYPs>::iterator levelIt=m_word_pyps.begin();
-      levelIt != m_word_pyps.end(); ++levelIt) {
-    for (PYPs::iterator pypIt=levelIt->begin();
-        pypIt != levelIt->end(); ++pypIt) {
-
-      pypIt->resample_prior(rnd);
-      log_p += pypIt->log_restaurant_prob();
-    }
-    std::cerr << log_p << std::endl;
-  }
-  return log_p;
-}
-
-PYPTopics::F PYPTopics::log_likelihood() const
-{
-  F log_p = 0.0;
-
-  // LLH of topic term distribution
-  size_t i=0;
-  for (std::vector<PYPs>::const_iterator levelIt=m_word_pyps.begin();
-      levelIt != m_word_pyps.end(); ++levelIt, ++i) {
-    for (PYPs::const_iterator pypIt=levelIt->begin();
-        pypIt != levelIt->end(); ++pypIt, ++i) {
-      log_p += pypIt->log_restaurant_prob();
-
-      if (i == m_word_pyps.size()-1)
-        log_p += (pypIt->num_tables() * -log(m_backoff->terms_at_level(i)));
-      else
-        log_p += (pypIt->num_tables() * log(m_term_p0));
-    }
-  }
-  std::cerr << " TERM LLH: " << log_p << " "; //std::endl;
-
-  // LLH of document topic distribution
-  for (size_t i=0; i < m_document_pyps.size(); ++i) {
-    log_p += m_document_pyps[i].log_restaurant_prob();
-    if (!m_use_topic_pyp) log_p += (m_document_pyps[i].num_tables() * m_topic_p0);
-  }
-  if (m_use_topic_pyp) {
-    log_p += m_topic_pyp.log_restaurant_prob();
-    log_p += (m_topic_pyp.num_tables() * log(m_topic_p0));
-  }
-
-  return log_p;
-}
-
-void PYPTopics::decrement(const Term& term, int topic, int level) {
-  //std::cerr << "PYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl;
-  int table_delta = m_word_pyps.at(level).at(topic).decrement(term);
-  if (table_delta && m_backoff.get()) {
-    Term backoff_term = (*m_backoff)[term];
-    if (!m_backoff->is_null(backoff_term))
-      decrement(backoff_term, topic, level+1);
-  }
-}
-
-void PYPTopics::increment(const Term& term, int topic, int level) {
-  //std::cerr << "PYPTopics::increment(" << term << "," << topic << "," << level << ")" << std::endl;
-  int table_delta = m_word_pyps.at(level).at(topic).increment(term, word_pyps_p0(term, topic, level));
-
-  if (table_delta && m_backoff.get()) {
-    Term backoff_term = (*m_backoff)[term];
-    if (!m_backoff->is_null(backoff_term))
-      increment(backoff_term, topic, level+1);
-  }
-}
-
-int PYPTopics::sample(const DocumentId& doc, const Term& term, F inv_temp) {
-  // First pass: collect probs
-  F sum=0.0;
-  std::vector<F> sums;
-  for (int k=0; k<m_num_topics; ++k) {
-    F p_w_k = prob(term, k);
-
-    F topic_prob = m_topic_p0;
-    if (m_use_topic_pyp) topic_prob = m_topic_pyp.prob(k, m_topic_p0);
-
-    //F p_k_d = m_document_pyps[doc].prob(k, topic_prob);
-    F p_k_d = m_document_pyps[doc].unnormalised_prob(k, topic_prob);
-
-    F prob = p_w_k*p_k_d;
-    /*
-    if (prob < 0.0) { std::cerr << "\n\n" << prob << " " << p_w_k << " " << p_k_d << std::endl; assert(false); }
-    if (prob > 1.0) { std::cerr << "\n\n" << prob << " " << p_w_k << " " << p_k_d << std::endl; assert(false); }
-    assert (pow(prob, inv_temp) >= 0.0);
-    assert (pow(prob, inv_temp) <= 1.0);
-    */
-    sum += pow(prob, inv_temp);
-    sums.push_back(sum);
-  }
-  // Second pass: sample a topic
-  F cutoff = rnd() * sum;
-  for (int k=0; k<m_num_topics; ++k) {
-    if (cutoff <= sums[k])
-      return k;
-  }
-  assert(false);
-}
-
-PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) const {
-  //for (int i=0; i<level+1; ++i) std::cerr << "  ";
-  //std::cerr << "PYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ")" << std::endl;
-
-  F p0 = m_term_p0;
-  if (m_backoff.get()) {
-    //static F fudge=m_backoff_p0; // TODO
-
-    Term backoff_term = (*m_backoff)[term];
-    //std::cerr << "T: " << term << " BO: " << backoff_term << std::endl;
-    if (!m_backoff->is_null(backoff_term)) {
-      assert (level < m_backoff->order());
-      //p0 = (1.0/(F)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
-      p0 = m_term_p0*prob(backoff_term, topic, level+1);
-      p0 = prob(backoff_term, topic, level+1);
-    }
-    else
-      p0 = (1.0/(F) m_backoff->terms_at_level(level));
-      //p0 = m_term_p0;
-  }
-  //for (int i=0; i<level+1; ++i) std::cerr << "  ";
-  //std::cerr << "PYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ") = " << p0 << std::endl;
-  return p0;
-}
-
-PYPTopics::F PYPTopics::prob(const Term& term, int topic, int level) const {
-  //for (int i=0; i<level+1; ++i) std::cerr << "  ";
-  //std::cerr << "PYPTopics::prob(" << dict->Convert(term) << "," << topic << "," << level << ")" << std::endl;
-
-  F p0 = word_pyps_p0(term, topic, level);
-  F p_w_k = m_word_pyps.at(level).at(topic).prob(term, p0);
-
-  /*
-  for (int i=0; i<level+1; ++i) std::cerr << "  ";
-  std::cerr << "PYPTopics::prob(" << dict->Convert(term) << "," << topic << "," << level << ") = " << p_w_k << std::endl;
-  for (int i=0; i<level+1; ++i) std::cerr << "  ";
-  m_word_pyps.at(level).at(topic).debug_info(std::cerr);
-  */
-  return p_w_k;
-}
-
-int PYPTopics::max_topic() const {
-  if (!m_use_topic_pyp)
-    return -1;
-
-  F current_max=0.0;
-  int current_topic=-1;
-  for (int k=0; k<m_num_topics; ++k) {
-    F prob = m_topic_pyp.prob(k, m_topic_p0);
-    if (prob > current_max) {
-      current_max = prob;
-      current_topic = k;
-    }
-  }
-  assert(current_topic >= 0);
-  return current_topic;
-}
-
-std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc) const {
-  //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
-  // collect probs
-  F current_max=0.0;
-  int current_topic=-1;
-  for (int k=0; k<m_num_topics; ++k) {
-    //F p_w_k = prob(term, k);
-
-    F topic_prob = m_topic_p0;
-    if (m_use_topic_pyp)
-      topic_prob = m_topic_pyp.prob(k, m_topic_p0);
-
-    F prob = 0;
-    if (doc < 0) prob = topic_prob;
-    else         prob = m_document_pyps[doc].prob(k, topic_prob);
-
-    if (prob > current_max) {
-      current_max = prob;
-      current_topic = k;
-    }
-  }
-  assert(current_topic >= 0);
-  assert(current_max >= 0);
-  return std::make_pair(current_topic, current_max);
-}
-
-std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc, const Term& term) const {
-  //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
-  // collect probs
-  F current_max=0.0;
-  int current_topic=-1;
-  for (int k=0; k<m_num_topics; ++k) {
-    F p_w_k = prob(term, k);
-
-    F topic_prob = m_topic_p0;
-    if (m_use_topic_pyp)
-      topic_prob = m_topic_pyp.prob(k, m_topic_p0);
-
-    F p_k_d = 0;
-    if (doc < 0) p_k_d = topic_prob;
-    else         p_k_d = m_document_pyps[doc].prob(k, topic_prob);
-
-    F prob = (p_w_k*p_k_d);
-    if (prob > current_max) {
-      current_max = prob;
-      current_topic = k;
-    }
-  }
-  assert(current_topic >= 0);
-  assert(current_max >= 0);
-  return std::make_pair(current_topic,current_max);
-}
-
-std::ostream& PYPTopics::print_document_topics(std::ostream& out) const {
-  for (CorpusTopics::const_iterator corpusIt=m_corpus_topics.begin();
-       corpusIt != m_corpus_topics.end(); ++corpusIt) {
-    int term_index=0;
-    for (DocumentTopics::const_iterator docIt=corpusIt->begin();
-         docIt != corpusIt->end(); ++docIt, ++term_index) {
-      if (term_index) out << " ";
-      out << *docIt;
-    }
-    out << std::endl;
-  }
-  return out;
-}
-
-std::ostream& PYPTopics::print_topic_terms(std::ostream& out) const {
-  for (PYPs::const_iterator pypsIt=m_word_pyps.front().begin();
-       pypsIt != m_word_pyps.front().end(); ++pypsIt) {
-    int term_index=0;
-    for (PYP<int>::const_iterator termIt=pypsIt->begin();
-         termIt != pypsIt->end(); ++termIt, ++term_index) {
-      if (term_index) out << " ";
-      out << termIt->first << ":" << termIt->second;
-    }
-    out << std::endl;
-  }
-  return out;
-}
