From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 2 Oct 2012 00:19:43 -0400
Subject: cdec cleanup, remove bayesian stuff, parsing stuff

---
 gi/pyp-topics/src/contexts_corpus.cc | 164 ----------------------------------
 1 file changed, 164 deletions(-)
 delete mode 100644 gi/pyp-topics/src/contexts_corpus.cc

(limited to 'gi/pyp-topics/src/contexts_corpus.cc')

diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
deleted file mode 100644
index 92b1b34c..00000000
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <set>
-
-#include "contexts_corpus.hh"
-#include "gzstream.hh"
-#include "contexts_lexer.h"
-
-#include <boost/tuple/tuple.hpp>
-
-
-using namespace std;
-
-//////////////////////////////////////////////////
-// ContextsCorpus
-//////////////////////////////////////////////////
-
-bool read_callback_binary_contexts = false;
-
-void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
-  assert(new_contexts.contexts.size() == new_contexts.counts.size());
-
-  boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* extra_pair
-    = static_cast< boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* >(extra);
-
-  ContextsCorpus* corpus_ptr = extra_pair->get<0>();
-  BackoffGenerator* backoff_gen = extra_pair->get<1>();
-  //map<string,int>* counts = extra_pair->get<2>();
-
-  Document* doc(new Document());
-
-  //cout << "READ: " << new_contexts.phrase << "\t";
-  for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
-    int cache_word_count = corpus_ptr->m_dict.max();
-
-    //string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]);
-    int context_index = new_contexts.counts.at(i).first;
-    string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[context_index]);
-
-    // filter out singleton contexts
-    //if (!counts->empty()) {
-    //  map<string,int>::const_iterator find_it = counts->find(context_str);
-    //  if (find_it == counts->end() || find_it->second < 2)
-    //    continue;
-    //}
-
-    WordID id = corpus_ptr->m_dict.Convert(context_str);
-    if (cache_word_count != corpus_ptr->m_dict.max()) {
-      corpus_ptr->m_backoff->terms_at_level(0)++;
-      corpus_ptr->m_num_types++;
-    }
-
-    //int count = new_contexts.counts[i];
-    int count = new_contexts.counts.at(i).second;
-    if (read_callback_binary_contexts) {
-      doc->push_back(id);
-      corpus_ptr->m_num_terms++;
-    }
-    else {
-      for (int j=0; j<count; ++j)
-        doc->push_back(id);
-      corpus_ptr->m_num_terms += count;
-    }
-
-    // generate the backoff map
-    if (backoff_gen) {
-      int order = 1;
-      WordID backoff_id = id;
-      //ContextsLexer::Context backedoff_context = new_contexts.contexts[i];
-      ContextsLexer::Context backedoff_context = new_contexts.contexts[context_index];
-      while (true) {
-        if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) {
-          //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to ";
-          backedoff_context = (*backoff_gen)(backedoff_context);
-
-          if (backedoff_context.empty()) {
-            //cerr << "Nothing." << endl;
-            (*corpus_ptr->m_backoff)[backoff_id] = -1;
-            break;
-          }
-
-          if (++order > corpus_ptr->m_backoff->order())
-            corpus_ptr->m_backoff->order(order);
-
-          int cache_word_count = corpus_ptr->m_dict.max();
-          int new_backoff_id = corpus_ptr->m_dict.Convert(backedoff_context);
-          if (cache_word_count != corpus_ptr->m_dict.max())
-            corpus_ptr->m_backoff->terms_at_level(order-1)++;
-
-          //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl;
-
-          backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id);
-        }
-        else break;
-      }
-    }
-    //cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
-  }
-  //cout << endl;
-
-  //if (!doc->empty()) {
-  corpus_ptr->m_documents.push_back(doc);
-  corpus_ptr->m_keys.push_back(new_contexts.phrase);
-  //}
-}
-
-void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
-  assert(new_contexts.contexts.size() == new_contexts.counts.size());
-
-  map<string,int>* context_counts = (static_cast<map<string,int>*>(extra));
-
-  for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
-    int context_index = new_contexts.counts.at(i).first;
-    int count = new_contexts.counts.at(i).second;
-    //if (read_callback_binary_contexts) count = 1;
-    //int count = new_contexts.counts[i];
-    pair<map<string,int>::iterator,bool> result
-      = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count));
-    //= context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count));
-    if (!result.second)
-      result.first->second += count;
-  }
-}
-
-
-unsigned ContextsCorpus::read_contexts(const string &filename,
-                                       BackoffGenerator* backoff_gen_ptr,
-                                       bool /*filter_singeltons*/,
-                                       bool binary_contexts) {
-  read_callback_binary_contexts = binary_contexts;
-
-  map<string,int> counts;
-  //if (filter_singeltons)
-  {
-    //  cerr << "--- Filtering singleton contexts ---" << endl;
-
-    igzstream in(filename.c_str());
-    ContextsLexer::ReadContexts(&in, filter_callback, &counts);
-  }
-
-  m_num_terms = 0;
-  m_num_types = 0;
-
-  igzstream in(filename.c_str());
-  boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* > extra_pair(this,backoff_gen_ptr,&counts);
-  ContextsLexer::ReadContexts(&in, read_callback, &extra_pair);
-
-  //m_num_types = m_dict.max();
-
-  cerr << "Read backoff with order " << m_backoff->order() << "\n";
-  for (int o=0; o<m_backoff->order(); o++)
-    cerr << "  Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl;
-  //cerr << endl;
-
-  int i=0; double av_freq=0;
-  for (map<string,int>::const_iterator it=counts.begin(); it != counts.end(); ++it, ++i) {
-    WordID id = m_dict.Convert(it->first);
-    m_context_counts[id] = it->second;
-    av_freq += it->second;
-  }
-  cerr << "  Average term frequency = " << av_freq / (double) i << endl;

-  return m_documents.size();
-}
--
cgit v1.2.3
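
For context on the removed file: it is organized around a callback-plus-void* idiom. ContextsLexer::ReadContexts parses each phrase entry and invokes a callback, and the caller threads its own state through the opaque extra pointer, packed into a boost::tuple for read_callback and passed as a bare map<string,int>* for filter_callback. The sketch below illustrates that idiom in isolation; it is not cdec code, and every name in it (read_items, count_callback, CountMap, Extra) is invented for the example.

    #include <iostream>
    #include <map>
    #include <string>
    #include <boost/tuple/tuple.hpp>

    typedef std::map<std::string, int> CountMap;
    // Caller state threaded through the opaque pointer, mirroring the
    // boost::tuple that read_contexts built for read_callback (illustrative).
    typedef boost::tuple<std::string, CountMap*> Extra;

    // Stand-in for a lexer driver like ContextsLexer::ReadContexts: it knows
    // nothing about the caller's types, only the callback signature.
    void read_items(void (*callback)(const std::string&, void*), void* extra) {
      const char* items[] = {"a b", "b c", "a b"};
      for (int i = 0; i < 3; ++i)
        callback(items[i], extra);
    }

    // Mirrors filter_callback: recover typed state from void*, accumulate counts.
    void count_callback(const std::string& item, void* extra) {
      Extra* e = static_cast<Extra*>(extra);
      CountMap& counts = *(e->get<1>());
      counts[item] += 1;
    }

    int main() {
      CountMap counts;
      Extra extra("demo", &counts);
      read_items(count_callback, &extra);
      for (CountMap::const_iterator it = counts.begin(); it != counts.end(); ++it)
        std::cout << it->first << " ||| " << it->second << std::endl;
      return 0;
    }

The cost of the idiom, visible in the deleted code as well, is that the static_cast in the callback must exactly match the type constructed at the call site; a mismatch compiles cleanly and fails only at runtime.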