diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
commit | 9339c80d465545aec5a6dccfef7c83ca715bf11f (patch) | |
tree | 64c56d558331edad1db3832018c80e799551c39a /gi/pyp-topics/src/contexts_corpus.cc | |
parent | 438dac41810b7c69fa10203ac5130d20efa2da9f (diff) | |
parent | afd7da3b2338661657ad0c4e9eec681e014d37bf (diff) |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/pyp-topics/src/contexts_corpus.cc')
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.cc | 164 |
1 files changed, 0 insertions, 164 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc deleted file mode 100644 index 92b1b34c..00000000 --- a/gi/pyp-topics/src/contexts_corpus.cc +++ /dev/null @@ -1,164 +0,0 @@ -#include <sstream> -#include <iostream> -#include <set> - -#include "contexts_corpus.hh" -#include "gzstream.hh" -#include "contexts_lexer.h" - -#include <boost/tuple/tuple.hpp> - - -using namespace std; - -////////////////////////////////////////////////// -// ContextsCorpus -////////////////////////////////////////////////// - -bool read_callback_binary_contexts = false; - -void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { - assert(new_contexts.contexts.size() == new_contexts.counts.size()); - - boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* extra_pair - = static_cast< boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* >(extra); - - ContextsCorpus* corpus_ptr = extra_pair->get<0>(); - BackoffGenerator* backoff_gen = extra_pair->get<1>(); - //map<string,int>* counts = extra_pair->get<2>(); - - Document* doc(new Document()); - - //cout << "READ: " << new_contexts.phrase << "\t"; - for (int i=0; i < (int)new_contexts.counts.size(); ++i) { - int cache_word_count = corpus_ptr->m_dict.max(); - - //string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]); - int context_index = new_contexts.counts.at(i).first; - string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[context_index]); - - // filter out singleton contexts - //if (!counts->empty()) { - // map<string,int>::const_iterator find_it = counts->find(context_str); - // if (find_it == counts->end() || find_it->second < 2) - // continue; - //} - - WordID id = corpus_ptr->m_dict.Convert(context_str); - if (cache_word_count != corpus_ptr->m_dict.max()) { - corpus_ptr->m_backoff->terms_at_level(0)++; - corpus_ptr->m_num_types++; - } - - //int count = new_contexts.counts[i]; - int count = new_contexts.counts.at(i).second; - if (read_callback_binary_contexts) { - doc->push_back(id); - corpus_ptr->m_num_terms++; - } - else { - for (int j=0; j<count; ++j) - doc->push_back(id); - corpus_ptr->m_num_terms += count; - } - - // generate the backoff map - if (backoff_gen) { - int order = 1; - WordID backoff_id = id; - //ContextsLexer::Context backedoff_context = new_contexts.contexts[i]; - ContextsLexer::Context backedoff_context = new_contexts.contexts[context_index]; - while (true) { - if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) { - //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to "; - backedoff_context = (*backoff_gen)(backedoff_context); - - if (backedoff_context.empty()) { - //cerr << "Nothing." << endl; - (*corpus_ptr->m_backoff)[backoff_id] = -1; - break; - } - - if (++order > corpus_ptr->m_backoff->order()) - corpus_ptr->m_backoff->order(order); - - int cache_word_count = corpus_ptr->m_dict.max(); - int new_backoff_id = corpus_ptr->m_dict.Convert(backedoff_context); - if (cache_word_count != corpus_ptr->m_dict.max()) - corpus_ptr->m_backoff->terms_at_level(order-1)++; - - //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl; - - backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id); - } - else break; - } - } - //cout << context_str << " (" << id << ") ||| C=" << count << " ||| "; - } - //cout << endl; - - //if (!doc->empty()) { - corpus_ptr->m_documents.push_back(doc); - corpus_ptr->m_keys.push_back(new_contexts.phrase); - //} -} - -void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { - assert(new_contexts.contexts.size() == new_contexts.counts.size()); - - map<string,int>* context_counts = (static_cast<map<string,int>*>(extra)); - - for (int i=0; i < (int)new_contexts.counts.size(); ++i) { - int context_index = new_contexts.counts.at(i).first; - int count = new_contexts.counts.at(i).second; - //if (read_callback_binary_contexts) count = 1; - //int count = new_contexts.counts[i]; - pair<map<string,int>::iterator,bool> result - = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count)); - //= context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count)); - if (!result.second) - result.first->second += count; - } -} - - -unsigned ContextsCorpus::read_contexts(const string &filename, - BackoffGenerator* backoff_gen_ptr, - bool /*filter_singeltons*/, - bool binary_contexts) { - read_callback_binary_contexts = binary_contexts; - - map<string,int> counts; - //if (filter_singeltons) - { - // cerr << "--- Filtering singleton contexts ---" << endl; - - igzstream in(filename.c_str()); - ContextsLexer::ReadContexts(&in, filter_callback, &counts); - } - - m_num_terms = 0; - m_num_types = 0; - - igzstream in(filename.c_str()); - boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* > extra_pair(this,backoff_gen_ptr,&counts); - ContextsLexer::ReadContexts(&in, read_callback, &extra_pair); - - //m_num_types = m_dict.max(); - - cerr << "Read backoff with order " << m_backoff->order() << "\n"; - for (int o=0; o<m_backoff->order(); o++) - cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl; - //cerr << endl; - - int i=0; double av_freq=0; - for (map<string,int>::const_iterator it=counts.begin(); it != counts.end(); ++it, ++i) { - WordID id = m_dict.Convert(it->first); - m_context_counts[id] = it->second; - av_freq += it->second; - } - cerr << " Average term frequency = " << av_freq / (double) i << endl; - - return m_documents.size(); -} |