#include #include #include #include "contexts_corpus.hh" #include "gzstream.hh" #include "contexts_lexer.h" #include using namespace std; ////////////////////////////////////////////////// // ContextsCorpus ////////////////////////////////////////////////// void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { assert(new_contexts.contexts.size() == new_contexts.counts.size()); boost::tuple* >* extra_pair = static_cast< boost::tuple* >* >(extra); ContextsCorpus* corpus_ptr = extra_pair->get<0>(); BackoffGenerator* backoff_gen = extra_pair->get<1>(); map* counts = extra_pair->get<2>(); Document* doc(new Document()); //cout << "READ: " << new_contexts.phrase << "\t"; for (int i=0; i < new_contexts.contexts.size(); ++i) { int cache_word_count = corpus_ptr->m_dict.max(); string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]); // filter out singleton contexts if (!counts->empty()) { map::const_iterator find_it = counts->find(context_str); if (find_it == counts->end() || find_it->second < 5) continue; } WordID id = corpus_ptr->m_dict.Convert(context_str); if (cache_word_count != corpus_ptr->m_dict.max()) { corpus_ptr->m_backoff->terms_at_level(0)++; corpus_ptr->m_num_types++; } int count = new_contexts.counts[i]; for (int j=0; jpush_back(id); corpus_ptr->m_num_terms += count; // generate the backoff map if (backoff_gen) { int order = 1; WordID backoff_id = id; ContextsLexer::Context backedoff_context = new_contexts.contexts[i]; while (true) { if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) { //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to "; backedoff_context = (*backoff_gen)(backedoff_context); if (backedoff_context.empty()) { //cerr << "Nothing." << endl; (*corpus_ptr->m_backoff)[backoff_id] = -1; break; } if (++order > corpus_ptr->m_backoff->order()) corpus_ptr->m_backoff->order(order); int cache_word_count = corpus_ptr->m_dict.max(); int new_backoff_id = corpus_ptr->m_dict.Convert(backedoff_context); if (cache_word_count != corpus_ptr->m_dict.max()) corpus_ptr->m_backoff->terms_at_level(order-1)++; //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl; backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id); } else break; } } //cout << context_str << " (" << id << ") ||| C=" << count << " ||| "; } //cout << endl; if (!doc->empty()) { corpus_ptr->m_documents.push_back(doc); corpus_ptr->m_keys.push_back(new_contexts.phrase); } } void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { assert(new_contexts.contexts.size() == new_contexts.counts.size()); map* context_counts = (static_cast*>(extra)); for (int i=0; i < new_contexts.contexts.size(); ++i) { int count = new_contexts.counts[i]; pair::iterator,bool> result = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count)); if (!result.second) result.first->second += count; } } unsigned ContextsCorpus::read_contexts(const string &filename, BackoffGenerator* backoff_gen_ptr, bool filter_singeltons) { map counts; if (filter_singeltons) { cerr << "--- Filtering singleton contexts ---" << endl; igzstream in(filename.c_str()); ContextsLexer::ReadContexts(&in, filter_callback, &counts); } m_num_terms = 0; m_num_types = 0; igzstream in(filename.c_str()); boost::tuple* > extra_pair(this,backoff_gen_ptr,&counts); ContextsLexer::ReadContexts(&in, read_callback, &extra_pair); //m_num_types = m_dict.max(); cerr << "Read backoff with order " << m_backoff->order() << "\n"; for (int o=0; oorder(); o++) cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl; cerr << endl; return m_documents.size(); }