summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/contexts_corpus.cc
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pyp-topics/src/contexts_corpus.cc')
-rw-r--r--gi/pyp-topics/src/contexts_corpus.cc164
1 files changed, 0 insertions, 164 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
deleted file mode 100644
index 92b1b34c..00000000
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <set>
-
-#include "contexts_corpus.hh"
-#include "gzstream.hh"
-#include "contexts_lexer.h"
-
-#include <boost/tuple/tuple.hpp>
-
-
-using namespace std;
-
-//////////////////////////////////////////////////
-// ContextsCorpus
-//////////////////////////////////////////////////
-
-bool read_callback_binary_contexts = false;
-
-void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
- assert(new_contexts.contexts.size() == new_contexts.counts.size());
-
- boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* extra_pair
- = static_cast< boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* >(extra);
-
- ContextsCorpus* corpus_ptr = extra_pair->get<0>();
- BackoffGenerator* backoff_gen = extra_pair->get<1>();
- //map<string,int>* counts = extra_pair->get<2>();
-
- Document* doc(new Document());
-
- //cout << "READ: " << new_contexts.phrase << "\t";
- for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
- int cache_word_count = corpus_ptr->m_dict.max();
-
- //string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]);
- int context_index = new_contexts.counts.at(i).first;
- string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[context_index]);
-
- // filter out singleton contexts
- //if (!counts->empty()) {
- // map<string,int>::const_iterator find_it = counts->find(context_str);
- // if (find_it == counts->end() || find_it->second < 2)
- // continue;
- //}
-
- WordID id = corpus_ptr->m_dict.Convert(context_str);
- if (cache_word_count != corpus_ptr->m_dict.max()) {
- corpus_ptr->m_backoff->terms_at_level(0)++;
- corpus_ptr->m_num_types++;
- }
-
- //int count = new_contexts.counts[i];
- int count = new_contexts.counts.at(i).second;
- if (read_callback_binary_contexts) {
- doc->push_back(id);
- corpus_ptr->m_num_terms++;
- }
- else {
- for (int j=0; j<count; ++j)
- doc->push_back(id);
- corpus_ptr->m_num_terms += count;
- }
-
- // generate the backoff map
- if (backoff_gen) {
- int order = 1;
- WordID backoff_id = id;
- //ContextsLexer::Context backedoff_context = new_contexts.contexts[i];
- ContextsLexer::Context backedoff_context = new_contexts.contexts[context_index];
- while (true) {
- if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) {
- //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to ";
- backedoff_context = (*backoff_gen)(backedoff_context);
-
- if (backedoff_context.empty()) {
- //cerr << "Nothing." << endl;
- (*corpus_ptr->m_backoff)[backoff_id] = -1;
- break;
- }
-
- if (++order > corpus_ptr->m_backoff->order())
- corpus_ptr->m_backoff->order(order);
-
- int cache_word_count = corpus_ptr->m_dict.max();
- int new_backoff_id = corpus_ptr->m_dict.Convert(backedoff_context);
- if (cache_word_count != corpus_ptr->m_dict.max())
- corpus_ptr->m_backoff->terms_at_level(order-1)++;
-
- //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl;
-
- backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id);
- }
- else break;
- }
- }
- //cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
- }
- //cout << endl;
-
- //if (!doc->empty()) {
- corpus_ptr->m_documents.push_back(doc);
- corpus_ptr->m_keys.push_back(new_contexts.phrase);
- //}
-}
-
-void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
- assert(new_contexts.contexts.size() == new_contexts.counts.size());
-
- map<string,int>* context_counts = (static_cast<map<string,int>*>(extra));
-
- for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
- int context_index = new_contexts.counts.at(i).first;
- int count = new_contexts.counts.at(i).second;
- //if (read_callback_binary_contexts) count = 1;
- //int count = new_contexts.counts[i];
- pair<map<string,int>::iterator,bool> result
- = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count));
- //= context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count));
- if (!result.second)
- result.first->second += count;
- }
-}
-
-
-unsigned ContextsCorpus::read_contexts(const string &filename,
- BackoffGenerator* backoff_gen_ptr,
- bool /*filter_singeltons*/,
- bool binary_contexts) {
- read_callback_binary_contexts = binary_contexts;
-
- map<string,int> counts;
- //if (filter_singeltons)
- {
- // cerr << "--- Filtering singleton contexts ---" << endl;
-
- igzstream in(filename.c_str());
- ContextsLexer::ReadContexts(&in, filter_callback, &counts);
- }
-
- m_num_terms = 0;
- m_num_types = 0;
-
- igzstream in(filename.c_str());
- boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* > extra_pair(this,backoff_gen_ptr,&counts);
- ContextsLexer::ReadContexts(&in, read_callback, &extra_pair);
-
- //m_num_types = m_dict.max();
-
- cerr << "Read backoff with order " << m_backoff->order() << "\n";
- for (int o=0; o<m_backoff->order(); o++)
- cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl;
- //cerr << endl;
-
- int i=0; double av_freq=0;
- for (map<string,int>::const_iterator it=counts.begin(); it != counts.end(); ++it, ++i) {
- WordID id = m_dict.Convert(it->first);
- m_context_counts[id] = it->second;
- av_freq += it->second;
- }
- cerr << " Average term frequency = " << av_freq / (double) i << endl;
-
- return m_documents.size();
-}