Merge branch 'master' of https://github.com/redpony/cdec

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-10-11 14:06:32 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-10-11 14:06:32 -0400
commit: 9339c80d465545aec5a6dccfef7c83ca715bf11f (patch)
tree: 64c56d558331edad1db3832018c80e799551c39a /gi/pyp-topics/src/contexts_corpus.cc
parent: 438dac41810b7c69fa10203ac5130d20efa2da9f (diff)
parent: afd7da3b2338661657ad0c4e9eec681e014d37bf (diff)
1 files changed, 0 insertions, 164 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
deleted file mode 100644
index 92b1b34c..00000000
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <set>
-
-#include "contexts_corpus.hh"
-#include "gzstream.hh"
-#include "contexts_lexer.h"
-
-#include <boost/tuple/tuple.hpp>
-
-
-using namespace std;
-
-//////////////////////////////////////////////////
-// ContextsCorpus
-//////////////////////////////////////////////////
-
-bool read_callback_binary_contexts = false;
-
-void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
-  assert(new_contexts.contexts.size() == new_contexts.counts.size());
-
-  boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* extra_pair
-    = static_cast< boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* >(extra);
-
-  ContextsCorpus* corpus_ptr = extra_pair->get<0>();
-  BackoffGenerator* backoff_gen = extra_pair->get<1>();
-  //map<string,int>* counts = extra_pair->get<2>();
-
-  Document* doc(new Document());
-
-  //cout << "READ: " << new_contexts.phrase << "\t";
-  for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
-    int cache_word_count = corpus_ptr->m_dict.max();
-
-    //string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]);
-    int context_index = new_contexts.counts.at(i).first;
-    string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[context_index]);
-
-    // filter out singleton contexts
-    //if (!counts->empty()) {
-    //  map<string,int>::const_iterator find_it = counts->find(context_str);
-    //  if (find_it == counts->end() || find_it->second < 2)
-    //    continue;
-    //}
-
-    WordID id = corpus_ptr->m_dict.Convert(context_str);
-    if (cache_word_count != corpus_ptr->m_dict.max()) {
-      corpus_ptr->m_backoff->terms_at_level(0)++;
-      corpus_ptr->m_num_types++;
-    }
-
-    //int count = new_contexts.counts[i];
-    int count = new_contexts.counts.at(i).second;
-    if (read_callback_binary_contexts) {
-      doc->push_back(id);
-      corpus_ptr->m_num_terms++;
-    }
-    else {
-      for (int j=0; j<count; ++j)
-        doc->push_back(id);
-      corpus_ptr->m_num_terms += count;
-    }
-
-    // generate the backoff map
-    if (backoff_gen) {
-      int order = 1;
-      WordID backoff_id = id;
-      //ContextsLexer::Context backedoff_context = new_contexts.contexts[i];
-      ContextsLexer::Context backedoff_context = new_contexts.contexts[context_index];
-      while (true) {
-        if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) {
-          //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to ";
-          backedoff_context = (*backoff_gen)(backedoff_context);
-
-          if (backedoff_context.empty()) {
-            //cerr << "Nothing." << endl;
-            (*corpus_ptr->m_backoff)[backoff_id] = -1;
-            break;
-          }
-
-          if (++order > corpus_ptr->m_backoff->order())
-            corpus_ptr->m_backoff->order(order);
-
-          int cache_word_count = corpus_ptr->m_dict.max();
-          int new_backoff_id = corpus_ptr->m_dict.Convert(backedoff_context);
-          if (cache_word_count != corpus_ptr->m_dict.max())
-            corpus_ptr->m_backoff->terms_at_level(order-1)++;
-
-          //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl;
-
-          backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id);
-        }
-        else break;
-      }
-    }
-    //cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
-  }
-  //cout << endl;
-
-  //if (!doc->empty()) {
-    corpus_ptr->m_documents.push_back(doc);
-    corpus_ptr->m_keys.push_back(new_contexts.phrase);
-  //}
-}
-
-void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
-  assert(new_contexts.contexts.size() == new_contexts.counts.size());
-
-  map<string,int>* context_counts = (static_cast<map<string,int>*>(extra));
-
-  for (int i=0; i < (int)new_contexts.counts.size(); ++i) {
-    int context_index = new_contexts.counts.at(i).first;
-    int count = new_contexts.counts.at(i).second;
-    //if (read_callback_binary_contexts) count = 1;
-    //int count = new_contexts.counts[i];
-    pair<map<string,int>::iterator,bool> result 
-      = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[context_index]),count));
-      //= context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count));
-    if (!result.second)
-      result.first->second += count;
-  }
-}
-
-
-unsigned ContextsCorpus::read_contexts(const string &filename, 
-                                       BackoffGenerator* backoff_gen_ptr,
-                                       bool /*filter_singeltons*/,
-                                       bool binary_contexts) {
-  read_callback_binary_contexts = binary_contexts;
-
-  map<string,int> counts;
-  //if (filter_singeltons) 
-  {
-  //  cerr << "--- Filtering singleton contexts ---" << endl;
-
-    igzstream in(filename.c_str());
-    ContextsLexer::ReadContexts(&in, filter_callback, &counts);
-  }
-
-  m_num_terms = 0;
-  m_num_types = 0;
-
-  igzstream in(filename.c_str());
-  boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* > extra_pair(this,backoff_gen_ptr,&counts);
-  ContextsLexer::ReadContexts(&in, read_callback, &extra_pair);
-
-  //m_num_types = m_dict.max();
-
-  cerr << "Read backoff with order " << m_backoff->order() << "\n";
-  for (int o=0; o<m_backoff->order(); o++)
-    cerr << "  Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl;
-  //cerr << endl;
-
-  int i=0; double av_freq=0;
-  for (map<string,int>::const_iterator it=counts.begin(); it != counts.end(); ++it, ++i) {
-    WordID id = m_dict.Convert(it->first);
-    m_context_counts[id] = it->second;
-    av_freq += it->second;
-  }
-  cerr << "  Average term frequency = " << av_freq / (double) i << endl;
-
-  return m_documents.size();
-}
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-10-11 14:06:32 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-10-11 14:06:32 -0400
commit	9339c80d465545aec5a6dccfef7c83ca715bf11f (patch)
tree	64c56d558331edad1db3832018c80e799551c39a /gi/pyp-topics/src/contexts_corpus.cc
parent	438dac41810b7c69fa10203ac5130d20efa2da9f (diff)
parent	afd7da3b2338661657ad0c4e9eec681e014d37bf (diff)