diff options
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.cc | 80 | ||||
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 3 | ||||
-rw-r--r-- | gi/pyp-topics/src/train-contexts.cc | 3 |
3 files changed, 62 insertions, 24 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc index f3d3c92e..e9644d5a 100644 --- a/gi/pyp-topics/src/contexts_corpus.cc +++ b/gi/pyp-topics/src/contexts_corpus.cc @@ -6,6 +6,9 @@ #include "gzstream.hh" #include "contexts_lexer.h" +#include <boost/tuple/tuple.hpp> + + using namespace std; ////////////////////////////////////////////////// @@ -15,18 +18,28 @@ using namespace std; void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { assert(new_contexts.contexts.size() == new_contexts.counts.size()); - std::pair<ContextsCorpus*, BackoffGenerator*>* extra_pair - = static_cast< std::pair<ContextsCorpus*, BackoffGenerator*>* >(extra); + boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* extra_pair + = static_cast< boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* >(extra); - ContextsCorpus* corpus_ptr = extra_pair->first; - BackoffGenerator* backoff_gen = extra_pair->second; + ContextsCorpus* corpus_ptr = extra_pair->get<0>(); + BackoffGenerator* backoff_gen = extra_pair->get<1>(); + map<string,int>* counts = extra_pair->get<2>(); Document* doc(new Document()); - //std::cout << "READ: " << new_contexts.phrase << "\t"; + //cout << "READ: " << new_contexts.phrase << "\t"; for (int i=0; i < new_contexts.contexts.size(); ++i) { int cache_word_count = corpus_ptr->m_dict.max(); - WordID id = corpus_ptr->m_dict.Convert(new_contexts.contexts[i]); + string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]); + + // filter out singleton contexts + if (!counts->empty()) { + map<string,int>::const_iterator find_it = counts->find(context_str); + if (find_it == counts->end() || find_it->second == 1) + continue; + } + + WordID id = corpus_ptr->m_dict.Convert(context_str); if (cache_word_count != corpus_ptr->m_dict.max()) { corpus_ptr->m_backoff->terms_at_level(0)++; corpus_ptr->m_num_types++; @@ -44,11 +57,11 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* ContextsLexer::Context backedoff_context = new_contexts.contexts[i]; while (true) { if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) { - //std::cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to "; + //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to "; backedoff_context = (*backoff_gen)(backedoff_context); if (backedoff_context.empty()) { - //std::cerr << "Nothing." << std::endl; + //cerr << "Nothing." << endl; (*corpus_ptr->m_backoff)[backoff_id] = -1; break; } @@ -61,38 +74,61 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* if (cache_word_count != corpus_ptr->m_dict.max()) corpus_ptr->m_backoff->terms_at_level(order-1)++; - //std::cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << std::endl; + //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl; backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id); } else break; } } - //std::cout << context_str << " (" << id << ") ||| C=" << count << " ||| "; + //cout << context_str << " (" << id << ") ||| C=" << count << " ||| "; } - //std::cout << std::endl; + //cout << endl; - corpus_ptr->m_documents.push_back(doc); - corpus_ptr->m_keys.push_back(new_contexts.phrase); + if (!doc->empty()) { + corpus_ptr->m_documents.push_back(doc); + corpus_ptr->m_keys.push_back(new_contexts.phrase); + } } -unsigned ContextsCorpus::read_contexts(const std::string &filename, - BackoffGenerator* backoff_gen_ptr) { +void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { + assert(new_contexts.contexts.size() == new_contexts.counts.size()); + + map<string,int>* context_counts = (static_cast<map<string,int>*>(extra)); + + for (int i=0; i < new_contexts.contexts.size(); ++i) { + int count = new_contexts.counts[i]; + pair<map<string,int>::iterator,bool> result + = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count)); + if (!result.second) + result.first->second += count; + } +} + + +unsigned ContextsCorpus::read_contexts(const string &filename, + BackoffGenerator* backoff_gen_ptr, + bool filter_singeltons) { + map<string,int> counts; + if (filter_singeltons) { + cerr << "--- Filtering singleton contexts ---" << endl; + igzstream in(filename.c_str()); + ContextsLexer::ReadContexts(&in, filter_callback, &counts); + } + m_num_terms = 0; m_num_types = 0; igzstream in(filename.c_str()); - std::pair<ContextsCorpus*, BackoffGenerator*> extra_pair(this,backoff_gen_ptr); - ContextsLexer::ReadContexts(&in, - read_callback, - &extra_pair); + boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* > extra_pair(this,backoff_gen_ptr,&counts); + ContextsLexer::ReadContexts(&in, read_callback, &extra_pair); //m_num_types = m_dict.max(); - std::cerr << "Read backoff with order " << m_backoff->order() << "\n"; + cerr << "Read backoff with order " << m_backoff->order() << "\n"; for (int o=0; o<m_backoff->order(); o++) - std::cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << std::endl; - std::cerr << std::endl; + cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl; + cerr << endl; return m_documents.size(); } diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index a55e52f2..891e3a6b 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -53,7 +53,8 @@ public: virtual ~ContextsCorpus() {} unsigned read_contexts(const std::string &filename, - BackoffGenerator* backoff_gen=0); + BackoffGenerator* backoff_gen=0, + bool filter_singeltons=false); TermBackoffPtr backoff_index() { return m_backoff; diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc index c5ab8734..833565cd 100644 --- a/gi/pyp-topics/src/train-contexts.cc +++ b/gi/pyp-topics/src/train-contexts.cc @@ -43,6 +43,7 @@ int main(int argc, char **argv) ("topic-words-out,w", value<string>(), "file to write the topic word distribution to") ("samples,s", value<int>()->default_value(10), "number of sampling passes through the data") ("backoff-type", value<string>(), "backoff type: none|simple") + ("filter-singleton-contexts", "filter singleton contexts") ; store(parse_command_line(argc, argv, cmdline_options), vm); notify(vm); @@ -80,7 +81,7 @@ int main(int argc, char **argv) } ContextsCorpus contexts_corpus; - contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen); + contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, vm.count("filter-singleton-contexts")); model.set_backoff(contexts_corpus.backoff_index()); if (backoff_gen) |