author     philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-29 22:38:22 +0000
committer  philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-29 22:38:22 +0000
commit     4e1c6e53e121aeb54df2b3ac825548fa4878fed2 (patch)
tree       707beb9afc0294a39ad9ed7c573ba8b7c0d87ee9 /gi/pyp-topics/src
parent     9f211fce859163cd26b1d8a0b7632cb300b0f67f (diff)
Added singleton filter.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@70 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src')
-rw-r--r--  gi/pyp-topics/src/contexts_corpus.cc  80
-rw-r--r--  gi/pyp-topics/src/contexts_corpus.hh    3
-rw-r--r--  gi/pyp-topics/src/train-contexts.cc     3
3 files changed, 62 insertions, 24 deletions
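
Before the diff itself, here is a minimal standalone sketch of the two-pass singleton filter this commit introduces: a first pass accumulates a corpus-wide count per context string (as filter_callback does below), and a second pass skips any context whose total count is 1 (as the new check in read_callback does). The data and names here are hypothetical and not part of the commit:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical (context string, local count) observations from a corpus.
  std::vector<std::pair<std::string, int> > observations;
  observations.push_back(std::make_pair("the_X_of", 3));
  observations.push_back(std::make_pair("a_X_in",   1));  // corpus-wide singleton
  observations.push_back(std::make_pair("the_X_of", 1));

  // Pass 1: accumulate corpus-wide counts (mirrors filter_callback).
  std::map<std::string, int> counts;
  for (size_t i = 0; i < observations.size(); ++i)
    counts[observations[i].first] += observations[i].second;

  // Pass 2: drop contexts whose total count is 1 (mirrors read_callback).
  for (size_t i = 0; i < observations.size(); ++i) {
    std::map<std::string, int>::const_iterator it = counts.find(observations[i].first);
    if (it == counts.end() || it->second == 1)
      continue;  // filtered: a singleton context
    std::cout << "keep: " << observations[i].first << "\n";
  }
  return 0;
}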
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
index f3d3c92e..e9644d5a 100644
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -6,6 +6,9 @@
#include "gzstream.hh"
#include "contexts_lexer.h"
+#include <boost/tuple/tuple.hpp>
+
+
using namespace std;
//////////////////////////////////////////////////
@@ -15,18 +18,28 @@ using namespace std;
void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
assert(new_contexts.contexts.size() == new_contexts.counts.size());
- std::pair<ContextsCorpus*, BackoffGenerator*>* extra_pair
- = static_cast< std::pair<ContextsCorpus*, BackoffGenerator*>* >(extra);
+ boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* extra_pair
+ = static_cast< boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* >* >(extra);
- ContextsCorpus* corpus_ptr = extra_pair->first;
- BackoffGenerator* backoff_gen = extra_pair->second;
+ ContextsCorpus* corpus_ptr = extra_pair->get<0>();
+ BackoffGenerator* backoff_gen = extra_pair->get<1>();
+ map<string,int>* counts = extra_pair->get<2>();
Document* doc(new Document());
- //std::cout << "READ: " << new_contexts.phrase << "\t";
+ //cout << "READ: " << new_contexts.phrase << "\t";
for (int i=0; i < new_contexts.contexts.size(); ++i) {
int cache_word_count = corpus_ptr->m_dict.max();
- WordID id = corpus_ptr->m_dict.Convert(new_contexts.contexts[i]);
+ string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]);
+
+ // filter out singleton contexts
+ if (!counts->empty()) {
+ map<string,int>::const_iterator find_it = counts->find(context_str);
+ if (find_it == counts->end() || find_it->second == 1)
+ continue;
+ }
+
+ WordID id = corpus_ptr->m_dict.Convert(context_str);
if (cache_word_count != corpus_ptr->m_dict.max()) {
corpus_ptr->m_backoff->terms_at_level(0)++;
corpus_ptr->m_num_types++;
@@ -44,11 +57,11 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*
ContextsLexer::Context backedoff_context = new_contexts.contexts[i];
while (true) {
if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) {
- //std::cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to ";
+ //cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to ";
backedoff_context = (*backoff_gen)(backedoff_context);
if (backedoff_context.empty()) {
- //std::cerr << "Nothing." << std::endl;
+ //cerr << "Nothing." << endl;
(*corpus_ptr->m_backoff)[backoff_id] = -1;
break;
}
@@ -61,38 +74,61 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*
if (cache_word_count != corpus_ptr->m_dict.max())
corpus_ptr->m_backoff->terms_at_level(order-1)++;
- //std::cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << std::endl;
+ //cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << endl;
backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id);
}
else break;
}
}
- //std::cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
+ //cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
}
- //std::cout << std::endl;
+ //cout << endl;
- corpus_ptr->m_documents.push_back(doc);
- corpus_ptr->m_keys.push_back(new_contexts.phrase);
+ if (!doc->empty()) {
+ corpus_ptr->m_documents.push_back(doc);
+ corpus_ptr->m_keys.push_back(new_contexts.phrase);
+ }
}
-unsigned ContextsCorpus::read_contexts(const std::string &filename,
- BackoffGenerator* backoff_gen_ptr) {
+void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
+ assert(new_contexts.contexts.size() == new_contexts.counts.size());
+
+ map<string,int>* context_counts = (static_cast<map<string,int>*>(extra));
+
+ for (int i=0; i < new_contexts.contexts.size(); ++i) {
+ int count = new_contexts.counts[i];
+ pair<map<string,int>::iterator,bool> result
+ = context_counts->insert(make_pair(Dict::toString(new_contexts.contexts[i]),count));
+ if (!result.second)
+ result.first->second += count;
+ }
+}
+
+
+unsigned ContextsCorpus::read_contexts(const string &filename,
+ BackoffGenerator* backoff_gen_ptr,
+ bool filter_singletons) {
+ map<string,int> counts;
+ if (filter_singletons) {
+ cerr << "--- Filtering singleton contexts ---" << endl;
+ igzstream in(filename.c_str());
+ ContextsLexer::ReadContexts(&in, filter_callback, &counts);
+ }
+
m_num_terms = 0;
m_num_types = 0;
igzstream in(filename.c_str());
- std::pair<ContextsCorpus*, BackoffGenerator*> extra_pair(this,backoff_gen_ptr);
- ContextsLexer::ReadContexts(&in,
- read_callback,
- &extra_pair);
+ boost::tuple<ContextsCorpus*, BackoffGenerator*, map<string,int>* > extra_pair(this,backoff_gen_ptr,&counts);
+ ContextsLexer::ReadContexts(&in, read_callback, &extra_pair);
//m_num_types = m_dict.max();
- std::cerr << "Read backoff with order " << m_backoff->order() << "\n";
+ cerr << "Read backoff with order " << m_backoff->order() << "\n";
for (int o=0; o<m_backoff->order(); o++)
- std::cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << std::endl;
- std::cerr << std::endl;
+ cerr << " Terms at " << o << " = " << m_backoff->terms_at_level(o) << endl;
+ cerr << endl;
return m_documents.size();
}
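
Design note: when filtering is enabled the input file is read twice, once with filter_callback to build the counts map and once with read_callback to build the corpus; when it is disabled the map stays empty and the filter branch in read_callback is a no-op. A hypothetical caller of the extended API (the SimpleBackoffGenerator name is assumed from the backoff-type option, not guaranteed by this diff):

ContextsCorpus corpus;
BackoffGenerator* backoff_gen = new SimpleBackoffGenerator();  // assumed subclass
unsigned docs = corpus.read_contexts("contexts.txt.gz", backoff_gen,
                                     true /* filter singleton contexts */);
std::cerr << "Read " << docs << " documents." << std::endl;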
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index a55e52f2..891e3a6b 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -53,7 +53,8 @@ public:
virtual ~ContextsCorpus() {}
unsigned read_contexts(const std::string &filename,
- BackoffGenerator* backoff_gen=0);
+ BackoffGenerator* backoff_gen=0,
+ bool filter_singletons=false);
TermBackoffPtr backoff_index() {
return m_backoff;
diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc
index c5ab8734..833565cd 100644
--- a/gi/pyp-topics/src/train-contexts.cc
+++ b/gi/pyp-topics/src/train-contexts.cc
@@ -43,6 +43,7 @@ int main(int argc, char **argv)
("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
("samples,s", value<int>()->default_value(10), "number of sampling passes through the data")
("backoff-type", value<string>(), "backoff type: none|simple")
+ ("filter-singleton-contexts", "filter singleton contexts")
;
store(parse_command_line(argc, argv, cmdline_options), vm);
notify(vm);
@@ -80,7 +81,7 @@ int main(int argc, char **argv)
}
ContextsCorpus contexts_corpus;
- contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen);
+ contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, vm.count("filter-singleton-contexts"));
model.set_backoff(contexts_corpus.backoff_index());
if (backoff_gen)
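
With the new option wired in, a training run that drops singleton contexts might look like this (binary and data file names are assumed from the source file names, not shown on this page):

./train-contexts --data contexts.txt.gz --backoff-type simple \
                 --samples 10 --filter-singleton-contexts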