Debugged hierarchical backoff model.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@48 ec762483-ff6d-05da-a07a-a48fb63a330f
author: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-29 04:28:03 +0000
committer: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-29 04:28:03 +0000
commit: c2d0652b798b2b23744b8c81f0f1abf8d7eeed2f (patch)
tree: 2c2d137dda292221c7c0e4a5f1d7204d5c6bc68f /gi/pyp-topics/src/contexts_corpus.cc
parent: ebd00f59aab18446051f9838d3d08427b242b435 (diff)
1 files changed, 55 insertions, 14 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
index 0b3ec644..afa1e19a 100644
--- a/gi/pyp-topics/src/contexts_corpus.cc
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -15,27 +15,59 @@ using namespace std;
 void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
   assert(new_contexts.contexts.size() == new_contexts.counts.size());
 
-  ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra);
+  std::pair<ContextsCorpus*, BackoffGenerator*>* extra_pair
+    = static_cast< std::pair<ContextsCorpus*, BackoffGenerator*>* >(extra);
+
+  ContextsCorpus* corpus_ptr = extra_pair->first;
+  BackoffGenerator* backoff_gen = extra_pair->second;
+
   Document* doc(new Document());
 
   //std::cout << "READ: " << new_contexts.phrase << "\t";
-
   for (int i=0; i < new_contexts.contexts.size(); ++i) {
-    std::string context_str = "";
-    for (ContextsLexer::Context::const_iterator it=new_contexts.contexts[i].begin();
-         it != new_contexts.contexts[i].end(); ++it) {
-      //std::cout << *it << " ";
-      if (it != new_contexts.contexts[i].begin())
-        context_str += "__";
-      context_str += *it;
+    int cache_word_count = corpus_ptr->m_dict.max();
+    WordID id = corpus_ptr->m_dict.Convert(new_contexts.contexts[i]);
+    if (cache_word_count != corpus_ptr->m_dict.max()) {
+      corpus_ptr->m_backoff->terms_at_level(0)++;
+      corpus_ptr->m_num_types++;
     }
 
-    WordID id = corpus_ptr->m_dict.Convert(context_str);
     int count = new_contexts.counts[i];
-    for (int i=0; i<count; ++i)
+    for (int j=0; j<count; ++j)
       doc->push_back(id);
     corpus_ptr->m_num_terms += count;
 
+    // generate the backoff map
+    if (backoff_gen) {
+      int order = 1;
+      WordID backoff_id = id;
+      ContextsLexer::Context backedoff_context = new_contexts.contexts[i];
+      while (true) {
+        if (!corpus_ptr->m_backoff->has_backoff(backoff_id)) {
+          //std::cerr << "Backing off from " << corpus_ptr->m_dict.Convert(backoff_id) << " to ";
+          backedoff_context = (*backoff_gen)(backedoff_context);
+
+          if (backedoff_context.empty()) {
+            //std::cerr << "Nothing." << std::endl;
+            (*corpus_ptr->m_backoff)[backoff_id] = -1;
+            break;
+          }
+
+          if (++order > corpus_ptr->m_backoff->order())
+            corpus_ptr->m_backoff->order(order);
+
+          int cache_word_count = corpus_ptr->m_dict.max();
+          int new_backoff_id = corpus_ptr->m_dict.Convert(backedoff_context);
+          if (cache_word_count != corpus_ptr->m_dict.max())
+            corpus_ptr->m_backoff->terms_at_level(order-1)++;
+
+          //std::cerr << corpus_ptr->m_dict.Convert(new_backoff_id) << " ." << std::endl;
+
+          backoff_id = ((*corpus_ptr->m_backoff)[backoff_id] = new_backoff_id);
+        }
+        else break;
+      }
+    }
     //std::cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
   }
   //std::cout << std::endl;
@@ -43,14 +75,23 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*
   corpus_ptr->m_documents.push_back(doc);
 }
 
-unsigned ContextsCorpus::read_contexts(const std::string &filename) {
+unsigned ContextsCorpus::read_contexts(const std::string &filename, 
+                                       BackoffGenerator* backoff_gen_ptr) {
   m_num_terms = 0;
   m_num_types = 0;
 
   igzstream in(filename.c_str());
-  ContextsLexer::ReadContexts(&in, read_callback, this);
+  std::pair<ContextsCorpus*, BackoffGenerator*> extra_pair(this,backoff_gen_ptr);
+  ContextsLexer::ReadContexts(&in, 
+                              read_callback, 
+                              &extra_pair);
+
+  //m_num_types = m_dict.max();
 
-  m_num_types = m_dict.max();
+  std::cerr << "Read backoff with order " << m_backoff->order() << "\n";
+  for (int o=0; o<m_backoff->order(); o++)
+    std::cerr << "  Terms at " << o << " = " << m_backoff->terms_at_level(o) << std::endl;
+  std::cerr << std::endl;
 
   return m_documents.size();
 }
author	philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-29 04:28:03 +0000
committer	philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-29 04:28:03 +0000
commit	c2d0652b798b2b23744b8c81f0f1abf8d7eeed2f (patch)
tree	2c2d137dda292221c7c0e4a5f1d7204d5c6bc68f /gi/pyp-topics/src/contexts_corpus.cc
parent	ebd00f59aab18446051f9838d3d08427b242b435 (diff)