Added contexts_corpus for reading text data files.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@36 ec762483-ff6d-05da-a07a-a48fb63a330f
author: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 15:01:17 +0000
committer: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 15:01:17 +0000
commit: 1d089b02eff4fa8837faecf99021f624d8845e5d (patch)
tree: b6e3d20094514749c37485e154117871cdc8696f /gi/pyp-topics/src/contexts_corpus.cc
parent: 088725c4708e83343154d1bed9dee18286446eaf (diff)
1 files changed, 56 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
new file mode 100644
index 00000000..0b3ec644
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -0,0 +1,56 @@
+#include <sstream>
+#include <iostream>
+#include <set>
+
+#include "contexts_corpus.hh"
+#include "gzstream.hh"
+#include "contexts_lexer.h"
+
+using namespace std;
+
+//////////////////////////////////////////////////
+// ContextsCorpus
+//////////////////////////////////////////////////
+
+// Lexer callback: turns one phrase's contexts into a Document on the corpus
+// passed through `extra` (must point to a ContextsCorpus). The tokens of each
+// context are joined with "__", mapped to a WordID by m_dict, and that id is
+// appended counts[i] times; m_num_terms grows by the same amount.
+void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
+  assert(new_contexts.contexts.size() == new_contexts.counts.size());
+
+  ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra);
+  Document* doc(new Document());
+
+  for (size_t ctx = 0; ctx < new_contexts.contexts.size(); ++ctx) {
+    // Build the dictionary key: the context's tokens separated by "__".
+    std::string context_str;
+    for (ContextsLexer::Context::const_iterator it=new_contexts.contexts[ctx].begin();
+         it != new_contexts.contexts[ctx].end(); ++it) {
+      if (it != new_contexts.contexts[ctx].begin())
+        context_str += "__";
+      context_str += *it;
+    }
+
+    WordID id = corpus_ptr->m_dict.Convert(context_str);
+    int count = new_contexts.counts[ctx];
+    // Separate index name so the repeat loop never shadows the context index.
+    for (int rep=0; rep<count; ++rep)
+      doc->push_back(id);
+    corpus_ptr->m_num_terms += count;
+  }
+
+  corpus_ptr->m_documents.push_back(doc);
+}
+
+// Resets the term/type counters, feeds the file at `filename` through the
+// contexts lexer, and returns the number of documents held in m_documents.
+unsigned ContextsCorpus::read_contexts(const std::string &filename) {
+  m_num_types = 0;
+  m_num_terms = 0;  // read_callback adds to this for every context it sees
+  igzstream contexts_in(filename.c_str());
+  ContextsLexer::ReadContexts(&contexts_in, read_callback, this);
+
+  m_num_types = m_dict.max();  // taken from m_dict after the lexer has run
+  return m_documents.size();
+}
author	philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-28 15:01:17 +0000
committer	philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-28 15:01:17 +0000
commit	1d089b02eff4fa8837faecf99021f624d8845e5d (patch)
tree	b6e3d20094514749c37485e154117871cdc8696f /gi/pyp-topics/src/contexts_corpus.cc
parent	088725c4708e83343154d1bed9dee18286446eaf (diff)