summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/contexts_corpus.cc
diff options
context:
space:
mode:
authorphilblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-28 15:01:17 +0000
committerphilblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-28 15:01:17 +0000
commit1d089b02eff4fa8837faecf99021f624d8845e5d (patch)
treeb6e3d20094514749c37485e154117871cdc8696f /gi/pyp-topics/src/contexts_corpus.cc
parent088725c4708e83343154d1bed9dee18286446eaf (diff)
Added contexts_corpus for reading text data files.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@36 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/contexts_corpus.cc')
-rw-r--r--gi/pyp-topics/src/contexts_corpus.cc56
1 files changed, 56 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
new file mode 100644
index 00000000..0b3ec644
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -0,0 +1,56 @@
+#include <sstream>
+#include <iostream>
+#include <set>
+
+#include "contexts_corpus.hh"
+#include "gzstream.hh"
+#include "contexts_lexer.h"
+
+using namespace std;
+
+//////////////////////////////////////////////////
+// ContextsCorpus
+//////////////////////////////////////////////////
+
+// Lexer callback: stores one phrase's contexts in the corpus as a new Document.
+// Each context's tokens are joined with "__", converted to a WordID by m_dict,
+// and appended to the document `count` times. `extra` must be the target
+// ContextsCorpus*. NOTE(review): m_documents presumably owns `doc` - confirm.
+void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
+  assert(new_contexts.contexts.size() == new_contexts.counts.size());
+
+  ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra);
+  Document* doc(new Document());
+
+  for (size_t i = 0; i < new_contexts.contexts.size(); ++i) {
+    // Build the dictionary key for this context: its tokens separated by "__".
+    std::string context_str;
+    for (ContextsLexer::Context::const_iterator it = new_contexts.contexts[i].begin();
+         it != new_contexts.contexts[i].end(); ++it) {
+      if (it != new_contexts.contexts[i].begin())
+        context_str += "__";
+      context_str += *it;
+    }
+
+    const WordID id = corpus_ptr->m_dict.Convert(context_str);
+    const int count = new_contexts.counts[i];
+    // The repeat index has its own name so it cannot shadow the outer index `i`.
+    for (int repeat = 0; repeat < count; ++repeat)
+      doc->push_back(id);
+    corpus_ptr->m_num_terms += count;
+  }
+
+  corpus_ptr->m_documents.push_back(doc);
+}
+
+// Reads the contexts file `filename` through an igzstream and hands it to
+// ContextsLexer::ReadContexts with read_callback and this corpus as `extra`.
+// Resets the term/type counters first; returns the size of m_documents.
+unsigned ContextsCorpus::read_contexts(const std::string &filename) {
+  m_num_types = 0;
+  m_num_terms = 0;
+  igzstream contexts_stream(filename.c_str());
+  ContextsLexer::ReadContexts(&contexts_stream, read_callback, this);
+  m_num_types = m_dict.max();  // taken from the dictionary once lexing returns
+  return m_documents.size();
+}