summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/corpus.cc
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pyp-topics/src/corpus.cc')
-rw-r--r--gi/pyp-topics/src/corpus.cc103
1 files changed, 103 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc
new file mode 100644
index 00000000..93910ea3
--- /dev/null
+++ b/gi/pyp-topics/src/corpus.cc
@@ -0,0 +1,103 @@
+#include <sstream>
+#include <iostream>
+#include <set>
+
+#include "corpus.hh"
+#include "gzstream.hh"
+
+using namespace std;
+
+//////////////////////////////////////////////////
+// Corpus
+//////////////////////////////////////////////////
+
+// Default constructor. It does no work of its own: m_num_terms and
+// m_num_types are assigned by read(), not here.
+Corpus::Corpus()
+{
+}
+
+// Reads a corpus from `filename` (opened through igzstream), one document
+// per input line, and appends a newly allocated Document to m_documents for
+// every line read.
+//
+// Line layout, as parsed below:
+//   <n> <term><sep><count> <term><sep><count> ...
+// where <n> is the number of unique terms (read and discarded), <sep> is any
+// single non-whitespace character, and each <term> is pushed onto the
+// document <count> times. Parsing of a line stops at the first entry that
+// does not match this layout.
+//
+// Side effects: m_num_terms is reset and recomputed as the sum of all
+// counts; m_num_types is reset and recomputed as the number of distinct term
+// ids seen in this file.
+//
+// Returns the number of lines (documents) read.
+// NOTE(review): the Document is allocated with new and the raw pointer is
+// handed to m_documents; presumably that container owns it -- confirm
+// against corpus.hh.
+unsigned Corpus::read(const std::string &filename) {
+  m_num_terms = 0;
+  m_num_types = 0;
+  std::set<int> seen_types;
+
+  igzstream in(filename.c_str());
+
+  string buf;
+  unsigned num_documents = 0;
+  while (getline(in, buf)) {
+    Document* doc(new Document());
+    istringstream ss(buf);
+
+    int token = 0;
+    ss >> token; // the number of unique terms; value is not used
+
+    // The per-term frequency deliberately has a different name from the
+    // document counter: when both were called `count`, the inner variable
+    // hid the outer one, the increment at the end of this loop body hit the
+    // wrong variable, and the function always returned 0.
+    char delimiter = 0;
+    int frequency = 0;
+    while (ss >> token >> delimiter >> frequency) {
+      for (int i = 0; i < frequency; ++i)
+        doc->push_back(token);
+      m_num_terms += frequency;
+      seen_types.insert(token);
+    }
+
+    m_documents.push_back(doc);
+    ++num_documents;
+  }
+
+  m_num_types = seen_types.size();
+
+  return num_documents;
+}
+
+//////////////////////////////////////////////////
+// TestCorpus
+//////////////////////////////////////////////////
+
+// Default constructor. It does no work of its own; m_lines is filled by
+// read().
+TestCorpus::TestCorpus()
+{
+}
+
+// Reads a test corpus from `filename` (opened through igzstream).
+//
+// Every input line yields one newly allocated DocumentTerms list, which is
+// appended to m_lines (an empty list for a line with no parsable entries).
+// A line is parsed as whitespace-separated "<doc><sep><term>" entries, where
+// <sep> is any single non-whitespace character; each entry becomes a
+// DocumentTerm(doc, term). Parsing of a line stops at the first entry that
+// does not match.
+void TestCorpus::read(const std::string &filename) {
+  igzstream input(filename.c_str());
+
+  string text;
+  Term term_id;
+  DocumentId doc_id;
+  char separator;
+  while (getline(input, text)) {
+    DocumentTerms* entries = new DocumentTerms();
+    istringstream fields(text);
+
+    for (;;) {
+      fields >> doc_id >> separator >> term_id;
+      if (!fields)
+        break; // no further complete entry on this line
+      entries->push_back(DocumentTerm(doc_id, term_id));
+    }
+
+    m_lines.push_back(entries);
+  }
+}
+
+//////////////////////////////////////////////////
+// TermBackoff
+//////////////////////////////////////////////////
+
+// Loads a term back-off table from `filename` (opened through igzstream).
+//
+// Layout, as parsed below:
+//   first line  : <num_terms> <backoff_order> <count_1> ... <count_K>
+//                 with K == backoff_order
+//   other lines : <term> <backoff>
+//
+// Effects:
+//   - m_backoff_order is set from the header;
+//   - m_dict is resized to num_terms entries, new entries being -1;
+//   - one count per order is appended to m_terms_at_order (0 for any count
+//     missing from the header);
+//   - m_dict[term] = backoff for every well-formed body line.
+//
+// Lines that do not start with two parsable values are skipped. Term ids are
+// range-checked with assert only, so the check vanishes under NDEBUG.
+void TermBackoff::read(const std::string &filename) {
+  igzstream in(filename.c_str());
+
+  string buf;
+  // Zero-initialised so that a missing or malformed header can never hand an
+  // indeterminate size to resize().
+  int num_terms = 0;
+  getline(in, buf);
+  istringstream header(buf);
+  header >> num_terms >> m_backoff_order;
+
+  assert(num_terms >= 0);
+  m_dict.resize(num_terms, -1);
+  for (int i = 0; i < m_backoff_order; ++i) {
+    int count = 0;
+    header >> count;
+    m_terms_at_order.push_back(count);
+  }
+
+  Term term, backoff;
+  while (getline(in, buf)) {
+    istringstream line(buf);
+    // Skip blank or malformed lines. Without this check a failed extraction
+    // would leave stale (or zeroed) values in term/backoff and the
+    // assignment below would silently overwrite an unrelated m_dict entry.
+    if (!(line >> term >> backoff))
+      continue;
+
+    assert(term < num_terms);
+    assert(term >= 0);
+
+    m_dict[term] = backoff;
+  }
+}