summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/corpus.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
committerChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
commite26434979adc33bd949566ba7bf02dff64e80a3e (patch)
treed1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pyp-topics/src/corpus.cc
parent0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pyp-topics/src/corpus.cc')
-rw-r--r--gi/pyp-topics/src/corpus.cc104
1 files changed, 0 insertions, 104 deletions
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc
deleted file mode 100644
index f182381f..00000000
--- a/gi/pyp-topics/src/corpus.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <set>
-
-#include "corpus.hh"
-#include "gzstream.hh"
-
-using namespace std;
-
-//////////////////////////////////////////////////
-// Corpus
-//////////////////////////////////////////////////
-
-Corpus::Corpus() : m_num_terms(0), m_num_types(0) {}
-
-unsigned Corpus::read(const std::string &filename) {
- m_num_terms = 0;
- m_num_types = 0;
- std::set<int> seen_types;
-
- igzstream in(filename.c_str());
-
- string buf;
- int token;
- unsigned doc_count=0;
- while (getline(in, buf)) {
- Document* doc(new Document());
- istringstream ss(buf);
-
- ss >> token; // the number of unique terms
-
- char delimeter;
- int count;
- while(ss >> token >> delimeter >> count) {
- for (int i=0; i<count; ++i)
- doc->push_back(token);
- m_num_terms += count;
- seen_types.insert(token);
- }
-
- m_documents.push_back(doc);
- doc_count++;
- }
-
- m_num_types = seen_types.size();
-
- return doc_count;
-}
-
-
-//////////////////////////////////////////////////
-// TestCorpus
-//////////////////////////////////////////////////
-
-TestCorpus::TestCorpus() {}
-
-void TestCorpus::read(const std::string &filename) {
- igzstream in(filename.c_str());
-
- string buf;
- Term term;
- DocumentId doc;
- char delimeter;
- while (getline(in, buf)) {
- DocumentTerms* line(new DocumentTerms());
- istringstream ss(buf);
-
- while(ss >> doc >> delimeter >> term)
- line->push_back(DocumentTerm(doc, term));
-
- m_lines.push_back(line);
- }
-}
-
-//////////////////////////////////////////////////
-// TermBackoff
-//////////////////////////////////////////////////
-
-void TermBackoff::read(const std::string &filename) {
- igzstream in(filename.c_str());
-
- string buf;
- int num_terms;
- getline(in, buf);
- istringstream ss(buf);
- ss >> num_terms >> m_backoff_order;
-
- m_dict.resize(num_terms, -1);
- for (int i=0; i<m_backoff_order; ++i) {
- int count; ss >> count;
- m_terms_at_order.push_back(count);
- }
-
- Term term, backoff;
- while (getline(in, buf)) {
- istringstream ss(buf);
- ss >> term >> backoff;
-
- assert(term < num_terms);
- assert(term >= 0);
-
- m_dict[term] = backoff;
- }
-}