From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 2 Oct 2012 00:19:43 -0400
Subject: cdec cleanup, remove bayesian stuff, parsing stuff

---
 gi/pyp-topics/src/corpus.cc | 104 --------------------------------------------
 1 file changed, 104 deletions(-)
 delete mode 100644 gi/pyp-topics/src/corpus.cc

(limited to 'gi/pyp-topics/src/corpus.cc')

diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc
deleted file mode 100644
index f182381f..00000000
--- a/gi/pyp-topics/src/corpus.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <set>
-
-#include "corpus.hh"
-#include "gzstream.hh"
-
-using namespace std;
-
-//////////////////////////////////////////////////
-// Corpus
-//////////////////////////////////////////////////
-
-Corpus::Corpus() : m_num_terms(0), m_num_types(0) {}
-
-unsigned Corpus::read(const std::string &filename) {
-  m_num_terms = 0;
-  m_num_types = 0;
-  std::set<int> seen_types;
-
-  igzstream in(filename.c_str());
-
-  string buf;
-  int token;
-  unsigned doc_count=0;
-  while (getline(in, buf)) {
-    Document* doc(new Document());
-    istringstream ss(buf);
-
-    ss >> token; // the number of unique terms
-
-    char delimeter;
-    int count;
-    while(ss >> token >> delimeter >> count) {
-      for (int i=0; i<count; ++i)
-        doc->push_back(token);
-      m_num_terms += count;
-      seen_types.insert(token);
-    }
-
-    m_documents.push_back(doc);
-    doc_count++;
-  }
-
-  m_num_types = seen_types.size();
-
-  return doc_count;
-}
-
-
-//////////////////////////////////////////////////
-// TestCorpus
-//////////////////////////////////////////////////
-
-TestCorpus::TestCorpus() {}
-
-void TestCorpus::read(const std::string &filename) {
-  igzstream in(filename.c_str());
-
-  string buf;
-  Term term;
-  DocumentId doc;
-  char delimeter;
-  while (getline(in, buf)) {
-    DocumentTerms* line(new DocumentTerms());
-    istringstream ss(buf);
-
-    while(ss >> doc >> delimeter >> term)
-      line->push_back(DocumentTerm(doc, term));
-
-    m_lines.push_back(line);
-  }
-}
-
-//////////////////////////////////////////////////
-// TermBackoff
-//////////////////////////////////////////////////
-
-void TermBackoff::read(const std::string &filename) {
-  igzstream in(filename.c_str());
-
-  string buf;
-  int num_terms;
-  getline(in, buf);
-  istringstream ss(buf);
-  ss >> num_terms >> m_backoff_order;
-
-  m_dict.resize(num_terms, -1);
-  for (int i=0; i<m_backoff_order; ++i) {
-    int count; ss >> count;
-    m_terms_at_order.push_back(count);
-  }
-
-  Term term, backoff;
-  while (getline(in, buf)) {
-    istringstream ss(buf);
-    ss >> term >> backoff;
-
-    assert(term < num_terms);
-    assert(term >= 0);
-
-    m_dict[term] = backoff;
-  }
-}
-- 
cgit v1.2.3