diff options
Diffstat (limited to 'gi/pyp-topics/src/corpus.cc')
-rw-r--r-- | gi/pyp-topics/src/corpus.cc | 104 |
1 files changed, 0 insertions, 104 deletions
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc deleted file mode 100644 index f182381f..00000000 --- a/gi/pyp-topics/src/corpus.cc +++ /dev/null @@ -1,104 +0,0 @@ -#include <sstream> -#include <iostream> -#include <set> - -#include "corpus.hh" -#include "gzstream.hh" - -using namespace std; - -////////////////////////////////////////////////// -// Corpus -////////////////////////////////////////////////// - -Corpus::Corpus() : m_num_terms(0), m_num_types(0) {} - -unsigned Corpus::read(const std::string &filename) { - m_num_terms = 0; - m_num_types = 0; - std::set<int> seen_types; - - igzstream in(filename.c_str()); - - string buf; - int token; - unsigned doc_count=0; - while (getline(in, buf)) { - Document* doc(new Document()); - istringstream ss(buf); - - ss >> token; // the number of unique terms - - char delimeter; - int count; - while(ss >> token >> delimeter >> count) { - for (int i=0; i<count; ++i) - doc->push_back(token); - m_num_terms += count; - seen_types.insert(token); - } - - m_documents.push_back(doc); - doc_count++; - } - - m_num_types = seen_types.size(); - - return doc_count; -} - - -////////////////////////////////////////////////// -// TestCorpus -////////////////////////////////////////////////// - -TestCorpus::TestCorpus() {} - -void TestCorpus::read(const std::string &filename) { - igzstream in(filename.c_str()); - - string buf; - Term term; - DocumentId doc; - char delimeter; - while (getline(in, buf)) { - DocumentTerms* line(new DocumentTerms()); - istringstream ss(buf); - - while(ss >> doc >> delimeter >> term) - line->push_back(DocumentTerm(doc, term)); - - m_lines.push_back(line); - } -} - -////////////////////////////////////////////////// -// TermBackoff -////////////////////////////////////////////////// - -void TermBackoff::read(const std::string &filename) { - igzstream in(filename.c_str()); - - string buf; - int num_terms; - getline(in, buf); - istringstream ss(buf); - ss >> num_terms >> m_backoff_order; - - m_dict.resize(num_terms, -1); - for (int i=0; i<m_backoff_order; ++i) { - int count; ss >> count; - m_terms_at_order.push_back(count); - } - - Term term, backoff; - while (getline(in, buf)) { - istringstream ss(buf); - ss >> term >> backoff; - - assert(term < num_terms); - assert(term >= 0); - - m_dict[term] = backoff; - } -} |