diff options
Diffstat (limited to 'gi/pyp-topics/src/corpus.cc')
-rw-r--r-- | gi/pyp-topics/src/corpus.cc | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc new file mode 100644 index 00000000..93910ea3 --- /dev/null +++ b/gi/pyp-topics/src/corpus.cc @@ -0,0 +1,103 @@ +#include <sstream> +#include <iostream> +#include <set> + +#include "corpus.hh" +#include "gzstream.hh" + +using namespace std; + +////////////////////////////////////////////////// +// Corpus +////////////////////////////////////////////////// + +Corpus::Corpus() {} + +unsigned Corpus::read(const std::string &filename) { + m_num_terms = 0; + m_num_types = 0; + std::set<int> seen_types; + + igzstream in(filename.c_str()); + + string buf; + int token; + unsigned count=0; + while (getline(in, buf)) { + Document* doc(new Document()); + istringstream ss(buf); + + ss >> token; // the number of unique terms + + char delimeter; + int count; + while(ss >> token >> delimeter >> count) { + for (int i=0; i<count; ++i) + doc->push_back(token); + m_num_terms += count; + seen_types.insert(token); + } + + m_documents.push_back(doc); + count++; + } + + m_num_types = seen_types.size(); + + return count; +} + +////////////////////////////////////////////////// +// TestCorpus +////////////////////////////////////////////////// + +TestCorpus::TestCorpus() {} + +void TestCorpus::read(const std::string &filename) { + igzstream in(filename.c_str()); + + string buf; + Term term; + DocumentId doc; + char delimeter; + while (getline(in, buf)) { + DocumentTerms* line(new DocumentTerms()); + istringstream ss(buf); + + while(ss >> doc >> delimeter >> term) + line->push_back(DocumentTerm(doc, term)); + + m_lines.push_back(line); + } +} + +////////////////////////////////////////////////// +// TermBackoff +////////////////////////////////////////////////// + +void TermBackoff::read(const std::string &filename) { + igzstream in(filename.c_str()); + + string buf; + int num_terms; + getline(in, buf); + istringstream ss(buf); + ss >> num_terms >> m_backoff_order; + + m_dict.resize(num_terms, -1); + for (int i=0; i<m_backoff_order; ++i) { + int count; ss >> count; + m_terms_at_order.push_back(count); + } + + Term term, backoff; + while (getline(in, buf)) { + istringstream ss(buf); + ss >> term >> backoff; + + assert(term < num_terms); + assert(term >= 0); + + m_dict[term] = backoff; + } +} |