blob: f182381f473cf5f64355a6dd328817941ddaffa9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#include <sstream>
#include <iostream>
#include <set>
#include "corpus.hh"
#include "gzstream.hh"
using namespace std;
//////////////////////////////////////////////////
// Corpus
//////////////////////////////////////////////////
// Constructs a corpus with both the term counter and the type counter zeroed.
Corpus::Corpus()
  : m_num_terms(0),
    m_num_types(0) {
}
unsigned Corpus::read(const std::string &filename) {
m_num_terms = 0;
m_num_types = 0;
std::set<int> seen_types;
igzstream in(filename.c_str());
string buf;
int token;
unsigned doc_count=0;
while (getline(in, buf)) {
Document* doc(new Document());
istringstream ss(buf);
ss >> token; // the number of unique terms
char delimeter;
int count;
while(ss >> token >> delimeter >> count) {
for (int i=0; i<count; ++i)
doc->push_back(token);
m_num_terms += count;
seen_types.insert(token);
}
m_documents.push_back(doc);
doc_count++;
}
m_num_types = seen_types.size();
return doc_count;
}
//////////////////////////////////////////////////
// TestCorpus
//////////////////////////////////////////////////
// Default constructor: performs no explicit initialisation; members are left
// to their own default initialisation.
TestCorpus::TestCorpus() {
}
void TestCorpus::read(const std::string &filename) {
igzstream in(filename.c_str());
string buf;
Term term;
DocumentId doc;
char delimeter;
while (getline(in, buf)) {
DocumentTerms* line(new DocumentTerms());
istringstream ss(buf);
while(ss >> doc >> delimeter >> term)
line->push_back(DocumentTerm(doc, term));
m_lines.push_back(line);
}
}
//////////////////////////////////////////////////
// TermBackoff
//////////////////////////////////////////////////
void TermBackoff::read(const std::string &filename) {
igzstream in(filename.c_str());
string buf;
int num_terms;
getline(in, buf);
istringstream ss(buf);
ss >> num_terms >> m_backoff_order;
m_dict.resize(num_terms, -1);
for (int i=0; i<m_backoff_order; ++i) {
int count; ss >> count;
m_terms_at_order.push_back(count);
}
Term term, backoff;
while (getline(in, buf)) {
istringstream ss(buf);
ss >> term >> backoff;
assert(term < num_terms);
assert(term >= 0);
m_dict[term] = backoff;
}
}
|