summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/corpus.cc
blob: f182381f473cf5f64355a6dd328817941ddaffa9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include <sstream>
#include <iostream>
#include <set>

#include "corpus.hh"
#include "gzstream.hh"

using namespace std;

//////////////////////////////////////////////////
// Corpus
//////////////////////////////////////////////////

// Construct an empty corpus: the running term count and the distinct-type
// count both start at zero.
Corpus::Corpus()
  : m_num_terms(0),
    m_num_types(0)
{
}

// Load a corpus from `filename` (opened through igzstream).
//
// Each input line is one document, laid out as
//   <header> <term><sep><count> <term><sep><count> ...
// where <header> is the document's number of unique terms (read and
// discarded), <sep> is any single non-whitespace character, and each term is
// appended to the document <count> times.
//
// m_num_terms and m_num_types are reset and recomputed from this file alone;
// the parsed documents are appended to m_documents (presumably an owning
// pointer container -- confirm in corpus.hh).
//
// Returns the number of lines (documents) read, which is 0 when the stream
// yields no lines at all.
unsigned Corpus::read(const std::string &filename) {
  m_num_terms = 0;
  m_num_types = 0;
  std::set<int> seen_types;

  igzstream in(filename.c_str());

  string buf;
  unsigned doc_count = 0;
  while (getline(in, buf)) {
    Document* doc(new Document());
    istringstream ss(buf);

    // Leading header field: the number of unique terms. It is consumed so
    // the pairs that follow line up, but its value is not used.
    int num_unique = 0;
    ss >> num_unique;

    int token;
    char delimiter;
    int count;
    while (ss >> token >> delimiter >> count) {
      for (int i = 0; i < count; ++i)
        doc->push_back(token);

      // A non-positive count contributes no tokens to the document above, so
      // it must not shrink (or wrap) the running total either.
      if (count > 0)
        m_num_terms += count;

      seen_types.insert(token);
    }

    m_documents.push_back(doc);
    ++doc_count;
  }

  m_num_types = seen_types.size();

  return doc_count;
}


//////////////////////////////////////////////////
// TestCorpus
//////////////////////////////////////////////////

// Default constructor: performs no explicit member initialisation.
TestCorpus::TestCorpus()
{
}

void TestCorpus::read(const std::string &filename) {
  igzstream in(filename.c_str());

  string buf;
  Term term;
  DocumentId doc;
  char delimeter;
  while (getline(in, buf)) {
    DocumentTerms* line(new DocumentTerms());
    istringstream ss(buf);

    while(ss >> doc >> delimeter >> term)
      line->push_back(DocumentTerm(doc, term));

    m_lines.push_back(line);
  }
}

//////////////////////////////////////////////////
// TermBackoff
//////////////////////////////////////////////////

void TermBackoff::read(const std::string &filename) {
  igzstream in(filename.c_str());

  string buf;
  int num_terms;
  getline(in, buf);
  istringstream ss(buf); 
  ss >> num_terms >> m_backoff_order;

  m_dict.resize(num_terms, -1);
  for (int i=0; i<m_backoff_order; ++i) {
    int count; ss >> count;
    m_terms_at_order.push_back(count);
  }

  Term term, backoff;
  while (getline(in, buf)) {
    istringstream ss(buf);
    ss >> term >> backoff;

    assert(term < num_terms);
    assert(term >= 0);

    m_dict[term] = backoff;
  }
}