summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/corpus.hh
blob: 2aa0352737bbbaf6d5965afef44492548de85899 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#ifndef _CORPUS_HH
#define _CORPUS_HH

#include <vector>
#include <string>
#include <map>
#include <limits>

#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>

////////////////////////////////////////////////////////////////
// Corpus
////////////////////////////////////////////////////////////////
typedef int Term;

typedef std::vector<Term> Document;
typedef std::vector<Term> Terms;

class Corpus {
public:
    typedef boost::ptr_vector<Document>::const_iterator const_iterator;

public:
    Corpus();
    virtual ~Corpus() {}

    virtual unsigned read(const std::string &filename);

    const_iterator begin() const { return m_documents.begin(); }
    const_iterator end() const { return m_documents.end(); }

    const Document& at(size_t i) const { return m_documents.at(i); }

    int num_documents() const { return m_documents.size(); }
    int num_terms() const { return m_num_terms; }
    int num_types() const { return m_num_types; }

    virtual int context_count(const int&) const {
      return std::numeric_limits<int>::max();
    }

protected:
    int m_num_terms, m_num_types;
    boost::ptr_vector<Document> m_documents; 
};

typedef int DocumentId;
struct DocumentTerm {
  DocumentTerm(DocumentId d, Term t) : term(t), doc(d) {}
  Term term;
  DocumentId doc;
};
typedef std::vector<DocumentTerm> DocumentTerms;

class TestCorpus {
public:
    typedef boost::ptr_vector<DocumentTerms>::const_iterator const_iterator;

public:
    TestCorpus();
    ~TestCorpus() {}

    void read(const std::string &filename);

    const_iterator begin() const { return m_lines.begin(); }
    const_iterator end() const { return m_lines.end(); }

    int num_instances() const { return m_lines.size(); }

protected:
    boost::ptr_vector<DocumentTerms> m_lines; 
};

class TermBackoff {
public:
    typedef std::vector<Term> dictionary_type;
    typedef dictionary_type::const_iterator const_iterator;
    const static int NullBackoff=-1;

public:
    TermBackoff() { order(1); }
    ~TermBackoff() {}

    void read(const std::string &filename);

    const_iterator begin() const { return m_dict.begin(); }
    const_iterator end() const { return m_dict.end(); }

    const Term& operator[](const Term& t) const {
      assert(t < static_cast<int>(m_dict.size()));
      return m_dict[t];
    }

    Term& operator[](const Term& t) {
      if (t >= static_cast<int>(m_dict.size()))
        m_dict.resize(t+1, -1);
      return m_dict[t];
    }

    bool has_backoff(const Term& t) {
      return t >= 0 && t < static_cast<int>(m_dict.size()) && m_dict[t] >= 0;
    }

    int order() const { return m_backoff_order; }
    void order(int o) { 
      if (o >= (int)m_terms_at_order.size())
        m_terms_at_order.resize(o, 0);
      m_backoff_order = o; 
    }

//    int levels() const { return m_terms_at_order.size(); }
    bool is_null(const Term& term) const { return term < 0; }
    int terms_at_level(int level) const { 
      assert (level < (int)m_terms_at_order.size());
      return m_terms_at_order.at(level);
    }

    int& terms_at_level(int level) { 
      assert (level < (int)m_terms_at_order.size());
      return m_terms_at_order.at(level);
    }

    int size() const { return m_dict.size(); }

protected:
    dictionary_type m_dict; 
    int m_backoff_order;
    std::vector<int> m_terms_at_order;
};
typedef boost::shared_ptr<TermBackoff> TermBackoffPtr;

#endif // _CORPUS_HH