summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/contexts_corpus.hh
blob: 2527f655374c3d7951d65422486f0454cf1b4898 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#ifndef _CONTEXTS_CORPUS_HH
#define _CONTEXTS_CORPUS_HH

#include <vector>
#include <string>
#include <map>
#include <tr1/unordered_map>

#include <boost/ptr_container/ptr_vector.hpp>

#include "corpus.hh"
#include "contexts_lexer.h"
#include "dict.h"


// Abstract interface for functors that derive one ContextsLexer::Context from
// another. Concrete generators implement operator(); the class name suggests
// the result is the context to back off to (NOTE(review): confirm against the
// code that consumes the generator).
class BackoffGenerator {
public:
  // Generators are handled through BackoffGenerator pointers, so the
  // destructor must be virtual: deleting a derived object through a base
  // pointer is undefined behaviour otherwise.
  virtual ~BackoffGenerator() {}

  // Returns the context derived from c. Implementations in this file return a
  // default-constructed context when they have nothing to derive.
  virtual ContextsLexer::Context
    operator()(const ContextsLexer::Context& c) = 0;

protected:
  // Helper for derived classes: returns a copy of c without its first and
  // last elements. A context with at most one element yields a
  // default-constructed context. Longer contexts are expected to have an odd
  // number of elements, which is checked by the assert in debug builds.
  ContextsLexer::Context strip_edges(const ContextsLexer::Context& c) {
    if (c.size() <= 1) return ContextsLexer::Context();
    assert(c.size() % 2 == 1);
    return ContextsLexer::Context(c.begin() + 1, c.end() - 1);
  }
};

// Generator that never derives anything: whatever context it is given, it
// hands back a default-constructed one.
class NullBackoffGenerator : public BackoffGenerator {
  // The override sits in the class's default (private) section, so it is
  // reached through the BackoffGenerator interface. The argument is ignored.
  virtual ContextsLexer::Context operator()(const ContextsLexer::Context&) {
    return ContextsLexer::Context();
  }
};

// Generator that drops the outermost element on each side of a context.
// Contexts with three or fewer elements produce a default-constructed
// context instead.
class SimpleBackoffGenerator : public BackoffGenerator {
  // The override sits in the class's default (private) section, so it is
  // reached through the BackoffGenerator interface.
  virtual ContextsLexer::Context
    operator()(const ContextsLexer::Context& c) {
      // Only contexts with more than three elements are trimmed.
      if (c.size() > 3)
        return strip_edges(c);
      return ContextsLexer::Context();
    }
};


////////////////////////////////////////////////////////////////
// ContextsCorpus
////////////////////////////////////////////////////////////////

// Corpus specialisation that additionally keeps a dictionary, a list of string
// keys, per-context counts and a backoff index. The members are filled in
// outside this header (read_contexts and the befriended read_callback are
// defined elsewhere).
class ContextsCorpus : public Corpus {
  // Free function granted access to the protected state below.
  // NOTE(review): presumably the ContextsLexer callback invoked while reading
  // a contexts file -- confirm against the implementation file.
  friend void read_callback(const ContextsLexer::PhraseContextsType&, void*);

public:
    // Starts with a freshly allocated, empty TermBackoff index.
    ContextsCorpus() : m_backoff(new TermBackoff) {}
    virtual ~ContextsCorpus() {}

    // Reads contexts from the file named by filename (defined out of line).
    //   backoff_gen       optional generator; defaults to a null pointer.
    //   filter_singeltons defaults to false.
    //   binary_contexts   defaults to false.
    // NOTE(review): the meaning of the flags and of the unsigned return value
    // is not visible in this header -- see the definition.
    virtual unsigned read_contexts(const std::string &filename, 
                                   BackoffGenerator* backoff_gen=0,
                                   bool filter_singeltons=false,
                                   bool binary_contexts=false);

    // Returns the backoff index handle held by this corpus.
    TermBackoffPtr backoff_index() {
      return m_backoff;
    }

    // Expands a context id into its string components via the dictionary's
    // AsVector. id must be non-negative (asserted in debug builds).
    std::vector<std::string> context2string(const WordID& id) const {
      std::vector<std::string> res;
      assert (id >= 0);
      m_dict.AsVector(id, &res);
      return res;
    }

    // Returns the count stored for context id, or 0 when no count has been
    // recorded for it. The lookup result is checked against end() because
    // dereferencing the end iterator is undefined behaviour.
    virtual int context_count(const WordID& id) const {
      std::tr1::unordered_map<int,int>::const_iterator it =
          m_context_counts.find(id);
      if (it == m_context_counts.end())
        return 0;
      return it->second;
    }


    // Returns the i-th stored key. Access is bounds-checked: vector::at throws
    // std::out_of_range for an invalid index.
    const std::string& key(const int& i) const {
      return m_keys.at(i);
    }

    // Read-only access to the dictionary used by context2string.
    const Dict& dict() const { return m_dict; }

protected:
    TermBackoffPtr m_backoff;   // backoff index, allocated in the constructor
    Dict m_dict;                // dictionary queried by context2string
    std::vector<std::string> m_keys;  // keys addressed by index in key()
    std::tr1::unordered_map<int,int> m_context_counts;  // context id -> count
};

#endif // _CONTEXTS_CORPUS_HH