blob: 2527f655374c3d7951d65422486f0454cf1b4898 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
#ifndef _CONTEXTS_CORPUS_HH
#define _CONTEXTS_CORPUS_HH
#include <cassert>
#include <vector>
#include <string>
#include <map>
#include <tr1/unordered_map>
#include <boost/ptr_container/ptr_vector.hpp>
#include "corpus.hh"
#include "contexts_lexer.h"
#include "dict.h"
// Abstract functor that maps a context onto the context it backs off to.
// Concrete generators implement operator(); strip_edges() is a shared helper
// available to them.
class BackoffGenerator {
public:
  // Generators are handed around as BackoffGenerator* (see
  // ContextsCorpus::read_contexts), so the destructor must be virtual for
  // deletion through a base pointer to be well defined.
  virtual ~BackoffGenerator() {}

  // Returns the backed-off form of `c`. The concrete generators in this
  // header return an empty context when no further backoff exists.
  virtual ContextsLexer::Context
  operator()(const ContextsLexer::Context& c) = 0;

protected:
  // Returns a copy of `c` without its first and last elements.
  // A context with zero or one element yields an empty context.
  // Longer contexts are expected to have an odd number of elements; this is
  // checked with assert() only, so it is not enforced when NDEBUG is defined.
  ContextsLexer::Context strip_edges(const ContextsLexer::Context& c) const {
    if (c.size() <= 1) return ContextsLexer::Context();
    assert(c.size() % 2 == 1);
    return ContextsLexer::Context(c.begin() + 1, c.end() - 1);
  }
};
// Generator that never backs off: every context maps to the empty context.
class NullBackoffGenerator : public BackoffGenerator {
public:
  // Public, like the base-class declaration, so the generator can be invoked
  // directly as well as through a BackoffGenerator reference or pointer.
  // The argument is ignored; the result is always an empty context.
  virtual ContextsLexer::Context
  operator()(const ContextsLexer::Context&)
  { return ContextsLexer::Context(); }
};
// Generator that backs off by dropping the outermost element on each side of
// the context, stopping once the context has three or fewer elements.
class SimpleBackoffGenerator : public BackoffGenerator {
public:
  // Public, like the base-class declaration, so the generator can be invoked
  // directly as well as through a BackoffGenerator reference or pointer.
  // Returns an empty context when `c` has at most three elements; otherwise
  // returns `c` with its first and last elements removed.
  virtual ContextsLexer::Context
  operator()(const ContextsLexer::Context& c) {
    // Short contexts have nothing left to back off to.
    if (c.size() <= 3)
      return ContextsLexer::Context();
    return strip_edges(c);
  }
};
////////////////////////////////////////////////////////////////
// ContextsCorpus
////////////////////////////////////////////////////////////////
class ContextsCorpus : public Corpus {
friend void read_callback(const ContextsLexer::PhraseContextsType&, void*);
public:
ContextsCorpus() : m_backoff(new TermBackoff) {}
virtual ~ContextsCorpus() {}
virtual unsigned read_contexts(const std::string &filename,
BackoffGenerator* backoff_gen=0,
bool filter_singeltons=false,
bool binary_contexts=false);
TermBackoffPtr backoff_index() {
return m_backoff;
}
std::vector<std::string> context2string(const WordID& id) const {
std::vector<std::string> res;
assert (id >= 0);
m_dict.AsVector(id, &res);
return res;
}
virtual int context_count(const WordID& id) const {
return m_context_counts.find(id)->second;
}
const std::string& key(const int& i) const {
return m_keys.at(i);
}
const Dict& dict() const { return m_dict; }
protected:
TermBackoffPtr m_backoff;
Dict m_dict;
std::vector<std::string> m_keys;
std::tr1::unordered_map<int,int> m_context_counts;
};
#endif // _CONTEXTS_CORPUS_HH
|