summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/src/contexts_corpus.hh
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pyp-topics/src/contexts_corpus.hh')
-rw-r--r-- gi/pyp-topics/src/contexts_corpus.hh 35
1 files changed, 33 insertions, 2 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index e680cef5..bd0cd34c 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -11,6 +11,36 @@
#include "contexts_lexer.h"
#include "../../../decoder/dict.h"
+
+// Abstract functor that maps a context to the context it backs off to.
+// Concrete generators override operator(); the generators defined in this
+// file return an empty Context when there is nothing to back off to.
+class BackoffGenerator {
+public:
+  // Virtual so that a derived generator destroyed through a
+  // BackoffGenerator* runs the correct destructor.
+  virtual ~BackoffGenerator() {}
+
+  // Returns the backoff context for c.
+  virtual ContextsLexer::Context
+  operator()(const ContextsLexer::Context& c) = 0;
+
+protected:
+  // Helper for derived classes: returns a copy of c without its first and
+  // last elements. Contexts of size <= 1 yield an empty Context; anything
+  // longer is expected to have odd length, which is checked with assert.
+  ContextsLexer::Context strip_edges(const ContextsLexer::Context& c) {
+    if (c.size() <= 1) return ContextsLexer::Context();
+    assert(c.size() % 2 == 1);
+    return ContextsLexer::Context(c.begin() + 1, c.end() - 1);
+  }
+};
+
+// Generator that never backs off: every context maps to an empty Context.
+class NullBackoffGenerator : public BackoffGenerator {
+public:
+  // Ignores its argument and returns a default-constructed Context.
+  // Declared public (as in the base class) so the functor can also be
+  // invoked directly on a NullBackoffGenerator object.
+  virtual ContextsLexer::Context
+  operator()(const ContextsLexer::Context&)
+  { return ContextsLexer::Context(); }
+};
+
+// Generator that backs off by dropping the outermost element on each side
+// of the context.
+class SimpleBackoffGenerator : public BackoffGenerator {
+public:
+  // Contexts with 3 or fewer elements back off to an empty Context;
+  // longer ones lose their first and last element via strip_edges().
+  // Declared public (as in the base class) so the functor can also be
+  // invoked directly on a SimpleBackoffGenerator object.
+  virtual ContextsLexer::Context
+  operator()(const ContextsLexer::Context& c) {
+    if (c.size() <= 3)
+      return ContextsLexer::Context();
+    return strip_edges(c);
+  }
+};
+
+
////////////////////////////////////////////////////////////////
// ContextsCorpus
////////////////////////////////////////////////////////////////
@@ -22,10 +52,11 @@ public:
typedef boost::ptr_vector<Document>::const_iterator const_iterator;
public:
- ContextsCorpus() {}
+ ContextsCorpus() : m_backoff(new TermBackoff) {}
virtual ~ContextsCorpus() {}
- unsigned read_contexts(const std::string &filename);
+ unsigned read_contexts(const std::string &filename,
+ BackoffGenerator* backoff_gen=0);
TermBackoffPtr backoff_index() {
return m_backoff;