diff options
Diffstat (limited to 'gi/pyp-topics/src/contexts_corpus.hh')
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 35 |
1 files changed, 33 insertions, 2 deletions
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index e680cef5..bd0cd34c 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -11,6 +11,36 @@ #include "contexts_lexer.h" #include "../../../decoder/dict.h" + +class BackoffGenerator { +public: + virtual ContextsLexer::Context + operator()(const ContextsLexer::Context& c) = 0; + +protected: + ContextsLexer::Context strip_edges(const ContextsLexer::Context& c) { + if (c.size() <= 1) return ContextsLexer::Context(); + assert(c.size() % 2 == 1); + return ContextsLexer::Context(c.begin() + 1, c.end() - 1); + } +}; + +class NullBackoffGenerator : public BackoffGenerator { + virtual ContextsLexer::Context + operator()(const ContextsLexer::Context&) + { return ContextsLexer::Context(); } +}; + +class SimpleBackoffGenerator : public BackoffGenerator { + virtual ContextsLexer::Context + operator()(const ContextsLexer::Context& c) { + if (c.size() <= 3) + return ContextsLexer::Context(); + return strip_edges(c); + } +}; + + //////////////////////////////////////////////////////////////// // ContextsCorpus //////////////////////////////////////////////////////////////// @@ -22,10 +52,11 @@ public: typedef boost::ptr_vector<Document>::const_iterator const_iterator; public: - ContextsCorpus() {} + ContextsCorpus() : m_backoff(new TermBackoff) {} virtual ~ContextsCorpus() {} - unsigned read_contexts(const std::string &filename); + unsigned read_contexts(const std::string &filename, + BackoffGenerator* backoff_gen=0); TermBackoffPtr backoff_index() { return m_backoff; |