summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/src/corpus.hh
diff options
context:
space:
mode:
author philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 20:34:00 +0000
committer philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 20:34:00 +0000
commit 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch)
tree 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/src/corpus.hh
parent 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff)
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/corpus.hh')
-rw-r--r-- gi/pyp-topics/src/corpus.hh 101
1 files changed, 101 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
new file mode 100644
index 00000000..3dd17cf9
--- /dev/null
+++ b/gi/pyp-topics/src/corpus.hh
@@ -0,0 +1,101 @@
+#ifndef _CORPUS_HH
+#define _CORPUS_HH
+
+#include <cassert>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+////////////////////////////////////////////////////////////////
+// Corpus
+////////////////////////////////////////////////////////////////
+typedef int Term;  // integer handle for a term; TermBackoff::is_null() treats negative values as null
+
+typedef std::vector<Term> Document;  // a document is stored as a sequence of Terms
+typedef std::vector<Term> Terms;     // general-purpose Term sequence; same underlying type as Document
+
+// Container of Documents together with two counters (terms and types).
+//
+// Documents are owned by a boost::ptr_vector and are only exposed through
+// const iterators. The constructor and read() are defined out of line, so
+// how the counters are initialised and updated is not visible in this header.
+class Corpus {
+public:
+  typedef boost::ptr_vector<Document>::const_iterator const_iterator;
+
+  Corpus();
+  ~Corpus() {}
+
+  // Reads corpus data from `filename` (defined elsewhere).
+  // NOTE(review): the meaning of the unsigned return value is not visible
+  // here -- confirm against the implementation.
+  unsigned read(const std::string &filename);
+
+  // Counters. num_documents() narrows the container's size to int.
+  int num_documents() const { return static_cast<int>(m_documents.size()); }
+  int num_terms() const { return m_num_terms; }
+  int num_types() const { return m_num_types; }
+
+  // Read-only iteration over the stored documents.
+  const_iterator begin() const { return m_documents.begin(); }
+  const_iterator end() const { return m_documents.end(); }
+
+protected:
+  int m_num_terms;
+  int m_num_types;
+  boost::ptr_vector<Document> m_documents;
+};
+
+typedef int DocumentId;
+
+// Plain pair of a Term and a DocumentId.
+// Note the constructor takes (document, term) while the fields are declared
+// term first; the initialiser list follows the declaration order.
+struct DocumentTerm {
+  Term term;
+  DocumentId doc;
+
+  DocumentTerm(DocumentId document_id, Term term_id)
+    : term(term_id), doc(document_id) {}
+};
+
+typedef std::vector<DocumentTerm> DocumentTerms;
+
+// Container of DocumentTerms entries ("lines"), owned by a boost::ptr_vector
+// and exposed read-only. The constructor and read() are defined out of line.
+class TestCorpus {
+public:
+  typedef boost::ptr_vector<DocumentTerms>::const_iterator const_iterator;
+
+  TestCorpus();
+  ~TestCorpus() {}
+
+  // Reads test data from `filename` (defined elsewhere).
+  void read(const std::string &filename);
+
+  // Number of stored DocumentTerms entries, narrowed to int.
+  int num_instances() const { return static_cast<int>(m_lines.size()); }
+
+  // Read-only iteration over the stored entries.
+  const_iterator begin() const { return m_lines.begin(); }
+  const_iterator end() const { return m_lines.end(); }
+
+protected:
+  boost::ptr_vector<DocumentTerms> m_lines;
+};
+
+// Table mapping a Term to another Term (m_dict, indexed by the source term),
+// plus a backoff order and a per-level count vector.
+//
+// read() is defined out of line; presumably it is what populates m_dict,
+// m_backoff_order and m_terms_at_order -- confirm against the implementation.
+// Until then order() reports -1 and the table is empty.
+class TermBackoff {
+public:
+  typedef std::vector<Term> dictionary_type;
+  typedef dictionary_type::const_iterator const_iterator;
+
+public:
+  TermBackoff() : m_backoff_order(-1) {}
+  ~TermBackoff() {}
+
+  // Reads the backoff table from `filename` (defined elsewhere).
+  void read(const std::string &filename);
+
+  // Read-only iteration over the dictionary entries.
+  const_iterator begin() const { return m_dict.begin(); }
+  const_iterator end() const { return m_dict.end(); }
+
+  // Returns the dictionary entry stored at index `t`.
+  // `t` must be a valid index: negative (null) terms are rejected as well as
+  // indices past the end, since a negative value would otherwise be converted
+  // to a huge unsigned index and read out of bounds.
+  const Term& operator[](const Term& t) const {
+    assert(t >= 0);
+    assert(t < static_cast<int>(m_dict.size()));
+    return m_dict[static_cast<dictionary_type::size_type>(t)];
+  }
+
+  // Backoff order; -1 as set by the default constructor.
+  int order() const { return m_backoff_order; }
+// int levels() const { return m_terms_at_order.size(); }
+
+  // A term is considered null when it is negative.
+  bool is_null(const Term& term) const { return term < 0; }
+
+  // Returns the count recorded for `level`; `level` must be within
+  // [0, m_terms_at_order.size()).
+  int terms_at_level(int level) const {
+    assert(level >= 0);
+    assert(level < static_cast<int>(m_terms_at_order.size()));
+    return m_terms_at_order[static_cast<std::vector<int>::size_type>(level)];
+  }
+
+  // Number of dictionary entries, narrowed to int.
+  int size() const { return static_cast<int>(m_dict.size()); }
+
+protected:
+  dictionary_type m_dict;
+  int m_backoff_order;
+  std::vector<int> m_terms_at_order;
+};
+#endif // _CORPUS_HH