diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
commit | efe0d24fa7dbca47825638a52f51977456153bd0 (patch) | |
tree | 77c1d68ae29e423e1baaca6565a2455ec481955c /gi/pyp-topics/src/corpus.cc | |
parent | 42e1e2cb20c8f31d9a27bf0be5fe0846f3dde413 (diff) |
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/corpus.cc')
-rw-r--r-- | gi/pyp-topics/src/corpus.cc | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc new file mode 100644 index 00000000..93910ea3 --- /dev/null +++ b/gi/pyp-topics/src/corpus.cc @@ -0,0 +1,103 @@ +#include <sstream> +#include <iostream> +#include <set> + +#include "corpus.hh" +#include "gzstream.hh" + +using namespace std; + +////////////////////////////////////////////////// +// Corpus +////////////////////////////////////////////////// + +Corpus::Corpus() {} + +unsigned Corpus::read(const std::string &filename) { + m_num_terms = 0; + m_num_types = 0; + std::set<int> seen_types; + + igzstream in(filename.c_str()); + + string buf; + int token; + unsigned count=0; + while (getline(in, buf)) { + Document* doc(new Document()); + istringstream ss(buf); + + ss >> token; // the number of unique terms + + char delimeter; + int count; + while(ss >> token >> delimeter >> count) { + for (int i=0; i<count; ++i) + doc->push_back(token); + m_num_terms += count; + seen_types.insert(token); + } + + m_documents.push_back(doc); + count++; + } + + m_num_types = seen_types.size(); + + return count; +} + +////////////////////////////////////////////////// +// TestCorpus +////////////////////////////////////////////////// + +TestCorpus::TestCorpus() {} + +void TestCorpus::read(const std::string &filename) { + igzstream in(filename.c_str()); + + string buf; + Term term; + DocumentId doc; + char delimeter; + while (getline(in, buf)) { + DocumentTerms* line(new DocumentTerms()); + istringstream ss(buf); + + while(ss >> doc >> delimeter >> term) + line->push_back(DocumentTerm(doc, term)); + + m_lines.push_back(line); + } +} + +////////////////////////////////////////////////// +// TermBackoff +////////////////////////////////////////////////// + +void TermBackoff::read(const std::string &filename) { + igzstream in(filename.c_str()); + + string buf; + int num_terms; + getline(in, buf); + istringstream ss(buf); + ss >> num_terms >> m_backoff_order; + + m_dict.resize(num_terms, -1); + for (int i=0; i<m_backoff_order; ++i) { + int count; ss >> count; + m_terms_at_order.push_back(count); + } + + Term term, backoff; + while (getline(in, buf)) { + istringstream ss(buf); + ss >> term >> backoff; + + assert(term < num_terms); + assert(term >= 0); + + m_dict[term] = backoff; + } +} |