diff options
author | Patrick Simianer <p@simianer.de> | 2011-10-20 02:31:25 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-10-20 02:31:25 +0200 |
commit | a5a92ebe23c5819ed104313426012011e32539da (patch) | |
tree | 3416818c758d5ece4e71fe522c571e75ea04f100 /gi/pf/corpus.cc | |
parent | b88332caac2cbe737c99b8098813f868ca876d8b (diff) | |
parent | 78baccbb4231bb84a456702d4f574f8e601a8182 (diff) |
finalized merge
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- | gi/pf/corpus.cc | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc new file mode 100644 index 00000000..a408e7cf --- /dev/null +++ b/gi/pf/corpus.cc @@ -0,0 +1,57 @@ +#include "corpus.h" + +#include <set> +#include <vector> +#include <string> + +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +namespace corpus { + +void ReadParallelCorpus(const string& filename, + vector<vector<WordID> >* f, + vector<vector<WordID> >* e, + set<WordID>* vocab_f, + set<WordID>* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector<int>()); + f->push_back(vector<int>()); + vector<int>& le = e->back(); + vector<int>& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } +} + +} + |