#include "corpus.h"

#include <cassert>
#include <set>
#include <string>
#include <vector>

#include "tdict.h"
#include "filelib.h"

using namespace std;

namespace corpus {

// Reads a parallel corpus from `filename`, one sentence pair per line in the
// form
//
//   <f tokens> ||| <e tokens>
//
// Each line is converted to word ids with TD::ConvertSentence. Ids that appear
// before the "|||" token are appended to a new sentence in `*f`, ids after it
// to a new sentence in `*e`, so `(*f)[i]` and `(*e)[i]` always come from the
// same input line. Every id seen on a side is also inserted into `*vocab_f` or
// `*vocab_e` respectively.
//
// All four output containers are cleared before reading.
//
// Input requirements (checked with assert only, so they are not enforced when
// NDEBUG is defined):
//   - the stream returned by ReadFile must be readable;
//   - every line, including a blank one, must contain exactly one "|||".
//
// NOTE(review): the element type WordID is inferred from how the containers
// are used with TD::Convert / TD::ConvertSentence; confirm that it matches the
// declaration in corpus.h.
void ReadParallelCorpus(const string& filename,
                        vector<vector<WordID> >* f,
                        vector<vector<WordID> >* e,
                        set<WordID>* vocab_f,
                        set<WordID>* vocab_e) {
  f->clear();
  e->clear();
  vocab_f->clear();
  vocab_e->clear();

  ReadFile rf(filename);
  istream* in = rf.stream();
  assert(*in);

  string line;
  const WordID kDIV = TD::Convert("|||");
  vector<WordID> tmp;  // Reused across lines to avoid reallocating.

  // Loop on the result of getline itself. Testing the stream state *before*
  // reading processes the last line twice when the file does not end in a
  // newline: the failed getline at EOF leaves `line` untouched, so the stale
  // contents would be parsed again.
  while (getline(*in, line)) {
    e->push_back(vector<WordID>());
    f->push_back(vector<WordID>());
    vector<WordID>& le = e->back();
    vector<WordID>& lf = f->back();

    tmp.clear();
    TD::ConvertSentence(line, &tmp);

    // True while we are still on the f side, i.e. "|||" has not been seen yet.
    bool isf = true;
    for (unsigned i = 0; i < tmp.size(); ++i) {
      const WordID cur = tmp[i];
      if (isf) {
        if (kDIV == cur) {
          isf = false;
        } else {
          lf.push_back(cur);
          vocab_f->insert(cur);
        }
      } else {
        // A second separator on the same line is malformed input.
        assert(cur != kDIV);
        le.push_back(cur);
        vocab_e->insert(cur);
      }
    }
    // Every line must have contained the separator.
    assert(isf == false);
  }
}

}  // namespace corpus