#include "corpus_tools.h"

#include <cassert>
#include <cstdlib>
#include <iostream>

#include "tdict.h"
#include "filelib.h"
#include "verbose.h"

using namespace std;

// Reads a corpus from `filename`, one sentence (or sentence pair) per line.
//
// Each line is tokenized with TD::ConvertSentence. The token "|||" separates
// the source field from the target field. When `trg` is NULL every line must
// consist of exactly one field; otherwise every line must consist of exactly
// two fields. Any line that violates this prints a diagnostic to stderr and
// aborts the process.
//
// Parameters:
//   filename   - path handed to ReadFile.
//   src        - if non-NULL, cleared and then filled with the source-side
//                token ids of every stored line.
//   src_vocab  - if non-NULL, cleared and then filled with every source-side
//                token id seen in the file.
//   trg        - if non-NULL, cleared and then filled with the target-side
//                token ids of every stored line; also switches the expected
//                number of fields per line from 1 to 2.
//   trg_vocab  - if non-NULL, cleared and then filled with every target-side
//                token id seen in the file.
//   rank, size - sharding: only lines whose zero-based index i satisfies
//                i % size == rank are stored in `src` / `trg`. Requires
//                0 <= rank < size.
//
// Note that the vocabularies are collected from ALL lines, including the ones
// that are skipped for this rank; only the sentence containers are sharded.
void CorpusTools::ReadFromFile(const string& filename,
                               vector<vector<WordID> >* src,
                               set<WordID>* src_vocab,
                               vector<vector<WordID> >* trg,
                               set<WordID>* trg_vocab,
                               int rank,
                               int size) {
  assert(rank >= 0);
  assert(size > 0);
  assert(rank < size);
  if (src) src->clear();
  if (src_vocab) src_vocab->clear();
  if (trg) trg->clear();
  if (trg_vocab) trg_vocab->clear();
  const int expected_fields = 1 + (trg == NULL ? 0 : 1);
  if (!SILENT) cerr << "Reading from " << filename << " ...\n";
  ReadFile rf(filename);
  istream& in = *rf.stream();
  string line;
  int lc = 0;  // becomes the 1-based line number once incremented below
  static const WordID kDIV = TD::Convert("|||");
  vector<WordID> tmp;
  while (getline(in, line)) {
    // Decide on the zero-based index, then bump lc so that diagnostics
    // report 1-based line numbers.
    const bool skip = (lc % size != rank);
    ++lc;
    TD::ConvertSentence(line, &tmp);

    // d: sentence currently being filled (NULL when this line is not stored).
    // v: vocabulary currently being filled (NULL when not requested).
    vector<WordID>* d = NULL;
    // src is optional (see the clear() guard above), so never dereference it
    // unconditionally; callers may only be interested in the vocabularies.
    if (!skip && src) {
      src->push_back(vector<WordID>());
      d = &src->back();
    }
    set<WordID>* v = src_vocab;

    int s = 0;  // number of field dividers seen so far on this line
    for (unsigned i = 0; i < tmp.size(); ++i) {
      if (tmp[i] == kDIV) {
        ++s;
        // A second divider, or any divider when no target side was
        // requested, is a format error. This is checked at run time rather
        // than with assert() so that NDEBUG builds cannot dereference a NULL
        // trg below.
        if (s > 1 || trg == NULL) {
          cerr << "Unexpected format in line " << lc << ": " << line << endl;
          abort();
        }
        if (!skip) {
          trg->push_back(vector<WordID>());
          d = &trg->back();
        }
        v = trg_vocab;
      } else {
        if (d) d->push_back(tmp[i]);
        if (v) v->insert(tmp[i]);
      }
    }
    ++s;  // dividers + 1 == number of fields
    if (expected_fields != s) {
      cerr << "Wrong number of fields in line " << lc << ": " << line << endl;
      abort();
    }
  }
}