summary | refs | log | tree | commit | diff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2012-03-13 09:24:47 +0100
committer Patrick Simianer <p@simianer.de> 2012-03-13 09:24:47 +0100
commit ef6085e558e26c8819f1735425761103021b6470 (patch)
tree 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /gi/pf/corpus.cc
parent 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
parent dfbc278c1057555fda9312291c8024049e00b7d8 (diff)
merge with upstream
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- gi/pf/corpus.cc 15
1 files changed, 10 insertions, 5 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
istream* in = rf.stream();
assert(*in);
string line;
+ unsigned lc = 0;
const WordID kDIV = TD::Convert("|||");
vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
+ while(getline(*in, line)) {
+ ++lc;
e->push_back(vector<int>());
f->push_back(vector<int>());
vector<int>& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
for (unsigned i = 0; i < tmp.size(); ++i) {
const int cur = tmp[i];
if (isf) {
- if (kDIV == cur) { isf = false; } else {
+ if (kDIV == cur) {
+ isf = false;
+ } else {
lf.push_back(cur);
vocab_f->insert(cur);
}
} else {
- assert(cur != kDIV);
+ if (cur == kDIV) {
+ cerr << "ERROR in " << lc << ": " << line << endl << endl;
+ abort();
+ }
le.push_back(cur);
vocab_e->insert(cur);
}