From 173570597f77da8f0bdb9b5a42baa64675e93b17 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 30 Dec 2011 19:23:32 +0000
Subject: logging corpus errors

---
 gi/pf/corpus.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
   istream* in = rf.stream();
   assert(*in);
   string line;
+  unsigned lc = 0;
   const WordID kDIV = TD::Convert("|||");
   vector tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
+  while(getline(*in, line)) {
+    ++lc;
     e->push_back(vector());
     f->push_back(vector());
     vector& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
     for (unsigned i = 0; i < tmp.size(); ++i) {
       const int cur = tmp[i];
       if (isf) {
-        if (kDIV == cur) { isf = false; } else {
+        if (kDIV == cur) {
+          isf = false;
+        } else {
           lf.push_back(cur);
           vocab_f->insert(cur);
         }
       } else {
-        assert(cur != kDIV);
+        if (cur == kDIV) {
+          cerr << "ERROR in " << lc << ": " << line << endl << endl;
+          abort();
+        }
         le.push_back(cur);
         vocab_e->insert(cur);
       }
-- 
cgit v1.2.3