diff options
author | Chris Dyer <prguest11@taipan.cs> | 2011-12-30 19:23:32 +0000 |
---|---|---|
committer | Chris Dyer <prguest11@taipan.cs> | 2011-12-30 19:23:32 +0000 |
commit | 173570597f77da8f0bdb9b5a42baa64675e93b17 (patch) | |
tree | 17f642452966a50c41752736b1c50017aedc3143 | |
parent | 031dc91814c1b57269b8a789c93aad0da0a46b6b (diff) |
logging corpus errors
-rw-r--r-- | gi/pf/corpus.cc | 15 |
1 files changed, 10 insertions, 5 deletions
```diff
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
   istream* in = rf.stream();
   assert(*in);
   string line;
+  unsigned lc = 0;
   const WordID kDIV = TD::Convert("|||");
   vector<WordID> tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
+  while(getline(*in, line)) {
+    ++lc;
     e->push_back(vector<int>());
     f->push_back(vector<int>());
     vector<int>& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
     for (unsigned i = 0; i < tmp.size(); ++i) {
       const int cur = tmp[i];
       if (isf) {
-        if (kDIV == cur) { isf = false; } else {
+        if (kDIV == cur) {
+          isf = false;
+        } else {
           lf.push_back(cur);
           vocab_f->insert(cur);
         }
       } else {
-        assert(cur != kDIV);
+        if (cur == kDIV) {
+          cerr << "ERROR in " << lc << ": " << line << endl << endl;
+          abort();
+        }
         le.push_back(cur);
         vocab_e->insert(cur);
       }
```