summaryrefslogtreecommitdiff
path: root/gi/pf
diff options
context:
space:
mode:
authorChris Dyer <prguest11@taipan.cs>2011-12-30 19:23:32 +0000
committerChris Dyer <prguest11@taipan.cs>2011-12-30 19:23:32 +0000
commit173570597f77da8f0bdb9b5a42baa64675e93b17 (patch)
tree17f642452966a50c41752736b1c50017aedc3143 /gi/pf
parent031dc91814c1b57269b8a789c93aad0da0a46b6b (diff)
logging corpus errors
Diffstat (limited to 'gi/pf')
-rw-r--r--gi/pf/corpus.cc15
1 files changed, 10 insertions, 5 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
istream* in = rf.stream();
assert(*in);
string line;
+ unsigned lc = 0;
const WordID kDIV = TD::Convert("|||");
vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
+ while(getline(*in, line)) {
+ ++lc;
e->push_back(vector<int>());
f->push_back(vector<int>());
vector<int>& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
for (unsigned i = 0; i < tmp.size(); ++i) {
const int cur = tmp[i];
if (isf) {
- if (kDIV == cur) { isf = false; } else {
+ if (kDIV == cur) {
+ isf = false;
+ } else {
lf.push_back(cur);
vocab_f->insert(cur);
}
} else {
- assert(cur != kDIV);
+ if (cur == kDIV) {
+ cerr << "ERROR in " << lc << ": " << line << endl << endl;
+ abort();
+ }
le.push_back(cur);
vocab_e->insert(cur);
}