summary refs log tree commit diff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
author Chris Dyer <prguest11@taipan.cs> 2011-12-30 19:23:32 +0000
committer Chris Dyer <prguest11@taipan.cs> 2011-12-30 19:23:32 +0000
commit 71da76d47d5a6f988b56b5641f7296249cb85124 (patch)
tree 09a89566138184c7d51b78182803eb1ae88e5e2f /gi/pf/corpus.cc
parent 5ea87bf5487f0bd9fef7385eb1812b0601b57a6e (diff)
logging corpus errors
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- gi/pf/corpus.cc 15
1 files changed, 10 insertions, 5 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
index a408e7cf..cb6e4ed7 100644
--- a/gi/pf/corpus.cc
+++ b/gi/pf/corpus.cc
@@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,
istream* in = rf.stream();
assert(*in);
string line;
+ unsigned lc = 0;
const WordID kDIV = TD::Convert("|||");
vector<WordID> tmp;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break;
+ while(getline(*in, line)) {
+ ++lc;
e->push_back(vector<int>());
f->push_back(vector<int>());
vector<int>& le = e->back();
@@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,
for (unsigned i = 0; i < tmp.size(); ++i) {
const int cur = tmp[i];
if (isf) {
- if (kDIV == cur) { isf = false; } else {
+ if (kDIV == cur) {
+ isf = false;
+ } else {
lf.push_back(cur);
vocab_f->insert(cur);
}
} else {
- assert(cur != kDIV);
+ if (cur == kDIV) {
+ cerr << "ERROR in " << lc << ": " << line << endl << endl;
+ abort();
+ }
le.push_back(cur);
vocab_e->insert(cur);
}