From ee84ab027c0be54800cac0c9bff62dd097354f6d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 12 Oct 2011 14:57:15 +0100 Subject: model lenght properly, clean up --- gi/pf/corpus.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 gi/pf/corpus.cc (limited to 'gi/pf/corpus.cc') diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc new file mode 100644 index 00000000..a408e7cf --- /dev/null +++ b/gi/pf/corpus.cc @@ -0,0 +1,57 @@ +#include "corpus.h" + +#include +#include +#include + +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +namespace corpus { + +void ReadParallelCorpus(const string& filename, + vector >* f, + vector >* e, + set* vocab_f, + set* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + f->push_back(vector()); + vector& le = e->back(); + vector& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } +} + +} + -- cgit v1.2.3