diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-10-12 14:57:15 +0100 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-10-12 14:57:15 +0100 |
commit | ee84ab027c0be54800cac0c9bff62dd097354f6d (patch) | |
tree | 30344f4307d3efbafb3276807a6879c4eeeab173 /gi/pf/corpus.cc | |
parent | a32cd0131c6325e364c82e5f6bbefc03b61e437f (diff) |
model lenght properly, clean up
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- | gi/pf/corpus.cc | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc new file mode 100644 index 00000000..a408e7cf --- /dev/null +++ b/gi/pf/corpus.cc @@ -0,0 +1,57 @@ +#include "corpus.h" + +#include <set> +#include <vector> +#include <string> + +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +namespace corpus { + +void ReadParallelCorpus(const string& filename, + vector<vector<WordID> >* f, + vector<vector<WordID> >* e, + set<WordID>* vocab_f, + set<WordID>* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector<int>()); + f->push_back(vector<int>()); + vector<int>& le = e->back(); + vector<int>& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } +} + +} + |