diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
commit | 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch) | |
tree | 644496a1690d84d82a396bbc1e39160788beb2cd /gi/pf/corpus.cc | |
parent | 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff) | |
parent | a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff) |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- | gi/pf/corpus.cc | 62 |
1 files changed, 0 insertions, 62 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc deleted file mode 100644 index cb6e4ed7..00000000 --- a/gi/pf/corpus.cc +++ /dev/null @@ -1,62 +0,0 @@ -#include "corpus.h" - -#include <set> -#include <vector> -#include <string> - -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -namespace corpus { - -void ReadParallelCorpus(const string& filename, - vector<vector<WordID> >* f, - vector<vector<WordID> >* e, - set<WordID>* vocab_f, - set<WordID>* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - ReadFile rf(filename); - istream* in = rf.stream(); - assert(*in); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector<WordID> tmp; - while(getline(*in, line)) { - ++lc; - e->push_back(vector<int>()); - f->push_back(vector<int>()); - vector<int>& le = e->back(); - vector<int>& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { - isf = false; - } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - if (cur == kDIV) { - cerr << "ERROR in " << lc << ": " << line << endl << endl; - abort(); - } - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } -} - -} - |