diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
commit | 5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch) | |
tree | 9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/pf/corpus.cc | |
parent | cf9994131993b40be62e90e213b1e11e6b550143 (diff) | |
parent | 21825a09d97c2e0afd20512f306fb25fed55e529 (diff) |
Merge remote branch 'upstream/master'
Conflicts:
Jamroot
bjam
decoder/Jamfile
decoder/cdec.cc
dpmert/Jamfile
jam-files/sanity.jam
klm/lm/Jamfile
klm/util/Jamfile
mira/Jamfile
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- | gi/pf/corpus.cc | 62 |
1 files changed, 0 insertions, 62 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc deleted file mode 100644 index cb6e4ed7..00000000 --- a/gi/pf/corpus.cc +++ /dev/null @@ -1,62 +0,0 @@ -#include "corpus.h" - -#include <set> -#include <vector> -#include <string> - -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -namespace corpus { - -void ReadParallelCorpus(const string& filename, - vector<vector<WordID> >* f, - vector<vector<WordID> >* e, - set<WordID>* vocab_f, - set<WordID>* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - ReadFile rf(filename); - istream* in = rf.stream(); - assert(*in); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector<WordID> tmp; - while(getline(*in, line)) { - ++lc; - e->push_back(vector<int>()); - f->push_back(vector<int>()); - vector<int>& le = e->back(); - vector<int>& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { - isf = false; - } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - if (cur == kDIV) { - cerr << "ERROR in " << lc << ": " << line << endl << endl; - abort(); - } - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } -} - -} - |