diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
commit | ac586bc9b156b4ae687cd5961ba1fe7b20ec57d6 (patch) | |
tree | 052473b46d7fa18d51f897cdb9e7c93a7186dafd /gi/pf/corpus.cc | |
parent | 97b85c082b3e55c28a8b0c0eb762483ac84a1577 (diff) | |
parent | ad6d4a1b2519896f2b16a282699ce4e64041fab8 (diff) |
Merge remote branch 'upstream/master'
Conflicts:
Jamroot
bjam
decoder/Jamfile
decoder/cdec.cc
dpmert/Jamfile
jam-files/sanity.jam
klm/lm/Jamfile
klm/util/Jamfile
mira/Jamfile
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- | gi/pf/corpus.cc | 62 |
1 files changed, 0 insertions, 62 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc deleted file mode 100644 index cb6e4ed7..00000000 --- a/gi/pf/corpus.cc +++ /dev/null @@ -1,62 +0,0 @@ -#include "corpus.h" - -#include <set> -#include <vector> -#include <string> - -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -namespace corpus { - -void ReadParallelCorpus(const string& filename, - vector<vector<WordID> >* f, - vector<vector<WordID> >* e, - set<WordID>* vocab_f, - set<WordID>* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - ReadFile rf(filename); - istream* in = rf.stream(); - assert(*in); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector<WordID> tmp; - while(getline(*in, line)) { - ++lc; - e->push_back(vector<int>()); - f->push_back(vector<int>()); - vector<int>& le = e->back(); - vector<int>& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { - isf = false; - } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - if (cur == kDIV) { - cerr << "ERROR in " << lc << ": " << line << endl << endl; - abort(); - } - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } -} - -} - |