summary refs log tree commit diff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2012-10-11 14:06:32 -0400
committer Chris Dyer <cdyer@cs.cmu.edu> 2012-10-11 14:06:32 -0400
commit 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch)
tree 644496a1690d84d82a396bbc1e39160788beb2cd /gi/pf/corpus.cc
parent 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff)
parent a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff)
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r-- gi/pf/corpus.cc 62
1 files changed, 0 insertions, 62 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
deleted file mode 100644
index cb6e4ed7..00000000
--- a/gi/pf/corpus.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "corpus.h"
-
-#include <set>
-#include <vector>
-#include <string>
-
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-namespace corpus {
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_f,
- set<WordID>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- ReadFile rf(filename);
- istream* in = rf.stream();
- assert(*in);
- string line;
- unsigned lc = 0;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(getline(*in, line)) {
- ++lc;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) {
- isf = false;
- } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- if (cur == kDIV) {
- cerr << "ERROR in " << lc << ": " << line << endl << endl;
- abort();
- }
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
-}
-
-}
-