summaryrefslogtreecommitdiff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
committerChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
commite26434979adc33bd949566ba7bf02dff64e80a3e (patch)
treed1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pf/corpus.cc
parent0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r--gi/pf/corpus.cc62
1 files changed, 0 insertions, 62 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
deleted file mode 100644
index cb6e4ed7..00000000
--- a/gi/pf/corpus.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "corpus.h"
-
-#include <set>
-#include <vector>
-#include <string>
-
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-namespace corpus {
-
-void ReadParallelCorpus(const string& filename,
- vector<vector<WordID> >* f,
- vector<vector<WordID> >* e,
- set<WordID>* vocab_f,
- set<WordID>* vocab_e) {
- f->clear();
- e->clear();
- vocab_f->clear();
- vocab_e->clear();
- ReadFile rf(filename);
- istream* in = rf.stream();
- assert(*in);
- string line;
- unsigned lc = 0;
- const WordID kDIV = TD::Convert("|||");
- vector<WordID> tmp;
- while(getline(*in, line)) {
- ++lc;
- e->push_back(vector<int>());
- f->push_back(vector<int>());
- vector<int>& le = e->back();
- vector<int>& lf = f->back();
- tmp.clear();
- TD::ConvertSentence(line, &tmp);
- bool isf = true;
- for (unsigned i = 0; i < tmp.size(); ++i) {
- const int cur = tmp[i];
- if (isf) {
- if (kDIV == cur) {
- isf = false;
- } else {
- lf.push_back(cur);
- vocab_f->insert(cur);
- }
- } else {
- if (cur == kDIV) {
- cerr << "ERROR in " << lc << ": " << line << endl << endl;
- abort();
- }
- le.push_back(cur);
- vocab_e->insert(cur);
- }
- }
- assert(isf == false);
- }
-}
-
-}
-