summaryrefslogtreecommitdiff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2011-10-19 14:02:34 +0200
committerPatrick Simianer <p@simianer.de>2011-10-19 14:02:34 +0200
commiteb14e36d0b29f19321d44dd7dfa73cc703838d86 (patch)
tree1285e9e56959bc3a4b506e36bbc3b49f4e938fa0 /gi/pf/corpus.cc
parent68f158b11df9f4072699fe6a4c8022ea54102b28 (diff)
parent04e38a57b19ea012895ac2efb39382c2e77833a9 (diff)
merge upstream/master
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r--gi/pf/corpus.cc57
1 files changed, 57 insertions, 0 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
new file mode 100644
index 00000000..a408e7cf
--- /dev/null
+++ b/gi/pf/corpus.cc
@@ -0,0 +1,57 @@
+#include "corpus.h"
+
+#include <set>
+#include <vector>
+#include <string>
+
+#include "tdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+namespace corpus {
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ ReadFile rf(filename);
+ istream* in = rf.stream();
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+}
+
+}
+