summaryrefslogtreecommitdiff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2011-10-20 02:31:25 +0200
committerPatrick Simianer <p@simianer.de>2011-10-20 02:31:25 +0200
commita5a92ebe23c5819ed104313426012011e32539da (patch)
tree3416818c758d5ece4e71fe522c571e75ea04f100 /gi/pf/corpus.cc
parentb88332caac2cbe737c99b8098813f868ca876d8b (diff)
parent78baccbb4231bb84a456702d4f574f8e601a8182 (diff)
finalized merge
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r--gi/pf/corpus.cc57
1 files changed, 57 insertions, 0 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
new file mode 100644
index 00000000..a408e7cf
--- /dev/null
+++ b/gi/pf/corpus.cc
@@ -0,0 +1,57 @@
+#include "corpus.h"
+
+#include <set>
+#include <vector>
+#include <string>
+
+#include "tdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+namespace corpus {
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ ReadFile rf(filename);
+ istream* in = rf.stream();
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+}
+
+}
+