summaryrefslogtreecommitdiff
path: root/gi/pf/corpus.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2011-10-12 14:57:15 +0100
committerChris Dyer <cdyer@cs.cmu.edu>2011-10-12 14:57:15 +0100
commitee84ab027c0be54800cac0c9bff62dd097354f6d (patch)
tree30344f4307d3efbafb3276807a6879c4eeeab173 /gi/pf/corpus.cc
parenta32cd0131c6325e364c82e5f6bbefc03b61e437f (diff)
model lenght properly, clean up
Diffstat (limited to 'gi/pf/corpus.cc')
-rw-r--r--gi/pf/corpus.cc57
1 files changed, 57 insertions, 0 deletions
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
new file mode 100644
index 00000000..a408e7cf
--- /dev/null
+++ b/gi/pf/corpus.cc
@@ -0,0 +1,57 @@
+#include "corpus.h"
+
+#include <set>
+#include <vector>
+#include <string>
+
+#include "tdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+namespace corpus {
+
+void ReadParallelCorpus(const string& filename,
+ vector<vector<WordID> >* f,
+ vector<vector<WordID> >* e,
+ set<WordID>* vocab_f,
+ set<WordID>* vocab_e) {
+ f->clear();
+ e->clear();
+ vocab_f->clear();
+ vocab_e->clear();
+ ReadFile rf(filename);
+ istream* in = rf.stream();
+ assert(*in);
+ string line;
+ const WordID kDIV = TD::Convert("|||");
+ vector<WordID> tmp;
+ while(*in) {
+ getline(*in, line);
+ if (line.empty() && !*in) break;
+ e->push_back(vector<int>());
+ f->push_back(vector<int>());
+ vector<int>& le = e->back();
+ vector<int>& lf = f->back();
+ tmp.clear();
+ TD::ConvertSentence(line, &tmp);
+ bool isf = true;
+ for (unsigned i = 0; i < tmp.size(); ++i) {
+ const int cur = tmp[i];
+ if (isf) {
+ if (kDIV == cur) { isf = false; } else {
+ lf.push_back(cur);
+ vocab_f->insert(cur);
+ }
+ } else {
+ assert(cur != kDIV);
+ le.push_back(cur);
+ vocab_e->insert(cur);
+ }
+ }
+ assert(isf == false);
+ }
+}
+
+}
+