summary refs log tree commit diff
path: root/utils/corpus_tools.cc
diff options
context:
space:
mode:
Diffstat (limited to 'utils/corpus_tools.cc')
-rw-r--r-- utils/corpus_tools.cc 20
1 files changed, 20 insertions, 0 deletions
diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc
index d17785af..191153a2 100644
--- a/utils/corpus_tools.cc
+++ b/utils/corpus_tools.cc
@@ -8,6 +8,26 @@
using namespace std;
+// Splits the IDs of `line` at the first "|||": earlier ones go to *src, later ones to *trg.
+void CorpusTools::ReadLine(const string& line, vector<WordID>* src,
+                           vector<WordID>* trg) {
+  static const WordID kDIV = TD::Convert("|||");  // initialized once, on first call
+  static vector<WordID> tmp;  // function-local static: one buffer reused by every call
+  src->clear();
+  trg->clear();
+  TD::ConvertSentence(line, &tmp);  // NOTE(review): assumed to overwrite tmp -- confirm
+  vector<WordID>* dest = src;  // output currently being filled
+  bool past_div = false;
+  for (unsigned k = 0; k < tmp.size(); ++k) {
+    if (!past_div && tmp[k] == kDIV) {
+      past_div = true;  // only the first divider splits; later ones are kept in *trg
+      dest = trg;
+      continue;  // the divider itself is not stored
+    }
+    dest->push_back(tmp[k]);
+  }
+}
+
void CorpusTools::ReadFromFile(const string& filename,
vector<vector<WordID> >* src,
set<WordID>* src_vocab,