summary refs log tree commit diff
path: root/utils/corpus_tools.cc
blob: a0542b6eb6a7779d4916c8e6bcc157f31a84a788 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include "corpus_tools.h"

#include <cassert>
#include <cstdlib>
#include <iostream>

#include "tdict.h"
#include "filelib.h"
#include "verbose.h"

using namespace std;

// Reads a (possibly parallel) corpus from `filename`, one sentence per line.
//
// Each kept line is converted to word IDs with TD::ConvertSentence. When a
// line contains the "|||" token, the IDs before it form the source sentence
// and the IDs after it form the target sentence.
//
// Parameters:
//   filename   - path handed to ReadFile.
//   src        - if non-NULL, cleared and then receives one source sentence
//                per kept line.
//   src_vocab  - if non-NULL, cleared and then receives every source word ID.
//   trg        - if non-NULL, the input must be parallel: every kept line
//                must contain exactly one "|||", and `trg` receives one
//                target sentence per kept line. If NULL, every kept line must
//                contain no "|||".
//   trg_vocab  - if non-NULL, cleared and then receives every target word ID.
//   rank, size - only lines whose 0-based index i satisfies i % size == rank
//                are kept; all other lines are skipped without being parsed.
//                Requires 0 <= rank < size (checked with assert).
//
// Any line with an unexpected number of fields prints a diagnostic to stderr
// (using the 1-based line number) and calls abort().
void CorpusTools::ReadFromFile(const string& filename,
                           vector<vector<WordID> >* src,
                           set<WordID>* src_vocab,
                           vector<vector<WordID> >* trg,
                           set<WordID>* trg_vocab,
                           int rank,
                           int size) {
  assert(rank >= 0);
  assert(size > 0);
  assert(rank < size);
  if (src) src->clear();
  if (src_vocab) src_vocab->clear();
  if (trg) trg->clear();
  if (trg_vocab) trg_vocab->clear();
  // One field (source only) unless the caller asked for target sentences.
  const int expected_fields = 1 + (trg == NULL ? 0 : 1);
  if (!SILENT) cerr << "Reading from " << filename << " ...\n";
  ReadFile rf(filename);
  istream& in = *rf.stream();
  string line;
  int lc = 0;
  // Word ID of the field separator; looked up once and reused on later calls.
  static const WordID kDIV = TD::Convert("|||");
  vector<WordID> tmp;
  while (getline(in, line)) {
    // Decide on the 0-based index, then bump lc so that it is the 1-based
    // line number in any diagnostic printed below.
    const bool skip = (lc % size != rank);
    ++lc;
    if (skip) continue;
    TD::ConvertSentence(line, &tmp);
    // `d` is the sentence currently being filled and `v` the matching
    // vocabulary; either may be NULL when the caller did not request it.
    // `src` is optional (see the clear() guard above), so it must not be
    // dereferenced unconditionally here.
    vector<WordID>* d = NULL;
    if (src) {
      src->push_back(vector<WordID>());
      d = &src->back();
    }
    set<WordID>* v = src_vocab;
    int s = 0;  // number of "|||" separators seen so far on this line
    for (unsigned i = 0; i < tmp.size(); ++i) {
      if (tmp[i] == kDIV) {
        ++s;
        // A second separator, or any separator when no target container was
        // supplied, is malformed input. This is checked explicitly rather
        // than with assert() so that builds with NDEBUG abort with a
        // diagnostic instead of dereferencing a NULL `trg`.
        if (s > 1 || trg == NULL) {
          cerr << "Unexpected format in line " << lc << ": " << line << endl;
          abort();
        }
        trg->push_back(vector<WordID>());
        d = &trg->back();
        v = trg_vocab;
      } else {
        if (d) d->push_back(tmp[i]);
        if (v) v->insert(tmp[i]);
      }
    }
    ++s;  // separators + 1 == number of fields on the line
    if (expected_fields != s) {
      cerr << "Wrong number of fields in line " << lc << ": " << line << endl; abort();
    }
  }
}