diff options
| author | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 | 
| commit | ef6085e558e26c8819f1735425761103021b6470 (patch) | |
| tree | 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /utils/corpus_tools.cc | |
| parent | 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff) | |
| parent | dfbc278c1057555fda9312291c8024049e00b7d8 (diff) | |
merge with upstream
Diffstat (limited to 'utils/corpus_tools.cc')
| -rw-r--r-- | utils/corpus_tools.cc | 66 | 
1 files changed, 66 insertions, 0 deletions
diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc new file mode 100644 index 00000000..d17785af --- /dev/null +++ b/utils/corpus_tools.cc @@ -0,0 +1,66 @@ +#include "corpus_tools.h" + +#include <iostream> + +#include "tdict.h" +#include "filelib.h" +#include "verbose.h" + +using namespace std; + +void CorpusTools::ReadFromFile(const string& filename, +                           vector<vector<WordID> >* src, +                           set<WordID>* src_vocab, +                           vector<vector<WordID> >* trg, +                           set<WordID>* trg_vocab, +                           int rank, +                           int size) { +  assert(rank >= 0); +  assert(size > 0); +  assert(rank < size); +  if (src) src->clear(); +  if (src_vocab) src_vocab->clear(); +  if (trg) trg->clear(); +  if (trg_vocab) trg_vocab->clear(); +  const int expected_fields = 1 + (trg == NULL ? 0 : 1); +  if (!SILENT) cerr << "Reading from " << filename << " ...\n"; +  ReadFile rf(filename); +  istream& in = *rf.stream(); +  string line; +  int lc = 0; +  static const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(getline(in, line)) { +    const bool skip = (lc % size != rank); +    ++lc; +    TD::ConvertSentence(line, &tmp); +    vector<WordID>* d = NULL; +    if (!skip) { +      src->push_back(vector<WordID>()); +      d = &src->back(); +    } +    set<WordID>* v = src_vocab; +    int s = 0; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      if (tmp[i] == kDIV) { +        ++s; +        if (s > 1) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); } +        assert(trg); +        if (!skip) { +          trg->push_back(vector<WordID>()); +          d = &trg->back(); +        } +        v = trg_vocab; +      } else { +        if (d) d->push_back(tmp[i]); +        if (v) v->insert(tmp[i]); +      } +    } +    ++s; +    if (expected_fields != s) { +      cerr << "Wrong number of fields in line " << lc << ": " << line << endl; abort(); +    } +  } +} + +  | 
