summaryrefslogtreecommitdiff
path: root/src/tdict.cc
blob: c00d20b87cc2c0119e3146707fdb485655a87d29 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include "Ngram.h"
#include "dict.h"
#include "tdict.h"
#include "Vocab.h"

using namespace std;

// Definition of the static vocabulary object that every TD function in this
// file goes through. It is heap-allocated during static initialization; no
// matching delete appears in this file.
Vocab* TD::dict_ = new Vocab;

// File-local string constants used by TD::GetString when joining words:
// `empty` is prepended to the first word, `space` to every later word.
static const string empty;
static const string space = " ";

// Maps a word string to its WordID by passing the string's NUL-terminated
// character data to dict_->addWord and returning whatever id that yields.
// NOTE(review): presumably addWord returns the existing id when the word is
// already in the vocabulary -- confirm against the Vocab implementation.
WordID TD::Convert(const std::string& s) {
  const VocabString word = (VocabString)s.c_str();
  return dict_->addWord(word);
}

const char* TD::Convert(const WordID& w) {
  return dict_->getWord((VocabIndex)w);
}

void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
  ids->clear();
  for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i)
    ids->push_back(TD::Convert(*i));
}

std::string TD::GetString(const std::vector<WordID>& str) {
  string res;
  for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i)
    res += (i == str.begin() ? empty : space) + TD::Convert(*i);
  return res;
}

void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
  string s = sent;
  int last = 0;
  ids->clear();
  for (int i=0; i < s.size(); ++i)
    if (s[i] == 32 || s[i] == '\t') {
      s[i]=0;
      if (last != i) {
        ids->push_back(Convert(&s[last]));
      }
      last = i + 1;
    }
  if (last != s.size())
    ids->push_back(Convert(&s[last]));
}