blob: ac590bd8f709a0779fce57ae9a0f0abaa49f4b22 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
#include "Ngram.h"
#include "dict.h"
#include "tdict.h"
#include "Vocab.h"
using namespace std;
// Definition of the static member TD::dict_, the single Vocab instance that
// every TD function in this file goes through. It is allocated with `new`
// during static initialization; no matching `delete` appears in this file.
Vocab* TD::dict_ = new Vocab;
// File-local (internal linkage) default-constructed, i.e. empty, string.
// NOTE(review): because of the file-scope `using namespace std;`, the
// unqualified name `empty` can clash with std::empty when this file is
// compiled as C++17 or later.
static const string empty;
// File-local (internal linkage) string holding a single space character.
static const string space = " ";
// Returns the word count reported by the shared vocabulary
// (Vocab::numWords() on TD::dict_).
unsigned int TD::NumWords() {
  const unsigned int vocab_size = TD::dict_->numWords();
  return vocab_size;
}
// Converts a word string to its WordID by handing the string's C
// representation to Vocab::addWord() on the shared vocabulary.
// NOTE(review): presumably addWord() returns the existing id when the word
// is already known and inserts it otherwise -- confirm against Vocab.h.
WordID TD::Convert(const std::string& s) {
  const VocabString word = (VocabString)s.c_str();
  return TD::dict_->addWord(word);
}
const char* TD::Convert(const WordID& w) {
return dict_->getWord((VocabIndex)w);
}
void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
ids->clear();
for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i)
ids->push_back(TD::Convert(*i));
}
std::string TD::GetString(const std::vector<WordID>& str) {
string res;
for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i)
res += (i == str.begin() ? empty : space) + TD::Convert(*i);
return res;
}
// Tokenizes `sent` and stores the WordID of each token in *ids.
//
// *ids is cleared first. Tokens are maximal runs of characters delimited by
// a space (' ') or a tab ('\t'); no other character is treated as a
// separator. Runs of consecutive delimiters, and delimiters at the start or
// end of the sentence, do not produce empty tokens. Each token is converted
// with TD::Convert(const std::string&).
//
// Positions are tracked with std::string::size_type rather than int, so the
// comparisons against size() are not mixed signed/unsigned and long inputs
// cannot overflow the index. Tokens are taken with substr(), so the input
// no longer has to be copied and overwritten with NUL terminators.
void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
  ids->clear();
  const std::string::size_type len = sent.size();
  std::string::size_type start = 0;  // index where the current token begins
  for (std::string::size_type i = 0; i < len; ++i) {
    const char c = sent[i];
    if (c == ' ' || c == '\t') {
      // Only emit a token if at least one character precedes the delimiter.
      if (start != i)
        ids->push_back(Convert(sent.substr(start, i - start)));
      start = i + 1;
    }
  }
  // Trailing token that is not followed by a delimiter.
  if (start != len)
    ids->push_back(Convert(sent.substr(start)));
}
|