blob: 7203b325c4916e8f55993da1a945e0e3b55f2ec0 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
#include "factored_lexicon_helper.h"
#include "filelib.h"
#include "stringlib.h"
using namespace std;
// Default construction: no external source file and no target map are
// loaded (has_src_ and has_trg_ stay false). kNULL is the interned ID of
// "<eps>", and the escape table is populated via InitEscape().
FactoredLexiconHelper::FactoredLexiconHelper()
    : kNULL(TD::Convert("<eps>")),
      has_src_(false),
      has_trg_(false) {
  InitEscape();
}
// Constructs a helper, optionally loading a source-side file and a target
// word map.
//
// srcfile:    if non-empty and not "*", every line is converted to a
//             sequence of WordIDs and appended to src_; has_src_ becomes true.
// trgmapfile: if non-empty and not "*", every line must consist of exactly
//             two whitespace-separated tokens "<from> <to>", which are stored
//             in trgmap_; has_trg_ becomes true.
//
// A malformed map line or a repeated <from> word is reported on stderr and
// the process is aborted.
FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile)
    : kNULL(TD::Convert("<eps>")),
      has_src_(false),
      has_trg_(false) {
  if (!srcfile.empty() && srcfile != "*") {
    ReadFile src_reader(srcfile);
    has_src_ = true;
    istream& src_in = *src_reader.stream();
    string src_line;
    // Stops at the first failed read (normally end of file).
    while (getline(src_in, src_line)) {
      vector<WordID> words;
      TD::ConvertSentence(src_line, &words);
      src_.push_back(words);
    }
  }

  if (!trgmapfile.empty() && trgmapfile != "*") {
    ReadFile map_reader(trgmapfile);
    has_trg_ = true;
    istream& map_in = *map_reader.stream();
    string map_line;
    vector<string> fields;
    while (getline(map_in, map_line)) {
      SplitOnWhitespace(map_line, &fields);
      if (fields.size() != 2) {
        cerr << "Error reading line in map file: " << map_line << endl;
        abort();
      }
      // operator[] inserts a fresh entry when the key is new; a non-zero
      // value therefore signals that the word was already mapped.
      // NOTE(review): relies on 0 never being a legitimate mapped WordID --
      // confirm against TD.
      WordID& mapped = trgmap_[TD::Convert(fields[0])];
      if (mapped != 0) {
        cerr << "Duplicate entry for word " << fields[0] << endl;
        abort();
      }
      mapped = TD::Convert(fields[1]);
    }
  }

  InitEscape();
}
// Fills escape_ with the replacement token for each of the characters
// "=", ";" and ",". Both the character and its replacement are interned
// through TD::Convert, so escape_ maps WordID -> WordID.
void FactoredLexiconHelper::InitEscape() {
  // { raw token, escaped token }
  static const char* const kEscapePairs[][2] = {
    { "=", "__EQ" },
    { ";", "__SC" },
    { ",", "__CO" },
  };
  const unsigned kNumPairs = sizeof(kEscapePairs) / sizeof(kEscapePairs[0]);
  for (unsigned p = 0; p < kNumPairs; ++p) {
    escape_[TD::Convert(kEscapePairs[p][0])] = TD::Convert(kEscapePairs[p][1]);
  }
}
// Sets cur_src_ to the source word sequence for the sentence described by
// smeta.
//
// When a source file was loaded (has_src_), the sequence is looked up in
// src_ by sentence ID; an ID outside [0, src_.size()) is reported on stderr
// and aborts the process. Otherwise the words are read from the source
// lattice, taking the label of the single arc at each position.
//
// If the resulting length differs from smeta.GetSourceLength(), a warning is
// written to stderr; processing continues.
void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) {
  if (has_src_) {
    const int id = smeta.GetSentenceID();
    // Checked explicitly rather than with assert(): assert() is compiled out
    // under NDEBUG, which would leave src_[id] reading out of bounds. The
    // negative test also avoids a signed/unsigned comparison.
    if (id < 0 || static_cast<size_t>(id) >= src_.size()) {
      cerr << "Sentence id=" << id << " is out of range: source file has "
           << src_.size() << " sentences" << endl;
      abort();
    }
    cur_src_ = src_[id];
  } else {
    cur_src_.resize(smeta.GetSourceLength());
    const int len = static_cast<int>(cur_src_.size());
    for (int i = 0; i < len; ++i) {
      const vector<LatticeArc>& arcs = smeta.GetSourceLattice()[i];
      assert(arcs.size() == 1); // only sentences supported for now
      cur_src_[i] = arcs[0].label;
    }
  }
  if (cur_src_.size() != smeta.GetSourceLength()) {
    cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl;
    cerr << " mapped len=" << cur_src_.size() << endl;
    cerr << " actual len=" << smeta.GetSourceLength() << endl;
  }
}
|