1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
#include "stringlib.h"
#include <cstdlib>
#include <cassert>
#include <iostream>
#include <map>
#include "lattice.h"
using namespace std;
// Splits a translator input line of the form "SOURCE ||| REFERENCE".
//
//   line  - the raw input line.
//   input - receives the source side (must not be null).
//   ref   - receives the reference side; may be null only when the line
//           carries no non-empty reference.
//
// If the line starts with `{"rules":`, the search for "|||" begins just past
// the first "}}", so separators inside that leading prefix are ignored; the
// process aborts with a message on stderr if no "}}" is found.
//
// When no separator is found, *input is the whole line and *ref is left
// untouched. Otherwise *ref is cleared and then set to the text after the
// separator when that text is non-empty. One space directly before and one
// directly after "|||" is treated as padding and dropped; any other adjacent
// character is kept.
void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref) {
  const size_t kSeparatorLen = 3;  // strlen("|||")

  size_t hint = 0;
  if (line.compare(0, 9, "{\"rules\":") == 0) {
    hint = line.find("}}");
    if (hint == std::string::npos) {
      std::cerr << "Syntax error: " << line << std::endl;
      std::abort();
    }
    hint += 2;
  }

  const size_t pos = line.find("|||", hint);
  if (pos == std::string::npos) {
    *input = line;
    return;
  }
  if (ref) ref->clear();

  // Drop the padding space before the separator only if it is really there;
  // this also keeps `pos == 0` from wrapping around to npos.
  size_t input_end = pos;
  if (input_end > 0 && line[input_end - 1] == ' ') --input_end;

  // Same on the right-hand side. ref_begin never exceeds line.size(), so the
  // substr() below cannot throw when "|||" is the last thing on the line.
  size_t ref_begin = pos + kSeparatorLen;
  if (ref_begin < line.size() && line[ref_begin] == ' ') ++ref_begin;

  // Extract both pieces before writing, in case an output aliases `line`.
  const std::string source = line.substr(0, input_end);
  const std::string rline = line.substr(ref_begin);

  *input = source;
  if (!rline.empty()) {
    assert(ref);
    *ref = rline;
  }
}
// Lattice-valued variant of ParseTranslatorInput(): the source side goes to
// *input as text, and a non-empty reference side is handed to
// LatticeTools::ConvertTextOrPLF() to fill *ref. *ref is left untouched (and
// may be null) when the line has no reference text.
void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref) {
  std::string reference_text;
  ParseTranslatorInput(line, input, &reference_text);
  if (reference_text.empty()) return;

  assert(ref);
  LatticeTools::ConvertTextOrPLF(reference_text, ref);
}
// Strips a leading <seg ...> ... </seg> wrapper from *pline and records the
// tag's attributes in *out.
//
//   pline - in/out: the line to process. Left unchanged unless its
//           LowercaseString() copy starts with "<seg" and contains a '>'.
//           Otherwise replaced by Trim() of the text between the first '>'
//           and the first "</seg>" (or the end of the line if there is none).
//   out   - receives one entry per name=value attribute found in the tag.
//           Existing entries with the same name are overwritten.
//
// Attribute values may be double-quoted (ending at the next '"') or bare
// (ending at the next space). Single spaces around '=' are tolerated. An
// unterminated quote is reported on stderr; the remaining text is stored as
// that attribute's value and parsing stops.
//
// NOTE(review): attributes are parsed from the LowercaseString() copy of the
// line, so the stored names AND values are whatever that helper returns
// (presumably lowercased) -- confirm no caller needs case-preserved values.
void ProcessAndStripSGML(std::string* pline, std::map<std::string, std::string>* out) {
  std::map<std::string, std::string>& meta = *out;
  std::string& line = *pline;

  const std::string lline = LowercaseString(line);
  if (lline.compare(0, 4, "<seg") != 0) return;
  const size_t tag_end = lline.find('>');
  if (tag_end == std::string::npos) return;  // error
  const size_t seg_close = lline.find("</seg>");

  // Attribute text sits between "<seg" and the first '>'.
  std::string seg = Trim(lline.substr(4, tag_end - 4));
  // If "</seg>" is missing (npos) the count below is huge and substr() clamps
  // it, so the text simply runs to the end of the line.
  const std::string text = line.substr(tag_end + 1, seg_close - tag_end - 1);

  // Normalise `name = value` to `name=value`: remove one space next to an '='
  // and rescan from the start until no '=' has an adjacent space.
  for (size_t i = 1; i < seg.size(); ++i) {
    if (seg[i] != '=') continue;
    if (seg[i - 1] == ' ') {
      seg.erase(i - 1, 1);
      i = 0;  // restart the scan (loop increment brings it back to 1)
    } else if (i + 1 < seg.size() && seg[i + 1] == ' ') {
      seg.erase(i + 1, 1);
      i = 0;
    }
  }

  line = Trim(text);
  if (seg.empty()) return;

  // Consume one name=value pair per iteration from the front of `seg`. The
  // search always restarts at the beginning of the remaining text, so an
  // attribute that follows a bare (unquoted) value is never skipped, no
  // matter how short its name is.
  for (;;) {
    const size_t eq = seg.find('=', 1);
    if (eq == std::string::npos) break;

    const std::string label = Trim(seg.substr(0, eq));
    std::string val = seg.substr(eq + 1);
    std::string rest;  // attribute text still to be parsed after this pair

    if (!val.empty() && val[0] == '"') {
      val.erase(0, 1);
      const size_t end_quote = val.find('"');
      if (end_quote == std::string::npos) {
        std::cerr << "SGML parse error: missing \"\n";
        // `rest` stays empty: the unterminated text becomes the value.
      } else {
        rest = val.substr(end_quote + 1);
        val.erase(end_quote);
      }
    } else {
      const size_t space = val.find(' ');
      if (space != std::string::npos) {
        rest = val.substr(space + 1);
        val.erase(space);
      }
    }

    seg = Trim(rest);
    meta[label] = val;
  }
}
|