1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
%option nounput
%{
#include "json_feature_map_lexer.h"
#include "fdict.h"
#include "fast_sparse_vector.h"
// Name and signature of the scanner function that flex generates for this file.
#define YY_DECL int json_fmap_yylex (void)
// Replace flex's default input hook: refill the scan buffer with up to
// max_size bytes read from jfmap_stream; result receives the byte count
// reported by gcount().
#undef YY_INPUT
#define YY_INPUT(buf, result, max_size) (result = jfmap_stream->read(buf, max_size).gcount())
// NOTE(review): presumably suppresses flex's own yywrap prototype so that the
// definition below is the one the scanner links against -- confirm against
// the flex skeleton in use.
#define YY_SKIP_YYWRAP 1
// Always returns 1, i.e. there is no further input once the stream is drained.
int yywrap() { return 1; }
// Callback invoked once per completed feature map; installed by ReadRules.
JSONFeatureMapLexer::FeatureMapCallback json_fmap_callback = NULL;
// Opaque pointer handed back to the callback as its third argument.
void* json_fmap_callback_extra = NULL;
// Stream that YY_INPUT reads from; installed by ReadRules.
std::istream* jfmap_stream = NULL;
// Not referenced anywhere in the visible part of this file.
bool fl = true;
// Write index into featname; reset to 0 when the opening quote of a name is seen.
unsigned spos = 0;
// Buffer holding the feature name currently being decoded (NUL-terminated
// when the closing quote is seen). Capacity is 16000 bytes including the NUL.
char featname[16000];
// Capacity of featmap, i.e. the largest number of features in one object.
#define MAX_FEATS 20000
// (feature id, value) pairs collected for the object currently being parsed;
// first is the result of FD::Convert on the name, second is the parsed number.
std::pair<int, float> featmap[MAX_FEATS];
// Number of completed pairs in featmap (also the index of the pair being
// filled); reset to 0 after each callback.
unsigned curfeat = 0;
// Text of the most recent token matched by the ID pattern in the INITIAL state.
std::string instid;
/**
 * Encodes the code point described by one or two UTF-16 code units as UTF-8.
 *
 * @param w1     First code unit. If it is a high surrogate (0xD800-0xDBFF),
 *               w2 must be the matching low surrogate.
 * @param w2     Second code unit; only examined when w1 is a high surrogate.
 * @param putf8  Destination buffer; up to 4 bytes are written. No terminating
 *               NUL is appended.
 * @return       Number of bytes written (1-4).
 *
 * Calls abort() when w1 is a high surrogate and w2 is not a low surrogate.
 * A lone low surrogate in w1 is not rejected; it is encoded like any other
 * 3-byte value, as before.
 */
inline unsigned unicode_escape_to_utf8(uint16_t w1, uint16_t w2, char* putf8) {
  uint32_t cp = w1;
  if ((w1 & 0xfc00) == 0xd800) {
    // High surrogate: combine with the low surrogate into a supplementary
    // plane code point (U+10000..U+10FFFF).
    if ((w2 & 0xfc00) != 0xdc00) {
      abort();
    }
    cp = 0x10000 + (((static_cast<uint32_t>(w1) & 0x3ff) << 10) | (w2 & 0x3ff));
  }

  if (cp < 0x80) {
    putf8[0] = static_cast<char>(cp);
    return 1;
  }
  if (cp < 0x0800) {
    putf8[0] = static_cast<char>(0xc0 | ((cp >> 6) & 0x1f));
    putf8[1] = static_cast<char>(0x80 | (cp & 0x3f));
    return 2;
  }
  if (cp < 0x10000) {
    // The lead byte carries bits 12..15 (the previous code shifted by 6 here,
    // which corrupted most code points in this range).
    putf8[0] = static_cast<char>(0xe0 | ((cp >> 12) & 0x0f));
    putf8[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
    putf8[2] = static_cast<char>(0x80 | (cp & 0x3f));
    return 3;
  }
  // Only reachable through the surrogate branch, so cp <= 0x10FFFF here.
  putf8[0] = static_cast<char>(0xf0 | ((cp >> 18) & 0x07));
  putf8[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
  putf8[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
  putf8[3] = static_cast<char>(0x80 | (cp & 0x3f));
  return 4;
}
%}
/* ID: a run of non-whitespace characters; used for the instance id that
   precedes each JSON object. */
ID [^ \t\n\r]+
/* HEX_D: one hexadecimal digit (used by the \uXXXX escape rule). */
HEX_D [a-fA-F0-9]
/* INT: optional minus sign followed by decimal digits. */
INT [-]?[0-9]+
/* DOUBLE: an INT with an optional fraction and an optional exponent. A
   leading plus sign or a number starting with a dot does not match. */
DOUBLE {INT}((\.[0-9]+)?([eE][-+]?[0-9]+)?)
/* WS: a single space, tab, carriage return or newline. */
WS [ \t\r\n]
/* LCB / RCB: left and right curly bracket. */
LCB [{]
RCB [}]
/* UNESCAPED_CH: any character allowed verbatim inside a feature name, i.e.
   anything except double quote, backslash, backspace, newline, carriage
   return, form feed and tab. */
UNESCAPED_CH [^\"\\\b\n\r\f\t]
/* Exclusive start conditions, in the order a record is scanned:
   JSON    - instance id seen, expecting the opening bracket of the object
   PREVAL  - expecting the opening quote of a feature name
   STRING  - inside a quoted feature name
   JSONVAL - name finished, expecting the colon
   DOUBLE  - expecting the numeric value
   POSTVAL - value read, expecting a comma or the closing bracket */
%x JSON PREVAL STRING JSONVAL POSTVAL DOUBLE
%%
<INITIAL>{ID} { /* Start of a record: remember the instance id, then expect the object. */ instid = yytext; BEGIN(JSON); }
<JSON>{WS}*{LCB}{WS}* { /* Opening bracket of a non-empty object: expect the first feature name. */ BEGIN(PREVAL); }
<JSON>{WS}*{LCB}{WS}*{RCB}\n* {const SparseVector<float> x;
/* Empty object. This pattern is longer than the one above whenever the
   closing bracket follows directly, so it is the one that matches. The
   callback receives a default-constructed vector for this instance; any
   newlines after the closing bracket are consumed as part of the match. */
json_fmap_callback(instid, x, json_fmap_callback_extra);
curfeat = 0;
/* Back to waiting for the next instance id. */
BEGIN(INITIAL);}
<PREVAL>\" { /* Opening quote of a feature name: restart the name buffer. */
  BEGIN(STRING);
  spos = 0;
}
<STRING>\" { /* Closing quote: terminate the name and store its id as the key of the next pair. */
  featname[spos] = 0;
  /* Refuse to write past the end of featmap. */
  if (curfeat >= MAX_FEATS) {
    std::cerr << "too many features in feature map (limit " << MAX_FEATS << ")" << std::endl;
    abort();
  }
  featmap[curfeat].first = FD::Convert(featname);
  BEGIN(JSONVAL);
}
<STRING>{UNESCAPED_CH} { /* Ordinary character: copy it, keeping one byte free for the terminating NUL. */
  if (spos + 1 >= sizeof(featname)) {
    std::cerr << "feature name too long (limit " << sizeof(featname) - 1 << " bytes)" << std::endl;
    abort();
  }
  featname[spos++] = yytext[0];
}
<STRING>\\[\"\\/] { /* Escaped quote, backslash or slash: store the character that follows the backslash. */
  if (spos + 1 >= sizeof(featname)) {
    std::cerr << "feature name too long (limit " << sizeof(featname) - 1 << " bytes)" << std::endl;
    abort();
  }
  featname[spos++] = yytext[1];
}
<STRING>\\b { /* Consumed; contributes no character to the name. */ }
<STRING>\\f { /* Consumed; contributes no character to the name. */ }
<STRING>\\n { /* Consumed; contributes no character to the name. */ }
<STRING>\\r { /* Consumed; contributes no character to the name. */ }
<STRING>\\t { /* Consumed; contributes no character to the name. */ }
<STRING>\\u[dD][89abAB]{HEX_D}{HEX_D}\\u[dD][c-fC-F]{HEX_D}{HEX_D} { /* Surrogate pair: two escapes that together encode one code point above U+FFFF. Being the longer match, this rule wins over the single-escape rule below. */
  /* The UTF-8 form needs 4 bytes, plus room for the terminating NUL. */
  if (spos + 4 >= sizeof(featname)) {
    std::cerr << "feature name too long (limit " << sizeof(featname) - 1 << " bytes)" << std::endl;
    abort();
  }
  /* yytext is \uHHHH\uLLLL: the high unit starts at offset 2 (strtol stops
     at the second backslash), the low unit at offset 8. */
  const uint16_t hi = static_cast<uint16_t>(strtol(&yytext[2], NULL, 16));
  const uint16_t lo = static_cast<uint16_t>(strtol(&yytext[8], NULL, 16));
  spos += unicode_escape_to_utf8(hi, lo, &featname[spos]);
}
<STRING>\\u{HEX_D}{HEX_D}{HEX_D}{HEX_D} { /* Single escape. A high surrogate that is not followed by a low-surrogate escape ends up here and makes unicode_escape_to_utf8 abort. */
  /* Reserve the 4-byte worst case of the encoder, plus the terminating NUL. */
  if (spos + 4 >= sizeof(featname)) {
    std::cerr << "feature name too long (limit " << sizeof(featname) - 1 << " bytes)" << std::endl;
    abort();
  }
  const uint16_t hex = static_cast<uint16_t>(strtol(&yytext[2], NULL, 16));
  /* Advance by the number of bytes written. The previous form modified spos
     twice in one expression, which is unsequenced before C++17. */
  spos += unicode_escape_to_utf8(hex, 0, &featname[spos]);
}
<JSONVAL>{WS}*:{WS}* { /* Colon between name and value: expect the number next. */ BEGIN(DOUBLE); }
<DOUBLE>{DOUBLE} { featmap[curfeat++].second = strtod(yytext, 0);
/* The pair is now complete (its key was stored when the name's closing
   quote was seen); curfeat has moved on to the next free slot. */
BEGIN(POSTVAL); }
<POSTVAL>{WS}*,{WS}* { /* Comma: another name/value pair follows. */ BEGIN(PREVAL); }
<POSTVAL>{WS}*{RCB}\n* {
/* End of the object: build a vector from the curfeat pairs collected so far
   and hand it to the callback together with the instance id. Newlines after
   the closing bracket are consumed as part of the match. */
const SparseVector<float> x(&featmap[0], &featmap[curfeat]);
json_fmap_callback(instid, x, json_fmap_callback_extra);
/* Reset for the next record. */
curfeat = 0;
BEGIN(INITIAL);
}
<PREVAL,POSTVAL,DOUBLE,JSONVAL,INITIAL>. { /* Any other single character (except newline) in the listed states is fatal. STRING and JSON are not listed, so unmatched input in those states falls through to the default flex rule instead. */ std::cerr << "bad input: " << yytext << std::endl; abort(); }
%%
// Runs the scanner over `in`, calling `func(instance_id, feature_vector, extra)`
// once for every record the rules complete.
//
//   in    - stream the scanner pulls its input from (via YY_INPUT)
//   func  - callback invoked per completed feature map
//   extra - opaque pointer passed through to `func` unchanged
//
// NOTE(review): nothing here resets scanner state between calls; the flex
// manual requires a restart before scanning again after end-of-file, so
// confirm that callers only invoke this once per process.
void JSONFeatureMapLexer::ReadRules(std::istream* in, FeatureMapCallback func, void* extra) {
  // Publish the input source and the callback pair through the file-level
  // variables that the generated scanner reads.
  jfmap_stream = in;
  json_fmap_callback_extra = extra;
  json_fmap_callback = func;

  // Scan until the stream is exhausted; the return value carries no information.
  (void) json_fmap_yylex();
}
#if 0
// Disabled stand-alone driver: everything up to the matching #endif is
// compiled out.

// Callback that only counts how many feature maps were delivered.
void cb(const std::string& id, const SparseVector<float>& fmap, void* extra) {
(void) extra;
static int cc = 0;
cc++;
}
// Feeds standard input through the lexer using the counting callback above.
int main() {
JSONFeatureMapLexer::ReadRules(&std::cin, cb, NULL);
}
#endif
|