summaryrefslogtreecommitdiff
path: root/utils/json_feature_map_lexer.ll
diff options
context:
space:
mode:
Diffstat (limited to 'utils/json_feature_map_lexer.ll')
-rw-r--r--utils/json_feature_map_lexer.ll132
1 files changed, 132 insertions, 0 deletions
diff --git a/utils/json_feature_map_lexer.ll b/utils/json_feature_map_lexer.ll
new file mode 100644
index 00000000..372b52f5
--- /dev/null
+++ b/utils/json_feature_map_lexer.ll
@@ -0,0 +1,132 @@
+%option nounput
+%{
+
+#include "json_feature_map_lexer.h"
+#include "fdict.h"
+#include "fast_sparse_vector.h"
+
+#define YY_DECL int json_fmap_yylex (void)
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = jfmap_stream->read(buf, max_size).gcount())
+#define YY_SKIP_YYWRAP 1
+int yywrap() { return 1; }
+
+JSONFeatureMapLexer::FeatureMapCallback json_fmap_callback = NULL;
+void* json_fmap_callback_extra = NULL;
+std::istream* jfmap_stream = NULL;
+bool fl = true;
+unsigned spos = 0;
+char featname[16000];
+#define MAX_FEATS 20000
+std::pair<int, float> featmap[MAX_FEATS];
+unsigned curfeat = 0;
+std::string instid;
+
+inline unsigned unicode_escape_to_utf8(uint16_t w1, uint16_t w2, char* putf8) {
+ uint32_t cp;
+ if((w1 & 0xfc00) == 0xd800) {
+ if((w2 & 0xfc00) == 0xdc00) {
+ cp = 0x10000 + (((static_cast<uint32_t>(w1) & 0x3ff) << 10) | (w2 & 0x3ff));
+ } else {
+ abort();
+ }
+ } else {
+ cp = w1;
+ }
+
+
+ if(cp < 0x80) {
+ putf8[0] = static_cast<char>(cp);
+ return 1;
+ } else if(cp < 0x0800) {
+ putf8[0] = 0xc0 | ((cp >> 6) & 0x1f);
+ putf8[1] = 0x80 | (cp & 0x3f);
+ return 2;
+ } else if(cp < 0x10000) {
+ putf8[0] = 0xe0 | ((cp >> 6) & 0x0f);
+ putf8[1] = 0x80 | ((cp >> 6) & 0x3f);
+ putf8[2] = 0x80 | (cp & 0x3f);
+ return 3;
+ } else if(cp < 0x1fffff) {
+ putf8[0] = 0xf0 | ((cp >> 18) & 0x07);
+ putf8[1] = 0x80 | ((cp >> 12) & 0x3f);
+ putf8[2] = 0x80 | ((cp >> 6) & 0x3f);
+ putf8[3] = 0x80 | (cp & 0x3f);
+ return 4;
+ } else {
+ abort();
+ }
+ return 0;
+}
+
+%}
+
+ID [A-Za-z_0-9]+
+HEX_D [a-fA-F0-9]
+INT [-]?[0-9]+
+DOUBLE {INT}((\.[0-9]+)?([eE][-+]?[0-9]+)?)
+WS [ \t\r\n]
+LCB [{]
+RCB [}]
+UNESCAPED_CH [^\"\\\b\n\r\f\t]
+
+%x JSON PREVAL STRING JSONVAL POSTVAL DOUBLE
+%%
+
+<INITIAL>{ID} { instid = yytext; BEGIN(JSON); }
+
+<JSON>{WS}*{LCB}{WS}* { BEGIN(PREVAL); }
+
+<PREVAL>\" { BEGIN(STRING); spos=0; }
+
+<STRING>\" { featname[spos] = 0;
+ featmap[curfeat].first = FD::Convert(featname);
+ BEGIN(JSONVAL);
+ }
+<STRING>{UNESCAPED_CH} { featname[spos++] = yytext[0]; }
+<STRING>\\\" { featname[spos++] = '"'; }
+<STRING>\\\\ { featname[spos++] = '\\'; }
+<STRING>\\\/ { featname[spos++] = '/'; }
+<STRING>\\b { }
+<STRING>\\f { }
+<STRING>\\n { }
+<STRING>\\r { }
+<STRING>\\t { }
+<STRING>\\u{HEX_D}{HEX_D}{HEX_D}{HEX_D} { abort();
+ }
+
+<JSONVAL>{WS}*:{WS}* { BEGIN(DOUBLE); }
+<DOUBLE>{DOUBLE} { featmap[curfeat++].second = strtod(yytext, 0);
+ BEGIN(POSTVAL); }
+
+<POSTVAL>{WS}*,{WS}* { BEGIN(PREVAL); }
+<POSTVAL>{WS}*{RCB}\n* {
+ const SparseVector<float> x(&featmap[0], &featmap[curfeat]);
+ json_fmap_callback(instid, x, json_fmap_callback_extra);
+ curfeat = 0;
+ BEGIN(INITIAL);
+ }
+
+<PREVAL,POSTVAL,DOUBLE,JSONVAL,INITIAL>. { std::cerr << "bad input: " << yytext << std::endl; abort(); }
+
+%%
+
+void JSONFeatureMapLexer::ReadRules(std::istream* in, FeatureMapCallback func, void* extra) {
+ json_fmap_callback = func;
+ json_fmap_callback_extra = extra;
+ jfmap_stream = in;
+ json_fmap_yylex();
+}
+
+#if 0
+void cb(const std::string& id, const SparseVector<float>& fmap, void* extra) {
+ (void) extra;
+ static int cc = 0;
+ cc++;
+}
+
+int main() {
+ JSONFeatureMapLexer::ReadRules(&std::cin, cb, NULL);
+}
+#endif
+