summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
Diffstat (limited to 'utils')
-rw-r--r-- utils/Makefile.am 4
-rw-r--r-- utils/fast_sparse_vector.h 16
-rw-r--r-- utils/json_feature_map_lexer.h 15
-rw-r--r-- utils/json_feature_map_lexer.ll 132
4 files changed, 164 insertions, 3 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 46650c75..b7da0f06 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -33,12 +33,16 @@ libutils_a_SOURCES = \
sparse_vector.cc \
timing_stats.cc \
verbose.cc \
+ json_feature_map_lexer.cc \
weights.cc
if HAVE_CMPH
libutils_a_SOURCES += perfect_hash.cc
endif
+json_feature_map_lexer.cc: json_feature_map_lexer.ll
+ $(LEX) -s -8 -CF -o$@ $<
+
phmt_SOURCES = phmt.cc
ts_SOURCES = ts.cc
m_test_SOURCES = m_test.cc
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index af832950..68caa704 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -7,8 +7,6 @@
// important: indexes are integers
// important: iterators may return elements in any order
-#include "config.h"
-
#include <cmath>
#include <cstring>
#include <climits>
@@ -16,8 +14,9 @@
#include <cassert>
#include <vector>
-#include <boost/static_assert.hpp>
+#include "config.h"
+#include <boost/static_assert.hpp>
#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
#include <boost/serialization/map.hpp>
#endif
@@ -119,6 +118,17 @@ class FastSparseVector {
std::memcpy(this, &other, sizeof(FastSparseVector));
if (is_remote_) data_.rbmap = new std::map<int, T>(*data_.rbmap);
}
+ FastSparseVector(std::pair<int, T>* first, std::pair<int, T>* last) {
+ const ptrdiff_t n = last - first;
+ if (n <= LOCAL_MAX) {
+ is_remote_ = false;
+ local_size_ = n;
+ std::memcpy(data_.local, first, sizeof(std::pair<int, T>) * n);
+ } else {
+ is_remote_ = true;
+ data_.rbmap = new std::map<int, T>(first, last);
+ }
+ }
void erase(int k) {
if (is_remote_) {
data_.rbmap->erase(k);
diff --git a/utils/json_feature_map_lexer.h b/utils/json_feature_map_lexer.h
new file mode 100644
index 00000000..3324aa29
--- /dev/null
+++ b/utils/json_feature_map_lexer.h
@@ -0,0 +1,15 @@
+#ifndef _RULE_LEXER_H_
+#define _RULE_LEXER_H_
+
+#include <iostream>
+#include <string>
+
+#include "sparse_vector.h"
+
+struct JSONFeatureMapLexer {
+ typedef void (*FeatureMapCallback)(const std::string& id, const SparseVector<float>& fmap, void* extra);
+ static void ReadRules(std::istream* in, FeatureMapCallback func, void* extra);
+};
+
+#endif
+
diff --git a/utils/json_feature_map_lexer.ll b/utils/json_feature_map_lexer.ll
new file mode 100644
index 00000000..372b52f5
--- /dev/null
+++ b/utils/json_feature_map_lexer.ll
@@ -0,0 +1,132 @@
+%option nounput
+%{
+
+#include "json_feature_map_lexer.h"
+#include "fdict.h"
+#include "fast_sparse_vector.h"
+
+#define YY_DECL int json_fmap_yylex (void)
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = jfmap_stream->read(buf, max_size).gcount())
+#define YY_SKIP_YYWRAP 1
+int yywrap() { return 1; }
+
+JSONFeatureMapLexer::FeatureMapCallback json_fmap_callback = NULL;
+void* json_fmap_callback_extra = NULL;
+std::istream* jfmap_stream = NULL;
+bool fl = true;
+unsigned spos = 0;
+char featname[16000];
+#define MAX_FEATS 20000
+std::pair<int, float> featmap[MAX_FEATS];
+unsigned curfeat = 0;
+std::string instid;
+
// Converts one UTF-16 code unit, or a surrogate pair, into UTF-8.
//   w1    - first code unit; when it is a high surrogate (0xD800-0xDBFF) the
//           pair (w1, w2) is decoded as a single supplementary code point
//   w2    - second code unit; only examined when w1 is a high surrogate
//   putf8 - output buffer with room for at least 4 bytes; no terminator is
//           written
// Returns the number of bytes stored in putf8 (1 to 4).
// Calls abort() on an unpaired surrogate (high surrogate without a following
// low surrogate, or a low surrogate on its own), since neither is a valid
// code point to encode.
inline unsigned unicode_escape_to_utf8(uint16_t w1, uint16_t w2, char* putf8) {
  uint32_t cp;
  if ((w1 & 0xfc00) == 0xd800) {
    // High surrogate: it must be completed by a low surrogate in w2.
    if ((w2 & 0xfc00) != 0xdc00) abort();
    cp = 0x10000 + (((static_cast<uint32_t>(w1) & 0x3ff) << 10) | (w2 & 0x3ff));
  } else if ((w1 & 0xfc00) == 0xdc00) {
    // Low surrogate with no preceding high surrogate.
    abort();
  } else {
    cp = w1;
  }

  if (cp < 0x80) {
    putf8[0] = static_cast<char>(cp);
    return 1;
  }
  if (cp < 0x0800) {
    putf8[0] = static_cast<char>(0xc0 | ((cp >> 6) & 0x1f));
    putf8[1] = static_cast<char>(0x80 | (cp & 0x3f));
    return 2;
  }
  if (cp < 0x10000) {
    // The lead byte carries bits 12-15 (the original shifted by 6 here, which
    // duplicated bits of the second byte and corrupted the output).
    putf8[0] = static_cast<char>(0xe0 | ((cp >> 12) & 0x0f));
    putf8[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
    putf8[2] = static_cast<char>(0x80 | (cp & 0x3f));
    return 3;
  }
  // A decoded surrogate pair is at most 0x10FFFF, so four bytes always suffice.
  putf8[0] = static_cast<char>(0xf0 | ((cp >> 18) & 0x07));
  putf8[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
  putf8[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
  putf8[3] = static_cast<char>(0x80 | (cp & 0x3f));
  return 4;
}
+
+%}
+
+ID [A-Za-z_0-9]+
+HEX_D [a-fA-F0-9]
+INT [-]?[0-9]+
+DOUBLE {INT}((\.[0-9]+)?([eE][-+]?[0-9]+)?)
+WS [ \t\r\n]
+LCB [{]
+RCB [}]
+UNESCAPED_CH [^\"\\\b\n\r\f\t]
+
+%x JSON PREVAL STRING JSONVAL POSTVAL DOUBLE
+%%
+
+<INITIAL>{ID} { instid = yytext; BEGIN(JSON); }
+
+<JSON>{WS}*{LCB}{WS}* { BEGIN(PREVAL); }
+
+<PREVAL>\" { BEGIN(STRING); spos=0; }
+
+<STRING>\" { featname[spos] = 0;
+ featmap[curfeat].first = FD::Convert(featname);
+ BEGIN(JSONVAL);
+ }
+<STRING>{UNESCAPED_CH} { featname[spos++] = yytext[0]; }
+<STRING>\\\" { featname[spos++] = '"'; }
+<STRING>\\\\ { featname[spos++] = '\\'; }
+<STRING>\\\/ { featname[spos++] = '/'; }
+<STRING>\\b { }
+<STRING>\\f { }
+<STRING>\\n { }
+<STRING>\\r { }
+<STRING>\\t { }
+<STRING>\\u{HEX_D}{HEX_D}{HEX_D}{HEX_D} { abort();
+ }
+
+<JSONVAL>{WS}*:{WS}* { BEGIN(DOUBLE); }
+<DOUBLE>{DOUBLE} { featmap[curfeat++].second = strtod(yytext, 0);
+ BEGIN(POSTVAL); }
+
+<POSTVAL>{WS}*,{WS}* { BEGIN(PREVAL); }
+<POSTVAL>{WS}*{RCB}\n* {
+ const SparseVector<float> x(&featmap[0], &featmap[curfeat]);
+ json_fmap_callback(instid, x, json_fmap_callback_extra);
+ curfeat = 0;
+ BEGIN(INITIAL);
+ }
+
+<PREVAL,POSTVAL,DOUBLE,JSONVAL,INITIAL>. { std::cerr << "bad input: " << yytext << std::endl; abort(); }
+
+%%
+
+void JSONFeatureMapLexer::ReadRules(std::istream* in, FeatureMapCallback func, void* extra) {
+ json_fmap_callback = func;
+ json_fmap_callback_extra = extra;
+ jfmap_stream = in;
+ json_fmap_yylex();
+}
+
#if 0
// Disabled stand-alone driver: counts the records read from standard input.
void cb(const std::string& id, const SparseVector<float>& fmap, void* extra) {
  static int records_seen = 0;
  (void) extra;
  ++records_seen;
}

int main() {
  JSONFeatureMapLexer::ReadRules(&std::cin, cb, NULL);
}
#endif
+