diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/Makefile.am | 4 | ||||
-rw-r--r-- | utils/fast_sparse_vector.h | 16 | ||||
-rw-r--r-- | utils/json_feature_map_lexer.h | 15 | ||||
-rw-r--r-- | utils/json_feature_map_lexer.ll | 132 |
4 files changed, 164 insertions, 3 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 46650c75..b7da0f06 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -33,12 +33,16 @@ libutils_a_SOURCES = \
   sparse_vector.cc \
   timing_stats.cc \
   verbose.cc \
+  json_feature_map_lexer.cc \
   weights.cc
 
 if HAVE_CMPH
   libutils_a_SOURCES += perfect_hash.cc
 endif
 
+json_feature_map_lexer.cc: json_feature_map_lexer.ll
+	$(LEX) -s -8 -CF -o$@ $<
+
 phmt_SOURCES = phmt.cc
 ts_SOURCES = ts.cc
 m_test_SOURCES = m_test.cc
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index af832950..68caa704 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -7,8 +7,6 @@
 // important: indexes are integers
 // important: iterators may return elements in any order
 
-#include "config.h"
-
 #include <cmath>
 #include <cstring>
 #include <climits>
@@ -16,8 +14,9 @@
 #include <cassert>
 #include <vector>
 
-#include <boost/static_assert.hpp>
+#include "config.h"
 
+#include <boost/static_assert.hpp>
 #if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
 #include <boost/serialization/map.hpp>
 #endif
@@ -119,6 +118,17 @@ class FastSparseVector {
     std::memcpy(this, &other, sizeof(FastSparseVector));
     if (is_remote_) data_.rbmap = new std::map<int, T>(*data_.rbmap);
   }
+  FastSparseVector(std::pair<int, T>* first, std::pair<int, T>* last) {  // builds from the range [first, last)
+    const ptrdiff_t n = last - first;
+    if (n <= LOCAL_MAX) {  // few enough entries: copy them into the inline array
+      is_remote_ = false;
+      local_size_ = n;
+      std::memcpy(data_.local, first, sizeof(std::pair<int, T>) * n);  // NOTE(review): assumes data_.local elements are laid out like std::pair<int, T> - confirm
+    } else {  // otherwise keep the entries in a heap-allocated std::map
+      is_remote_ = true;
+      data_.rbmap = new std::map<int, T>(first, last);
+    }
+  }
   void erase(int k) {
     if (is_remote_) {
       data_.rbmap->erase(k);
diff --git a/utils/json_feature_map_lexer.h b/utils/json_feature_map_lexer.h
new file mode 100644
index 00000000..3324aa29
--- /dev/null
+++ b/utils/json_feature_map_lexer.h
@@ -0,0 +1,15 @@
+#ifndef JSON_FEATURE_MAP_LEXER_H_
+#define JSON_FEATURE_MAP_LEXER_H_
+
+#include <iostream>
+#include <string>
+
+#include "sparse_vector.h"
+// Reads "<id> { "feature": value, ... }" records from a stream and hands each one to a callback.
+struct JSONFeatureMapLexer {
+  typedef void (*FeatureMapCallback)(const std::string& id, const SparseVector<float>& fmap, void* extra);
+  static void ReadRules(std::istream* in, FeatureMapCallback func, void* extra);
+};
+
+#endif
+
diff --git a/utils/json_feature_map_lexer.ll b/utils/json_feature_map_lexer.ll
new file mode 100644
index 00000000..372b52f5
--- /dev/null
+++ b/utils/json_feature_map_lexer.ll
@@ -0,0 +1,132 @@
+%option nounput
+%{
+
+#include "json_feature_map_lexer.h"
+#include "fdict.h"
+#include "fast_sparse_vector.h"
+
+#define YY_DECL int json_fmap_yylex (void)
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = jfmap_stream->read(buf, max_size).gcount())
+#define YY_SKIP_YYWRAP 1
+int yywrap() { return 1; }
+
+JSONFeatureMapLexer::FeatureMapCallback json_fmap_callback = NULL;
+void* json_fmap_callback_extra = NULL;
+std::istream* jfmap_stream = NULL;
+bool fl = true;
+unsigned spos = 0;  // number of bytes written to featname so far
+char featname[16000];  // decoded text of the feature name being scanned
+#define MAX_FEATS 20000
+std::pair<int, float> featmap[MAX_FEATS];  // (feature id, value) pairs of the current record
+unsigned curfeat = 0;  // number of completed entries in featmap
+std::string instid;  // identifier that precedes the current JSON object
+// Writes the UTF-8 encoding of \u escape w1 (or of the surrogate pair w1,w2) to putf8; returns the byte count (1-4).
+inline unsigned unicode_escape_to_utf8(uint16_t w1, uint16_t w2, char* putf8) {
+  uint32_t cp;
+  if((w1 & 0xfc00) == 0xd800) {
+    if((w2 & 0xfc00) == 0xdc00) {
+      cp = 0x10000 + (((static_cast<uint32_t>(w1) & 0x3ff) << 10) | (w2 & 0x3ff));
+    } else {
+      abort();  // high surrogate that is not followed by a low surrogate
+    }
+  } else {
+    cp = w1;
+  }
+
+  // The lead byte carries the topmost bits of cp; each continuation byte carries the next 6 bits.
+  if(cp < 0x80) {
+    putf8[0] = static_cast<char>(cp);
+    return 1;
+  } else if(cp < 0x0800) {
+    putf8[0] = 0xc0 | ((cp >> 6) & 0x1f);
+    putf8[1] = 0x80 | (cp & 0x3f);
+    return 2;
+  } else if(cp < 0x10000) {
+    putf8[0] = 0xe0 | ((cp >> 12) & 0x0f);  // bits 12-15 belong in the lead byte
+    putf8[1] = 0x80 | ((cp >> 6) & 0x3f);
+    putf8[2] = 0x80 | (cp & 0x3f);
+    return 3;
+  } else if(cp <= 0x10ffff) {  // largest code point a surrogate pair can express
+    putf8[0] = 0xf0 | ((cp >> 18) & 0x07);
+    putf8[1] = 0x80 | ((cp >> 12) & 0x3f);
+    putf8[2] = 0x80 | ((cp >> 6) & 0x3f);
+    putf8[3] = 0x80 | (cp & 0x3f);
+    return 4;
+  } else {
+    abort();
+  }
+  return 0;
+}
+
+%}
+
+ID [A-Za-z_0-9]+
+HEX_D [a-fA-F0-9]
+INT [-]?[0-9]+
+DOUBLE {INT}((\.[0-9]+)?([eE][-+]?[0-9]+)?)
+WS [ \t\r\n]
+LCB [{]
+RCB [}]
+UNESCAPED_CH [^\"\\\b\n\r\f\t]
+
+%x JSON PREVAL STRING JSONVAL POSTVAL DOUBLE
+%%
+
+<INITIAL>{ID} { instid = yytext; BEGIN(JSON); }
+
+<JSON>{WS}*{LCB}{WS}* { BEGIN(PREVAL); }
+
+<PREVAL>\" { BEGIN(STRING); spos=0; }
+
+<STRING>\" { featname[spos] = 0;  /* the append rules below always leave room for this terminator */
+    if (curfeat >= MAX_FEATS) { std::cerr << "too many features in one record" << std::endl; abort(); }
+    featmap[curfeat].first = FD::Convert(featname); BEGIN(JSONVAL);
+  }
+<STRING>{UNESCAPED_CH} { if (spos + 1 >= sizeof(featname)) { std::cerr << "feature name too long" << std::endl; abort(); } featname[spos++] = yytext[0]; }
+<STRING>\\\" { if (spos + 1 >= sizeof(featname)) { std::cerr << "feature name too long" << std::endl; abort(); } featname[spos++] = '"'; }
+<STRING>\\\\ { if (spos + 1 >= sizeof(featname)) { std::cerr << "feature name too long" << std::endl; abort(); } featname[spos++] = '\\'; }
+<STRING>\\\/ { if (spos + 1 >= sizeof(featname)) { std::cerr << "feature name too long" << std::endl; abort(); } featname[spos++] = '/'; }
+<STRING>\\b { /* NOTE(review): escape is dropped, nothing is appended - confirm this is intended */ }
+<STRING>\\f { /* NOTE(review): escape is dropped - confirm */ }
+<STRING>\\n { /* NOTE(review): escape is dropped - confirm */ }
+<STRING>\\r { /* NOTE(review): escape is dropped - confirm */ }
+<STRING>\\t { /* NOTE(review): escape is dropped - confirm */ }
+<STRING>\\u{HEX_D}{HEX_D}{HEX_D}{HEX_D} { abort();  /* \uXXXX escapes in feature names are not supported */
+  }
+
+<JSONVAL>{WS}*:{WS}* { BEGIN(DOUBLE); }
+<DOUBLE>{DOUBLE} { featmap[curfeat++].second = strtod(yytext, 0);  /* completes the entry started when the name closed */
+    BEGIN(POSTVAL); }
+
+<POSTVAL>{WS}*,{WS}* { BEGIN(PREVAL); }
+<POSTVAL>{WS}*{RCB}\n* {
+    const SparseVector<float> x(&featmap[0], &featmap[curfeat]);  /* snapshot of this record's features */
+    json_fmap_callback(instid, x, json_fmap_callback_extra);
+    curfeat = 0;  /* start the next record with an empty feature list */
+    BEGIN(INITIAL);
+  }
+
+<PREVAL,POSTVAL,DOUBLE,JSONVAL,INITIAL>. { std::cerr << "bad input: " << yytext << std::endl; abort(); }
+
+%%
+// Scans all of *in, invoking func(id, features, extra) once for every completed record; aborts on malformed input.
+void JSONFeatureMapLexer::ReadRules(std::istream* in, FeatureMapCallback func, void* extra) {
+  json_fmap_callback = func;
+  json_fmap_callback_extra = extra;
+  jfmap_stream = in;  // YY_INPUT pulls its bytes from this stream
+  json_fmap_yylex();
+}
+
+#if 0
+void cb(const std::string& id, const SparseVector<float>& fmap, void* extra) {
+  (void) extra;
+  static int cc = 0;
+  cc++;
+}
+
+int main() {
+  JSONFeatureMapLexer::ReadRules(&std::cin, cb, NULL);
+}
+#endif
+