diff options
author | Chris Dyer <redpony@gmail.com> | 2014-10-19 15:23:31 -0400 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2014-10-19 15:23:31 -0400 |
commit | fff4dc4a763c6f7fbda61b958ab45641c638d93f (patch) | |
tree | 72b36757d96681b8de16a6a5c6f51ae744cbea7c /decoder | |
parent | 2bb5f3f4c3c347a2474392993c17cc62653dd133 (diff) |
remove json hypergraph format
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/JSON_parser.c | 1012 | ||||
-rw-r--r-- | decoder/JSON_parser.h | 152 | ||||
-rw-r--r-- | decoder/Makefile.am | 6 | ||||
-rw-r--r-- | decoder/aligner.h | 2 | ||||
-rw-r--r-- | decoder/fst_translator.cc | 10 | ||||
-rw-r--r-- | decoder/hg.h | 10 | ||||
-rw-r--r-- | decoder/hg_io.cc | 258 | ||||
-rw-r--r-- | decoder/hg_io.h | 9 | ||||
-rw-r--r-- | decoder/hg_test.cc | 11 | ||||
-rw-r--r-- | decoder/hg_test.h | 32 | ||||
-rw-r--r-- | decoder/json_parse.cc | 50 | ||||
-rw-r--r-- | decoder/json_parse.h | 58 | ||||
-rw-r--r-- | decoder/rescore_translator.cc | 23 | ||||
-rw-r--r-- | decoder/test_data/perro.json.gz | bin | 608 -> 0 bytes | |||
-rw-r--r-- | decoder/test_data/small.json.gz | bin | 1733 -> 0 bytes | |||
-rw-r--r-- | decoder/test_data/urdu.json.gz | bin | 253497 -> 0 bytes | |||
-rw-r--r-- | decoder/trule.h | 4 |
17 files changed, 45 insertions, 1592 deletions
diff --git a/decoder/JSON_parser.c b/decoder/JSON_parser.c deleted file mode 100644 index 5e392bc6..00000000 --- a/decoder/JSON_parser.c +++ /dev/null @@ -1,1012 +0,0 @@ -/* JSON_parser.c */ - -/* 2007-08-24 */ - -/* -Copyright (c) 2005 JSON.org - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -The Software shall be used for Good, not Evil. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* - Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009. - - For the added features the license above applies also. - - Changelog: - 2009-05-17 - Incorporated benrudiak@googlemail.com fix for UTF16 decoding. - - 2009-05-14 - Fixed float parsing bug related to a locale being set that didn't - use '.' as decimal point character (charles@transmissionbt.com). - - 2008-10-14 - Renamed states.IN to states.IT to avoid name clash which IN macro - defined in windef.h (alexey.pelykh@gmail.com) - - 2008-07-19 - Removed some duplicate code & debugging variable (charles@transmissionbt.com) - - 2008-05-28 - Made JSON_value structure ansi C compliant. This bug was report by - trisk@acm.jhu.edu - - 2008-05-20 - Fixed bug reported by charles@transmissionbt.com where the switching - from static to dynamic parse buffer did not copy the static parse - buffer's content. -*/ - - - -#include <assert.h> -#include <ctype.h> -#include <float.h> -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <locale.h> - -#include "JSON_parser.h" - -#ifdef _MSC_VER -# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */ -# pragma warning(disable:4996) // unsecure sscanf -# endif -#endif - - -#define true 1 -#define false 0 -#define __ -1 /* the universal error code */ - -/* values chosen so that the object size is approx equal to one page (4K) */ -#ifndef JSON_PARSER_STACK_SIZE -# define JSON_PARSER_STACK_SIZE 128 -#endif - -#ifndef JSON_PARSER_PARSE_BUFFER_SIZE -# define JSON_PARSER_PARSE_BUFFER_SIZE 3500 -#endif - -typedef unsigned short UTF16; - -struct JSON_parser_struct { - JSON_parser_callback callback; - void* ctx; - signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; - UTF16 utf16_high_surrogate; - long depth; - long top; - signed char* stack; - long stack_capacity; - char decimal_point; - char* parse_buffer; - size_t parse_buffer_capacity; - size_t parse_buffer_count; - size_t comment_begin_offset; - signed char static_stack[JSON_PARSER_STACK_SIZE]; - char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; -}; - -#define COUNTOF(x) (sizeof(x)/sizeof(x[0])) - -/* - Characters are mapped into these character classes. This allows for - a significant reduction in the size of the state transition table. -*/ - - - -enum classes { - C_SPACE, /* space */ - C_WHITE, /* other whitespace */ - C_LCURB, /* { */ - C_RCURB, /* } */ - C_LSQRB, /* [ */ - C_RSQRB, /* ] */ - C_COLON, /* : */ - C_COMMA, /* , */ - C_QUOTE, /* " */ - C_BACKS, /* \ */ - C_SLASH, /* / */ - C_PLUS, /* + */ - C_MINUS, /* - */ - C_POINT, /* . */ - C_ZERO , /* 0 */ - C_DIGIT, /* 123456789 */ - C_LOW_A, /* a */ - C_LOW_B, /* b */ - C_LOW_C, /* c */ - C_LOW_D, /* d */ - C_LOW_E, /* e */ - C_LOW_F, /* f */ - C_LOW_L, /* l */ - C_LOW_N, /* n */ - C_LOW_R, /* r */ - C_LOW_S, /* s */ - C_LOW_T, /* t */ - C_LOW_U, /* u */ - C_ABCDF, /* ABCDF */ - C_E, /* E */ - C_ETC, /* everything else */ - C_STAR, /* * */ - NR_CLASSES -}; - -static int ascii_class[128] = { -/* - This array maps the 128 ASCII characters into character classes. - The remaining Unicode characters should be mapped to C_ETC. - Non-whitespace control characters are errors. -*/ - __, __, __, __, __, __, __, __, - __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __, - __, __, __, __, __, __, __, __, - __, __, __, __, __, __, __, __, - - C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, - C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, - C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - - C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, - C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, - - C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, - C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, - C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, - C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC -}; - - -/* - The state codes. -*/ -enum states { - GO, /* start */ - OK, /* ok */ - OB, /* object */ - KE, /* key */ - CO, /* colon */ - VA, /* value */ - AR, /* array */ - ST, /* string */ - ES, /* escape */ - U1, /* u1 */ - U2, /* u2 */ - U3, /* u3 */ - U4, /* u4 */ - MI, /* minus */ - ZE, /* zero */ - IT, /* integer */ - FR, /* fraction */ - E1, /* e */ - E2, /* ex */ - E3, /* exp */ - T1, /* tr */ - T2, /* tru */ - T3, /* true */ - F1, /* fa */ - F2, /* fal */ - F3, /* fals */ - F4, /* false */ - N1, /* nu */ - N2, /* nul */ - N3, /* null */ - C1, /* / */ - C2, /* / * */ - C3, /* * */ - FX, /* *.* *eE* */ - D1, /* second UTF-16 character decoding started by \ */ - D2, /* second UTF-16 character proceeded by u */ - NR_STATES -}; - -enum actions -{ - CB = -10, /* comment begin */ - CE = -11, /* comment end */ - FA = -12, /* false */ - TR = -13, /* false */ - NU = -14, /* null */ - DE = -15, /* double detected by exponent e E */ - DF = -16, /* double detected by fraction . */ - SB = -17, /* string begin */ - MX = -18, /* integer detected by minus */ - ZX = -19, /* integer detected by zero */ - IX = -20, /* integer detected by 1-9 */ - EX = -21, /* next char is escaped */ - UC = -22 /* Unicode character read */ -}; - - -static int state_transition_table[NR_STATES][NR_CLASSES] = { -/* - The state transition table takes the current state and the current symbol, - and returns either a new state or an action. An action is represented as a - negative number. A JSON text is accepted if at the end of the text the - state is OK and if the mode is MODE_DONE. - - white 1-9 ABCDF etc - space | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | * */ -/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, -/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, -/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST}, -/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__}, -/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__}, -/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__}, -/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__}, -/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__}, -/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__}, -/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, -/*e E1*/ {__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__}, -/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__}, -/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, -/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__}, -/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__}, -/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, -/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__}, -/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__}, -/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__}, -/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2}, -/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, -/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, -/*_. FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, -/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__}, -}; - - -/* - These modes can be pushed on the stack. -*/ -enum modes { - MODE_ARRAY = 1, - MODE_DONE = 2, - MODE_KEY = 3, - MODE_OBJECT = 4 -}; - -static int -push(JSON_parser jc, int mode) -{ -/* - Push a mode onto the stack. Return false if there is overflow. -*/ - jc->top += 1; - if (jc->depth < 0) { - if (jc->top >= jc->stack_capacity) { - size_t bytes_to_allocate; - jc->stack_capacity *= 2; - bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]); - if (jc->stack == &jc->static_stack[0]) { - jc->stack = (signed char*)malloc(bytes_to_allocate); - memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack)); - } else { - jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate); - } - } - } else { - if (jc->top >= jc->depth) { - return false; - } - } - - jc->stack[jc->top] = mode; - return true; -} - - -static int -pop(JSON_parser jc, int mode) -{ -/* - Pop the stack, assuring that the current mode matches the expectation. - Return false if there is underflow or if the modes mismatch. -*/ - if (jc->top < 0 || jc->stack[jc->top] != mode) { - return false; - } - jc->top -= 1; - return true; -} - - -#define parse_buffer_clear(jc) \ - do {\ - jc->parse_buffer_count = 0;\ - jc->parse_buffer[0] = 0;\ - } while (0) - -#define parse_buffer_pop_back_char(jc)\ - do {\ - assert(jc->parse_buffer_count >= 1);\ - --jc->parse_buffer_count;\ - jc->parse_buffer[jc->parse_buffer_count] = 0;\ - } while (0) - -void delete_JSON_parser(JSON_parser jc) -{ - if (jc) { - if (jc->stack != &jc->static_stack[0]) { - free((void*)jc->stack); - } - if (jc->parse_buffer != &jc->static_parse_buffer[0]) { - free((void*)jc->parse_buffer); - } - free((void*)jc); - } -} - - -JSON_parser -new_JSON_parser(JSON_config* config) -{ -/* - new_JSON_parser starts the checking process by constructing a JSON_parser - object. It takes a depth parameter that restricts the level of maximum - nesting. - - To continue the process, call JSON_parser_char for each character in the - JSON text, and then call JSON_parser_done to obtain the final result. - These functions are fully reentrant. -*/ - - int depth = 0; - JSON_config default_config; - - JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct)); - - memset(jc, 0, sizeof(*jc)); - - - /* initialize configuration */ - init_JSON_config(&default_config); - - /* set to default configuration if none was provided */ - if (config == NULL) { - config = &default_config; - } - - depth = config->depth; - - /* We need to be able to push at least one object */ - if (depth == 0) { - depth = 1; - } - - jc->state = GO; - jc->top = -1; - - /* Do we want non-bound stack? */ - if (depth > 0) { - jc->stack_capacity = depth; - jc->depth = depth; - if (depth <= (int)COUNTOF(jc->static_stack)) { - jc->stack = &jc->static_stack[0]; - } else { - jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0])); - } - } else { - jc->stack_capacity = COUNTOF(jc->static_stack); - jc->depth = -1; - jc->stack = &jc->static_stack[0]; - } - - /* set parser to start */ - push(jc, MODE_DONE); - - /* set up the parse buffer */ - jc->parse_buffer = &jc->static_parse_buffer[0]; - jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer); - parse_buffer_clear(jc); - - /* set up callback, comment & float handling */ - jc->callback = config->callback; - jc->ctx = config->callback_ctx; - jc->allow_comments = config->allow_comments != 0; - jc->handle_floats_manually = config->handle_floats_manually != 0; - - /* set up decimal point */ - jc->decimal_point = *localeconv()->decimal_point; - - return jc; -} - -static void grow_parse_buffer(JSON_parser jc) -{ - size_t bytes_to_allocate; - jc->parse_buffer_capacity *= 2; - bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]); - if (jc->parse_buffer == &jc->static_parse_buffer[0]) { - jc->parse_buffer = (char*)malloc(bytes_to_allocate); - memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count); - } else { - jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate); - } -} - -#define parse_buffer_push_back_char(jc, c)\ - do {\ - if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\ - jc->parse_buffer[jc->parse_buffer_count++] = c;\ - jc->parse_buffer[jc->parse_buffer_count] = 0;\ - } while (0) - -#define assert_is_non_container_type(jc) \ - assert( \ - jc->type == JSON_T_NULL || \ - jc->type == JSON_T_FALSE || \ - jc->type == JSON_T_TRUE || \ - jc->type == JSON_T_FLOAT || \ - jc->type == JSON_T_INTEGER || \ - jc->type == JSON_T_STRING) - - -static int parse_parse_buffer(JSON_parser jc) -{ - if (jc->callback) { - JSON_value value, *arg = NULL; - - if (jc->type != JSON_T_NONE) { - assert_is_non_container_type(jc); - - switch(jc->type) { - case JSON_T_FLOAT: - arg = &value; - if (jc->handle_floats_manually) { - value.vu.str.value = jc->parse_buffer; - value.vu.str.length = jc->parse_buffer_count; - } else { - /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/ - - /* not checking with end pointer b/c there may be trailing ws */ - value.vu.float_value = strtod(jc->parse_buffer, NULL); - } - break; - case JSON_T_INTEGER: - arg = &value; - sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); - break; - case JSON_T_STRING: - arg = &value; - value.vu.str.value = jc->parse_buffer; - value.vu.str.length = jc->parse_buffer_count; - break; - } - - if (!(*jc->callback)(jc->ctx, jc->type, arg)) { - return false; - } - } - } - - parse_buffer_clear(jc); - - return true; -} - -#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) -#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00) -#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) -static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; - -static int decode_unicode_char(JSON_parser jc) -{ - int i; - unsigned uc = 0; - char* p; - int trail_bytes; - - assert(jc->parse_buffer_count >= 6); - - p = &jc->parse_buffer[jc->parse_buffer_count - 4]; - - for (i = 12; i >= 0; i -= 4, ++p) { - unsigned x = *p; - - if (x >= 'a') { - x -= ('a' - 10); - } else if (x >= 'A') { - x -= ('A' - 10); - } else { - x &= ~0x30u; - } - - assert(x < 16); - - uc |= x << i; - } - - /* clear UTF-16 char from buffer */ - jc->parse_buffer_count -= 6; - jc->parse_buffer[jc->parse_buffer_count] = 0; - - /* attempt decoding ... */ - if (jc->utf16_high_surrogate) { - if (IS_LOW_SURROGATE(uc)) { - uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc); - trail_bytes = 3; - jc->utf16_high_surrogate = 0; - } else { - /* high surrogate without a following low surrogate */ - return false; - } - } else { - if (uc < 0x80) { - trail_bytes = 0; - } else if (uc < 0x800) { - trail_bytes = 1; - } else if (IS_HIGH_SURROGATE(uc)) { - /* save the high surrogate and wait for the low surrogate */ - jc->utf16_high_surrogate = uc; - return true; - } else if (IS_LOW_SURROGATE(uc)) { - /* low surrogate without a preceding high surrogate */ - return false; - } else { - trail_bytes = 2; - } - } - - jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); - - for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { - jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80); - } - - jc->parse_buffer[jc->parse_buffer_count] = 0; - - return true; -} - -static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char) -{ - jc->escaped = 0; - /* remove the backslash */ - parse_buffer_pop_back_char(jc); - switch(next_char) { - case 'b': - parse_buffer_push_back_char(jc, '\b'); - break; - case 'f': - parse_buffer_push_back_char(jc, '\f'); - break; - case 'n': - parse_buffer_push_back_char(jc, '\n'); - break; - case 'r': - parse_buffer_push_back_char(jc, '\r'); - break; - case 't': - parse_buffer_push_back_char(jc, '\t'); - break; - case '"': - parse_buffer_push_back_char(jc, '"'); - break; - case '\\': - parse_buffer_push_back_char(jc, '\\'); - break; - case '/': - parse_buffer_push_back_char(jc, '/'); - break; - case 'u': - parse_buffer_push_back_char(jc, '\\'); - parse_buffer_push_back_char(jc, 'u'); - break; - default: - return false; - } - - return true; -} - -#define add_char_to_parse_buffer(jc, next_char, next_class) \ - do { \ - if (jc->escaped) { \ - if (!add_escaped_char_to_parse_buffer(jc, next_char)) \ - return false; \ - } else if (!jc->comment) { \ - if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \ - parse_buffer_push_back_char(jc, (char)next_char); \ - } \ - } \ - } while (0) - - -#define assert_type_isnt_string_null_or_bool(jc) \ - assert(jc->type != JSON_T_FALSE); \ - assert(jc->type != JSON_T_TRUE); \ - assert(jc->type != JSON_T_NULL); \ - assert(jc->type != JSON_T_STRING) - - -int -JSON_parser_char(JSON_parser jc, int next_char) -{ -/* - After calling new_JSON_parser, call this function for each character (or - partial character) in your JSON text. It can accept UTF-8, UTF-16, or - UTF-32. It returns true if things are looking ok so far. If it rejects the - text, it returns false. -*/ - int next_class, next_state; - -/* - Determine the character's class. -*/ - if (next_char < 0) { - return false; - } - if (next_char >= 128) { - next_class = C_ETC; - } else { - next_class = ascii_class[next_char]; - if (next_class <= __) { - return false; - } - } - - add_char_to_parse_buffer(jc, next_char, next_class); - -/* - Get the next state from the state transition table. -*/ - next_state = state_transition_table[jc->state][next_class]; - if (next_state >= 0) { -/* - Change the state. -*/ - jc->state = next_state; - } else { -/* - Or perform one of the actions. -*/ - switch (next_state) { -/* Unicode character */ - case UC: - if(!decode_unicode_char(jc)) { - return false; - } - /* check if we need to read a second UTF-16 char */ - if (jc->utf16_high_surrogate) { - jc->state = D1; - } else { - jc->state = ST; - } - break; -/* escaped char */ - case EX: - jc->escaped = 1; - jc->state = ES; - break; -/* integer detected by minus */ - case MX: - jc->type = JSON_T_INTEGER; - jc->state = MI; - break; -/* integer detected by zero */ - case ZX: - jc->type = JSON_T_INTEGER; - jc->state = ZE; - break; -/* integer detected by 1-9 */ - case IX: - jc->type = JSON_T_INTEGER; - jc->state = IT; - break; - -/* floating point number detected by exponent*/ - case DE: - assert_type_isnt_string_null_or_bool(jc); - jc->type = JSON_T_FLOAT; - jc->state = E1; - break; - -/* floating point number detected by fraction */ - case DF: - assert_type_isnt_string_null_or_bool(jc); - if (!jc->handle_floats_manually) { -/* - Some versions of strtod (which underlies sscanf) don't support converting - C-locale formated floating point values. -*/ - assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.'); - jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point; - } - jc->type = JSON_T_FLOAT; - jc->state = FX; - break; -/* string begin " */ - case SB: - parse_buffer_clear(jc); - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_STRING; - jc->state = ST; - break; - -/* n */ - case NU: - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_NULL; - jc->state = N1; - break; -/* f */ - case FA: - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_FALSE; - jc->state = F1; - break; -/* t */ - case TR: - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_TRUE; - jc->state = T1; - break; - -/* closing comment */ - case CE: - jc->comment = 0; - assert(jc->parse_buffer_count == 0); - assert(jc->type == JSON_T_NONE); - jc->state = jc->before_comment_state; - break; - -/* opening comment */ - case CB: - if (!jc->allow_comments) { - return false; - } - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - assert(jc->parse_buffer_count == 0); - assert(jc->type != JSON_T_STRING); - switch (jc->stack[jc->top]) { - case MODE_ARRAY: - case MODE_OBJECT: - switch(jc->state) { - case VA: - case AR: - jc->before_comment_state = jc->state; - break; - default: - jc->before_comment_state = OK; - break; - } - break; - default: - jc->before_comment_state = jc->state; - break; - } - jc->type = JSON_T_NONE; - jc->state = C1; - jc->comment = 1; - break; -/* empty } */ - case -9: - parse_buffer_clear(jc); - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { - return false; - } - if (!pop(jc, MODE_KEY)) { - return false; - } - jc->state = OK; - break; - -/* } */ case -8: - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { - return false; - } - if (!pop(jc, MODE_OBJECT)) { - return false; - } - jc->type = JSON_T_NONE; - jc->state = OK; - break; - -/* ] */ case -7: - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) { - return false; - } - if (!pop(jc, MODE_ARRAY)) { - return false; - } - - jc->type = JSON_T_NONE; - jc->state = OK; - break; - -/* { */ case -6: - parse_buffer_pop_back_char(jc); - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) { - return false; - } - if (!push(jc, MODE_KEY)) { - return false; - } - assert(jc->type == JSON_T_NONE); - jc->state = OB; - break; - -/* [ */ case -5: - parse_buffer_pop_back_char(jc); - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) { - return false; - } - if (!push(jc, MODE_ARRAY)) { - return false; - } - assert(jc->type == JSON_T_NONE); - jc->state = AR; - break; - -/* string end " */ case -4: - parse_buffer_pop_back_char(jc); - switch (jc->stack[jc->top]) { - case MODE_KEY: - assert(jc->type == JSON_T_STRING); - jc->type = JSON_T_NONE; - jc->state = CO; - - if (jc->callback) { - JSON_value value; - value.vu.str.value = jc->parse_buffer; - value.vu.str.length = jc->parse_buffer_count; - if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) { - return false; - } - } - parse_buffer_clear(jc); - break; - case MODE_ARRAY: - case MODE_OBJECT: - assert(jc->type == JSON_T_STRING); - if (!parse_parse_buffer(jc)) { - return false; - } - jc->type = JSON_T_NONE; - jc->state = OK; - break; - default: - return false; - } - break; - -/* , */ case -3: - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - switch (jc->stack[jc->top]) { - case MODE_OBJECT: -/* - A comma causes a flip from object mode to key mode. -*/ - if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { - return false; - } - assert(jc->type != JSON_T_STRING); - jc->type = JSON_T_NONE; - jc->state = KE; - break; - case MODE_ARRAY: - assert(jc->type != JSON_T_STRING); - jc->type = JSON_T_NONE; - jc->state = VA; - break; - default: - return false; - } - break; - -/* : */ case -2: -/* - A colon causes a flip from key mode to object mode. -*/ - parse_buffer_pop_back_char(jc); - if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { - return false; - } - assert(jc->type == JSON_T_NONE); - jc->state = VA; - break; -/* - Bad action. -*/ - default: - return false; - } - } - return true; -} - - -int -JSON_parser_done(JSON_parser jc) -{ - const int result = jc->state == OK && pop(jc, MODE_DONE); - - return result; -} - - -int JSON_parser_is_legal_white_space_string(const char* s) -{ - int c, char_class; - - if (s == NULL) { - return false; - } - - for (; *s; ++s) { - c = *s; - - if (c < 0 || c >= 128) { - return false; - } - - char_class = ascii_class[c]; - - if (char_class != C_SPACE && char_class != C_WHITE) { - return false; - } - } - - return true; -} - - - -void init_JSON_config(JSON_config* config) -{ - if (config) { - memset(config, 0, sizeof(*config)); - - config->depth = JSON_PARSER_STACK_SIZE - 1; - } -} diff --git a/decoder/JSON_parser.h b/decoder/JSON_parser.h deleted file mode 100644 index de980072..00000000 --- a/decoder/JSON_parser.h +++ /dev/null @@ -1,152 +0,0 @@ -#ifndef JSON_PARSER_H -#define JSON_PARSER_H - -/* JSON_parser.h */ - - -#include <stddef.h> - -/* Windows DLL stuff */ -#ifdef _WIN32 -# ifdef JSON_PARSER_DLL_EXPORTS -# define JSON_PARSER_DLL_API __declspec(dllexport) -# else -# define JSON_PARSER_DLL_API __declspec(dllimport) -# endif -#else -# define JSON_PARSER_DLL_API -#endif - -/* Determine the integer type use to parse non-floating point numbers */ -#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1 -typedef long long JSON_int_t; -#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld" -#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld" -#else -typedef long JSON_int_t; -#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld" -#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld" -#endif - - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum -{ - JSON_T_NONE = 0, - JSON_T_ARRAY_BEGIN, // 1 - JSON_T_ARRAY_END, // 2 - JSON_T_OBJECT_BEGIN, // 3 - JSON_T_OBJECT_END, // 4 - JSON_T_INTEGER, // 5 - JSON_T_FLOAT, // 6 - JSON_T_NULL, // 7 - JSON_T_TRUE, // 8 - JSON_T_FALSE, // 9 - JSON_T_STRING, // 10 - JSON_T_KEY, // 11 - JSON_T_MAX // 12 -} JSON_type; - -typedef struct JSON_value_struct { - union { - JSON_int_t integer_value; - - double float_value; - - struct { - const char* value; - size_t length; - } str; - } vu; -} JSON_value; - -typedef struct JSON_parser_struct* JSON_parser; - -/*! \brief JSON parser callback - - \param ctx The pointer passed to new_JSON_parser. - \param type An element of JSON_type but not JSON_T_NONE. - \param value A representation of the parsed value. This parameter is NULL for - JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END, - JSON_T_NULL, JSON_T_TRUE, and SON_T_FALSE. String values are always returned - as zero-terminated C strings. - - \return Non-zero if parsing should continue, else zero. -*/ -typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value); - - -/*! \brief The structure used to configure a JSON parser object - - \param depth If negative, the parser can parse arbitrary levels of JSON, otherwise - the depth is the limit - \param Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity. - \param Callback context. This parameter may be NULL. - \param depth. Specifies the levels of nested JSON to allow. Negative numbers yield unlimited nesting. - \param allowComments. To allow C style comments in JSON, set to non-zero. - \param handleFloatsManually. To decode floating point numbers manually set this parameter to non-zero. - - \return The parser object. -*/ -typedef struct { - JSON_parser_callback callback; - void* callback_ctx; - int depth; - int allow_comments; - int handle_floats_manually; -} JSON_config; - - -/*! \brief Initializes the JSON parser configuration structure to default values. - - The default configuration is - - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see json_parser.c) - - no parsing, just checking for JSON syntax - - no comments - - \param config. Used to configure the parser. -*/ -JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config); - -/*! \brief Create a JSON parser object - - \param config. Used to configure the parser. Set to NULL to use the default configuration. - See init_JSON_config - - \return The parser object. -*/ -JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config); - -/*! \brief Destroy a previously created JSON parser object. */ -JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc); - -/*! \brief Parse a character. - - \return Non-zero, if all characters passed to this function are part of are valid JSON. -*/ -JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char); - -/*! \brief Finalize parsing. - - Call this method once after all input characters have been consumed. - - \return Non-zero, if all parsed characters are valid JSON, zero otherwise. -*/ -JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc); - -/*! \brief Determine if a given string is valid JSON white space - - \return Non-zero if the string is valid, zero otherwise. -*/ -JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s); - - -#ifdef __cplusplus -} -#endif - - -#endif /* JSON_PARSER_H */ diff --git a/decoder/Makefile.am b/decoder/Makefile.am index e46a7120..b56e4c72 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -33,7 +33,6 @@ noinst_LIBRARIES = libcdec.a EXTRA_DIST = test_data rule_lexer.ll libcdec_a_SOURCES = \ - JSON_parser.h \ aligner.h \ apply_models.h \ bottom_up_parser.h \ @@ -80,7 +79,6 @@ libcdec_a_SOURCES = \ hg_union.h \ incremental.h \ inside_outside.h \ - json_parse.h \ kbest.h \ lattice.h \ lexalign.h \ @@ -141,7 +139,6 @@ libcdec_a_SOURCES = \ hg_sampler.cc \ hg_union.cc \ incremental.cc \ - json_parse.cc \ lattice.cc \ lexalign.cc \ lextrans.cc \ @@ -157,5 +154,4 @@ libcdec_a_SOURCES = \ tagger.cc \ translator.cc \ trule.cc \ - viterbi.cc \ - JSON_parser.c + viterbi.cc diff --git a/decoder/aligner.h b/decoder/aligner.h index a34795c9..d68ceefc 100644 --- a/decoder/aligner.h +++ b/decoder/aligner.h @@ -1,4 +1,4 @@ -#ifndef _ALIGNER_H_ +#ifndef ALIGNER_H #include <string> #include <iostream> diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc index 50e6adcc..fe28f4c6 100644 --- a/decoder/fst_translator.cc +++ b/decoder/fst_translator.cc @@ -27,11 +27,15 @@ struct FSTTranslatorImpl { const vector<double>& weights, Hypergraph* forest) { bool composed = false; - if (input.find("{\"rules\"") == 0) { + if (input.find("::forest::") == 0) { istringstream is(input); + string header, fname; + is >> header >> fname; + ReadFile rf(fname); + if (!rf) { cerr << "Failed to open " << fname << endl; abort(); } Hypergraph src_cfg_hg; - if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) { - cerr << "Failed to read HG from JSON.\n"; + if (!HypergraphIO::ReadFromBinary(rf.stream(), &src_cfg_hg)) { + cerr << "Failed to read HG.\n"; abort(); } if (add_pass_through_rules) { diff --git a/decoder/hg.h b/decoder/hg.h index 124eab86..c756012e 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -71,7 +71,7 @@ namespace HG { short int prev_i_; short int prev_j_; template<class Archive> - void serialize(Archive & ar, const unsigned int version) { + void serialize(Archive & ar, const unsigned int /*version*/) { ar & head_node_; ar & tail_nodes_; ar & rule_; @@ -163,7 +163,7 @@ namespace HG { EdgesVector in_edges_; // an in edge is an edge with this node as its head. (in edges come from the bottom up to us) indices in edges_ EdgesVector out_edges_; // an out edge is an edge with this node as its tail. (out edges leave us up toward the top/goal). indices in edges_ template<class Archive> - void save(Archive & ar, const unsigned int version) const { + void save(Archive & ar, const unsigned int /*version*/) const { ar & node_hash; ar & id_; ar & TD::Convert(-cat_); @@ -171,7 +171,7 @@ namespace HG { ar & out_edges_; } template<class Archive> - void load(Archive & ar, const unsigned int version) { + void load(Archive & ar, const unsigned int /*version*/) { ar & node_hash; ar & id_; std::string cat; ar & cat; @@ -524,7 +524,7 @@ public: void check_ids() const; // assert that .id_ have been kept in sync template<class Archive> - void save(Archive & ar, const unsigned int version) const { + void save(Archive & ar, const unsigned int /*version*/) const { unsigned ns = nodes_.size(); ar & ns; unsigned es = edges_.size(); ar & es; for (auto& n : nodes_) ar & n; @@ -534,7 +534,7 @@ public: x = is_linear_chain_; ar & x; } template<class Archive> - void load(Archive & ar, const unsigned int version) { + void load(Archive & ar, const unsigned int /*version*/) { unsigned ns; ar & ns; nodes_.resize(ns); unsigned es; ar & es; edges_.resize(es); for (auto& n : nodes_) ar & n; diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 67760fb1..626b2954 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -13,268 +13,10 @@ #include "fast_lexical_cast.hpp" #include "tdict.h" -#include "json_parse.h" #include "hg.h" using namespace std; -struct HGReader : public JSONParser { - HGReader(Hypergraph* g) : rp("[X] ||| "), state(-1), hg(*g), nodes_needed(true), edges_needed(true) { nodes = 0; edges = 0; } - - void CreateNode(const string& cat, const string& shash, const vector<int>& in_edges) { - WordID c = TD::Convert("X") * -1; - if (!cat.empty()) c = TD::Convert(cat) * -1; - Hypergraph::Node* node = hg.AddNode(c); - char* dend; - if (shash.size()) - node->node_hash = strtoull(shash.c_str(), &dend, 16); - else - node->node_hash = 0; - for (int i = 0; i < in_edges.size(); ++i) { - if (in_edges[i] >= hg.edges_.size()) { - cerr << "JSONParser: in_edges[" << i << "]=" << in_edges[i] - << ", but hg only has " << hg.edges_.size() << " edges!\n"; - abort(); - } - hg.ConnectEdgeToHeadNode(&hg.edges_[in_edges[i]], node); - } - } - void CreateEdge(const TRulePtr& rule, SparseVector<double>* feats, const SmallVectorUnsigned& tail) { - Hypergraph::Edge* edge = hg.AddEdge(rule, tail); - feats->swap(edge->feature_values_); - edge->i_ = spans[0]; - edge->j_ = spans[1]; - edge->prev_i_ = spans[2]; - edge->prev_j_ = spans[3]; - } - - bool HandleJSONEvent(int type, const JSON_value* value) { - switch(state) { - case -1: - assert(type == JSON_T_OBJECT_BEGIN); - state = 0; - break; - case 0: - if (type == JSON_T_OBJECT_END) { - //cerr << "HG created\n"; // TODO, signal some kind of callback - } else if (type == JSON_T_KEY) { - string val = value->vu.str.value; - if (val == "features") { assert(fdict.empty()); state = 1; } - else if (val == "is_sorted") { state = 3; } - else if (val == "rules") { assert(rules.empty()); state = 4; } - else if (val == "node") { state = 8; } - else if (val == "edges") { state = 13; } - else { cerr << "Unexpected key: " << val << endl; return false; } - } - break; - - // features - case 1: - if(type == JSON_T_NULL) { state = 0; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 2; - break; - case 2: - if(type == JSON_T_ARRAY_END) { state = 0; break; } - assert(type == JSON_T_STRING); - fdict.push_back(FD::Convert(value->vu.str.value)); - assert(fdict.back() > 0); - break; - - // is_sorted - case 3: - assert(type == JSON_T_TRUE || type == JSON_T_FALSE); - is_sorted = (type == JSON_T_TRUE); - if (!is_sorted) { cerr << "[WARNING] is_sorted flag is ignored\n"; } - state = 0; - break; - - // rules - case 4: - if(type == JSON_T_NULL) { state = 0; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 5; - break; - case 5: - if(type == JSON_T_ARRAY_END) { state = 0; break; } - assert(type == JSON_T_INTEGER); - state = 6; - rule_id = value->vu.integer_value; - break; - case 6: - assert(type == JSON_T_STRING); - rules[rule_id] = TRulePtr(new TRule(value->vu.str.value)); - state = 5; - break; - - // Nodes - case 8: - assert(type == JSON_T_OBJECT_BEGIN); - ++nodes; - in_edges.clear(); - cat.clear(); - shash.clear(); - state = 9; break; - case 9: - if (type == JSON_T_OBJECT_END) { - //cerr << "Creating NODE\n"; - CreateNode(cat, shash, in_edges); - state = 0; break; - } - assert(type == JSON_T_KEY); - cur_key = value->vu.str.value; - if (cur_key == "cat") { assert(cat.empty()); state = 10; break; } - if (cur_key == "in_edges") { assert(in_edges.empty()); state = 11; break; } - if (cur_key == "node_hash") { assert(shash.empty()); state = 24; break; } - cerr << "Syntax error: unexpected key " << cur_key << " in node specification.\n"; - return false; - case 10: - assert(type == JSON_T_STRING || type == JSON_T_NULL); - cat = value->vu.str.value; - state = 9; break; - case 11: - if (type == JSON_T_NULL) { state = 9; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 12; break; - case 12: - if (type == JSON_T_ARRAY_END) { state = 9; break; } - assert(type == JSON_T_INTEGER); - //cerr << "in_edges: " << value->vu.integer_value << endl; - in_edges.push_back(value->vu.integer_value); - break; - - // "edges": [ { "tail": null, "feats" : [0,1.63,1,-0.54], "rule": 12}, - // { "tail": null, "feats" : [0,0.87,1,0.02], "spans":[1,2,3,4], "rule": 17}, - // { "tail": [0], "feats" : [1,2.3,2,15.3,"ExtraFeature",1.2], "rule": 13}] - case 13: - assert(type == JSON_T_ARRAY_BEGIN); - state = 14; - break; - case 14: - if (type == JSON_T_ARRAY_END) { state = 0; break; } - assert(type == JSON_T_OBJECT_BEGIN); - //cerr << "New edge\n"; - ++edges; - cur_rule.reset(); feats.clear(); tail.clear(); - state = 15; break; - case 15: - if (type == JSON_T_OBJECT_END) { - CreateEdge(cur_rule, &feats, tail); - state = 14; break; - } - assert(type == JSON_T_KEY); - cur_key = value->vu.str.value; - //cerr << "edge key " << cur_key << endl; - if (cur_key == "rule") { assert(!cur_rule); state = 16; break; } - if (cur_key == "spans") { assert(!cur_rule); state = 22; break; } - if (cur_key == "feats") { assert(feats.empty()); state = 17; break; } - if (cur_key == "tail") { assert(tail.empty()); state = 20; break; } - cerr << "Unexpected key " << cur_key << " in edge specification\n"; - return false; - case 16: // edge.rule - if (type == JSON_T_INTEGER) { - int rule_id = value->vu.integer_value; - if (rules.find(rule_id) == rules.end()) { - // rules list must come before the edge definitions! - cerr << "Rule_id " << rule_id << " given but only loaded " << rules.size() << " rules\n"; - return false; - } - cur_rule = rules[rule_id]; - } else if (type == JSON_T_STRING) { - cur_rule.reset(new TRule(value->vu.str.value)); - } else { - cerr << "Rule must be either a rule id or a rule string" << endl; - return false; - } - // cerr << "Edge: rule=" << cur_rule->AsString() << endl; - state = 15; - break; - case 17: // edge.feats - if (type == JSON_T_NULL) { state = 15; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 18; break; - case 18: - if (type == JSON_T_ARRAY_END) { state = 15; break; } - if (type != JSON_T_INTEGER && type != JSON_T_STRING) { - cerr << "Unexpected feature id type\n"; return false; - } - if (type == JSON_T_INTEGER) { - fid = value->vu.integer_value; - assert(fid < fdict.size()); - fid = fdict[fid]; - } else if (JSON_T_STRING) { - fid = FD::Convert(value->vu.str.value); - } else { abort(); } - state = 19; - break; - case 19: - { - assert(type == JSON_T_INTEGER || type == JSON_T_FLOAT); - double val = (type == JSON_T_INTEGER ? static_cast<double>(value->vu.integer_value) : - strtod(value->vu.str.value, NULL)); - feats.set_value(fid, val); - state = 18; - break; - } - case 20: // edge.tail - if (type == JSON_T_NULL) { state = 15; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 21; break; - case 21: - if (type == JSON_T_ARRAY_END) { state = 15; break; } - assert(type == JSON_T_INTEGER); - tail.push_back(value->vu.integer_value); - break; - case 22: // edge.spans - assert(type == JSON_T_ARRAY_BEGIN); - state = 23; - spans[0] = spans[1] = spans[2] = spans[3] = -1; - spanc = 0; - break; - case 23: - if (type == JSON_T_ARRAY_END) { state = 15; break; } - assert(type == JSON_T_INTEGER); - assert(spanc < 4); - spans[spanc] = value->vu.integer_value; - ++spanc; - break; - case 24: // read node hash - assert(type == JSON_T_STRING); - shash = value->vu.str.value; - state = 9; - break; - } - return true; - } - string rp; - string cat; - SmallVectorUnsigned tail; - vector<int> in_edges; - string shash; - TRulePtr cur_rule; - map<int, TRulePtr> rules; - vector<int> fdict; - SparseVector<double> feats; - int state; - int fid; - int nodes; - int edges; - int spans[4]; - int spanc; - string cur_key; - Hypergraph& hg; - int rule_id; - bool nodes_needed; - bool edges_needed; - bool is_sorted; -}; - -bool HypergraphIO::ReadFromJSON(istream* in, Hypergraph* hg) { - hg->clear(); - HGReader reader(hg); - return reader.Parse(in); -} - bool HypergraphIO::ReadFromBinary(istream* in, Hypergraph* hg) { boost::archive::binary_iarchive oa(*in); hg->clear(); diff --git a/decoder/hg_io.h b/decoder/hg_io.h index 5ba86f69..93a9e280 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -9,15 +9,6 @@ class Hypergraph; struct HypergraphIO { - // the format is basically a list of nodes and edges in topological order - // any edge you read, you must have already read its tail nodes - // any node you read, you must have already read its incoming edges - // this may make writing a bit more challenging if your forest is not - // topologically sorted (but that probably doesn't happen very often), - // but it makes reading much more memory efficient. - // see test_data/small.json.gz for an email encoding - static bool ReadFromJSON(std::istream* in, Hypergraph* out); - static bool ReadFromBinary(std::istream* in, Hypergraph* out); static bool WriteToBinary(const Hypergraph& hg, std::ostream* out); diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc index 25eddcec..366b269d 100644 --- a/decoder/hg_test.cc +++ b/decoder/hg_test.cc @@ -9,7 +9,6 @@ #include <iostream> #include "tdict.h" -#include "json_parse.h" #include "hg_intersect.h" #include "hg_union.h" #include "viterbi.h" @@ -399,16 +398,6 @@ BOOST_AUTO_TEST_CASE(Small) { BOOST_CHECK_CLOSE(2.1431036, log(c2), 1e-4); } -BOOST_AUTO_TEST_CASE(JSONTest) { - std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); - ostringstream os; - JSONParser::WriteEscapedString("\"I don't know\", she said.", &os); - BOOST_CHECK_EQUAL("\"\\\"I don't know\\\", she said.\"", os.str()); - ostringstream os2; - JSONParser::WriteEscapedString("yes", &os2); - BOOST_CHECK_EQUAL("\"yes\"", os2.str()); -} - BOOST_AUTO_TEST_CASE(TestGenericKBest) { std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); Hypergraph hg; diff --git a/decoder/hg_test.h b/decoder/hg_test.h index b7bab3c2..70c2c97d 100644 --- a/decoder/hg_test.h +++ b/decoder/hg_test.h @@ -12,11 +12,11 @@ namespace { typedef char const* Name; -Name urdu_json="urdu.json.gz"; +Name urdu_json="urdu.bin.gz"; Name urdu_wts="Arity_0 1.70741473606976 Arity_1 1.12426238048012 Arity_2 1.14986187839554 Glue -0.04589037041388 LanguageModel 1.09051 PassThrough -3.66226367902928 PhraseModel_0 -1.94633451863252 PhraseModel_1 -0.1475347695476 PhraseModel_2 -1.614818994946 WordPenalty -3.0 WordPenaltyFsa -0.56028442964748 ShorterThanPrev -10 LongerThanPrev -10"; -Name small_json="small.json.gz"; +Name small_json="small.bin.gz"; Name small_wts="Model_0 -2 Model_1 -.5 Model_2 -1.1 Model_3 -1 Model_4 -1 Model_5 .5 Model_6 .2 Model_7 -.3"; -Name perro_json="perro.json.gz"; +Name perro_json="perro.bin.gz"; Name perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1"; } @@ -32,7 +32,7 @@ struct HGSetup { static void JsonFile(Hypergraph *hg,std::string f) { ReadFile rf(f); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + HypergraphIO::ReadFromBinary(rf.stream(), hg); } static void JsonTestFile(Hypergraph *hg,std::string path,std::string n) { JsonFile(hg,path + "/"+n); @@ -48,35 +48,35 @@ void AddNullEdge(Hypergraph* hg) { } void HGSetup::CreateTinyLatticeHG(const std::string& path,Hypergraph* hg) { - ReadFile rf(path + "/hg_test.tiny_lattice"); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + ReadFile rf(path + "/hg_test.tiny_lattice.bin.gz"); + HypergraphIO::ReadFromBinary(rf.stream(), hg); AddNullEdge(hg); } void HGSetup::CreateLatticeHG(const std::string& path,Hypergraph* hg) { - ReadFile rf(path + "/hg_test.lattice"); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + ReadFile rf(path + "/hg_test.lattice.bin.gz"); + HypergraphIO::ReadFromBinary(rf.stream(), hg); AddNullEdge(hg); } void HGSetup::CreateHG_tiny(const std::string& path, Hypergraph* hg) { - ReadFile rf(path + "/hg_test.tiny"); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + ReadFile rf(path + "/hg_test.tiny.bin.gz"); + HypergraphIO::ReadFromBinary(rf.stream(), hg); } void HGSetup::CreateHG_int(const std::string& path,Hypergraph* hg) { - ReadFile rf(path + "/hg_test.hg_int"); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + ReadFile rf(path + "/hg_test.hg_int.bin.gz"); + HypergraphIO::ReadFromBinary(rf.stream(), hg); } void HGSetup::CreateHG(const std::string& path,Hypergraph* hg) { - ReadFile rf(path + "/hg_test.hg"); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + ReadFile rf(path + "/hg_test.hg.bin.gz"); + HypergraphIO::ReadFromBinary(rf.stream(), hg); } void HGSetup::CreateHGBalanced(const std::string& path,Hypergraph* hg) { - ReadFile rf(path + "/hg_test.hg_balanced"); - HypergraphIO::ReadFromJSON(rf.stream(), hg); + ReadFile rf(path + "/hg_test.hg_balanced.bin.gz"); + HypergraphIO::ReadFromBinary(rf.stream(), hg); } #endif diff --git a/decoder/json_parse.cc b/decoder/json_parse.cc deleted file mode 100644 index f6fdfea8..00000000 --- a/decoder/json_parse.cc +++ /dev/null @@ -1,50 +0,0 @@ -#include "json_parse.h" - -#include <string> -#include <iostream> - -using namespace std; - -static const char *json_hex_chars = "0123456789abcdef"; - -void JSONParser::WriteEscapedString(const string& in, ostream* out) { - int pos = 0; - int start_offset = 0; - unsigned char c = 0; - (*out) << '"'; - while(pos < in.size()) { - c = in[pos]; - switch(c) { - case '\b': - case '\n': - case '\r': - case '\t': - case '"': - case '\\': - case '/': - if(pos - start_offset > 0) - (*out) << in.substr(start_offset, pos - start_offset); - if(c == '\b') (*out) << "\\b"; - else if(c == '\n') (*out) << "\\n"; - else if(c == '\r') (*out) << "\\r"; - else if(c == '\t') (*out) << "\\t"; - else if(c == '"') (*out) << "\\\""; - else if(c == '\\') (*out) << "\\\\"; - else if(c == '/') (*out) << "\\/"; - start_offset = ++pos; - break; - default: - if(c < ' ') { - cerr << "Warning, bad character (" << static_cast<int>(c) << ") in string\n"; - if(pos - start_offset > 0) - (*out) << in.substr(start_offset, pos - start_offset); - (*out) << "\\u00" << json_hex_chars[c >> 4] << json_hex_chars[c & 0xf]; - start_offset = ++pos; - } else pos++; - } - } - if(pos - start_offset > 0) - (*out) << in.substr(start_offset, pos - start_offset); - (*out) << '"'; -} - diff --git a/decoder/json_parse.h b/decoder/json_parse.h deleted file mode 100644 index 85e2eff1..00000000 --- a/decoder/json_parse.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef JSON_WRAPPER_H_ -#define JSON_WRAPPER_H_ - -#include <iostream> -#include <cassert> -#include "JSON_parser.h" - -class JSONParser { - public: - JSONParser() { - init_JSON_config(&config); - hack.mf = &JSONParser::Callback; - config.depth = 10; - config.callback_ctx = reinterpret_cast<void*>(this); - config.callback = hack.cb; - config.allow_comments = 1; - config.handle_floats_manually = 1; - jc = new_JSON_parser(&config); - } - virtual ~JSONParser() { - delete_JSON_parser(jc); - } - bool Parse(std::istream* in) { - int count = 0; - int lc = 1; - for (; in ; ++count) { - int next_char = in->get(); - if (!in->good()) break; - if (lc == '\n') { ++lc; } - if (!JSON_parser_char(jc, next_char)) { - std::cerr << "JSON_parser_char: syntax error, line " << lc << " (byte " << count << ")" << std::endl; - return false; - } - } - if (!JSON_parser_done(jc)) { - std::cerr << "JSON_parser_done: syntax error\n"; - return false; - } - return true; - } - static void WriteEscapedString(const std::string& in, std::ostream* out); - protected: - virtual bool HandleJSONEvent(int type, const JSON_value* value) = 0; - private: - int Callback(int type, const JSON_value* value) { - if (HandleJSONEvent(type, value)) return 1; - return 0; - } - JSON_parser_struct* jc; - JSON_config config; - typedef int (JSONParser::* MF)(int type, const struct JSON_value_struct* value); - union CBHack { - JSON_parser_callback cb; - MF mf; - } hack; -}; - -#endif diff --git a/decoder/rescore_translator.cc b/decoder/rescore_translator.cc index 18c83c56..2c5fa9c4 100644 --- a/decoder/rescore_translator.cc +++ b/decoder/rescore_translator.cc @@ -3,6 +3,7 @@ #include <sstream> #include <boost/shared_ptr.hpp> +#include "filelib.h" #include "sentence_metadata.h" #include "hg.h" #include "hg_io.h" @@ -20,16 +21,18 @@ struct RescoreTranslatorImpl { bool Translate(const string& input, const vector<double>& weights, Hypergraph* forest) { - if (input == "{}") return false; - if (input.find("{\"rules\"") == 0) { - istringstream is(input); - Hypergraph src_cfg_hg; - if (!HypergraphIO::ReadFromJSON(&is, forest)) { - cerr << "Parse error while reading HG from JSON.\n"; - abort(); - } - } else { - cerr << "Can only read HG input from JSON: use training/grammar_convert\n"; + istringstream is(input); + string header, fname; + is >> header >> fname; + if (header != "::forest::") { + cerr << "RescoreTranslator: expected input lines of form ::forest:: filename.gz\n"; + abort(); + } + ReadFile rf(fname); + if (!rf) { cerr << "Can't read " << fname << endl; abort(); } + Hypergraph src_cfg_hg; + if (!HypergraphIO::ReadFromBinary(rf.stream(), forest)) { + cerr << "Parse error while reading HG.\n"; abort(); } Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); diff --git a/decoder/test_data/perro.json.gz b/decoder/test_data/perro.json.gz Binary files differdeleted file mode 100644 index 41de5758..00000000 --- a/decoder/test_data/perro.json.gz +++ /dev/null diff --git a/decoder/test_data/small.json.gz b/decoder/test_data/small.json.gz Binary files differdeleted file mode 100644 index f6f37293..00000000 --- a/decoder/test_data/small.json.gz +++ /dev/null diff --git a/decoder/test_data/urdu.json.gz b/decoder/test_data/urdu.json.gz Binary files differdeleted file mode 100644 index 84535402..00000000 --- a/decoder/test_data/urdu.json.gz +++ /dev/null diff --git a/decoder/trule.h b/decoder/trule.h index 85842bb5..7af46747 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -167,7 +167,7 @@ class TRule { friend class boost::serialization::access; template<class Archive> - void save(Archive & ar, const unsigned int version) const { + void save(Archive & ar, const unsigned int /*version*/) const { ar & TD::Convert(-lhs_); unsigned f_size = f_.size(); ar & f_size; @@ -195,7 +195,7 @@ class TRule { ar & scores_; } template<class Archive> - void load(Archive & ar, const unsigned int version) { + void load(Archive & ar, const unsigned int /*version*/) { std::string lhs; ar & lhs; lhs_ = -TD::Convert(lhs); unsigned f_size; ar & f_size; f_.resize(f_size); |