Diffstat (limited to 'decoder')
110 files changed, 17207 insertions, 0 deletions
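[Editor's note] The bulk of this change vendors the JSON.org event-driven (SAX-style) JSON parser into decoder/JSON_parser.c and decoder/JSON_parser.h. As an orientation aid, here is a minimal usage sketch of that callback API, based only on the declarations visible in JSON_parser.h below; the callback body and the input string are illustrative and not part of the commit:

#include <cstdio>
#include "JSON_parser.h"

// Hypothetical callback: print a few event types; return non-zero to continue parsing.
static int print_event(void* ctx, int type, const JSON_value* value) {
  (void)ctx;
  switch (type) {
    case JSON_T_KEY:     std::printf("key: %s\n", value->vu.str.value); break;
    case JSON_T_STRING:  std::printf("str: %s\n", value->vu.str.value); break;
    case JSON_T_INTEGER: std::printf("int: " JSON_PARSER_INTEGER_SPRINTF_TOKEN "\n", value->vu.integer_value); break;
    default: break;  // container begin/end, null, true, false, float
  }
  return 1;
}

int main() {
  const char* text = "{\"weights\": [1, 2, 3]}";  // illustrative input
  JSON_config config;
  init_JSON_config(&config);     // defaults: syntax checking only, no comments
  config.callback = print_event;
  JSON_parser jc = new_JSON_parser(&config);
  int ok = 1;
  for (const char* p = text; ok && *p; ++p)
    ok = JSON_parser_char(jc, (unsigned char)*p);  // feed one character at a time
  ok = ok && JSON_parser_done(jc);                 // finalize once at end of input
  delete_JSON_parser(jc);
  return ok ? 0 : 1;
}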
diff --git a/decoder/JSON_parser.c b/decoder/JSON_parser.c new file mode 100644 index 00000000..175b7cc9 --- /dev/null +++ b/decoder/JSON_parser.c @@ -0,0 +1,1012 @@ +/* JSON_parser.c */ + +/* 2007-08-24 */ + +/* +Copyright (c) 2005 JSON.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* +    Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009. + +    The license above also applies to the added features. + +    Changelog: +        2009-05-17 +            Incorporated benrudiak@googlemail.com's fix for UTF-16 decoding. + +        2009-05-14 +            Fixed float parsing bug related to a locale being set that didn't +            use '.' as decimal point character (charles@transmissionbt.com). + +        2008-10-14 +            Renamed states.IN to states.IT to avoid a name clash with the IN macro +            defined in windef.h (alexey.pelykh@gmail.com) + +        2008-07-19 +            Removed some duplicate code & a debugging variable (charles@transmissionbt.com) + +        2008-05-28 +            Made the JSON_value structure ANSI C compliant. This bug was reported by +            trisk@acm.jhu.edu + +        2008-05-20 +            Fixed bug reported by charles@transmissionbt.com where switching +            from the static to the dynamic parse buffer did not copy the static parse +            buffer's content. 
+*/ + + + +#include <assert.h> +#include <ctype.h> +#include <float.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> + +#include "JSON_parser.h" + +#ifdef _MSC_VER +#   if _MSC_VER >= 1400 /* Visual Studio 2005 and up */ +#      pragma warning(disable:4996) // unsecure sscanf +#   endif +#endif + + +#define true  1 +#define false 0 +#define __   -1     /* the universal error code */ + +/* values chosen so that the object size is approx equal to one page (4K) */ +#ifndef JSON_PARSER_STACK_SIZE +#   define JSON_PARSER_STACK_SIZE 128 +#endif + +#ifndef JSON_PARSER_PARSE_BUFFER_SIZE +#   define JSON_PARSER_PARSE_BUFFER_SIZE 3500 +#endif + +typedef unsigned short UTF16; + +struct JSON_parser_struct { +    JSON_parser_callback callback; +    void* ctx; +    signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; +    UTF16 utf16_high_surrogate; +    long depth; +    long top; +    signed char* stack; +    long stack_capacity; +    char decimal_point; +    char* parse_buffer; +    size_t parse_buffer_capacity; +    size_t parse_buffer_count; +    size_t comment_begin_offset; +    signed char static_stack[JSON_PARSER_STACK_SIZE]; +    char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; +}; + +#define COUNTOF(x) (sizeof(x)/sizeof(x[0]))  + +/* +    Characters are mapped into these character classes. This allows for +    a significant reduction in the size of the state transition table. +*/ + + + +enum classes { +    C_SPACE,  /* space */ +    C_WHITE,  /* other whitespace */ +    C_LCURB,  /* {  */ +    C_RCURB,  /* } */ +    C_LSQRB,  /* [ */ +    C_RSQRB,  /* ] */ +    C_COLON,  /* : */ +    C_COMMA,  /* , */ +    C_QUOTE,  /* " */ +    C_BACKS,  /* \ */ +    C_SLASH,  /* / */ +    C_PLUS,   /* + */ +    C_MINUS,  /* - */ +    C_POINT,  /* . */ +    C_ZERO ,  /* 0 */ +    C_DIGIT,  /* 123456789 */ +    C_LOW_A,  /* a */ +    C_LOW_B,  /* b */ +    C_LOW_C,  /* c */ +    C_LOW_D,  /* d */ +    C_LOW_E,  /* e */ +    C_LOW_F,  /* f */ +    C_LOW_L,  /* l */ +    C_LOW_N,  /* n */ +    C_LOW_R,  /* r */ +    C_LOW_S,  /* s */ +    C_LOW_T,  /* t */ +    C_LOW_U,  /* u */ +    C_ABCDF,  /* ABCDF */ +    C_E,      /* E */ +    C_ETC,    /* everything else */ +    C_STAR,   /* * */    +    NR_CLASSES +}; + +static int ascii_class[128] = { +/* +    This array maps the 128 ASCII characters into character classes. +    The remaining Unicode characters should be mapped to C_ETC. +    Non-whitespace control characters are errors. 
+*/ +    __,      __,      __,      __,      __,      __,      __,      __, +    __,      C_WHITE, C_WHITE, __,      __,      C_WHITE, __,      __, +    __,      __,      __,      __,      __,      __,      __,      __, +    __,      __,      __,      __,      __,      __,      __,      __, + +    C_SPACE, C_ETC,   C_QUOTE, C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC, +    C_ETC,   C_ETC,   C_STAR,  C_PLUS,  C_COMMA, C_MINUS, C_POINT, C_SLASH, +    C_ZERO,  C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, +    C_DIGIT, C_DIGIT, C_COLON, C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC, + +    C_ETC,   C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E,     C_ABCDF, C_ETC, +    C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC, +    C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_ETC, +    C_ETC,   C_ETC,   C_ETC,   C_LSQRB, C_BACKS, C_RSQRB, C_ETC,   C_ETC, + +    C_ETC,   C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, +    C_ETC,   C_ETC,   C_ETC,   C_ETC,   C_LOW_L, C_ETC,   C_LOW_N, C_ETC, +    C_ETC,   C_ETC,   C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC,   C_ETC, +    C_ETC,   C_ETC,   C_ETC,   C_LCURB, C_ETC,   C_RCURB, C_ETC,   C_ETC +}; + + +/* +    The state codes. +*/ +enum states { +    GO,  /* start    */ +    OK,  /* ok       */ +    OB,  /* object   */ +    KE,  /* key      */ +    CO,  /* colon    */ +    VA,  /* value    */ +    AR,  /* array    */ +    ST,  /* string   */ +    ES,  /* escape   */ +    U1,  /* u1       */ +    U2,  /* u2       */ +    U3,  /* u3       */ +    U4,  /* u4       */ +    MI,  /* minus    */ +    ZE,  /* zero     */ +    IT,  /* integer  */ +    FR,  /* fraction */ +    E1,  /* e        */ +    E2,  /* ex       */ +    E3,  /* exp      */ +    T1,  /* tr       */ +    T2,  /* tru      */ +    T3,  /* true     */ +    F1,  /* fa       */ +    F2,  /* fal      */ +    F3,  /* fals     */ +    F4,  /* false    */ +    N1,  /* nu       */ +    N2,  /* nul      */ +    N3,  /* null     */ +    C1,  /* /        */ +    C2,  /* / *      */ +    C3,  /* *        */ +    FX,  /* *.* *eE* */ +    D1,  /* second UTF-16 character decoding started by \ */ +    D2,  /* second UTF-16 character proceeded by u */ +    NR_STATES +}; + +enum actions +{ +    CB = -10, /* comment begin */ +    CE = -11, /* comment end */ +    FA = -12, /* false */ +    TR = -13, /* true */ +    NU = -14, /* null */ +    DE = -15, /* double detected by exponent e E */ +    DF = -16, /* double detected by fraction . */ +    SB = -17, /* string begin */ +    MX = -18, /* integer detected by minus */ +    ZX = -19, /* integer detected by zero */ +    IX = -20, /* integer detected by 1-9 */ +    EX = -21, /* next char is escaped */ +    UC = -22  /* Unicode character read */ +}; + + +static int state_transition_table[NR_STATES][NR_CLASSES] = { +/* +    The state transition table takes the current state and the current symbol, +    and returns either a new state or an action. An action is represented as a +    negative number. A JSON text is accepted if at the end of the text the +    state is OK and if the mode is MODE_DONE. + +                 white                                      1-9                                   ABCDF  etc +             space |  {  }  [  ]  :  ,  "  \  /  +  -  .  
0  |  a  b  c  d  e  f  l  n  r  s  t  u  |  E  |  * */ +/*start  GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ok     OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*key    KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*colon  CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*value  VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*array  AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST}, +/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__}, +/*u1     U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__}, +/*u2     U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__}, +/*u3     U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__}, +/*u4     U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__}, +/*minus  MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*zero   ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*int    IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__}, +/*frac   FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*e      E1*/ {__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ex     E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*exp    E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*tr     T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__}, +/*tru    T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__}, +/*true   T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*fa     F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*fal    F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__}, +/*fals   F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__}, +/*false  F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*nu     N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__}, +/*nul    N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__}, +/*null   N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__}, +/*/      C1*/ 
{__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2}, +/*/*     C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/**      C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/*_.     FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*\      D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*\      D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__}, +}; + + +/* +    These modes can be pushed on the stack. +*/ +enum modes { +    MODE_ARRAY = 1,  +    MODE_DONE = 2,   +    MODE_KEY = 3,    +    MODE_OBJECT = 4 +}; + +static int +push(JSON_parser jc, int mode) +{ +/* +    Push a mode onto the stack. Return false if there is overflow. +*/ +    jc->top += 1; +    if (jc->depth < 0) { +        if (jc->top >= jc->stack_capacity) { +            size_t bytes_to_allocate; +            jc->stack_capacity *= 2; +            bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]); +            if (jc->stack == &jc->static_stack[0]) { +                jc->stack = (signed char*)malloc(bytes_to_allocate); +                memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack)); +            } else { +                jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate); +            } +        } +    } else { +        if (jc->top >= jc->depth) { +            return false; +        } +    } +     +    jc->stack[jc->top] = mode; +    return true; +} + + +static int +pop(JSON_parser jc, int mode) +{ +/* +    Pop the stack, assuring that the current mode matches the expectation. +    Return false if there is underflow or if the modes mismatch. +*/ +    if (jc->top < 0 || jc->stack[jc->top] != mode) { +        return false; +    } +    jc->top -= 1; +    return true; +} + + +#define parse_buffer_clear(jc) \ +    do {\ +        jc->parse_buffer_count = 0;\ +        jc->parse_buffer[0] = 0;\ +    } while (0) +     +#define parse_buffer_pop_back_char(jc)\ +    do {\ +        assert(jc->parse_buffer_count >= 1);\ +        --jc->parse_buffer_count;\ +        jc->parse_buffer[jc->parse_buffer_count] = 0;\ +    } while (0)     +     +void delete_JSON_parser(JSON_parser jc) +{ +    if (jc) { +        if (jc->stack != &jc->static_stack[0]) { +            free((void*)jc->stack); +        } +        if (jc->parse_buffer != &jc->static_parse_buffer[0]) { +            free((void*)jc->parse_buffer); +        } +        free((void*)jc); +     }    +} + + +JSON_parser +new_JSON_parser(JSON_config* config) +{ +/* +    new_JSON_parser starts the checking process by constructing a JSON_parser +    object. It takes a depth parameter that restricts the level of maximum +    nesting. + +    To continue the process, call JSON_parser_char for each character in the +    JSON text, and then call JSON_parser_done to obtain the final result. +    These functions are fully reentrant. 
+*/ + +    int depth = 0; +    JSON_config default_config; +     +    JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct)); +     +    memset(jc, 0, sizeof(*jc)); +     +     +    /* initialize configuration */ +    init_JSON_config(&default_config); +     +    /* set to default configuration if none was provided */ +    if (config == NULL) { +        config = &default_config; +    } + +    depth = config->depth; +     +    /* We need to be able to push at least one object */ +    if (depth == 0) { +        depth = 1; +    } +     +    jc->state = GO; +    jc->top = -1; +     +    /* Do we want non-bound stack? */ +    if (depth > 0) { +        jc->stack_capacity = depth; +        jc->depth = depth; +        if (depth <= (int)COUNTOF(jc->static_stack)) { +            jc->stack = &jc->static_stack[0]; +        } else { +            jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0])); +        } +    } else { +        jc->stack_capacity = COUNTOF(jc->static_stack); +        jc->depth = -1; +        jc->stack = &jc->static_stack[0]; +    } +     +    /* set parser to start */ +    push(jc, MODE_DONE); +     +    /* set up the parse buffer */ +    jc->parse_buffer = &jc->static_parse_buffer[0]; +    jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer); +    parse_buffer_clear(jc); +     +    /* set up callback, comment & float handling */ +    jc->callback = config->callback; +    jc->ctx = config->callback_ctx; +    jc->allow_comments = config->allow_comments != 0; +    jc->handle_floats_manually = config->handle_floats_manually != 0; +     +    /* set up decimal point */ +    jc->decimal_point = *localeconv()->decimal_point; +     +    return jc; +} + +static void grow_parse_buffer(JSON_parser jc) +{ +    size_t bytes_to_allocate; +    jc->parse_buffer_capacity *= 2; +    bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]); +    if (jc->parse_buffer == &jc->static_parse_buffer[0]) { +        jc->parse_buffer = (char*)malloc(bytes_to_allocate); +        memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count); +    } else { +        jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate); +    } +} + +#define parse_buffer_push_back_char(jc, c)\ +    do {\ +        if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\ +        jc->parse_buffer[jc->parse_buffer_count++] = c;\ +        jc->parse_buffer[jc->parse_buffer_count]   = 0;\ +    } while (0) + +#define assert_is_non_container_type(jc) \ +    assert( \ +        jc->type == JSON_T_NULL || \ +        jc->type == JSON_T_FALSE || \ +        jc->type == JSON_T_TRUE || \ +        jc->type == JSON_T_FLOAT || \ +        jc->type == JSON_T_INTEGER || \ +        jc->type == JSON_T_STRING) +     + +static int parse_parse_buffer(JSON_parser jc) +{ +    if (jc->callback) { +        JSON_value value, *arg = NULL; +         +        if (jc->type != JSON_T_NONE) { +            assert_is_non_container_type(jc); +         +            switch(jc->type) { +                case JSON_T_FLOAT: +                    arg = &value; +                    if (jc->handle_floats_manually) { +                        value.vu.str.value = jc->parse_buffer; +                        value.vu.str.length = jc->parse_buffer_count; +                    } else {  +                        /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/ +                         +                        /* not checking with end pointer b/c 
there may be trailing ws */ +                        value.vu.float_value = strtold(jc->parse_buffer, NULL); +                    } +                    break; +                case JSON_T_INTEGER: +                    arg = &value; +                    sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); +                    break; +                case JSON_T_STRING: +                    arg = &value; +                    value.vu.str.value = jc->parse_buffer; +                    value.vu.str.length = jc->parse_buffer_count; +                    break; +            } +             +            if (!(*jc->callback)(jc->ctx, jc->type, arg)) { +                return false; +            } +        } +    } +     +    parse_buffer_clear(jc); +     +    return true; +} + +#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) +#define IS_LOW_SURROGATE(uc)  (((uc) & 0xFC00) == 0xDC00) +#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) +static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; + +static int decode_unicode_char(JSON_parser jc) +{ +    int i; +    unsigned uc = 0; +    char* p; +    int trail_bytes; +     +    assert(jc->parse_buffer_count >= 6); +     +    p = &jc->parse_buffer[jc->parse_buffer_count - 4]; +     +    for (i = 12; i >= 0; i -= 4, ++p) { +        unsigned x = *p; +         +        if (x >= 'a') { +            x -= ('a' - 10); +        } else if (x >= 'A') { +            x -= ('A' - 10); +        } else { +            x &= ~0x30u; +        } +         +        assert(x < 16); +         +        uc |= x << i; +    } +     +    /* clear UTF-16 char from buffer */ +    jc->parse_buffer_count -= 6; +    jc->parse_buffer[jc->parse_buffer_count] = 0; +     +    /* attempt decoding ... 
*/ +    if (jc->utf16_high_surrogate) { +        if (IS_LOW_SURROGATE(uc)) { +            uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc); +            trail_bytes = 3; +            jc->utf16_high_surrogate = 0; +        } else { +            /* high surrogate without a following low surrogate */ +            return false; +        } +    } else { +        if (uc < 0x80) { +            trail_bytes = 0; +        } else if (uc < 0x800) { +            trail_bytes = 1; +        } else if (IS_HIGH_SURROGATE(uc)) { +            /* save the high surrogate and wait for the low surrogate */ +            jc->utf16_high_surrogate = uc; +            return true; +        } else if (IS_LOW_SURROGATE(uc)) { +            /* low surrogate without a preceding high surrogate */ +            return false; +        } else { +            trail_bytes = 2; +        } +    } +     +    jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); +     +    for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { +        jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80); +    } + +    jc->parse_buffer[jc->parse_buffer_count] = 0; +     +    return true; +} + +static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char) +{ +    jc->escaped = 0; +    /* remove the backslash */ +    parse_buffer_pop_back_char(jc); +    switch(next_char) { +        case 'b': +            parse_buffer_push_back_char(jc, '\b'); +            break; +        case 'f': +            parse_buffer_push_back_char(jc, '\f'); +            break; +        case 'n': +            parse_buffer_push_back_char(jc, '\n'); +            break; +        case 'r': +            parse_buffer_push_back_char(jc, '\r'); +            break; +        case 't': +            parse_buffer_push_back_char(jc, '\t'); +            break; +        case '"': +            parse_buffer_push_back_char(jc, '"'); +            break; +        case '\\': +            parse_buffer_push_back_char(jc, '\\'); +            break; +        case '/': +            parse_buffer_push_back_char(jc, '/'); +            break; +        case 'u': +            parse_buffer_push_back_char(jc, '\\'); +            parse_buffer_push_back_char(jc, 'u'); +            break; +        default: +            return false; +    } + +    return true; +} + +#define add_char_to_parse_buffer(jc, next_char, next_class) \ +    do { \ +        if (jc->escaped) { \ +            if (!add_escaped_char_to_parse_buffer(jc, next_char)) \ +                return false; \ +        } else if (!jc->comment) { \ +            if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \ +                parse_buffer_push_back_char(jc, (char)next_char); \ +            } \ +        } \ +    } while (0) +     + +#define assert_type_isnt_string_null_or_bool(jc) \ +    assert(jc->type != JSON_T_FALSE); \ +    assert(jc->type != JSON_T_TRUE); \ +    assert(jc->type != JSON_T_NULL); \ +    assert(jc->type != JSON_T_STRING) + + +int +JSON_parser_char(JSON_parser jc, int next_char) +{ +/* +    After calling new_JSON_parser, call this function for each character (or +    partial character) in your JSON text. It can accept UTF-8, UTF-16, or +    UTF-32. It returns true if things are looking ok so far. If it rejects the +    text, it returns false. +*/ +    int next_class, next_state; +     +/* +    Determine the character's class. 
+*/ +    if (next_char < 0) { +        return false; +    } +    if (next_char >= 128) { +        next_class = C_ETC; +    } else { +        next_class = ascii_class[next_char]; +        if (next_class <= __) { +            return false; +        } +    } +     +    add_char_to_parse_buffer(jc, next_char, next_class); +     +/* +    Get the next state from the state transition table. +*/ +    next_state = state_transition_table[jc->state][next_class]; +    if (next_state >= 0) { +/* +    Change the state. +*/ +        jc->state = next_state; +    } else { +/* +    Or perform one of the actions. +*/ +        switch (next_state) { +/* Unicode character */         +        case UC: +            if(!decode_unicode_char(jc)) { +                return false; +            } +            /* check if we need to read a second UTF-16 char */ +            if (jc->utf16_high_surrogate) { +                jc->state = D1; +            } else { +                jc->state = ST; +            } +            break; +/* escaped char */ +        case EX: +            jc->escaped = 1; +            jc->state = ES; +            break; +/* integer detected by minus */ +        case MX: +            jc->type = JSON_T_INTEGER; +            jc->state = MI; +            break;   +/* integer detected by zero */             +        case ZX: +            jc->type = JSON_T_INTEGER; +            jc->state = ZE; +            break;   +/* integer detected by 1-9 */             +        case IX: +            jc->type = JSON_T_INTEGER; +            jc->state = IT; +            break;   +             +/* floating point number detected by exponent*/ +        case DE: +            assert_type_isnt_string_null_or_bool(jc); +            jc->type = JSON_T_FLOAT; +            jc->state = E1; +            break;    +         +/* floating point number detected by fraction */ +        case DF: +            assert_type_isnt_string_null_or_bool(jc); +            if (!jc->handle_floats_manually) { +/* +    Some versions of strtod (which underlies sscanf) don't support converting  +    C-locale formated floating point values. 
+*/            +                assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.'); +                jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point; +            }             +            jc->type = JSON_T_FLOAT; +            jc->state = FX; +            break;    +/* string begin " */ +        case SB: +            parse_buffer_clear(jc); +            assert(jc->type == JSON_T_NONE); +            jc->type = JSON_T_STRING; +            jc->state = ST; +            break;         +         +/* n */ +        case NU: +            assert(jc->type == JSON_T_NONE); +            jc->type = JSON_T_NULL; +            jc->state = N1; +            break;         +/* f */ +        case FA: +            assert(jc->type == JSON_T_NONE); +            jc->type = JSON_T_FALSE; +            jc->state = F1; +            break;         +/* t */ +        case TR: +            assert(jc->type == JSON_T_NONE); +            jc->type = JSON_T_TRUE; +            jc->state = T1; +            break;         +         +/* closing comment */ +        case CE: +            jc->comment = 0; +            assert(jc->parse_buffer_count == 0); +            assert(jc->type == JSON_T_NONE); +            jc->state = jc->before_comment_state; +            break;         +         +/* opening comment  */ +        case CB: +            if (!jc->allow_comments) { +                return false; +            } +            parse_buffer_pop_back_char(jc); +            if (!parse_parse_buffer(jc)) { +                return false; +            } +            assert(jc->parse_buffer_count == 0); +            assert(jc->type != JSON_T_STRING); +            switch (jc->stack[jc->top]) { +            case MODE_ARRAY: +            case MODE_OBJECT:    +                switch(jc->state) { +                case VA: +                case AR: +                    jc->before_comment_state = jc->state; +                    break; +                default: +                    jc->before_comment_state = OK; +                    break; +                } +                break; +            default: +                jc->before_comment_state = jc->state; +                break; +            } +            jc->type = JSON_T_NONE; +            jc->state = C1; +            jc->comment = 1; +            break; +/* empty } */ +        case -9:         +            parse_buffer_clear(jc); +            if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { +                return false; +            } +            if (!pop(jc, MODE_KEY)) { +                return false; +            } +            jc->state = OK; +            break; + +/* } */ case -8: +            parse_buffer_pop_back_char(jc); +            if (!parse_parse_buffer(jc)) { +                return false; +            } +            if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { +                return false; +            } +            if (!pop(jc, MODE_OBJECT)) { +                return false; +            } +            jc->type = JSON_T_NONE; +            jc->state = OK; +            break; + +/* ] */ case -7: +            parse_buffer_pop_back_char(jc); +            if (!parse_parse_buffer(jc)) { +                return false; +            } +            if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) { +                return false; +            } +            if (!pop(jc, MODE_ARRAY)) { +                return false; +            } +             +            jc->type = JSON_T_NONE; +           
 jc->state = OK; +            break; + +/* { */ case -6: +            parse_buffer_pop_back_char(jc); +            if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) { +                return false; +            } +            if (!push(jc, MODE_KEY)) { +                return false; +            } +            assert(jc->type == JSON_T_NONE); +            jc->state = OB; +            break; + +/* [ */ case -5: +            parse_buffer_pop_back_char(jc); +            if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) { +                return false; +            } +            if (!push(jc, MODE_ARRAY)) { +                return false; +            } +            assert(jc->type == JSON_T_NONE); +            jc->state = AR; +            break; + +/* string end " */ case -4: +            parse_buffer_pop_back_char(jc); +            switch (jc->stack[jc->top]) { +            case MODE_KEY: +                assert(jc->type == JSON_T_STRING); +                jc->type = JSON_T_NONE; +                jc->state = CO; +                 +                if (jc->callback) { +                    JSON_value value; +                    value.vu.str.value = jc->parse_buffer; +                    value.vu.str.length = jc->parse_buffer_count; +                    if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) { +                        return false; +                    } +                } +                parse_buffer_clear(jc); +                break; +            case MODE_ARRAY: +            case MODE_OBJECT: +                assert(jc->type == JSON_T_STRING); +                if (!parse_parse_buffer(jc)) { +                    return false; +                } +                jc->type = JSON_T_NONE; +                jc->state = OK; +                break; +            default: +                return false; +            } +            break; + +/* , */ case -3: +            parse_buffer_pop_back_char(jc); +            if (!parse_parse_buffer(jc)) { +                return false; +            } +            switch (jc->stack[jc->top]) { +            case MODE_OBJECT: +/* +    A comma causes a flip from object mode to key mode. +*/ +                if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { +                    return false; +                } +                assert(jc->type != JSON_T_STRING); +                jc->type = JSON_T_NONE; +                jc->state = KE; +                break; +            case MODE_ARRAY: +                assert(jc->type != JSON_T_STRING); +                jc->type = JSON_T_NONE; +                jc->state = VA; +                break; +            default: +                return false; +            } +            break; + +/* : */ case -2: +/* +    A colon causes a flip from key mode to object mode. +*/ +            parse_buffer_pop_back_char(jc); +            if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { +                return false; +            } +            assert(jc->type == JSON_T_NONE); +            jc->state = VA; +            break; +/* +    Bad action. 
+*/ +        default: +            return false; +        } +    } +    return true; +} + + +int +JSON_parser_done(JSON_parser jc) +{ +    const int result = jc->state == OK && pop(jc, MODE_DONE); + +    return result; +} + + +int JSON_parser_is_legal_white_space_string(const char* s) +{ +    int c, char_class; + +    if (s == NULL) { +        return false; +    } + +    for (; *s; ++s) { +        c = *s; + +        if (c < 0 || c >= 128) { +            return false; +        } + +        char_class = ascii_class[c]; + +        if (char_class != C_SPACE && char_class != C_WHITE) { +            return false; +        } +    } + +    return true; +} + + + +void init_JSON_config(JSON_config* config) +{ +    if (config) { +        memset(config, 0, sizeof(*config)); + +        config->depth = JSON_PARSER_STACK_SIZE - 1; +    } +} diff --git a/decoder/JSON_parser.h b/decoder/JSON_parser.h new file mode 100644 index 00000000..ceb5b24b --- /dev/null +++ b/decoder/JSON_parser.h @@ -0,0 +1,152 @@ +#ifndef JSON_PARSER_H +#define JSON_PARSER_H + +/* JSON_parser.h */ + + +#include <stddef.h> + +/* Windows DLL stuff */ +#ifdef _WIN32 +#   ifdef JSON_PARSER_DLL_EXPORTS +#       define JSON_PARSER_DLL_API __declspec(dllexport) +#   else +#       define JSON_PARSER_DLL_API __declspec(dllimport) +#   endif +#else +#   define JSON_PARSER_DLL_API +#endif + +/* Determine the integer type used to parse non-floating point numbers */ +#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1 +typedef long long JSON_int_t; +#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld" +#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld" +#else +typedef long JSON_int_t; +#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld" +#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld" +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum +{ +    JSON_T_NONE = 0, +    JSON_T_ARRAY_BEGIN,  // 1 +    JSON_T_ARRAY_END,    // 2 +    JSON_T_OBJECT_BEGIN, // 3 +    JSON_T_OBJECT_END,   // 4 +    JSON_T_INTEGER,      // 5 +    JSON_T_FLOAT,        // 6 +    JSON_T_NULL,         // 7 +    JSON_T_TRUE,         // 8 +    JSON_T_FALSE,        // 9 +    JSON_T_STRING,       // 10 +    JSON_T_KEY,          // 11 +    JSON_T_MAX           // 12 +} JSON_type; + +typedef struct JSON_value_struct { +    union { +        JSON_int_t integer_value; + +        long double float_value; + +        struct { +            const char* value; +            size_t length; +        } str; +    } vu; +} JSON_value; + +typedef struct JSON_parser_struct* JSON_parser; + +/*! \brief JSON parser callback + +    \param ctx The pointer passed to new_JSON_parser. +    \param type An element of JSON_type but not JSON_T_NONE. +    \param value A representation of the parsed value. This parameter is NULL for +        JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END, +        JSON_T_NULL, JSON_T_TRUE, and JSON_T_FALSE. String values are always returned +        as zero-terminated C strings. + +    \return Non-zero if parsing should continue, else zero. +*/ +typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value); + + +/*! \brief The structure used to configure a JSON parser object + +    \param callback Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity. +    \param callback_ctx Callback context. This parameter may be NULL. +    \param depth Specifies the levels of nested JSON to allow. Negative numbers yield unlimited nesting. +    \param allow_comments To allow C style comments in JSON, set to non-zero. +    \param handle_floats_manually To decode floating point numbers manually, set this parameter to non-zero. +*/ +typedef struct { +    JSON_parser_callback     callback; +    void*                    callback_ctx; +    int                      depth; +    int                      allow_comments; +    int                      handle_floats_manually; +} JSON_config; + + +/*! \brief Initializes the JSON parser configuration structure to default values. + +    The default configuration is +    - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see JSON_parser.c) +    - no parsing, just checking for JSON syntax +    - no comments + +    \param config Used to configure the parser. +*/ +JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config); + +/*! \brief Create a JSON parser object + +    \param config Used to configure the parser. Set to NULL to use the default configuration. +        See init_JSON_config. + +    \return The parser object. +*/ +JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config); + +/*! \brief Destroy a previously created JSON parser object. */ +JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc); + +/*! \brief Parse a character. + +    \return Non-zero if all characters passed to this function so far are part of a valid JSON text. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char); + +/*! \brief Finalize parsing. + +    Call this method once after all input characters have been consumed. + +    \return Non-zero if the parsed input forms a valid JSON text, zero otherwise. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc); + +/*! \brief Determine if a given string consists only of valid JSON white space + +    \return Non-zero if the string is valid, zero otherwise. 
+*/ +JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s); + + +#ifdef __cplusplus +} +#endif  +     + +#endif /* JSON_PARSER_H */ diff --git a/decoder/Makefile.am b/decoder/Makefile.am new file mode 100644 index 00000000..a385197c --- /dev/null +++ b/decoder/Makefile.am @@ -0,0 +1,84 @@ +bin_PROGRAMS = cdec + +if HAVE_GTEST +noinst_PROGRAMS = \ +  dict_test \ +  weights_test \ +  trule_test \ +  hg_test \ +  ff_test \ +  logval_test \ +  parser_test \ +  grammar_test \ +  small_vector_test +endif + +cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc +small_vector_test_SOURCES = small_vector_test.cc +small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +parser_test_SOURCES = parser_test.cc +parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +dict_test_SOURCES = dict_test.cc +dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +ff_test_SOURCES = ff_test.cc +ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +grammar_test_SOURCES = grammar_test.cc +grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +hg_test_SOURCES = hg_test.cc +hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +trule_test_SOURCES = trule_test.cc +trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +weights_test_SOURCES = weights_test.cc +weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +logval_test_SOURCES = logval_test.cc +logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) + +LDADD = libcdec.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) +AM_LDFLAGS = -lz + +rule_lexer.cc: rule_lexer.l +	$(LEX) -s -CF -8 -o$@ $< + +noinst_LIBRARIES = libcdec.a + +libcdec_a_SOURCES = \ +  rule_lexer.cc \ +  fst_translator.cc \ +  csplit.cc \ +  translator.cc \ +  scfg_translator.cc \ +  hg.cc \ +  hg_io.cc \ +  hg_intersect.cc \ +  viterbi.cc \ +  lattice.cc \ +  aligner.cc \ +  gzstream.cc \ +  apply_models.cc \ +  earley_composer.cc \ +  phrasetable_fst.cc \ +  sparse_vector.cc \ +  trule.cc \ +  filelib.cc \ +  stringlib.cc \ +  fdict.cc \ +  tdict.cc \ +  weights.cc \ +  ttables.cc \ +  ff.cc \ +  ff_lm.cc \ +  ff_wordalign.cc \ +  ff_csplit.cc \ +  ff_tagger.cc \ +  tromble_loss.cc \ +  freqdict.cc \ +  lexalign.cc \ +  lextrans.cc \ +  tagger.cc \ +  bottom_up_parser.cc \ +  phrasebased_translator.cc \ +  JSON_parser.c \ +  json_parse.cc \ +  grammar.cc diff --git a/decoder/aligner.cc b/decoder/aligner.cc new file mode 100644 index 00000000..bad97b74 --- /dev/null +++ b/decoder/aligner.cc @@ -0,0 +1,319 @@ +#include "aligner.h" + +#include "array2d.h" +#include "hg.h" +#include "sentence_metadata.h" +#include "inside_outside.h" +#include "viterbi.h" +#include <set> + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr<Array2D<bool> > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { +  int max_x = 0; +  int max_y = 0; +  int i = 0; +  size_t pos = al.rfind(" ||| "); +  if (pos != string::npos) { i = pos + 5; } +  while (i < al.size()) { +    if (al[i] == '\n' || al[i] == '\r') break; +    int x = 0; +    while(i < al.size() && is_digit(al[i])) { +      x *= 10; +      x += al[i] - '0'; +      ++i; +    } +    if (x > max_x) max_x = x; +    assert(i < al.size()); +    if(al[i] != '-') { +      cerr << "BAD ALIGNMENT: " << al << endl; +      abort(); +    } +    ++i; +    int y = 0; +    while(i < al.size() && is_digit(al[i])) { +      y *= 10; +      y += al[i] - '0'; +      ++i; +    } +    if (y > 
max_y) max_y = y; +    while(i < al.size() && al[i] == ' ') { ++i; } +  } + +  boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(max_x + 1, max_y + 1)); +  i = 0; +  if (pos != string::npos) { i = pos + 5; } +  while (i < al.size()) { +    if (al[i] == '\n' || al[i] == '\r') break; +    int x = 0; +    while(i < al.size() && is_digit(al[i])) { +      x *= 10; +      x += al[i] - '0'; +      ++i; +    } +    assert(i < al.size()); +    assert(al[i] == '-'); +    ++i; +    int y = 0; +    while(i < al.size() && is_digit(al[i])) { +      y *= 10; +      y += al[i] - '0'; +      ++i; +    } +    (*grid)(x, y) = true; +    while(i < al.size() && al[i] == ' ') { ++i; } +  } +  // cerr << *grid << endl; +  return grid; +} + +void AlignerTools::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) { +  bool need_space = false; +  for (int i = 0; i < alignment.width(); ++i) +    for (int j = 0; j < alignment.height(); ++j) +      if (alignment(i,j)) { +        if (need_space) (*out) << ' '; else need_space = true; +        (*out) << i << '-' << j; +      } +  (*out) << endl; +} + +// used with lexical models since they may not fully generate the +// source string +void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g, +                                          vector<set<int> >* src_cov) { +  src_cov->clear(); +  src_cov->resize(g.edges_.size()); +   +  for (int i = 0; i < g.edges_.size(); ++i) { +    const Hypergraph::Edge& edge = g.edges_[i]; +    set<int>& cov = (*src_cov)[i]; +    // no words +    if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) +      continue; +    // aligned to NULL (crf ibm variant only) +    if (edge.prev_i_ == -1 || edge.i_ == -1) +      continue; +    assert(edge.j_ >= 0); +    assert(edge.prev_j_ >= 0); +    if (edge.Arity() == 0) { +      for (int k = edge.prev_i_; k < edge.prev_j_; ++k) +        cov.insert(k); +    } else { +      // note: this code, which handles mixed NT and terminal +      // rules assumes that nodes uniquely define a src and trg +      // span. 
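/* [Editor's note] A worked example for the walk below, with illustrative
   numbers: for a rule whose inverted source side is f = [17, 0, 23], an edge
   spanning prev_i_=2, prev_j_=7, and tail node 0 (indexed by -f[1]) whose
   representative in-edge spans 3 source words, the loop inserts k=2 for
   terminal 17, advances k by 3 across the tail, then inserts k=6 for
   terminal 23. */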
+      int k = edge.prev_i_; +      int j = 0; +      const vector<WordID>& f = edge.rule_->e();  // rules are inverted +      while (k < edge.prev_j_) { +        if (f[j] > 0) { +          cov.insert(k); +          // cerr << "src: " << k << endl; +          ++k; +          ++j; +        } else { +          const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]]; +          assert(tailnode.in_edges_.size() > 0); +          // any edge will do: +          const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; +          //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl;  // src span +          k += (rep_edge.prev_j_ - rep_edge.prev_i_);  // src span +          ++j; +        } +      } +    } +  } +} + +int SourceEdgeCoveragesUsingTree(const Hypergraph& g, +                                 int node_id, +                                 int span_start, +                                 vector<int>* spans, +                                 vector<set<int> >* src_cov) { +  const Hypergraph::Node& node = g.nodes_[node_id]; +  int k = -1; +  for (int i = 0; i < node.in_edges_.size(); ++i) { +    const int edge_id = node.in_edges_[i]; +    const Hypergraph::Edge& edge = g.edges_[edge_id]; +    set<int>& cov = (*src_cov)[edge_id]; +    const vector<WordID>& f = edge.rule_->e();  // rules are inverted +    int j = 0; +    k = span_start; +    while (j < f.size()) { +      if (f[j] > 0) { +        cov.insert(k); +        ++k; +        ++j; +      } else { +        const int tail_node_id = edge.tail_nodes_[-f[j]]; +        int &right_edge = (*spans)[tail_node_id]; +        if (right_edge < 0) +          right_edge = SourceEdgeCoveragesUsingTree(g, tail_node_id, k, spans, src_cov); +        k = right_edge; +        ++j; +      } +    } +  } +  return k; +} + +void SourceEdgeCoveragesUsingTree(const Hypergraph& g, +                                  vector<set<int> >* src_cov) { +  src_cov->clear(); +  src_cov->resize(g.edges_.size()); +  vector<int> span_sizes(g.nodes_.size(), -1); +  SourceEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, src_cov); +} + +int TargetEdgeCoveragesUsingTree(const Hypergraph& g, +                                 int node_id, +                                 int span_start, +                                 vector<int>* spans, +                                 vector<set<int> >* trg_cov) { +  const Hypergraph::Node& node = g.nodes_[node_id]; +  int k = -1; +  for (int i = 0; i < node.in_edges_.size(); ++i) { +    const int edge_id = node.in_edges_[i]; +    const Hypergraph::Edge& edge = g.edges_[edge_id]; +    set<int>& cov = (*trg_cov)[edge_id]; +    int ntc = 0; +    const vector<WordID>& e = edge.rule_->f();  // rules are inverted +    int j = 0; +    k = span_start; +    while (j < e.size()) { +      if (e[j] > 0) { +        cov.insert(k); +        ++k; +        ++j; +      } else { +        const int tail_node_id = edge.tail_nodes_[ntc]; +        ++ntc; +        int &right_edge = (*spans)[tail_node_id]; +        if (right_edge < 0) +          right_edge = TargetEdgeCoveragesUsingTree(g, tail_node_id, k, spans, trg_cov); +        k = right_edge; +        ++j; +      } +    } +    // cerr << "node=" << node_id << ": k=" << k << endl; +  } +  return k; +} + +void TargetEdgeCoveragesUsingTree(const Hypergraph& g, +                                  vector<set<int> >* trg_cov) { +  trg_cov->clear(); +  trg_cov->resize(g.edges_.size()); +  vector<int> span_sizes(g.nodes_.size(), -1); +  
TargetEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, trg_cov); +} + +struct TransitionEventWeightFunction { +  inline SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const { +    SparseVector<prob_t> result; +    result.set_value(e.id_, e.edge_prob_); +    return result; +  } +}; + +// this code is rather complicated since it must deal with generating alignments +// when lattices are specified as input as well as with models that do not generate +// full sentence pairs (like lexical alignment models) +void AlignerTools::WriteAlignment(const Lattice& src_lattice, +                                  const Lattice& trg_lattice, +                                  const Hypergraph& in_g, +                                  ostream* out, +                                  bool map_instead_of_viterbi, +                                  const vector<bool>* edges) { +  bool fix_up_src_spans = false; +  const Hypergraph* g = &in_g; +  if (!src_lattice.IsSentence() || +      !trg_lattice.IsSentence()) { +    if (map_instead_of_viterbi) { +      cerr << "  Lattice alignment: using Viterbi instead of MAP alignment\n"; +    } +    map_instead_of_viterbi = false; +    fix_up_src_spans = !src_lattice.IsSentence(); +  } +  if (!map_instead_of_viterbi || edges) { +    Hypergraph* new_hg = in_g.CreateViterbiHypergraph(edges); +    for (int i = 0; i < new_hg->edges_.size(); ++i) +      new_hg->edges_[i].edge_prob_ = prob_t::One(); +    g = new_hg; +  } + +  vector<prob_t> edge_posteriors(g->edges_.size(), prob_t::Zero()); +  vector<WordID> trg_sent; +  vector<WordID> src_sent; +  if (fix_up_src_spans) { +    ViterbiESentence(*g, &src_sent); +  } else { +    src_sent.resize(src_lattice.size()); +    for (int i = 0; i < src_sent.size(); ++i) +      src_sent[i] = src_lattice[i][0].label; +  } + +  ViterbiFSentence(*g, &trg_sent); + +  if (edges || !map_instead_of_viterbi) { +    for (int i = 0; i < edge_posteriors.size(); ++i) +      edge_posteriors[i] = prob_t::One(); +  } else {  +    SparseVector<prob_t> posts; +    const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, TransitionEventWeightFunction>(*g, &posts); +    for (int i = 0; i < edge_posteriors.size(); ++i) +      edge_posteriors[i] = posts[i] / z; +  } +  vector<set<int> > src_cov(g->edges_.size()); +  vector<set<int> > trg_cov(g->edges_.size()); +  TargetEdgeCoveragesUsingTree(*g, &trg_cov); + +  if (fix_up_src_spans) +    SourceEdgeCoveragesUsingTree(*g, &src_cov); +  else +    SourceEdgeCoveragesUsingParseIndices(*g, &src_cov); + +  // figure out the src and reference size; +  int src_size = src_sent.size(); +  int ref_size = trg_sent.size(); +  Array2D<prob_t> align(src_size, ref_size, prob_t::Zero()); +  for (int c = 0; c < g->edges_.size(); ++c) { +    const prob_t& p = edge_posteriors[c]; +    const set<int>& srcs = src_cov[c]; +    const set<int>& trgs = trg_cov[c]; +    for (set<int>::const_iterator si = srcs.begin(); +         si != srcs.end(); ++si) { +      for (set<int>::const_iterator ti = trgs.begin(); +           ti != trgs.end(); ++ti) { +        align(*si, *ti) += p; +      } +    } +  } +  if (g != &in_g) { delete g; g = NULL; } + +  prob_t threshold(0.9); +  const bool use_soft_threshold = true; // TODO configure + +  Array2D<bool> grid(src_size, ref_size, false); +  for (int j = 0; j < ref_size; ++j) { +    if (use_soft_threshold) { +      threshold = prob_t::Zero(); +      for (int i = 0; i < src_size; ++i) +        if (align(i, j) > threshold) threshold = align(i, j); +      //threshold 
*= prob_t(0.99); +    } +    for (int i = 0; i < src_size; ++i) +      grid(i, j) = align(i, j) >= threshold; +  } +  if (out == &cout) { +    // TODO need to do some sort of verbose flag +    cerr << align << endl; +    cerr << grid << endl; +  } +  (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; +  SerializePharaohFormat(grid, out); +} + diff --git a/decoder/aligner.h b/decoder/aligner.h new file mode 100644 index 00000000..cd159119 --- /dev/null +++ b/decoder/aligner.h @@ -0,0 +1,28 @@ +#ifndef _ALIGNER_H_ +#define _ALIGNER_H_ + +#include <string> +#include <iostream> +#include <boost/shared_ptr.hpp> +#include "array2d.h" +#include "lattice.h" + +class Hypergraph; +class SentenceMetadata; + +struct AlignerTools { +  static boost::shared_ptr<Array2D<bool> > ReadPharaohAlignmentGrid(const std::string& al); +  static void SerializePharaohFormat(const Array2D<bool>& alignment, std::ostream* out); + +  // assumption: g contains derivations of input/ref and +  // ONLY input/ref. +  // if edges is non-NULL, the alignment corresponding to the edge rules will be written +  static void WriteAlignment(const Lattice& src, +                             const Lattice& ref, +                             const Hypergraph& g, +                             std::ostream* out, +                             bool map_instead_of_viterbi = true, +                             const std::vector<bool>* edges = NULL); +}; + +#endif diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc new file mode 100644 index 00000000..2908005f --- /dev/null +++ b/decoder/apply_models.cc @@ -0,0 +1,426 @@ +#include "apply_models.h" + +#include <vector> +#include <algorithm> +#include <tr1/unordered_map> +#include <tr1/unordered_set> + +#include <boost/functional/hash.hpp> + +#include "hg.h" +#include "ff.h" + +using namespace std; +using namespace std::tr1; + +struct Candidate; +typedef SmallVector JVector; +typedef vector<Candidate*> CandidateHeap; +typedef vector<Candidate*> CandidateList; + +// default node-state reservation (multiply by sizeof(string) for the memory used) +static const size_t kRESERVE_NUM_NODES = 500000ul; + +// life cycle: candidates are created, placed on the heap, +// and retrieved by their estimated cost; when they're +// retrieved, they're incorporated into the +LM hypergraph, +// where they also know the head node index they are +// attached to.  After they are added to the +LM hypergraph, +// the vit_prob_ and est_prob_ fields may be updated as better +// derivations are found (this happens since the successors +// of derivation d may have a better score; they are +// explored lazily).  However, the updates don't happen +// when a candidate is in the heap, so maintaining the heap +// property is not an issue. 
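// [Editor's note] A free-standing sketch of the pop-limited, best-first loop
// the comment above describes; the real logic is CubePruningRescorer::KBest
// below, and all names here are illustrative:
#include <algorithm>
#include <vector>

struct Cand { double est_prob_; };
struct HeapCmp {  // orders a max-heap on the estimated score
  bool operator()(const Cand* a, const Cand* b) const { return a->est_prob_ < b->est_prob_; }
};

void PopLimitedSearch(std::vector<Cand*>& cand, int pop_limit) {
  std::make_heap(cand.begin(), cand.end(), HeapCmp());
  int pops = 0;
  while (!cand.empty() && pops < pop_limit) {
    std::pop_heap(cand.begin(), cand.end(), HeapCmp());
    Cand* item = cand.back();  // best remaining candidate by est_prob_
    cand.pop_back();
    // ... lazily push item's successors, then incorporate item into the +LM forest ...
    (void)item;
    ++pops;
  }
}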
+struct Candidate { +  int node_index_;                     // -1 until incorporated +                                       // into the +LM forest +  const Hypergraph::Edge* in_edge_;    // in -LM forest +  Hypergraph::Edge out_edge_; +  string state_; +  const JVector j_; +  prob_t vit_prob_;            // these are fixed until the cand +                               // is popped, then they may be updated +  prob_t est_prob_; + +  Candidate(const Hypergraph::Edge& e, +            const JVector& j, +            const Hypergraph& out_hg, +            const vector<CandidateList>& D, +            const vector<string>& node_states, +            const SentenceMetadata& smeta, +            const ModelSet& models, +            bool is_goal) : +      node_index_(-1), +      in_edge_(&e), +      j_(j) { +    InitializeCandidate(out_hg, smeta, D, node_states, models, is_goal); +  } + +  // used to query uniqueness +  Candidate(const Hypergraph::Edge& e, +            const JVector& j) : in_edge_(&e), j_(j) {} + +  bool IsIncorporatedIntoHypergraph() const { +    return node_index_ >= 0; +  } + +  void InitializeCandidate(const Hypergraph& out_hg, +                           const SentenceMetadata& smeta, +                           const vector<vector<Candidate*> >& D, +                           const vector<string>& node_states, +                           const ModelSet& models, +                           const bool is_goal) { +    const Hypergraph::Edge& in_edge = *in_edge_; +    out_edge_.rule_ = in_edge.rule_; +    out_edge_.feature_values_ = in_edge.feature_values_; +    out_edge_.i_ = in_edge.i_; +    out_edge_.j_ = in_edge.j_; +    out_edge_.prev_i_ = in_edge.prev_i_; +    out_edge_.prev_j_ = in_edge.prev_j_; +    Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; +    tail.resize(j_.size()); +    prob_t p = prob_t::One(); +    // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; +    for (int i = 0; i < tail.size(); ++i) { +      const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]]; +      assert(ant.IsIncorporatedIntoHypergraph()); +      tail[i] = ant.node_index_; +      p *= ant.vit_prob_; +    } +    prob_t edge_estimate = prob_t::One(); +    if (is_goal) { +      assert(tail.size() == 1); +      const string& ant_state = node_states[tail.front()]; +      models.AddFinalFeatures(ant_state, &out_edge_); +    } else { +      models.AddFeaturesToEdge(smeta, out_hg, node_states, &out_edge_, &state_, &edge_estimate); +    } +    vit_prob_ = out_edge_.edge_prob_ * p; +    est_prob_ = vit_prob_ * edge_estimate; +  } +}; + +ostream& operator<<(ostream& os, const Candidate& cand) { +  os << "CAND["; +  if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } +  else { os << "+LM_node=" << cand.node_index_; } +  os << " edge=" << cand.in_edge_->id_; +  os << " j=<"; +  for (int i = 0; i < cand.j_.size(); ++i) +    os << (i==0 ? 
"" : " ") << cand.j_[i]; +  os << "> vit=" << log(cand.vit_prob_); +  os << " est=" << log(cand.est_prob_); +  return os << ']'; +} + +struct HeapCandCompare { +  bool operator()(const Candidate* l, const Candidate* r) const { +    return l->est_prob_ < r->est_prob_; +  } +}; + +struct EstProbSorter { +  bool operator()(const Candidate* l, const Candidate* r) const { +    return l->est_prob_ > r->est_prob_; +  } +}; + +// the same candidate <edge, j> can be added multiple times if +// j is multidimensional (if you're going NW in Manhattan, you +// can first go north, then west, or you can go west then north) +// this is a hash function on the relevant variables from +// Candidate to enforce this. +struct CandidateUniquenessHash { +  size_t operator()(const Candidate* c) const { +    size_t x = 5381; +    x = ((x << 5) + x) ^ c->in_edge_->id_; +    for (int i = 0; i < c->j_.size(); ++i) +      x = ((x << 5) + x) ^ c->j_[i]; +    return x; +  } +}; + +struct CandidateUniquenessEquals { +  bool operator()(const Candidate* a, const Candidate* b) const { +    return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); +  } +}; + +typedef unordered_set<const Candidate*, CandidateUniquenessHash, CandidateUniquenessEquals> UniqueCandidateSet; +typedef unordered_map<string, Candidate*, boost::hash<string> > State2Node; + +class CubePruningRescorer { + +public: +  CubePruningRescorer(const ModelSet& m, +                      const SentenceMetadata& sm, +                      const Hypergraph& i, +                      int pop_limit, +                      Hypergraph* o) : +      models(m), +      smeta(sm), +      in(i), +      out(*o), +      D(in.nodes_.size()), +      pop_limit_(pop_limit) { +    cerr << "  Applying feature functions (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; +    node_states_.reserve(kRESERVE_NUM_NODES); +  } + +  void Apply() { +    int num_nodes = in.nodes_.size(); +    int goal_id = num_nodes - 1; +    int pregoal = goal_id - 1; +    int every = 1; +    if (num_nodes > 100) every = 10; +    assert(in.nodes_[pregoal].out_edges_.size() == 1); +    cerr << "    "; +    for (int i = 0; i < in.nodes_.size(); ++i) { +      if (i % every == 0) cerr << '.'; +      KBest(i, i == goal_id); +    } +    cerr << endl; +    cerr << "  Best path: " << log(D[goal_id].front()->vit_prob_) +         << "\t" << log(D[goal_id].front()->est_prob_) << endl; +    out.PruneUnreachable(D[goal_id].front()->node_index_); +    FreeAll(); +  } + + private: +  void FreeAll() { +    for (int i = 0; i < D.size(); ++i) { +      CandidateList& D_i = D[i]; +      for (int j = 0; j < D_i.size(); ++j) +        delete D_i[j]; +    } +    D.clear(); +  } + +  void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { +    Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_); +    new_edge->feature_values_ = item->out_edge_.feature_values_; +    new_edge->edge_prob_ = item->out_edge_.edge_prob_; +    new_edge->i_ = item->out_edge_.i_; +    new_edge->j_ = item->out_edge_.j_; +    new_edge->prev_i_ = item->out_edge_.prev_i_; +    new_edge->prev_j_ = item->out_edge_.prev_j_; +    Candidate*& o_item = (*s2n)[item->state_]; +    if (!o_item) o_item = item; +     +    int& node_id = o_item->node_index_; +    if (node_id < 0) { +      Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_); +      node_states_.push_back(item->state_); +      node_id = new_node->id_; +    } +    Hypergraph::Node* node = 
&out.nodes_[node_id];
+    out.ConnectEdgeToHeadNode(new_edge, node);
+
+    // update candidate if we have a better derivation
+    // note: the difference between the vit score and the estimated
+    // score is the same for all items with a common residual DP
+    // state
+    if (item->vit_prob_ > o_item->vit_prob_) {
+      assert(o_item->state_ == item->state_);    // sanity check!
+      o_item->est_prob_ = item->est_prob_;
+      o_item->vit_prob_ = item->vit_prob_;
+    }
+    if (item != o_item) freelist->push_back(item);
+  }
+
+  void KBest(const int vert_index, const bool is_goal) {
+    // cerr << "KBest(" << vert_index << ")\n";
+    CandidateList& D_v = D[vert_index];
+    assert(D_v.empty());
+    const Hypergraph::Node& v = in.nodes_[vert_index];
+    // cerr << "  has " << v.in_edges_.size() << " in-coming edges\n";
+    const vector<int>& in_edges = v.in_edges_;
+    CandidateHeap cand;
+    CandidateList freelist;
+    cand.reserve(in_edges.size());
+    UniqueCandidateSet unique_cands;
+    for (int i = 0; i < in_edges.size(); ++i) {
+      const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
+      const JVector j(edge.tail_nodes_.size(), 0);
+      cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
+      // do the insert outside of assert() so it still runs under NDEBUG
+      const bool is_unique = unique_cands.insert(cand.back()).second;
+      assert(is_unique);  // these should all be unique!
+      (void) is_unique;
+    }
+//    cerr << "  making heap of " << cand.size() << " candidates\n";
+    make_heap(cand.begin(), cand.end(), HeapCandCompare());
+    State2Node state2node;   // "buf" in Figure 2
+    int pops = 0;
+    while(!cand.empty() && pops < pop_limit_) {
+      pop_heap(cand.begin(), cand.end(), HeapCandCompare());
+      Candidate* item = cand.back();
+      cand.pop_back();
+      // cerr << "POPPED: " << *item << endl;
+      PushSucc(*item, is_goal, &cand, &unique_cands);
+      IncorporateIntoPlusLMForest(item, &state2node, &freelist);
+      ++pops;
+    }
+    D_v.resize(state2node.size());
+    int c = 0;
+    for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i)
+      D_v[c++] = i->second;
+    sort(D_v.begin(), D_v.end(), EstProbSorter());
+    // cerr << "  expanded to " << D_v.size() << " nodes\n";
+
+    for (int i = 0; i < cand.size(); ++i)
+      delete cand[i];
+    // the freelist is necessary since even after an item is merged, it still
+    // stays in the unique set, so it can't be deleted until now
+    for (int i = 0; i < freelist.size(); ++i)
+      delete freelist[i];
+  }
+
+  void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) {
+    CandidateHeap& cand = *pcand;
+    for (int i = 0; i < item.j_.size(); ++i) {
+      JVector j = item.j_;
+      ++j[i];
+      if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) {
+        Candidate query_unique(*item.in_edge_, j);
+        if (cs->count(&query_unique) == 0) {
+          Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
+          cand.push_back(new_cand);
+          push_heap(cand.begin(), cand.end(), HeapCandCompare());
+          // insert into uniqueness set outside of assert() so the insert
+          // is not compiled away under NDEBUG
+          const bool is_unique = cs->insert(new_cand).second;
+          assert(is_unique);  // sanity check
+          (void) is_unique;
+        }
+      }
+    }
+  }
+
+  const ModelSet& models;
+  const SentenceMetadata& smeta;
+  const Hypergraph& in;
+  Hypergraph& out;
+
+  vector<CandidateList> D;   // maps nodes in in-HG to the
+                             // equivalent nodes (many due to state
+                             // splits) in the out-HG.
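+                             // e.g., if cube pruning discovers three
+                             // distinct model states at in-node 7, then
+                             // D[7] holds three Candidate*'s, sorted
+                             // best-first by est_prob_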
+  vector<string> node_states_;  // for each node in the out-HG what is +                             // its q function value? +  const int pop_limit_; +}; + +struct NoPruningRescorer { +  NoPruningRescorer(const ModelSet& m, const SentenceMetadata &sm, const Hypergraph& i, Hypergraph* o) : +      models(m), +      smeta(sm), +      in(i), +      out(*o), +      nodemap(i.nodes_.size()) { +    cerr << "  Rescoring forest (full intersection)\n"; +    node_states_.reserve(kRESERVE_NUM_NODES); +  } + +  typedef unordered_map<string, int, boost::hash<string> > State2NodeIndex; + +  void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, State2NodeIndex* state2node) { +    const int arity = in_edge.Arity(); +    Hypergraph::TailNodeVector ends(arity); +    for (int i = 0; i < arity; ++i) +      ends[i] = nodemap[in_edge.tail_nodes_[i]].size(); + +    Hypergraph::TailNodeVector tail_iter(arity, 0); +    bool done = false; +    while (!done) { +      Hypergraph::TailNodeVector tail(arity); +      for (int i = 0; i < arity; ++i) +        tail[i] = nodemap[in_edge.tail_nodes_[i]][tail_iter[i]]; +      Hypergraph::Edge* new_edge = out.AddEdge(in_edge.rule_, tail); +      new_edge->feature_values_ = in_edge.feature_values_; +      new_edge->i_ = in_edge.i_; +      new_edge->j_ = in_edge.j_; +      new_edge->prev_i_ = in_edge.prev_i_; +      new_edge->prev_j_ = in_edge.prev_j_; +      string head_state; +      if (is_goal) { +        assert(tail.size() == 1); +        const string& ant_state = node_states_[tail.front()]; +        models.AddFinalFeatures(ant_state, new_edge); +      } else { +        prob_t edge_estimate; // this is a full intersection, so we disregard this +        models.AddFeaturesToEdge(smeta, out, node_states_, new_edge, &head_state, &edge_estimate); +      } +      int& head_plus1 = (*state2node)[head_state]; +      if (!head_plus1) { +        head_plus1 = out.AddNode(in_edge.rule_->GetLHS())->id_ + 1; +        node_states_.push_back(head_state); +        nodemap[in_edge.head_node_].push_back(head_plus1 - 1); +      } +      const int head_index = head_plus1 - 1; +      out.ConnectEdgeToHeadNode(new_edge->id_, head_index); + +      int ii = 0; +      for (; ii < arity; ++ii) { +        ++tail_iter[ii]; +        if (tail_iter[ii] < ends[ii]) break; +        tail_iter[ii] = 0; +      } +      done = (ii == arity); +    } +  } + +  void ProcessOneNode(const int node_num, const bool is_goal) { +    State2NodeIndex state2node; +    const Hypergraph::Node& node = in.nodes_[node_num]; +    for (int i = 0; i < node.in_edges_.size(); ++i) { +      const Hypergraph::Edge& edge = in.edges_[node.in_edges_[i]]; +      ExpandEdge(edge, is_goal, &state2node); +    } +  } + +  void Apply() { +    int num_nodes = in.nodes_.size(); +    int goal_id = num_nodes - 1; +    int pregoal = goal_id - 1; +    int every = 1; +    if (num_nodes > 100) every = 10; +    assert(in.nodes_[pregoal].out_edges_.size() == 1); +    cerr << "    "; +    for (int i = 0; i < in.nodes_.size(); ++i) { +      if (i % every == 0) cerr << '.'; +      ProcessOneNode(i, i == goal_id); +    } +    cerr << endl; +  } + + private: +  const ModelSet& models; +  const SentenceMetadata& smeta; +  const Hypergraph& in; +  Hypergraph& out; + +  vector<vector<int> > nodemap; +  vector<string> node_states_;  // for each node in the out-HG what is +                             // its q function value? 
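+
+  // Worked example: a binary in-edge whose two tail nodes were split into
+  // 2 and 3 out-nodes is expanded by ExpandEdge into all 2*3 = 6 tail
+  // combinations; tail_iter acts as an odometer, visiting (0,0), (1,0),
+  // (0,1), (1,1), (0,2), (1,2).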
+};
+
+void ApplyModelSet(const Hypergraph& in,
+                   const SentenceMetadata& smeta,
+                   const ModelSet& models,
+                   const IntersectionConfiguration& config,
+                   Hypergraph* out) {
+  // TODO special handling when all models are stateless
+  if (config.algorithm == 1) {
+    int pl = config.pop_limit;
+    if (pl > 100 && in.nodes_.size() > 80000) {
+      pl = 30;  // reduce the pop limit *before* reporting it
+      cerr << "  Note: reducing pop_limit to " << pl << " for very large forest\n";
+    }
+    CubePruningRescorer ma(models, smeta, in, pl, out);
+    ma.Apply();
+  } else if (config.algorithm == 0) {
+    NoPruningRescorer ma(models, smeta, in, out);
+    ma.Apply();
+  } else {
+    cerr << "Don't understand intersection algorithm " << config.algorithm << endl;
+    exit(1);
+  }
+  out->is_linear_chain_ = in.is_linear_chain_;  // TODO remove when this is computed
+                                                // automatically
+}
+
diff --git a/decoder/apply_models.h b/decoder/apply_models.h
new file mode 100644
index 00000000..d6d8b34a
--- /dev/null
+++ b/decoder/apply_models.h
@@ -0,0 +1,20 @@
+#ifndef _APPLY_MODELS_H_
+#define _APPLY_MODELS_H_
+
+struct ModelSet;
+struct Hypergraph;
+struct SentenceMetadata;
+
+struct IntersectionConfiguration {
+  const int algorithm; // 0 = full intersection, 1 = cube pruning
+  const int pop_limit; // max number of pops off the heap at each node
+  IntersectionConfiguration(int alg, int k) : algorithm(alg), pop_limit(k) {}
+};
+
+void ApplyModelSet(const Hypergraph& in,
+                   const SentenceMetadata& smeta,
+                   const ModelSet& models,
+                   const IntersectionConfiguration& config,
+                   Hypergraph* out);
+
+#endif
diff --git a/decoder/array2d.h b/decoder/array2d.h
new file mode 100644
index 00000000..e63eda0d
--- /dev/null
+++ b/decoder/array2d.h
@@ -0,0 +1,172 @@
+#ifndef ARRAY2D_H_
+#define ARRAY2D_H_
+
+#include <iostream>
+#include <algorithm>
+#include <cassert>
+#include <vector>
+#include <string>
+
+template<typename T>
+class Array2D {
+ public:
+  typedef typename std::vector<T>::reference reference;
+  typedef typename std::vector<T>::const_reference const_reference;
+  typedef typename std::vector<T>::iterator iterator;
+  typedef typename std::vector<T>::const_iterator const_iterator;
+  Array2D() : width_(0), height_(0) {}
+  Array2D(int w, int h, const T& d = T()) :
+    width_(w), height_(h), data_(w*h, d) {}
+  Array2D(const Array2D& rhs) :
+    width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {}
+  bool empty() const { return data_.empty(); }
+  void resize(int w, int h, const T& d = T()) {
+    data_.resize(w * h, d);
+    width_ = w;
+    height_ = h;
+  }
+  const Array2D& operator=(const Array2D& rhs) {
+    data_ = rhs.data_;
+    width_ = rhs.width_;
+    height_ = rhs.height_;
+    return *this;
+  }
+  void fill(const T& v) { data_.assign(data_.size(), v); }
+  int width() const { return width_; }
+  int height() const { return height_; }
+  reference operator()(int i, int j) {
+    return data_[offset(i, j)];
+  }
+  void clear() { data_.clear(); width_=0; height_=0; }
+  const_reference operator()(int i, int j) const {
+    return data_[offset(i, j)];
+  }
+  iterator begin_col(int j) {
+    return data_.begin() + offset(0,j);
+  }
+  const_iterator begin_col(int j) const {
+    return data_.begin() + offset(0,j);
+  }
+  iterator end_col(int j) {
+    return data_.begin() + offset(0,j) + width_;
+  }
+  const_iterator end_col(int j) const {
+    return data_.begin() + offset(0,j) + width_;
+  }
+  iterator end() { return data_.end(); }
+  const_iterator end() const { return data_.end(); }
+  const Array2D<T>& operator*=(const T& x) {
+    std::transform(data_.begin(), data_.end(), data_.begin(),
+        std::bind2nd(std::multiplies<T>(), x));
+    return *this;  // was missing: falling off the end is undefined behavior
+  }
+  const Array2D<T>& operator/=(const T& x) {
+    std::transform(data_.begin(), data_.end(), data_.begin(),
+        std::bind2nd(std::divides<T>(), x));
+    return *this;  // was missing
+  }
+  const Array2D<T>& operator+=(const Array2D<T>& m) {
+    std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::plus<T>());
+    return *this;  // was missing
+  }
+  const Array2D<T>& operator-=(const Array2D<T>& m) {
+    std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::minus<T>());
+    return *this;  // was missing
+  }
+
+ private:
+  inline int offset(int i, int j) const {
+    assert(i<width_);
+    assert(j<height_);
+    return i + j * width_;
+  }
+
+  int width_;
+  int height_;
+
+  std::vector<T> data_;
+};
+
+template <typename T>
+Array2D<T> operator*(const Array2D<T>& l, const T& scalar) {
+  Array2D<T> res(l);
+  res *= scalar;
+  return res;
+}
+
+template <typename T>
+Array2D<T> operator*(const T& scalar, const Array2D<T>& l) {
+  Array2D<T> res(l);
+  res *= scalar;
+  return res;
+}
+
+template <typename T>
+Array2D<T> operator/(const Array2D<T>& l, const T& scalar) {
+  Array2D<T> res(l);
+  res /= scalar;
+  return res;
+}
+
+template <typename T>
+Array2D<T> operator+(const Array2D<T>& l, const Array2D<T>& r) {
+  Array2D<T> res(l);
+  res += r;
+  return res;
+}
+
+template <typename T>
+Array2D<T> operator-(const Array2D<T>& l, const Array2D<T>& r) {
+  Array2D<T> res(l);
+  res -= r;
+  return res;
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const Array2D<T>& m) {
+  for (int i=0; i<m.width(); ++i) {
+    for (int j=0; j<m.height(); ++j)
+      os << '\t' << m(i,j);
+    os << '\n';
+  }
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Array2D<bool>& m) {
+  os << ' ';
+  for (int j=0; j<m.height(); ++j)
+    os << (j%10);
+  os << "\n";
+  for (int i=0; i<m.width(); ++i) {
+    os << (i%10);
+    for (int j=0; j<m.height(); ++j)
+      os << (m(i,j) ? '*' : '.');
+    os << (i%10) << "\n";
+  }
+  os << ' ';
+  for (int j=0; j<m.height(); ++j)
+    os << (j%10);
+  os << "\n";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Array2D<std::vector<bool> >& m) {
+  os << ' ';
+  for (int j=0; j<m.height(); ++j)
+    os << (j%10) << "\t";
+  os << "\n";
+  for (int i=0; i<m.width(); ++i) {
+    os << (i%10);
+    for (int j=0; j<m.height(); ++j) {
+      const std::vector<bool>& ar = m(i,j);
+      for (int k=0; k<ar.size(); ++k)
+        os << (ar[k] ? 
'*' : '.'); +    } +    os << "\t"; +    os << (i%10) << "\n"; +  } +  os << ' '; +  for (int j=0; j<m.height(); ++j) +    os << (j%10) << "\t"; +  os << "\n"; +  return os; +} + +#endif + diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc new file mode 100644 index 00000000..dd54a606 --- /dev/null +++ b/decoder/bottom_up_parser.cc @@ -0,0 +1,302 @@ +#include "bottom_up_parser.h" + +#include <iostream> +#include <map> + +#include "hg.h" +#include "array2d.h" +#include "tdict.h" + +using namespace std; + +struct ParserStats { +  ParserStats() : active_items(), passive_items() {} +  void Reset() { active_items=0; passive_items=0; } +  void Report() { +    cerr << "  ACTIVE ITEMS: " << active_items << "\tPASSIVE ITEMS: " << passive_items << endl; +  } +  int active_items; +  int passive_items; +  void NotifyActive(int i, int j) { ++active_items; } +  void NotifyPassive(int i, int j) { ++passive_items; } +}; + +ParserStats stats; + +class ActiveChart; +class PassiveChart { + public: +  PassiveChart(const string& goal, +               const vector<GrammarPtr>& grammars, +               const Lattice& input, +               Hypergraph* forest); +  ~PassiveChart(); + +  inline const vector<int>& operator()(int i, int j) const { return chart_(i,j); } +  bool Parse(); +  inline int size() const { return chart_.width(); } +  inline bool GoalFound() const { return goal_idx_ >= 0; } +  inline int GetGoalIndex() const { return goal_idx_; } + + private: +  void ApplyRules(const int i, +                  const int j, +                  const RuleBin* rules, +                  const Hypergraph::TailNodeVector& tail, +                  const float lattice_cost); + +  void ApplyRule(const int i, +                 const int j, +                 const TRulePtr& r, +                 const Hypergraph::TailNodeVector& ant_nodes, +                 const float lattice_cost); + +  void ApplyUnaryRules(const int i, const int j); + +  const vector<GrammarPtr>& grammars_; +  const Lattice& input_; +  Hypergraph* forest_; +  Array2D<vector<int> > chart_;   // chart_(i,j) is the list of nodes derived spanning i,j +  typedef map<int, int> Cat2NodeMap; +  Array2D<Cat2NodeMap> nodemap_; +  vector<ActiveChart*> act_chart_; +  const WordID goal_cat_;    // category that is being searched for at [0,n] +  TRulePtr goal_rule_; +  int goal_idx_;             // index of goal node, if found +  const int lc_fid_; + +  static WordID kGOAL;       // [Goal] +}; + +WordID PassiveChart::kGOAL = 0; + +class ActiveChart { + public: +  ActiveChart(const Hypergraph* hg, const PassiveChart& psv_chart) : +    hg_(hg), +    act_chart_(psv_chart.size(), psv_chart.size()), psv_chart_(psv_chart) {} + +  struct ActiveItem { +    ActiveItem(const GrammarIter* g, const Hypergraph::TailNodeVector& a, float lcost) : +      gptr_(g), ant_nodes_(a), lattice_cost(lcost) {} +    explicit ActiveItem(const GrammarIter* g) : +      gptr_(g), ant_nodes_(), lattice_cost(0.0) {} + +    void ExtendTerminal(int symbol, float src_cost, vector<ActiveItem>* out_cell) const { +      const GrammarIter* ni = gptr_->Extend(symbol); +      if (ni) { +        stats.NotifyActive(-1,-1);  // TRACKING STATS +        out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost)); +      } +    } +    void ExtendNonTerminal(const Hypergraph* hg, int node_index, vector<ActiveItem>* out_cell) const { +      int symbol = hg->nodes_[node_index].cat_; +      const GrammarIter* ni = gptr_->Extend(symbol); +      if (!ni) return; +      
stats.NotifyActive(-1,-1);  // TRACKING STATS +      Hypergraph::TailNodeVector na(ant_nodes_.size() + 1); +      for (int i = 0; i < ant_nodes_.size(); ++i) +        na[i] = ant_nodes_[i]; +      na[ant_nodes_.size()] = node_index; +      out_cell->push_back(ActiveItem(ni, na, lattice_cost)); +    } + +    const GrammarIter* gptr_; +    Hypergraph::TailNodeVector ant_nodes_; +    float lattice_cost;  // TODO? use SparseVector<double> +  }; + +  inline const vector<ActiveItem>& operator()(int i, int j) const { return act_chart_(i,j); } +  void SeedActiveChart(const Grammar& g) { +    int size = act_chart_.width(); +    for (int i = 0; i < size; ++i) +      if (g.HasRuleForSpan(i,i,0)) +        act_chart_(i,i).push_back(ActiveItem(g.GetRoot())); +  } + +  void ExtendActiveItems(int i, int k, int j) { +    //cerr << "  LOOK(" << i << "," << k << ") for completed items in (" << k << "," << j << ")\n"; +    vector<ActiveItem>& cell = act_chart_(i,j); +    const vector<ActiveItem>& icell = act_chart_(i,k); +    const vector<int>& idxs = psv_chart_(k, j); +    //if (!idxs.empty()) { cerr << "FOUND IN (" << k << "," << j << ")\n"; } +    for (vector<ActiveItem>::const_iterator di = icell.begin(); di != icell.end(); ++di) { +      for (vector<int>::const_iterator ni = idxs.begin(); ni != idxs.end(); ++ni) { +         di->ExtendNonTerminal(hg_, *ni, &cell); +      } +    } +  } + +  void AdvanceDotsForAllItemsInCell(int i, int j, const vector<vector<LatticeArc> >& input) { +    //cerr << "ADVANCE(" << i << "," << j << ")\n"; +    for (int k=i+1; k < j; ++k) +      ExtendActiveItems(i, k, j); + +    const vector<LatticeArc>& out_arcs = input[j-1]; +    for (vector<LatticeArc>::const_iterator ai = out_arcs.begin(); +         ai != out_arcs.end(); ++ai) { +      const WordID& f = ai->label; +      const double& c = ai->cost; +      const int& len = ai->dist2next; +      //VLOG(1) << "F: " << TD::Convert(f) << endl; +      const vector<ActiveItem>& ec = act_chart_(i, j-1); +      for (vector<ActiveItem>::const_iterator di = ec.begin(); di != ec.end(); ++di) +        di->ExtendTerminal(f, c, &act_chart_(i, j + len - 1)); +    } +  } + + private: +  const Hypergraph* hg_; +  Array2D<vector<ActiveItem> > act_chart_; +  const PassiveChart& psv_chart_;  +}; + +PassiveChart::PassiveChart(const string& goal, +                           const vector<GrammarPtr>& grammars, +                           const Lattice& input, +                           Hypergraph* forest) : +    grammars_(grammars), +    input_(input), +    forest_(forest), +    chart_(input.size()+1, input.size()+1), +    nodemap_(input.size()+1, input.size()+1), +    goal_cat_(TD::Convert(goal) * -1), +    goal_rule_(new TRule("[Goal] ||| [" + goal + ",1] ||| [" + goal + ",1]")), +    goal_idx_(-1), +    lc_fid_(FD::Convert("LatticeCost")) { +  act_chart_.resize(grammars_.size()); +  for (int i = 0; i < grammars_.size(); ++i) +    act_chart_[i] = new ActiveChart(forest, *this); +  if (!kGOAL) kGOAL = TD::Convert("Goal") * -1; +  cerr << "  Goal category: [" << goal << ']' << endl; +} + +void PassiveChart::ApplyRule(const int i, +                             const int j, +                             const TRulePtr& r, +                             const Hypergraph::TailNodeVector& ant_nodes, +                             const float lattice_cost) { +  stats.NotifyPassive(i,j);  // TRACKING STATS +  Hypergraph::Edge* new_edge = forest_->AddEdge(r, ant_nodes); +  new_edge->prev_i_ = r->prev_i; +  new_edge->prev_j_ = r->prev_j; +  new_edge->i_ = i; 
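+  // together with i_ above, j_ records the chart cell [i,j) this rule
+  // application was proved in, so spans can be read off the forest later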
+  new_edge->j_ = j; +  new_edge->feature_values_ = r->GetFeatureValues(); +  if (lattice_cost) +    new_edge->feature_values_.set_value(lc_fid_, lattice_cost); +  Cat2NodeMap& c2n = nodemap_(i,j); +  const bool is_goal = (r->GetLHS() == kGOAL); +  const Cat2NodeMap::iterator ni = c2n.find(r->GetLHS()); +  Hypergraph::Node* node = NULL; +  if (ni == c2n.end()) { +    node = forest_->AddNode(r->GetLHS()); +    c2n[r->GetLHS()] = node->id_; +    if (is_goal) { +      assert(goal_idx_ == -1); +      goal_idx_ = node->id_; +    } else { +      chart_(i,j).push_back(node->id_); +    } +  } else { +    node = &forest_->nodes_[ni->second]; +  } +  forest_->ConnectEdgeToHeadNode(new_edge, node); +} + +void PassiveChart::ApplyRules(const int i, +                       const int j, +                       const RuleBin* rules, +                       const Hypergraph::TailNodeVector& tail, +                       const float lattice_cost) { +  const int n = rules->GetNumRules(); +  for (int k = 0; k < n; ++k) +    ApplyRule(i, j, rules->GetIthRule(k), tail, lattice_cost); +} + +void PassiveChart::ApplyUnaryRules(const int i, const int j) { +  const vector<int>& nodes = chart_(i,j);  // reference is important! +  for (int gi = 0; gi < grammars_.size(); ++gi) { +    if (!grammars_[gi]->HasRuleForSpan(i,j,input_.Distance(i,j))) continue; +    for (int di = 0; di < nodes.size(); ++di) { +      const WordID& cat = forest_->nodes_[nodes[di]].cat_; +      const vector<TRulePtr>& unaries = grammars_[gi]->GetUnaryRulesForRHS(cat); +      for (int ri = 0; ri < unaries.size(); ++ri) { +        // cerr << "At (" << i << "," << j << "): applying " << unaries[ri]->AsString() << endl; +        const Hypergraph::TailNodeVector ant(1, nodes[di]); +        ApplyRule(i, j, unaries[ri], ant, 0);  // may update nodes +      } +    } +  } +} + +bool PassiveChart::Parse() { +  forest_->nodes_.reserve(input_.size() * input_.size() * 2); +  forest_->edges_.reserve(input_.size() * input_.size() * 1000);  // TODO: reservation?? 
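+  // The loop below is CKY-style: spans are processed in order of
+  // increasing length l, so every passive item over a strict subspan of
+  // (i,j) already exists by the time cell (i,j) is extended.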
+  goal_idx_ = -1; +  for (int gi = 0; gi < grammars_.size(); ++gi) +    act_chart_[gi]->SeedActiveChart(*grammars_[gi]); + +  cerr << "    "; +  for (int l=1; l<input_.size()+1; ++l) { +    cerr << '.'; +    for (int i=0; i<input_.size() + 1 - l; ++i) { +      int j = i + l; +      for (int gi = 0; gi < grammars_.size(); ++gi) { +        const Grammar& g = *grammars_[gi]; +        if (g.HasRuleForSpan(i, j, input_.Distance(i, j))) { +          act_chart_[gi]->AdvanceDotsForAllItemsInCell(i, j, input_); + +          const vector<ActiveChart::ActiveItem>& cell = (*act_chart_[gi])(i,j); +          for (vector<ActiveChart::ActiveItem>::const_iterator ai = cell.begin(); +               ai != cell.end(); ++ai) { +            const RuleBin* rules = (ai->gptr_->GetRules()); +            if (!rules) continue; +            ApplyRules(i, j, rules, ai->ant_nodes_, ai->lattice_cost); +          } +        } +      } +      ApplyUnaryRules(i,j); + +      for (int gi = 0; gi < grammars_.size(); ++gi) { +        const Grammar& g = *grammars_[gi]; +          // deal with non-terminals that were just proved +          if (g.HasRuleForSpan(i, j, input_.Distance(i,j))) +            act_chart_[gi]->ExtendActiveItems(i, i, j); +      } +    } +    const vector<int>& dh = chart_(0, input_.size()); +    for (int di = 0; di < dh.size(); ++di) { +      const Hypergraph::Node& node = forest_->nodes_[dh[di]]; +      if (node.cat_ == goal_cat_) { +        Hypergraph::TailNodeVector ant(1, node.id_); +        ApplyRule(0, input_.size(), goal_rule_, ant, 0); +      } +    } +  } +  cerr << endl; + +  if (GoalFound()) +    forest_->PruneUnreachable(forest_->nodes_.size() - 1); +  return GoalFound(); +} + +PassiveChart::~PassiveChart() { +  for (int i = 0; i < act_chart_.size(); ++i) +    delete act_chart_[i]; +} + +ExhaustiveBottomUpParser::ExhaustiveBottomUpParser( +    const string& goal_sym, +    const vector<GrammarPtr>& grammars) : +  goal_sym_(goal_sym), +  grammars_(grammars) {} + +bool ExhaustiveBottomUpParser::Parse(const Lattice& input, +                                     Hypergraph* forest) const { +  stats.Reset(); +  PassiveChart chart(goal_sym_, grammars_, input, forest); +  const bool result = chart.Parse(); +  stats.Report(); +  return result; +} diff --git a/decoder/bottom_up_parser.h b/decoder/bottom_up_parser.h new file mode 100644 index 00000000..546bfb54 --- /dev/null +++ b/decoder/bottom_up_parser.h @@ -0,0 +1,27 @@ +#ifndef _BOTTOM_UP_PARSER_H_ +#define _BOTTOM_UP_PARSER_H_ + +#include <vector> +#include <string> + +#include "lattice.h" +#include "grammar.h" + +class Hypergraph; + +class ExhaustiveBottomUpParser { + public: +  ExhaustiveBottomUpParser(const std::string& goal_sym, +                           const std::vector<GrammarPtr>& grammars); + +  // returns true if goal reached spanning the full input +  // forest contains the full (i.e., unpruned) parse forest +  bool Parse(const Lattice& input, +             Hypergraph* forest) const; + + private: +  const std::string goal_sym_; +  const std::vector<GrammarPtr> grammars_; +}; + +#endif diff --git a/decoder/cdec.cc b/decoder/cdec.cc new file mode 100644 index 00000000..dbf32cb3 --- /dev/null +++ b/decoder/cdec.cc @@ -0,0 +1,592 @@ +#include <iostream> +#include <fstream> +#include <tr1/unordered_map> +#include <tr1/unordered_set> + +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "timing_stats.h" +#include "translator.h" +#include 
"phrasebased_translator.h" +#include "aligner.h" +#include "stringlib.h" +#include "forest_writer.h" +#include "hg_io.h" +#include "filelib.h" +#include "sampler.h" +#include "sparse_vector.h" +#include "tagger.h" +#include "lextrans.h" +#include "lexalign.h" +#include "csplit.h" +#include "weights.h" +#include "tdict.h" +#include "ff.h" +#include "ff_factory.h" +#include "hg_intersect.h" +#include "apply_models.h" +#include "viterbi.h" +#include "kbest.h" +#include "inside_outside.h" +#include "exp_semiring.h" +#include "sentence_metadata.h" + +using namespace std; +using namespace std::tr1; +using boost::shared_ptr; +namespace po = boost::program_options; + +// some globals ... +boost::shared_ptr<RandomNumberGenerator<boost::mt19937> > rng; +static const double kMINUS_EPSILON = -1e-6;  // don't be too strict + +namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); } +namespace NgramCache { void Clear(); } + +void ShowBanner() { +  cerr << "cdec v1.0 (c) 2009-2010 by Chris Dyer\n"; +} + +void ConvertSV(const SparseVector<prob_t>& src, SparseVector<double>* trg) { +  for (SparseVector<prob_t>::const_iterator it = src.begin(); it != src.end(); ++it) +    trg->set_value(it->first, it->second); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("formalism,f",po::value<string>(),"Decoding formalism; values include SCFG, FST, PB, LexTrans (lexical translation model, also disc training), CSplit (compound splitting), Tagger (sequence labeling), LexAlign (alignment only, or EM training)") +        ("input,i",po::value<string>()->default_value("-"),"Source file") +        ("grammar,g",po::value<vector<string> >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") +        ("weights,w",po::value<string>(),"Feature weights file") +        ("no_freeze_feature_set,Z", "Do not freeze feature set after reading feature weights file") +        ("feature_function,F",po::value<vector<string> >()->composing(), "Additional feature function(s) (-L for list)") +        ("list_feature_functions,L","List available feature functions") +        ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") +	("k_best,k",po::value<int>(),"Extract the k best derivations") +	("unique_k_best,r", "Unique k-best translation list") +        ("aligner,a", "Run as a word/phrase aligner (src & ref required)") +        ("intersection_strategy,I",po::value<string>()->default_value("cube_pruning"), "Intersection strategy for incorporating finite-state features; values include Cube_pruning, Full") +        ("cubepruning_pop_limit,K",po::value<int>()->default_value(200), "Max number of pops from the candidate heap at each node") +        ("goal",po::value<string>()->default_value("S"),"Goal symbol (SCFG & FST)") +        ("scfg_extra_glue_grammar", po::value<string>(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)") +        ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. 
by default the SCFG decoder adds Hiero glue rules)")
+        ("scfg_default_nt,d",po::value<string>()->default_value("X"),"Default non-terminal symbol in SCFG")
+        ("scfg_max_span_limit,S",po::value<int>()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)")
+        ("show_tree_structure", "Show the Viterbi derivation structure")
+        ("show_expected_length", "Show the expected translation length under the model")
+        ("show_partition,z", "Compute and show the partition (inside score)")
+        ("show_cfg_search_space", "Show the search space as a CFG")
+        ("beam_prune", po::value<double>(), "Prune paths from +LM forest")
+        ("lexalign_use_null", "Support source-side null words in lexical translation")
+        ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
+        ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
+        ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
+        ("extract_rules", po::value<string>(), "Extract the rules used in translation (de-duped) to this file")
+        ("graphviz","Show (constrained) translation forest in GraphViz format")
+        ("max_translation_beam,x", po::value<int>(), "Beam approximation to get max translation from the chart")
+        ("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
+        ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
+        ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
+        ("crf_uniform_empirical", "If there are multiple references (i.e., a lattice), use a uniform distribution rather than posterior weighting a la EM")
+        ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
+        ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients (text or b64)")
+        ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
+        ("forest_output,O",po::value<string>(),"Directory to write forests to")
+        ("minimal_forests,m","Write minimal forests (excludes Rule information). 
Such forests can be used for ML/MAP training, but not rescoring, etc."); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config,c", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); + +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    const string cfg = (*conf)["config"].as<string>(); +    cerr << "Configuration file: " << cfg << endl; +    ifstream config(cfg.c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("list_feature_functions")) { +    cerr << "Available feature functions (specify with -F):\n"; +    global_ff_registry->DisplayList(); +    cerr << endl; +    exit(1); +  } + +  if (conf->count("help") || conf->count("formalism") == 0) { +    cerr << dcmdline_options << endl; +    exit(1); +  } + +  const string formalism = LowercaseString((*conf)["formalism"].as<string>()); +  if (formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign") { +    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lextrans', 'lexalign', or 'tagger'\n"; +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +// TODO move out of cdec into some sampling decoder file +void SampleRecurse(const Hypergraph& hg, const vector<SampleSet>& ss, int n, vector<WordID>* out) { +  const SampleSet& s = ss[n]; +  int i = rng->SelectSample(s); +  const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[i]]; +  vector<vector<WordID> > ants(edge.tail_nodes_.size()); +  for (int j = 0; j < ants.size(); ++j) +    SampleRecurse(hg, ss, edge.tail_nodes_[j], &ants[j]); + +  vector<const vector<WordID>*> pants(ants.size()); +  for (int j = 0; j < ants.size(); ++j) pants[j] = &ants[j]; +  edge.rule_->ESubstitute(pants, out); +} + +struct SampleSort { +  bool operator()(const pair<int,string>& a, const pair<int,string>& b) const { +    return a.first > b.first; +  } +}; + +// TODO move out of cdec into some sampling decoder file +void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) { +  unordered_map<string, int, boost::hash<string> > m; +  hg->PushWeightsToGoal(); +  const int num_nodes = hg->nodes_.size(); +  vector<SampleSet> ss(num_nodes); +  for (int i = 0; i < num_nodes; ++i) { +    SampleSet& s = ss[i]; +    const vector<int>& in_edges = hg->nodes_[i].in_edges_; +    for (int j = 0; j < in_edges.size(); ++j) { +      s.add(hg->edges_[in_edges[j]].edge_prob_); +    } +  } +  for (int i = 0; i < samples; ++i) { +    vector<WordID> yield; +    SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield); +    const string trans = TD::GetString(yield); +    ++m[trans]; +  } +  vector<pair<int, string> > dist; +  for (unordered_map<string, int, boost::hash<string> >::iterator i = m.begin(); +         i != m.end(); ++i) { +    dist.push_back(make_pair(i->second, i->first)); +  } +  sort(dist.begin(), dist.end(), SampleSort()); +  if (k) { +    for (int i = 0; i < k; ++i) +      cout << dist[i].first << " ||| " << dist[i].second << endl; +  } else { +    cout << dist[0].second << endl; +  } +} + +// TODO decoder output should probably be moved to another file +void DumpKBest(const int sent_id, const Hypergraph& forest, const 
int k, const bool unique) { +cerr << "In kbest\n";  + if (unique) { +    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k); +    for (int i = 0; i < k; ++i) { +      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique>::Derivation* d = +        kbest.LazyKthBest(forest.nodes_.size() - 1, i); +      if (!d) break; +      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " +           << d->feature_values << " ||| " << log(d->score) << endl; +    } +  } else { +    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k); +    for (int i = 0; i < k; ++i) { +      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +        kbest.LazyKthBest(forest.nodes_.size() - 1, i); +      if (!d) break; +      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " +           << d->feature_values << " ||| " << log(d->score) << endl; +    } +  } +} + +struct ELengthWeightFunction { +  double operator()(const Hypergraph::Edge& e) const { +    return e.rule_->ELength() - e.rule_->Arity(); +  } +}; + + +struct TRPHash { +  size_t operator()(const TRulePtr& o) const { return reinterpret_cast<size_t>(o.get()); } +}; +static void ExtractRulesDedupe(const Hypergraph& hg, ostream* os) { +  static unordered_set<TRulePtr, TRPHash> written; +  for (int i = 0; i < hg.edges_.size(); ++i) { +    const TRulePtr& rule = hg.edges_[i].rule_; +    if (written.insert(rule).second) { +      (*os) << rule->AsString() << endl; +    } +  } +} + +void register_feature_functions(); + +int main(int argc, char** argv) { +  global_ff_registry.reset(new FFRegistry); +  register_feature_functions(); +  ShowBanner(); +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const bool write_gradient = conf.count("cll_gradient"); +  const bool feature_expectations = conf.count("feature_expectations"); +  if (write_gradient && feature_expectations) { +    cerr << "You can only specify --gradient or --feature_expectations, not both!\n"; +    exit(1); +  } +  const bool output_training_vector = (write_gradient || feature_expectations); + +  boost::shared_ptr<Translator> translator; +  const string formalism = LowercaseString(conf["formalism"].as<string>()); +  const bool csplit_preserve_full_word = conf.count("csplit_preserve_full_word"); +  if (csplit_preserve_full_word && +      (formalism != "csplit" || !conf.count("beam_prune"))) { +    cerr << "--csplit_preserve_full_word should only be " +         << "used with csplit AND --beam_prune!\n"; +    exit(1); +  } +  const bool csplit_output_plf = conf.count("csplit_output_plf"); +  if (csplit_output_plf && formalism != "csplit") { +    cerr << "--csplit_output_plf should only be used with csplit!\n"; +    exit(1); +  } + +  // load feature weights (and possibly freeze feature set) +  vector<double> feature_weights; +  Weights w; +  if (conf.count("weights")) { +    w.InitFromFile(conf["weights"].as<string>()); +    feature_weights.resize(FD::NumFeats()); +    w.InitVector(&feature_weights); +    if (!conf.count("no_freeze_feature_set")) { +      cerr << "Freezing feature set (use --no_freeze_feature_set to change)." 
<< endl; +      FD::Freeze(); +    } +  } + +  // set up translation back end +  if (formalism == "scfg") +    translator.reset(new SCFGTranslator(conf)); +  else if (formalism == "fst") +    translator.reset(new FSTTranslator(conf)); +  else if (formalism == "pb") +    translator.reset(new PhraseBasedTranslator(conf)); +  else if (formalism == "csplit") +    translator.reset(new CompoundSplit(conf)); +  else if (formalism == "lextrans") +    translator.reset(new LexicalTrans(conf)); +  else if (formalism == "lexalign") +    translator.reset(new LexicalAlign(conf)); +  else if (formalism == "tagger") +    translator.reset(new Tagger(conf)); +  else +    assert(!"error"); + +  // set up additional scoring features +  vector<shared_ptr<FeatureFunction> > pffs; +  vector<const FeatureFunction*> late_ffs; +  if (conf.count("feature_function") > 0) { +    const vector<string>& add_ffs = conf["feature_function"].as<vector<string> >(); +    for (int i = 0; i < add_ffs.size(); ++i) { +      string ff, param; +      SplitCommandAndParam(add_ffs[i], &ff, ¶m); +      cerr << "Feature: " << ff; +      if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; +      else cerr << " (no config parameters)\n"; +      shared_ptr<FeatureFunction> pff = global_ff_registry->Create(ff, param); +      if (!pff) { exit(1); } +      // TODO check that multiple features aren't trying to set the same fid +      pffs.push_back(pff); +      late_ffs.push_back(pff.get()); +    } +  } +  ModelSet late_models(feature_weights, late_ffs); +  int palg = 1; +  if (LowercaseString(conf["intersection_strategy"].as<string>()) == "full") { +    palg = 0; +    cerr << "Using full intersection (no pruning).\n"; +  } +  const IntersectionConfiguration inter_conf(palg, conf["cubepruning_pop_limit"].as<int>()); + +  const int sample_max_trans = conf.count("max_translation_sample") ? +    conf["max_translation_sample"].as<int>() : 0; +  if (sample_max_trans) +    rng.reset(new RandomNumberGenerator<boost::mt19937>); +  const bool aligner_mode = conf.count("aligner"); +  const bool minimal_forests = conf.count("minimal_forests"); +  const bool graphviz = conf.count("graphviz"); +  const bool encode_b64 = conf["vector_format"].as<string>() == "b64"; +  const bool kbest = conf.count("k_best"); +  const bool unique_kbest = conf.count("unique_k_best"); +  const bool crf_uniform_empirical = conf.count("crf_uniform_empirical"); +  shared_ptr<WriteFile> extract_file; +  if (conf.count("extract_rules")) +    extract_file.reset(new WriteFile(conf["extract_rules"].as<string>())); + +  int combine_size = conf["combine_size"].as<int>(); +  if (combine_size < 1) combine_size = 1; +  const string input = conf["input"].as<string>(); +  cerr << "Reading input from " << ((input == "-") ? 
"STDIN" : input.c_str()) << endl; +  ReadFile in_read(input); +  istream *in = in_read.stream(); +  assert(*in); + +  SparseVector<prob_t> acc_vec;  // accumulate gradient +  double acc_obj = 0; // accumulate objective +  int g_count = 0;    // number of gradient pieces computed +  int sent_id = -1;         // line counter + +  while(*in) { +    NgramCache::Clear();   // clear ngram cache for remote LM (if used) +    Timer::Summarize(); +    ++sent_id; +    string buf; +    getline(*in, buf); +    if (buf.empty()) continue; +    map<string, string> sgml; +    ProcessAndStripSGML(&buf, &sgml); +    if (sgml.find("id") != sgml.end()) +      sent_id = atoi(sgml["id"].c_str()); + +    cerr << "\nINPUT: "; +    if (buf.size() < 100) +      cerr << buf << endl; +    else { +     size_t x = buf.rfind(" ", 100); +     if (x == string::npos) x = 100; +     cerr << buf.substr(0, x) << " ..." << endl; +    } +    cerr << "  id = " << sent_id << endl; +    string to_translate; +    Lattice ref; +    ParseTranslatorInputLattice(buf, &to_translate, &ref); +    const bool has_ref = ref.size() > 0; +    SentenceMetadata smeta(sent_id, ref); +    const bool hadoop_counters = (write_gradient); +    Hypergraph forest;          // -LM forest +    translator->ProcessMarkupHints(sgml); +    Timer t("Translation"); +    const bool translation_successful = +      translator->Translate(to_translate, &smeta, feature_weights, &forest); +    translator->SentenceComplete(); +    if (!translation_successful) { +      cerr << "  NO PARSE FOUND.\n"; +      if (hadoop_counters) +        cerr << "reporter:counter:UserCounters,FParseFailed,1" << endl; +      cout << endl << flush; +      continue; +    } +    cerr << "  -LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; +    cerr << "  -LM forest       (paths): " << forest.NumberOfPaths() << endl; +    if (conf.count("show_expected_length")) { +      const PRPair<double, double> res = +        Inside<PRPair<double, double>, +               PRWeightFunction<double, EdgeProb, double, ELengthWeightFunction> >(forest); +      cerr << "  Expected length  (words): " << res.r / res.p << "\t" << res << endl; +    } +    if (conf.count("show_partition")) { +      const prob_t z = Inside<prob_t, EdgeProb>(forest); +      cerr << "  -LM partition     log(Z): " << log(z) << endl; +    } +    if (extract_file) +      ExtractRulesDedupe(forest, extract_file->stream()); +    vector<WordID> trans; +    const prob_t vs = ViterbiESentence(forest, &trans); +    cerr << "  -LM Viterbi: " << TD::GetString(trans) << endl; +    if (conf.count("show_tree_structure")) +      cerr << "  -LM    tree: " << ViterbiETree(forest) << endl;; +    cerr << "  -LM Viterbi: " << log(vs) << endl; + +    bool has_late_models = !late_models.empty(); +    if (has_late_models) { +      forest.Reweight(feature_weights); +      forest.SortInEdgesByEdgeWeights(); +      Hypergraph lm_forest; +      ApplyModelSet(forest, +                    smeta, +                    late_models, +                    inter_conf, +                    &lm_forest); +      forest.swap(lm_forest); +      forest.Reweight(feature_weights); +      trans.clear(); +      ViterbiESentence(forest, &trans); +      cerr << "  +LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; +      cerr << "  +LM forest       (paths): " << forest.NumberOfPaths() << endl; +      cerr << "  +LM Viterbi: " << TD::GetString(trans) << endl; +    } +    if (conf.count("beam_prune")) { +   
   vector<bool> preserve_mask(forest.edges_.size(), false); +      if (csplit_preserve_full_word) +        preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true; +      forest.BeamPruneInsideOutside(1.0, false, conf["beam_prune"].as<double>(), &preserve_mask); +      cerr << "  Pruned forest    (paths): " << forest.NumberOfPaths() << endl; +    } + +    if (conf.count("forest_output") && !has_ref) { +      ForestWriter writer(conf["forest_output"].as<string>(), sent_id); +      if (FileExists(writer.fname_)) { +        cerr << "  Unioning...\n"; +        Hypergraph new_hg; +        { +          ReadFile rf(writer.fname_); +          bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg); +          assert(succeeded); +        } +        new_hg.Union(forest); +        bool succeeded = writer.Write(new_hg, minimal_forests); +        assert(succeeded); +      } else { +        bool succeeded = writer.Write(forest, minimal_forests); +        assert(succeeded); +      } +    } + +    if (sample_max_trans) { +      MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as<int>() : 0); +    } else { +      if (kbest) { +        DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest); +      } else if (csplit_output_plf) { +        cout << HypergraphIO::AsPLF(forest, false) << endl; +      } else { +        if (!graphviz && !has_ref) { +          cout << TD::GetString(trans) << endl << flush; +        } +      } +    } + +    const int max_trans_beam_size = conf.count("max_translation_beam") ? +      conf["max_translation_beam"].as<int>() : 0; +    if (max_trans_beam_size) { +      Hack::MaxTrans(forest, max_trans_beam_size); +      continue; +    } + +    if (graphviz && !has_ref) forest.PrintGraphviz(); + +    // the following are only used if write_gradient is true! +    SparseVector<prob_t> full_exp, ref_exp, gradient; +    double log_z = 0, log_ref_z = 0; +    if (write_gradient) { +      const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &full_exp); +      log_z = log(z); +      full_exp /= z; +    } +    if (conf.count("show_cfg_search_space")) +      HypergraphIO::WriteAsCFG(forest); +    if (has_ref) { +      if (HG::Intersect(ref, &forest)) { +        cerr << "  Constr. forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; +        cerr << "  Constr. forest       (paths): " << forest.NumberOfPaths() << endl; +        if (crf_uniform_empirical) { +          cerr << "  USING UNIFORM WEIGHTS\n"; +          for (int i = 0; i < forest.edges_.size(); ++i) +            forest.edges_[i].edge_prob_=prob_t::One(); +        } else { +          forest.Reweight(feature_weights); +          cerr << "  Constr. VitTree: " << ViterbiFTree(forest) << endl; +        } +	if (hadoop_counters) +          cerr << "reporter:counter:UserCounters,SentencePairsParsed,1" << endl; +        if (conf.count("show_partition")) { +           const prob_t z = Inside<prob_t, EdgeProb>(forest); +           cerr << "  Contst. 
partition  log(Z): " << log(z) << endl; +        } +        //DumpKBest(sent_id, forest, 1000); +        if (conf.count("forest_output")) { +          ForestWriter writer(conf["forest_output"].as<string>(), sent_id); +          if (FileExists(writer.fname_)) { +            cerr << "  Unioning...\n"; +            Hypergraph new_hg; +            { +              ReadFile rf(writer.fname_); +              bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg); +              assert(succeeded); +            } +            new_hg.Union(forest); +            bool succeeded = writer.Write(new_hg, minimal_forests); +            assert(succeeded); +          } else { +            bool succeeded = writer.Write(forest, minimal_forests); +            assert(succeeded); +          } +        } +        if (aligner_mode && !output_training_vector) +          AlignerTools::WriteAlignment(smeta.GetSourceLattice(), smeta.GetReference(), forest, &cout); +        if (write_gradient) { +          const prob_t ref_z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp); +          ref_exp /= ref_z; +          if (crf_uniform_empirical) { +            log_ref_z = ref_exp.dot(feature_weights); +          } else { +            log_ref_z = log(ref_z); +          } +          //cerr << "      MODEL LOG Z: " << log_z << endl; +          //cerr << "  EMPIRICAL LOG Z: " << log_ref_z << endl; +          if ((log_z - log_ref_z) < kMINUS_EPSILON) { +            cerr << "DIFF. ERR! log_z < log_ref_z: " << log_z << " " << log_ref_z << endl; +            exit(1); +          } +          assert(!isnan(log_ref_z)); +          ref_exp -= full_exp; +          acc_vec += ref_exp; +          acc_obj += (log_z - log_ref_z); +        } +        if (feature_expectations) { +          const prob_t z =  +            InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp); +          ref_exp /= z; +          acc_obj += log(z); +          acc_vec += ref_exp; +        } + +        if (output_training_vector) { +          acc_vec.clear_value(0); +          ++g_count; +          if (g_count % combine_size == 0) { +            if (encode_b64) { +              cout << "0\t"; +              SparseVector<double> dav; ConvertSV(acc_vec, &dav); +              B64::Encode(acc_obj, dav, &cout); +              cout << endl << flush; +            } else { +              cout << "0\t**OBJ**=" << acc_obj << ';' <<  acc_vec << endl << flush; +            } +            acc_vec.clear(); +            acc_obj = 0; +          } +        } +        if (conf.count("graphviz")) forest.PrintGraphviz(); +      } else { +        cerr << "  REFERENCE UNREACHABLE.\n"; +        if (write_gradient) { +	  if (hadoop_counters) +            cerr << "reporter:counter:UserCounters,EFParseFailed,1" << endl; +          cout << endl << flush; +	} +      } +    } +  } +  if (output_training_vector && !acc_vec.empty()) { +    if (encode_b64) { +      cout << "0\t"; +      SparseVector<double> dav; ConvertSV(acc_vec, &dav); +      B64::Encode(acc_obj, dav, &cout); +      cout << endl << flush; +    } else { +      cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; +    } +  } +} + diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc new file mode 100644 index 00000000..d0b93795 --- /dev/null +++ b/decoder/cdec_ff.cc @@ -0,0 +1,32 @@ +#include <boost/shared_ptr.hpp> + +#include "ff.h" +#include "ff_lm.h" +#include "ff_csplit.h" +#include 
"ff_wordalign.h" +#include "ff_tagger.h" +#include "ff_factory.h" + +boost::shared_ptr<FFRegistry> global_ff_registry; + +void register_feature_functions() { +  global_ff_registry->Register("LanguageModel", new FFFactory<LanguageModel>); +#ifdef HAVE_RANDLM +  global_ff_registry->Register("RandLM", new FFFactory<LanguageModelRandLM>); +#endif +  global_ff_registry->Register("WordPenalty", new FFFactory<WordPenalty>); +  global_ff_registry->Register("SourceWordPenalty", new FFFactory<SourceWordPenalty>); +  global_ff_registry->Register("ArityPenalty", new FFFactory<ArityPenalty>); +  global_ff_registry->Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>); +  global_ff_registry->Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>); +  global_ff_registry->Register("MarkovJump", new FFFactory<MarkovJump>); +  global_ff_registry->Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>); +  global_ff_registry->Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>); +  global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>); +  global_ff_registry->Register("AlignerResults", new FFFactory<AlignerResults>); +  global_ff_registry->Register("CSplit_BasicFeatures", new FFFactory<BasicCSplitFeatures>); +  global_ff_registry->Register("CSplit_ReverseCharLM", new FFFactory<ReverseCharLMCSplitFeature>); +  global_ff_registry->Register("Tagger_BigramIdentity", new FFFactory<Tagger_BigramIdentity>); +  global_ff_registry->Register("LexicalPairIdentity", new FFFactory<LexicalPairIdentity>); +}; + diff --git a/decoder/csplit.cc b/decoder/csplit.cc new file mode 100644 index 00000000..b1a30fb0 --- /dev/null +++ b/decoder/csplit.cc @@ -0,0 +1,173 @@ +#include "csplit.h" + +#include <iostream> + +#include "filelib.h" +#include "stringlib.h" +#include "hg.h" +#include "tdict.h" +#include "grammar.h" +#include "sentence_metadata.h" + +using namespace std; + +struct CompoundSplitImpl { +  CompoundSplitImpl(const boost::program_options::variables_map& conf) : +      fugen_elements_(true),   // TODO configure +      min_size_(3), +      kXCAT(TD::Convert("X")*-1), +      kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")), +      kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? 
||| [1] ?")), +      kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")), +      kFUGEN_S(FD::Convert("FugS")), +      kFUGEN_N(FD::Convert("FugN")) {} + +  void PasteTogetherStrings(const vector<string>& chars, +                            const int i, +                            const int j, +                            string* yield) { +    int size = 0; +    for (int k=i; k<j; ++k) +      size += chars[k].size(); +    yield->resize(size); +    int cur = 0; +    for (int k=i; k<j; ++k) { +      const string& cs = chars[k]; +      for (int l = 0; l < cs.size(); ++l) +        (*yield)[cur++] = cs[l]; +    } +  } + +  void BuildTrellis(const vector<string>& chars, +                    Hypergraph* forest) { +    vector<int> nodes(chars.size()+1, -1); +    nodes[0] = forest->AddNode(kXCAT)->id_;       // source +    const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; +    forest->ConnectEdgeToHeadNode(left_rule, nodes[0]); + +    const int max_split_ = max(static_cast<int>(chars.size()) - min_size_ + 1, 1); +    cerr << "max: " << max_split_ << "  " << " min: " << min_size_ << endl; +    for (int i = min_size_; i < max_split_; ++i) +      nodes[i] = forest->AddNode(kXCAT)->id_; +    assert(nodes.back() == -1); +    nodes.back() = forest->AddNode(kXCAT)->id_;   // sink + +    for (int i = 0; i < max_split_; ++i) { +      if (nodes[i] < 0) continue; +      const int start = min(i + min_size_, static_cast<int>(chars.size())); +      for (int j = start; j <= chars.size(); ++j) { +        if (nodes[j] < 0) continue; +        string yield; +        PasteTogetherStrings(chars, i, j, &yield); +        // cerr << "[" << i << "," << j << "] " << yield << endl; +        TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE)); +        rule->e_[1] = rule->f_[1] = TD::Convert(yield); +        // cerr << rule->AsString() << endl; +        int edge = forest->AddEdge( +          rule, +          Hypergraph::TailNodeVector(1, nodes[i]))->id_; +        forest->ConnectEdgeToHeadNode(edge, nodes[j]); +        forest->edges_[edge].i_ = i; +        forest->edges_[edge].j_ = j; + +        // handle "fugenelemente" here +        // don't delete "fugenelemente" at the end of words +        if (fugen_elements_ && j != chars.size()) { +          const int len = yield.size(); +          string alt; +          int fid = 0; +          if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { +            alt = yield.substr(0, len - 2); +            fid = kFUGEN_S; +          } else if (len > (min_size_ + 1) && yield[len-1] == 's') { +            alt = yield.substr(0, len - 1); +            fid = kFUGEN_S; +          } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { +            alt = yield.substr(0, len - 1); +            fid = kFUGEN_N; +          } +          if (alt.size()) { +            TRulePtr altrule = TRulePtr(new TRule(*rule)); +            altrule->e_[1] = TD::Convert(alt); +            // cerr << altrule->AsString() << endl; +            int edge = forest->AddEdge( +              altrule, +              Hypergraph::TailNodeVector(1, nodes[i]))->id_; +            forest->ConnectEdgeToHeadNode(edge, nodes[j]); +            forest->edges_[edge].feature_values_.set_value(fid, 1.0); +            forest->edges_[edge].i_ = i; +            forest->edges_[edge].j_ = j; +          } +        } +      } +    } + +    // add goal rule +    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); +    Hypergraph::Node* goal = 
forest->AddNode(TD::Convert("Goal")*-1);
+    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+    forest->ConnectEdgeToHeadNode(hg_edge, goal);
+  }
+ private:
+  const bool fugen_elements_;
+  const int min_size_;
+  const WordID kXCAT;
+  const TRulePtr kWORDBREAK_RULE;
+  const TRulePtr kTEMPLATE_RULE;
+  const TRulePtr kGOAL_RULE;
+  const int kFUGEN_S;
+  const int kFUGEN_N;
+};
+
+CompoundSplit::CompoundSplit(const boost::program_options::variables_map& conf) :
+  pimpl_(new CompoundSplitImpl(conf)) {}
+
+static void SplitUTF8String(const string& in, vector<string>* out) {
+  out->resize(in.size());
+  int i = 0;
+  int c = 0;
+  while (i < in.size()) {
+    const int len = UTF8Len(in[i]);
+    assert(len);
+    (*out)[c] = in.substr(i, len);
+    ++c;
+    i += len;
+  }
+  out->resize(c);
+}
+
+bool CompoundSplit::TranslateImpl(const string& input,
+                                  SentenceMetadata* smeta,
+                                  const vector<double>& weights,
+                                  Hypergraph* forest) {
+  if (input.find(" ") != string::npos) {
+    cerr << "  BAD INPUT: " << input << "\n    CompoundSplit expects single words\n";
+    abort();
+  }
+  vector<string> in;
+  SplitUTF8String(input, &in);
+  smeta->SetSourceLength(in.size());  // TODO do utf8 or something
+  for (int i = 0; i < in.size(); ++i)
+    smeta->src_lattice_.push_back(vector<LatticeArc>(1, LatticeArc(TD::Convert(in[i]), 0.0, 1)));
+  pimpl_->BuildTrellis(in, forest);
+  forest->Reweight(weights);
+  return true;
+}
+
+int CompoundSplit::GetFullWordEdgeIndex(const Hypergraph& forest) {
+  assert(forest.nodes_.size() > 0);
+  const vector<int>& out_edges = forest.nodes_[0].out_edges_;
+  int max_edge = -1;
+  int max_j = -1;
+  for (int i = 0; i < out_edges.size(); ++i) {
+    const int j = forest.edges_[out_edges[i]].j_;
+    if (j > max_j) {
+      max_j = j;
+      max_edge = out_edges[i];
+    }
+  }
+  assert(max_edge >= 0);
+  assert(max_edge < forest.edges_.size());
+  return max_edge;
+}
+
diff --git a/decoder/csplit.h b/decoder/csplit.h
new file mode 100644
index 00000000..82ed23fc
--- /dev/null
+++ b/decoder/csplit.h
@@ -0,0 +1,30 @@
+#ifndef _CSPLIT_H_
+#define _CSPLIT_H_
+
+#include "translator.h"
+#include "lattice.h"
+
+// this "translator" takes single words (with NO SPACES) and segments
+// them using the approach described in:
+//
+// C. Dyer. (2009) Using a maximum entropy model to build segmentation
+//                 lattices for MT. In Proceedings of NAACL HLT 2009.
+// Note: an extra word space marker # is inserted at the left edge of
+// the forest!
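Reviewer's sketch, not part of this diff: how the translator described in the
comment above and the static GetFullWordEdgeIndex helper declared below fit
together. The SentenceMetadata instance, `conf`, and `weights` are assumed to
come from the decoder's normal per-sentence setup (outside this diff), and the
input word is just an arbitrary sample German compound.

    void DemoCompoundSplit(const boost::program_options::variables_map& conf,
                           const std::vector<double>& weights,
                           SentenceMetadata* smeta) {
      CompoundSplit splitter(conf);
      Hypergraph forest;
      if (splitter.TranslateImpl("tonbandaufnahme", smeta, weights, &forest)) {
        // every segmentation of the word is now a path through `forest`;
        // the edge leaving node 0 with the largest span is the unsegmented reading
        const int full = CompoundSplit::GetFullWordEdgeIndex(forest);
        std::cerr << forest.edges_[full].rule_->AsString() << std::endl;
      }
    }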
+struct CompoundSplitImpl; +struct CompoundSplit : public Translator { +  CompoundSplit(const boost::program_options::variables_map& conf); +  bool TranslateImpl(const std::string& input, +                 SentenceMetadata* smeta, +                 const std::vector<double>& weights, +                 Hypergraph* forest); + +  // given a forest generated by CompoundSplit::Translate, +  // find the edge representing the unsegmented form +  static int GetFullWordEdgeIndex(const Hypergraph& forest); + + private: +  boost::shared_ptr<CompoundSplitImpl> pimpl_; +}; + +#endif diff --git a/decoder/dict.h b/decoder/dict.h new file mode 100644 index 00000000..72e82e6d --- /dev/null +++ b/decoder/dict.h @@ -0,0 +1,43 @@ +#ifndef DICT_H_ +#define DICT_H_ + +#include <cassert> +#include <cstring> +#include <tr1/unordered_map> +#include <string> +#include <vector> + +#include <boost/functional/hash.hpp> + +#include "wordid.h" + +class Dict { + typedef std::tr1::unordered_map<std::string, WordID, boost::hash<std::string> > Map; + public: +  Dict() : b0_("<bad0>") { words_.reserve(1000); } +  inline int max() const { return words_.size(); } +  inline WordID Convert(const std::string& word, bool frozen = false) { +    Map::iterator i = d_.find(word); +    if (i == d_.end()) { +      if (frozen) +        return 0; +      words_.push_back(word); +      d_[word] = words_.size(); +      return words_.size(); +    } else { +      return i->second; +    } +  } +  inline const std::string& Convert(const WordID& id) const { +    if (id == 0) return b0_; +    assert(id <= words_.size()); +    return words_[id-1]; +  } +  void clear() { words_.clear(); d_.clear(); } + private: +  const std::string b0_; +  std::vector<std::string> words_; +  Map d_; +}; + +#endif diff --git a/decoder/dict_test.cc b/decoder/dict_test.cc new file mode 100644 index 00000000..694877fa --- /dev/null +++ b/decoder/dict_test.cc @@ -0,0 +1,50 @@ +#include "dict.h" + +#include "fdict.h" + +#include <iostream> +#include <gtest/gtest.h> +#include <cassert> +#include "filelib.h" + +#include "tdict.h" + +using namespace std; + +class DTest : public testing::Test { + public: +  DTest() {} + protected: +  virtual void SetUp() { } +  virtual void TearDown() { } +}; + +TEST_F(DTest, Convert) { +  Dict d; +  WordID a = d.Convert("foo"); +  WordID b = d.Convert("bar"); +  std::string x = "foo"; +  WordID c = d.Convert(x); +  EXPECT_NE(a, b); +  EXPECT_EQ(a, c); +  EXPECT_EQ(d.Convert(a), "foo"); +  EXPECT_EQ(d.Convert(b), "bar"); +} + +TEST_F(DTest, FDictTest) { +  int fid = FD::Convert("First"); +  EXPECT_GT(fid, 0); +  EXPECT_EQ(FD::Convert(fid), "First"); +  string x = FD::Escape("="); +  cerr << x << endl; +  EXPECT_NE(x, "="); +  x = FD::Escape(";"); +  cerr << x << endl; +  EXPECT_NE(x, ";"); +} + +int main(int argc, char** argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} + diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc new file mode 100644 index 00000000..f6a01e52 --- /dev/null +++ b/decoder/earley_composer.cc @@ -0,0 +1,726 @@ +#include "earley_composer.h" + +#include <iostream> +#include <fstream> +#include <map> +#include <queue> +#include <tr1/unordered_set> + +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> +#include <boost/lexical_cast.hpp> + +#include "phrasetable_fst.h" +#include "sparse_vector.h" +#include "tdict.h" +#include "hg.h" + +using boost::shared_ptr; +namespace po = boost::program_options; +using 
namespace std; +using namespace std::tr1; + +// Define the following macro if you want to see lots of debugging output +// when you run the chart parser +#undef DEBUG_CHART_PARSER + +// A few constants used by the chart parser /////////////// +static const int kMAX_NODES = 2000000; +static const string kPHRASE_STRING = "X"; +static bool constants_need_init = true; +static WordID kUNIQUE_START; +static WordID kPHRASE; +static TRulePtr kX1X2; +static TRulePtr kX1; +static WordID kEPS; +static TRulePtr kEPSRule; + +static void InitializeConstants() { +  if (constants_need_init) { +    kPHRASE = TD::Convert(kPHRASE_STRING) * -1; +    kUNIQUE_START = TD::Convert("S") * -1; +    kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); +    kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); +    kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>")); +    kEPS = TD::Convert("<eps>"); +    constants_need_init = false; +  } +} +//////////////////////////////////////////////////////////// + +class EGrammarNode { +  friend bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); +  friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g); + public: +#ifdef DEBUG_CHART_PARSER +  string hint; +#endif +  EGrammarNode() : is_some_rule_complete(false), is_root(false) {} +  const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; } +  const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; } +  bool HasNonTerminals() const { return (!ntptr.empty()); } +  bool HasTerminals() const { return (!tptr.empty()); } +  bool RuleCompletes() const { +    return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); +  } +  bool GrammarContinues() const { +    return !(ntptr.empty() && tptr.empty()); +  } +  bool IsRoot() const { +    return is_root; +  } +  // these are the features associated with the rule from the start +  // node up to this point.  If you use these features, you must +  // not Extend() this rule. 
+  const SparseVector<double>& GetCFGProductionFeatures() const { +    return input_features; +  } + +  const EGrammarNode* Extend(const WordID& t) const { +    if (t < 0) { +      map<WordID, EGrammarNode>::const_iterator it = ntptr.find(t); +      if (it == ntptr.end()) return NULL; +      return &it->second; +    } else { +      map<WordID, EGrammarNode>::const_iterator it = tptr.find(t); +      if (it == tptr.end()) return NULL; +      return &it->second; +    } +  } + + private: +  map<WordID, EGrammarNode> tptr; +  map<WordID, EGrammarNode> ntptr; +  SparseVector<double> input_features; +  bool is_some_rule_complete; +  bool is_root; +}; +typedef map<WordID, EGrammarNode> EGrammar;    // indexed by the rule LHS + +// edges are immutable once created +struct Edge { +#ifdef DEBUG_CHART_PARSER +  static int id_count; +  const int id; +#endif +  const WordID cat;                   // lhs side of rule proved/being proved +  const EGrammarNode* const dot;      // dot position +  const FSTNode* const q;             // start of span +  const FSTNode* const r;             // end of span +  const Edge* const active_parent;    // back pointer, NULL for PREDICT items +  const Edge* const passive_parent;   // back pointer, NULL for SCAN and PREDICT items +  const TargetPhraseSet* const tps;   // translations +  shared_ptr<SparseVector<double> > features; // features from CFG rule + +  bool IsPassive() const { +    // when a rule is completed, this value will be set +    return static_cast<bool>(features); +  } +  bool IsActive() const { return !IsPassive(); } +  bool IsInitial() const { +    return !(active_parent || passive_parent); +  } +  bool IsCreatedByScan() const { +    return active_parent && !passive_parent && !dot->IsRoot(); +  } +  bool IsCreatedByPredict() const { +    return dot->IsRoot(); +  } +  bool IsCreatedByComplete() const { +    return active_parent && passive_parent; +  } + +  // constructor for PREDICT +  Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} +  Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r, const Edge* act_parent) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps(NULL) {} + +  // constructors for SCAN +  Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, +       const Edge* act_par, const TargetPhraseSet* translations) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} + +  Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, +       const Edge* act_par, const TargetPhraseSet* translations, +       const SparseVector<double>& feats) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), +    features(new SparseVector<double>(feats)) {} + +  // constructors for COMPLETE +  Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, +       const Edge* act_par, const Edge *pas_par) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL) { +      assert(pas_par->IsPassive()); +      assert(act_par->IsActive()); +    } + +  
Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j,
+       const Edge* act_par, const Edge *pas_par, const SparseVector<double>& feats) :
+#ifdef DEBUG_CHART_PARSER
+    id(++id_count),
+#endif
+    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL),
+    features(new SparseVector<double>(feats)) {
+      assert(pas_par->IsPassive());
+      assert(act_par->IsActive());
+    }
+
+  // constructor for COMPLETE query
+  Edge(const FSTNode* _r) :
+#ifdef DEBUG_CHART_PARSER
+    id(0),
+#endif
+    cat(0), dot(NULL), q(NULL),
+    r(_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {}
+  // constructor for MERGE query
+  Edge(const FSTNode* _q, int) :
+#ifdef DEBUG_CHART_PARSER
+    id(0),
+#endif
+    cat(0), dot(NULL), q(_q),
+    r(NULL), active_parent(NULL), passive_parent(NULL), tps(NULL) {}
+};
+#ifdef DEBUG_CHART_PARSER
+int Edge::id_count = 0;
+#endif
+
+ostream& operator<<(ostream& os, const Edge& e) {
+  string type = "PREDICT";
+  if (e.IsCreatedByScan())
+    type = "SCAN";
+  else if (e.IsCreatedByComplete())
+    type = "COMPLETE";
+  os << "["
+#ifdef DEBUG_CHART_PARSER
+     << '(' << e.id << ") "
+#else
+     << '(' << &e << ") "
+#endif
+     << "q=" << e.q << ", r=" << e.r
+     << ", cat=" << TD::Convert(e.cat*-1) << ", dot="
+     << e.dot
+#ifdef DEBUG_CHART_PARSER
+     << e.dot->hint
+#endif
+     << (e.IsActive() ? ", Active" : ", Passive")
+     << ", " << type;
+#ifdef DEBUG_CHART_PARSER
+  if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; }
+  if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; }
+#endif
+  if (e.tps) { os << ", tps=" << e.tps; }
+  return os << ']';
+}
+
+struct Traversal {
+  const Edge* const edge;      // result from the active / passive combination
+  const Edge* const active;
+  const Edge* const passive;
+  Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {}
+};
+
+struct UniqueTraversalHash {
+  size_t operator()(const Traversal* t) const {
+    size_t x = 5381;
+    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->active);
+    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->passive);
+    x = ((x << 5) + x) ^ t->edge->IsActive();
+    return x;
+  }
+};
+
+struct UniqueTraversalEquals {
+  bool operator()(const Traversal* a, const Traversal* b) const {
+    return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive());
+  }
+};
+
+struct UniqueEdgeHash {
+  size_t operator()(const Edge* e) const {
+    size_t x = 5381;
+    if (e->IsActive()) {
+      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->dot);
+      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
+      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
+      x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
+      x += 13;
+    } else {  // with passive edges, we don't care about the dot
+      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
+      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
+      x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
+    }
+    return x;
+  }
+};
+
+struct UniqueEdgeEquals {
+  bool operator()(const Edge* a, const Edge* b) const {
+    if (a->IsActive() != b->IsActive()) return false;
+    if (a->IsActive()) {
+      return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r);
+    } else {
+      return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r);
+    }
+  }
+};
+
+struct REdgeHash {
+  size_t operator()(const Edge* e) const {
+    size_t x = 5381;
+    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
+    return x;
+  }
+};
+
+struct REdgeEquals {
+  bool operator()(const Edge* a, const Edge* b) const {
+    return (a->r == b->r);
+  }
+};
+
+struct QEdgeHash {
+  size_t operator()(const Edge* e) const {
+    size_t x = 5381;
+    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
+    return x;
+  }
+};
+
+struct QEdgeEquals {
+  bool operator()(const Edge* a, const Edge* b) const {
+    return (a->q == b->q);
+  }
+};
+
+struct EdgeQueue {
+  queue<const Edge*> q;
+  EdgeQueue() {}
+  void clear() { while(!q.empty()) q.pop(); }
+  bool HasWork() const { return !q.empty(); }
+  const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; }
+  void AddEdge(const Edge* s) { q.push(s); }
+};
+
+class EarleyComposerImpl {
+ public:
+  EarleyComposerImpl(WordID start_cat, const FSTNode& q_0) : start_cat_(start_cat), q_0_(&q_0) {}
+
+  // returns false if the intersection is empty
+  bool Compose(const EGrammar& g, Hypergraph* forest) {
+    goal_node = NULL;
+    EGrammar::const_iterator sit = g.find(start_cat_);
+    forest->ReserveNodes(kMAX_NODES);
+    assert(sit != g.end());
+    Edge* init = new Edge(start_cat_, &sit->second, q_0_);
+    // keep the call outside of assert() so it still runs when NDEBUG is defined
+    const bool init_ok = IncorporateNewEdge(init);
+    assert(init_ok);
+    (void) init_ok;
+    while (exp_agenda.HasWork() || agenda.HasWork()) {
+      while(exp_agenda.HasWork()) {
+        const Edge* edge = exp_agenda.Next();
+        FinishEdge(edge, forest);
+      }
+      if (agenda.HasWork()) {
+        const Edge* edge = agenda.Next();
+#ifdef DEBUG_CHART_PARSER
+        cerr << "processing (" << edge->id << ')' << endl;
+#endif
+        if (edge->IsActive()) {
+          if (edge->dot->HasTerminals())
+            DoScan(edge);
+          if (edge->dot->HasNonTerminals()) {
+            DoMergeWithPassives(edge);
+            DoPredict(edge, g);
+          }
+        } else {
+          DoComplete(edge);
+        }
+      }
+    }
+    if (goal_node) {
+      forest->PruneUnreachable(goal_node->id_);
+      forest->EpsilonRemove(kEPS);
+    }
+    FreeAll();
+    return goal_node;
+  }
+
+  void FreeAll() {
+    for (int i = 0; i < free_list_.size(); ++i)
+      delete free_list_[i];
+    free_list_.clear();
+    for (int i = 0; i < traversal_free_list_.size(); ++i)
+      delete traversal_free_list_[i];
+    traversal_free_list_.clear();
+    all_traversals.clear();
+    exp_agenda.clear();
+    agenda.clear();
+    tps2node.clear();
+    edge2node.clear();
+    all_edges.clear();
+    passive_edges.clear();
+    active_edges.clear();
+  }
+
+  ~EarleyComposerImpl() {
+    FreeAll();
+  }
+
+  // returns the total number of edges created during composition
+  int EdgesCreated() const {
+    return free_list_.size();
+  }
+
+ private:
+  void DoScan(const Edge* edge) {
+    // here, we assume that the FST will potentially have many more outgoing
+    // edges than the grammar, which will be just a couple.  If you want to
+    // efficiently handle the case where both are relatively large, this code
+    // will need to change how the intersection is done.  The best general
+    // solution would probably be the Baeza-Yates double binary search.
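Reviewer's aside on the comment above (illustration only; nothing below is part
of this diff): the "double binary search" is Baeza-Yates set intersection,
which repeatedly binary-searches the median of one sorted sequence into the
other, giving roughly O(m log(n/m)) comparisons when one side is much smaller.
A self-contained sketch over sorted ID vectors:

    #include <algorithm>
    #include <vector>

    // Emits, in order, the values common to the sorted ranges a[alo,ahi)
    // and b[blo,bhi).  The pivot is always drawn from `a` here; a tuned
    // version would pivot on whichever range is currently smaller.
    static void BYIntersect(const std::vector<int>& a, int alo, int ahi,
                            const std::vector<int>& b, int blo, int bhi,
                            std::vector<int>* out) {
      if (alo >= ahi || blo >= bhi) return;
      const int mid = (alo + ahi) / 2;
      const int pivot = a[mid];
      int bsplit = std::lower_bound(b.begin() + blo, b.begin() + bhi, pivot)
                   - b.begin();
      BYIntersect(a, alo, mid, b, blo, bsplit, out);   // halves left of the pivot
      if (bsplit < bhi && b[bsplit] == pivot) {        // pivot present in both?
        out->push_back(pivot);
        ++bsplit;
      }
      BYIntersect(a, mid + 1, ahi, b, bsplit, bhi, out);  // halves to the right
    }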
+ +    const EGrammarNode* dot = edge->dot; +    const FSTNode* r = edge->r; +    const map<WordID, EGrammarNode>& terms = dot->GetTerminals(); +    for (map<WordID, EGrammarNode>::const_iterator git = terms.begin(); +         git != terms.end(); ++git) { +      const FSTNode* next_r = r->Extend(git->first); +      if (!next_r) continue; +      const EGrammarNode* next_dot = &git->second; +      const bool grammar_continues = next_dot->GrammarContinues(); +      const bool rule_completes    = next_dot->RuleCompletes(); +      assert(grammar_continues || rule_completes); +      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); +      // create up to 4 new edges! +      if (next_r->HasOutgoingNonEpsilonEdges()) {     // are there further symbols in the FST? +        const TargetPhraseSet* translations = NULL; +        if (rule_completes) +          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations, input_features)); +        if (grammar_continues) +          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations)); +      } +      if (next_r->HasData()) {   // indicates a loop back to q_0 in the FST +        const TargetPhraseSet* translations = next_r->GetTranslations(); +        if (rule_completes) +          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations, input_features)); +        if (grammar_continues) +          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations)); +      } +    } +  } + +  void DoPredict(const Edge* edge, const EGrammar& g) { +    const EGrammarNode* dot = edge->dot; +    const map<WordID, EGrammarNode>& non_terms = dot->GetNonTerminals(); +    for (map<WordID, EGrammarNode>::const_iterator git = non_terms.begin(); +         git != non_terms.end(); ++git) { +      const WordID nt_to_predict = git->first; +      //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; +      EGrammar::const_iterator egi = g.find(nt_to_predict); +      if (egi == g.end()) { +        cerr << "[ERROR] Can't find any grammar rules with a LHS of type " +             << TD::Convert(-1*nt_to_predict) << '!' 
<< endl; +        continue; +      } +      assert(edge->IsActive()); +      const EGrammarNode* new_dot = &egi->second; +      Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); +      IncorporateNewEdge(new_edge); +    } +  } + +  void DoComplete(const Edge* passive) { +#ifdef DEBUG_CHART_PARSER +    cerr << "  complete: " << *passive << endl; +#endif +    const WordID completed_nt = passive->cat; +    const FSTNode* q = passive->q; +    const FSTNode* next_r = passive->r; +    const Edge query(q); +    const pair<unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator, +         unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator > p = +      active_edges.equal_range(&query); +    for (unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator it = p.first; +         it != p.second; ++it) { +      const Edge* active = *it; +#ifdef DEBUG_CHART_PARSER +      cerr << "    pos: " << *active << endl; +#endif +      const EGrammarNode* next_dot = active->dot->Extend(completed_nt); +      if (!next_dot) continue; +      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); +      // add up to 2 rules +      if (next_dot->RuleCompletes()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); +      if (next_dot->GrammarContinues()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); +    } +  } + +  void DoMergeWithPassives(const Edge* active) { +    // edge is active, has non-terminals, we need to find the passives that can extend it +    assert(active->IsActive()); +    assert(active->dot->HasNonTerminals()); +#ifdef DEBUG_CHART_PARSER +    cerr << "  merge active with passives: ACT=" << *active << endl; +#endif +    const Edge query(active->r, 1); +    const pair<unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator, +         unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator > p = +      passive_edges.equal_range(&query); +    for (unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator it = p.first; +         it != p.second; ++it) { +      const Edge* passive = *it; +      const EGrammarNode* next_dot = active->dot->Extend(passive->cat); +      if (!next_dot) continue; +      const FSTNode* next_r = passive->r; +      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); +      if (next_dot->RuleCompletes()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); +      if (next_dot->GrammarContinues()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); +    } +  } + +  // take ownership of edge memory, add to various indexes, etc +  // returns true if this edge is new +  bool IncorporateNewEdge(Edge* edge) { +    free_list_.push_back(edge); +    if (edge->passive_parent && edge->active_parent) { +      Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); +      traversal_free_list_.push_back(t); +      if (all_traversals.find(t) != all_traversals.end()) { +        return false; +      } else { +        all_traversals.insert(t); +      } +    } +    exp_agenda.AddEdge(edge); +    return true; +  } + +  bool FinishEdge(const Edge* edge, Hypergraph* hg) { +    bool is_new = false; +    if (all_edges.find(edge) == all_edges.end()) { +#ifdef DEBUG_CHART_PARSER +      cerr << *edge << " is NEW\n"; +#endif +      
all_edges.insert(edge); +      is_new = true; +      if (edge->IsPassive()) passive_edges.insert(edge); +      if (edge->IsActive()) active_edges.insert(edge); +      agenda.AddEdge(edge); +    } else { +#ifdef DEBUG_CHART_PARSER +      cerr << *edge << " is NOT NEW.\n"; +#endif +    } +    AddEdgeToTranslationForest(edge, hg); +    return is_new; +  } + +  // build the translation forest +  void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { +    assert(hg->nodes_.size() < kMAX_NODES); +    Hypergraph::Node* tps = NULL; +    // first add any target language rules +    if (edge->tps) { +      Hypergraph::Node*& node = tps2node[(size_t)edge->tps]; +      if (!node) { +        // cerr << "Creating phrases for " << edge->tps << endl; +        const vector<TRulePtr>& rules = edge->tps->GetRules(); +        node = hg->AddNode(kPHRASE); +        for (int i = 0; i < rules.size(); ++i) { +          Hypergraph::Edge* hg_edge = hg->AddEdge(rules[i], Hypergraph::TailNodeVector()); +          hg_edge->feature_values_ += rules[i]->GetFeatureValues(); +          hg->ConnectEdgeToHeadNode(hg_edge, node); +        } +      } +      tps = node; +    } +    Hypergraph::Node*& head_node = edge2node[edge]; +    if (!head_node) +      head_node = hg->AddNode(kPHRASE); +    if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_0_ && edge->IsPassive()) { +      assert(goal_node == NULL || goal_node == head_node); +      goal_node = head_node; +    } +    Hypergraph::TailNodeVector tail; +    SparseVector<double> extra; +    if (edge->IsCreatedByPredict()) { +      // extra.set_value(FD::Convert("predict"), 1); +    } else if (edge->IsCreatedByScan()) { +      tail.push_back(edge2node[edge->active_parent]->id_); +      if (tps) { +        tail.push_back(tps->id_); +      } +      //extra.set_value(FD::Convert("scan"), 1); +    } else if (edge->IsCreatedByComplete()) { +      tail.push_back(edge2node[edge->active_parent]->id_); +      tail.push_back(edge2node[edge->passive_parent]->id_); +      //extra.set_value(FD::Convert("complete"), 1); +    } else { +      assert(!"unexpected edge type!"); +    } +    //cerr << head_node->id_ << "<--" << *edge << endl; + +#ifdef DEBUG_CHART_PARSER +      for (int i = 0; i < tail.size(); ++i) +        if (tail[i] == head_node->id_) { +          cerr << "ERROR: " << *edge << "\n   i=" << i << endl; +          if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } +          if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } +          assert(!"self-loop found!"); +        } +#endif +    Hypergraph::Edge* hg_edge = NULL;  +    if (tail.size() == 0) { +      hg_edge = hg->AddEdge(kEPSRule, tail); +    } else if (tail.size() == 1) { +      hg_edge = hg->AddEdge(kX1, tail); +    } else if (tail.size() == 2) { +      hg_edge = hg->AddEdge(kX1X2, tail); +    } +    if (edge->features) +      hg_edge->feature_values_ += *edge->features; +    hg_edge->feature_values_ += extra; +    hg->ConnectEdgeToHeadNode(hg_edge, head_node); +  } + +  Hypergraph::Node* goal_node; +  EdgeQueue exp_agenda; +  EdgeQueue agenda; +  unordered_map<size_t, Hypergraph::Node*> tps2node; +  unordered_map<const Edge*, Hypergraph::Node*, UniqueEdgeHash, UniqueEdgeEquals> edge2node; +  unordered_set<const Traversal*, UniqueTraversalHash, UniqueTraversalEquals> all_traversals; +  unordered_set<const Edge*, UniqueEdgeHash, UniqueEdgeEquals> all_edges; +  unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals> passive_edges; +  unordered_multiset<const Edge*, 
REdgeHash, REdgeEquals> active_edges; +  vector<Edge*> free_list_; +  vector<Traversal*> traversal_free_list_; +  const WordID start_cat_; +  const FSTNode* const q_0_; +}; + +#ifdef DEBUG_CHART_PARSER +static string TrimRule(const string& r) { +  size_t start = r.find(" |||") + 5; +  size_t end = r.rfind(" |||"); +  return r.substr(start, end - start); +} +#endif + +void AddGrammarRule(const string& r, EGrammar* g) { +  const size_t pos = r.find(" ||| "); +  if (pos == string::npos || r[0] != '[') { +    cerr << "Bad rule: " << r << endl; +    return; +  } +  const size_t rpos = r.rfind(" ||| "); +  string feats; +  string rs = r; +  if (rpos != pos) { +    feats = r.substr(rpos + 5); +    rs = r.substr(0, rpos); +  } +  string rhs = rs.substr(pos + 5); +  string trule = rs + " ||| " + rhs + " ||| " + feats; +  TRule tr(trule); +#ifdef DEBUG_CHART_PARSER +  string hint_last_rule; +#endif +  EGrammarNode* cur = &(*g)[tr.GetLHS()]; +  cur->is_root = true; +  for (int i = 0; i < tr.FLength(); ++i) { +    WordID sym = tr.f()[i]; +#ifdef DEBUG_CHART_PARSER +    hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); +    cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); +#endif +    if (sym < 0) +      cur = &cur->ntptr[sym]; +    else +      cur = &cur->tptr[sym]; +  } +#ifdef DEBUG_CHART_PARSER +  cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); +#endif +  cur->is_some_rule_complete = true; +  cur->input_features = tr.GetFeatureValues(); +} + +EarleyComposer::~EarleyComposer() { +  delete pimpl_; +} + +EarleyComposer::EarleyComposer(const FSTNode* fst) { +  InitializeConstants(); +  pimpl_ = new EarleyComposerImpl(kUNIQUE_START, *fst); +} + +bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { +  // first, convert the src forest into an EGrammar +  EGrammar g; +  const int nedges = src_forest.edges_.size(); +  const int nnodes = src_forest.nodes_.size(); +  vector<int> cats(nnodes); +  bool assign_cats = false; +  for (int i = 0; i < nnodes; ++i) +    if (assign_cats) { +      cats[i] = TD::Convert("CAT_" + boost::lexical_cast<string>(i)) * -1; +    } else { +      cats[i] = src_forest.nodes_[i].cat_; +    } +  // construct the grammar +  for (int i = 0; i < nedges; ++i) { +    const Hypergraph::Edge& edge = src_forest.edges_[i]; +    const vector<WordID>& src = edge.rule_->f(); +    EGrammarNode* cur = &g[cats[edge.head_node_]]; +    cur->is_root = true; +    int ntc = 0; +    for (int j = 0; j < src.size(); ++j) { +      WordID sym = src[j]; +      if (sym <= 0) { +        sym = cats[edge.tail_nodes_[ntc]]; +        ++ntc; +        cur = &cur->ntptr[sym]; +      } else { +        cur = &cur->tptr[sym]; +      } +    } +    cur->is_some_rule_complete = true; +    cur->input_features = edge.feature_values_; +  } +  EGrammarNode& goal_rule = g[kUNIQUE_START]; +  assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || +         (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); + +  return pimpl_->Compose(g, trg_forest); +} + +bool EarleyComposer::Compose(istream* in, Hypergraph* trg_forest) { +  EGrammar g; +  while(*in) { +    string line; +    getline(*in, line); +    if (line.empty()) continue; +    AddGrammarRule(line, &g); +  } + +  return pimpl_->Compose(g, trg_forest); +} diff --git a/decoder/earley_composer.h b/decoder/earley_composer.h new file mode 100644 index 00000000..9f786bf6 --- /dev/null +++ b/decoder/earley_composer.h @@ -0,0 +1,29 @@ +#ifndef _EARLEY_COMPOSER_H_ +#define 
_EARLEY_COMPOSER_H_
+
+#include <iostream>
+
+class EarleyComposerImpl;
+class FSTNode;
+class Hypergraph;
+
+class EarleyComposer {
+ public:
+  ~EarleyComposer();
+  EarleyComposer(const FSTNode* phrasetable_root);
+  bool Compose(const Hypergraph& src_forest, Hypergraph* trg_forest);
+
+  // reads the grammar from a file. There must be a single top-level
+  // S -> X rule.  Anything else is possible. Format is:
+  // [S] ||| [SS,1]
+  // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3
+  // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8
+  // [NP] ||| [DET,1] [N,2] ||| Feature3=2
+  // ...
+  bool Compose(std::istream* grammar_file, Hypergraph* trg_forest);
+
+ private:
+  EarleyComposerImpl* pimpl_;
+};
+
+#endif
diff --git a/decoder/exp_semiring.h b/decoder/exp_semiring.h
new file mode 100644
index 00000000..f91beee4
--- /dev/null
+++ b/decoder/exp_semiring.h
@@ -0,0 +1,71 @@
+#ifndef _EXP_SEMIRING_H_
+#define _EXP_SEMIRING_H_
+
+#include <iostream>
+
+#include "hg.h"  // PRWeightFunction below uses Hypergraph::Edge
+
+// this file implements the first-order expectation semiring described
+// in Li & Eisner (EMNLP 2009)
+
+// requirements (see the operators defined on PRPair below):
+//   PType + PType ==> PType
+//   PType * PType ==> PType
+//   RType + RType ==> RType
+//   RType * PType ==> RType
+// good examples:
+//   PType scalar, RType vector
+// BAD examples:
+//   PType vector, RType scalar
+template <typename PType, typename RType>
+struct PRPair {
+  PRPair() : p(), r() {}
+  // Inside algorithm requires that T(0) and T(1)
+  // return the 0 and 1 values of the semiring
+  explicit PRPair(double x) : p(x), r() {}
+  PRPair(const PType& p, const RType& r) : p(p), r(r) {}
+  PRPair& operator+=(const PRPair& o) {
+    p += o.p;
+    r += o.r;
+    return *this;
+  }
+  PRPair& operator*=(const PRPair& o) {
+    r = (o.r * p) + (o.p * r);
+    p *= o.p;
+    return *this;
+  }
+  PType p;
+  RType r;
+};
+
+template <typename P, typename R>
+std::ostream& operator<<(std::ostream& o, const PRPair<P,R>& x) {
+  return o << '<' << x.p << ", " << x.r << '>';
+}
+
+template <typename P, typename R>
+const PRPair<P,R> operator+(const PRPair<P,R>& a, const PRPair<P,R>& b) {
+  PRPair<P,R> result = a;
+  result += b;
+  return result;
+}
+
+template <typename P, typename R>
+const PRPair<P,R> operator*(const PRPair<P,R>& a, const PRPair<P,R>& b) {
+  PRPair<P,R> result = a;
+  result *= b;
+  return result;
+}
+
+template <typename P, typename PWeightFunction, typename R, typename RWeightFunction>
+struct PRWeightFunction {
+  explicit PRWeightFunction(const PWeightFunction& pwf = PWeightFunction(),
+                            const RWeightFunction& rwf = RWeightFunction()) :
+    pweight(pwf), rweight(rwf) {}
+  PRPair<P,R> operator()(const Hypergraph::Edge& e) const {
+    const P p = pweight(e);
+    const R r = rweight(e);
+    return PRPair<P,R>(p, r * p);
+  }
+  const PWeightFunction pweight;
+  const RWeightFunction rweight;
+};
+
+#endif
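Reviewer's illustration of the requirements listed above (not part of the
diff): with PType = double carrying a derivation probability and RType =
double carrying probability-weighted hypothesis length, semiring "plus" over
competing derivations accumulates an expectation, read off as r/p. The include
path for exp_semiring.h is assumed.

    #include <iostream>
    #include "exp_semiring.h"

    int main() {
      // two competing derivations: length 2 with prob 0.75, length 3 with 0.25
      PRPair<double, double> d1(0.75, 0.75 * 2.0);
      PRPair<double, double> d2(0.25, 0.25 * 3.0);
      PRPair<double, double> sum = d1 + d2;       // semiring plus
      std::cout << sum.r / sum.p << std::endl;    // prints 2.25 = E[length]
      // semiring times extends a derivation: probabilities multiply, and the
      // product rule in operator*= keeps r consistent with the new p
      PRPair<double, double> ext = d1 * PRPair<double, double>(0.5, 0.0);
      std::cout << ext.r / ext.p << std::endl;    // still 2.0
      return 0;
    }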
diff --git a/decoder/fdict.cc b/decoder/fdict.cc
new file mode 100644
index 00000000..7e1b0e1f
--- /dev/null
+++ b/decoder/fdict.cc
@@ -0,0 +1,129 @@
+#include "fdict.h"
+
+#include <cstring>
+#include <string>
+
+using namespace std;
+
+Dict FD::dict_;
+bool FD::frozen_ = false;
+
+static int HexPairValue(const char * code) {
+  int value = 0;
+  const char * pch = code;
+  for (;;) {
+    int digit = *pch++;
+    if (digit >= '0' && digit <= '9') {
+      value += digit - '0';
+    }
+    else if (digit >= 'A' && digit <= 'F') {
+      value += digit - 'A' + 10;
+    }
+    else if (digit >= 'a' && digit <= 'f') {
+      value += digit - 'a' + 10;
+    }
+    else {
+      return -1;
+    }
+    if (pch == code + 2)
+      return value;
+    value <<= 4;
+  }
+}
+
+int UrlDecode(const char *source, char *dest)
+{
+  char * start = dest;
+
+  while (*source) {
+    switch (*source) {
+    case '+':
+      *(dest++) = ' ';
+      break;
+    case '%':
+      if (source[1] && source[2]) {
+        int value = HexPairValue(source + 1);
+        if (value >= 0) {
+          *(dest++) = value;
+          source += 2;
+        }
+        else {
+          *dest++ = '?';
+        }
+      }
+      else {
+        *dest++ = '?';
+      }
+      break;
+    default:
+      *dest++ = *source;
+    }
+    source++;
+  }
+
+  *dest = 0;
+  return dest - start;
+}
+
+int UrlEncode(const char *source, char *dest, unsigned max) {
+  static const char *digits = "0123456789ABCDEF";
+  unsigned char ch;
+  char *start = dest;
+
+  // leave room for a worst-case 3-byte expansion plus the terminator
+  while ((unsigned)(dest - start) < max - 4 && *source)
+  {
+    ch = (unsigned char)*source;
+    if (*source == ' ') {
+      *dest++ = '+';
+    }
+    else if (strchr("=:;,_| %", ch)) {
+      *dest++ = '%';
+      *dest++ = digits[(ch >> 4) & 0x0F];
+      *dest++ = digits[       ch & 0x0F];
+    }
+    else {
+      *dest++ = *source;
+    }
+    source++;
+  }
+  *dest = 0;
+  return dest - start;
+}
+
+std::string UrlDecodeString(const std::string & encoded) {
+  const char * sz_encoded = encoded.c_str();
+  size_t needed_length = encoded.length();
+  for (const char * pch = sz_encoded; *pch; pch++) {
+    if (*pch == '%')
+      needed_length += 2;
+  }
+  needed_length += 10;
+  char stackalloc[64];
+  char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ?
+    (char *)malloc(needed_length) : stackalloc;
+  UrlDecode(encoded.c_str(), buf);
+  std::string result(buf);
+  if (buf != stackalloc) {
+    free(buf);
+  }
+  return result;
+}
+
+std::string UrlEncodeString(const std::string & decoded) {
+  size_t needed_length = decoded.length() * 3 + 3;
+  char stackalloc[64];
+  char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ?
+    (char *)malloc(needed_length) : stackalloc;
+  UrlEncode(decoded.c_str(), buf, needed_length);
+  std::string result(buf);
+  if (buf != stackalloc) {
+    free(buf);
+  }
+  return result;
+}
+
+string FD::Escape(const string& s) {
+  return UrlEncodeString(s);
+}
+
diff --git a/decoder/fdict.h b/decoder/fdict.h
new file mode 100644
index 00000000..c4236580
--- /dev/null
+++ b/decoder/fdict.h
@@ -0,0 +1,31 @@
+#ifndef _FDICT_H_
+#define _FDICT_H_
+
+#include <string>
+#include <vector>
+#include "dict.h"
+
+struct FD {
+  // once the FD is frozen, new features not already in the
+  // dictionary will return 0
+  static void Freeze() {
+    frozen_ = true;
+  }
+  static inline int NumFeats() {
+    return dict_.max() + 1;
+  }
+  static inline WordID Convert(const std::string& s) {
+    return dict_.Convert(s, frozen_);
+  }
+  static inline const std::string& Convert(const WordID& w) {
+    return dict_.Convert(w);
+  }
+  // Escape any string to a form that can be used as the name
+  // of a weight in a weights file
+  static std::string Escape(const std::string& s);
+  static Dict dict_;
+ private:
+  static bool frozen_;
+};
+
+#endif
diff --git a/decoder/ff.cc b/decoder/ff.cc
new file mode 100644
index 00000000..61f4f0b6
--- /dev/null
+++ b/decoder/ff.cc
@@ -0,0 +1,137 @@
+#include "ff.h"
+
+#include "tdict.h"
+#include "hg.h"
+
+using namespace std;
+
+FeatureFunction::~FeatureFunction() {}
+
+
+void FeatureFunction::FinalTraversalFeatures(const void* ant_state,
+                                             SparseVector<double>* features) const {
+  (void) ant_state;
+  (void) features;
+}
+
+// Hiero and Joshua use log_10(e) as the value, so I do too
+WordPenalty::WordPenalty(const string& param) :
+    fid_(FD::Convert("WordPenalty")),
+    value_(-1.0 / log(10)) {
+  if (!param.empty()) {
+    cerr << "Warning WordPenalty ignoring parameter: " << param << endl;
+  }
+}
+
+void WordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                        const Hypergraph::Edge& edge,
+                                        const std::vector<const void*>& ant_states,
+                                        SparseVector<double>* features,
+                                        SparseVector<double>* estimated_features,
+                                        void* state) const {
+  (void) smeta;
+  (void) ant_states;
+  (void) state;
+  (void) estimated_features;
+  features->set_value(fid_, edge.rule_->EWords() * value_);
+}
+
+SourceWordPenalty::SourceWordPenalty(const string& param) :
+    fid_(FD::Convert("SourceWordPenalty")),
+    value_(-1.0 / log(10)) {
+  if (!param.empty()) {
+    cerr << "Warning SourceWordPenalty ignoring parameter: " << param << endl;
+  }
+}
+
+void SourceWordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                        const Hypergraph::Edge& edge,
+                                        const std::vector<const void*>& ant_states,
+                                        SparseVector<double>* features,
+                                        SparseVector<double>* estimated_features,
+                                        void* state) const {
+  (void) smeta;
+  (void) ant_states;
+  (void) state;
+  (void) estimated_features;
+  features->set_value(fid_, edge.rule_->FWords() * value_);
+}
+
+ArityPenalty::ArityPenalty(const std::string& param) :
+    value_(-1.0 / log(10)) {
+  string fname = "Arity_X";
+  for (int i = 0; i < 10; ++i) {
+    fname[6]=i + '0';
+
fids_[i] = FD::Convert(fname); +  } +} + +void ArityPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                         const Hypergraph::Edge& edge, +                                         const std::vector<const void*>& ant_states, +                                         SparseVector<double>* features, +                                         SparseVector<double>* estimated_features, +                                         void* state) const { +  (void) smeta; +  (void) ant_states; +  (void) state; +  (void) estimated_features; +  features->set_value(fids_[edge.Arity()], value_); +} + +ModelSet::ModelSet(const vector<double>& w, const vector<const FeatureFunction*>& models) : +    models_(models), +    weights_(w), +    state_size_(0), +    model_state_pos_(models.size()) { +  for (int i = 0; i < models_.size(); ++i) { +    model_state_pos_[i] = state_size_; +    state_size_ += models_[i]->NumBytesContext(); +  } +} + +void ModelSet::AddFeaturesToEdge(const SentenceMetadata& smeta, +                                 const Hypergraph& hg, +                                 const vector<string>& node_states, +                                 Hypergraph::Edge* edge, +                                 string* context, +                                 prob_t* combination_cost_estimate) const { +  context->resize(state_size_); +  memset(&(*context)[0], 0, state_size_); +  SparseVector<double> est_vals;  // only computed if combination_cost_estimate is non-NULL +  if (combination_cost_estimate) *combination_cost_estimate = prob_t::One(); +  for (int i = 0; i < models_.size(); ++i) { +    const FeatureFunction& ff = *models_[i]; +    void* cur_ff_context = NULL; +    vector<const void*> ants(edge->tail_nodes_.size()); +    bool has_context = ff.NumBytesContext() > 0; +    if (has_context) { +      int spos = model_state_pos_[i]; +      cur_ff_context = &(*context)[spos]; +      for (int i = 0; i < ants.size(); ++i) { +        ants[i] = &node_states[edge->tail_nodes_[i]][spos]; +      } +    } +    ff.TraversalFeatures(smeta, *edge, ants, &edge->feature_values_, &est_vals, cur_ff_context); +  } +  if (combination_cost_estimate) +    combination_cost_estimate->logeq(est_vals.dot(weights_)); +  edge->edge_prob_.logeq(edge->feature_values_.dot(weights_)); +} + +void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge) const { +  assert(1 == edge->rule_->Arity()); + +  for (int i = 0; i < models_.size(); ++i) { +    const FeatureFunction& ff = *models_[i]; +    const void* ant_state = NULL; +    bool has_context = ff.NumBytesContext() > 0; +    if (has_context) { +      int spos = model_state_pos_[i]; +      ant_state = &state[spos]; +    } +    ff.FinalTraversalFeatures(ant_state, &edge->feature_values_); +  } +  edge->edge_prob_.logeq(edge->feature_values_.dot(weights_)); +} + diff --git a/decoder/ff.h b/decoder/ff.h new file mode 100644 index 00000000..630b3208 --- /dev/null +++ b/decoder/ff.h @@ -0,0 +1,152 @@ +#ifndef _FF_H_ +#define _FF_H_ + +#include <vector> + +#include "fdict.h" +#include "hg.h" + +class SentenceMetadata; +class FeatureFunction;  // see definition below + +// if you want to develop a new feature, inherit from this class and +// override TraversalFeaturesImpl(...).  If it's a feature that returns / +// depends on context, you may also need to implement +// FinalTraversalFeatures(...) 
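Reviewer's illustration of the recipe in the comment above (hypothetical
feature, not part of this diff): a minimal stateless FeatureFunction that
fires once on every edge. Only FD::Convert, SparseVector, and the base class
declared below are real; the class name and feature name are invented.

    class EdgeCountFeature : public FeatureFunction {
     public:
      explicit EdgeCountFeature(const std::string& param)
          : fid_(FD::Convert("EdgeCount")) { (void) param; }
     protected:
      virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                         const Hypergraph::Edge& edge,
                                         const std::vector<const void*>& ant_contexts,
                                         SparseVector<double>* features,
                                         SparseVector<double>* estimated_features,
                                         void* context) const {
        (void) smeta; (void) edge; (void) ant_contexts;
        (void) estimated_features; (void) context;
        features->set_value(fid_, 1.0);  // stateless: NumBytesContext() == 0
      }
     private:
      const int fid_;
    };

It would then be exposed to the decoder with
`global_ff_registry->Register("EdgeCount", new FFFactory<EdgeCountFeature>);`
inside register_feature_functions(), following the registrations earlier in
this diff.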
+class FeatureFunction {
+ public:
+  FeatureFunction() : state_size_() {}
+  explicit FeatureFunction(int state_size) : state_size_(state_size) {}
+  virtual ~FeatureFunction();
+
+  // returns the number of bytes of context that this feature function will
+  // (maximally) use.  By default, 0 ("stateless" models in Hiero/Joshua).
+  // NOTE: this value is fixed for the instance of your class, you cannot
+  // use different amounts of memory for different nodes in the forest.
+  inline int NumBytesContext() const { return state_size_; }
+
+  // Compute the feature values and (if this applies) the estimates of the
+  // feature values when this edge is incorporated into a larger context
+  inline void TraversalFeatures(const SentenceMetadata& smeta,
+                                const Hypergraph::Edge& edge,
+                                const std::vector<const void*>& ant_contexts,
+                                SparseVector<double>* features,
+                                SparseVector<double>* estimated_features,
+                                void* out_state) const {
+    TraversalFeaturesImpl(smeta, edge, ant_contexts,
+                          features, estimated_features, out_state);
+    // TODO it's easy for careless feature function developers to overwrite
+    // the end of their state and clobber someone else's memory.  These bugs
+    // will be horrendously painful to track down.  There should be some
+    // optional strict mode that's enforced here that adds some kind of
+    // barrier between the blocks reserved for the residual contexts
+  }
+
+  // if there's some state left when you transition to the goal state, score
+  // it here.  For example, the language model computes the cost of adding
+  // <s> and </s>.
+  virtual void FinalTraversalFeatures(const void* residual_state,
+                                      SparseVector<double>* final_features) const;
+
+ protected:
+  // context is a pointer to a buffer of size NumBytesContext() that the
+  // feature function can write its state to.  It's up to the feature function
+  // to determine how much space it needs and to determine how to encode its
+  // residual contextual information since it is OPAQUE to all clients outside
+  // of the particular FeatureFunction class.  There is one exception:
+  // equality of the contents (i.e., memcmp) is required to determine whether
+  // two states can be combined.
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const = 0;
+
+  // !!! ONLY call this from subclass *CONSTRUCTORS* !!!
+  void SetStateSize(size_t state_size) {
+    state_size_ = state_size;
+  }
+
+ private:
+  int state_size_;
+};
+
+// word penalty feature, for each word on the E side of a rule,
+// add value_
+class WordPenalty : public FeatureFunction {
+ public:
+  WordPenalty(const std::string& param);
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;
+ private:
+  const int fid_;
+  const double value_;
+};
+
+class SourceWordPenalty : public FeatureFunction {
+ public:
+  SourceWordPenalty(const std::string& param);
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;
+ private:
+  const int fid_;
+  const double value_;
+};
+
+class ArityPenalty : public FeatureFunction {
+ public:
+  ArityPenalty(const std::string& param);
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;
+ private:
+  int fids_[10];
+  const double value_;
+};
+
+// this class is a set of FeatureFunctions that can be used to score, rescore,
+// etc. a (translation?) forest
+class ModelSet {
+ public:
+  ModelSet() : state_size_(0) {}
+
+  ModelSet(const std::vector<double>& weights,
+           const std::vector<const FeatureFunction*>& models);
+
+  // sets edge->feature_values_ and edge->edge_prob_
+  // NOTE: the edge need not itself be in hg.edges_, but its TAIL nodes
+  // must be.
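Reviewer's sketch of a typical call sequence for the two methods declared just
below (illustrative only, not part of the diff): hg, smeta, and the constructed
ModelSet are assumed to come from the decoder, Hypergraph::Node is assumed to
expose an in_edges_ index analogous to the out_edges_ used in csplit.cc, and
real rescoring interleaves this with search so that each candidate derivation
carries its own state rather than one string per node.

    static void NaiveRescore(const SentenceMetadata& smeta,
                             const ModelSet& models,
                             Hypergraph* hg) {
      std::vector<std::string> node_states(hg->nodes_.size());
      for (unsigned i = 0; i < hg->nodes_.size(); ++i) {  // nodes assumed topological
        for (unsigned j = 0; j < hg->nodes_[i].in_edges_.size(); ++j) {
          Hypergraph::Edge* edge = &hg->edges_[hg->nodes_[i].in_edges_[j]];
          // the last edge's state wins at each node; fine for a sketch only
          models.AddFeaturesToEdge(smeta, *hg, node_states, edge, &node_states[i]);
        }
      }
    }

At the goal, AddFinalFeatures would then fold the residual state (for example,
LM boundary scores) into the goal edge.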
+  void AddFeaturesToEdge(const SentenceMetadata& smeta, +                         const Hypergraph& hg, +                         const std::vector<std::string>& node_states, +                         Hypergraph::Edge* edge, +                         std::string* residual_context, +                         prob_t* combination_cost_estimate = NULL) const; + +  void AddFinalFeatures(const std::string& residual_context, +                        Hypergraph::Edge* edge) const; + +  bool empty() const { return models_.empty(); } + private: +  std::vector<const FeatureFunction*> models_; +  std::vector<double> weights_;  +  int state_size_; +  std::vector<int> model_state_pos_; +}; + +#endif diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc new file mode 100644 index 00000000..658603e4 --- /dev/null +++ b/decoder/ff_csplit.cc @@ -0,0 +1,225 @@ +#include "ff_csplit.h" + +#include <set> +#include <cstring> + +#include "Vocab.h" +#include "Ngram.h" + +#include "sentence_metadata.h" +#include "lattice.h" +#include "tdict.h" +#include "freqdict.h" +#include "filelib.h" +#include "stringlib.h" +#include "tdict.h" + +using namespace std; + +struct BasicCSplitFeaturesImpl { +  BasicCSplitFeaturesImpl(const string& param) : +      word_count_(FD::Convert("WordCount")), +      letters_sq_(FD::Convert("LettersSq")), +      letters_sqrt_(FD::Convert("LettersSqrt")), +      in_dict_(FD::Convert("InDict")), +      short_(FD::Convert("Short")), +      long_(FD::Convert("Long")), +      oov_(FD::Convert("OOV")), +      short_range_(FD::Convert("ShortRange")), +      high_freq_(FD::Convert("HighFreq")), +      med_freq_(FD::Convert("MedFreq")), +      freq_(FD::Convert("Freq")), +      fl1_(FD::Convert("FreqLen1")), +      fl2_(FD::Convert("FreqLen2")), +      bad_(FD::Convert("Bad")) { +    vector<string> argv; +    int argc = SplitOnWhitespace(param, &argv); +    if (argc != 1 && argc != 2) { +      cerr << "Expected: freqdict.txt [badwords.txt]\n"; +      abort(); +    } +    freq_dict_.Load(argv[0]); +    if (argc == 2) { +      ReadFile rf(argv[1]); +      istream& in = *rf.stream(); +      while(in) { +        string badword; +        in >> badword; +        if (badword.empty()) continue; +        bad_words_.insert(TD::Convert(badword)); +      } +    } +  } + +  void TraversalFeaturesImpl(const Hypergraph::Edge& edge, +                             SparseVector<double>* features) const; + +  const int word_count_; +  const int letters_sq_; +  const int letters_sqrt_; +  const int in_dict_; +  const int short_; +  const int long_; +  const int oov_; +  const int short_range_; +  const int high_freq_; +  const int med_freq_; +  const int freq_; +  const int fl1_; +  const int fl2_; +  const int bad_; +  FreqDict freq_dict_; +  set<WordID> bad_words_; +}; + +BasicCSplitFeatures::BasicCSplitFeatures(const string& param) : +  pimpl_(new BasicCSplitFeaturesImpl(param)) {} + +void BasicCSplitFeaturesImpl::TraversalFeaturesImpl( +                                     const Hypergraph::Edge& edge, +                                     SparseVector<double>* features) const { +  features->set_value(word_count_, 1.0); +  features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_)); +  features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_)); +  const WordID word = edge.rule_->e_[1]; +  const char* sword = TD::Convert(word); +  const int len = strlen(sword); +  int cur = 0; +  int chars = 0; +  while(cur < len) { +    cur += UTF8Len(sword[cur]); +    ++chars; +  } + +  // these are corrections 
that attempt to make chars +  // more like a phoneme count than a letter count, they +  // are only really meaningful for german and should +  // probably be gotten rid of +  bool has_sch = strstr(sword, "sch"); +  bool has_ch = (!has_sch && strstr(sword, "ch")); +  bool has_ie = strstr(sword, "ie"); +  bool has_zw = strstr(sword, "zw"); +  if (has_sch) chars -= 2; +  if (has_ch) --chars; +  if (has_ie) --chars; +  if (has_zw) --chars; + +  float freq = freq_dict_.LookUp(word); +  if (freq) { +    features->set_value(freq_, freq); +    features->set_value(in_dict_, 1.0); +  } else { +    features->set_value(oov_, 1.0); +    freq = 99.0f; +  } +  if (bad_words_.count(word) != 0) +    features->set_value(bad_, 1.0); +  if (chars < 5) +    features->set_value(short_, 1.0); +  if (chars > 10) +    features->set_value(long_, 1.0); +  if (freq < 7.0f) +    features->set_value(high_freq_, 1.0); +  if (freq > 8.0f && freq < 10.f) +    features->set_value(med_freq_, 1.0); +  if (freq < 10.0f && chars < 5) +    features->set_value(short_range_, 1.0); + +  // i don't understand these features, but they really help! +  features->set_value(fl1_, sqrt(chars * freq)); +  features->set_value(fl2_, freq / chars); +} + +void BasicCSplitFeatures::TraversalFeaturesImpl( +                                     const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const { +  (void) smeta; +  (void) ant_contexts; +  (void) out_context; +  (void) estimated_features; +  if (edge.Arity() == 0) return; +  if (edge.rule_->EWords() != 1) return; +  pimpl_->TraversalFeaturesImpl(edge, features); +} + +struct ReverseCharLMCSplitFeatureImpl { +  ReverseCharLMCSplitFeatureImpl(const string& param) : +      order_(5), +      vocab_(*TD::dict_), +      ngram_(vocab_, order_) { +    kBOS = vocab_.getIndex("<s>"); +    kEOS = vocab_.getIndex("</s>"); +    File file(param.c_str(), "r", 0); +    assert(file); +    cerr << "Reading " << order_ << "-gram LM from " << param << endl; +    ngram_.read(file); +  } + +  double LeftPhonotacticProb(const Lattice& inword, const int start) { +    const int end = inword.size(); +    for (int i = 0; i < order_; ++i) +      sc[i] = kBOS; +    int sp = min(end - start, order_ - 1); +    // cerr << "[" << start << "," << sp << "]\n"; +    int ci = (order_ - sp - 1); +    int wi = start; +    while (sp > 0) { +      sc[ci] = inword[wi][0].label; +      // cerr << " CHAR: " << TD::Convert(sc[ci]) << "  ci=" << ci << endl; +      ++wi; +      ++ci; +      --sp; +    } +    // cerr << "  END ci=" << ci << endl; +    sc[ci] = Vocab_None; +    const double startprob = ngram_.wordProb(kEOS, sc); +    // cerr << "  PROB=" << startprob << endl; +    return startprob; +  } + private: +  const int order_; +  Vocab& vocab_; +  VocabIndex kBOS; +  VocabIndex kEOS; +  Ngram ngram_; +  VocabIndex sc[80]; +}; + +ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) : +  pimpl_(new ReverseCharLMCSplitFeatureImpl(param)), +  fid_(FD::Convert("RevCharLM")) {} + +void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( +                                     const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +           
                          const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const { +  (void) ant_contexts; +  (void) estimated_features; +  (void) out_context; + +  if (edge.Arity() != 1) return; +  if (edge.rule_->EWords() != 1) return; +  const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_); +  features->set_value(fid_, lpp); +#if 0 +  WordID neighbor_word = 0; +  const WordID word = edge.rule_->e_[1]; +  if (chars > 4 && (sword[0] == 's' || sword[0] == 'n')) { +    neighbor_word = TD::Convert(string(&sword[1])); +  } +  if (neighbor_word) { +    float nfreq = freq_dict_.LookUp(neighbor_word); +    cerr << "COMPARE: " << TD::Convert(word) << " & " << TD::Convert(neighbor_word) << endl; +    if (!nfreq) nfreq = 99.0f; +    features->set_value(fdoes_deletion_help_, (freq - nfreq)); +  } +#endif +} + diff --git a/decoder/ff_csplit.h b/decoder/ff_csplit.h new file mode 100644 index 00000000..c1cfb64b --- /dev/null +++ b/decoder/ff_csplit.h @@ -0,0 +1,39 @@ +#ifndef _FF_CSPLIT_H_ +#define _FF_CSPLIT_H_ + +#include <boost/shared_ptr.hpp> + +#include "ff.h" + +class BasicCSplitFeaturesImpl; +class BasicCSplitFeatures : public FeatureFunction { + public: +  BasicCSplitFeatures(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  boost::shared_ptr<BasicCSplitFeaturesImpl> pimpl_; +}; + +class ReverseCharLMCSplitFeatureImpl; +class ReverseCharLMCSplitFeature : public FeatureFunction { + public: +  ReverseCharLMCSplitFeature(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  boost::shared_ptr<ReverseCharLMCSplitFeatureImpl> pimpl_; +  const int fid_; +}; + +#endif diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc new file mode 100644 index 00000000..1854e0bb --- /dev/null +++ b/decoder/ff_factory.cc @@ -0,0 +1,35 @@ +#include "ff_factory.h" + +#include "ff.h" + +using boost::shared_ptr; +using namespace std; + +FFFactoryBase::~FFFactoryBase() {} + +void FFRegistry::DisplayList() const { +  for (map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.begin(); +       it != reg_.end(); ++it) { +    cerr << "  " << it->first << endl; +  } +} + +shared_ptr<FeatureFunction> FFRegistry::Create(const string& ffname, const string& param) const { +  map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname); +  shared_ptr<FeatureFunction> res; +  if (it == reg_.end()) { +    cerr << "I don't know how to create feature " << ffname << endl; +  } else { +    res = 
it->second->Create(param); +  } +  return res; +} + +void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { +  if (reg_.find(ffname) != reg_.end()) { +    cerr << "Duplicate registration of FeatureFunction with name " << ffname << "!\n"; +    abort(); +  } +  reg_[ffname].reset(factory); +} + diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h new file mode 100644 index 00000000..bc586567 --- /dev/null +++ b/decoder/ff_factory.h @@ -0,0 +1,39 @@ +#ifndef _FF_FACTORY_H_ +#define _FF_FACTORY_H_ + +#include <iostream> +#include <string> +#include <map> + +#include <boost/shared_ptr.hpp> + +class FeatureFunction; +class FFRegistry; +class FFFactoryBase; +extern boost::shared_ptr<FFRegistry> global_ff_registry; + +class FFRegistry { +  friend int main(int argc, char** argv); +  friend class FFFactoryBase; + public: +  boost::shared_ptr<FeatureFunction> Create(const std::string& ffname, const std::string& param) const; +  void DisplayList() const; +  void Register(const std::string& ffname, FFFactoryBase* factory); + private: +  FFRegistry() {} +  std::map<std::string, boost::shared_ptr<FFFactoryBase> > reg_; +}; + +struct FFFactoryBase { +  virtual ~FFFactoryBase(); +  virtual boost::shared_ptr<FeatureFunction> Create(const std::string& param) const = 0; +}; + +template<class FF> +class FFFactory : public FFFactoryBase { +  boost::shared_ptr<FeatureFunction> Create(const std::string& param) const { +    return boost::shared_ptr<FeatureFunction>(new FF(param)); +  } +}; + +#endif diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc new file mode 100644 index 00000000..a1dc8b81 --- /dev/null +++ b/decoder/ff_lm.cc @@ -0,0 +1,454 @@ +#include "ff_lm.h" + +#include <sstream> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <netdb.h> + +#include <boost/shared_ptr.hpp> + +#include "tdict.h" +#include "Vocab.h" +#include "Ngram.h" +#include "hg.h" +#include "stringlib.h" + +#ifdef HAVE_RANDLM +#include "RandLM.h" +#endif + +using namespace std; + +namespace NgramCache { +  struct Cache { +    map<WordID, Cache> tree; +    float prob; +    Cache() : prob() {} +  }; +  static Cache cache_; +  void Clear() { cache_.tree.clear(); } +} + +struct LMClient { + +  LMClient(const char* host) : port(6666) { +    strcpy(request_buffer, "prob "); +    s = const_cast<char*>(strchr(host, ':'));  // TODO fix const_cast +    if (s != NULL) { +      *s = '\0'; +      ++s; +      port = atoi(s); +    } +    sock = socket(AF_INET, SOCK_STREAM, 0); +    hp = gethostbyname(host); +    if (hp == NULL) { +      cerr << "unknown host " << host << endl; +      abort(); +    } +    bzero((char *)&server, sizeof(server)); +    bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); +    server.sin_family = hp->h_addrtype; +    server.sin_port = htons(port); + +    int errors = 0; +    while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { +      cerr << "Error: connect()\n"; +      sleep(1); +      errors++; +      if (errors > 3) exit(1); +    } +    cerr << "Connected to LM on " << host << " on port " << port << endl; +  } + +  float wordProb(int word, int* context) { +    NgramCache::Cache* cur = &NgramCache::cache_; +    int i = 0; +    while (context[i] > 0) { +      cur = &cur->tree[context[i++]]; +    } +    cur = &cur->tree[word]; +    if (cur->prob) { return cur->prob; } + +    i = 0; +    int pos = TD::AppendString(word, 5, 16000, request_buffer); +    while (context[i] > 0) { +      assert(pos < 15995); +      
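/* [Editor's note with sketch; not part of this commit.] The NgramCache above
   memoizes server replies in a trie keyed by (context words..., word), one
   tree level per context word. Reduced to a lookup helper (CachedProb is an
   invented name; contexts are terminated by a non-positive WordID):

     float CachedProb(int word, const int* context) {
       NgramCache::Cache* cur = &NgramCache::cache_;
       for (int i = 0; context[i] > 0; ++i)
         cur = &cur->tree[context[i]];  // descends, creating nodes as needed
       return cur->tree[word].prob;     // 0.0f means "not cached yet"
     }

   A reply of exactly 0.0 is indistinguishable from "absent" and would simply
   be re-queried; for negative log-probs that case should not arise. */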
request_buffer[pos] = ' '; +      ++pos; +      pos = TD::AppendString(context[i], pos, 16000, request_buffer); +      ++i; +    } +    assert(pos < 15999); +    request_buffer[pos] = '\n'; +    ++pos; +    request_buffer[pos] = 0; +    write(sock, request_buffer, pos); +    int r = read(sock, res, 6); +    int errors = 0; +    int cnt = 0; +    while (1) { +      if (r < 0) { +        errors++; sleep(1); +        cerr << "Error: read()\n"; +        if (errors > 5) exit(1); +      } else if (r==0 || res[cnt] == '\n') { break; } +      else { +        cnt += r; +        if (cnt==6) break; +        read(sock, &res[cnt], 6-cnt); +      } +    } +    cur->prob = *reinterpret_cast<float*>(res); +    return cur->prob; +  } + + private: +  int sock, port; +  char *s; +  struct hostent *hp; +  struct sockaddr_in server; +  char res[8]; +  char request_buffer[16000]; +}; + +class LanguageModelImpl { + public: +  explicit LanguageModelImpl(int order) : +      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), +      floor_(-100.0), +      client_(), +      kSTART(TD::Convert("<s>")), +      kSTOP(TD::Convert("</s>")), +      kUNKNOWN(TD::Convert("<unk>")), +      kNONE(-1), +      kSTAR(TD::Convert("<{STAR}>")) {} + +  LanguageModelImpl(int order, const string& f) : +      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), +      floor_(-100.0), +      client_(NULL), +      kSTART(TD::Convert("<s>")), +      kSTOP(TD::Convert("</s>")), +      kUNKNOWN(TD::Convert("<unk>")), +      kNONE(-1), +      kSTAR(TD::Convert("<{STAR}>")) { +    if (f.find("lm://") == 0) { +      client_ = new LMClient(f.substr(5).c_str()); +    } else { +      File file(f.c_str(), "r", 0); +      assert(file); +      cerr << "Reading " << order_ << "-gram LM from " << f << endl; +      ngram_.read(file, false); +    } +  } + +  virtual ~LanguageModelImpl() { +    delete client_; +  } + +  inline int StateSize(const void* state) const { +    return *(static_cast<const char*>(state) + state_size_); +  } + +  inline void SetStateSize(int size, void* state) const { +    *(static_cast<char*>(state) + state_size_) = size; +  } + +  virtual double WordProb(int word, int* context) { +    return client_ ? 
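/* [Editor's note; not part of this commit.] The dynamic-programming state used
   by this implementation is a flat byte buffer: up to (order-1)*2+1 WordIDs
   (left-edge words, an optional kSTAR divider, right-edge words) followed by
   one trailing byte recording how many WordIDs are in use; that byte is what
   StateSize() and SetStateSize() above read and write. Worked through for a
   trigram model, assuming the usual 4-byte WordID:

     int order = 3;
     int bytes = ((order - 1) * 2 + 1) * sizeof(WordID) + 1;  // 5*4 + 1 = 21
     // so state_size_ = bytes - 1 = 20 is the offset of the length byte
*/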
+          client_->wordProb(word, context) +        : ngram_.wordProb(word, (VocabIndex*)context); +  } + +  inline double LookupProbForBufferContents(int i) { +//    int k = i; cerr << "P("; while(buffer_[k] > 0) { std::cerr << TD::Convert(buffer_[k++]) << " "; } +    double p = WordProb(buffer_[i], &buffer_[i+1]); +    if (p < floor_) p = floor_; +//    cerr << ")=" << p << endl; +    return p; +  } + +  string DebugStateToString(const void* state) const { +    int len = StateSize(state); +    const int* astate = reinterpret_cast<const int*>(state); +    string res = "["; +    for (int i = 0; i < len; ++i) { +      res += " "; +      res += TD::Convert(astate[i]); +    } +    res += " ]"; +    return res; +  } + +  inline double ProbNoRemnant(int i, int len) { +    int edge = len; +    bool flag = true; +    double sum = 0.0; +    while (i >= 0) { +      if (buffer_[i] == kSTAR) { +        edge = i; +        flag = false; +      } else if (buffer_[i] <= 0) { +        edge = i; +        flag = true; +      } else { +        if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART))) +          sum += LookupProbForBufferContents(i); +      } +      --i; +    } +    return sum; +  } + +  double EstimateProb(const vector<WordID>& phrase) { +    int len = phrase.size(); +    buffer_.resize(len + 1); +    buffer_[len] = kNONE; +    int i = len - 1; +    for (int j = 0; j < len; ++j,--i) +      buffer_[i] = phrase[j]; +    return ProbNoRemnant(len - 1, len); +  } + +  double EstimateProb(const void* state) { +    int len = StateSize(state); +    // cerr << "residual len: " << len << endl; +    buffer_.resize(len + 1); +    buffer_[len] = kNONE; +    const int* astate = reinterpret_cast<const int*>(state); +    int i = len - 1; +    for (int j = 0; j < len; ++j,--i) +      buffer_[i] = astate[j]; +    return ProbNoRemnant(len - 1, len); +  } + +  double FinalTraversalCost(const void* state) { +    int slen = StateSize(state); +    int len = slen + 2; +    // cerr << "residual len: " << len << endl; +    buffer_.resize(len + 1); +    buffer_[len] = kNONE; +    buffer_[len-1] = kSTART; +    const int* astate = reinterpret_cast<const int*>(state); +    int i = len - 2; +    for (int j = 0; j < slen; ++j,--i) +      buffer_[i] = astate[j]; +    buffer_[i] = kSTOP; +    assert(i == 0); +    return ProbNoRemnant(len - 1, len); +  } + +  double LookupWords(const TRule& rule, const vector<const void*>& ant_states, void* vstate) { +    int len = rule.ELength() - rule.Arity(); +    for (int i = 0; i < ant_states.size(); ++i) +      len += StateSize(ant_states[i]); +    buffer_.resize(len + 1); +    buffer_[len] = kNONE; +    int i = len - 1; +    const vector<WordID>& e = rule.e(); +    for (int j = 0; j < e.size(); ++j) { +      if (e[j] < 1) { +        const int* astate = reinterpret_cast<const int*>(ant_states[-e[j]]); +        int slen = StateSize(astate); +        for (int k = 0; k < slen; ++k) +          buffer_[i--] = astate[k]; +      } else { +        buffer_[i--] = e[j]; +      } +    } + +    double sum = 0.0; +    int* remnant = reinterpret_cast<int*>(vstate); +    int j = 0; +    i = len - 1; +    int edge = len; + +    while (i >= 0) { +      if (buffer_[i] == kSTAR) { +        edge = i; +      } else if (edge-i >= order_) { +        sum += LookupProbForBufferContents(i); +      } else if (edge == len && remnant) { +        remnant[j++] = buffer_[i]; +      } +      --i; +    } +    if (!remnant) return sum; + +    if (edge != len || len >= order_) { +      remnant[j++] = 
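/* [Editor's note; example added for clarity, not in the commit.] kSTAR (the
   token <{STAR}>) marks the elided, fully-scored middle of a span. With order
   3 and a 5-word yield "a b c d e", the loop above scores P(c|a b), P(d|b c)
   and P(e|c d) immediately, and the remnant written here becomes

       a b <{STAR}> d e

   "a b" still need outside left context, and "d e" serve as the right context
   for whatever the parent edge appends (compare the expected state strings in
   ff_test.cc below). */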
kSTAR; +      if (order_-1 < edge) edge = order_-1; +      for (int i = edge-1; i >= 0; --i) +        remnant[j++] = buffer_[i]; +    } + +    SetStateSize(j, vstate); +    return sum; +  } + +  static int OrderToStateSize(int order) { +    return ((order-1) * 2 + 1) * sizeof(WordID) + 1; +  } + + protected: +  Ngram ngram_; +  vector<WordID> buffer_; +  const int order_; +  const int state_size_; +  const double floor_; + private: +  LMClient* client_; + + public: +  const WordID kSTART; +  const WordID kSTOP; +  const WordID kUNKNOWN; +  const WordID kNONE; +  const WordID kSTAR; +}; + +LanguageModel::LanguageModel(const string& param) : +    fid_(FD::Convert("LanguageModel")) { +  vector<string> argv; +  int argc = SplitOnWhitespace(param, &argv); +  int order = 3; +  // TODO add support for -n FeatureName +  string filename; +  if (argc < 1) { cerr << "LanguageModel requires a filename, minimally!\n"; abort(); } +  else if (argc == 1) { filename = argv[0]; } +  else if (argc == 2 || argc > 3) { cerr << "Don't understand 'LanguageModel " << param << "'\n"; } +  else if (argc == 3) { +    if (argv[0] == "-o") { +      order = atoi(argv[1].c_str()); +      filename = argv[2]; +    } else if (argv[1] == "-o") { +      order = atoi(argv[2].c_str()); +      filename = argv[0]; +    } +  } +  SetStateSize(LanguageModelImpl::OrderToStateSize(order)); +  pimpl_ = new LanguageModelImpl(order, filename); +} + +LanguageModel::~LanguageModel() { +  delete pimpl_; +} + +string LanguageModel::DebugStateToString(const void* state) const{ +  return pimpl_->DebugStateToString(state); +} + +void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                          const Hypergraph::Edge& edge, +                                          const vector<const void*>& ant_states, +                                          SparseVector<double>* features, +                                          SparseVector<double>* estimated_features, +                                          void* state) const { +  (void) smeta; +  features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state)); +  estimated_features->set_value(fid_, pimpl_->EstimateProb(state)); +} + +void LanguageModel::FinalTraversalFeatures(const void* ant_state, +                                           SparseVector<double>* features) const { +  features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); +} + +#ifdef HAVE_RANDLM +struct RandLMImpl : public LanguageModelImpl { +  RandLMImpl(int order, randlm::RandLM* rlm) : +      LanguageModelImpl(order), +      rlm_(rlm), +      oov_(rlm->getWordID(rlm->getOOV())), +      rb_(1000, oov_) { +    map<int, randlm::WordID> map_cdec2randlm; +    int max_wordid = 0; +    for(map<randlm::Word, randlm::WordID>::const_iterator it = rlm->vocabStart(); +        it != rlm->vocabEnd(); ++it) { +      const int cur = TD::Convert(it->first); +      map_cdec2randlm[TD::Convert(it->first)] = it->second; +      if (cur > max_wordid) max_wordid = cur; +    } +    cdec2randlm_.resize(max_wordid + 1, oov_); +    for (map<int, randlm::WordID>::iterator it = map_cdec2randlm.begin(); +         it != map_cdec2randlm.end(); ++it) +      cdec2randlm_[it->first] = it->second; +    map_cdec2randlm.clear(); +  } + +  inline randlm::WordID Convert2RandLM(int w) { +    return (w < cdec2randlm_.size() ? 
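/* [Editor's note; not part of this commit.] Parameter strings accepted by the
   LanguageModel constructor above, for reference (file names illustrative):

     LanguageModel lm1("file.3gram.lm");        // default order 3
     LanguageModel lm2("-o 5 file.5gram.lm");   // explicit order, flag first
     LanguageModel lm3("file.5gram.lm -o 5");   // flag may also follow
     LanguageModel lm4("lm://host:6666");       // remote LM via LMClient

   Note the two-token form only warns ("Don't understand ...") and leaves the
   filename empty, so the error surfaces later as a failed File open. */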
cdec2randlm_[w] : oov_); +  } + +  virtual double WordProb(int word, int* context) { +    int i = order_; +    int c = 1; +    rb_[i] = Convert2RandLM(word); +    while (i > 1 && *context > 0) { +      --i; +      rb_[i] = Convert2RandLM(*context); +      ++context; +      ++c; +    } +    const void* finalState = 0; +    int found; +    //cerr << "I = " << i << endl; +    return rlm_->getProb(&rb_[i], c, &found, &finalState); +  } + private: +  boost::shared_ptr<randlm::RandLM> rlm_; +  randlm::WordID oov_; +  vector<randlm::WordID> cdec2randlm_; +  vector<randlm::WordID> rb_; +}; + +LanguageModelRandLM::LanguageModelRandLM(const string& param) : +    fid_(FD::Convert("RandLM")) { +  vector<string> argv; +  int argc = SplitOnWhitespace(param, &argv); +  int order = 3; +  // TODO add support for -n FeatureName +  string filename; +  if (argc < 1) { cerr << "RandLM requires a filename, minimally!\n"; abort(); } +  else if (argc == 1) { filename = argv[0]; } +  else if (argc == 2 || argc > 3) { cerr << "Don't understand 'RandLM " << param << "'\n"; } +  else if (argc == 3) { +    if (argv[0] == "-o") { +      order = atoi(argv[1].c_str()); +      filename = argv[2]; +    } else if (argv[1] == "-o") { +      order = atoi(argv[2].c_str()); +      filename = argv[0]; +    } +  } +  SetStateSize(LanguageModelImpl::OrderToStateSize(order)); +  int cache_MB = 200; // increase cache size +  randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB); +  assert(rlm != NULL); +  pimpl_ = new RandLMImpl(order, rlm); +} + +LanguageModelRandLM::~LanguageModelRandLM() { +  delete pimpl_; +} + +void LanguageModelRandLM::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                          const Hypergraph::Edge& edge, +                                          const vector<const void*>& ant_states, +                                          SparseVector<double>* features, +                                          SparseVector<double>* estimated_features, +                                          void* state) const { +  (void) smeta; +  features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state)); +  estimated_features->set_value(fid_, pimpl_->EstimateProb(state)); +} + +void LanguageModelRandLM::FinalTraversalFeatures(const void* ant_state, +                                           SparseVector<double>* features) const { +  features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); +} + +#endif + diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h new file mode 100644 index 00000000..45fc1da7 --- /dev/null +++ b/decoder/ff_lm.h @@ -0,0 +1,55 @@ +#ifndef _LM_FF_H_ +#define _LM_FF_H_ + +#include <vector> +#include <string> + +#include "hg.h" +#include "ff.h" +#include "config.h" + +class LanguageModelImpl; + +class LanguageModel : public FeatureFunction { + public: +  // param = "filename.lm [-o n]" +  LanguageModel(const std::string& param); +  ~LanguageModel(); +  virtual void FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const; +  std::string DebugStateToString(const void* state) const; + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +        
                             void* out_context) const; + private: +  const int fid_; +  mutable LanguageModelImpl* pimpl_; +}; + +#ifdef HAVE_RANDLM +class LanguageModelRandLM : public FeatureFunction { + public: +  // param = "filename.lm [-o n]" +  LanguageModelRandLM(const std::string& param); +  ~LanguageModelRandLM(); +  virtual void FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const; +  std::string DebugStateToString(const void* state) const; + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  const int fid_; +  mutable LanguageModelImpl* pimpl_; +}; +#endif + +#endif diff --git a/decoder/ff_tagger.cc b/decoder/ff_tagger.cc new file mode 100644 index 00000000..7a9d1def --- /dev/null +++ b/decoder/ff_tagger.cc @@ -0,0 +1,96 @@ +#include "ff_tagger.h" + +#include "tdict.h" +#include "sentence_metadata.h" + +#include <sstream> + +using namespace std; + +Tagger_BigramIdentity::Tagger_BigramIdentity(const std::string& param) : +  FeatureFunction(sizeof(WordID)) {} + +void Tagger_BigramIdentity::FireFeature(const WordID& left, +                                 const WordID& right, +                                 SparseVector<double>* features) const { +  int& fid = fmap_[left][right]; +  if (!fid) { +    ostringstream os; +    if (right == 0) { +      os << "Uni:" << TD::Convert(left); +    } else { +      os << "Bi:"; +      if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } +      os << '_'; +      if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } +    } +    fid = FD::Convert(os.str()); +  } +  features->set_value(fid, 1.0); +} + +void Tagger_BigramIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const { +  WordID& out_context = *static_cast<WordID*>(context); +  const int arity = edge.Arity(); +  if (arity == 0) { +    out_context = edge.rule_->e_[0]; +    FireFeature(out_context, 0, features); +  } else if (arity == 2) { +    WordID left = *static_cast<const WordID*>(ant_contexts[0]); +    WordID right = *static_cast<const WordID*>(ant_contexts[1]); +    if (edge.i_ == 0 && edge.j_ == 2) +      FireFeature(-1, left, features); +    FireFeature(left, right, features); +    if (edge.i_ == 0 && edge.j_ == smeta.GetSourceLength()) +      FireFeature(right, -1, features); +    out_context = right; +  } +} + +LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {} + +void LexicalPairIdentity::FireFeature(WordID src, +                                 WordID trg, +                                 SparseVector<double>* features) const { +  int& fid = fmap_[src][trg]; +  if (!fid) { +    static map<WordID, WordID> escape; +    if (escape.empty()) { +      escape[TD::Convert("=")] = TD::Convert("__EQ"); +      
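/* [Editor's note with sketch; not part of this commit.] The escapes here and
   just below presumably keep generated feature names parseable in weight
   files, where '=' and similar characters act as delimiters: the pair
   (",", "and") becomes "Id:__CO:and" rather than "Id:,:and". Both tagger
   features also share the same lazy interning idiom, reduced here (GetFid is
   an invented name; fmap_ must be mutable because the traversal is const):

     int GetFid(WordID a, WordID b) const {
       int& fid = fmap_[a][b];          // value-initialized to 0 on first use
       if (!fid) {
         std::ostringstream os;
         os << "Pair:" << TD::Convert(a) << ':' << TD::Convert(b);
         fid = FD::Convert(os.str());   // intern the name exactly once
       }
       return fid;
     }
*/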
escape[TD::Convert(";")] = TD::Convert("__SC"); +      escape[TD::Convert(",")] = TD::Convert("__CO"); +    } +    if (escape.count(src)) src = escape[src]; +    if (escape.count(trg)) trg = escape[trg]; +    ostringstream os; +    os << "Id:" << TD::Convert(src) << ':' << TD::Convert(trg); +    fid = FD::Convert(os.str()); +  } +  features->set_value(fid, 1.0); +} + +void LexicalPairIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const { +  const vector<WordID>& ew = edge.rule_->e_; +  const vector<WordID>& fw = edge.rule_->f_; +  for (int i = 0; i < ew.size(); ++i) { +    const WordID& e = ew[i]; +    if (e <= 0) continue; +    for (int j = 0; j < fw.size(); ++j) { +      const WordID& f = fw[j]; +      if (f <= 0) continue; +      FireFeature(f, e, features); +    } +  } +} + + diff --git a/decoder/ff_tagger.h b/decoder/ff_tagger.h new file mode 100644 index 00000000..41c3ee5b --- /dev/null +++ b/decoder/ff_tagger.h @@ -0,0 +1,51 @@ +#ifndef _FF_TAGGER_H_ +#define _FF_TAGGER_H_ + +#include <map> +#include "ff.h" + +typedef std::map<WordID, int> Class2FID; +typedef std::map<WordID, Class2FID> Class2Class2FID; + +// the reason this is a "tagger" feature is that it assumes that +// the sequence unfolds from left to right, which means it doesn't +// have to split states based on left context. +// fires unigram features as well +class Tagger_BigramIdentity : public FeatureFunction { + public: +  Tagger_BigramIdentity(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const; + private: +  void FireFeature(const WordID& left, +                   const WordID& right, +                   SparseVector<double>* features) const; +  mutable Class2Class2FID fmap_; +}; + +// for each pair of symbols cooccuring in a lexicalized rule, fire +// a feature (mostly used for tagging, but could be used for any model) +class LexicalPairIdentity : public FeatureFunction { + public: +  LexicalPairIdentity(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const; + private: +  void FireFeature(WordID src, +                   WordID trg, +                   SparseVector<double>* features) const; +  mutable Class2Class2FID fmap_; +}; + + +#endif diff --git a/decoder/ff_test.cc b/decoder/ff_test.cc new file mode 100644 index 00000000..9e640517 --- /dev/null +++ b/decoder/ff_test.cc @@ -0,0 +1,64 @@ +#include <cassert> +#include <iostream> 
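/* [Editor's note; worked example, not in the commit.] LexicalPairIdentity
   above fires one indicator per lexical (f, e) pair in a rule, i.e. the full
   cross product of the two sides, skipping nonterminals (ids <= 0). For a
   hypothetical rule

     [X] ||| ne [X,1] pas ||| do [X,1] not

   it fires Id:ne:do, Id:ne:not, Id:pas:do and Id:pas:not, each with value
   1.0. The unit test that follows exercises the LM feature's state handling
   rather than these tagger features. */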
+#include <fstream> +#include <vector> +#include <gtest/gtest.h> +#include "hg.h" +#include "ff_lm.h" +#include "ff.h" +#include "trule.h" +#include "sentence_metadata.h" + +using namespace std; + +LanguageModel* lm_ = NULL; +LanguageModel* lm3_ = NULL; + +class FFTest : public testing::Test { + public: +  FFTest() : smeta(0,Lattice()) { +    if (!lm_) { +      static LanguageModel slm("-o 2 ./test_data/test_2gram.lm.gz"); +      lm_ = &slm; +      static LanguageModel slm3("./test_data/dummy.3gram.lm -o 3"); +      lm3_ = &slm3; +    } +  } + protected: +  virtual void SetUp() { } +  virtual void TearDown() { } +  SentenceMetadata smeta; +}; +        +TEST_F(FFTest, LM3) { +  int x = lm3_->NumBytesContext(); +  Hypergraph::Edge edge1; +  edge1.rule_.reset(new TRule("[X] ||| x y ||| one ||| 1.0 -2.4 3.0")); +  Hypergraph::Edge edge2; +  edge2.rule_.reset(new TRule("[X] ||| [X,1] a ||| [X,1] two ||| 1.0 -2.4 3.0")); +  Hypergraph::Edge edge3; +  edge3.rule_.reset(new TRule("[X] ||| [X,1] a ||| zero [X,1] two ||| 1.0 -2.4 3.0")); +  vector<const void*> ants1; +  string state(x, '\0'); +  SparseVector<double> feats; +  SparseVector<double> est; +  lm3_->TraversalFeatures(smeta, edge1, ants1, &feats, &est, (void *)&state[0]); +  cerr << "returned " << feats << endl; +  cerr << edge1.feature_values_ << endl; +  cerr << lm3_->DebugStateToString((const void*)&state[0]) << endl; +  EXPECT_EQ("[ one ]", lm3_->DebugStateToString((const void*)&state[0])); +  ants1.push_back((const void*)&state[0]); +  string state2(x, '\0'); +  lm3_->TraversalFeatures(smeta, edge2, ants1, &feats, &est, (void *)&state2[0]); +  cerr << lm3_->DebugStateToString((const void*)&state2[0]) << endl; +  EXPECT_EQ("[ one two ]", lm3_->DebugStateToString((const void*)&state2[0])); +  string state3(x, '\0'); +  lm3_->TraversalFeatures(smeta, edge3, ants1, &feats, &est, (void *)&state3[0]); +  cerr << lm3_->DebugStateToString((const void*)&state3[0]) << endl; +  EXPECT_EQ("[ zero one <{STAR}> one two ]", lm3_->DebugStateToString((const void*)&state3[0])); +} + +int main(int argc, char **argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc new file mode 100644 index 00000000..669aa530 --- /dev/null +++ b/decoder/ff_wordalign.cc @@ -0,0 +1,445 @@ +#include "ff_wordalign.h" + +#include <set> +#include <sstream> +#include <string> +#include <cmath> + +#include "stringlib.h" +#include "sentence_metadata.h" +#include "hg.h" +#include "fdict.h" +#include "aligner.h" +#include "tdict.h"   // Blunsom hack +#include "filelib.h" // Blunsom hack + +static const int MAX_SENTENCE_SIZE = 100; + +using namespace std; + +Model2BinaryFeatures::Model2BinaryFeatures(const string& param) : +    fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { +  for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { +    for (int j = 0; j < i; ++j) { +      for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) { +        int& val = fids_[i][j][k]; +        val = -1; +        if (j < i) { +          ostringstream os; +          os << "M2FL:" << i << ":TI:" << k << "_SI:" << j; +          val = FD::Convert(os.str()); +        } +      } +    } +  } +} + +void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                                 const Hypergraph::Edge& edge, +                                                 const vector<const void*>& ant_states, +                                                 SparseVector<double>* 
features, +                                                 SparseVector<double>* estimated_features, +                                                 void* state) const { +  // if the source word is either null or the generated word +  // has no position in the reference +  if (edge.i_ == -1 || edge.prev_i_ == -1) +    return; + +  assert(smeta.GetTargetLength() > 0); +  const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_]; +  features->set_value(fid, 1.0); +//  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; +} + + +RelativeSentencePosition::RelativeSentencePosition(const string& param) : +    fid_(FD::Convert("RelativeSentencePosition")) { +  if (!param.empty()) { +    cerr << "  Loading word classes from " << param << endl; +    condition_on_fclass_ = true; +    ReadFile rf(param); +    istream& in = *rf.stream(); +    set<WordID> classes; +    while(in) { +      string line; +      getline(in, line); +      if (line.empty()) continue; +      vector<WordID> v; +      TD::ConvertSentence(line, &v); +      pos_.push_back(v); +      for (int i = 0; i < v.size(); ++i) +        classes.insert(v[i]); +      for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) { +        ostringstream os; +        os << "RelPos_FC:" << TD::Convert(*i); +        fids_[*i] = FD::Convert(os.str()); +      } +    } +  } else { +    condition_on_fclass_ = false; +  } +} + +void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                                     const Hypergraph::Edge& edge, +                                                     const vector<const void*>& ant_states, +                                                     SparseVector<double>* features, +                                                     SparseVector<double>* estimated_features, +                                                     void* state) const { +  // if the source word is either null or the generated word +  // has no position in the reference +  if (edge.i_ == -1 || edge.prev_i_ == -1) +    return; + +  assert(smeta.GetTargetLength() > 0); +  const double val = fabs(static_cast<double>(edge.i_) / smeta.GetSourceLength() - +                          static_cast<double>(edge.prev_i_) / smeta.GetTargetLength()); +  features->set_value(fid_, val); +  if (condition_on_fclass_) { +    assert(smeta.GetSentenceID() < pos_.size()); +    const WordID cur_fclass = pos_[smeta.GetSentenceID()][edge.i_]; +    const int fid = fids_.find(cur_fclass)->second; +    features->set_value(fid, val); +  } +//  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; +} + +MarkovJumpFClass::MarkovJumpFClass(const string& param) : +    FeatureFunction(1), +    fids_(MAX_SENTENCE_SIZE) { +  cerr << "    MarkovJumpFClass" << endl; +  cerr << "Reading source POS tags from " << param << endl; +  ReadFile rf(param); +  istream& in = *rf.stream(); +  set<WordID> classes; +  while(in) { +    string line; +    getline(in, line); +    if (line.empty()) continue; +    vector<WordID> v; +    TD::ConvertSentence(line, &v); +    pos_.push_back(v); +    for (int i = 0; i < v.size(); ++i) +      classes.insert(v[i]); +  } +  cerr << "  (" << pos_.size() << " lines)\n"; +  cerr << "  Classes: " << classes.size() << endl; +  for (int 
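/* [Editor's sketch; not part of this commit.] The value RelativeSentencePosition
   sets above is the classic normalized-diagonal distortion penalty. Isolated
   (RelPos is an invented name; needs <cmath>):

     double RelPos(int src_pos, int src_len, int trg_pos, int trg_len) {
       return fabs(static_cast<double>(src_pos) / src_len -
                   static_cast<double>(trg_pos) / trg_len);
     }

   E.g. aligning source word 2 of 10 to target word 3 of 12 costs
   |0.200 - 0.250| = 0.05; points near the diagonal cost ~0. */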
ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) { +    map<WordID, map<int, int> >& cfids = fids_[ss]; +    for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) { +      map<int, int> &fids = cfids[*i]; +      for (int j = -ss; j <= ss; ++j) { +        ostringstream os; +        os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j; +        fids[j] = FD::Convert(os.str()); +      } +    } +  } +} + +void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, +                                   int prev_src_pos, +                                   int cur_src_pos, +                                   SparseVector<double>* features) const { +  const int jumpsize = cur_src_pos - prev_src_pos; +  assert(smeta.GetSentenceID() < pos_.size()); +  const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; +  const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; +  features->set_value(fid, 1.0); +} + +void MarkovJumpFClass::FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const { +  int left_index = *static_cast<const unsigned char*>(context); +//  int right_index = cur_flen; +  // TODO +} + +void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_states, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* state) const { +  unsigned char& dpstate = *((unsigned char*)state); +  if (edge.Arity() == 0) { +    dpstate = static_cast<unsigned int>(edge.i_); +  } else if (edge.Arity() == 1) { +    dpstate = *((unsigned char*)ant_states[0]); +  } else if (edge.Arity() == 2) { +    int left_index = *((unsigned char*)ant_states[0]); +    int right_index = *((unsigned char*)ant_states[1]); +    if (right_index == -1) +      dpstate = static_cast<unsigned int>(left_index); +    else +      dpstate = static_cast<unsigned int>(right_index); +//    const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index]; +//    cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl; +//    const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; +//    features->set_value(fid, 1.0); +    FireFeature(smeta, left_index, right_index, features); +  } +} + +//  std::vector<std::map<int, int> > flen2jump2fid_; +MarkovJump::MarkovJump(const string& param) : +    FeatureFunction(1), +    fid_(FD::Convert("MarkovJump")), +    binary_params_(false) { +  cerr << "    MarkovJump"; +  vector<string> argv; +  int argc = SplitOnWhitespace(param, &argv); +  if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) { +    cerr << "MarkovJump: expected parameters to be -b or +b\n"; +    exit(1); +  } +  binary_params_ = argv[0] == "+b"; +  if (binary_params_) { +    flen2jump2fid_.resize(MAX_SENTENCE_SIZE); +    for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { +      map<int, int>& jump2fid = flen2jump2fid_[i]; +      for (int jump = -i; jump <= i; ++jump) { +        ostringstream os; +        os << "Jump:FLen:" << i << "_J:" << jump; +        jump2fid[jump] = FD::Convert(os.str()); +      } +    } +  } else { +    cerr << " (Blunsom & Cohn definition)"; +  } +  cerr << endl; +} + +// TODO handle NULLs according 
to Och 2000 +void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                       const Hypergraph::Edge& edge, +                                       const vector<const void*>& ant_states, +                                       SparseVector<double>* features, +                                       SparseVector<double>* estimated_features, +                                       void* state) const { +  unsigned char& dpstate = *((unsigned char*)state); +  const int flen = smeta.GetSourceLength(); +  if (edge.Arity() == 0) { +    dpstate = static_cast<unsigned int>(edge.i_); +    if (edge.prev_i_ == 0) { +      if (binary_params_) { +        // NULL will be tricky +        // TODO initial state distribution, not normal jumps +        const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second; +        features->set_value(fid, 1.0); +      } +    } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) { +        // NULL will be tricky +      if (binary_params_) { +        int jumpsize = flen - edge.i_; +        const int fid = flen2jump2fid_[flen].find(jumpsize)->second; +        features->set_value(fid, 1.0); +      } +    } +  } else if (edge.Arity() == 1) { +    dpstate = *((unsigned char*)ant_states[0]); +  } else if (edge.Arity() == 2) { +    int left_index = *((unsigned char*)ant_states[0]); +    int right_index = *((unsigned char*)ant_states[1]); +    if (right_index == -1) +      dpstate = static_cast<unsigned int>(left_index); +    else +      dpstate = static_cast<unsigned int>(right_index); +    const int jumpsize = right_index - left_index; + +    if (binary_params_) { +      const int fid = flen2jump2fid_[flen].find(jumpsize)->second; +      features->set_value(fid, 1.0); +    } else { +      features->set_value(fid_, fabs(jumpsize - 1));  // Blunsom and Cohn def +    } +  } else { +    assert(!"something really unexpected is happening"); +  } +} + +// state: POS of src word used, number of trg words generated +SourcePOSBigram::SourcePOSBigram(const std::string& param) : +    FeatureFunction(sizeof(WordID) + sizeof(int)) { +  cerr << "Reading source POS tags from " << param << endl; +  ReadFile rf(param); +  istream& in = *rf.stream(); +  while(in) { +    string line; +    getline(in, line); +    if (line.empty()) continue; +    vector<WordID> v; +    TD::ConvertSentence(line, &v); +    pos_.push_back(v); +  } +  cerr << "  (" << pos_.size() << " lines)\n"; +} + +void SourcePOSBigram::FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const { +  WordID left = *static_cast<const WordID*>(context); +  int left_wc = *(static_cast<const int*>(context) + 1); +  if (left_wc == 1) +    FireFeature(-1, left, features); +  FireFeature(left, -1, features); +} + +void SourcePOSBigram::FireFeature(WordID left, +                   WordID right, +                   SparseVector<double>* features) const { +  int& fid = fmap_[left][right]; +  if (!fid) { +    ostringstream os; +    os << "SP:"; +    if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } +    os << '_'; +    if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } +    fid = FD::Convert(os.str()); +    if (fid == 0) fid = -1; +  } +  if (fid < 0) return; +  features->set_value(fid, 1.0); +} + +void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const 
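/* [Editor's note; not part of this commit.] MarkovJump above has two modes:
   with "+b" it fires one binary indicator per (source length, jump size),
   e.g. a jump from source position 7 to 12 in a 20-word sentence fires
   Jump:FLen:20_J:5 with value 1.0; with "-b" it fires the single dense
   feature MarkovJump with value |jump - 1| (the Blunsom & Cohn definition),
   so the monotone step of +1 costs nothing. Also note, in the SourcePOSBigram
   traversal below, that right_wc is read from ant_contexts[0] just like
   left_wc; ant_contexts[1] looks like the intended source. */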
void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const { +  WordID& out_context = *static_cast<WordID*>(context); +  int& out_word_count = *(static_cast<int*>(context) + 1); +  const int arity = edge.Arity(); +  if (arity == 0) { +    assert(smeta.GetSentenceID() < pos_.size()); +    const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()]; +    assert(edge.i_ < pos_sent.size()); +    out_context = pos_sent[edge.i_]; +    out_word_count = edge.rule_->EWords(); +    assert(out_word_count == 1); // this is only defined for lex translation! +    // revisit this if you want to translate into null words +  } else if (arity == 2) { +    WordID left = *static_cast<const WordID*>(ant_contexts[0]); +    WordID right = *static_cast<const WordID*>(ant_contexts[1]); +    int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1); +    int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1); +    if (left_wc == 1 && right_wc == 1) +      FireFeature(-1, left, features); +    FireFeature(left, right, features); +    out_word_count = left_wc + right_wc; +    out_context = right; +  } +} + +AlignerResults::AlignerResults(const std::string& param) : +    cur_sent_(-1), +    cur_grid_(NULL) { +  vector<string> argv; +  int argc = SplitOnWhitespace(param, &argv); +  if (argc != 2) { +    cerr << "Required format: AlignerResults [FeatureName] [file.pharaoh]\n"; +    exit(1); +  } +  cerr << "  feature: " << argv[0] << "\talignments: " << argv[1] << endl; +  fid_ = FD::Convert(argv[0]); +  ReadFile rf(argv[1]); +  istream& in = *rf.stream(); int lc = 0; +  while(in) { +    string line; +    getline(in, line); +    if (!in) break;  +    ++lc; +    is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line)); +  } +  cerr << "  Loaded " << lc << " refs\n"; +} + +void AlignerResults::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                           const Hypergraph::Edge& edge, +                                           const vector<const void*>& ant_states, +                                           SparseVector<double>* features, +                                           SparseVector<double>* estimated_features, +                                           void* state) const { +  if (edge.i_ == -1 || edge.prev_i_ == -1) +    return; + +  if (cur_sent_ != smeta.GetSentenceID()) { +    assert(smeta.HasReference()); +    cur_sent_ = smeta.GetSentenceID(); +    assert(cur_sent_ < is_aligned_.size()); +    cur_grid_ = is_aligned_[cur_sent_].get(); +  } + +  //cerr << edge.rule_->AsString() << endl; + +  int j = edge.i_;        // source side (f) +  int i = edge.prev_i_;   // target side (e) +  if (j < cur_grid_->height() && i < cur_grid_->width() && (*cur_grid_)(i, j)) { +//    if (edge.rule_->e_[0] == smeta.GetReference()[i][0].label) { +      features->set_value(fid_, 1.0); +//      cerr << edge.rule_->AsString() << "   (" << i << "," << j << ")\n"; +//    } +  } +} + +BlunsomSynchronousParseHack::BlunsomSynchronousParseHack(const string& param) : +  FeatureFunction((100 / 8) + 1), fid_(FD::Convert("NotRef")), cur_sent_(-1) { +  ReadFile rf(param); +  istream& in = *rf.stream(); int lc = 0; +  while(in) { +    string line; +    getline(in, line); +    if (!in) break;  +    ++lc; +    refs_.push_back(vector<WordID>()); +    TD::ConvertSentence(line, &refs_.back()); +  } +  cerr << "  
Loaded " << lc << " refs\n"; +} + +void BlunsomSynchronousParseHack::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                           const Hypergraph::Edge& edge, +                                           const vector<const void*>& ant_states, +                                           SparseVector<double>* features, +                                           SparseVector<double>* estimated_features, +                                           void* state) const { +  if (cur_sent_ != smeta.GetSentenceID()) { +    // assert(smeta.HasReference()); +    cur_sent_ = smeta.GetSentenceID(); +    assert(cur_sent_ < refs_.size()); +    cur_ref_ = &refs_[cur_sent_]; +    cur_map_.clear(); +    for (int i = 0; i < cur_ref_->size(); ++i) { +      vector<WordID> phrase; +      for (int j = i; j < cur_ref_->size(); ++j) { +        phrase.push_back((*cur_ref_)[j]); +        cur_map_[phrase] = i; +      } +    } +  } +  //cerr << edge.rule_->AsString() << endl; +  for (int i = 0; i < ant_states.size(); ++i) { +    if (DoesNotBelong(ant_states[i])) { +      //cerr << "  ant " << i << " does not belong\n"; +      return; +    } +  } +  vector<vector<WordID> > ants(ant_states.size()); +  vector<const vector<WordID>* > pants(ant_states.size()); +  for (int i = 0; i < ant_states.size(); ++i) { +    AppendAntecedentString(ant_states[i], &ants[i]); +    //cerr << "  ant[" << i << "]: " << ((int)*(static_cast<const unsigned char*>(ant_states[i]))) << " " << TD::GetString(ants[i]) << endl; +    pants[i] = &ants[i]; +  } +  vector<WordID> yield; +  edge.rule_->ESubstitute(pants, &yield); +  //cerr << "YIELD: " << TD::GetString(yield) << endl; +  Vec2Int::iterator it = cur_map_.find(yield); +  if (it == cur_map_.end()) { +    features->set_value(fid_, 1); +    //cerr << "  BAD!\n"; +    return; +  } +  SetStateMask(it->second, it->second + yield.size(), state); +} + diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h new file mode 100644 index 00000000..c44ad26b --- /dev/null +++ b/decoder/ff_wordalign.h @@ -0,0 +1,196 @@ +#ifndef _FF_WORD_ALIGN_H_ +#define _FF_WORD_ALIGN_H_ + +#include "ff.h" +#include "array2d.h" + +#include <boost/multi_array.hpp> + +class RelativeSentencePosition : public FeatureFunction { + public: +  RelativeSentencePosition(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  const int fid_; +  bool condition_on_fclass_; +  std::vector<std::vector<WordID> > pos_; +  std::map<WordID, int> fids_;  // fclass -> fid +}; + +class Model2BinaryFeatures : public FeatureFunction { + public: +  Model2BinaryFeatures(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  boost::multi_array<int, 3> fids_; 
+}; + +class MarkovJump : public FeatureFunction { + public: +  MarkovJump(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  const int fid_; +  bool binary_params_; +  std::vector<std::map<int, int> > flen2jump2fid_; +}; + +class MarkovJumpFClass : public FeatureFunction { + public: +  MarkovJumpFClass(const std::string& param); +  virtual void FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const; + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const; + +  void FireFeature(const SentenceMetadata& smeta, +                   int prev_src_pos, +                   int cur_src_pos, +                   SparseVector<double>* features) const; + + private: +  std::vector<std::map<WordID, std::map<int, int> > > fids_;  // flen -> fclass -> jumpsize -> fid +  std::vector<std::vector<WordID> > pos_; +}; + +typedef std::map<WordID, int> Class2FID; +typedef std::map<WordID, Class2FID> Class2Class2FID; +class SourcePOSBigram : public FeatureFunction { + public: +  SourcePOSBigram(const std::string& param); +  virtual void FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const; + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const; + private: +  void FireFeature(WordID src, +                   WordID trg, +                   SparseVector<double>* features) const; +  mutable Class2Class2FID fmap_; +  std::vector<std::vector<WordID> > pos_; +}; + +class AlignerResults : public FeatureFunction { + public: +  AlignerResults(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  int fid_; +  std::vector<boost::shared_ptr<Array2D<bool> > > is_aligned_; +  mutable int cur_sent_; +  const Array2D<bool> mutable* cur_grid_; +}; + +#include <tr1/unordered_map> +#include <boost/functional/hash.hpp> +#include <cassert> +class BlunsomSynchronousParseHack : public 
FeatureFunction { + public: +  BlunsomSynchronousParseHack(const std::string& param); + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  inline bool DoesNotBelong(const void* state) const { +    for (int i = 0; i < NumBytesContext(); ++i) { +      if (*(static_cast<const unsigned char*>(state) + i)) return false; +    } +    return true; +  } + +  inline void AppendAntecedentString(const void* state, std::vector<WordID>* yield) const { +    int i = 0; +    int ind = 0; +    while (i < NumBytesContext() && !(*(static_cast<const unsigned char*>(state) + i))) { ++i; ind += 8; } +    // std::cerr << i << " " << NumBytesContext() << std::endl; +    assert(i != NumBytesContext()); +    assert(ind < cur_ref_->size()); +    int cur = *(static_cast<const unsigned char*>(state) + i); +    int comp = 1; +    while (comp < 256 && (comp & cur) == 0) { comp <<= 1; ++ind; } +    assert(ind < cur_ref_->size()); +    assert(comp < 256); +    do { +      assert(ind < cur_ref_->size()); +      yield->push_back((*cur_ref_)[ind]); +      ++ind; +      comp <<= 1; +      if (comp == 256) { +        comp = 1; +        ++i; +        cur = *(static_cast<const unsigned char*>(state) + i); +      } +    } while (comp & cur); +  } + +  inline void SetStateMask(int start, int end, void* state) const { +    assert((end / 8) < NumBytesContext()); +    int i = 0; +    int comp = 1; +    for (int j = 0; j < start; ++j) { +      comp <<= 1; +      if (comp == 256) { +        ++i; +        comp = 1; +      } +    } +    //std::cerr << "SM: " << i << "\n"; +    for (int j = start; j < end; ++j) { +      *(static_cast<unsigned char*>(state) + i) |= comp; +      //std::cerr << "  " << comp << "\n"; +      comp <<= 1; +      if (comp == 256) { +        ++i; +        comp = 1; +      } +    } +    //std::cerr << "   MASK: " << ((int)*(static_cast<unsigned char*>(state))) << "\n"; +  } + +  const int fid_; +  mutable int cur_sent_; +  typedef std::tr1::unordered_map<std::vector<WordID>, int, boost::hash<std::vector<WordID> > > Vec2Int; +  mutable Vec2Int cur_map_; +  const std::vector<WordID> mutable * cur_ref_; +  mutable std::vector<std::vector<WordID> > refs_; +}; + +#endif diff --git a/decoder/filelib.cc b/decoder/filelib.cc new file mode 100644 index 00000000..79ad2847 --- /dev/null +++ b/decoder/filelib.cc @@ -0,0 +1,22 @@ +#include "filelib.h" + +#include <unistd.h> +#include <sys/stat.h> + +using namespace std; + +bool FileExists(const std::string& fn) { +  struct stat info; +  int s = stat(fn.c_str(), &info); +  return (s==0); +} + +bool DirectoryExists(const string& dir) { +  if (access(dir.c_str(),0) == 0) { +    struct stat status; +    stat(dir.c_str(), &status); +    if (status.st_mode & S_IFDIR) return true; +  } +  return false; +} + diff --git a/decoder/filelib.h b/decoder/filelib.h new file mode 100644 index 00000000..03c22b0d --- /dev/null +++ b/decoder/filelib.h @@ -0,0 +1,70 @@ +#ifndef _FILELIB_H_ +#define _FILELIB_H_ + +#include <cassert> +#include <string> +#include <iostream> +#include <cstdlib> +#include "gzstream.h" + +bool FileExists(const std::string& file_name); +bool DirectoryExists(const 
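/* [Editor's sketch; not part of this commit.] ReadFile and WriteFile below
   are small RAII wrappers: "-" maps to std::cin/std::cout, a ".gz" suffix
   selects the gzstream classes, anything else a plain fstream, and the
   destructor deletes the stream unless it is the standard one. Typical use
   (file name illustrative):

     ReadFile rf("corpus.txt.gz");
     std::istream& in = *rf.stream();
     std::string line;
     while (getline(in, line)) {
       // process line; the stream is closed/freed when rf leaves scope
     }
*/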
std::string& dir_name); + +// reads from standard in if filename is - +// uncompresses if file ends with .gz +// otherwise, reads from a normal file +class ReadFile { + public: +  ReadFile(const std::string& filename) : +    no_delete_on_exit_(filename == "-"), +    in_(no_delete_on_exit_ ? static_cast<std::istream*>(&std::cin) : +      (EndsWith(filename, ".gz") ? +        static_cast<std::istream*>(new igzstream(filename.c_str())) : +        static_cast<std::istream*>(new std::ifstream(filename.c_str())))) { +    if (!no_delete_on_exit_ && !FileExists(filename)) { +      std::cerr << "File does not exist: " << filename << std::endl; +      abort(); +    } +    if (!*in_) { +      std::cerr << "Failed to open " << filename << std::endl; +      abort(); +    } +  } +  ~ReadFile() { +    if (!no_delete_on_exit_) delete in_; +  } + +  inline std::istream* stream() { return in_; } +   + private: +  static bool EndsWith(const std::string& f, const std::string& suf) { +    return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); +  } +  const bool no_delete_on_exit_; +  std::istream* const in_; +}; + +class WriteFile { + public: +  WriteFile(const std::string& filename) : +    no_delete_on_exit_(filename == "-"), +    out_(no_delete_on_exit_ ? static_cast<std::ostream*>(&std::cout) : +      (EndsWith(filename, ".gz") ? +        static_cast<std::ostream*>(new ogzstream(filename.c_str())) : +        static_cast<std::ostream*>(new std::ofstream(filename.c_str())))) {} +  ~WriteFile() { +    (*out_) << std::flush; +    if (!no_delete_on_exit_) delete out_; +  } + +  inline std::ostream* stream() { return out_; } +   + private: +  static bool EndsWith(const std::string& f, const std::string& suf) { +    return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); +  } +  const bool no_delete_on_exit_; +  std::ostream* const out_; +}; + +#endif diff --git a/decoder/forest_writer.cc b/decoder/forest_writer.cc new file mode 100644 index 00000000..a9117d18 --- /dev/null +++ b/decoder/forest_writer.cc @@ -0,0 +1,23 @@ +#include "forest_writer.h" + +#include <iostream> + +#include <boost/lexical_cast.hpp> + +#include "filelib.h" +#include "hg_io.h" +#include "hg.h" + +using namespace std; + +ForestWriter::ForestWriter(const std::string& path, int num) : +  fname_(path + '/' + boost::lexical_cast<string>(num) + ".json.gz"), used_(false) {} + +bool ForestWriter::Write(const Hypergraph& forest, bool minimal_rules) { +  assert(!used_); +  used_ = true; +  cerr << "  Writing forest to " << fname_ << endl; +  WriteFile wf(fname_); +  return HypergraphIO::WriteToJSON(forest, minimal_rules, wf.stream()); +} + diff --git a/decoder/forest_writer.h b/decoder/forest_writer.h new file mode 100644 index 00000000..819a8940 --- /dev/null +++ b/decoder/forest_writer.h @@ -0,0 +1,16 @@ +#ifndef _FOREST_WRITER_H_ +#define _FOREST_WRITER_H_ + +#include <string> + +class Hypergraph; + +struct ForestWriter { +  ForestWriter(const std::string& path, int num); +  bool Write(const Hypergraph& forest, bool minimal_rules); + +  const std::string fname_; +  bool used_; +}; + +#endif diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc new file mode 100644 index 00000000..9e25d346 --- /dev/null +++ b/decoder/freqdict.cc @@ -0,0 +1,29 @@ +#include <iostream> +#include <fstream> +#include <cassert> +#include "freqdict.h" +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +void FreqDict::Load(const std::string& fname) { +  cerr << "Reading word frequencies: " << fname << endl; +  
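/* [Editor's note; format inferred from the loader, not stated in the commit.]
   The word-frequency file read below is whitespace-separated (word, value)
   pairs, tokens beginning with '#' skipped, values required positive; per the
   assert's comment they are -log frequencies, e.g. (values illustrative):

     und   4.1
     haus  7.9

   which matches the thresholds in ff_csplit.cc above (freq < 7 fires the
   high-frequency feature; 99.0 is the OOV sentinel). */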
ReadFile rf(fname); +  istream& ifs = *rf.stream(); +  int cc=0; +  while (ifs) { +    std::string word; +    ifs >> word; +    if (word.size() == 0) continue; +    if (word[0] == '#') continue; +    double count = 0; +    ifs >> count; +    assert(count > 0.0);  // use -log(f) +    counts_[TD::Convert(word)]=count; +    ++cc; +    if (cc % 10000 == 0) { std::cerr << "."; } +  } +  std::cerr << "\n"; +  std::cerr << "Loaded " << cc << " words\n"; +} diff --git a/decoder/freqdict.h b/decoder/freqdict.h new file mode 100644 index 00000000..9acf0c33 --- /dev/null +++ b/decoder/freqdict.h @@ -0,0 +1,20 @@ +#ifndef _FREQDICT_H_ +#define _FREQDICT_H_ + +#include <map> +#include <string> +#include "wordid.h" + +class FreqDict { + public: +  void Load(const std::string& fname); +  float LookUp(const WordID& word) const { +    std::map<WordID,float>::const_iterator i = counts_.find(word); +    if (i == counts_.end()) return 0; +    return i->second; +  } + private: +  std::map<WordID, float> counts_; +}; + +#endif diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc new file mode 100644 index 00000000..38dbd717 --- /dev/null +++ b/decoder/fst_translator.cc @@ -0,0 +1,91 @@ +#include "translator.h" + +#include <sstream> +#include <boost/shared_ptr.hpp> + +#include "sentence_metadata.h" +#include "filelib.h" +#include "hg.h" +#include "hg_io.h" +#include "earley_composer.h" +#include "phrasetable_fst.h" +#include "tdict.h" + +using namespace std; + +struct FSTTranslatorImpl { +  FSTTranslatorImpl(const boost::program_options::variables_map& conf) : +      goal_sym(conf["goal"].as<string>()), +      kGOAL_RULE(new TRule("[Goal] ||| [" + goal_sym + ",1] ||| [1]")), +      kGOAL(TD::Convert("Goal") * -1), +      add_pass_through_rules(conf.count("add_pass_through_rules")) { +    fst.reset(LoadTextPhrasetable(conf["grammar"].as<vector<string> >())); +    ec.reset(new EarleyComposer(fst.get())); +  } + +  bool Translate(const string& input, +                 const vector<double>& weights, +                 Hypergraph* forest) { +    bool composed = false; +    if (input.find("{\"rules\"") == 0) { +      istringstream is(input); +      Hypergraph src_cfg_hg; +      assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); +      if (add_pass_through_rules) { +        SparseVector<double> feats; +        feats.set_value(FD::Convert("PassThrough"), 1); +        for (int i = 0; i < src_cfg_hg.edges_.size(); ++i) { +          const vector<WordID>& f = src_cfg_hg.edges_[i].rule_->f_; +          for (int j = 0; j < f.size(); ++j) { +            if (f[j] > 0) { +              fst->AddPassThroughTranslation(f[j], feats); +            } +          } +        } +      } +      composed = ec->Compose(src_cfg_hg, forest); +    } else { +      const string dummy_grammar("[" + goal_sym + "] ||| " + input + " ||| TOP=1"); +      cerr << "  Dummy grammar: " << dummy_grammar << endl; +      istringstream is(dummy_grammar); +      if (add_pass_through_rules) { +        vector<WordID> words; +        TD::ConvertSentence(input, &words); +        SparseVector<double> feats; +        feats.set_value(FD::Convert("PassThrough"), 1); +        for (int i = 0; i < words.size(); ++i) +          fst->AddPassThroughTranslation(words[i], feats); +      } +      composed = ec->Compose(&is, forest); +    } +    if (composed) { +      Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); +      Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); +      Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, 
tail); +      forest->ConnectEdgeToHeadNode(hg_edge, goal); +      forest->Reweight(weights); +    } +    if (add_pass_through_rules) +      fst->ClearPassThroughTranslations(); +    return composed; +  } + +  const string goal_sym; +  const TRulePtr kGOAL_RULE; +  const WordID kGOAL; +  const bool add_pass_through_rules; +  boost::shared_ptr<EarleyComposer> ec; +  boost::shared_ptr<FSTNode> fst; +}; + +FSTTranslator::FSTTranslator(const boost::program_options::variables_map& conf) : +  pimpl_(new FSTTranslatorImpl(conf)) {} + +bool FSTTranslator::TranslateImpl(const string& input, +                              SentenceMetadata* smeta, +                              const vector<double>& weights, +                              Hypergraph* minus_lm_forest) { +  smeta->SetSourceLength(0);  // don't know how to compute this +  return pimpl_->Translate(input, weights, minus_lm_forest); +} + diff --git a/decoder/grammar.cc b/decoder/grammar.cc new file mode 100644 index 00000000..5eb7887d --- /dev/null +++ b/decoder/grammar.cc @@ -0,0 +1,148 @@ +#include "grammar.h" + +#include <algorithm> +#include <utility> +#include <map> + +#include "rule_lexer.h" +#include "filelib.h" +#include "tdict.h" + +using namespace std; + +const vector<TRulePtr> Grammar::NO_RULES; + +RuleBin::~RuleBin() {} +GrammarIter::~GrammarIter() {} +Grammar::~Grammar() {} + +bool Grammar::HasRuleForSpan(int i, int j, int distance) const { +  (void) i; +  (void) j; +  (void) distance; +  return true;  // always true by default +} + +struct TextRuleBin : public RuleBin { +  int GetNumRules() const { +    return rules_.size(); +  } +  TRulePtr GetIthRule(int i) const { +    return rules_[i]; +  } +  void AddRule(TRulePtr t) { +    rules_.push_back(t); +  } +  int Arity() const { +    return rules_.front()->Arity(); +  } +  void Dump() const { +    for (int i = 0; i < rules_.size(); ++i) +      cerr << rules_[i]->AsString() << endl; +  } + private: +  vector<TRulePtr> rules_; +}; + +struct TextGrammarNode : public GrammarIter { +  TextGrammarNode() : rb_(NULL) {} +  ~TextGrammarNode() { +    delete rb_; +  } +  const GrammarIter* Extend(int symbol) const { +    map<WordID, TextGrammarNode>::const_iterator i = tree_.find(symbol); +    if (i == tree_.end()) return NULL; +    return &i->second; +  } + +  const RuleBin* GetRules() const { +    if (rb_) { +      //rb_->Dump(); +    } +    return rb_; +  } + +  map<WordID, TextGrammarNode> tree_; +  TextRuleBin* rb_; +}; + +struct TGImpl { +  TextGrammarNode root_; +}; + +TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {} +TextGrammar::TextGrammar(const string& file) :  +    max_span_(10), +    pimpl_(new TGImpl) { +  ReadFromFile(file); +} + +const GrammarIter* TextGrammar::GetRoot() const { +  return &pimpl_->root_; +} + +void TextGrammar::AddRule(const TRulePtr& rule) { +  if (rule->IsUnary()) { +    rhs2unaries_[rule->f().front()].push_back(rule); +    unaries_.push_back(rule); +  } else { +    TextGrammarNode* cur = &pimpl_->root_; +    for (int i = 0; i < rule->f_.size(); ++i) +      cur = &cur->tree_[rule->f_[i]]; +    if (cur->rb_ == NULL) +      cur->rb_ = new TextRuleBin; +    cur->rb_->AddRule(rule); +  } +} + +static void AddRuleHelper(const TRulePtr& new_rule, void* extra) { +  static_cast<TextGrammar*>(extra)->AddRule(new_rule); +} + +void TextGrammar::ReadFromFile(const string& filename) { +  ReadFile in(filename); +  RuleLexer::ReadRules(in.stream(), &AddRuleHelper, this); +} + +bool TextGrammar::HasRuleForSpan(int i, int j, int distance) const { +  
return (max_span_ >= distance); +} + +GlueGrammar::GlueGrammar(const string& file) : TextGrammar(file) {} + +GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) { +  TRulePtr stop_glue(new TRule("[" + goal_nt + "] ||| [" + default_nt + ",1] ||| [" + default_nt + ",1]")); +  TRulePtr glue(new TRule("[" + goal_nt + "] ||| [" + goal_nt + ",1] [" +    + default_nt + ",2] ||| [" + goal_nt + ",1] [" + default_nt + ",2] ||| Glue=1")); + +  AddRule(stop_glue); +  AddRule(glue); +  //cerr << "GLUE: " << stop_glue->AsString() << endl; +  //cerr << "GLUE: " << glue->AsString() << endl; +} + +bool GlueGrammar::HasRuleForSpan(int i, int j, int distance) const { +  (void) j; +  return (i == 0); +} + +PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat) : +    has_rule_(input.size() + 1) { +  for (int i = 0; i < input.size(); ++i) { +    const vector<LatticeArc>& alts = input[i]; +    for (int k = 0; k < alts.size(); ++k) { +      const int j = alts[k].dist2next + i; +      has_rule_[i].insert(j); +      const string& src = TD::Convert(alts[k].label); +      TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1")); +      AddRule(pt); +//      cerr << "PT: " << pt->AsString() << endl; +    } +  } +} + +bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const { +  const set<int>& hr = has_rule_[i]; +  if (i == j) { return !hr.empty(); } +  return (hr.find(j) != hr.end()); +} diff --git a/decoder/grammar.h b/decoder/grammar.h new file mode 100644 index 00000000..46886d3a --- /dev/null +++ b/decoder/grammar.h @@ -0,0 +1,89 @@ +#ifndef GRAMMAR_H_ +#define GRAMMAR_H_ + +#include <vector> +#include <map> +#include <set> +#include <boost/shared_ptr.hpp> +#include <string> + +#include "lattice.h" +#include "trule.h" + +struct RuleBin { +  virtual ~RuleBin(); +  virtual int GetNumRules() const = 0; +  virtual TRulePtr GetIthRule(int i) const = 0; +  virtual int Arity() const = 0; +}; + +struct GrammarIter { +  virtual ~GrammarIter(); +  virtual const RuleBin* GetRules() const = 0; +  virtual const GrammarIter* Extend(int symbol) const = 0; +}; + +struct Grammar { +  typedef std::map<WordID, std::vector<TRulePtr> > Cat2Rules; +  static const std::vector<TRulePtr> NO_RULES; + +  virtual ~Grammar(); +  virtual const GrammarIter* GetRoot() const = 0; +  virtual bool HasRuleForSpan(int i, int j, int distance) const; +  const std::string GetGrammarName(){return grammar_name_;} +  void SetGrammarName(std::string n) {grammar_name_ = n; } +  // cat is the category to be rewritten +  inline const std::vector<TRulePtr>& GetAllUnaryRules() const { +    return unaries_; +  } + +  // get all the unary rules that rewrite category cat +  inline const std::vector<TRulePtr>& GetUnaryRulesForRHS(const WordID& cat) const { +    Cat2Rules::const_iterator found = rhs2unaries_.find(cat); +    if (found == rhs2unaries_.end()) +      return NO_RULES; +    else +      return found->second; +  } + + protected: +  Cat2Rules rhs2unaries_;     // these must be filled in by subclasses! 
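+  // e.g., TextGrammar::AddRule (grammar.cc) fills these for a unary rule r:
+  //   rhs2unaries_[r->f().front()].push_back(r);
+  //   unaries_.push_back(r);
+  // making GetUnaryRulesForRHS(cat) a straight map lookup.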
+  std::vector<TRulePtr> unaries_; +  std::string grammar_name_;  +}; + +typedef boost::shared_ptr<Grammar> GrammarPtr; + +class TGImpl; +struct TextGrammar : public Grammar { +  TextGrammar(); +  TextGrammar(const std::string& file); +  void SetMaxSpan(int m) { max_span_ = m; } +   +  virtual const GrammarIter* GetRoot() const; +  void AddRule(const TRulePtr& rule); +  void ReadFromFile(const std::string& filename); +  virtual bool HasRuleForSpan(int i, int j, int distance) const; +  const std::vector<TRulePtr>& GetUnaryRules(const WordID& cat) const; +   + private: +  int max_span_; +  boost::shared_ptr<TGImpl> pimpl_; +   +}; + +struct GlueGrammar : public TextGrammar { +  // read glue grammar from file +  explicit GlueGrammar(const std::string& file); +  GlueGrammar(const std::string& goal_nt, const std::string& default_nt);  // "S", "X" +  virtual bool HasRuleForSpan(int i, int j, int distance) const; +}; + +struct PassThroughGrammar : public TextGrammar { +  PassThroughGrammar(const Lattice& input, const std::string& cat); +  virtual bool HasRuleForSpan(int i, int j, int distance) const; + private: +  std::vector<std::set<int> > has_rule_;  // index by [i][j] +}; + +#endif diff --git a/decoder/grammar_test.cc b/decoder/grammar_test.cc new file mode 100644 index 00000000..62b8f958 --- /dev/null +++ b/decoder/grammar_test.cc @@ -0,0 +1,59 @@ +#include <cassert> +#include <iostream> +#include <fstream> +#include <vector> +#include <gtest/gtest.h> +#include "trule.h" +#include "tdict.h" +#include "grammar.h" +#include "bottom_up_parser.h" +#include "ff.h" +#include "weights.h" + +using namespace std; + +class GrammarTest : public testing::Test { + public: +  GrammarTest() { +    wts.InitFromFile("test_data/weights.gt"); +  } + protected: +  virtual void SetUp() { } +  virtual void TearDown() { } +  Weights wts; +}; +        +TEST_F(GrammarTest,TestTextGrammar) { +  vector<double> w; +  vector<const FeatureFunction*> ms; +  ModelSet models(w, ms); + +  TextGrammar g; +  TRulePtr r1(new TRule("[X] ||| a b c ||| A B C ||| 0.1 0.2 0.3", true)); +  TRulePtr r2(new TRule("[X] ||| a b c ||| 1 2 3 ||| 0.2 0.3 0.4", true)); +  TRulePtr r3(new TRule("[X] ||| a b c d ||| A B C D ||| 0.1 0.2 0.3", true)); +  cerr << r1->AsString() << endl; +  g.AddRule(r1); +  g.AddRule(r2); +  g.AddRule(r3); +} + +TEST_F(GrammarTest,TestTextGrammarFile) { +  GrammarPtr g(new TextGrammar("./test_data/grammar.prune")); +  vector<GrammarPtr> grammars(1, g); + +  LatticeArc a(TD::Convert("ein"), 0.0, 1); +  LatticeArc b(TD::Convert("haus"), 0.0, 1); +  Lattice lattice(2); +  lattice[0].push_back(a); +  lattice[1].push_back(b); +  Hypergraph forest; +  ExhaustiveBottomUpParser parser("PHRASE", grammars); +  parser.Parse(lattice, &forest); +  forest.PrintGraphviz(); +} + +int main(int argc, char **argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} diff --git a/decoder/gzstream.cc b/decoder/gzstream.cc new file mode 100644 index 00000000..9703e6ad --- /dev/null +++ b/decoder/gzstream.cc @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +// ============================================================================ +// +// File          : gzstream.C +// Revision      : $Revision: 1.1 $ +// Revision_date : $Date: 2006/03/30 04:05:52 $ +// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner +//  +// Standard streambuf implementation following Nicolai Josuttis, "The  +// Standard C++ Library". +// ============================================================================ + +#include "gzstream.h" +#include <iostream> +#include <cstring> + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { +    if ( is_open()) +        return (gzstreambuf*)0; +    mode = open_mode; +    // no append nor read/write mode +    if ((mode & std::ios::ate) || (mode & std::ios::app) +        || ((mode & std::ios::in) && (mode & std::ios::out))) +        return (gzstreambuf*)0; +    char  fmode[10]; +    char* fmodeptr = fmode; +    if ( mode & std::ios::in) +        *fmodeptr++ = 'r'; +    else if ( mode & std::ios::out) +        *fmodeptr++ = 'w'; +    *fmodeptr++ = 'b'; +    *fmodeptr = '\0'; +    file = gzopen( name, fmode); +    if (file == 0) +        return (gzstreambuf*)0; +    opened = 1; +    return this; +} + +gzstreambuf * gzstreambuf::close() { +    if ( is_open()) { +        sync(); +        opened = 0; +        if ( gzclose( file) == Z_OK) +            return this; +    } +    return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only +    if ( gptr() && ( gptr() < egptr())) +        return * reinterpret_cast<unsigned char *>( gptr()); + +    if ( ! (mode & std::ios::in) || ! opened) +        return EOF; +    // Josuttis' implementation of inbuf +    int n_putback = gptr() - eback(); +    if ( n_putback > 4) +        n_putback = 4; +    memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + +    int num = gzread( file, buffer+4, bufferSize-4); +    if (num <= 0) // ERROR or EOF +        return EOF; + +    // reset buffer pointers +    setg( buffer + (4 - n_putback),   // beginning of putback area +          buffer + 4,                 // read position +          buffer + 4 + num);          // end of buffer + +    // return next character +    return * reinterpret_cast<unsigned char *>( gptr());     +} + +int gzstreambuf::flush_buffer() { +    // Separate the writing of the buffer from overflow() and +    // sync() operation. +    int w = pptr() - pbase(); +    if ( gzwrite( file, pbase(), w) != w) +        return EOF; +    pbump( -w); +    return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only +    if ( ! ( mode & std::ios::out) || ! 
opened) +        return EOF; +    if (c != EOF) { +        *pptr() = c; +        pbump(1); +    } +    if ( flush_buffer() == EOF) +        return EOF; +    return c; +} + +int gzstreambuf::sync() { +    // Changed to use flush_buffer() instead of overflow( EOF) +    // which caused improper behavior with std::endl and flush(), +    // bug reported by Vincent Ricard. +    if ( pptr() && pptr() > pbase()) { +        if ( flush_buffer() == EOF) +            return -1; +    } +    return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { +    init( &buf); +    open( name, mode); +} + +gzstreambase::~gzstreambase() { +    buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { +    if ( ! buf.open( name, open_mode)) +        clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { +    if ( buf.is_open()) +        if ( ! buf.close()) +            clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/decoder/gzstream.h b/decoder/gzstream.h new file mode 100644 index 00000000..ad9785fd --- /dev/null +++ b/decoder/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +// ============================================================================ +// +// File          : gzstream.h +// Revision      : $Revision: 1.1 $ +// Revision_date : $Date: 2006/03/30 04:05:52 $ +// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner +//  +// Standard streambuf implementation following Nicolai Josuttis, "The  +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include <iostream> +#include <fstream> +#include <zlib.h> + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: +    static const int bufferSize = 47+256;    // size of data buff +    // totals 512 bytes under g++ for igzstream at the end. 
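+    // the first 4 bytes of buffer are reserved as a putback area: the
+    // constructor initializes gptr()/eback() at buffer+4, and underflow()
+    // (gzstream.cc) copies up to 4 consumed characters back into that gap
+    // before each gzread.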
+ +    gzFile           file;               // file handle for compressed file +    char             buffer[bufferSize]; // data buffer +    char             opened;             // open/close state of stream +    int              mode;               // I/O mode + +    int flush_buffer(); +public: +    gzstreambuf() : opened(0) { +        setp( buffer, buffer + (bufferSize-1)); +        setg( buffer + 4,     // beginning of putback area +              buffer + 4,     // read position +              buffer + 4);    // end position       +        // ASSERT: both input & output capabilities will not be used together +    } +    int is_open() { return opened; } +    gzstreambuf* open( const char* name, int open_mode); +    gzstreambuf* close(); +    ~gzstreambuf() { close(); } +     +    virtual int     overflow( int c = EOF); +    virtual int     underflow(); +    virtual int     sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: +    gzstreambuf buf; +public: +    gzstreambase() { init(&buf); } +    gzstreambase( const char* name, int open_mode); +    ~gzstreambase(); +    void open( const char* name, int open_mode); +    void close(); +    gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz*  +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: +    igzstream() : std::istream( &buf) {}  +    igzstream( const char* name, int open_mode = std::ios::in) +        : gzstreambase( name, open_mode), std::istream( &buf) {}   +    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } +    void open( const char* name, int open_mode = std::ios::in) { +        gzstreambase::open( name, open_mode); +    } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: +    ogzstream() : std::ostream( &buf) {} +    ogzstream( const char* name, int mode = std::ios::out) +        : gzstreambase( name, mode), std::ostream( &buf) {}   +    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } +    void open( const char* name, int open_mode = std::ios::out) { +        gzstreambase::open( name, open_mode); +    } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/decoder/hg.cc b/decoder/hg.cc new file mode 100644 index 00000000..b56f1246 --- /dev/null +++ b/decoder/hg.cc @@ -0,0 +1,588 @@ +#include "hg.h" + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <set> +#include <map> +#include <iostream> + +#include "viterbi.h" +#include "inside_outside.h" +#include "tdict.h" + +using namespace std; + +double Hypergraph::NumberOfPaths() const { +  return Inside<double, TransitionCountWeightFunction>(*this); +} + +struct ScaledTransitionEventWeightFunction { +  ScaledTransitionEventWeightFunction(double alpha) : scale_(alpha) {} +  inline SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const { +    SparseVector<prob_t> result; +    result.set_value(e.id_, e.edge_prob_.pow(scale_)); +    return result; +  } +  const double scale_; +}; + +struct TropicalValue { +  TropicalValue() : v_() {} +  explicit 
TropicalValue(int v) { +    if (v == 0) v_ = prob_t::Zero(); +    else if (v == 1) v_ = prob_t::One(); +    else { cerr << "Bad value in TropicalValue(int).\n"; abort(); } +  } +  explicit TropicalValue(const prob_t& v) : v_(v) {} +  inline TropicalValue& operator+=(const TropicalValue& o) { +    if (v_ < o.v_) v_ = o.v_; +    return *this; +  } +  inline TropicalValue& operator*=(const TropicalValue& o) { +    v_ *= o.v_; +    return *this; +  } +  inline bool operator==(const TropicalValue& o) const { return v_ == o.v_; } +  prob_t v_; +}; + +struct ViterbiWeightFunction { +  inline TropicalValue operator()(const Hypergraph::Edge& e) const { +    return TropicalValue(e.edge_prob_); +  } +}; + +struct ViterbiTransitionEventWeightFunction { +  inline SparseVector<TropicalValue> operator()(const Hypergraph::Edge& e) const { +    SparseVector<TropicalValue> result; +    result.set_value(e.id_, TropicalValue(e.edge_prob_)); +    return result; +  } +}; + + +prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector<prob_t>* posts) const { +  const ScaledEdgeProb weight(scale); +  const ScaledTransitionEventWeightFunction w2(scale); +  SparseVector<prob_t> pv; +  const double inside = InsideOutside<prob_t, +                  ScaledEdgeProb, +                  SparseVector<prob_t>, +                  ScaledTransitionEventWeightFunction>(*this, &pv, weight, w2); +  posts->resize(edges_.size()); +  for (int i = 0; i < edges_.size(); ++i) +    (*posts)[i] = prob_t(pv.value(i)); +  return prob_t(inside); +} + +prob_t Hypergraph::ComputeBestPathThroughEdges(vector<prob_t>* post) const { +  SparseVector<TropicalValue> pv; +  const TropicalValue viterbi_weight = InsideOutside<TropicalValue, +                  ViterbiWeightFunction, +                  SparseVector<TropicalValue>, +                  ViterbiTransitionEventWeightFunction>(*this, &pv); +  post->resize(edges_.size()); +  for (int i = 0; i < edges_.size(); ++i) +    (*post)[i] = pv.value(i).v_; +  return viterbi_weight.v_; +} + +void Hypergraph::PushWeightsToSource(double scale) { +  vector<prob_t> posts; +  ComputeEdgePosteriors(scale, &posts); +  for (int i = 0; i < nodes_.size(); ++i) { +    const Hypergraph::Node& node = nodes_[i]; +    prob_t z = prob_t::Zero(); +    for (int j = 0; j < node.out_edges_.size(); ++j) +      z += posts[node.out_edges_[j]]; +    for (int j = 0; j < node.out_edges_.size(); ++j) { +      edges_[node.out_edges_[j]].edge_prob_ = posts[node.out_edges_[j]] / z; +    } +  } +} + +void Hypergraph::PushWeightsToGoal(double scale) { +  vector<prob_t> posts; +  ComputeEdgePosteriors(scale, &posts); +  for (int i = 0; i < nodes_.size(); ++i) { +    const Hypergraph::Node& node = nodes_[i]; +    prob_t z = prob_t::Zero(); +    for (int j = 0; j < node.in_edges_.size(); ++j) +      z += posts[node.in_edges_[j]]; +    for (int j = 0; j < node.in_edges_.size(); ++j) { +      edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z; +    } +  } +} + +struct EdgeExistsWeightFunction { +  EdgeExistsWeightFunction(const std::vector<bool>& prunes) : prunes_(prunes) {} +  double operator()(const Hypergraph::Edge& edge) const { +    return (prunes_[edge.id_] ? 
0.0 : 1.0);
+  }
+ private:
+  const vector<bool>& prunes_;
+};
+
+void Hypergraph::PruneEdges(const std::vector<bool>& prune_edge, bool run_inside_algorithm) {
+  assert(prune_edge.size() == edges_.size());
+  vector<bool> filtered = prune_edge;
+
+  if (run_inside_algorithm) {
+    const EdgeExistsWeightFunction wf(prune_edge);
+    // use double, not bool since vector<bool> causes problems with the Inside algorithm.
+    // I don't know a good c++ way to resolve this short of template specialization which
+    // I dislike.  If you know of a better way that doesn't involve specialization,
+    // fix this!
+    vector<double> reachable;
+    bool goal_derivable = (0 < Inside<double, EdgeExistsWeightFunction>(*this, &reachable, wf));
+    if (!goal_derivable) {
+      edges_.clear();
+      nodes_.clear();
+      nodes_.push_back(Node());
+      return;
+    }
+
+    assert(reachable.size() == nodes_.size());
+    for (int i = 0; i < edges_.size(); ++i) {
+      bool prune = prune_edge[i];
+      if (!prune) {
+        const Edge& edge = edges_[i];
+        for (int j = 0; j < edge.tail_nodes_.size(); ++j) {
+          if (!reachable[edge.tail_nodes_[j]]) {
+            prune = true;
+            break;
+          }
+        }
+      }
+      filtered[i] = prune;
+    }
+  }
+
+  TopologicallySortNodesAndEdges(nodes_.size() - 1, &filtered);
+}
+
+void Hypergraph::DensityPruneInsideOutside(const double scale,
+                                           const bool use_sum_prod_semiring,
+                                           const double density,
+                                           const vector<bool>* preserve_mask) {
+  assert(density >= 1.0);
+  const int plen = ViterbiPathLength(*this);
+  vector<WordID> bp;
+  int rnum = min(static_cast<int>(edges_.size()), static_cast<int>(density * static_cast<double>(plen)));
+  if (rnum == edges_.size()) {
+    cerr << "No pruning required: density already sufficient\n";
+    return;
+  }
+  vector<prob_t> io(edges_.size());
+  if (use_sum_prod_semiring)
+    ComputeEdgePosteriors(scale, &io);
+  else
+    ComputeBestPathThroughEdges(&io);
+  assert(edges_.size() == io.size());
+  vector<prob_t> sorted = io;
+  nth_element(sorted.begin(), sorted.begin() + rnum, sorted.end(), greater<prob_t>());
+  const double cutoff = sorted[rnum];
+  vector<bool> prune(edges_.size());
+  for (int i = 0; i < edges_.size(); ++i) {
+    prune[i] = (io[i] < cutoff);
+    if (preserve_mask && (*preserve_mask)[i]) prune[i] = false;
+  }
+  PruneEdges(prune);
+}
+
+void Hypergraph::BeamPruneInsideOutside(
+    const double scale,
+    const bool use_sum_prod_semiring,
+    const double alpha,
+    const vector<bool>* preserve_mask) {
+  assert(alpha > 0.0);
+  assert(scale > 0.0);
+  vector<prob_t> io(edges_.size());
+  if (use_sum_prod_semiring)
+    ComputeEdgePosteriors(scale, &io);
+  else
+    ComputeBestPathThroughEdges(&io);
+  assert(edges_.size() == io.size());
+  prob_t best;  // initializes to zero
+  for (int i = 0; i < io.size(); ++i)
+    if (io[i] > best) best = io[i];
+  const prob_t aprob(exp(-alpha));
+  const prob_t cutoff = best * aprob;
+  // cerr << "aprob = " << aprob << "\t  CUTOFF=" << cutoff << endl;
+  vector<bool> prune(edges_.size());
+  //cerr << preserve_mask.size() << " " << edges_.size() << endl;
+  int pc = 0;
+  for (int i = 0; i < io.size(); ++i) {
+    const bool prune_edge = (io[i] < cutoff);
+    if (prune_edge) ++pc;
+    prune[i] = prune_edge;
+    if (preserve_mask && (*preserve_mask)[i]) prune[i] = false;
+  }
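+  // prune[i] now marks every unprotected edge scoring below best * exp(-alpha);
+  // e.g., BeamPruneInsideOutside(1.0, false, 5.0) keeps only edges whose best
+  // path is within a factor of e^5 of the Viterbi score.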
+  // cerr << "Beam pruning " << pc << "/" << io.size() << " edges\n"; +  PruneEdges(prune); +} + +void Hypergraph::PrintGraphviz() const { +  int ei = 0; +  cerr << "digraph G {\n  rankdir=LR;\n  nodesep=.05;\n"; +  for (vector<Edge>::const_iterator i = edges_.begin(); +       i != edges_.end(); ++i) { +    const Edge& edge=*i; +    ++ei; +    static const string none = "<null>"; +    string rule = (edge.rule_ ? edge.rule_->AsString(false) : none); + +    cerr << "   A_" << ei << " [label=\"" << rule << " p=" << edge.edge_prob_  +         << " F:" << edge.feature_values_ +         << "\" shape=\"rect\"];\n"; +    Hypergraph::TailNodeVector indorder(edge.tail_nodes_.size(), 0); +    int ntc = 0; +    for (int i = 0; i < edge.rule_->e_.size(); ++i) { +      if (edge.rule_->e_[i] <= 0) indorder[ntc++] = 1 + (-1 * edge.rule_->e_[i]); +    } +    for (int i = 0; i < edge.tail_nodes_.size(); ++i) { +      cerr << "     " << edge.tail_nodes_[i] << " -> A_" << ei; +      if (edge.tail_nodes_.size() > 1) { +        cerr << " [label=\"" << indorder[i] << "\"]"; +      } +      cerr << ";\n"; +    } +    cerr << "     A_" << ei << " -> " << edge.head_node_ << ";\n"; +  } +  for (vector<Node>::const_iterator ni = nodes_.begin(); +      ni != nodes_.end(); ++ni) { +    cerr << "  " << ni->id_ << "[label=\"" << (ni->cat_ < 0 ? TD::Convert(ni->cat_ * -1) : "") +    //cerr << "  " << ni->id_ << "[label=\"" << ni->cat_ +         << " n=" << ni->id_ +//         << ",x=" << &*ni +//         << ",in=" << ni->in_edges_.size()  +//         << ",out=" << ni->out_edges_.size() +         << "\"];\n"; +  } +  cerr << "}\n"; +} + +void Hypergraph::Union(const Hypergraph& other) { +  if (&other == this) return; +  if (nodes_.empty()) { nodes_ = other.nodes_; edges_ = other.edges_; return; } +  int noff = nodes_.size(); +  int eoff = edges_.size(); +  int ogoal = other.nodes_.size() - 1; +  int cgoal = noff - 1; +  // keep a single goal node, so add nodes.size - 1 +  nodes_.resize(nodes_.size() + ogoal); +  // add all edges +  edges_.resize(edges_.size() + other.edges_.size()); + +  for (int i = 0; i < ogoal; ++i) { +    const Node& on = other.nodes_[i]; +    Node& cn = nodes_[i + noff]; +    cn.id_ = i + noff; +    cn.in_edges_.resize(on.in_edges_.size()); +    for (int j = 0; j < on.in_edges_.size(); ++j) +      cn.in_edges_[j] = on.in_edges_[j] + eoff; + +    cn.out_edges_.resize(on.out_edges_.size()); +    for (int j = 0; j < on.out_edges_.size(); ++j) +      cn.out_edges_[j] = on.out_edges_[j] + eoff; +  } + +  for (int i = 0; i < other.edges_.size(); ++i) { +    const Edge& oe = other.edges_[i]; +    Edge& ce = edges_[i + eoff]; +    ce.id_ = i + eoff; +    ce.rule_ = oe.rule_; +    ce.feature_values_ = oe.feature_values_; +    if (oe.head_node_ == ogoal) { +      ce.head_node_ = cgoal; +      nodes_[cgoal].in_edges_.push_back(ce.id_); +    } else { +      ce.head_node_ = oe.head_node_ + noff; +    } +    ce.tail_nodes_.resize(oe.tail_nodes_.size()); +    for (int j = 0; j < oe.tail_nodes_.size(); ++j) +      ce.tail_nodes_[j] = oe.tail_nodes_[j] + noff; +  } + +  TopologicallySortNodesAndEdges(cgoal); +} + +void Hypergraph::PruneUnreachable(int goal_node_id) { +  TopologicallySortNodesAndEdges(goal_node_id, NULL); +} + +void Hypergraph::RemoveNoncoaccessibleStates(int goal_node_id) { +  if (goal_node_id < 0) goal_node_id += nodes_.size(); +  assert(goal_node_id >= 0); +  assert(goal_node_id < nodes_.size()); + +  // TODO finish implementation +  abort(); +} + +struct DFSContext { +  int node; +  int edge_iter; 
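+  // node: index of the node being expanded; edge_iter: position in that
+  // node's in_edges_; tail_iter: position in the current edge's tail --
+  // together one saved frame of the iterative DFS in
+  // TopologicallySortNodesAndEdges.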
+  int tail_iter; +  DFSContext(int n, int e, int t) : node(n), edge_iter(e), tail_iter(t) {} +}; + +enum ColorType { WHITE, GRAY, BLACK }; + +template <class T> +struct BadId { +  bool operator()(const T& obj) const { return obj.id_ == -1; } +}; + +template <class T> +struct IdCompare { +  bool operator()(const T& a, const T& b) { return a.id_ < b.id_; } +}; + +void Hypergraph::TopologicallySortNodesAndEdges(int goal_index, +                                                const vector<bool>* prune_edges) { +  // figure out which nodes are reachable from the goal +  vector<int> reloc_node(nodes_.size(), -1); +  vector<int> reloc_edge(edges_.size(), -1); +  vector<ColorType> color(nodes_.size(), WHITE); +  vector<DFSContext> stack; +  stack.reserve(nodes_.size()); +  stack.push_back(DFSContext(goal_index, 0, 0)); +  int node_count = 0; +  int edge_count = 0; +  while(!stack.empty()) { +    const DFSContext& p = stack.back(); +    int cur_ni = p.node; +    int edge_i = p.edge_iter; +    int tail_i = p.tail_iter; +    stack.pop_back(); +    const Node* cur_node = &nodes_[cur_ni]; +    int edge_end = cur_node->in_edges_.size(); +    while (edge_i != edge_end) { +      const Edge& cur_edge = edges_[cur_node->in_edges_[edge_i]]; +      const int tail_end = cur_edge.tail_nodes_.size(); +      if ((tail_end == tail_i) || (prune_edges && (*prune_edges)[cur_edge.id_])) { +        ++edge_i; +        tail_i = 0; +        continue; +      } +      const int tail_ni = cur_edge.tail_nodes_[tail_i]; +      const int tail_color = color[tail_ni]; +      if (tail_color == WHITE) { +        stack.push_back(DFSContext(cur_ni, edge_i, ++tail_i)); +        cur_ni = tail_ni; +        cur_node = &nodes_[cur_ni]; +        color[cur_ni] = GRAY; +        edge_i = 0; +        edge_end = cur_node->in_edges_.size(); +        tail_i = 0; +      } else if (tail_color == BLACK) { +        ++tail_i; +      } else if (tail_color == GRAY) { +        // this can happen if, e.g., it is possible to rederive +        // a single cell in the CKY chart via a cycle. 
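+        // e.g., a unary rule like [X] ||| [X,1] ||| [X,1] lets a node derive
+        // itself, so the DFS re-enters a node that is still GRAY (on the stack).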
+        cerr << "Detected forbidden cycle in HG:\n"; +        cerr << "  " << cur_edge.rule_->AsString() << endl; +        while(!stack.empty()) { +          const DFSContext& p = stack.back(); +          cerr << "  " << edges_[nodes_[p.node].in_edges_[p.edge_iter]].rule_->AsString() << endl; +          stack.pop_back(); +        } +        abort(); +      } +    } +    color[cur_ni] = BLACK; +    reloc_node[cur_ni] = node_count++; +    if (prune_edges) { +      for (int i = 0; i < edge_end; ++i) { +        int ei = cur_node->in_edges_[i]; +        if (!(*prune_edges)[ei]) +          reloc_edge[cur_node->in_edges_[i]] = edge_count++; +      } +    } else { +      for (int i = 0; i < edge_end; ++i) +        reloc_edge[cur_node->in_edges_[i]] = edge_count++; +    } +  } +#ifndef HG_EDGES_TOPO_SORTED +  int ec = 0; +  for (int i = 0; i < reloc_edge.size(); ++i) { +    int& cp = reloc_edge[i]; +    if (cp >= 0) { cp = ec++; } +  } +#endif + +#if 0 +  cerr << "TOPO:"; +  for (int i = 0; i < reloc_node.size(); ++i) +    cerr << " " << reloc_node[i]; +  cerr << endl; +  cerr << "EDGE:"; +  for (int i = 0; i < reloc_edge.size(); ++i) +    cerr << " " << reloc_edge[i]; +  cerr << endl; +#endif +  bool no_op = true; +  for (int i = 0; i < reloc_node.size() && no_op; ++i) +    if (reloc_node[i] != i) no_op = false; +  for (int i = 0; i < reloc_edge.size() && no_op; ++i) +    if (reloc_edge[i] != i) no_op = false; +  if (no_op) return; +  for (int i = 0; i < reloc_node.size(); ++i) { +    Node& node = nodes_[i]; +    node.id_ = reloc_node[i]; +    int c = 0; +    for (int j = 0; j < node.in_edges_.size(); ++j) { +      const int new_index = reloc_edge[node.in_edges_[j]]; +      if (new_index >= 0) +        node.in_edges_[c++] = new_index; +    } +    node.in_edges_.resize(c); +    c = 0; +    for (int j = 0; j < node.out_edges_.size(); ++j) { +      const int new_index = reloc_edge[node.out_edges_[j]]; +      if (new_index >= 0) +        node.out_edges_[c++] = new_index; +    } +    node.out_edges_.resize(c); +  } +  for (int i = 0; i < reloc_edge.size(); ++i) { +    Edge& edge = edges_[i]; +    edge.id_ = reloc_edge[i]; +    edge.head_node_ = reloc_node[edge.head_node_]; +    for (int j = 0; j < edge.tail_nodes_.size(); ++j) +      edge.tail_nodes_[j] = reloc_node[edge.tail_nodes_[j]]; +  } +  edges_.erase(remove_if(edges_.begin(), edges_.end(), BadId<Edge>()), edges_.end()); +  nodes_.erase(remove_if(nodes_.begin(), nodes_.end(), BadId<Node>()), nodes_.end()); +  sort(nodes_.begin(), nodes_.end(), IdCompare<Node>()); +#ifndef HG_EDGES_TOPO_SORTED +  sort(edges_.begin(), edges_.end(), IdCompare<Edge>()); +#endif +} + +TRulePtr Hypergraph::kEPSRule; +TRulePtr Hypergraph::kUnaryRule; + +void Hypergraph::EpsilonRemove(WordID eps) { +  if (!kEPSRule) { +    kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>")); +    kUnaryRule.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); +  } +  vector<bool> kill(edges_.size(), false); +  for (int i = 0; i < edges_.size(); ++i) { +    const Edge& edge = edges_[i]; +    if (edge.tail_nodes_.empty() && +        edge.rule_->f_.size() == 1 && +        edge.rule_->f_[0] == eps) { +      kill[i] = true; +      if (!edge.feature_values_.empty()) { +        Node& node = nodes_[edge.head_node_]; +        if (node.in_edges_.size() != 1) { +          cerr << "[WARNING] <eps> edge with features going into non-empty node - can't promote\n"; +          // this *probably* means that there are multiple derivations of the +          // same sequence via different paths through the input 
forest +          // this needs to be investigated and fixed +        } else { +          for (int j = 0; j < node.out_edges_.size(); ++j) +            edges_[node.out_edges_[j]].feature_values_ += edge.feature_values_; +          // cerr << "PROMOTED " << edge.feature_values_ << endl; +	} +      } +    } +  } +  bool created_eps = false; +  PruneEdges(kill); +  for (int i = 0; i < nodes_.size(); ++i) { +    const Node& node = nodes_[i]; +    if (node.in_edges_.empty()) { +      for (int j = 0; j < node.out_edges_.size(); ++j) { +        Edge& edge = edges_[node.out_edges_[j]]; +        if (edge.rule_->Arity() == 2) { +          assert(edge.rule_->f_.size() == 2); +          assert(edge.rule_->e_.size() == 2); +          edge.rule_ = kUnaryRule; +          int cur = node.id_; +          int t = -1; +          assert(edge.tail_nodes_.size() == 2); +          for (int i = 0; i < 2; ++i) if (edge.tail_nodes_[i] != cur) { t = edge.tail_nodes_[i]; } +          assert(t != -1); +          edge.tail_nodes_.resize(1); +          edge.tail_nodes_[0] = t; +        } else { +          edge.rule_ = kEPSRule; +          edge.rule_->f_[0] = eps; +          edge.rule_->e_[0] = eps; +          edge.tail_nodes_.clear(); +          created_eps = true; +        } +      } +    } +  } +  vector<bool> k2(edges_.size(), false); +  PruneEdges(k2); +  if (created_eps) EpsilonRemove(eps); +} + +struct EdgeWeightSorter { +  const Hypergraph& hg; +  EdgeWeightSorter(const Hypergraph& h) : hg(h) {} +  bool operator()(int a, int b) const { +    return hg.edges_[a].edge_prob_ > hg.edges_[b].edge_prob_; +  } +}; + +void Hypergraph::SortInEdgesByEdgeWeights() { +  for (int i = 0; i < nodes_.size(); ++i) { +    Node& node = nodes_[i]; +    sort(node.in_edges_.begin(), node.in_edges_.end(), EdgeWeightSorter(*this)); +  } +} + +Hypergraph* Hypergraph::CreateViterbiHypergraph(const vector<bool>* edges) const { +  vector<const Edge*> vit_edges; +  if (edges) { +    assert(edges->size() == edges_.size()); +    Viterbi<vector<const Edge*>, ViterbiPathTraversal, prob_t, EdgeSelectEdgeWeightFunction>(*this, &vit_edges, ViterbiPathTraversal(), EdgeSelectEdgeWeightFunction(*edges)); +  } else { +    Viterbi<vector<const Edge*>, ViterbiPathTraversal, prob_t, EdgeProb>(*this, &vit_edges); +  } +  map<int, int> old2new_node; +  int num_new_nodes = 0; +  for (int i = 0; i < vit_edges.size(); ++i) { +    const Edge& edge = *vit_edges[i]; +    for (int j = 0; j < edge.tail_nodes_.size(); ++j) +      assert(old2new_node.count(edge.tail_nodes_[j]) > 0); +    if (old2new_node.count(edge.head_node_) == 0) { +      old2new_node[edge.head_node_] = num_new_nodes; +      ++num_new_nodes; +    } +  } +  Hypergraph* out = new Hypergraph(num_new_nodes, vit_edges.size(), is_linear_chain_); +  for (map<int, int>::iterator it = old2new_node.begin(); +       it != old2new_node.end(); ++it) { +    const Node& old_node = nodes_[it->first]; +    Node& new_node = out->nodes_[it->second]; +    new_node.cat_ = old_node.cat_; +    new_node.id_ = it->second; +  } + +  for (int i = 0; i < vit_edges.size(); ++i) { +    const Edge& old_edge = *vit_edges[i]; +    Edge& new_edge = out->edges_[i]; +    new_edge = old_edge; +    new_edge.id_ = i; +    const int new_head_node = old2new_node[old_edge.head_node_]; +    new_edge.head_node_ = new_head_node; +    out->nodes_[new_head_node].in_edges_.push_back(i); +    for (int j = 0; j < old_edge.tail_nodes_.size(); ++j) { +      const int new_tail_node = old2new_node[old_edge.tail_nodes_[j]]; +      
new_edge.tail_nodes_[j] = new_tail_node; +      out->nodes_[new_tail_node].out_edges_.push_back(i); +    } +  } +  return out; +} + diff --git a/decoder/hg.h b/decoder/hg.h new file mode 100644 index 00000000..8d056358 --- /dev/null +++ b/decoder/hg.h @@ -0,0 +1,247 @@ +#ifndef _HG_H_ +#define _HG_H_ + +#include <string> +#include <vector> + +#include "small_vector.h" +#include "sparse_vector.h" +#include "wordid.h" +#include "trule.h" +#include "prob.h" + +// if you define this, edges_ will be sorted +// (normally, just nodes_ are), but this can be quite +// slow +#undef HG_EDGES_TOPO_SORTED + +// class representing an acyclic hypergraph +//  - edges have 1 head, 0..n tails +class Hypergraph { + public: +  Hypergraph() : is_linear_chain_(false) {} + +  // SmallVector is a fast, small vector<int> implementation for sizes <= 2 +  typedef SmallVector TailNodeVector; + +  // TODO get rid of cat_? +  struct Node { +    Node() : id_(), cat_() {} +    int id_; // equal to this object's position in the nodes_ vector +    WordID cat_;  // non-terminal category if <0, 0 if not set +    std::vector<int> in_edges_;   // contents refer to positions in edges_ +    std::vector<int> out_edges_;  // contents refer to positions in edges_ +  }; + +  // TODO get rid of edge_prob_? (can be computed on the fly as the dot +  // product of the weight vector and the feature values) +  struct Edge { +    Edge() : i_(-1), j_(-1), prev_i_(-1), prev_j_(-1) {} +    inline int Arity() const { return tail_nodes_.size(); } +    int head_node_;               // refers to a position in nodes_ +    TailNodeVector tail_nodes_;   // contents refer to positions in nodes_ +    TRulePtr rule_; +    SparseVector<double> feature_values_; +    prob_t edge_prob_;             // dot product of weights and feat_values +    int id_;   // equal to this object's position in the edges_ vector + +    // span info. typically, i_ and j_ refer to indices in the source sentence +    // if a synchronous parse has been executed i_ and j_ will refer to indices +    // in the target sentence / lattice and prev_i_ prev_j_ will refer to +    // positions in the source.  Note: it is up to the translator implementation +    // to properly set these values.  For some models (like the Forest-input +    // phrase based model) it may not be straightforward to do.  if these values +    // are not properly set, most things will work but alignment and any features +    // that depend on them will be broken. 
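+    // e.g., an edge covering source words 2..5 of a plain sentence gets
+    // i_ = 2, j_ = 5 (prev_i_/prev_j_ stay -1); after a synchronous parse,
+    // i_/j_ are target-side indices and prev_i_/prev_j_ hold the source span.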
+    short int i_;
+    short int j_;
+    short int prev_i_;
+    short int prev_j_;
+  };
+
+  void swap(Hypergraph& other) {
+    other.nodes_.swap(nodes_);
+    std::swap(is_linear_chain_, other.is_linear_chain_);
+    other.edges_.swap(edges_);
+  }
+
+  void ResizeNodes(int size) {
+    nodes_.resize(size);
+    for (int i = 0; i < size; ++i) nodes_[i].id_ = i;
+  }
+
+  // reserves space in the nodes vector to prevent memory locations
+  // from changing
+  void ReserveNodes(size_t n, size_t e = 0) {
+    nodes_.reserve(n);
+    if (e) edges_.reserve(e);
+  }
+
+  Edge* AddEdge(const TRulePtr& rule, const TailNodeVector& tail) {
+    edges_.push_back(Edge());
+    Edge* edge = &edges_.back();
+    edge->rule_ = rule;
+    edge->tail_nodes_ = tail;
+    edge->id_ = edges_.size() - 1;
+    for (int i = 0; i < edge->tail_nodes_.size(); ++i)
+      nodes_[edge->tail_nodes_[i]].out_edges_.push_back(edge->id_);
+    return edge;
+  }
+
+  Node* AddNode(const WordID& cat) {
+    nodes_.push_back(Node());
+    nodes_.back().cat_ = cat;
+    nodes_.back().id_ = nodes_.size() - 1;
+    return &nodes_.back();
+  }
+
+  void ConnectEdgeToHeadNode(const int edge_id, const int head_id) {
+    edges_[edge_id].head_node_ = head_id;
+    nodes_[head_id].in_edges_.push_back(edge_id);
+  }
+
+  // TODO remove this - use the version that takes indices
+  void ConnectEdgeToHeadNode(Edge* edge, Node* head) {
+    edge->head_node_ = head->id_;
+    head->in_edges_.push_back(edge->id_);
+  }
+
+  // merge the goal node from other with this goal node
+  void Union(const Hypergraph& other);
+
+  void PrintGraphviz() const;
+
+  // compute the total number of paths in the forest
+  double NumberOfPaths() const;
+
+  // BEWARE. this assumes that the source and target language
+  // strings are identical and that there are no loops.
+  // It assumes a bunch of other things about where the
+  // epsilons will be.  It tries to assert failure if you
+  // break these assumptions, but it may not.
+  // TODO - make this work
+  void EpsilonRemove(WordID eps);
+
+  // multiply the weights vector by each edge's feature vector
+  // (inner product) to set the edge probabilities
+  template <typename V>
+  void Reweight(const V& weights) {
+    for (int i = 0; i < edges_.size(); ++i) {
+      Edge& e = edges_[i];
+      e.edge_prob_.logeq(e.feature_values_.dot(weights));
+    }
+  }
+
+  // computes inside and outside scores for each
+  // edge in the hypergraph
+  // alpha->size = edges_.size = beta->size
+  // returns inside prob of goal node
+  prob_t ComputeEdgePosteriors(double scale,
+                               std::vector<prob_t>* posts) const;
+
+  // find the score of the very best path passing through each edge
+  prob_t ComputeBestPathThroughEdges(std::vector<prob_t>* posts) const;
+
+  // create a new hypergraph consisting only of the nodes / edges
+  // in the Viterbi derivation of this hypergraph
+  // if edges is set, use the EdgeSelectEdgeWeightFunction
+  Hypergraph* CreateViterbiHypergraph(const std::vector<bool>* edges = NULL) const;
+
+  // move weights as near to the source as possible, resulting in a
+  // stochastic automaton.  ONLY FUNCTIONAL FOR *LATTICES*.
+  // See M. Mohri and M. Riley. A Weight Pushing Algorithm for Large
+  //   Vocabulary Speech Recognition. 2001. 
+  // the log semiring (NOT tropical) is used
+  void PushWeightsToSource(double scale = 1.0);
+  // same, except weights are pushed to the goal, works for HGs,
+  // not just lattices
+  void PushWeightsToGoal(double scale = 1.0);
+
+  void SortInEdgesByEdgeWeights();
+
+  void PruneUnreachable(int goal_node_id); // DEPRECATED
+
+  void RemoveNoncoaccessibleStates(int goal_node_id = -1);
+
+  // remove edges from the hypergraph if prune_edge[edge_id] is true
+  // TODO need to investigate why this shouldn't be run for the forest trans
+  // case.  To investigate, change false to true and see where ftrans crashes
+  void PruneEdges(const std::vector<bool>& prune_edge, bool run_inside_algorithm = false);
+
+  // if you don't know, use_sum_prod_semiring should be false
+  void DensityPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double density,
+                                 const std::vector<bool>* preserve_mask = NULL);
+
+  // prunes any edge whose score on the best path taking that edge is more than alpha away
+  // from the score of the global best path (or the highest edge posterior)
+  void BeamPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double alpha,
+                              const std::vector<bool>* preserve_mask = NULL);
+
+  void clear() {
+    nodes_.clear();
+    edges_.clear();
+  }
+
+  inline size_t NumberOfEdges() const { return edges_.size(); }
+  inline size_t NumberOfNodes() const { return nodes_.size(); }
+  inline bool empty() const { return nodes_.empty(); }
+
+  // linear chains can be represented in a number of ways in a hypergraph,
+  // we define them to consist only of lexical translations and monotonic rules
+  inline bool IsLinearChain() const { return is_linear_chain_; }
+  bool is_linear_chain_;
+
+  // nodes_ is sorted in topological order
+  std::vector<Node> nodes_;
+  // edges_ is not guaranteed to be in any particular order
+  std::vector<Edge> edges_;
+
+  // reorder nodes_ so they are in topological order
+  // source nodes at 0 sink nodes at size-1
+  void TopologicallySortNodesAndEdges(int goal_idx,
+                                      const std::vector<bool>* prune_edges = NULL);
+ private:
+  Hypergraph(int num_nodes, int num_edges, bool is_lc) : is_linear_chain_(is_lc), nodes_(num_nodes), edges_(num_edges) {}
+
+  static TRulePtr kEPSRule;
+  static TRulePtr kUnaryRule;
+};
+
+// common WeightFunctions, map an edge -> WeightType
+// for generic Viterbi/Inside algorithms
+struct EdgeProb {
+  inline const prob_t& operator()(const Hypergraph::Edge& e) const { return e.edge_prob_; }
+};
+
+struct EdgeSelectEdgeWeightFunction {
+  EdgeSelectEdgeWeightFunction(const std::vector<bool>& v) : v_(v) {}
+  inline prob_t operator()(const Hypergraph::Edge& e) const {
+    if (v_[e.id_]) return prob_t::One();
+    else return prob_t::Zero();
+  }
+ private:
+  const std::vector<bool>& v_;
+};
+
+struct ScaledEdgeProb {
+  ScaledEdgeProb(const double& alpha) : alpha_(alpha) {}
+  inline prob_t operator()(const Hypergraph::Edge& e) const { return e.edge_prob_.pow(alpha_); }
+  const double alpha_;
+};
+
+// see Li (2010), Section 3.2.2-- this is 'x_e = p_e*r_e'
+struct EdgeFeaturesAndProbWeightFunction {
+  inline const SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const {
+    SparseVector<prob_t> res;
+    for (SparseVector<double>::const_iterator it = e.feature_values_.begin();
+         it != e.feature_values_.end(); ++it)
+      res.set_value(it->first, 
prob_t(it->second) * e.edge_prob_); +    return res; +  } +}; + +struct TransitionCountWeightFunction { +  inline double operator()(const Hypergraph::Edge& e) const { (void)e; return 1.0; } +}; + +#endif diff --git a/decoder/hg_intersect.cc b/decoder/hg_intersect.cc new file mode 100644 index 00000000..02ff752e --- /dev/null +++ b/decoder/hg_intersect.cc @@ -0,0 +1,160 @@ +#include "hg_intersect.h" + +#include <vector> +#include <tr1/unordered_map> +#include <boost/lexical_cast.hpp> +#include <boost/functional/hash.hpp> + +#include "tdict.h" +#include "hg.h" +#include "trule.h" +#include "wordid.h" +#include "bottom_up_parser.h" + +using boost::lexical_cast; +using namespace std::tr1; +using namespace std; + +struct RuleFilter { +  unordered_map<vector<WordID>, bool, boost::hash<vector<WordID> > > exists_; +  bool true_lattice; +  RuleFilter(const Lattice& target, int max_phrase_size) { +    true_lattice = false; +    for (int i = 0; i < target.size(); ++i) { +      vector<WordID> phrase; +      int lim = min(static_cast<int>(target.size()), i + max_phrase_size); +      for (int j = i; j < lim; ++j) { +        if (target[j].size() > 1) { true_lattice = true; break; } +        phrase.push_back(target[j][0].label); +        exists_[phrase] = true; +      } +    } +    vector<WordID> sos(1, TD::Convert("<s>")); +    exists_[sos] = true; +  } +  bool operator()(const TRule& r) const { +    // TODO do some smarter filtering for lattices +    if (true_lattice) return false;  // don't filter "true lattice" input +    const vector<WordID>& e = r.e(); +    for (int i = 0; i < e.size(); ++i) { +      if (e[i] <= 0) continue; +      vector<WordID> phrase; +      for (int j = i; j < e.size(); ++j) { +        if (e[j] <= 0) break; +        phrase.push_back(e[j]); +        if (exists_.count(phrase) == 0) return true; +      } +    } +    return false; +  } +}; + +static bool FastLinearIntersect(const Lattice& target, Hypergraph* hg) { +  cerr << "  Fast linear-chain intersection...\n"; +  vector<bool> prune(hg->edges_.size(), false); +  set<int> cov; +  map<const TRule*, TRulePtr> inverted_rules; +  for (int i = 0; i < prune.size(); ++i) { +    Hypergraph::Edge& edge = hg->edges_[i]; +    if (edge.Arity() == 0) { +      const int trg_index = edge.prev_i_; +      const WordID trg = target[trg_index][0].label; +      assert(edge.rule_->EWords() == 1); +      TRulePtr& inv_rule = inverted_rules[edge.rule_.get()]; +      if (!inv_rule) { +        inv_rule.reset(new TRule(*edge.rule_)); +        inv_rule->e_.swap(inv_rule->f_); +      } +      prune[i] = (edge.rule_->e_[0] != trg); +      if (!prune[i]) { +        cov.insert(trg_index); +        swap(edge.prev_i_, edge.i_); +        swap(edge.prev_j_, edge.j_); +        edge.rule_.swap(inv_rule); +      } +    } +  } +  hg->PruneEdges(prune, true); +  return (cov.size() == target.size()); +} + +bool HG::Intersect(const Lattice& target, Hypergraph* hg) { +  // there are a number of faster algorithms available for restricted +  // classes of hypergraph and/or target. 
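+  // Illustrative use (names assumed), mirroring the lattice construction in
+  // grammar_test.cc -- restrict a forest to derivations yielding a reference:
+  //   Lattice ref(2);
+  //   ref[0].push_back(LatticeArc(TD::Convert("ein"), 0.0, 1));
+  //   ref[1].push_back(LatticeArc(TD::Convert("haus"), 0.0, 1));
+  //   if (HG::Intersect(ref, &forest)) { /* forest now yields only "ein haus" */ }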
+  if (hg->IsLinearChain() && target.IsSentence()) +    return FastLinearIntersect(target, hg); + +  vector<bool> rem(hg->edges_.size(), false); +  const RuleFilter filter(target, 15);   // TODO make configurable +  for (int i = 0; i < rem.size(); ++i) +    rem[i] = filter(*hg->edges_[i].rule_); +  hg->PruneEdges(rem, true); + +  const int nedges = hg->edges_.size(); +  const int nnodes = hg->nodes_.size(); + +  TextGrammar* g = new TextGrammar; +  GrammarPtr gp(g); +  vector<int> cats(nnodes); +  // each node in the translation forest becomes a "non-terminal" in the new +  // grammar, create the labels here +  const string kSEP = "_"; +  for (int i = 0; i < nnodes; ++i) { +    const char* pstr = "CAT"; +    if (hg->nodes_[i].cat_ < 0) +      pstr = TD::Convert(-hg->nodes_[i].cat_); +    cats[i] = TD::Convert(pstr + kSEP + lexical_cast<string>(i)) * -1; +  } + +  // construct the grammar +  for (int i = 0; i < nedges; ++i) { +    const Hypergraph::Edge& edge = hg->edges_[i]; +    const vector<WordID>& tgt = edge.rule_->e(); +    const vector<WordID>& src = edge.rule_->f(); +    TRulePtr rule(new TRule); +    rule->prev_i = edge.i_; +    rule->prev_j = edge.j_; +    rule->lhs_ = cats[edge.head_node_]; +    vector<WordID>& f = rule->f_; +    vector<WordID>& e = rule->e_; +    f.resize(tgt.size());   // swap source and target, since the parser +    e.resize(src.size());   // parses using the source side! +    Hypergraph::TailNodeVector tn(edge.tail_nodes_.size()); +    int ntc = 0; +    for (int j = 0; j < tgt.size(); ++j) { +      const WordID& cur = tgt[j]; +      if (cur > 0) { +        f[j] = cur; +      } else { +        tn[ntc++] = cur; +        f[j] = cats[edge.tail_nodes_[-cur]]; +      } +    } +    ntc = 0; +    for (int j = 0; j < src.size(); ++j) { +      const WordID& cur = src[j]; +      if (cur > 0) { +        e[j] = cur; +      } else { +        e[j] = tn[ntc++]; +      } +    } +    rule->scores_ = edge.feature_values_; +    rule->parent_rule_ = edge.rule_; +    rule->ComputeArity(); +    //cerr << "ADD: " << rule->AsString() << endl; +     +    g->AddRule(rule); +  } +  g->SetMaxSpan(target.size() + 1); +  const string& new_goal = TD::Convert(cats.back() * -1); +  vector<GrammarPtr> grammars(1, gp); +  Hypergraph tforest; +  ExhaustiveBottomUpParser parser(new_goal, grammars); +  if (!parser.Parse(target, &tforest)) +    return false; +  else +    hg->swap(tforest); +  return true; +} + diff --git a/decoder/hg_intersect.h b/decoder/hg_intersect.h new file mode 100644 index 00000000..826bdaae --- /dev/null +++ b/decoder/hg_intersect.h @@ -0,0 +1,13 @@ +#ifndef _HG_INTERSECT_H_ +#define _HG_INTERSECT_H_ + +#include <vector> + +#include "lattice.h" + +class Hypergraph; +struct HG { +  static bool Intersect(const Lattice& target, Hypergraph* hg); +}; + +#endif diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc new file mode 100644 index 00000000..5161931d --- /dev/null +++ b/decoder/hg_io.cc @@ -0,0 +1,673 @@ +#include "hg_io.h" + +#include <sstream> +#include <iostream> + +#include <boost/lexical_cast.hpp> + +#include "tdict.h" +#include "json_parse.h" +#include "hg.h" + +using namespace std; + +struct HGReader : public JSONParser { +  HGReader(Hypergraph* g) : rp("[X] ||| "), state(-1), hg(*g), nodes_needed(true), edges_needed(true) { nodes = 0; edges = 0; } + +  void CreateNode(const string& cat, const vector<int>& in_edges) { +    WordID c = TD::Convert("X") * -1; +    if (!cat.empty()) c = TD::Convert(cat) * -1; +    Hypergraph::Node* node = hg.AddNode(c); +    for (int i = 
0; i < in_edges.size(); ++i) {
+      if (in_edges[i] >= hg.edges_.size()) {
+        cerr << "JSONParser: in_edges[" << i << "]=" << in_edges[i]
+             << ", but hg only has " << hg.edges_.size() << " edges!\n";
+        abort();
+      }
+      hg.ConnectEdgeToHeadNode(&hg.edges_[in_edges[i]], node);
+    }
+  }
+  void CreateEdge(const TRulePtr& rule, SparseVector<double>* feats, const SmallVector& tail) {
+    Hypergraph::Edge* edge = hg.AddEdge(rule, tail);
+    feats->swap(edge->feature_values_);
+    edge->i_ = spans[0];
+    edge->j_ = spans[1];
+    edge->prev_i_ = spans[2];
+    edge->prev_j_ = spans[3];
+  }
+
+  bool HandleJSONEvent(int type, const JSON_value* value) {
+    switch(state) {
+    case -1:
+      assert(type == JSON_T_OBJECT_BEGIN);
+      state = 0;
+      break;
+    case 0:
+      if (type == JSON_T_OBJECT_END) {
+        //cerr << "HG created\n";  // TODO, signal some kind of callback
+      } else if (type == JSON_T_KEY) {
+        string val = value->vu.str.value;
+        if (val == "features") { assert(fdict.empty()); state = 1; }
+        else if (val == "is_sorted") { state = 3; }
+        else if (val == "rules") { assert(rules.empty()); state = 4; }
+        else if (val == "node") { state = 8; }
+        else if (val == "edges") { state = 13; }
+        else { cerr << "Unexpected key: " << val << endl; return false; }
+      }
+      break;
+
+    // features
+    case 1:
+      if(type == JSON_T_NULL) { state = 0; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 2;
+      break;
+    case 2:
+      if(type == JSON_T_ARRAY_END) { state = 0; break; }
+      assert(type == JSON_T_STRING);
+      fdict.push_back(FD::Convert(value->vu.str.value));
+      assert(fdict.back() > 0);
+      break;
+
+    // is_sorted
+    case 3:
+      assert(type == JSON_T_TRUE || type == JSON_T_FALSE);
+      is_sorted = (type == JSON_T_TRUE);
+      if (!is_sorted) { cerr << "[WARNING] is_sorted flag is ignored\n"; }
+      state = 0;
+      break;
+
+    // rules
+    case 4:
+      if(type == JSON_T_NULL) { state = 0; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 5;
+      break;
+    case 5:
+      if(type == JSON_T_ARRAY_END) { state = 0; break; }
+      assert(type == JSON_T_INTEGER);
+      state = 6;
+      rule_id = value->vu.integer_value;
+      break;
+    case 6:
+      assert(type == JSON_T_STRING);
+      rules[rule_id] = TRulePtr(new TRule(value->vu.str.value));
+      state = 5;
+      break;
+
+    // Nodes
+    case 8:
+      assert(type == JSON_T_OBJECT_BEGIN);
+      ++nodes;
+      in_edges.clear();
+      cat.clear();
+      state = 9; break;
+    case 9:
+      if (type == JSON_T_OBJECT_END) {
+        //cerr << "Creating NODE\n";
+        CreateNode(cat, in_edges);
+        state = 0; break;
+      }
+      assert(type == JSON_T_KEY);
+      cur_key = value->vu.str.value;
+      if (cur_key == "cat") { assert(cat.empty()); state = 10; break; }
+      if (cur_key == "in_edges") { assert(in_edges.empty()); state = 11; break; }
+      cerr << "Syntax error: unexpected key " << cur_key << " in node specification.\n";
+      return false;
+    case 10:
+      assert(type == JSON_T_STRING || type == JSON_T_NULL);
+      if (type == JSON_T_STRING) cat = value->vu.str.value;  // a null cat keeps the default
+      state = 9; break;
+    case 11:
+      if (type == JSON_T_NULL) { state = 9; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 12; break;
+    case 12:
+      if (type == JSON_T_ARRAY_END) { state = 9; break; }
+      assert(type == JSON_T_INTEGER);
JSON_T_INTEGER);
+      //cerr << "in_edges: " << value->vu.integer_value << endl;
+      in_edges.push_back(value->vu.integer_value);
+      break;
+
+    //   "edges": [ { "tail": null, "feats" : [0,1.63,1,-0.54], "rule": 12},
+    //         { "tail": null, "feats" : [0,0.87,1,0.02], "spans":[1,2,3,4], "rule": 17},
+    //         { "tail": [0], "feats" : [1,2.3,2,15.3,"ExtraFeature",1.2], "rule": 13}]
+    case 13:
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 14;
+      break;
+    case 14:
+      if (type == JSON_T_ARRAY_END) { state = 0; break; }
+      assert(type == JSON_T_OBJECT_BEGIN);
+      //cerr << "New edge\n";
+      ++edges;
+      cur_rule.reset(); feats.clear(); tail.clear();
+      spans[0] = spans[1] = spans[2] = spans[3] = -1;  // edges without a "spans" key must not reuse stale values
+      state = 15; break;
+    case 15:
+      if (type == JSON_T_OBJECT_END) {
+        CreateEdge(cur_rule, &feats, tail);
+        state = 14; break;
+      }
+      assert(type == JSON_T_KEY);
+      cur_key = value->vu.str.value;
+      //cerr << "edge key " << cur_key << endl;
+      if (cur_key == "rule") { assert(!cur_rule); state = 16; break; }
+      if (cur_key == "spans") { assert(!cur_rule); state = 22; break; }
+      if (cur_key == "feats") { assert(feats.empty()); state = 17; break; }
+      if (cur_key == "tail") { assert(tail.empty()); state = 20; break; }
+      cerr << "Unexpected key " << cur_key << " in edge specification\n";
+      return false;
+    case 16:    // edge.rule
+      if (type == JSON_T_INTEGER) {
+        int rule_id = value->vu.integer_value;
+        if (rules.find(rule_id) == rules.end()) {
+          // rules list must come before the edge definitions!
+          cerr << "Rule_id " << rule_id << " given but only loaded " << rules.size() << " rules\n";
+          return false;
+        }
+        cur_rule = rules[rule_id];
+      } else if (type == JSON_T_STRING) {
+        cur_rule.reset(new TRule(value->vu.str.value));
+      } else {
+        cerr << "Rule must be either a rule id or a rule string" << endl;
+        return false;
+      }
+      // cerr << "Edge: rule=" << cur_rule->AsString() << endl;
+      state = 15;
+      break;
+    case 17:      // edge.feats
+      if (type == JSON_T_NULL) { state = 15; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 18; break;
+    case 18:
+      if (type == JSON_T_ARRAY_END) { state = 15; break; }
+      if (type != JSON_T_INTEGER && type != JSON_T_STRING) {
+        cerr << "Unexpected feature id type\n"; return false;
+      }
+      if (type == JSON_T_INTEGER) {
+        fid = value->vu.integer_value;
+        assert(fid < fdict.size());
+        fid = fdict[fid];
+      } else if (type == JSON_T_STRING) {
+        fid = FD::Convert(value->vu.str.value);
+      } else { abort(); }
+      state = 19;
+      break;
+    case 19:
+      {
+        assert(type == JSON_T_INTEGER || type == JSON_T_FLOAT);
+        double val = (type == JSON_T_INTEGER ?
static_cast<double>(value->vu.integer_value) : +	                                       strtod(value->vu.str.value, NULL)); +        feats.set_value(fid, val); +        state = 18; +        break; +      } +    case 20:     // edge.tail +      if (type == JSON_T_NULL) { state = 15; break; } +      assert(type == JSON_T_ARRAY_BEGIN); +      state = 21; break; +    case 21: +      if (type == JSON_T_ARRAY_END) { state = 15; break; } +      assert(type == JSON_T_INTEGER); +      tail.push_back(value->vu.integer_value); +      break; +    case 22:     // edge.spans +      assert(type == JSON_T_ARRAY_BEGIN); +      state = 23; +      spans[0] = spans[1] = spans[2] = spans[3] = -1; +      spanc = 0; +      break; +    case 23: +      if (type == JSON_T_ARRAY_END) { state = 15; break; } +      assert(type == JSON_T_INTEGER); +      assert(spanc < 4); +      spans[spanc] = value->vu.integer_value; +      ++spanc; +    } +    return true; +  } +  string rp; +  string cat; +  SmallVector tail; +  vector<int> in_edges; +  TRulePtr cur_rule; +  map<int, TRulePtr> rules; +  vector<int> fdict; +  SparseVector<double> feats; +  int state; +  int fid; +  int nodes; +  int edges; +  int spans[4]; +  int spanc; +  string cur_key; +  Hypergraph& hg; +  int rule_id; +  bool nodes_needed; +  bool edges_needed; +  bool is_sorted; +}; + +bool HypergraphIO::ReadFromJSON(istream* in, Hypergraph* hg) { +  hg->clear(); +  HGReader reader(hg); +  return reader.Parse(in); +} + +static void WriteRule(const TRule& r, ostream* out) { +  if (!r.lhs_) { (*out) << "[X] ||| "; } +  JSONParser::WriteEscapedString(r.AsString(), out); +} + +bool HypergraphIO::WriteToJSON(const Hypergraph& hg, bool remove_rules, ostream* out) { +  map<const TRule*, int> rid; +  ostream& o = *out; +  rid[NULL] = 0; +  o << '{'; +  if (!remove_rules) { +    o << "\"rules\":["; +    for (int i = 0; i < hg.edges_.size(); ++i) { +      const TRule* r = hg.edges_[i].rule_.get(); +      int &id = rid[r]; +      if (!id) { +        id=rid.size() - 1; +        if (id > 1) o << ','; +        o << id << ','; +        WriteRule(*r, &o); +      }; +    } +    o << "],"; +  } +  const bool use_fdict = FD::NumFeats() < 1000; +  if (use_fdict) { +    o << "\"features\":["; +    for (int i = 1; i < FD::NumFeats(); ++i) { +      o << (i==1 ? "":",") << '"' << FD::Convert(i) << '"'; +    } +    o << "],"; +  } +  vector<int> edgemap(hg.edges_.size(), -1);  // edges may be in non-topo order +  int edge_count = 0; +  for (int i = 0; i < hg.nodes_.size(); ++i) { +    const Hypergraph::Node& node = hg.nodes_[i]; +    if (i > 0) { o << ","; } +    o << "\"edges\":["; +    for (int j = 0; j < node.in_edges_.size(); ++j) { +      const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +      edgemap[edge.id_] = edge_count; +      ++edge_count; +      o << (j == 0 ? "" : ",") << "{"; + +      o << "\"tail\":["; +      for (int k = 0; k < edge.tail_nodes_.size(); ++k) { +        o << (k > 0 ? 
"," : "") << edge.tail_nodes_[k]; +      } +      o << "],"; + +      o << "\"spans\":[" << edge.i_ << "," << edge.j_ << "," << edge.prev_i_ << "," << edge.prev_j_ << "],"; + +      o << "\"feats\":["; +      bool first = true; +      for (SparseVector<double>::const_iterator it = edge.feature_values_.begin(); it != edge.feature_values_.end(); ++it) { +        if (!it->second) continue;   // don't write features that have a zero value +        if (!it->first) continue;    // if the feature set was frozen this might happen +        if (!first) o << ','; +        if (use_fdict) +          o << (it->first - 1); +        else +          o << '"' << FD::Convert(it->first) << '"'; +        o << ',' << it->second; +        first = false; +      } +      o << "]"; +      if (!remove_rules) { o << ",\"rule\":" << rid[edge.rule_.get()]; } +      o << "}"; +    } +    o << "],"; + +    o << "\"node\":{\"in_edges\":["; +    for (int j = 0; j < node.in_edges_.size(); ++j) { +      int mapped_edge = edgemap[node.in_edges_[j]]; +      assert(mapped_edge >= 0); +      o << (j == 0 ? "" : ",") << mapped_edge; +    } +    o << "]"; +    if (node.cat_ < 0) { o << ",\"cat\":\"" << TD::Convert(node.cat_ * -1) << '"'; } +    o << "}"; +  } +  o << "}\n"; +  return true; +} + +bool needs_escape[128]; +void InitEscapes() { +  memset(needs_escape, false, 128); +  needs_escape[static_cast<size_t>('\'')] = true; +  needs_escape[static_cast<size_t>('\\')] = true; +} + +string HypergraphIO::Escape(const string& s) { +  size_t len = s.size(); +  for (int i = 0; i < s.size(); ++i) { +    unsigned char c = s[i]; +    if (c < 128 && needs_escape[c]) ++len; +  } +  if (len == s.size()) return s; +  string res(len, ' '); +  size_t o = 0; +  for (int i = 0; i < s.size(); ++i) { +    unsigned char c = s[i]; +    if (c < 128 && needs_escape[c]) +      res[o++] = '\\'; +    res[o++] = c; +  } +  assert(o == len); +  return res; +} + +string HypergraphIO::AsPLF(const Hypergraph& hg, bool include_global_parentheses) { +  static bool first = true; +  if (first) { InitEscapes(); first = false; } +  if (hg.nodes_.empty()) return "()"; +  ostringstream os; +  if (include_global_parentheses) os << '('; +  static const string EPS="*EPS*"; +  for (int i = 0; i < hg.nodes_.size()-1; ++i) { +    if (hg.nodes_[i].out_edges_.empty()) abort(); +    const bool last_node = (i == hg.nodes_.size() - 2); +    const int out_edges_size = hg.nodes_[i].out_edges_.size(); +    // compound splitter adds an extra goal transition which we suppress with +    // the following conditional +    if (!last_node || out_edges_size != 1 || +         hg.edges_[hg.nodes_[i].out_edges_[0]].rule_->EWords() == 1) { +      os << '('; +      for (int j = 0; j < out_edges_size; ++j) { +        const Hypergraph::Edge& e = hg.edges_[hg.nodes_[i].out_edges_[j]]; +        const string output = e.rule_->e_.size() ==2 ? 
Escape(TD::Convert(e.rule_->e_[1])) : EPS; +        double prob = log(e.edge_prob_); +        if (isinf(prob)) { prob = -9e20; } +        if (isnan(prob)) { prob = 0; } +        os << "('" << output << "'," << prob << "," << e.head_node_ - i << "),"; +      } +      os << "),"; +    } +  } +  if (include_global_parentheses) os << ')'; +  return os.str(); +} + +namespace PLF { + +const string chars = "'\\"; +const char& quote = chars[0]; +const char& slash = chars[1]; + +// safe get +inline char get(const std::string& in, int c) { +  if (c < 0 || c >= (int)in.size()) return 0; +  else return in[(size_t)c]; +} + +// consume whitespace +inline void eatws(const std::string& in, int& c) { +  while (get(in,c) == ' ') { c++; } +} + +// from 'foo' return foo +std::string getEscapedString(const std::string& in, int &c) +{ +  eatws(in,c); +  if (get(in,c++) != quote) return "ERROR"; +  std::string res; +  char cur = 0; +  do { +    cur = get(in,c++); +    if (cur == slash) { res += get(in,c++); } +    else if (cur != quote) { res += cur; } +  } while (get(in,c) != quote && (c < (int)in.size())); +  c++; +  eatws(in,c); +  return res; +} + +// basically atof +float getFloat(const std::string& in, int &c) +{ +  std::string tmp; +  eatws(in,c); +  while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') { +    tmp += get(in,c++); +  } +  eatws(in,c); +  if (tmp.empty()) { +    cerr << "Syntax error while reading number! col=" << c << endl; +    abort(); +  } +  return atof(tmp.c_str()); +} + +// basically atoi +int getInt(const std::string& in, int &c) +{ +  std::string tmp; +  eatws(in,c); +  while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') { +    tmp += get(in,c++); +  } +  eatws(in,c); +  return atoi(tmp.c_str()); +} + +// maximum number of nodes permitted +#define MAX_NODES 100000000 +// parse ('foo', 0.23) +void ReadPLFEdge(const std::string& in, int &c, int cur_node, Hypergraph* hg) { +  if (get(in,c++) != '(') { assert(!"PCN/PLF parse error: expected ( at start of cn alt block\n"); } +  vector<WordID> ewords(2, 0); +  ewords[1] = TD::Convert(getEscapedString(in,c)); +  TRulePtr r(new TRule(ewords)); +  r->ComputeArity(); +  // cerr << "RULE: " << r->AsString() << endl; +  if (get(in,c++) != ',') { cerr << in << endl; assert(!"PCN/PLF parse error: expected , after string\n"); } +  size_t cnNext = 1; +  std::vector<float> probs; +  probs.push_back(getFloat(in,c)); +  while (get(in,c) == ',') { +    c++; +    float val = getFloat(in,c); +    probs.push_back(val); +    // cerr << val << endl;  //REMO +  } +  //if we read more than one prob, this was a lattice, last item was column increment +  if (probs.size()>1) { +    cnNext = static_cast<size_t>(probs.back()); +    probs.pop_back(); +    if (cnNext < 1) { cerr << cnNext << endl; +             assert(!"PCN/PLF parse error: bad link length at last element of cn alt block\n"); } +  } +  if (get(in,c++) != ')') { assert(!"PCN/PLF parse error: expected ) at end of cn alt block\n"); } +  eatws(in,c); +  Hypergraph::TailNodeVector tail(1, cur_node); +  Hypergraph::Edge* edge = hg->AddEdge(r, tail); +  //cerr << "  <--" << cur_node << endl; +  int head_node = cur_node + cnNext; +  assert(head_node < MAX_NODES);  // prevent malicious PLFs from using all the memory +  if (hg->nodes_.size() < (head_node + 1)) { hg->ResizeNodes(head_node + 1); } +  hg->ConnectEdgeToHeadNode(edge, &hg->nodes_[head_node]); +  for (int i = 0; i < probs.size(); ++i) +    
edge->feature_values_.set_value(FD::Convert("Feature_" + boost::lexical_cast<string>(i)), probs[i]); +} + +// parse (('foo', 0.23), ('bar', 0.77)) +void ReadPLFNode(const std::string& in, int &c, int cur_node, int line, Hypergraph* hg) { +  //cerr << "PLF READING NODE " << cur_node << endl; +  if (hg->nodes_.size() < (cur_node + 1)) { hg->ResizeNodes(cur_node + 1); } +  if (get(in,c++) != '(') { cerr << line << ": Syntax error 1\n"; abort(); } +  eatws(in,c); +  while (1) { +    if (c > (int)in.size()) { break; } +    if (get(in,c) == ')') { +      c++; +      eatws(in,c); +      break; +    } +    if (get(in,c) == ',' && get(in,c+1) == ')') { +      c+=2; +      eatws(in,c); +      break; +    } +    if (get(in,c) == ',') { c++; eatws(in,c); } +    ReadPLFEdge(in, c, cur_node, hg); +  } +} + +} // namespace PLF  + +void HypergraphIO::ReadFromPLF(const std::string& in, Hypergraph* hg, int line) { +  hg->clear(); +  int c = 0; +  int cur_node = 0; +  if (in[c++] != '(') { cerr << line << ": Syntax error!\n"; abort(); } +  while (1) { +    if (c > (int)in.size()) { break; } +    if (PLF::get(in,c) == ')') { +      c++; +      PLF::eatws(in,c); +      break; +    } +    if (PLF::get(in,c) == ',' && PLF::get(in,c+1) == ')') { +      c+=2; +      PLF::eatws(in,c); +      break; +    } +    if (PLF::get(in,c) == ',') { c++; PLF::eatws(in,c); } +    PLF::ReadPLFNode(in, c, cur_node, line, hg); +    ++cur_node; +  } +  assert(cur_node == hg->nodes_.size() - 1); +} + +void HypergraphIO::PLFtoLattice(const string& plf, Lattice* pl) { +  Lattice& l = *pl; +  Hypergraph g; +  ReadFromPLF(plf, &g, 0); +  const int num_nodes = g.nodes_.size() - 1; +  l.resize(num_nodes); +  for (int i = 0; i < num_nodes; ++i) { +    vector<LatticeArc>& alts = l[i]; +    const Hypergraph::Node& node = g.nodes_[i]; +    const int num_alts = node.out_edges_.size(); +    alts.resize(num_alts); +    for (int j = 0; j < num_alts; ++j) { +      const Hypergraph::Edge& edge = g.edges_[node.out_edges_[j]]; +      alts[j].label = edge.rule_->e_[1]; +      alts[j].cost = edge.feature_values_.value(FD::Convert("Feature_0")); +      alts[j].dist2next = edge.head_node_ - node.id_; +    } +  } +} + +void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { +  vector<int> cats(hg.nodes_.size()); +  // each node in the translation forest becomes a "non-terminal" in the new +  // grammar, create the labels here +  const string kSEP = "_"; +  for (int i = 0; i < hg.nodes_.size(); ++i) { +    const char* pstr = "CAT"; +    if (hg.nodes_[i].cat_ < 0) +      pstr = TD::Convert(-hg.nodes_[i].cat_); +    cats[i] = TD::Convert(pstr + kSEP + boost::lexical_cast<string>(i)) * -1; +  } + +  for (int i = 0; i < hg.edges_.size(); ++i) { +    const Hypergraph::Edge& edge = hg.edges_[i]; +    const vector<WordID>& tgt = edge.rule_->e(); +    const vector<WordID>& src = edge.rule_->f(); +    TRulePtr rule(new TRule); +    rule->prev_i = edge.i_; +    rule->prev_j = edge.j_; +    rule->lhs_ = cats[edge.head_node_]; +    vector<WordID>& f = rule->f_; +    vector<WordID>& e = rule->e_; +    f.resize(tgt.size());   // swap source and target, since the parser +    e.resize(src.size());   // parses using the source side! 
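+    // the e side is walked first: terminals copy straight into f, and each
+    // variable is mapped to the synthetic category of its tail node while
+    // tn[] records the original variable ids in order; the f-side loop below
+    // then re-emits those ids so the swapped rule's variable indices stay aligned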
+    Hypergraph::TailNodeVector tn(edge.tail_nodes_.size()); +    int ntc = 0; +    for (int j = 0; j < tgt.size(); ++j) { +      const WordID& cur = tgt[j]; +      if (cur > 0) { +        f[j] = cur; +      } else { +        tn[ntc++] = cur; +        f[j] = cats[edge.tail_nodes_[-cur]]; +      } +    } +    ntc = 0; +    for (int j = 0; j < src.size(); ++j) { +      const WordID& cur = src[j]; +      if (cur > 0) { +        e[j] = cur; +      } else { +        e[j] = tn[ntc++]; +      } +    } +    rule->scores_ = edge.feature_values_; +    rule->parent_rule_ = edge.rule_; +    rule->ComputeArity(); +    cout << rule->AsString() << endl; +  } +} + +namespace B64 { + +static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; + +static void encodeblock(const unsigned char* in, ostream* os, int len) { +  char out[4]; +  out[0] = cb64[ in[0] >> 2 ]; +  out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; +  out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); +  out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); +  os->write(out, 4); +} + +void b64encode(const char* data, const size_t size, ostream* out) { +  size_t cur = 0; +  while(cur < size) { +    int len = min(static_cast<size_t>(3), size - cur); +    encodeblock(reinterpret_cast<const unsigned char*>(&data[cur]), out, len); +    cur += len; +  } +} + +static void decodeblock(const unsigned char* in, unsigned char* out) {    +  out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); +  out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); +  out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); +} + +bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { +  size_t cur = 0; +  size_t ocur = 0; +  unsigned char in[4]; +  while(cur < insize) { +    assert(ocur < outsize); +    for (int i = 0; i < 4; ++i) { +      unsigned char v = data[cur]; +      v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]); +      if (!v) { +        cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; +        return false; +      } +      v = (unsigned char) ((v == '$') ? '\0' : v - 61); +      if (v) in[i] = v - 1; else in[i] = 0; +      ++cur; +    } +    decodeblock(in, reinterpret_cast<unsigned char*>(&out[ocur])); +    ocur += 3; +  } +  return true; +} +} + diff --git a/decoder/hg_io.h b/decoder/hg_io.h new file mode 100644 index 00000000..7162106e --- /dev/null +++ b/decoder/hg_io.h @@ -0,0 +1,39 @@ +#ifndef _HG_IO_H_ +#define _HG_IO_H_ + +#include <iostream> + +#include "lattice.h" +class Hypergraph; + +struct HypergraphIO { + +  // the format is basically a list of nodes and edges in topological order +  // any edge you read, you must have already read its tail nodes +  // any node you read, you must have already read its incoming edges +  // this may make writing a bit more challenging if your forest is not +  // topologically sorted (but that probably doesn't happen very often), +  // but it makes reading much more memory efficient. 
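+  // a minimal sketch of the layout (mirroring the strings used in hg_test.cc):
+  //   {"rules":[1,"[X] ||| a"],"features":["f1"],
+  //    "edges":[{"tail":[],"feats":[0,-0.5],"rule":1}],
+  //    "node":{"in_edges":[0]}}
+  // i.e. each "edges" array is immediately followed by the "node" that the
+  // listed edges point into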
+  // see test_data/small.json.gz for an example encoding
+  static bool ReadFromJSON(std::istream* in, Hypergraph* out);
+
+  // if remove_rules is used, the hypergraph is serialized without rule information
+  // (so it only contains structure and feature information)
+  static bool WriteToJSON(const Hypergraph& hg, bool remove_rules, std::ostream* out);
+
+  static void WriteAsCFG(const Hypergraph& hg);
+
+  // serialization utils
+  static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0);
+  // return PLF string representation (undefined behavior on non-lattices)
+  static std::string AsPLF(const Hypergraph& hg, bool include_global_parentheses = true);
+  static void PLFtoLattice(const std::string& plf, Lattice* pl);
+  static std::string Escape(const std::string& s);  // PLF helper
+};
+
+namespace B64 {
+  bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize);
+  void b64encode(const char* data, const size_t size, std::ostream* out);
+}
+
+#endif
diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc
new file mode 100644
index 00000000..51934ad1
--- /dev/null
+++ b/decoder/hg_test.cc
@@ -0,0 +1,455 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "tdict.h"
+
+#include "json_parse.h"
+#include "filelib.h"
+#include "hg.h"
+#include "hg_io.h"
+#include "hg_intersect.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "inside_outside.h"
+
+using namespace std;
+
+class HGTest : public testing::Test {
+ protected:
+  virtual void SetUp() { }
+  virtual void TearDown() { }
+  void CreateHG(Hypergraph* hg) const;
+  void CreateHG_int(Hypergraph* hg) const;
+  void CreateHG_tiny(Hypergraph* hg) const;
+  void CreateHGBalanced(Hypergraph* hg) const;
+  void CreateLatticeHG(Hypergraph* hg) const;
+  void CreateTinyLatticeHG(Hypergraph* hg) const;
+};
+
+void HGTest::CreateTinyLatticeHG(Hypergraph* hg) const {
+  const string json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] b\",4,\"[X] ||| [1] B'\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.2],\"rule\":1},{\"tail\":[0],\"feats\":[0,-0.6],\"rule\":2}],\"node\":{\"in_edges\":[0,1]},\"edges\":[{\"tail\":[1],\"feats\":[0,-0.1],\"rule\":3},{\"tail\":[1],\"feats\":[0,-0.9],\"rule\":4}],\"node\":{\"in_edges\":[2,3]}}";
+  istringstream instr(json);
+  EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg));
+}
+
+void HGTest::CreateLatticeHG(Hypergraph* hg) const {
+  const string json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] A A\",4,\"[X] ||| [1] b\",5,\"[X] ||| [1] c\",6,\"[X] ||| [1] B C\",7,\"[X] ||| [1] A B C\",8,\"[X] ||| [1] 
CC\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[2,-0.3],\"rule\":1},{\"tail\":[0],\"feats\":[2,-0.6],\"rule\":2},{\"tail\":[0],\"feats\":[2,-1.7],\"rule\":3}],\"node\":{\"in_edges\":[0,1,2]},\"edges\":[{\"tail\":[1],\"feats\":[2,-0.5],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[2],\"feats\":[2,-0.6],\"rule\":5},{\"tail\":[1],\"feats\":[2,-0.8],\"rule\":6},{\"tail\":[0],\"feats\":[2,-0.01],\"rule\":7},{\"tail\":[2],\"feats\":[2,-0.8],\"rule\":8}],\"node\":{\"in_edges\":[4,5,6,7]}}"; +  istringstream instr(json); +  EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHG_tiny(Hypergraph* hg) const { +  const string json = "{\"rules\":[1,\"[X] ||| <s>\",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}"; +  istringstream instr(json); +  EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHG_int(Hypergraph* hg) const { +  const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}"; +  istringstream instr(json); +  EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHG(Hypergraph* hg) const { +  string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; +  istringstream instr(json); +  EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHGBalanced(Hypergraph* hg) const { +  const string json = "{\"rules\":[1,\"[X] ||| i\",2,\"[X] ||| a\",3,\"[X] ||| b\",4,\"[X] ||| [1] [2]\",5,\"[X] ||| [1] [2]\",6,\"[X] ||| c\",7,\"[X] ||| d\",8,\"[X] ||| [1] [2]\",9,\"[X] ||| [1] [2]\",10,\"[X] ||| [1] [2]\",11,\"[X] ||| [1] [2]\",12,\"[X] ||| [1] 
[2]\",13,\"[X] ||| [1] [2]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[1,2],\"feats\":[],\"rule\":4},{\"tail\":[2,1],\"feats\":[],\"rule\":5}],\"node\":{\"in_edges\":[3,4]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":6}],\"node\":{\"in_edges\":[5]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":7}],\"node\":{\"in_edges\":[6]},\"edges\":[{\"tail\":[4,5],\"feats\":[],\"rule\":8},{\"tail\":[5,4],\"feats\":[],\"rule\":9}],\"node\":{\"in_edges\":[7,8]},\"edges\":[{\"tail\":[3,6],\"feats\":[],\"rule\":10},{\"tail\":[6,3],\"feats\":[],\"rule\":11}],\"node\":{\"in_edges\":[9,10]},\"edges\":[{\"tail\":[7,0],\"feats\":[],\"rule\":12},{\"tail\":[0,7],\"feats\":[],\"rule\":13}],\"node\":{\"in_edges\":[11,12]}}"; +  istringstream instr(json); +  EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +TEST_F(HGTest,Controlled) { +  Hypergraph hg; +  CreateHG_tiny(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 0.8); +  hg.Reweight(wts); +  vector<WordID> trans; +  prob_t prob = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "prob: " << prob << "\n"; +  EXPECT_FLOAT_EQ(-80.839996, log(prob)); +  EXPECT_EQ("X <s>", TD::GetString(trans)); +  vector<prob_t> post; +  hg.PrintGraphviz(); +  prob_t c2 = Inside<prob_t, ScaledEdgeProb>(hg, NULL, ScaledEdgeProb(0.6)); +  EXPECT_FLOAT_EQ(-47.8577, log(c2)); +} + +TEST_F(HGTest,Union) { +  Hypergraph hg1; +  Hypergraph hg2; +  CreateHG_tiny(&hg1); +  CreateHG(&hg2); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 1.0); +  hg1.Reweight(wts); +  hg2.Reweight(wts); +  prob_t c1,c2,c3,c4; +  vector<WordID> t1,t2,t3,t4; +  c1 = ViterbiESentence(hg1, &t1); +  c2 = ViterbiESentence(hg2, &t2); +  int l2 = ViterbiPathLength(hg2); +  cerr << c1 << "\t" << TD::GetString(t1) << endl; +  cerr << c2 << "\t" << TD::GetString(t2) << endl; +  hg1.Union(hg2); +  hg1.Reweight(wts); +  c3 = ViterbiESentence(hg1, &t3); +  int l3 = ViterbiPathLength(hg1); +  cerr << c3 << "\t" << TD::GetString(t3) << endl; +  EXPECT_FLOAT_EQ(c2, c3); +  EXPECT_EQ(TD::GetString(t2), TD::GetString(t3)); +  EXPECT_EQ(l2, l3); + +  wts.set_value(FD::Convert("f2"), -1); +  hg1.Reweight(wts); +  c4 = ViterbiESentence(hg1, &t4); +  cerr << c4 << "\t" << TD::GetString(t4) << endl; +  EXPECT_EQ("Z <s>", TD::GetString(t4)); +  EXPECT_FLOAT_EQ(98.82, log(c4)); + +  vector<pair<vector<WordID>, prob_t> > list; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg1, 10); +  for (int i = 0; i < 10; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg1.nodes_.size() - 1, i); +    if (!d) break; +    list.push_back(make_pair(d->yield, d->score)); +  } +  EXPECT_TRUE(list[0].first == t4); +  EXPECT_FLOAT_EQ(log(list[0].second), log(c4)); +  EXPECT_EQ(list.size(), 6); +  EXPECT_FLOAT_EQ(log(list.back().second / list.front().second), -97.7); +} + +TEST_F(HGTest,ControlledKBest) { +  Hypergraph hg; +  CreateHG(&hg); +  vector<double> w(2); w[0]=0.4; w[1]=0.8; +  hg.Reweight(w); +  vector<WordID> trans; +  prob_t cost 
= ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; + +  int best = 0; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << TD::GetString(d->yield) << endl; +    ++best; +  } +  EXPECT_EQ(4, best); +} + + +TEST_F(HGTest,InsideScore) { +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 1.0); +  Hypergraph hg; +  CreateTinyLatticeHG(&hg); +  hg.Reweight(wts); +  vector<WordID> trans; +  prob_t cost = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; +  hg.PrintGraphviz(); +  prob_t inside = Inside<prob_t, EdgeProb>(hg); +  EXPECT_FLOAT_EQ(1.7934048, inside);  // computed by hand +  vector<prob_t> post; +  inside = hg.ComputeBestPathThroughEdges(&post); +  EXPECT_FLOAT_EQ(-0.3, log(inside));  // computed by hand +  EXPECT_EQ(post.size(), 4); +  for (int i = 0; i < 4; ++i) { +    cerr << "edge post: " << log(post[i]) << '\t' << hg.edges_[i].rule_->AsString() << endl; +  } +} + + +TEST_F(HGTest,PruneInsideOutside) { +  SparseVector<double> wts; +  wts.set_value(FD::Convert("Feature_1"), 1.0); +  Hypergraph hg; +  CreateLatticeHG(&hg); +  hg.Reweight(wts); +  vector<WordID> trans; +  prob_t cost = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; +  hg.PrintGraphviz(); +  //hg.DensityPruneInsideOutside(0.5, false, 2.0); +  hg.BeamPruneInsideOutside(0.5, false, 0.5); +  cost = ViterbiESentence(hg, &trans); +  cerr << "Ncst: " << cost << endl; +  cerr << TD::GetString(trans) << "\n"; +  hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestPruneEdges) { +  Hypergraph hg; +  CreateLatticeHG(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 1.0); +  hg.Reweight(wts); +  hg.PrintGraphviz(); +  vector<bool> prune(hg.edges_.size(), true); +  prune[6] = false; +  hg.PruneEdges(prune); +  cerr << "Pruned:\n"; +  hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestIntersect) { +  Hypergraph hg; +  CreateHG_int(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 1.0); +  hg.Reweight(wts); +  hg.PrintGraphviz(); + +  int best = 0; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << TD::GetString(d->yield) << endl; +    ++best; +  } +  EXPECT_EQ(4, best); + +  Lattice target(2); +  target[0].push_back(LatticeArc(TD::Convert("a"), 0.0, 1)); +  target[1].push_back(LatticeArc(TD::Convert("b"), 0.0, 1)); +  HG::Intersect(target, &hg); +  hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestPrune2) { +  Hypergraph hg; +  CreateHG_int(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 1.0); +  hg.Reweight(wts); +  hg.PrintGraphviz(); +  vector<bool> rem(hg.edges_.size(), false); +  rem[0] = true; +  rem[1] = true; +  hg.PruneEdges(rem); +  hg.PrintGraphviz(); +  cerr << "TODO: fix this pruning behavior-- the resulting HG should be empty!\n"; +} + +TEST_F(HGTest,Sample) { +  Hypergraph hg; +  CreateLatticeHG(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("Feature_1"), 0.0); +  hg.Reweight(wts); +  vector<WordID> trans; +  
prob_t cost = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; +  hg.PrintGraphviz(); +} + +TEST_F(HGTest,PLF) { +  Hypergraph hg; +  string inplf = "((('haupt',-2.06655,1),('hauptgrund',-5.71033,2),),(('grund',-1.78709,1),),(('für\\'',0.1,1),),)"; +  HypergraphIO::ReadFromPLF(inplf, &hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("Feature_0"), 1.0); +  hg.Reweight(wts); +  hg.PrintGraphviz(); +  string outplf = HypergraphIO::AsPLF(hg); +  cerr << " IN: " << inplf << endl; +  cerr << "OUT: " << outplf << endl; +  assert(inplf == outplf); +} + +TEST_F(HGTest,PushWeightsToGoal) { +  Hypergraph hg; +  CreateHG(&hg); +  vector<double> w(2); w[0]=0.4; w[1]=0.8; +  hg.Reweight(w); +  vector<WordID> trans; +  prob_t cost = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; +  hg.PrintGraphviz(); +  hg.PushWeightsToGoal(); +  hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestSpecialKBest) { +  Hypergraph hg; +  CreateHGBalanced(&hg); +  vector<double> w(1); w[0]=0; +  hg.Reweight(w); +  vector<pair<vector<WordID>, prob_t> > list; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 100000); +  for (int i = 0; i < 100000; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << TD::GetString(d->yield) << endl; +  } +  hg.PrintGraphviz(); +} + +TEST_F(HGTest, TestGenericViterbi) { +  Hypergraph hg; +  CreateHG_tiny(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 0.8); +  hg.Reweight(wts); +  vector<WordID> trans; +  const prob_t prob = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "prob: " << prob << "\n"; +  EXPECT_FLOAT_EQ(-80.839996, log(prob)); +  EXPECT_EQ("X <s>", TD::GetString(trans)); +} + +TEST_F(HGTest, TestGenericInside) { +  Hypergraph hg; +  CreateTinyLatticeHG(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 1.0); +  hg.Reweight(wts); +  vector<prob_t> inside; +  prob_t ins = Inside<prob_t, EdgeProb>(hg, &inside); +  EXPECT_FLOAT_EQ(1.7934048, ins);  // computed by hand +  vector<prob_t> outside; +  Outside<prob_t, EdgeProb>(hg, inside, &outside); +  EXPECT_EQ(3, outside.size()); +  EXPECT_FLOAT_EQ(1.7934048, outside[0]); +  EXPECT_FLOAT_EQ(1.3114071, outside[1]); +  EXPECT_FLOAT_EQ(1.0, outside[2]); +} + +TEST_F(HGTest,TestGenericInside2) { +  Hypergraph hg; +  CreateHG(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 0.8); +  hg.Reweight(wts); +  vector<prob_t> inside, outside; +  prob_t ins = Inside<prob_t, EdgeProb>(hg, &inside); +  Outside<prob_t, EdgeProb>(hg, inside, &outside); +  for (int i = 0; i < hg.nodes_.size(); ++i) +    cerr << i << "\t" << log(inside[i]) << "\t" << log(outside[i]) << endl; +  EXPECT_FLOAT_EQ(0, log(inside[0])); +  EXPECT_FLOAT_EQ(-1.7861683, log(outside[0])); +  EXPECT_FLOAT_EQ(-0.4, log(inside[1])); +  EXPECT_FLOAT_EQ(-1.3861683, log(outside[1])); +  EXPECT_FLOAT_EQ(-0.8, log(inside[2])); +  EXPECT_FLOAT_EQ(-0.986168, log(outside[2])); +  EXPECT_FLOAT_EQ(-0.96, log(inside[3])); +  EXPECT_FLOAT_EQ(-0.8261683, log(outside[3])); +  EXPECT_FLOAT_EQ(-1.562512, log(inside[4])); +  EXPECT_FLOAT_EQ(-0.22365622, log(outside[4])); +  EXPECT_FLOAT_EQ(-1.7861683, log(inside[5])); +  EXPECT_FLOAT_EQ(0, 
log(outside[5])); +} + +TEST_F(HGTest,TestAddExpectations) { +  Hypergraph hg; +  CreateHG(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 0.8); +  hg.Reweight(wts); +  SparseVector<prob_t> feat_exps; +  prob_t z = InsideOutside<prob_t, EdgeProb, +                  SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(hg, &feat_exps); +  EXPECT_FLOAT_EQ(-2.5439765, feat_exps[FD::Convert("f1")] / z); +  EXPECT_FLOAT_EQ(-2.6357865, feat_exps[FD::Convert("f2")] / z); +  cerr << feat_exps << endl; +  cerr << "Z=" << z << endl; +} + +TEST_F(HGTest, Small) { +  ReadFile rf("test_data/small.json.gz"); +  Hypergraph hg; +  assert(HypergraphIO::ReadFromJSON(rf.stream(), &hg)); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("Model_0"), -2.0); +  wts.set_value(FD::Convert("Model_1"), -0.5); +  wts.set_value(FD::Convert("Model_2"), -1.1); +  wts.set_value(FD::Convert("Model_3"), -1.0); +  wts.set_value(FD::Convert("Model_4"), -1.0); +  wts.set_value(FD::Convert("Model_5"), 0.5); +  wts.set_value(FD::Convert("Model_6"), 0.2); +  wts.set_value(FD::Convert("Model_7"), -3.0); +  hg.Reweight(wts); +  vector<WordID> trans; +  prob_t cost = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; +  vector<prob_t> post; +  prob_t c2 = Inside<prob_t, ScaledEdgeProb>(hg, NULL, ScaledEdgeProb(0.6)); +  EXPECT_FLOAT_EQ(2.1431036, log(c2)); +} + +TEST_F(HGTest, JSONTest) { +  ostringstream os; +  JSONParser::WriteEscapedString("\"I don't know\", she said.", &os); +  EXPECT_EQ("\"\\\"I don't know\\\", she said.\"", os.str()); +  ostringstream os2; +  JSONParser::WriteEscapedString("yes", &os2); +  EXPECT_EQ("\"yes\"", os2.str()); +} + +TEST_F(HGTest, TestGenericKBest) { +  Hypergraph hg; +  CreateHG(&hg); +  //CreateHGBalanced(&hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 1.0); +  hg.Reweight(wts); +  vector<WordID> trans; +  prob_t cost = ViterbiESentence(hg, &trans); +  cerr << TD::GetString(trans) << "\n"; +  cerr << "cost: " << cost << "\n"; + +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 1000); +  for (int i = 0; i < 1000; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << TD::GetString(d->yield) << " F:" << d->feature_values << endl; +  } +} + +TEST_F(HGTest, TestReadWriteHG) { +  Hypergraph hg,hg2; +  CreateHG(&hg); +  hg.edges_.front().j_ = 23; +  hg.edges_.back().prev_i_ = 99; +  ostringstream os; +  HypergraphIO::WriteToJSON(hg, false, &os); +  istringstream is(os.str()); +  HypergraphIO::ReadFromJSON(&is, &hg2); +  EXPECT_EQ(hg2.NumberOfPaths(), hg.NumberOfPaths()); +  EXPECT_EQ(hg2.edges_.front().j_, 23); +  EXPECT_EQ(hg2.edges_.back().prev_i_, 99); +} + +int main(int argc, char **argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} diff --git a/decoder/inside_outside.h b/decoder/inside_outside.h new file mode 100644 index 00000000..3c7518f2 --- /dev/null +++ b/decoder/inside_outside.h @@ -0,0 +1,112 @@ +#ifndef _INSIDE_H_ +#define _INSIDE_H_ + +#include <vector> +#include <algorithm> +#include "hg.h" + +// run the inside algorithm and return the inside score +// if result is non-NULL, result will contain the inside +// score for each node +// NOTE: WeightType()  must construct the semiring's additive identity +//       
WeightType(1) must construct the semiring's multiplicative identity +template<typename WeightType, typename WeightFunction> +WeightType Inside(const Hypergraph& hg, +                  std::vector<WeightType>* result = NULL, +                  const WeightFunction& weight = WeightFunction()) { +  const int num_nodes = hg.nodes_.size(); +  std::vector<WeightType> dummy; +  std::vector<WeightType>& inside_score = result ? *result : dummy; +  inside_score.resize(num_nodes); +  std::fill(inside_score.begin(), inside_score.end(), WeightType()); +  for (int i = 0; i < num_nodes; ++i) { +    const Hypergraph::Node& cur_node = hg.nodes_[i]; +    WeightType* const cur_node_inside_score = &inside_score[i]; +    const int num_in_edges = cur_node.in_edges_.size(); +    if (num_in_edges == 0) { +      *cur_node_inside_score = WeightType(1); +      continue; +    } +    for (int j = 0; j < num_in_edges; ++j) { +      const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; +      WeightType score = weight(edge); +      for (int k = 0; k < edge.tail_nodes_.size(); ++k) { +        const int tail_node_index = edge.tail_nodes_[k]; +        score *= inside_score[tail_node_index]; +      } +      *cur_node_inside_score += score; +    } +  } +  return inside_score.back(); +} + +template<typename WeightType, typename WeightFunction> +void Outside(const Hypergraph& hg, +             std::vector<WeightType>& inside_score, +             std::vector<WeightType>* result, +             const WeightFunction& weight = WeightFunction()) { +  assert(result); +  const int num_nodes = hg.nodes_.size(); +  assert(inside_score.size() == num_nodes); +  std::vector<WeightType>& outside_score = *result; +  outside_score.resize(num_nodes); +  std::fill(outside_score.begin(), outside_score.end(), WeightType()); +  outside_score.back() = WeightType(1); +  for (int i = num_nodes - 1; i >= 0; --i) { +    const Hypergraph::Node& cur_node = hg.nodes_[i]; +    const WeightType& head_node_outside_score = outside_score[i]; +    const int num_in_edges = cur_node.in_edges_.size(); +    for (int j = 0; j < num_in_edges; ++j) { +      const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; +      WeightType head_and_edge_weight = weight(edge); +      head_and_edge_weight *= head_node_outside_score; +      const int num_tail_nodes = edge.tail_nodes_.size(); +      for (int k = 0; k < num_tail_nodes; ++k) { +        const int update_tail_node_index = edge.tail_nodes_[k]; +        WeightType* const tail_outside_score = &outside_score[update_tail_node_index]; +        WeightType inside_contribution = WeightType(1); +        for (int l = 0; l < num_tail_nodes; ++l) { +          const int other_tail_node_index = edge.tail_nodes_[l]; +          if (update_tail_node_index != other_tail_node_index) +            inside_contribution *= inside_score[other_tail_node_index]; +        } +        inside_contribution *= head_and_edge_weight; +        *tail_outside_score += inside_contribution; +      } +    } +  } +} + +// this is the Inside-Outside optimization described in Li and Eisner (EMNLP 2009) +// for computing the inside algorithm over expensive semirings +// (such as expectations over features).  See Figure 4. +// NOTE: XType * KType must be valid (and yield XType) +// NOTE: This may do things slightly differently than you are used to, please +// read the description in Li and Eisner (2009) carefully! 
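+// Concretely, the routine below returns k = Inside(hg) and accumulates
+//   x = sum_e xwf(e) * kbar(e),  kbar(e) = outside(head(e)) * prod_{t in tails(e)} inside(t),
+// so any k(e) factor must be folded into xwf itself (see the XType * KType
+// note above); dividing x by k then yields an expectation, as done in
+// hg_test.cc's TestAddExpectations (feat_exps / z).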
+template<typename KType, typename KWeightFunction, typename XType, typename XWeightFunction>
+KType InsideOutside(const Hypergraph& hg,
+                    XType* result_x,
+                    const KWeightFunction& kwf = KWeightFunction(),
+                    const XWeightFunction& xwf = XWeightFunction()) {
+  const int num_nodes = hg.nodes_.size();
+  std::vector<KType> inside, outside;
+  const KType k = Inside<KType,KWeightFunction>(hg, &inside, kwf);
+  Outside<KType,KWeightFunction>(hg, inside, &outside, kwf);
+  XType& x = *result_x;
+  x = XType();      // default constructor is semiring 0
+  for (int i = 0; i < num_nodes; ++i) {
+    const Hypergraph::Node& cur_node = hg.nodes_[i];
+    const int num_in_edges = cur_node.in_edges_.size();
+    for (int j = 0; j < num_in_edges; ++j) {
+      const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+      KType kbar_e = outside[i];
+      const int num_tail_nodes = edge.tail_nodes_.size();
+      for (int t = 0; t < num_tail_nodes; ++t)
+        kbar_e *= inside[edge.tail_nodes_[t]];
+      x += xwf(edge) * kbar_e;
+    }
+  }
+  return k;
+}
+
+#endif
diff --git a/decoder/json_parse.cc b/decoder/json_parse.cc
new file mode 100644
index 00000000..f6fdfea8
--- /dev/null
+++ b/decoder/json_parse.cc
@@ -0,0 +1,50 @@
+#include "json_parse.h"
+
+#include <string>
+#include <iostream>
+
+using namespace std;
+
+static const char *json_hex_chars = "0123456789abcdef";
+
+void JSONParser::WriteEscapedString(const string& in, ostream* out) {
+  int pos = 0;
+  int start_offset = 0;
+  unsigned char c = 0;
+  (*out) << '"';
+  while(pos < in.size()) {
+    c = in[pos];
+    switch(c) {
+    case '\b':
+    case '\n':
+    case '\r':
+    case '\t':
+    case '"':
+    case '\\':
+    case '/':
+      if(pos - start_offset > 0)
+	(*out) << in.substr(start_offset, pos - start_offset);
+      if(c == '\b') (*out) << "\\b";
+      else if(c == '\n') (*out) << "\\n";
+      else if(c == '\r') (*out) << "\\r";
+      else if(c == '\t') (*out) << "\\t";
+      else if(c == '"') (*out) << "\\\"";
+      else if(c == '\\') (*out) << "\\\\";
+      else if(c == '/') (*out) << "\\/";
+      start_offset = ++pos;
+      break;
+    default:
+      if(c < ' ') {
+        cerr << "Warning, bad character (" << static_cast<int>(c) << ") in string\n";
+	if(pos - start_offset > 0)
+	  (*out) << in.substr(start_offset, pos - start_offset);
+	(*out) << "\\u00" << json_hex_chars[c >> 4] << json_hex_chars[c & 0xf];
+	start_offset = ++pos;
+      } else pos++;
+    }
+  }
+  if(pos - start_offset > 0)
+    (*out) << in.substr(start_offset, pos - start_offset);
+  (*out) << '"';
+}
+
diff --git a/decoder/json_parse.h b/decoder/json_parse.h
new file mode 100644
index 00000000..c3cba954
--- /dev/null
+++ b/decoder/json_parse.h
@@ -0,0 +1,58 @@
+#ifndef _JSON_WRAPPER_H_
+#define _JSON_WRAPPER_H_
+
+#include <iostream>
+#include <cassert>
+#include "JSON_parser.h"
+
+class JSONParser {
+ public:
+  JSONParser() {
+    init_JSON_config(&config);
+    hack.mf = &JSONParser::Callback;
+    config.depth = 10;
+    config.callback_ctx = reinterpret_cast<void*>(this);
+    config.callback = hack.cb;
+    config.allow_comments = 1;
+    config.handle_floats_manually = 1;
+    jc = new_JSON_parser(&config);
+  }
+  virtual ~JSONParser() {
+    delete_JSON_parser(jc);
+  }
+  bool Parse(std::istream* in) {
+    int count = 0;
+    int lc = 1;
+    for (; in ; ++count) {
+      int next_char = in->get();
+      if (!in->good()) break;
+      if (next_char == '\n') { ++lc; }
+
     if (!JSON_parser_char(jc, next_char)) { +        std::cerr << "JSON_parser_char: syntax error, line " << lc << " (byte " << count << ")" << std::endl; +        return false; +      } +    } +    if (!JSON_parser_done(jc)) { +      std::cerr << "JSON_parser_done: syntax error\n"; +      return false; +    } +    return true; +  } +  static void WriteEscapedString(const std::string& in, std::ostream* out); + protected: +  virtual bool HandleJSONEvent(int type, const JSON_value* value) = 0; + private: +  int Callback(int type, const JSON_value* value) { +    if (HandleJSONEvent(type, value)) return 1; +    return 0; +  } +  JSON_parser_struct* jc; +  JSON_config config; +  typedef int (JSONParser::* MF)(int type, const struct JSON_value_struct* value); +  union CBHack { +    JSON_parser_callback cb; +    MF mf; +  } hack; +}; + +#endif diff --git a/decoder/kbest.h b/decoder/kbest.h new file mode 100644 index 00000000..fcd40fcd --- /dev/null +++ b/decoder/kbest.h @@ -0,0 +1,208 @@ +#ifndef _HG_KBEST_H_ +#define _HG_KBEST_H_ + +#include <vector> +#include <utility> +#include <tr1/unordered_set> + +#include <boost/shared_ptr.hpp> + +#include "wordid.h" +#include "hg.h" + +namespace KBest { +  // default, don't filter any derivations from the k-best list +  struct NoFilter { +    bool operator()(const std::vector<WordID>& yield) { +      (void) yield; +      return false; +    } +  }; + +  // optional, filter unique yield strings +  struct FilterUnique { +    std::tr1::unordered_set<std::vector<WordID>, boost::hash<std::vector<WordID> > > unique; + +    bool operator()(const std::vector<WordID>& yield) { +      return !unique.insert(yield).second; +    } +  }; + +  // utility class to lazily create the k-best derivations from a forest, uses +  // the lazy k-best algorithm (Algorithm 3) from Huang and Chiang (IWPT 2005) +  template<typename T,  // yield type (returned by Traversal) +           typename Traversal, +           typename DerivationFilter = NoFilter, +           typename WeightType = prob_t, +           typename WeightFunction = EdgeProb> +  struct KBestDerivations { +    KBestDerivations(const Hypergraph& hg, +                     const size_t k, +                     const Traversal& tf = Traversal(), +                     const WeightFunction& wf = WeightFunction()) : +      traverse(tf), w(wf), g(hg), nds(g.nodes_.size()), k_prime(k) {} + +    ~KBestDerivations() { +      for (int i = 0; i < freelist.size(); ++i) +        delete freelist[i]; +    } + +    struct Derivation { +      Derivation(const Hypergraph::Edge& e, +                 const SmallVector& jv, +                 const WeightType& w, +                 const SparseVector<double>& f) : +        edge(&e), +        j(jv), +        score(w), +        feature_values(f) {} + +      // dummy constructor, just for query +      Derivation(const Hypergraph::Edge& e, +                 const SmallVector& jv) : edge(&e), j(jv) {} + +      T yield; +      const Hypergraph::Edge* const edge; +      const SmallVector j; +      const WeightType score; +      const SparseVector<double> feature_values; +    }; +    struct HeapCompare { +      bool operator()(const Derivation* a, const Derivation* b) const { +        return a->score < b->score; +      } +    }; +    struct DerivationCompare { +      bool operator()(const Derivation* a, const Derivation* b) const { +        return a->score > b->score; +      } +    }; +    struct DerivationUniquenessHash { +      size_t operator()(const Derivation* d) const { +        size_t x = 5381; 
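+        // djb2-style mixing (x = x*33 ^ v) over the edge id and the j vector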
+        x = ((x << 5) + x) ^ d->edge->id_;
+        for (int i = 0; i < d->j.size(); ++i)
+          x = ((x << 5) + x) ^ d->j[i];
+        return x;
+      }
+    };
+    struct DerivationUniquenessEquals {
+      bool operator()(const Derivation* a, const Derivation* b) const {
+        return (a->edge == b->edge) && (a->j == b->j);
+      }
+    };
+    typedef std::vector<Derivation*> CandidateHeap;
+    typedef std::vector<Derivation*> DerivationList;
+    typedef std::tr1::unordered_set<
+       const Derivation*, DerivationUniquenessHash, DerivationUniquenessEquals> UniqueDerivationSet;
+
+    struct NodeDerivationState {
+      CandidateHeap cand;
+      DerivationList D;
+      DerivationFilter filter;
+      UniqueDerivationSet ds;
+      explicit NodeDerivationState(const DerivationFilter& f = DerivationFilter()) : filter(f) {}
+    };
+
+    Derivation* LazyKthBest(int v, int k) {
+      NodeDerivationState& s = GetCandidates(v);
+      CandidateHeap& cand = s.cand;
+      DerivationList& D = s.D;
+      DerivationFilter& filter = s.filter;
+      bool add_next = true;
+      while (D.size() <= k) {
+        if (add_next && D.size() > 0) {
+          const Derivation* d = D.back();
+          LazyNext(d, &cand, &s.ds);
+        }
+        add_next = false;
+
+        if (cand.size() > 0) {
+          std::pop_heap(cand.begin(), cand.end(), HeapCompare());
+          Derivation* d = cand.back();
+          cand.pop_back();
+          std::vector<const T*> ants(d->edge->Arity());
+          for (int j = 0; j < ants.size(); ++j)
+            ants[j] = &LazyKthBest(d->edge->tail_nodes_[j], d->j[j])->yield;
+          traverse(*d->edge, ants, &d->yield);
+          if (!filter(d->yield)) {
+            D.push_back(d);
+            add_next = true;
+          }
+        } else {
+          break;
+        }
+      }
+      if (k < D.size()) return D[k]; else return NULL;
+    }
+
+  private:
+    // creates a derivation object with all fields set but the yield
+    // the yield is computed in LazyKthBest before the derivation is added to D
+    // returns NULL if j refers to derivation numbers larger than the
+    // antecedent structures define
+    Derivation* CreateDerivation(const Hypergraph::Edge& e, const SmallVector& j) {
+      WeightType score = w(e);
+      SparseVector<double> feats = e.feature_values_;
+      for (int i = 0; i < e.Arity(); ++i) {
+        const Derivation* ant = LazyKthBest(e.tail_nodes_[i], j[i]);
+        if (!ant) { return NULL; }
+        score *= ant->score;
+        feats += ant->feature_values;
+      }
+      freelist.push_back(new Derivation(e, j, score, feats));
+      return freelist.back();
+    }
+
+    NodeDerivationState& GetCandidates(int v) {
+      NodeDerivationState& s = nds[v];
+      if (!s.D.empty() || !s.cand.empty()) return s;
+
+      const Hypergraph::Node& node = g.nodes_[v];
+      for (int i = 0; i < node.in_edges_.size(); ++i) {
+        const Hypergraph::Edge& edge = g.edges_[node.in_edges_[i]];
+        SmallVector jv(edge.Arity(), 0);
+        Derivation* d = CreateDerivation(edge, jv);
+        assert(d);
+        s.cand.push_back(d);
+      }
+
+      const int effective_k = std::min(k_prime, s.cand.size());
+      const typename CandidateHeap::iterator kth = s.cand.begin() + effective_k;
+      std::nth_element(s.cand.begin(), kth, s.cand.end(), DerivationCompare());
+      s.cand.resize(effective_k);
+      std::make_heap(s.cand.begin(), s.cand.end(), HeapCompare());
+
+      return s;
+    }
+
+    void 
LazyNext(const Derivation* d, CandidateHeap* cand, UniqueDerivationSet* ds) { +      for (int i = 0; i < d->j.size(); ++i) { +        SmallVector j = d->j; +        ++j[i]; +        const Derivation* ant = LazyKthBest(d->edge->tail_nodes_[i], j[i]); +        if (ant) { +          Derivation query_unique(*d->edge, j); +          if (ds->count(&query_unique) == 0) { +            Derivation* new_d = CreateDerivation(*d->edge, j); +            if (new_d) { +              cand->push_back(new_d); +              std::push_heap(cand->begin(), cand->end(), HeapCompare()); +              bool inserted = ds->insert(new_d).second;  // insert into uniqueness set +              assert(inserted); +            } +          } +        } +      } +    } + +    const Traversal traverse; +    const WeightFunction w; +    const Hypergraph& g; +    std::vector<NodeDerivationState> nds; +    std::vector<Derivation*> freelist; +    const size_t k_prime; +  }; +} + +#endif diff --git a/decoder/lattice.cc b/decoder/lattice.cc new file mode 100644 index 00000000..e3631e59 --- /dev/null +++ b/decoder/lattice.cc @@ -0,0 +1,62 @@ +#include "lattice.h" + +#include "tdict.h" +#include "hg_io.h" + +using namespace std; + +static const int kUNREACHABLE = 99999999; + +void Lattice::ComputeDistances() { +  const int n = this->size() + 1; +  dist_.resize(n, n, kUNREACHABLE); +  for (int i = 0; i < this->size(); ++i) { +    const vector<LatticeArc>& alts = (*this)[i]; +    for (int j = 0; j < alts.size(); ++j) +      dist_(i, i + alts[j].dist2next) = 1; +  } +  for (int k = 0; k < n; ++k) { +    for (int i = 0; i < n; ++i) { +      for (int j = 0; j < n; ++j) { +        const int dp = dist_(i,k) + dist_(k,j); +        if (dist_(i,j) > dp) +          dist_(i,j) = dp; +      } +    } +  } + +  for (int i = 0; i < n; ++i) { +    int latest = kUNREACHABLE; +    for (int j = n-1; j >= 0; --j) { +      const int c = dist_(i,j); +      if (c < kUNREACHABLE) +        latest = c; +      else +        dist_(i,j) = latest; +    } +  } +  // cerr << dist_ << endl; +} + +bool LatticeTools::LooksLikePLF(const string &line) { +  return (line.size() > 5) && (line.substr(0,4) == "((('"); +} + +void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) { +  Lattice& l = *pl; +  vector<WordID> ids; +  TD::ConvertSentence(text, &ids); +  l.resize(ids.size()); +  for (int i = 0; i < l.size(); ++i) +    l[i].push_back(LatticeArc(ids[i], 0.0, 1)); +  l.is_sentence_ = true; +} + +void LatticeTools::ConvertTextOrPLF(const string& text_or_plf, Lattice* pl) { +  if (LooksLikePLF(text_or_plf)) +    HypergraphIO::PLFtoLattice(text_or_plf, pl); +  else +    ConvertTextToLattice(text_or_plf, pl); +  pl->ComputeDistances(); +} + diff --git a/decoder/lattice.h b/decoder/lattice.h new file mode 100644 index 00000000..ad4ca50d --- /dev/null +++ b/decoder/lattice.h @@ -0,0 +1,46 @@ +#ifndef __LATTICE_H_ +#define __LATTICE_H_ + +#include <string> +#include <vector> +#include "wordid.h" +#include "array2d.h" + +class Lattice; +struct LatticeTools { +  static bool LooksLikePLF(const std::string &line); +  static void ConvertTextToLattice(const std::string& text, Lattice* pl); +  static void ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl); +}; + +struct LatticeArc { +  WordID label; +  double cost; +  int dist2next; +  LatticeArc() : label(), cost(), dist2next() {} +  LatticeArc(WordID w, double c, int i) : label(w), cost(c), dist2next(i) {} +}; + +class Lattice : public std::vector<std::vector<LatticeArc> > { +  friend void 
LatticeTools::ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl); +  friend void LatticeTools::ConvertTextToLattice(const std::string& text, Lattice* pl); + public: +  Lattice() : is_sentence_(false) {} +  explicit Lattice(size_t t, const std::vector<LatticeArc>& v = std::vector<LatticeArc>()) : +   std::vector<std::vector<LatticeArc> >(t, v), +   is_sentence_(false) {} +  int Distance(int from, int to) const { +    if (dist_.empty()) +      return (to - from); +    return dist_(from, to); +  } +  // TODO this should actually be computed based on the contents +  // of the lattice +  bool IsSentence() const { return is_sentence_; } + private: +  void ComputeDistances(); +  Array2D<int> dist_; +  bool is_sentence_; +}; + +#endif diff --git a/decoder/lexalign.cc b/decoder/lexalign.cc new file mode 100644 index 00000000..6adb1892 --- /dev/null +++ b/decoder/lexalign.cc @@ -0,0 +1,129 @@ +#include "lexalign.h" + +#include <iostream> + +#include "filelib.h" +#include "hg.h" +#include "tdict.h" +#include "grammar.h" +#include "sentence_metadata.h" + +using namespace std; + +struct LexicalAlignImpl { +  LexicalAlignImpl(const boost::program_options::variables_map& conf) : +      use_null(conf.count("lexcrf_use_null") > 0), +      kXCAT(TD::Convert("X")*-1), +      kNULL(TD::Convert("<eps>")), +      kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")), +      kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) { +  } + +  void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) { +    const int e_len = smeta.GetTargetLength(); +    assert(e_len > 0); +    const Lattice& target = smeta.GetReference(); +    const int f_len = lattice.size(); +    // hack to tell the feature function system how big the sentence pair is +    const int f_start = (use_null ? -1 : 0); +    int prev_node_id = -1; +    for (int i = 0; i < e_len; ++i) {  // for each word in the *target* +      const WordID& e_i = target[i][0].label; +      Hypergraph::Node* node = forest->AddNode(kXCAT); +      const int new_node_id = node->id_; +      int num_srcs = 0; +      for (int j = f_start; j < f_len; ++j) { // for each word in the source +        const WordID src_sym = (j < 0 ? 
kNULL : lattice[j][0].label);
+        const TRulePtr& rule = LexRule(src_sym, e_i);
+        if (rule) {
+          Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+          edge->i_ = j;
+          edge->j_ = j+1;
+          edge->prev_i_ = i;
+          edge->prev_j_ = i+1;
+          edge->feature_values_ += edge->rule_->GetFeatureValues();
+          ++num_srcs;
+          forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+        } else {
+          cerr << TD::Convert(src_sym) << " does not translate to " << TD::Convert(e_i) << endl;
+        }
+      }
+      assert(num_srcs > 0);
+      if (prev_node_id >= 0) {
+        const int comb_node_id = forest->AddNode(kXCAT)->id_;
+        Hypergraph::TailNodeVector tail(2, prev_node_id);
+        tail[1] = new_node_id;
+        Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail);
+        forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id);
+        prev_node_id = comb_node_id;
+      } else {
+        prev_node_id = new_node_id;
+      }
+    }
+    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+    Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+    forest->ConnectEdgeToHeadNode(hg_edge, goal);
+  }
+
+  inline int LexFeatureId(const WordID& f, const WordID& e) {
+    map<int, int>& e2fid = f2e2fid[f];
+    map<int, int>::iterator it = e2fid.find(e);
+    if (it != e2fid.end())
+      return it->second;
+    int& fid = e2fid[e];
+    if (f == 0) {
+      fid = FD::Convert("Lx:<eps>_" + FD::Escape(TD::Convert(e)));
+    } else {
+      fid = FD::Convert("Lx:" + FD::Escape(TD::Convert(f)) + "_" + FD::Escape(TD::Convert(e)));
+    }
+    return fid;
+  }
+
+  inline const TRulePtr& LexRule(const WordID& f, const WordID& e) {
+    const int fid = LexFeatureId(f, e);
+    if (!fid) { return kNULL_PTR; }
+    map<int, TRulePtr>& e2rule = f2e2rule[f];
+    map<int, TRulePtr>::iterator it = e2rule.find(e);
+    if (it != e2rule.end())
+      return it->second;
+    TRulePtr& tr = e2rule[e];
+    tr.reset(TRule::CreateLexicalRule(f, e));
+    tr->scores_.set_value(fid, 1.0);
+    return tr;
+  }
+
+ private:
+  const bool use_null;
+  const WordID kXCAT;
+  const WordID kNULL;
+  const TRulePtr kBINARY;
+  const TRulePtr kGOAL_RULE;
+  const TRulePtr kNULL_PTR;
+  map<int, map<int, TRulePtr> > f2e2rule;
+  map<int, map<int, int> > f2e2fid;
+  GrammarPtr grammar;
+};
+
+LexicalAlign::LexicalAlign(const boost::program_options::variables_map& conf) :
+  pimpl_(new LexicalAlignImpl(conf)) {}
+
+bool LexicalAlign::TranslateImpl(const string& input,
+                      SentenceMetadata* smeta,
+                      const vector<double>& weights,
+                      Hypergraph* forest) {
+  Lattice& lattice = smeta->src_lattice_;
+  LatticeTools::ConvertTextOrPLF(input, &lattice);
+  if (!lattice.IsSentence()) {
+    // lexical models make independence assumptions
+    // that don't work with lattices or conf nets
+    cerr << "LexicalAlign: cannot deal with lattice source input!\n";
+    abort();
+  }
+  smeta->SetSourceLength(lattice.size());
+  pimpl_->BuildTrellis(lattice, *smeta, forest);
+  forest->is_linear_chain_ = true;
+  forest->Reweight(weights);
+  return true;
+}
+
diff --git a/decoder/lexalign.h b/decoder/lexalign.h new file mode 100644 index 00000000..7ba4fe64 --- /dev/null +++ b/decoder/lexalign.h @@ -0,0 +1,18 @@
+#ifndef _LEXALIGN_H_
+#define _LEXALIGN_H_
+
+#include "translator.h"
+#include "lattice.h" + +struct LexicalAlignImpl; +struct LexicalAlign : public Translator { +  LexicalAlign(const boost::program_options::variables_map& conf); +  bool TranslateImpl(const std::string& input, +                 SentenceMetadata* smeta, +                 const std::vector<double>& weights, +                 Hypergraph* forest); + private: +  boost::shared_ptr<LexicalAlignImpl> pimpl_; +}; + +#endif diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc new file mode 100644 index 00000000..3fcd1a7d --- /dev/null +++ b/decoder/lextrans.cc @@ -0,0 +1,119 @@ +#include "lextrans.h" + +#include <iostream> + +#include "filelib.h" +#include "hg.h" +#include "tdict.h" +#include "grammar.h" +#include "sentence_metadata.h" + +using namespace std; + +struct LexicalTransImpl { +  LexicalTransImpl(const boost::program_options::variables_map& conf) : +      use_null(conf.count("lexcrf_use_null") > 0), +      kXCAT(TD::Convert("X")*-1), +      kNULL(TD::Convert("<eps>")), +      kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")), +      kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) { +    vector<string> gfiles = conf["grammar"].as<vector<string> >(); +    assert(gfiles.size() == 1); +    ReadFile rf(gfiles.front()); +    TextGrammar *tg = new TextGrammar; +    grammar.reset(tg); +    istream* in = rf.stream(); +    int lc = 0; +    bool flag = false; +    while(*in) { +      string line; +      getline(*in, line); +      if (line.empty()) continue; +      ++lc; +      TRulePtr r(TRule::CreateRulePhrasetable(line)); +      tg->AddRule(r); +      if (lc %   50000 == 0) { cerr << '.'; flag = true; } +      if (lc % 2000000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } +    } +    if (flag) cerr << endl; +    cerr << "Loaded " << lc << " rules\n"; +  } + +  void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) { +    const int e_len = smeta.GetTargetLength(); +    assert(e_len > 0); +    const int f_len = lattice.size(); +    // hack to tell the feature function system how big the sentence pair is +    const int f_start = (use_null ? -1 : 0); +    int prev_node_id = -1; +    for (int i = 0; i < e_len; ++i) {  // for each word in the *target* +      Hypergraph::Node* node = forest->AddNode(kXCAT); +      const int new_node_id = node->id_; +      for (int j = f_start; j < f_len; ++j) { // for each word in the source +        const WordID src_sym = (j < 0 ? 
kNULL : lattice[j][0].label); +        const GrammarIter* gi = grammar->GetRoot()->Extend(src_sym); +        if (!gi) { +          cerr << "No translations found for: " << TD::Convert(src_sym) << "\n"; +          abort(); +        } +        const RuleBin* rb = gi->GetRules(); +        assert(rb); +        for (int k = 0; k < rb->GetNumRules(); ++k) { +          TRulePtr rule = rb->GetIthRule(k); +          Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); +          edge->i_ = j; +          edge->j_ = j+1; +          edge->prev_i_ = i; +          edge->prev_j_ = i+1; +          edge->feature_values_ += edge->rule_->GetFeatureValues(); +          forest->ConnectEdgeToHeadNode(edge->id_, new_node_id); +        } +      } +      if (prev_node_id >= 0) { +        const int comb_node_id = forest->AddNode(kXCAT)->id_; +        Hypergraph::TailNodeVector tail(2, prev_node_id); +        tail[1] = new_node_id; +        Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail); +        forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id); +        prev_node_id = comb_node_id; +      } else { +        prev_node_id = new_node_id; +      } +    } +    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); +    Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); +    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); +    forest->ConnectEdgeToHeadNode(hg_edge, goal); +  } + + private: +  const bool use_null; +  const WordID kXCAT; +  const WordID kNULL; +  const TRulePtr kBINARY; +  const TRulePtr kGOAL_RULE; +  GrammarPtr grammar; +}; + +LexicalTrans::LexicalTrans(const boost::program_options::variables_map& conf) : +  pimpl_(new LexicalTransImpl(conf)) {} + +bool LexicalTrans::TranslateImpl(const string& input, +                      SentenceMetadata* smeta, +                      const vector<double>& weights, +                      Hypergraph* forest) { +  Lattice& lattice = smeta->src_lattice_; +  LatticeTools::ConvertTextOrPLF(input, &lattice); +  if (!lattice.IsSentence()) { +    // lexical models make independence assumptions +    // that don't work with lattices or conf nets +    cerr << "LexicalTrans: cannot deal with lattice source input!\n"; +    abort(); +  } +  smeta->SetSourceLength(lattice.size()); +  pimpl_->BuildTrellis(lattice, *smeta, forest); +  forest->is_linear_chain_ = true; +  forest->Reweight(weights); +  return true; +} + diff --git a/decoder/lextrans.h b/decoder/lextrans.h new file mode 100644 index 00000000..2d51e7c0 --- /dev/null +++ b/decoder/lextrans.h @@ -0,0 +1,18 @@ +#ifndef _LEXTrans_H_ +#define _LEXTrans_H_ + +#include "translator.h" +#include "lattice.h" + +struct LexicalTransImpl; +struct LexicalTrans : public Translator { +  LexicalTrans(const boost::program_options::variables_map& conf); +  bool TranslateImpl(const std::string& input, +                 SentenceMetadata* smeta, +                 const std::vector<double>& weights, +                 Hypergraph* forest); + private: +  boost::shared_ptr<LexicalTransImpl> pimpl_; +}; + +#endif diff --git a/decoder/logval.h b/decoder/logval.h new file mode 100644 index 00000000..7099b9be --- /dev/null +++ b/decoder/logval.h @@ -0,0 +1,157 @@ +#ifndef LOGVAL_H_ +#define LOGVAL_H_ + +#include <iostream> +#include <cstdlib> +#include <cmath> +#include <limits> + +template <typename T> +class LogVal { + public: +  LogVal() : s_(), v_(-std::numeric_limits<T>::infinity()) {} +  explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? 
std::log(-x) : std::log(x)) {} +  static LogVal<T> One() { return LogVal(1); } +  static LogVal<T> Zero() { return LogVal(); } + +  void logeq(const T& v) { s_ = false; v_ = v; } + +  LogVal& operator+=(const LogVal& a) { +    if (a.v_ == -std::numeric_limits<T>::infinity()) return *this; +    if (a.s_ == s_) { +      if (a.v_ < v_) { +        v_ = v_ + log1p(std::exp(a.v_ - v_)); +      } else { +        v_ = a.v_ + log1p(std::exp(v_ - a.v_)); +      } +    } else { +      if (a.v_ < v_) { +        v_ = v_ + log1p(-std::exp(a.v_ - v_)); +      } else { +        v_ = a.v_ + log1p(-std::exp(v_ - a.v_)); +        s_ = !s_; +      } +    } +    return *this; +  } + +  LogVal& operator*=(const LogVal& a) { +    s_ = (s_ != a.s_); +    v_ += a.v_; +    return *this; +  } + +  LogVal& operator/=(const LogVal& a) { +    s_ = (s_ != a.s_); +    v_ -= a.v_; +    return *this; +  } + +  LogVal& operator-=(const LogVal& a) { +    LogVal b = a; +    b.invert(); +    return *this += b; +  } + +  LogVal& poweq(const T& power) { +    if (s_) { +      std::cerr << "poweq(T) not implemented when s_ is true\n"; +      std::abort(); +    } else { +      v_ *= power; +    } +    return *this; +  } + +  void invert() { s_ = !s_; } + +  LogVal pow(const T& power) const { +    LogVal res = *this; +    res.poweq(power); +    return res; +  } + +  operator T() const { +    if (s_) return -std::exp(v_); else return std::exp(v_); +  } + +  bool s_; +  T v_; +}; + +template<typename T> +LogVal<T> operator+(const LogVal<T>& o1, const LogVal<T>& o2) { +  LogVal<T> res(o1); +  res += o2; +  return res; +} + +template<typename T> +LogVal<T> operator*(const LogVal<T>& o1, const LogVal<T>& o2) { +  LogVal<T> res(o1); +  res *= o2; +  return res; +} + +template<typename T> +LogVal<T> operator/(const LogVal<T>& o1, const LogVal<T>& o2) { +  LogVal<T> res(o1); +  res /= o2; +  return res; +} + +template<typename T> +LogVal<T> operator-(const LogVal<T>& o1, const LogVal<T>& o2) { +  LogVal<T> res(o1); +  res -= o2; +  return res; +} + +template<typename T> +T log(const LogVal<T>& o) { +  if (o.s_) return log(-1.0); +  return o.v_; +} + +template <typename T> +LogVal<T> pow(const LogVal<T>& b, const T& e) { +  return b.pow(e); +} + +template <typename T> +bool operator<(const LogVal<T>& lhs, const LogVal<T>& rhs) { +  if (lhs.s_ == rhs.s_) { +    return (lhs.v_ < rhs.v_); +  } else { +    return lhs.s_ > rhs.s_; +  } +} + +#if 0 +template <typename T> +bool operator<=(const LogVal<T>& lhs, const LogVal<T>& rhs) { +  return (lhs.v_ <= rhs.v_); +} + +template <typename T> +bool operator>(const LogVal<T>& lhs, const LogVal<T>& rhs) { +  return (lhs.v_ > rhs.v_); +} + +template <typename T> +bool operator>=(const LogVal<T>& lhs, const LogVal<T>& rhs) { +  return (lhs.v_ >= rhs.v_); +} +#endif + +template <typename T> +bool operator==(const LogVal<T>& lhs, const LogVal<T>& rhs) { +  return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_); +} + +template <typename T> +bool operator!=(const LogVal<T>& lhs, const LogVal<T>& rhs) { +  return !(lhs == rhs); +} + +#endif diff --git a/decoder/logval_test.cc b/decoder/logval_test.cc new file mode 100644 index 00000000..1a23177d --- /dev/null +++ b/decoder/logval_test.cc @@ -0,0 +1,73 @@ +#include "logval.h" + +#include <gtest/gtest.h> +#include <iostream> + +class LogValTest : public testing::Test { + protected: +  virtual void SetUp() { } +  virtual void TearDown() { } +}; + +using namespace std; + +TEST_F(LogValTest,Order) { +  LogVal<double> a(-0.3); +  LogVal<double> b(0.3); +  LogVal<double> 
c(2.4); +  EXPECT_LT(a,b); +  EXPECT_LT(b,c); +  EXPECT_LT(a,c); +  EXPECT_FALSE(b < a); +  EXPECT_FALSE(c < a); +  EXPECT_FALSE(c < b); +  EXPECT_FALSE(c < c); +  EXPECT_FALSE(b < b); +  EXPECT_FALSE(a < a); +} + +TEST_F(LogValTest,Invert) { +  LogVal<double> x(-2.4); +  LogVal<double> y(2.4); +  y.invert(); +  EXPECT_FLOAT_EQ(x,y); +} + +TEST_F(LogValTest,Minus) { +  LogVal<double> x(12); +  LogVal<double> y(2); +  LogVal<double> z1 = x - y; +  LogVal<double> z2 = x; +  z2 -= y; +  EXPECT_FLOAT_EQ(z1, z2); +  EXPECT_FLOAT_EQ(z1, 10.0); +  EXPECT_FLOAT_EQ(y - x, -10.0); +} + +TEST_F(LogValTest,TestOps) { +  LogVal<double> x(-12.12); +  LogVal<double> y(x); +  cerr << x << endl; +  cerr << (x*y) << endl; +  cerr << (x*y + x) << endl; +  cerr << (x + x*y) << endl; +  cerr << log1p(-0.5) << endl; +  LogVal<double> aa(0.2); +  LogVal<double> bb(-0.3); +  cerr << (aa + bb) << endl; +  cerr << (bb + aa) << endl; +  EXPECT_FLOAT_EQ((aa + bb), (bb + aa)); +  EXPECT_FLOAT_EQ((aa + bb), -0.1); +} + +TEST_F(LogValTest,TestSizes) { +  cerr << sizeof(LogVal<double>) << endl; +  cerr << sizeof(LogVal<float>) << endl; +  cerr << sizeof(void*) << endl; +} + +int main(int argc, char** argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} + diff --git a/decoder/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc new file mode 100644 index 00000000..34e175db --- /dev/null +++ b/decoder/maxtrans_blunsom.cc @@ -0,0 +1,287 @@ +#include "apply_models.h" + +#include <vector> +#include <algorithm> +#include <tr1/unordered_map> +#include <tr1/unordered_set> + +#include <boost/tuple/tuple.hpp> +#include <boost/functional/hash.hpp> + +#include "tdict.h" +#include "hg.h" +#include "ff.h" + +using boost::tuple; +using namespace std; +using namespace std::tr1; + +namespace Hack { + +struct Candidate; +typedef SmallVector JVector; +typedef vector<Candidate*> CandidateHeap; +typedef vector<Candidate*> CandidateList; + +// life cycle: candidates are created, placed on the heap +// and retrieved by their estimated cost, when they're +// retrieved, they're incorporated into the +LM hypergraph +// where they also know the head node index they are +// attached to.  After they are added to the +LM hypergraph +// inside_prob_ and est_prob_ fields may be updated as better +// derivations are found (this happens since the successor's +// of derivation d may have a better score- they are +// explored lazily).  However, the updates don't happen +// when a candidate is in the heap so maintaining the heap +// property is not an issue. 
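+//
+// e.g., for a binary edge e with tail nodes (u, v), the first candidate
+// built is <e, j=(0,0)>, pairing the best derivations of u and v; when it
+// is popped, PushSucc offers its successors <e, (1,0)> and <e, (0,1)>.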
+struct Candidate { +  int node_index_;                     // -1 until incorporated +                                       // into the +LM forest +  const Hypergraph::Edge* in_edge_;    // in -LM forest +  Hypergraph::Edge out_edge_; +  vector<WordID> state_; +  const JVector j_; +  prob_t inside_prob_;            // these are fixed until the cand +                               // is popped, then they may be updated +  prob_t est_prob_; + +  Candidate(const Hypergraph::Edge& e, +            const JVector& j, +            const vector<CandidateList>& D, +            bool is_goal) : +      node_index_(-1), +      in_edge_(&e), +      j_(j) { +    InitializeCandidate(D, is_goal); +  } + +  // used to query uniqueness +  Candidate(const Hypergraph::Edge& e, +            const JVector& j) : in_edge_(&e), j_(j) {} + +  bool IsIncorporatedIntoHypergraph() const { +    return node_index_ >= 0; +  } + +  void InitializeCandidate(const vector<vector<Candidate*> >& D, +                           const bool is_goal) { +    const Hypergraph::Edge& in_edge = *in_edge_; +    out_edge_.rule_ = in_edge.rule_; +    out_edge_.feature_values_ = in_edge.feature_values_; +    Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; +    tail.resize(j_.size()); +    prob_t p = prob_t::One(); +    // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; +    vector<const vector<WordID>* > ants(tail.size()); +    for (int i = 0; i < tail.size(); ++i) { +      const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]]; +      ants[i] = &ant.state_; +      assert(ant.IsIncorporatedIntoHypergraph()); +      tail[i] = ant.node_index_; +      p *= ant.inside_prob_; +    } +    prob_t edge_estimate = prob_t::One(); +    if (is_goal) { +      assert(tail.size() == 1); +      out_edge_.edge_prob_ = in_edge.edge_prob_; +    } else { +      in_edge.rule_->ESubstitute(ants, &state_); +      out_edge_.edge_prob_ = in_edge.edge_prob_; +    } +    inside_prob_ = out_edge_.edge_prob_ * p; +    est_prob_ = inside_prob_ * edge_estimate; +  } +}; + +ostream& operator<<(ostream& os, const Candidate& cand) { +  os << "CAND["; +  if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } +  else { os << "+LM_node=" << cand.node_index_; } +  os << " edge=" << cand.in_edge_->id_; +  os << " j=<"; +  for (int i = 0; i < cand.j_.size(); ++i) +    os << (i==0 ? "" : " ") << cand.j_[i]; +  os << "> vit=" << log(cand.inside_prob_); +  os << " est=" << log(cand.est_prob_); +  return os << ']'; +} + +struct HeapCandCompare { +  bool operator()(const Candidate* l, const Candidate* r) const { +    return l->est_prob_ < r->est_prob_; +  } +}; + +struct EstProbSorter { +  bool operator()(const Candidate* l, const Candidate* r) const { +    return l->est_prob_ > r->est_prob_; +  } +}; + +// the same candidate <edge, j> can be added multiple times if +// j is multidimensional (if you're going NW in Manhattan, you +// can first go north, then west, or you can go west then north) +// this is a hash function on the relevant variables from +// Candidate to enforce this. 
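+// e.g., <e, (1,1)> is proposed once via <e, (0,1)> and again via <e, (1,0)>;
+// testing membership in the unique set below keeps only the first copy.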
+struct CandidateUniquenessHash { +  size_t operator()(const Candidate* c) const { +    size_t x = 5381; +    x = ((x << 5) + x) ^ c->in_edge_->id_; +    for (int i = 0; i < c->j_.size(); ++i) +      x = ((x << 5) + x) ^ c->j_[i]; +    return x; +  } +}; + +struct CandidateUniquenessEquals { +  bool operator()(const Candidate* a, const Candidate* b) const { +    return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); +  } +}; + +typedef unordered_set<const Candidate*, CandidateUniquenessHash, CandidateUniquenessEquals> UniqueCandidateSet; +typedef unordered_map<vector<WordID>, Candidate*, boost::hash<vector<WordID> > > State2Node; + +class MaxTransBeamSearch { + +public: +  MaxTransBeamSearch(const Hypergraph& i, int pop_limit, Hypergraph* o) : +      in(i), +      out(*o), +      D(in.nodes_.size()), +      pop_limit_(pop_limit) { +    cerr << "  Finding max translation (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; +  } + +  void Apply() { +    int num_nodes = in.nodes_.size(); +    int goal_id = num_nodes - 1; +    int pregoal = goal_id - 1; +    assert(in.nodes_[pregoal].out_edges_.size() == 1); +    cerr << "    "; +    for (int i = 0; i < in.nodes_.size(); ++i) { +      cerr << '.'; +      KBest(i, i == goal_id); +    } +    cerr << endl; +    int best_node = D[goal_id].front()->in_edge_->tail_nodes_.front(); +    Candidate& best = *D[best_node].front(); +    cerr << "  Best path: " << log(best.inside_prob_) +         << "\t" << log(best.est_prob_) << endl; +    cout << TD::GetString(D[best_node].front()->state_) << endl; +    FreeAll(); +  } + + private: +  void FreeAll() { +    for (int i = 0; i < D.size(); ++i) { +      CandidateList& D_i = D[i]; +      for (int j = 0; j < D_i.size(); ++j) +        delete D_i[j]; +    } +    D.clear(); +  } + +  void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { +    Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_); +    new_edge->feature_values_ = item->out_edge_.feature_values_; +    new_edge->edge_prob_ = item->out_edge_.edge_prob_; +    Candidate*& o_item = (*s2n)[item->state_]; +    if (!o_item) o_item = item; +     +    int& node_id = o_item->node_index_; +    if (node_id < 0) { +      Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_); +      node_id = new_node->id_; +    } +    Hypergraph::Node* node = &out.nodes_[node_id]; +    out.ConnectEdgeToHeadNode(new_edge, node); + +    if (item != o_item) { +      assert(o_item->state_ == item->state_);    // sanity check! +      o_item->est_prob_ += item->est_prob_; +      o_item->inside_prob_ += item->inside_prob_; +      freelist->push_back(item); +    } +  } + +  void KBest(const int vert_index, const bool is_goal) { +    // cerr << "KBest(" << vert_index << ")\n"; +    CandidateList& D_v = D[vert_index]; +    assert(D_v.empty()); +    const Hypergraph::Node& v = in.nodes_[vert_index]; +    // cerr << "  has " << v.in_edges_.size() << " in-coming edges\n"; +    const vector<int>& in_edges = v.in_edges_; +    CandidateHeap cand; +    CandidateList freelist; +    cand.reserve(in_edges.size()); +    UniqueCandidateSet unique_cands; +    for (int i = 0; i < in_edges.size(); ++i) { +      const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; +      const JVector j(edge.tail_nodes_.size(), 0); +      cand.push_back(new Candidate(edge, j, D, is_goal)); +      assert(unique_cands.insert(cand.back()).second);  // these should all be unique! 
+    } +//    cerr << "  making heap of " << cand.size() << " candidates\n"; +    make_heap(cand.begin(), cand.end(), HeapCandCompare()); +    State2Node state2node;   // "buf" in Figure 2 +    int pops = 0; +    while(!cand.empty() && pops < pop_limit_) { +      pop_heap(cand.begin(), cand.end(), HeapCandCompare()); +      Candidate* item = cand.back(); +      cand.pop_back(); +      // cerr << "POPPED: " << *item << endl; +      PushSucc(*item, is_goal, &cand, &unique_cands); +      IncorporateIntoPlusLMForest(item, &state2node, &freelist); +      ++pops; +    } +    D_v.resize(state2node.size()); +    int c = 0; +    for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i) +      D_v[c++] = i->second; +    sort(D_v.begin(), D_v.end(), EstProbSorter()); +    // cerr << "  expanded to " << D_v.size() << " nodes\n"; + +    for (int i = 0; i < cand.size(); ++i) +      delete cand[i]; +    // freelist is necessary since even after an item merged, it still stays in +    // the unique set so it can't be deleted til now +    for (int i = 0; i < freelist.size(); ++i) +      delete freelist[i]; +  } + +  void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) { +    CandidateHeap& cand = *pcand; +    for (int i = 0; i < item.j_.size(); ++i) { +      JVector j = item.j_; +      ++j[i]; +      if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { +        Candidate query_unique(*item.in_edge_, j); +        if (cs->count(&query_unique) == 0) { +          Candidate* new_cand = new Candidate(*item.in_edge_, j, D, is_goal); +          cand.push_back(new_cand); +          push_heap(cand.begin(), cand.end(), HeapCandCompare()); +          assert(cs->insert(new_cand).second);  // insert into uniqueness set, sanity check +        } +      } +    } +  } + +  const Hypergraph& in; +  Hypergraph& out; + +  vector<CandidateList> D;   // maps nodes in in-HG to the +                                   // equivalent nodes (many due to state +                                   // splits) in the out-HG. 
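+  // maximum number of candidates popped per node (the cube pruning beam width)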
+  const int pop_limit_;
+};
+
+// runs the beam search over the input hypergraph and prints the best translation
+void MaxTrans(const Hypergraph& in,
+              int beam_size) {
+  Hypergraph out;
+  MaxTransBeamSearch ma(in, beam_size, &out);
+  ma.Apply();
+}
+
+}
diff --git a/decoder/parser_test.cc b/decoder/parser_test.cc new file mode 100644 index 00000000..da1fbd89 --- /dev/null +++ b/decoder/parser_test.cc @@ -0,0 +1,35 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "hg.h"
+#include "trule.h"
+#include "bottom_up_parser.h"
+#include "tdict.h"
+
+using namespace std;
+
+class ChartTest : public testing::Test {
+ protected:
+  virtual void SetUp() { }
+  virtual void TearDown() { }
+};
+
+TEST_F(ChartTest,LanguageModel) {
+  LatticeArc a(TD::Convert("ein"), 0.0, 1);
+  LatticeArc b(TD::Convert("haus"), 0.0, 1);
+  Lattice lattice(2);
+  lattice[0].push_back(a);
+  lattice[1].push_back(b);
+  Hypergraph forest;
+  GrammarPtr g(new TextGrammar);
+  vector<GrammarPtr> grammars(1, g);
+  ExhaustiveBottomUpParser parser("PHRASE", grammars);
+  parser.Parse(lattice, &forest);
+}
+
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc new file mode 100644 index 00000000..726b3f9a --- /dev/null +++ b/decoder/phrasebased_translator.cc @@ -0,0 +1,206 @@
+#include "phrasebased_translator.h"
+
+#include <queue>
+#include <iostream>
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+
+#include <boost/tuple/tuple.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "sentence_metadata.h"
+#include "tdict.h"
+#include "hg.h"
+#include "filelib.h"
+#include "lattice.h"
+#include "phrasetable_fst.h"
+#include "array2d.h"
+
+using namespace std;
+using namespace std::tr1;
+using namespace boost::tuples;
+
+struct Coverage : public vector<bool> {
+  explicit Coverage(int n, bool v = false) : vector<bool>(n, v), first_gap() {}
+  void Cover(int i, int j) {
+    vector<bool>::iterator it = this->begin() + i;
+    vector<bool>::iterator end = this->begin() + j;
+    while (it != end)
+      *it++ = true;
+    if (first_gap == i) {
+      first_gap = j;
+      it = end;
+      while (it != this->end() && *it) {  // test for end before dereferencing
+        ++it;
+        ++first_gap;
+      }
+    }
+  }
+  bool Collides(int i, int j) const {
+    vector<bool>::const_iterator it = this->begin() + i;
+    vector<bool>::const_iterator end = this->begin() + j;
+    while (it != end)
+      if (*it++) return true;
+    return false;
+  }
+  int GetFirstGap() const { return first_gap; }
+ private:
+  int first_gap;
+};
+struct CoverageHash {
+  size_t operator()(const Coverage& cov) const {
+    return hasher_(static_cast<const vector<bool>&>(cov));
+  }
+ private:
+  boost::hash<vector<bool> > hasher_;
+};
+ostream& operator<<(ostream& os, const Coverage& cov) {
+  os << '[';
+  for (int i = 0; i < cov.size(); ++i)
+    os << (cov[i] ?
'*' : '.'); +  return os << " gap=" << cov.GetFirstGap() << ']'; +} + +typedef unordered_map<Coverage, int, CoverageHash> CoverageNodeMap; +typedef unordered_set<Coverage, CoverageHash> UniqueCoverageSet; + +struct PhraseBasedTranslatorImpl { +  PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) : +      add_pass_through_rules(conf.count("add_pass_through_rules")), +      max_distortion(conf["pb_max_distortion"].as<int>()), +      kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)), +      kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)), +      kNT_TYPE(TD::Convert("X") * -1) { +    assert(max_distortion >= 0); +    vector<string> gfiles = conf["grammar"].as<vector<string> >(); +    assert(gfiles.size() == 1); +    cerr << "Reading phrasetable from " << gfiles.front() << endl; +    ReadFile in(gfiles.front()); +    fst.reset(LoadTextPhrasetable(in.stream())); +  } + +  struct State { +    State(const Coverage& c, int _i, int _j, const FSTNode* q) : +      coverage(c), i(_i), j(_j), fst(q) {} +    Coverage coverage; +    int i; +    int j; +    const FSTNode* fst; +  }; + +  // we keep track of unique coverages that have been extended since it's +  // possible to "extend" the same coverage twice, e.g. translate "a b c" +  // with phrases "a" "b" "a b" and "c".  There are two ways to cover "a b" +  void EnqueuePossibleContinuations(const Coverage& coverage, queue<State>* q, UniqueCoverageSet* ucs) { +    if (ucs->insert(coverage).second) { +      const int gap = coverage.GetFirstGap(); +      const int end = min(static_cast<int>(coverage.size()), gap + max_distortion + 1); +      for (int i = gap; i < end; ++i) +        if (!coverage[i]) q->push(State(coverage, i, i, fst.get())); +    } +  } + +  bool Translate(const std::string& input, +                 SentenceMetadata* smeta, +                 const std::vector<double>& weights, +                 Hypergraph* minus_lm_forest) { +    Lattice lattice; +    LatticeTools::ConvertTextOrPLF(input, &lattice); +    smeta->SetSourceLength(lattice.size()); +    size_t est_nodes = lattice.size() * lattice.size() * (1 << max_distortion); +    minus_lm_forest->ReserveNodes(est_nodes, est_nodes * 100); +    if (add_pass_through_rules) { +      SparseVector<double> feats; +      feats.set_value(FD::Convert("PassThrough"), 1); +      for (int i = 0; i < lattice.size(); ++i) { +        const vector<LatticeArc>& arcs = lattice[i]; +        for (int j = 0; j < arcs.size(); ++j) { +          fst->AddPassThroughTranslation(arcs[j].label, feats); +          // TODO handle lattice edge features +        } +      } +    } +    CoverageNodeMap c; +    queue<State> q; +    UniqueCoverageSet ucs; +    const Coverage empty_cov(lattice.size(), false); +    const Coverage goal_cov(lattice.size(), true); +    EnqueuePossibleContinuations(empty_cov, &q, &ucs); +    c[empty_cov] = 0;   // have to handle the left edge specially +    while(!q.empty()) { +      const State s = q.front(); +      q.pop(); +      // cerr << "(" << s.i << "," << s.j << " ptr=" << s.fst << ") cov=" << s.coverage << endl; +      const vector<LatticeArc>& arcs = lattice[s.j]; +      if (s.fst->HasData()) { +        Coverage new_cov = s.coverage; +        new_cov.Cover(s.i, s.j); +        EnqueuePossibleContinuations(new_cov, &q, &ucs); +        const vector<TRulePtr>& phrases = s.fst->GetTranslations()->GetRules(); +        const int phrase_head_index = minus_lm_forest->AddNode(kNT_TYPE)->id_; +        for (int i = 0; i < phrases.size(); ++i) { +  
        Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector()); +          edge->feature_values_ = edge->rule_->scores_; +          minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index); +        } +        CoverageNodeMap::iterator cit = c.find(s.coverage); +        assert(cit != c.end()); +        const int tail_node_plus1 = cit->second; +        if (tail_node_plus1 == 0) {  // left edge +          c[new_cov] = phrase_head_index + 1; +        } else { // not left edge +          int& head_node_plus1 = c[new_cov]; +          if (!head_node_plus1) +            head_node_plus1 = minus_lm_forest->AddNode(kNT_TYPE)->id_ + 1; +          Hypergraph::TailNodeVector tail(2, tail_node_plus1 - 1); +          tail[1] = phrase_head_index; +          const int concat_edge = minus_lm_forest->AddEdge(kCONCAT_RULE, tail)->id_; +          minus_lm_forest->ConnectEdgeToHeadNode(concat_edge, head_node_plus1 - 1); +        } +      } +      if (s.j == lattice.size()) continue; +      for (int l = 0; l < arcs.size(); ++l) { +        const LatticeArc& arc = arcs[l]; + +        const FSTNode* next_fst_state = s.fst->Extend(arc.label); +        const int next_j = s.j + arc.dist2next; +        if (next_fst_state && +            !s.coverage.Collides(s.i, next_j)) { +          q.push(State(s.coverage, s.i, next_j, next_fst_state)); +        } +      } +    } +    if (add_pass_through_rules) +      fst->ClearPassThroughTranslations(); +    int pregoal_plus1 = c[goal_cov]; +    if (pregoal_plus1 > 0) { +      TRulePtr kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [X,1]")); +      int goal = minus_lm_forest->AddNode(TD::Convert("Goal") * -1)->id_; +      int gedge = minus_lm_forest->AddEdge(kGOAL_RULE, Hypergraph::TailNodeVector(1, pregoal_plus1 - 1))->id_; +      minus_lm_forest->ConnectEdgeToHeadNode(gedge, goal); +      // they are almost topo, but not quite always +      minus_lm_forest->TopologicallySortNodesAndEdges(goal); +      minus_lm_forest->Reweight(weights); +      return true; +    } else { +      return false;  // composition failed +    } +  } + +  const bool add_pass_through_rules; +  const int max_distortion; +  TRulePtr kSOURCE_RULE; +  const TRulePtr kCONCAT_RULE; +  const WordID kNT_TYPE; +  boost::shared_ptr<FSTNode> fst; +}; + +PhraseBasedTranslator::PhraseBasedTranslator(const boost::program_options::variables_map& conf) : +  pimpl_(new PhraseBasedTranslatorImpl(conf)) {} + +bool PhraseBasedTranslator::TranslateImpl(const std::string& input, +                                      SentenceMetadata* smeta, +                                      const std::vector<double>& weights, +                                      Hypergraph* minus_lm_forest) { +  return pimpl_->Translate(input, smeta, weights, minus_lm_forest); +} diff --git a/decoder/phrasebased_translator.h b/decoder/phrasebased_translator.h new file mode 100644 index 00000000..e5e3f8a2 --- /dev/null +++ b/decoder/phrasebased_translator.h @@ -0,0 +1,18 @@ +#ifndef _PHRASEBASED_TRANSLATOR_H_ +#define _PHRASEBASED_TRANSLATOR_H_ + +#include "translator.h" + +class PhraseBasedTranslatorImpl; +class PhraseBasedTranslator : public Translator { + public: +  PhraseBasedTranslator(const boost::program_options::variables_map& conf); +  bool TranslateImpl(const std::string& input, +                 SentenceMetadata* smeta, +                 const std::vector<double>& weights, +                 Hypergraph* minus_lm_forest); + private: +  boost::shared_ptr<PhraseBasedTranslatorImpl> pimpl_; +}; + +#endif 
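The translator backends in this commit share the same construction and TranslateImpl calling convention, so a short driver may help orient readers. The sketch below is not part of the commit: the option names are the ones actually read by PhraseBasedTranslatorImpl, but the binary name, input string, and empty weight vector are placeholders (a real decoder run loads weights sized to the feature set before Reweight is called).

    // Hypothetical driver for the phrase-based backend; not part of this diff.
    // Run as, e.g.:  ./pb_driver --grammar phrases.txt
    #include <iostream>
    #include <string>
    #include <vector>
    #include <boost/program_options.hpp>
    #include "phrasebased_translator.h"
    #include "sentence_metadata.h"
    #include "lattice.h"
    #include "hg.h"

    namespace po = boost::program_options;

    int main(int argc, char** argv) {
      // only the options PhraseBasedTranslatorImpl actually reads are declared here
      po::options_description opts;
      opts.add_options()
        ("grammar", po::value<std::vector<std::string> >(), "phrase table file")
        ("pb_max_distortion", po::value<int>()->default_value(4), "distortion limit")
        ("add_pass_through_rules", "pass unknown source words through");
      po::variables_map conf;
      po::store(po::parse_command_line(argc, argv, opts), conf);
      po::notify(conf);

      PhraseBasedTranslator translator(conf);

      Lattice no_reference;                 // reference is a training-only field
      SentenceMetadata smeta(0, no_reference);
      std::vector<double> weights;          // placeholder; normally loaded from a file
      Hypergraph forest;
      if (!translator.TranslateImpl("el gato", &smeta, weights, &forest)) {
        std::cerr << "composition failed: input not covered by the phrase table\n";
        return 1;
      }
      std::cerr << "translation forest has " << forest.nodes_.size() << " nodes\n";
      return 0;
    }
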
diff --git a/decoder/phrasetable_fst.cc b/decoder/phrasetable_fst.cc new file mode 100644 index 00000000..f421e941 --- /dev/null +++ b/decoder/phrasetable_fst.cc @@ -0,0 +1,141 @@
+#include "phrasetable_fst.h"
+
+#include <cassert>
+#include <iostream>
+#include <map>
+
+#include <boost/shared_ptr.hpp>
+
+#include "filelib.h"
+#include "tdict.h"
+
+using boost::shared_ptr;
+using namespace std;
+
+TargetPhraseSet::~TargetPhraseSet() {}
+FSTNode::~FSTNode() {}
+
+class TextTargetPhraseSet : public TargetPhraseSet {
+ public:
+  void AddRule(TRulePtr rule) {
+    rules_.push_back(rule);
+  }
+  const vector<TRulePtr>& GetRules() const {
+    return rules_;
+  }
+
+ private:
+  // all rules must have arity 0
+  vector<TRulePtr> rules_;
+};
+
+class TextFSTNode : public FSTNode {
+ public:
+  const TargetPhraseSet* GetTranslations() const { return data.get(); }
+  bool HasData() const { return (bool)data; }
+  bool HasOutgoingNonEpsilonEdges() const { return !ptr.empty(); }
+  const FSTNode* Extend(const WordID& t) const {
+    map<WordID, TextFSTNode>::const_iterator it = ptr.find(t);
+    if (it == ptr.end()) return NULL;
+    return &it->second;
+  }
+
+  void AddPhrase(const string& phrase);
+
+  void AddPassThroughTranslation(const WordID& w, const SparseVector<double>& feats);
+  void ClearPassThroughTranslations();
+ private:
+  vector<WordID> passthroughs;
+  shared_ptr<TargetPhraseSet> data;
+  map<WordID, TextFSTNode> ptr;
+};
+
+#ifdef DEBUG_CHART_PARSER
+static string TrimRule(const string& r) {
+  size_t start = r.find(" |||") + 5;
+  size_t end = r.rfind(" |||");
+  return r.substr(start, end - start);
+}
+#endif
+
+void TextFSTNode::AddPhrase(const string& phrase) {
+  TRulePtr rule(TRule::CreateRulePhrasetable(phrase));
+  if (!rule) {
+    static int err = 0;
+    ++err;
+    if (err > 2) { cerr << "TOO MANY PHRASETABLE ERRORS\n"; exit(1); }
+    return;
+  }
+
+  TextFSTNode* fsa = this;
+  for (int i = 0; i < rule->FLength(); ++i)
+    fsa = &fsa->ptr[rule->f_[i]];
+
+  if (!fsa->data)
+    fsa->data.reset(new TextTargetPhraseSet);
+  static_cast<TextTargetPhraseSet*>(fsa->data.get())->AddRule(rule);
+}
+
+void TextFSTNode::AddPassThroughTranslation(const WordID& w, const SparseVector<double>& feats) {
+  TextFSTNode* next = &ptr[w];
+  // currently, rules are only added if the symbol is completely missing as a
+  // word starting the phrase.  As a result, it is possible that some sentences
+  // won't parse.  If this becomes a problem, fix it here.
+  if (!next->data) {
+    TextTargetPhraseSet* tps = new TextTargetPhraseSet;
+    next->data.reset(tps);
+    TRule* rule = new TRule;
+    rule->e_.resize(1, w);
+    rule->f_.resize(1, w);
+    rule->lhs_ = TD::Convert("___PHRASE") * -1;
+    rule->scores_ = feats;
+    rule->arity_ = 0;
+    tps->AddRule(TRulePtr(rule));
+    passthroughs.push_back(w);
+  }
+}
+
+void TextFSTNode::ClearPassThroughTranslations() {
+  for (int i = 0; i < passthroughs.size(); ++i)
+    ptr.erase(passthroughs[i]);
+  passthroughs.clear();
+}
+
+static void AddPhrasetableToFST(istream* in, TextFSTNode* fst) {
+  int lc = 0;
+  bool flag = false;
+  while(*in) {
+    string line;
+    getline(*in, line);
+    if (line.empty()) continue;
+    ++lc;
+    fst->AddPhrase(line);
+    if (lc % 10000 == 0) { flag = true; cerr << '.' << flush; }
+    if (lc % 500000 == 0) { flag = false; cerr << " [" << lc << ']' << endl << flush; }
+  }
+  if (flag) cerr << endl;
+  cerr << "Loaded " << lc << " source phrases\n";
+}
+
+FSTNode* LoadTextPhrasetable(istream* in) {
+  TextFSTNode *fst = new TextFSTNode;
+  AddPhrasetableToFST(in, fst);
+  return fst;
+}
+
+FSTNode* LoadTextPhrasetable(const vector<string>& filenames) {
+  TextFSTNode* fst = new TextFSTNode;
+  for (int i = 0; i < filenames.size(); ++i) {
+    ReadFile rf(filenames[i]);
+    cerr << "Reading phrases from " << filenames[i] << endl;
+    AddPhrasetableToFST(rf.stream(), fst);
+  }
+  return fst;
+}
+
+FSTNode* LoadBinaryPhrasetable(const string& fname_prefix) {
+  (void) fname_prefix;
+  assert(!"not implemented yet");
+  return NULL;  // unreachable; keeps non-void function well-formed when NDEBUG is set
+}
+
diff --git a/decoder/phrasetable_fst.h b/decoder/phrasetable_fst.h new file mode 100644 index 00000000..477de1f7 --- /dev/null +++ b/decoder/phrasetable_fst.h @@ -0,0 +1,34 @@
+#ifndef _PHRASETABLE_FST_H_
+#define _PHRASETABLE_FST_H_
+
+#include <vector>
+#include <string>
+
+#include "sparse_vector.h"
+#include "trule.h"
+
+class TargetPhraseSet {
+ public:
+  virtual ~TargetPhraseSet();
+  virtual const std::vector<TRulePtr>& GetRules() const = 0;
+};
+
+class FSTNode {
+ public:
+  virtual ~FSTNode();
+  virtual const TargetPhraseSet* GetTranslations() const = 0;
+  virtual bool HasData() const = 0;
+  virtual bool HasOutgoingNonEpsilonEdges() const = 0;
+  virtual const FSTNode* Extend(const WordID& t) const = 0;
+
+  // these should only be called on q_0:
+  virtual void AddPassThroughTranslation(const WordID& w, const SparseVector<double>& feats) = 0;
+  virtual void ClearPassThroughTranslations() = 0;
+};
+
+// attn caller: you own the memory
+FSTNode* LoadTextPhrasetable(const std::vector<std::string>& filenames);
+FSTNode* LoadTextPhrasetable(std::istream* in);
+FSTNode* LoadBinaryPhrasetable(const std::string& fname_prefix);
+
+#endif
diff --git a/decoder/prob.h b/decoder/prob.h new file mode 100644 index 00000000..bc297870 --- /dev/null +++ b/decoder/prob.h @@ -0,0 +1,8 @@
+#ifndef _PROB_H_
+#define _PROB_H_
+
+#include "logval.h"
+
+typedef LogVal<double> prob_t;
+
+#endif
diff --git a/decoder/rule_lexer.h b/decoder/rule_lexer.h new file mode 100644 index 00000000..e5db4018 --- /dev/null +++ b/decoder/rule_lexer.h @@ -0,0 +1,13 @@
+#ifndef _RULE_LEXER_H_
+#define _RULE_LEXER_H_
+
+#include <iostream>
+
+#include "trule.h"
+
+struct RuleLexer {
+  typedef void (*RuleCallback)(const TRulePtr& new_rule, void* extra);
+  static void ReadRules(std::istream* in, RuleCallback func, void* extra);
+};
+
+#endif
diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l new file mode 100644 index 00000000..ff8f10b0 --- /dev/null +++ b/decoder/rule_lexer.l @@ -0,0 +1,269 @@
+%{
+#include "rule_lexer.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cassert>
+#include "tdict.h"
+#include "fdict.h"
+#include "trule.h"
+
+int lex_line = 0;
+std::istream* scfglex_stream = NULL;
+RuleLexer::RuleCallback rule_callback = NULL;
+void* rule_callback_extra = NULL;
+std::vector<int> scfglex_phrase_fnames;
+
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = scfglex_stream->read(buf, max_size).gcount())
+
+#define YY_SKIP_YYWRAP 1
+int num_rules = 0;
+int yywrap() { return 1; }
+bool fl = true;
+#define MAX_TOKEN_SIZE 255
+std::string scfglex_tmp_token(MAX_TOKEN_SIZE, '\0');
+
+#define MAX_RULE_SIZE 48
+WordID scfglex_src_rhs[MAX_RULE_SIZE];
+WordID scfglex_trg_rhs[MAX_RULE_SIZE];
+int
scfglex_src_rhs_size; +int scfglex_trg_rhs_size; +WordID scfglex_lhs; +int scfglex_src_arity; +int scfglex_trg_arity; + +#define MAX_FEATS 20 +int scfglex_feat_ids[MAX_FEATS]; +double scfglex_feat_vals[MAX_FEATS]; +int scfglex_num_feats; + +#define MAX_ARITY 20 +int scfglex_nt_sanity[MAX_ARITY]; +int scfglex_src_nts[MAX_ARITY]; +float scfglex_nt_size_means[MAX_ARITY]; +float scfglex_nt_size_vars[MAX_ARITY]; + + +void sanity_check_trg_symbol(WordID nt, int index) { +  if (scfglex_src_nts[index-1] != nt) { +    std::cerr << "Target symbol with index " << index << " is of type " << TD::Convert(nt*-1) +              << " but corresponding source is of type " +              << TD::Convert(scfglex_src_nts[index-1] * -1) << std::endl; +    abort(); +  } +} + +void sanity_check_trg_index(int index) { +  if (index > scfglex_src_arity) { +    std::cerr << "Target index " << index << " exceeds source arity " << scfglex_src_arity << std::endl; +    abort(); +  } +  int& flag = scfglex_nt_sanity[index - 1]; +  if (flag) { +    std::cerr << "Target index " << index << " used multiple times!" << std::endl; +    abort(); +  } +  flag = 1; +} + +void scfglex_reset() { +  scfglex_src_arity = 0; +  scfglex_trg_arity = 0; +  scfglex_num_feats = 0; +  scfglex_src_rhs_size = 0; +  scfglex_trg_rhs_size = 0; +} + +%} + +REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf +NT [\-#$A-Z_:=.",\\][\-#$".A-Z+/=_0-9!:@\\]* + +%x LHS_END SRC TRG FEATS FEATVAL ALIGNS +%% + +<INITIAL>[ \t]	; + +<INITIAL>\[{NT}\]   { +		scfglex_tmp_token.assign(yytext + 1, yyleng - 2); +		scfglex_lhs = -TD::Convert(scfglex_tmp_token); +		// std::cerr << scfglex_tmp_token << "\n"; +  		BEGIN(LHS_END); +		} + +<SRC>\[{NT}\]   { +		scfglex_tmp_token.assign(yytext + 1, yyleng - 2); +		scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token); +		++scfglex_src_arity; +		++scfglex_src_rhs_size; +		} + +<SRC>\[{NT},[1-9][0-9]?\]   { +		int index = yytext[yyleng - 2] - '0'; +		if (yytext[yyleng - 3] == ',') { +		  scfglex_tmp_token.assign(yytext + 1, yyleng - 4); +		} else { +		  scfglex_tmp_token.assign(yytext + 1, yyleng - 5); +		  index += 10 * (yytext[yyleng - 3] - '0'); +		} +		if ((scfglex_src_arity+1) != index) { +			std::cerr << "Src indices must go in order: expected " << scfglex_src_arity << " but got " << index << std::endl; +			abort(); +		} +		scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token); +		++scfglex_src_rhs_size; +		++scfglex_src_arity; +		} + +<TRG>\[{NT},[1-9][0-9]?\]   { +		int index = yytext[yyleng - 2] - '0'; +		if (yytext[yyleng - 3] == ',') { +		  scfglex_tmp_token.assign(yytext + 1, yyleng - 4); +		} else { +		  scfglex_tmp_token.assign(yytext + 1, yyleng - 5); +		  index += 10 * (yytext[yyleng - 3] - '0'); +		} +		++scfglex_trg_arity; +		// std::cerr << "TRG INDEX: " << index << std::endl; +		sanity_check_trg_symbol(-TD::Convert(scfglex_tmp_token), index); +		sanity_check_trg_index(index); +		scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index; +		++scfglex_trg_rhs_size; +} + +<TRG>\[[1-9][0-9]?\]   { +		int index = yytext[yyleng - 2] - '0'; +		if (yyleng == 4) { +		  index += 10 * (yytext[yyleng - 3] - '0'); +		} +		++scfglex_trg_arity; +		sanity_check_trg_index(index); +		scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index; +		++scfglex_trg_rhs_size; +} + +<LHS_END>[ \t] { ; } +<LHS_END>\|\|\|	{ +		scfglex_reset(); +		BEGIN(SRC); +		} +<INITIAL,LHS_END>.	
{ +		std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl; +		abort(); +		} + +<SRC>\|\|\|	{ +		memset(scfglex_nt_sanity, 0, scfglex_src_arity * sizeof(int)); +		BEGIN(TRG); +		} +<SRC>[^ \t]+	{  +		scfglex_tmp_token.assign(yytext, yyleng); +		scfglex_src_rhs[scfglex_src_rhs_size] = TD::Convert(scfglex_tmp_token); +		++scfglex_src_rhs_size; +		} +<SRC>[ \t]+	{ ; } + +<TRG>\|\|\|	{ +		BEGIN(FEATS); +		} +<TRG>[^ \t]+	{ +		scfglex_tmp_token.assign(yytext, yyleng); +		scfglex_trg_rhs[scfglex_trg_rhs_size] = TD::Convert(scfglex_tmp_token); +		++scfglex_trg_rhs_size; +		} +<TRG>[ \t]+	{ ; } + +<TRG,FEATS,ALIGNS>\n	{ +                if (scfglex_src_arity != scfglex_trg_arity) { +                  std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n"; +                  abort(); +                } +		TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity)); +		rule_callback(rp, rule_callback_extra); +		// std::cerr << rp->AsString() << std::endl; +		num_rules++; +                lex_line++; +                if (num_rules %   50000 == 0) { std::cerr << '.' << std::flush; fl = true; } +                if (num_rules % 2000000 == 0) { std::cerr << " [" << num_rules << "]\n"; fl = false; } +		BEGIN(INITIAL); +		} + +<FEATS>[ \t;]	{ ; } +<FEATS>[^ \t=;]+=	{ +		scfglex_tmp_token.assign(yytext, yyleng - 1); +		const int fid = FD::Convert(scfglex_tmp_token); +		if (fid < 1) { +			std::cerr << "\nUNWEIGHED FEATURE " << scfglex_tmp_token << std::endl; +			abort(); +		} +		scfglex_feat_ids[scfglex_num_feats] = fid; +		BEGIN(FEATVAL); +		} +<FEATS>\|\|\|	{ +		BEGIN(ALIGNS); +		} +<FEATVAL>{REAL}	{ +		scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL); +		++scfglex_num_feats; +		BEGIN(FEATS); +		} +<FEATVAL>.	{ +		std::cerr << "Line " << lex_line << ": unexpected input in feature value: " << yytext << std::endl; +		abort(); +		} +<FEATS>{REAL} 	{  +		scfglex_feat_ids[scfglex_num_feats] = scfglex_phrase_fnames[scfglex_num_feats]; +		scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL); +		++scfglex_num_feats; +		} +<FEATS>.	{ +		std::cerr << "Line " << lex_line << " unexpected input in features: " << yytext << std::endl; +		abort(); +		} +<ALIGNS>[0-9]+-[0-9]+	{ +                int i = 0; +		int a = 0; +		int b = 0; +		while (i < yyleng) { +		  char c = yytext[i]; +		  if (c == '-') break; +		  a *= 10; +		  a += c - '0'; +		  ++i; +		} +		++i; +		while (i < yyleng) { +		  b *= 10; +		  b += yytext[i] - '0'; +		  ++i; +		} +		// TODO store alignment points somewhere +		} +<ALIGNS>[ \t]	; +<ALIGNS>.	
{
+		std::cerr << "Line " << lex_line << ": unexpected input in alignment: " << yytext << std::endl;
+		abort();
+		}
+%%
+
+#include "filelib.h"
+
+void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, void* extra) {
+  if (scfglex_phrase_fnames.empty()) {
+    scfglex_phrase_fnames.resize(100);
+    for (int i = 0; i < scfglex_phrase_fnames.size(); ++i) {
+      std::ostringstream os;
+      os << "PhraseModel_" << i;
+      scfglex_phrase_fnames[i] = FD::Convert(os.str());
+    }
+  }
+  lex_line = 1;
+  scfglex_stream = in;
+  rule_callback_extra = extra;
+  rule_callback = func;
+  yylex();
+}
+
diff --git a/decoder/sampler.h b/decoder/sampler.h new file mode 100644 index 00000000..e5840f41 --- /dev/null +++ b/decoder/sampler.h @@ -0,0 +1,136 @@
+#ifndef SAMPLER_H_
+#define SAMPLER_H_
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <ctime>     // time()
+#include <stdint.h>  // uint32_t
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real.hpp>
+#include <boost/random/variate_generator.hpp>
+#include <boost/random/normal_distribution.hpp>
+#include <boost/random/poisson_distribution.hpp>
+
+#include "prob.h"
+
+struct SampleSet;
+
+template <typename RNG>
+struct RandomNumberGenerator {
+  static uint32_t GetTrulyRandomSeed() {
+    uint32_t seed;
+    std::ifstream r("/dev/urandom");
+    if (r) {
+      r.read((char*)&seed,sizeof(uint32_t));
+    }
+    if (r.fail() || !r) {
+      std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl;
+      seed = time(NULL);
+    }
+    std::cerr << "Seeding random number sequence to " << seed << std::endl;
+    return seed;
+  }
+
+  RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) {
+    uint32_t seed = GetTrulyRandomSeed();
+    m_generator.seed(seed);
+  }
+  explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) {
+    if (!seed) seed = GetTrulyRandomSeed();
+    m_generator.seed(seed);
+  }
+
+  size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) {
+    if (T == 1.0) {
+      if (this->next() > (a / (a + b))) return 1; else return 0;
+    } else {
+      assert(!"not implemented");
+      return 0;  // not reached; annealed two-way sampling is unimplemented
+    }
+  }
+
+  // T is the annealing temperature, if desired
+  size_t SelectSample(const SampleSet& ss, double T = 1.0);
+
+  // draw a value from U(0,1)
+  double next() {return m_random();}
+
+  // draw a value from N(mean,var)
+  double NextNormal(double mean, double var) {
+    return boost::normal_distribution<double>(mean, var)(m_random);
+  }
+
+  // draw a value from a Poisson distribution
+  // lambda must be greater than 0
+  int NextPoisson(int lambda) {
+    return boost::poisson_distribution<int>(lambda)(m_random);
+  }
+
+  bool AcceptMetropolisHastings(const prob_t& p_cur,
+                                const prob_t& p_prev,
+                                const prob_t& q_cur,
+                                const prob_t& q_prev) {
+    const prob_t a = (p_cur / p_prev) * (q_prev / q_cur);
+    if (log(a) >= 0.0) return true;
+    return (prob_t(this->next()) < a);
+  }
+
+ private:
+  boost::uniform_real<> m_dist;
+  RNG m_generator;
+  boost::variate_generator<RNG&, boost::uniform_real<> > m_random;
+};
+
+typedef RandomNumberGenerator<boost::mt19937> MT19937;
+
+class SampleSet {
+ public:
+  const prob_t& operator[](int i) const { return m_scores[i]; }
+  bool empty() const { return m_scores.empty(); }
+  void add(const prob_t&
s) { m_scores.push_back(s); } +  void clear() { m_scores.clear(); } +  size_t size() const { return m_scores.size(); } +  std::vector<prob_t> m_scores; +}; + +template <typename RNG> +size_t RandomNumberGenerator<RNG>::SelectSample(const SampleSet& ss, double T) { +  assert(T > 0.0); +  assert(ss.m_scores.size() > 0); +  if (ss.m_scores.size() == 1) return 0; +  const prob_t annealing_factor(1.0 / T); +  const bool anneal = (annealing_factor != prob_t::One()); +  prob_t sum = prob_t::Zero(); +  if (anneal) { +    for (int i = 0; i < ss.m_scores.size(); ++i) +      sum += ss.m_scores[i].pow(annealing_factor);  // p^(1/T) +  } else { +    sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); +  } +  //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; +  //std::cerr << std::endl; +  +  prob_t random(this->next());    // random number between 0 and 1 +  random *= sum;                  // scale with normalization factor +  //std::cerr << "Random number " << random << std::endl; + +  //now figure out which sample +  size_t position = 1; +  sum = ss.m_scores[0]; +  if (anneal) { +    sum.poweq(annealing_factor); +    for (; position < ss.m_scores.size() && sum < random; ++position)  +      sum += ss.m_scores[position].pow(annealing_factor); +  } else { +    for (; position < ss.m_scores.size() && sum < random; ++position)  +      sum += ss.m_scores[position]; +  } +  //std::cout << "random: " << random <<  " sample: " << position << std::endl; +  //std::cerr << "Sample: " << position-1 << std::endl; +  //exit(1); +  return position-1; +} + +#endif diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc new file mode 100644 index 00000000..c215eea6 --- /dev/null +++ b/decoder/scfg_translator.cc @@ -0,0 +1,132 @@ +#include "translator.h" + +#include <vector> + +#include "hg.h" +#include "grammar.h" +#include "bottom_up_parser.h" +#include "sentence_metadata.h" + +using namespace std; +static bool usingSentenceGrammar = false; +static bool printGrammarsUsed = false; + +struct SCFGTranslatorImpl { +  SCFGTranslatorImpl(const boost::program_options::variables_map& conf) : +      max_span_limit(conf["scfg_max_span_limit"].as<int>()), +      add_pass_through_rules(conf.count("add_pass_through_rules")), +      goal(conf["goal"].as<string>()), +      default_nt(conf["scfg_default_nt"].as<string>()) { +    if(conf.count("grammar")) +      { +	vector<string> gfiles = conf["grammar"].as<vector<string> >(); +	for (int i = 0; i < gfiles.size(); ++i) { +	  cerr << "Reading SCFG grammar from " << gfiles[i] << endl; +	  TextGrammar* g = new TextGrammar(gfiles[i]); +	  g->SetMaxSpan(max_span_limit); +	  g->SetGrammarName(gfiles[i]); +	  grammars.push_back(GrammarPtr(g)); +	   +	} +      } +    if (!conf.count("scfg_no_hiero_glue_grammar")) +      {  +	GlueGrammar* g = new GlueGrammar(goal, default_nt); +	g->SetGrammarName("GlueGrammar"); +	grammars.push_back(GrammarPtr(g)); +	cerr << "Adding glue grammar" << endl; +      } +    if (conf.count("scfg_extra_glue_grammar")) +      { +	GlueGrammar* g = new GlueGrammar(conf["scfg_extra_glue_grammar"].as<string>()); +	g->SetGrammarName("ExtraGlueGrammar");		 +	grammars.push_back(GrammarPtr(g)); +	cerr << "Adding extra glue grammar" << endl; +      } +  } + +  const int max_span_limit; +  const bool add_pass_through_rules; +  const string goal; +  const string default_nt; +  vector<GrammarPtr> grammars; + +  bool Translate(const string& input, +                 SentenceMetadata* smeta, +               
+  bool Translate(const string& input,
+                 SentenceMetadata* smeta,
+                 const vector<double>& weights,
+                 Hypergraph* forest) {
+    vector<GrammarPtr> glist = grammars;
+    Lattice& lattice = smeta->src_lattice_;
+    LatticeTools::ConvertTextOrPLF(input, &lattice);
+    smeta->SetSourceLength(lattice.size());
+    if (add_pass_through_rules) {
+      PassThroughGrammar* g = new PassThroughGrammar(lattice, default_nt);
+      g->SetGrammarName("PassThrough");
+      glist.push_back(GrammarPtr(g));
+      cerr << "Adding pass through grammar" << endl;
+    }
+
+    if (printGrammarsUsed) {  // iterate through the grammars we have for this sentence and list them
+      for (size_t gi = 0; gi < glist.size(); ++gi) {
+        cerr << "Using grammar::" << glist[gi]->GetGrammarName() << endl;
+      }
+    }
+
+    ExhaustiveBottomUpParser parser(goal, glist);
+    if (!parser.Parse(lattice, forest))
+      return false;
+    forest->Reweight(weights);
+    return true;
+  }
+};
+
+/*
+Called once from cdec.cc to set up the initial SCFG translation backend
+*/
+SCFGTranslator::SCFGTranslator(const boost::program_options::variables_map& conf) :
+  pimpl_(new SCFGTranslatorImpl(conf)) {}
+
+/*
+Called for each sentence to perform translation using the SCFG backend
+*/
+bool SCFGTranslator::TranslateImpl(const string& input,
+                               SentenceMetadata* smeta,
+                               const vector<double>& weights,
+                               Hypergraph* minus_lm_forest) {
+  return pimpl_->Translate(input, smeta, weights, minus_lm_forest);
+}
+
+/*
+Check for a grammar pointer in the sentence markup, for use with sentence-specific grammars
+*/
+void SCFGTranslator::ProcessMarkupHintsImpl(const map<string, string>& kv) {
+  map<string,string>::const_iterator it = kv.find("grammar");
+  if (it == kv.end()) {
+    usingSentenceGrammar = false;
+    return;
+  }
+  // create a sentence-specific grammar from the given file name and add it to the list of grammars
+  cerr << "Loading sentence grammar from: " << it->second << endl;
+  usingSentenceGrammar = true;
+  TextGrammar* sentGrammar = new TextGrammar(it->second);
+  sentGrammar->SetMaxSpan(pimpl_->max_span_limit);
+  sentGrammar->SetGrammarName(it->second);
+  pimpl_->grammars.push_back(GrammarPtr(sentGrammar));
+}
+
+void SCFGTranslator::SentenceCompleteImpl() {
+  if (usingSentenceGrammar) {  // drop the last sentence grammar from the list of grammars
+    cerr << "Clearing grammar" << endl;
+    pimpl_->grammars.pop_back();
+  }
+}
+
diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h
new file mode 100644
index 00000000..ef9eb388
--- /dev/null
+++ b/decoder/sentence_metadata.h
@@ -0,0 +1,47 @@
+#ifndef _SENTENCE_METADATA_H_
+#define _SENTENCE_METADATA_H_
+
+#include <cassert>
+#include "lattice.h"
+
+struct SentenceMetadata {
+  SentenceMetadata(int id, const Lattice& ref) :
+    sent_id_(id),
+    src_len_(-1),
+    has_reference_(ref.size() > 0),
+    trg_len_(ref.size()),
+    ref_(has_reference_ ? &ref : NULL) {}
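+  // nb: ref_ aliases the Lattice passed to the constructor, so the
+  // referenced lattice must outlive this SentenceMetadata object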
+
+  // this should be called by the Translator object after
+  // it has parsed the source
+  void SetSourceLength(int sl) { src_len_ = sl; }
+
+  // this should be called if a separate model needs to
+  // specify how long the target sentence should be
+  void SetTargetLength(int tl) {
+    assert(!has_reference_);
+    trg_len_ = tl;
+  }
+  bool HasReference() const { return has_reference_; }
+  const Lattice& GetReference() const { return *ref_; }
+  int GetSourceLength() const { return src_len_; }
+  int GetTargetLength() const { return trg_len_; }
+  int GetSentenceID() const { return sent_id_; }
+  // this will be empty if the translator accepts non-finite-state input!
+  const Lattice& GetSourceLattice() const { return src_lattice_; }
+
+ private:
+  const int sent_id_;
+  // the following should be set, if possible, by the Translator
+  int src_len_;
+ public:
+  Lattice src_lattice_;  // this will only be set if inputs are finite state!
+ private:
+  // you need to be very careful when depending on these values
+  // they will only be set during training / alignment contexts
+  const bool has_reference_;
+  int trg_len_;
+  const Lattice* const ref_;
+};
+
+#endif
diff --git a/decoder/small_vector.h b/decoder/small_vector.h
new file mode 100644
index 00000000..800c1df1
--- /dev/null
+++ b/decoder/small_vector.h
@@ -0,0 +1,187 @@
+#ifndef _SMALL_VECTOR_H_
+#define _SMALL_VECTOR_H_
+
+#include <algorithm>  // for std::max
+#include <cstring>
+#include <cassert>
+
+#define __SV_MAX_STATIC 2
+
+class SmallVector {
+
+ public:
+  SmallVector() : size_(0) {}
+
+  explicit SmallVector(size_t s, int v = 0) : size_(s) {
+    assert(s < 0x80);
+    if (s <= __SV_MAX_STATIC) {
+      for (size_t i = 0; i < s; ++i) data_.vals[i] = v;
+    } else {
+      capacity_ = s;
+      size_ = s;
+      data_.ptr = new int[s];
+      for (int i = 0; i < size_; ++i) data_.ptr[i] = v;
+    }
+  }
+
+  SmallVector(const SmallVector& o) : size_(o.size_) {
+    if (size_ <= __SV_MAX_STATIC) {
+      for (int i = 0; i < __SV_MAX_STATIC; ++i) data_.vals[i] = o.data_.vals[i];
+    } else {
+      capacity_ = size_ = o.size_;
+      data_.ptr = new int[capacity_];
+      std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int));
+    }
+  }
+
+  const SmallVector& operator=(const SmallVector& o) {
+    if (size_ <= __SV_MAX_STATIC) {
+      if (o.size_ <= __SV_MAX_STATIC) {
+        size_ = o.size_;
+        for (int i = 0; i < __SV_MAX_STATIC; ++i) data_.vals[i] = o.data_.vals[i];
+      } else {
+        capacity_ = size_ = o.size_;
+        data_.ptr = new int[capacity_];
+        std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int));
+      }
+    } else {
+      if (o.size_ <= __SV_MAX_STATIC) {
+        delete[] data_.ptr;
+        size_ = o.size_;
+        for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i];
+      } else {
+        if (capacity_ < o.size_) {
+          delete[] data_.ptr;
+          capacity_ = o.size_;
+          data_.ptr = new int[capacity_];
+        }
+        size_ = o.size_;
+        for (int i = 0; i < size_; ++i)
+          data_.ptr[i] = o.data_.ptr[i];
+      }
+    }
+    return *this;
+  }
+
+  ~SmallVector() {
+    if (size_ <= __SV_MAX_STATIC) return;
+    delete[] data_.ptr;
+  }
+
+  void clear() {
+    if (size_ > __SV_MAX_STATIC) {
+      delete[] data_.ptr;
+    }
+    size_ = 0;
+  }
+
+  bool empty() const { return size_ == 0; }
+  size_t size() const { return size_; }
+
+  inline void ensure_capacity(unsigned char min_size) {
+    assert(min_size > 
__SV_MAX_STATIC); +    if (min_size < capacity_) return; +    unsigned char new_cap = std::max(static_cast<unsigned char>(capacity_ << 1), min_size); +    int* tmp = new int[new_cap]; +    std::memcpy(tmp, data_.ptr, capacity_ * sizeof(int)); +    delete[] data_.ptr; +    data_.ptr = tmp; +    capacity_ = new_cap; +  } + +  inline void copy_vals_to_ptr() { +    capacity_ = __SV_MAX_STATIC * 2; +    int* tmp = new int[capacity_]; +    for (int i = 0; i < __SV_MAX_STATIC; ++i) tmp[i] = data_.vals[i]; +    data_.ptr = tmp; +  } + +  inline void push_back(int v) { +    if (size_ < __SV_MAX_STATIC) { +      data_.vals[size_] = v; +      ++size_; +      return; +    } else if (size_ == __SV_MAX_STATIC) { +      copy_vals_to_ptr(); +    } else if (size_ == capacity_) { +      ensure_capacity(size_ + 1); +    } +    data_.ptr[size_] = v; +    ++size_; +  } + +  int& back() { return this->operator[](size_ - 1); } +  const int& back() const { return this->operator[](size_ - 1); } +  int& front() { return this->operator[](0); } +  const int& front() const { return this->operator[](0); } + +  void resize(size_t s, int v = 0) { +    if (s <= __SV_MAX_STATIC) { +      if (size_ > __SV_MAX_STATIC) { +        int tmp[__SV_MAX_STATIC]; +        for (int i = 0; i < s; ++i) tmp[i] = data_.ptr[i]; +        delete[] data_.ptr; +        for (int i = 0; i < s; ++i) data_.vals[i] = tmp[i]; +        size_ = s; +        return; +      } +      if (s <= size_) { +        size_ = s; +        return; +      } else { +        for (int i = size_; i < s; ++i) +          data_.vals[i] = v; +        size_ = s; +        return; +      } +    } else { +      if (size_ <= __SV_MAX_STATIC) +        copy_vals_to_ptr(); +      if (s > capacity_) +        ensure_capacity(s); +      if (s > size_) { +        for (int i = size_; i < s; ++i) +          data_.ptr[i] = v; +      } +      size_ = s; +    } +  } + +  int& operator[](size_t i) { +    if (size_ <= __SV_MAX_STATIC) return data_.vals[i]; +    return data_.ptr[i]; +  } + +  const int& operator[](size_t i) const { +    if (size_ <= __SV_MAX_STATIC) return data_.vals[i]; +    return data_.ptr[i]; +  } + +  bool operator==(const SmallVector& o) const { +    if (size_ != o.size_) return false; +    if (size_ <= __SV_MAX_STATIC) { +      for (size_t i = 0; i < size_; ++i) +        if (data_.vals[i] != o.data_.vals[i]) return false; +      return true; +    } else { +      for (size_t i = 0; i < size_; ++i) +        if (data_.ptr[i] != o.data_.ptr[i]) return false; +      return true; +    } +  } + + private: +  unsigned char capacity_;  // only defined when size_ >= __SV_MAX_STATIC +  unsigned char size_; +  union StorageType { +    int vals[__SV_MAX_STATIC]; +    int* ptr; +  }; +  StorageType data_; + +}; + +inline bool operator!=(const SmallVector& a, const SmallVector& b) { +  return !(a==b); +} + +#endif diff --git a/decoder/small_vector_test.cc b/decoder/small_vector_test.cc new file mode 100644 index 00000000..84237791 --- /dev/null +++ b/decoder/small_vector_test.cc @@ -0,0 +1,129 @@ +#include "small_vector.h" + +#include <gtest/gtest.h> +#include <iostream> +#include <cassert> +#include <vector> + +using namespace std; + +class SVTest : public testing::Test { + protected: +  virtual void SetUp() { } +  virtual void TearDown() { } +}; +        +TEST_F(SVTest, LargerThan2) { +  SmallVector v; +  SmallVector v2; +  v.push_back(0); +  v.push_back(1); +  v.push_back(2); +  assert(v.size() == 3); +  assert(v[2] == 2); +  assert(v[1] == 1); +  assert(v[0] == 0); +  v2 = v; +  
SmallVector copy(v); +  assert(copy.size() == 3); +  assert(copy[0] == 0); +  assert(copy[1] == 1); +  assert(copy[2] == 2); +  assert(copy == v2); +  copy[1] = 99; +  assert(copy != v2); +  assert(v2.size() == 3); +  assert(v2[2] == 2); +  assert(v2[1] == 1); +  assert(v2[0] == 0); +  v2[0] = -2; +  v2[1] = -1; +  v2[2] = 0; +  assert(v2[2] == 0); +  assert(v2[1] == -1); +  assert(v2[0] == -2); +  SmallVector v3(1,1); +  assert(v3[0] == 1); +  v2 = v3; +  assert(v2.size() == 1); +  assert(v2[0] == 1); +  SmallVector v4(10, 1); +  assert(v4.size() == 10); +  assert(v4[5] == 1); +  assert(v4[9] == 1); +  v4 = v; +  assert(v4.size() == 3); +  assert(v4[2] == 2); +  assert(v4[1] == 1); +  assert(v4[0] == 0); +  SmallVector v5(10, 2); +  assert(v5.size() == 10); +  assert(v5[7] == 2); +  assert(v5[0] == 2); +  assert(v.size() == 3); +  v = v5; +  assert(v.size() == 10); +  assert(v[2] == 2); +  assert(v[9] == 2); +  SmallVector cc; +  for (int i = 0; i < 33; ++i) +    cc.push_back(i); +  for (int i = 0; i < 33; ++i) +    assert(cc[i] == i); +  cc.resize(20); +  assert(cc.size() == 20); +  for (int i = 0; i < 20; ++i) +    assert(cc[i] == i); +  cc[0]=-1; +  cc.resize(1, 999); +  assert(cc.size() == 1); +  assert(cc[0] == -1); +  cc.resize(99, 99); +  for (int i = 1; i < 99; ++i) { +    cerr << i << " " << cc[i] << endl; +    assert(cc[i] == 99); +  } +  cc.clear(); +  assert(cc.size() == 0); +} + +TEST_F(SVTest, Small) { +  SmallVector v; +  SmallVector v1(1,0); +  SmallVector v2(2,10); +  SmallVector v1a(2,0); +  EXPECT_TRUE(v1 != v1a); +  EXPECT_TRUE(v1 == v1); +  EXPECT_EQ(v1[0], 0); +  EXPECT_EQ(v2[1], 10); +  EXPECT_EQ(v2[0], 10); +  ++v2[1]; +  --v2[0]; +  EXPECT_EQ(v2[0], 9); +  EXPECT_EQ(v2[1], 11); +  SmallVector v3(v2); +  assert(v3[0] == 9); +  assert(v3[1] == 11); +  assert(!v3.empty()); +  assert(v3.size() == 2); +  v3.clear(); +  assert(v3.empty()); +  assert(v3.size() == 0); +  assert(v3 != v2); +  assert(v2 != v3); +  v3 = v2; +  assert(v3 == v2); +  assert(v2 == v3); +  assert(v3[0] == 9); +  assert(v3[1] == 11); +  assert(!v3.empty()); +  assert(v3.size() == 2); +  cerr << sizeof(SmallVector) << endl; +  cerr << sizeof(vector<int>) << endl; +} + +int main(int argc, char** argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} + diff --git a/decoder/sparse_vector.cc b/decoder/sparse_vector.cc new file mode 100644 index 00000000..4035b9ef --- /dev/null +++ b/decoder/sparse_vector.cc @@ -0,0 +1,98 @@ +#include "sparse_vector.h" + +#include <iostream> +#include <cstring> + +#include "hg_io.h" + +using namespace std; + +namespace B64 { + +void Encode(double objective, const SparseVector<double>& v, ostream* out) { +  const int num_feats = v.num_active(); +  size_t tot_size = 0; +  const size_t off_objective = tot_size; +  tot_size += sizeof(double);                   // objective +  const size_t off_num_feats = tot_size; +  tot_size += sizeof(int);                      // num_feats +  const size_t off_data = tot_size; +  tot_size += sizeof(unsigned char) * num_feats; // lengths of feature names; +  typedef SparseVector<double>::const_iterator const_iterator; +  for (const_iterator it = v.begin(); it != v.end(); ++it) +    tot_size += FD::Convert(it->first).size();   // feature names; +  tot_size += sizeof(double) * num_feats;        // gradient +  const size_t off_magic = tot_size; +  tot_size += 4;                                 // magic + +  // size_t b64_size = tot_size * 4 / 3; +  // cerr << "Sparse vector binary size: " << tot_size << "  (b64 size=" 
<< b64_size << ")\n"; +  char* data = new char[tot_size]; +  *reinterpret_cast<double*>(&data[off_objective]) = objective; +  *reinterpret_cast<int*>(&data[off_num_feats]) = num_feats; +  char* cur = &data[off_data]; +  assert(cur - data == off_data); +  for (const_iterator it = v.begin(); it != v.end(); ++it) { +    const string& fname = FD::Convert(it->first); +    *cur++ = static_cast<char>(fname.size());   // name len +    memcpy(cur, &fname[0], fname.size()); +    cur += fname.size(); +    *reinterpret_cast<double*>(cur) = it->second; +    cur += sizeof(double); +  } +  assert(cur - data == off_magic); +  *reinterpret_cast<unsigned int*>(cur) = 0xBAABABBAu; +  cur += sizeof(unsigned int); +  assert(cur - data == tot_size); +  b64encode(data, tot_size, out); +  delete[] data; +} + +bool Decode(double* objective, SparseVector<double>* v, const char* in, size_t size) { +  v->clear(); +  if (size % 4 != 0) { +    cerr << "B64 error - line % 4 != 0\n"; +    return false; +  } +  const size_t decoded_size = size * 3 / 4 - sizeof(unsigned int); +  const size_t buf_size = decoded_size + sizeof(unsigned int); +  if (decoded_size < 6) { cerr << "SparseVector decoding error: too short!\n"; return false; } +  char* data = new char[buf_size]; +  if (!b64decode(reinterpret_cast<const unsigned char*>(in), size, data, buf_size)) { +    delete[] data; +    return false; +  } +  size_t cur = 0; +  *objective = *reinterpret_cast<double*>(data); +  cur += sizeof(double); +  const int num_feats = *reinterpret_cast<int*>(&data[cur]); +  cur += sizeof(int); +  int fc = 0; +  while(fc < num_feats && cur < decoded_size) { +    ++fc; +    const int fname_len = data[cur++]; +    assert(fname_len > 0); +    assert(fname_len < 256); +    string fname(fname_len, '\0'); +    memcpy(&fname[0], &data[cur], fname_len); +    cur += fname_len; +    const double val = *reinterpret_cast<double*>(&data[cur]); +    cur += sizeof(double); +    int fid = FD::Convert(fname); +    v->set_value(fid, val); +  } +  if(num_feats != fc) { +    cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n"; +    delete[] data; +    return false; +  } +  if (*reinterpret_cast<unsigned int*>(&data[cur]) != 0xBAABABBAu) { +    cerr << "SparseVector decodeding error : magic does not match!\n"; +    delete[] data; +    return false; +  } +  delete[] data; +  return true; +} + +} diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h new file mode 100644 index 00000000..66c9b10d --- /dev/null +++ b/decoder/sparse_vector.h @@ -0,0 +1,274 @@ +#ifndef _SPARSE_VECTOR_H_ +#define _SPARSE_VECTOR_H_ + +// this is a modified version of code originally written +// by Phil Blunsom + +#include <iostream> +#include <map> +#include <tr1/unordered_map> +#include <vector> +#include <valarray> + +#include "fdict.h" + +template <typename T> +class SparseVector { +public: +  typedef std::map<int, T> MapType; +  typedef typename std::map<int, T>::const_iterator const_iterator; +  SparseVector() {} + +  const T operator[](int index) const { +    typename MapType::const_iterator found = values_.find(index); +    if (found == values_.end()) +      return T(0); +    else +      return found->second; +  } + +  void set_value(int index, const T &value) { +    values_[index] = value; +  } + +    T add_value(int index, const T &value) { +        return values_[index] += value; +    } + +    T value(int index) const { +        typename MapType::const_iterator found = values_.find(index); +        if (found != values_.end()) +            return 
found->second; +        else +            return T(0); +    } + +    void store(std::valarray<T>* target) const { +      (*target) *= 0; +      for (typename MapType::const_iterator  +              it = values_.begin(); it != values_.end(); ++it) { +        if (it->first >= target->size()) break; +        (*target)[it->first] = it->second; +      } +    } + +    int max_index() const { +        if (values_.empty()) return 0; +        typename MapType::const_iterator found =values_.end(); +        --found; +        return found->first; +    } + +    // dot product with a unit vector of the same length +    // as the sparse vector +    T dot() const { +        T sum = 0; +        for (typename MapType::const_iterator  +                it = values_.begin(); it != values_.end(); ++it) +            sum += it->second; +        return sum; +    } + +    template<typename S> +    S dot(const SparseVector<S> &vec) const { +        S sum = 0; +        for (typename MapType::const_iterator  +                it = values_.begin(); it != values_.end(); ++it) +        { +            typename MapType::const_iterator  +                found = vec.values_.find(it->first); +            if (found != vec.values_.end()) +                sum += it->second * found->second; +        } +        return sum; +    } +     +    template<typename S> +    S dot(const std::vector<S> &vec) const { +        S sum = 0; +        for (typename MapType::const_iterator  +                it = values_.begin(); it != values_.end(); ++it) +        { +            if (it->first < static_cast<int>(vec.size())) +                sum += it->second * vec[it->first]; +        } +        return sum; +    } + +    template<typename S> +    S dot(const S *vec) const { +        // this is not range checked! 
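+        // (vec must have at least max_index()+1 entries or this reads past
+        // the end of the array)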
+        S sum = 0;
+        for (typename MapType::const_iterator 
+                it = values_.begin(); it != values_.end(); ++it)
+            sum += it->second * vec[it->first];
+        return sum;
+    }
+
+    T l1norm() const {
+      T sum = 0;
+      for (typename MapType::const_iterator 
+              it = values_.begin(); it != values_.end(); ++it)
+        sum += fabs(it->second);
+      return sum;
+    }
+
+    T l2norm() const {
+      T sum = 0;
+      for (typename MapType::const_iterator 
+              it = values_.begin(); it != values_.end(); ++it)
+        sum += it->second * it->second;
+      return sqrt(sum);
+    }
+
+    // nb: entries that become zero are erased; other is iterated directly,
+    // so do not pass *this as other
+    SparseVector<T> &operator+=(const SparseVector<T> &other) {
+        for (typename MapType::const_iterator 
+                it = other.values_.begin(); it != other.values_.end(); ++it)
+        {
+            T v = (values_[it->first] += it->second);
+            if (v == T(0))
+                values_.erase(it->first);
+        }
+        return *this;
+    }
+
+    SparseVector<T> &operator-=(const SparseVector<T> &other) {
+        for (typename MapType::const_iterator 
+                it = other.values_.begin(); it != other.values_.end(); ++it)
+        {
+            T v = (values_[it->first] -= it->second);
+            if (v == T(0))
+                values_.erase(it->first);
+        }
+        return *this;
+    }
+
+    SparseVector<T> &operator-=(const double &x) {
+        for (typename MapType::iterator 
+                it = values_.begin(); it != values_.end(); ++it)
+            it->second -= x;
+        return *this;
+    }
+
+    SparseVector<T> &operator+=(const double &x) {
+        for (typename MapType::iterator 
+                it = values_.begin(); it != values_.end(); ++it)
+            it->second += x;
+        return *this;
+    }
+
+    SparseVector<T> &operator/=(const T &x) {
+        for (typename MapType::iterator 
+                it = values_.begin(); it != values_.end(); ++it)
+            it->second /= x;
+        return *this;
+    }
+
+    SparseVector<T> &operator*=(const T& x) {
+        for (typename MapType::iterator 
+                it = values_.begin(); it != values_.end(); ++it)
+            it->second *= x;
+        return *this;
+    }
+
+    SparseVector<T> operator+(const double &x) const {
+        SparseVector<T> result = *this;
+        return result += x;
+    }
+
+    SparseVector<T> operator-(const double &x) const {
+        SparseVector<T> result = *this;
+        return result -= x;
+    }
+
+    SparseVector<T> operator/(const double &x) const {
+        SparseVector<T> result = *this;
+        return result /= x;
+    }
+
+    std::ostream &operator<<(std::ostream &out) const {
+        bool first = true;
+        for (typename MapType::const_iterator 
+                it = values_.begin(); it != values_.end(); ++it) {
+          // by definition feature id 0 is a dummy value
+          if (it->first == 0) continue;
+          out << (first ? 
"" : ";") +	      << FD::Convert(it->first) << '=' << it->second; +          first = false; +        } +        return out; +    } + +    bool operator<(const SparseVector<T> &other) const { +        typename MapType::const_iterator it = values_.begin(); +        typename MapType::const_iterator other_it = other.values_.begin(); + +        for (; it != values_.end() && other_it != other.values_.end(); ++it, ++other_it) +        { +            if (it->first < other_it->first) return true; +            if (it->first > other_it->first) return false; +            if (it->second < other_it->second) return true; +            if (it->second > other_it->second) return false; +        } +        return values_.size() < other.values_.size(); +    } + +    int num_active() const { return values_.size(); } +    bool empty() const { return values_.empty(); } + +    const_iterator begin() const { return values_.begin(); } +    const_iterator end() const { return values_.end(); } + +    void clear() { +        values_.clear(); +    } +    void clear_value(int index) { +      values_.erase(index); +    } + +    void swap(SparseVector<T>& other) { +      values_.swap(other.values_); +    } + +private: +  MapType values_; +}; + +template <typename T> +SparseVector<T> operator+(const SparseVector<T>& a, const SparseVector<T>& b) { +  SparseVector<T> result = a; +  return result += b; +} + +template <typename T> +SparseVector<T> operator*(const SparseVector<T>& a, const double& b) { +  SparseVector<T> result = a; +  return result *= b; +} + +template <typename T> +SparseVector<T> operator*(const SparseVector<T>& a, const T& b) { +  SparseVector<T> result = a; +  return result *= b; +} + +template <typename T> +SparseVector<T> operator*(const double& a, const SparseVector<T>& b) { +  SparseVector<T> result = b; +  return result *= a; +} + +template <typename T> +std::ostream &operator<<(std::ostream &out, const SparseVector<T> &vec) +{ +    return vec.operator<<(out); +} + +namespace B64 { +  void Encode(double objective, const SparseVector<double>& v, std::ostream* out); +  // returns false if failed to decode +  bool Decode(double* objective, SparseVector<double>* v, const char* data, size_t size); +} + +#endif diff --git a/decoder/stringlib.cc b/decoder/stringlib.cc new file mode 100644 index 00000000..3e52ae87 --- /dev/null +++ b/decoder/stringlib.cc @@ -0,0 +1,98 @@ +#include "stringlib.h" + +#include <cstring> +#include <cstdlib> +#include <cassert> +#include <iostream> +#include <map> + +#include "lattice.h" + +using namespace std; + +void ParseTranslatorInput(const string& line, string* input, string* ref) { +  size_t hint = 0; +  if (line.find("{\"rules\":") == 0) { +    hint = line.find("}}"); +    if (hint == string::npos) { +      cerr << "Syntax error: " << line << endl; +      abort(); +    } +    hint += 2; +  } +  size_t pos = line.find("|||", hint); +  if (pos == string::npos) { *input = line; return; } +  ref->clear(); +  *input = line.substr(0, pos - 1); +  string rline = line.substr(pos + 4); +  if (rline.size() > 0) { +    assert(ref); +    *ref = rline; +  } +} + +void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { +  string sref; +  ParseTranslatorInput(line, input, &sref); +  if (sref.size() > 0) { +    assert(ref); +    LatticeTools::ConvertTextOrPLF(sref, ref); +  } +} + +void ProcessAndStripSGML(string* pline, map<string, string>* out) { +  map<string, string>& meta = *out; +  string& line = *pline; +  string lline = LowercaseString(line); +  if 
(lline.find("<seg") != 0) return;
+  size_t close = lline.find(">");
+  if (close == string::npos) return; // error
+  size_t end = lline.find("</seg>");
+  string seg = Trim(lline.substr(4, close-4));
+  string text = line.substr(close+1, end - close - 1);
+  // normalize "label = val" to "label=val" so the attributes can be scanned
+  for (size_t i = 1; i < seg.size(); i++) {
+    if (seg[i] == '=' && seg[i-1] == ' ') {
+      string less = seg.substr(0, i-1) + seg.substr(i);
+      seg = less; i = 0; continue;
+    }
+    if (seg[i] == '=' && seg[i+1] == ' ') {
+      string less = seg.substr(0, i+1);
+      if (i+2 < seg.size()) less += seg.substr(i+2);
+      seg = less; i = 0; continue;
+    }
+  }
+  line = Trim(text);
+  if (seg == "") return;
+  for (size_t i = 1; i < seg.size(); i++) {
+    if (seg[i] == '=') {
+      string label = seg.substr(0, i);
+      string val = seg.substr(i+1);
+      if (val[0] == '"') {
+        val = val.substr(1);
+        size_t close = val.find('"');
+        if (close == string::npos) {
+          cerr << "SGML parse error: missing \"\n";
+          seg = "";
+          i = 0;
+        } else {
+          seg = val.substr(close+1);
+          val = val.substr(0, close);
+          i = 0;
+        }
+      } else {
+        size_t close = val.find(' ');
+        if (close == string::npos) {
+          seg = "";
+          i = 0;
+        } else {
+          seg = val.substr(close+1);
+          val = val.substr(0, close);
+        }
+      }
+      label = Trim(label);
+      seg = Trim(seg);
+      meta[label] = val;
+    }
+  }
+}
+
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
new file mode 100644
index 00000000..76efee8f
--- /dev/null
+++ b/decoder/stringlib.h
@@ -0,0 +1,101 @@
+#ifndef _STRINGLIB_H_
+#define _STRINGLIB_H_
+
+#include <map>
+#include <vector>
+#include <cctype>
+#include <string>
+
+// read a line in the form of either:
+//   source
+//   source ||| target
+// source will be returned as a string; target must be a sentence or
+// a lattice (in PLF format) and will be returned as a Lattice object
+void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
+struct Lattice;
+void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);
+
+inline const std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
+  std::string res = str;
+  res.erase(str.find_last_not_of(dropChars)+1);
+  return res.erase(0, res.find_first_not_of(dropChars));
+}
+
+inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) {
+  std::string s = str;
+  size_t last = 0;
+  res->clear();
+  for (size_t i = 0; i < s.size(); ++i)
+    if (s[i] == delimiter) {
+      s[i] = 0;
+      if (last != i) {
+        res->push_back(&s[last]);
+      }
+      last = i + 1;
+    }
+  if (last != s.size())
+    res->push_back(&s[last]);
+}
+
+inline std::string LowercaseString(const std::string& in) {
+  std::string res(in.size(), ' ');
+  for (size_t i = 0; i < in.size(); ++i)
+    res[i] = tolower(in[i]);
+  return res;
+}
+
+inline int CountSubstrings(const std::string& str, const std::string& sub) {
+  size_t p = 0;
+  int res = 0;
+  while (p < str.size()) {
+    p = str.find(sub, p);
+    if (p == std::string::npos) break;
+    ++res;
+    p += sub.size();
+  }
+  return res;
+}
+
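+// splits on runs of spaces/tabs, skipping empty tokens, and returns the
+// number of tokens found, e.g. "a  b\tc" yields {"a", "b", "c"} and returns 3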
+inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) {
+  out->clear();
+  size_t i = 0;
+  size_t start = 0;
+  while (i < in.size()) {
+    if (in[i] == ' ' || in[i] == '\t') {
+      if (i - start > 0)
+        out->push_back(in.substr(start, i - start));
+      start = i + 1;
+    }
+    ++i;
+  }
+  if (i > start)
+    out->push_back(in.substr(start, i - start));
+  return out->size();
+}
+
+inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
+  cmd->clear();
+  param->clear();
+  std::vector<std::string> x;
+  SplitOnWhitespace(in, &x);
+  if (x.size() == 0) return;
+  *cmd = x[0];
+  for (size_t i = 1; i < x.size(); ++i) {
+    if (i > 1) { *param += " "; }
+    *param += x[i];
+  }
+}
+
+void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
+
+// given the first character of a UTF8 block, find out how wide it is
+// see http://en.wikipedia.org/wiki/UTF-8 for more info
+inline unsigned int UTF8Len(unsigned char x) {
+  if (x < 0x80) return 1;
+  else if ((x >> 5) == 0x06) return 2;
+  else if ((x >> 4) == 0x0e) return 3;
+  else if ((x >> 3) == 0x1e) return 4;
+  else return 0;
+}
+
+#endif
diff --git a/decoder/tagger.cc b/decoder/tagger.cc
new file mode 100644
index 00000000..4dded35f
--- /dev/null
+++ b/decoder/tagger.cc
@@ -0,0 +1,112 @@
+#include "tagger.h"
+
+#include "tdict.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "hg.h"
+#include "wordid.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+// This is a really simple linear chain tagger.
+// You specify a tagset, and it hypothesizes that each word in the
+// input can be tagged with any member of the tagset.
+// There are a couple of sample features implemented in ff_tagger.h/cc.
+// One thing to note: while CRFs typically define the label
+// sequence as corresponding to the hidden states in a trellis,
+// in our model the labels are on edges, but mathematically
+// they are identical.
+//
+// Things to do if you want to make this a "real" tagger:
+// - support dictionaries (for each word, limit the tags considered)
+// - add latent variables - this is really easy to do
+
+static void ReadTagset(const string& file, vector<WordID>* tags) {
+  ReadFile rf(file);
+  istream& in(*rf.stream());
+  while (in) {
+    string tag;
+    in >> tag;
+    if (tag.empty()) continue;
+    tags->push_back(TD::Convert(tag));
+  }
+  cerr << "Read " << tags->size() << " labels (tags) from " << file << endl;
+}
+
+struct TaggerImpl {
+  TaggerImpl(const boost::program_options::variables_map& conf) :
+      kXCAT(TD::Convert("X")*-1),
+      kNULL(TD::Convert("<eps>")),
+      kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")),
+      kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) {
+    if (conf.count("tagger_tagset") == 0) {
+      cerr << "Tagger requires --tagger_tagset FILE!\n";
+      exit(1);
+    }
+    ReadTagset(conf["tagger_tagset"].as<string>(), &tagset_);
+  }
+
+  void BuildTrellis(const vector<WordID>& seq, Hypergraph* forest) {
+    int prev_node_id = -1;
+    for (int i = 0; i < seq.size(); ++i) {
+      const WordID& src = seq[i];
+      const int new_node_id = forest->AddNode(kXCAT)->id_;
+      for (int k = 0; k < tagset_.size(); ++k) {
+        TRulePtr rule(TRule::CreateLexicalRule(src, tagset_[k]));
+        Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+        edge->i_ = i;
+        edge->j_ = i+1;
+        edge->prev_i_ = i;    // we set these for FastLinearIntersect
+        edge->prev_j_ = i+1;  //      "      "            "
+        forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+      }
+      if (prev_node_id >= 0) {
+        const int comb_node_id = forest->AddNode(kXCAT)->id_;
+        
Hypergraph::TailNodeVector tail(2, prev_node_id); +        tail[1] = new_node_id; +        Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail); +        edge->i_ = 0; +        edge->j_ = i+1; +        forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id); +        prev_node_id = comb_node_id; +      } else { +        prev_node_id = new_node_id; +      } +    } +    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); +    Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); +    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); +    forest->ConnectEdgeToHeadNode(hg_edge, goal); +  } + + private: +  vector<WordID> tagset_; +  const WordID kXCAT; +  const WordID kNULL; +  const TRulePtr kBINARY; +  const TRulePtr kGOAL_RULE; +}; + +Tagger::Tagger(const boost::program_options::variables_map& conf) : + pimpl_(new TaggerImpl(conf)) {} + + +bool Tagger::TranslateImpl(const string& input, +                       SentenceMetadata* smeta, +                       const vector<double>& weights, +                       Hypergraph* forest) { +  Lattice lattice; +  LatticeTools::ConvertTextToLattice(input, &lattice); +  smeta->SetSourceLength(lattice.size()); +  vector<WordID> sequence(lattice.size()); +  for (int i = 0; i < lattice.size(); ++i) { +    assert(lattice[i].size() == 1); +    sequence[i] = lattice[i][0].label; +  } +  pimpl_->BuildTrellis(sequence, forest); +  forest->Reweight(weights); +  forest->is_linear_chain_ = true; +  return true; +} + diff --git a/decoder/tagger.h b/decoder/tagger.h new file mode 100644 index 00000000..9ac820d9 --- /dev/null +++ b/decoder/tagger.h @@ -0,0 +1,17 @@ +#ifndef _TAGGER_H_ +#define _TAGGER_H_ + +#include "translator.h" + +struct TaggerImpl; +struct Tagger : public Translator { +  Tagger(const boost::program_options::variables_map& conf); +  bool TranslateImpl(const std::string& input, +                 SentenceMetadata* smeta, +                 const std::vector<double>& weights, +                 Hypergraph* forest); + private: +  boost::shared_ptr<TaggerImpl> pimpl_; +}; + +#endif diff --git a/decoder/tdict.cc b/decoder/tdict.cc new file mode 100644 index 00000000..c00d20b8 --- /dev/null +++ b/decoder/tdict.cc @@ -0,0 +1,49 @@ +#include "Ngram.h" +#include "dict.h" +#include "tdict.h" +#include "Vocab.h" + +using namespace std; + +Vocab* TD::dict_ = new Vocab; + +static const string empty; +static const string space = " "; + +WordID TD::Convert(const std::string& s) { +  return dict_->addWord((VocabString)s.c_str()); +} + +const char* TD::Convert(const WordID& w) { +  return dict_->getWord((VocabIndex)w); +} + +void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) { +  ids->clear(); +  for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i) +    ids->push_back(TD::Convert(*i)); +} + +std::string TD::GetString(const std::vector<WordID>& str) { +  string res; +  for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i) +    res += (i == str.begin() ? 
empty : space) + TD::Convert(*i); +  return res; +} + +void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) { +  string s = sent; +  int last = 0; +  ids->clear(); +  for (int i=0; i < s.size(); ++i) +    if (s[i] == 32 || s[i] == '\t') { +      s[i]=0; +      if (last != i) { +        ids->push_back(Convert(&s[last])); +      } +      last = i + 1; +    } +  if (last != s.size()) +    ids->push_back(Convert(&s[last])); +} + diff --git a/decoder/tdict.h b/decoder/tdict.h new file mode 100644 index 00000000..31f66367 --- /dev/null +++ b/decoder/tdict.h @@ -0,0 +1,30 @@ +#ifndef _TDICT_H_ +#define _TDICT_H_ + +#include <string> +#include <vector> +#include "wordid.h" + +class Vocab; + +struct TD { +  static Vocab* dict_; +  static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids); +  static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids); +  static std::string GetString(const std::vector<WordID>& str); +  static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) { +    const char* word = TD::Convert(w); +    const char* const end_buf = buffer + bufsize; +    char* dest = buffer + pos; +    while(dest < end_buf && *word) { +      *dest = *word; +      ++dest; +      ++word; +    } +    return (dest - buffer); +  } +  static WordID Convert(const std::string& s); +  static const char* Convert(const WordID& w); +}; + +#endif diff --git a/decoder/test_data/dummy.3gram.lm b/decoder/test_data/dummy.3gram.lm new file mode 100644 index 00000000..ae665284 --- /dev/null +++ b/decoder/test_data/dummy.3gram.lm @@ -0,0 +1,2645 @@ + +\data\ +ngram 1=490 +ngram 2=1023 +ngram 3=1119 + +\1-grams: +-2.761928	!	-0.06284945 +-1.91683	"	-0.03559465 +-2.761928	'	-0.06057167 +-2.159868	(	-0.07742823 +-2.159868	)	-0.05637721 +-1.292106	,	-0.04497077 +-3.062958	-	-0.06247065 +-1.429489	.	-0.08555528 +-2.761928	12	-0.06473851 +-3.062958	17	-0.06586801 +-2.585837	2000	-0.05520994 +-3.062958	2002	-0.06360606 +-3.062958	2006	-0.0497812 +-3.062958	2008	-0.06322792 +-3.062958	2009	-0.0497812 +-3.062958	200–400	-0.06549184 +-3.062958	224	-0.06586801 +-1.91683	</s> +-99	<s>	-0.0457003 +-2.761928	?	
-0.05751594 +-1.720535	a	-0.05548429 +-2.460898	about	-0.05211611 +-3.062958	acquiesced	-0.05942829 +-3.062958	actually	-0.04349266 +-3.062958	addition	-0.05980976 +-3.062958	admit	-0.06095213 +-3.062958	affected	-0.04071253 +-2.761928	against	-0.06549184 +-3.062958	aging	-0.06586801 +-3.062958	ago	-0.04349266 +-3.062958	ahead	-0.06586801 +-2.761928	al	-0.06284945 +-2.761928	all	-0.0590465 +-3.062958	all-around	-0.06586801 +-3.062958	along	-0.04071253 +-2.761928	also	-0.06322792 +-2.761928	always	-0.06436136 +-2.363988	an	-0.06436135 +-3.062958	analysis	-0.06473851 +-1.631594	and	0.006203346 +-3.062958	anti-divine	-0.06586801 +-3.062958	any	-0.06549184 +-3.062958	approach	-0.05789908 +-3.062958	archive	-0.04071253 +-3.062958	are	-0.05789908 +-2.761928	arkive	-0.06549184 +-2.585837	article	-0.0228177 +-2.21786	as	-0.09020901 +-3.062958	asked	-0.06398387 +-2.585837	at	-0.03145044 +-2.761928	attention	-0.02612664 +-3.062958	available	-0.04349266 +-3.062958	average	-0.04349266 +-3.062958	away	-0.06322792 +-3.062958	ayers	-0.05597997 +-3.062958	b	-0.04349266 +-3.062958	back-and-forth	-0.06586801 +-3.062958	bailie	-0.0497812 +-2.761928	be	-0.06511534 +-3.062958	because	-0.06586801 +-2.460898	been	-0.06322791 +-3.062958	before	-0.04349266 +-2.761928	begin	-0.05520995 +-3.062958	being	-0.06586801 +-2.585837	between	-0.1350269 +-2.460898	bias	-0.04111077 +-3.062958	biased	-0.06511534 +-3.062958	biblical	-0.06586801 +-3.062958	bill	-0.06586801 +-3.062958	blade	-0.06436136 +-3.062958	blood	-0.04349266 +-3.062958	bob	-0.06549184 +-3.062958	book	-0.06436136 +-2.159868	briffa	-0.06804922 +-2.761928	briffa's	-0.06284945 +-2.021565	but	-0.01525023 +-2.21786	by	-0.07600738 +-2.761928	ca	-0.2166343 +-2.761928	can	-0.06473851 +-3.062958	case	-0.06511534 +-3.062958	cast	-0.06473851 +-3.062958	catch	-0.06511534 +-3.062958	caught	-0.06511534 +-3.062958	caveats	-0.06322792 +-3.062958	centennial-scale	-0.06549184 +-3.062958	cf	-0.0497812 +-3.062958	change	-0.06209152 +-3.062958	changing	-0.06360606 +-3.062958	characterizes	-0.06586801 +-3.062958	checked	-0.06586801 +-2.159868	chronology	-0.02240231 +-3.062958	church	-0.06398387 +-3.062958	cocaine	-0.06398387 +-3.062958	collection	-0.06586801 +-3.062958	combination	-0.06209152 +-3.062958	combine	-0.04071253 +-3.062958	combined	-0.06209152 +-3.062958	comment	-0.06360606 +-3.062958	commentary	-0.06322792 +-3.062958	commenter	-0.06586801 +-3.062958	comments	-0.06586801 +-3.062958	compared	-0.05789908 +-3.062958	concerned	-0.06473851 +-3.062958	concrete	-0.06095213 +-3.062958	connection	-0.06209152 +-2.761928	conservatives	-0.06360606 +-3.062958	considered	-0.06095213 +-3.062958	consists	-0.04349266 +-3.062958	constructing	-0.05789908 +-2.761928	control	-0.03991493 +-2.585837	cores	-0.0236473 +-3.062958	corridor	-0.06473851 +-2.761928	crack	-0.06436136 +-3.062958	crossroads	-0.0497812 +-2.460898	cru	-0.1318786 +-3.062958	darkness	-0.05597997 +-2.108715	data	-0.06845023 +-2.761928	day	-0.05674864 +-2.761928	days	-0.04939082 +-3.062958	debt	-0.04349266 +-3.062958	decline	-0.06095213 +-3.062958	deep	-0.06549184 +-3.062958	deeper	-0.06586801 +-3.062958	delete	-0.05789908 +-3.062958	derived	-0.06511534 +-3.062958	described	-0.05942829 +-2.761928	did	-0.06095213 +-2.761928	difference	-0.04860901 +-2.761928	different	-0.06247065 +-2.761928	divergence	-0.2166343 +-2.761928	do	-0.05559513 +-3.062958	does	-0.06247065 +-3.062958	doing	-0.06586801 +-3.062958	don't	-0.06586801 +-3.062958	done	-0.06586801 +-3.062958	doubt	-0.06360606 +-3.062958	down	-0.05789908 +-3.062958	due	
-0.06473851 +-3.062958	earlier	-0.06019088 +-3.062958	editors	-0.06511534 +-3.062958	energy	-0.04349266 +-3.062958	enormous	-0.06586801 +-2.761928	et	-0.2166343 +-3.062958	even	-0.06586801 +-3.062958	every	-0.06586801 +-3.062958	exactly	-0.06360606 +-3.062958	exception	-0.05789908 +-3.062958	excluding	-0.06549184 +-3.062958	expect	-0.06511534 +-3.062958	extension	-0.05597997 +-3.062958	factors	-0.04349266 +-3.062958	fantasy	-0.06436136 +-3.062958	far	-0.06511534 +-2.585837	few	-0.1590744 +-2.585837	finally	-0.06511533 +-3.062958	first	-0.04349266 +-3.062958	flesh	-0.05597997 +-3.062958	following:	-0.06095213 +-3.062958	follows:	-0.06095213 +-2.284806	for	-0.06171204 +-3.062958	forests	-0.0497812 +-2.585837	from	-0.05713245 +-3.062958	fully	-0.06586801 +-2.585837	further	-0.06511533 +-3.062958	furthermore	-0.04349266 +-3.062958	future	-0.0497812 +-3.062958	generating	-0.06586801 +-2.761928	get	-0.191855 +-3.062958	ghastly	-0.06586801 +-3.062958	ghostwritten	-0.06360606 +-3.062958	gil	-0.06586801 +-3.062958	given	-0.04071253 +-3.062958	going	-0.05789908 +-3.062958	got	-0.06436136 +-2.761928	great	-0.2166343 +-3.062958	growing	-0.0497812 +-3.062958	grows	-0.06511534 +-2.363988	had	-0.1033177 +-2.585837	hantemirov	-0.09654189 +-2.761928	happening	-0.06436136 +-3.062958	happens	-0.06549184 +-3.062958	hard	-0.05789908 +-3.062958	hardly	-0.06473851 +-2.460898	has	-0.03063563 +-3.062958	hate	-0.05789908 +-2.284806	have	-0.08108715 +-3.062958	haven't	-0.06586801 +-2.363988	he	-0.112982 +-3.062958	here	-0.06586801 +-3.062958	highly	-0.06586801 +-2.761928	him	-0.05751594 +-2.585837	his	-0.06511533 +-3.062958	how	-0.06586801 +-2.761928	however	-0.1946352 +-3.062958	hs	-0.06586801 +-3.062958	humanity	-0.06511534 +-2.108715	i	-0.05980975 +-3.062958	i'd	-0.06586801 +-3.062958	i've	-0.06586801 +-2.761928	idea	-0.02612664 +-2.761928	if	-0.03670979 +-3.062958	illusion	-0.05597997 +-3.062958	immense	-0.06586801 +-3.062958	impact	-0.06322792 +-3.062958	important	-0.06586801 +-1.807685	in	-0.04419087 +-3.062958	included	-0.06209152 +-2.761928	including	-0.0165447 +-3.062958	indeed	-0.06511534 +-3.062958	individual	-0.06511534 +-3.062958	information	-0.06511534 +-3.062958	inhomogeneities	-0.04349266 +-3.062958	initial	-0.06549184 +-2.761928	instead	-0.2109523 +-3.062958	interannual	-0.06549184 +-2.761928	into	-0.03991493 +-3.062958	introduced	-0.06360606 +-1.91683	is	-0.001109093 +-2.062958	it	-0.06621437 +-2.460898	it's	-0.06019088 +-3.062958	its	-0.06586801 +-2.761928	journal	-0.06209152 +-3.062958	jurisdiction	-0.0497812 +-2.460898	just	-0.05520994 +-3.062958	kaufman	-0.06549184 +-3.062958	keeps	-0.06586801 +-2.761928	khadyta	-0.2166343 +-2.460898	know	-0.1105378 +-3.062958	larch	-0.06586801 +-2.761928	larches	-0.04743365 +-3.062958	large-scale	-0.06095213 +-2.761928	like	-0.06511534 +-3.062958	limited	-0.06586801 +-3.062958	living	-0.06549184 +-3.062958	longest	-0.05597997 +-3.062958	looking	-0.06549184 +-3.062958	looks	-0.06586801 +-3.062958	love	-0.05789908 +-3.062958	made	-0.06095213 +-2.761928	mag	-0.2143704 +-3.062958	magnitude	-0.05980976 +-3.062958	magnus	-0.0497812 +-3.062958	makes	-0.04071253 +-3.062958	many	-0.06586801 +-3.062958	may	-0.06586801 +-3.062958	mean	-0.06322792 +-3.062958	measured	-0.06360606 +-2.761928	measurement	-0.213992 +-2.460898	method	-0.03711172 +-3.062958	methodology	-0.06586801 +-3.062958	mind	-0.06511534 +-3.062958	mix	-0.06586801 +-2.585837	more	-0.05636447 +-3.062958	morning	-0.06284945 +-2.585837	most	-0.0647385 +-2.761928	much	-0.06473851 +-3.062958	multi-parters	
-0.04349266 +-3.062958	multiproxy	-0.06586801 +-3.062958	mundane	-0.06511534 +-2.585837	my	-0.1598284 +-3.062958	national	-0.06586801 +-3.062958	naughtiness	-0.0497812 +-3.062958	nettle	-0.04349266 +-3.062958	never	-0.06586801 +-3.062958	next	-0.04349266 +-3.062958	no	-0.06586801 +-3.062958	non-robustness	-0.06586801 +-3.062958	northern	-0.06586801 +-2.062958	not	-0.0712041 +-3.062958	noted	-0.06586801 +-3.062958	noticed	-0.06095213 +-3.062958	notwithstanding	-0.06473851 +-3.062958	now	-0.04349266 +-2.761928	obama	-0.03791448 +-3.062958	observed	-0.06586801 +-1.832509	of	-0.04850956 +-2.761928	old	-0.06436136 +-2.585837	older	-0.1053004 +-3.062958	oldie	-0.04349266 +-2.159868	on	-0.09226183 +-2.585837	one	-0.04900008 +-3.062958	online	-0.0497812 +-3.062958	only	-0.06586801 +-3.062958	or	-0.06586801 +-3.062958	originated	-0.06209152 +-3.062958	osborn	-0.05597997 +-3.062958	out	-0.06322792 +-3.062958	outright	-0.06586801 +-3.062958	own	-0.06586801 +-3.062958	paleoclimatologists	-0.05597997 +-3.062958	passage	-0.06284945 +-3.062958	passing	-0.05597997 +-3.062958	path	-0.06095213 +-3.062958	patterns	-0.05942829 +-3.062958	paul	-0.06436136 +-3.062958	people	-0.06095213 +-2.363988	perhaps	-0.06259563 +-2.761928	phil	-0.2166343 +-3.062958	picked	-0.06511534 +-3.062958	piece	-0.06360606 +-3.062958	place	-0.0497812 +-3.062958	placed	-0.06586801 +-3.062958	play	-0.06322792 +-3.062958	point	-0.06095213 +-3.062958	policy	-0.06322792 +-2.585837	politics	-0.02571439 +-2.363988	population	-0.1001791 +-3.062958	position	-0.06095213 +-3.062958	possible	-0.05597997 +-2.761928	potential	-0.06436136 +-3.062958	power	-0.05789908 +-3.062958	powers	-0.05597997 +-3.062958	precipitous	-0.06586801 +-3.062958	precisely	-0.04071253 +-3.062958	predictable	-0.06586801 +-3.062958	presented	-0.06019088 +-3.062958	preserve	-0.06586801 +-3.062958	previous	-0.06549184 +-3.062958	principalities	-0.05980976 +-3.062958	principles	-0.05942829 +-3.062958	prior	-0.06511534 +-3.062958	probable	-0.06095213 +-2.761928	problem	-0.2120946 +-3.062958	projected	-0.06549184 +-3.062958	properly	-0.06586801 +-3.062958	prove	-0.06586801 +-3.062958	provide	-0.04071253 +-3.062958	provided	-0.05789908 +-3.062958	provocative	-0.06586801 +-3.062958	published	-0.05942829 +-3.062958	push	-0.06511534 +-2.585837	rcs	-0.06133225 +-3.062958	react	-0.05789908 +-3.062958	read	-0.06247065 +-2.761928	readers	-0.06398387 +-3.062958	reading	-0.04349266 +-3.062958	real	-0.06322792 +-3.062958	really	-0.06586801 +-3.062958	realm	-0.05980976 +-2.761928	reason	-0.06360606 +-3.062958	recent	-0.06511534 +-2.761928	recently	-0.1946352 +-3.062958	reconstruction	-0.0497812 +-3.062958	refusal	-0.05942829 +-3.062958	refused	-0.05789908 +-3.062958	related	-0.05789908 +-3.062958	relevant	-0.04349266 +-3.062958	relied	-0.06322792 +-3.062958	religion	-0.05597997 +-3.062958	remained	-0.06586801 +-3.062958	remarked	-0.06095213 +-3.062958	reposting	-0.06473851 +-3.062958	requiring	-0.06322792 +-3.062958	response	-0.05789908 +-3.062958	resulting	-0.06322792 +-3.062958	rev	-0.0497812 +-2.460898	right	-0.04821757 +-3.062958	ring	-0.06586801 +-3.062958	ring-width	-0.06511534 +-2.761928	river	-0.1946352 +-3.062958	said	-0.06436136 +-3.062958	same	-0.06473851 +-3.062958	sample	-0.06586801 +-3.062958	sat	-0.05942829 +-2.460898	schweingruber	-0.09101291 +-3.062958	schweingruber's	-0.06549184 +-2.585837	science	-0.1568045 +-3.062958	script	-0.06322792 +-2.585837	see	-0.1112577 +-3.062958	seized	-0.04071253 +-2.761928	selected	-0.04664831 +-2.585837	selection	-0.1491516 +-3.062958	
sensitive	-0.06511534 +-3.062958	sensitivity	-0.06095213 +-2.585837	series	-0.1314228 +-3.062958	set	-0.05942829 +-3.062958	several	-0.06549184 +-3.062958	shadow	-0.06586801 +-2.761928	shadows	-0.04309659 +-2.585837	shiyatov	-0.06360605 +-3.062958	should	-0.06247065 +-3.062958	similar	-0.06473851 +-3.062958	similarly	-0.06586801 +-3.062958	since	-0.06019088 +-3.062958	size	-0.05597997 +-3.062958	skimmed	-0.06019088 +-2.761928	slowly	-0.04270015 +-3.062958	small	-0.06586801 +-3.062958	so	-0.06549184 +-3.062958	some	-0.06549184 +-3.062958	someone	-0.06586801 +-3.062958	start	-0.06549184 +-3.062958	staunchly	-0.06586801 +-3.062958	struggling	-0.06549184 +-3.062958	studies	-0.06095213 +-2.761928	study	-0.02612664 +-3.062958	stumbled	-0.06586801 +-2.585837	subfossil	-0.06171205 +-3.062958	subsequent	-0.06549184 +-3.062958	subset	-0.05942829 +-3.062958	success	-0.0497812 +-3.062958	supplement	-0.0497812 +-3.062958	supplemented	-0.06360606 +-3.062958	surface	-0.04349266 +-3.062958	take	-0.06436136 +-3.062958	taken	-0.05789908 +-2.761928	taymir	-0.06247065 +-3.062958	temperature	-0.04349266 +-3.062958	tendency	-0.05789908 +-3.062958	terms	-0.05980976 +-3.062958	than	-0.04071253 +-1.91683	that	-0.06692892 +-1.243414	the	-0.08813193 +-3.062958	their	-0.06511534 +-2.761928	themselves	-0.04111078 +-3.062958	there's	-0.06586801 +-2.460898	these	-0.05942829 +-2.460898	they	-0.06398387 +-2.761928	things	-0.06057167 +-3.062958	think	-0.06549184 +-3.062958	thinking	-0.06586801 +-1.858838	this	-0.08175352 +-2.761928	those	-0.06057167 +-3.062958	thought	-0.0497812 +-3.062958	thousand	-0.04349266 +-3.062958	through	-0.04071253 +-2.761928	time	-0.0326698 +-1.720535	to	-0.07930601 +-2.761928	today	-0.04821758 +-3.062958	took	-0.04071253 +-3.062958	towards	-0.06511534 +-2.761928	trans	-0.06549184 +-2.460898	trees	-0.04704115 +-2.761928	trouble	-0.213234 +-3.062958	true	-0.04349266 +-3.062958	trying	-0.05789908 +-2.761928	two	-0.2166343 +-3.062958	unarchived	-0.0497812 +-3.062958	under	-0.06549184 +-3.062958	unintentional	-0.06473851 +-3.062958	unrepresentativeness	-0.05980976 +-3.062958	until	-0.06549184 +-3.062958	unveiled:	-0.06586801 +-2.761928	up	-0.03185729 +-3.062958	upon	-0.06019088 +-2.761928	use	-0.2109523 +-2.363988	used	-0.0545155 +-2.761928	using	-0.02323271 +-3.062958	usual	-0.06586801 +-3.062958	valid	-0.06549184 +-2.761928	variability	-0.03911585 +-2.761928	versions	-0.04428373 +-2.761928	very	-0.06549184 +-3.062958	violence	-0.06586801 +-3.062958	virtually	-0.06586801 +-3.062958	virtue	-0.05980976 +-3.062958	voted	-0.06398387 +-3.062958	warn	-0.06549184 +-3.062958	warnings	-0.04349266 +-2.363988	was	-0.06171205 +-3.062958	way	-0.06549184 +-3.062958	we	-0.06549184 +-3.062958	well	-0.06398387 +-2.284806	were	-0.07866543 +-2.21786	what	-0.02364731 +-3.062958	what's	-0.06549184 +-2.585837	when	-0.06057167 +-2.585837	where	-0.05597997 +-2.460898	which	-0.0403139 +-2.585837	while	-0.03951557 +-3.062958	whose	-0.06586801 +-3.062958	why	-0.06586801 +-3.062958	widths	-0.05597997 +-2.761928	will	-0.06322792 +-3.062958	wise	-0.06549184 +-2.021565	with	-0.08912028 +-3.062958	within	-0.06549184 +-3.062958	without	-0.06586801 +-3.062958	worth	-0.06586801 +-2.460898	would	-0.1303614 +-3.062958	wright's	-0.06586801 +-3.062958	wrote	-0.04071253 +-2.159868	yamal	-0.0719028 +-2.761928	year	-0.04270015 +-3.062958	years	-0.06549184 +-3.062958	yes	-0.04349266 +-3.062958	yesterday	-0.06473851 +-3.062958	yet	-0.04349266 +-3.062958	you	-0.06511534 +-2.761928	your	-0.06511534 + +\2-grams: +-1.15037	! 
as	-0.004049858 +-1.15037	! instead	0.2044696 +-1.995468	" (	-0.005168174 +-1.995468	" -	0.05332709 +-1.995468	" </s> +-1.995468	" as	-0.004049858 +-1.995468	" concrete	0.05332709 +-1.995468	" corridor	0.05332709 +-1.249819	" divergence	0.1451325 +-1.995468	" further	0.008061528 +-1.995468	" i'd	0.05332709 +-1.995468	" success	0.05332709 +-1.995468	" that	-0.008505944 +-1.995468	" the	-0.007702977 +-1.995468	" used	-0.0004517734 +-1.15037	' </s> +-1.15037	' yes	0.05332709 +-1.75243	( and	-0.01063527 +-1.75243	( in	0.006514465 +-1.006781	( mag	0.1451325 +-1.75243	( or	0.05332709 +-1.75243	( phil	0.2044696 +-1.75243	( which	0.00272119 +-1.75243	( while	0.008061528 +-1.006781	) ,	-0.002172916 +-1.75243	) </s> +-1.75243	) acquiesced	0.05332709 +-1.75243	) and	-0.002266581 +-1.75243	) had	-0.0004517734 +-1.75243	) things	0.01894335 +-1.75243	) took	0.05332709 +-2.620192	, 2008	0.05332709 +-2.620192	, 224	0.05332709 +-2.620192	, a	-0.01011507 +-2.620192	, all	0.01894335 +-1.955229	, and	-0.006035992 +-2.620192	, as	0.0389223 +-2.620192	, bob	0.05332709 +-2.620192	, briffa	-0.005168174 +-0.8166095	, but	0.05114232 +-2.620192	, cf	0.05332709 +-2.620192	, cru	0.00272119 +-2.620192	, delete	0.05332709 +-2.620192	, for	-0.002554279 +-2.620192	, from	0.008061528 +-2.620192	, he	-0.0004517734 +-2.620192	, his	0.008061528 +-1.955229	, i	0.008061524 +-2.620192	, if	0.01894335 +-2.620192	, including	0.01894335 +-2.620192	, is	-0.008505944 +-1.874543	, it	-0.0004517762 +-1.874543	, it's	0.01894334 +-2.620192	, kaufman	0.05332709 +-2.620192	, most	0.008061528 +-2.620192	, notwithstanding	0.05332709 +-2.620192	, of	0.007685009 +-2.620192	, on	-0.005168174 +-2.620192	, perhaps	0.04797027 +-2.620192	, requiring	0.05332709 +-2.620192	, since	0.05332709 +-1.955229	, the	0.02331641 +-1.955229	, this	0.01715922 +-2.620192	, until	0.05332709 +-2.620192	, using	0.01894335 +-1.874543	, when	0.03010483 +-2.620192	, where	0.008061528 +-1.874543	, which	0.01894334 +-2.620192	, while	0.008061528 +-2.620192	, yamal	-0.005168174 +-0.8493397	- not	-0.006728992 +-2.482808	. "	-0.008505944 +-2.482808	. '	0.01894335 +-2.482808	. (	-0.005168174 +-2.482808	. )	-0.005168174 +-0.6792259	. </s> +-1.737159	. a	0.003078613 +-2.482808	. actually	0.05332709 +-2.482808	. and	-0.01063527 +-2.482808	. as	-0.004049858 +-1.737159	. briffa	0.03257156 +-2.482808	. but	-0.007295175 +-2.482808	. changing	0.05332709 +-2.482808	. first	0.05332709 +-2.482808	. furthermore	0.05332709 +-1.737159	. however	0.1451325 +-2.482808	. i	-0.006035987 +-2.482808	. in	-0.009490006 +-2.482808	. it	0.0164606 +-2.482808	. perhaps	0.04797027 +-2.482808	. science	0.1193421 +-2.482808	. several	0.05332709 +-2.482808	. the	-0.008591395 +-1.737159	. these	0.01894334 +-1.737159	. this	0.0130633 +-2.482808	. violence	0.05332709 +-2.482808	. what	-0.004049858 +-2.482808	. what's	0.05332709 +-2.482808	. while	0.008061528 +-2.482808	. with	0.05785327 +-2.482808	. wright's	0.05332709 +-1.15037	12 cores	0.008061528 +-1.15037	12 picked	0.05332709 +-0.8493397	17 ring-width	0.05332709 +-1.326461	2000 and	-0.01063527 +-1.326461	2000 may	0.05332709 +-1.326461	2000 presented	0.05332709 +-0.8493397	2002 as	-0.004049858 +-0.8493397	2006 .	-0.0114856 +-0.8493397	2008 )	-0.005168174 +-0.8493397	2009 .	
0.08907277 +-0.8493397	200–400 year	0.01894335 +-0.8493397	224 individual	0.05332709 +-1.995468	<s> '	0.01894335 +-1.995468	<s> as	0.0389223 +-1.995468	<s> briffa's	0.01894335 +-1.995468	<s> but	-0.007295175 +-1.995468	<s> i	-0.006035987 +-1.995468	<s> if	0.01894335 +-1.995468	<s> in	-0.009490006 +-1.995468	<s> next	0.05332709 +-1.249819	<s> perhaps	0.06234263 +-1.249819	<s> the	0.0223057 +-1.995468	<s> this	-0.009059753 +-1.995468	<s> what	-0.004049858 +-1.15037	? "	-0.008505944 +-1.15037	? i	-0.006035987 +-2.191762	a "	0.01222976 +-2.191762	a case	0.05332709 +-2.191762	a comment	0.05332709 +-2.191762	a commenter	0.05332709 +-2.191762	a different	0.01894335 +-1.5268	a few	0.109396 +-2.191762	a generating	0.05332709 +-2.191762	a great	0.2044696 +-2.191762	a mean	0.05332709 +-2.191762	a prior	0.05332709 +-2.191762	a provocative	0.05332709 +-2.191762	a rcs	0.008061528 +-2.191762	a science	0.008061528 +-2.191762	a shadow	0.05332709 +-2.191762	a similar	0.05332709 +-2.191762	a small	0.05332709 +-2.191762	a surface	0.05332709 +-2.191762	a thousand	0.05332709 +-2.191762	a time	0.01894335 +-2.191762	a valid	0.05332709 +-1.4514	about a	-0.01011507 +-1.4514	about my	0.008061528 +-1.4514	about not	-0.006728992 +-1.4514	about potential	0.01894335 +-0.8493397	acquiesced in	-0.009490006 +-0.8493397	actually ,	-0.01187418 +-0.8493397	addition of	-0.009287588 +-0.8493397	admit that	0.04168737 +-0.8493397	affected the	-0.01198488 +-1.15037	against flesh	0.05332709 +-1.15037	against inhomogeneities	0.05332709 +-0.8493397	aging patterns	0.05332709 +-0.8493397	ago ,	-0.008075343 +-0.8493397	ahead you	0.05332709 +-1.15037	al (	-0.005168174 +-1.15037	al 2009	0.05332709 +-1.15037	all of	-0.009287588 +-1.15037	all those	0.01894335 +-0.8493397	all-around naughtiness	0.05332709 +-0.8493397	along the	-0.01198488 +-1.15037	also has	0.00272119 +-1.15037	also know	0.08231446 +-1.15037	always been	0.00272119 +-1.15037	always worth	0.05332709 +-1.54831	an exception	0.05332709 +-1.54831	an extension	0.05332709 +-1.54831	an immense	0.05332709 +-1.54831	an important	0.05332709 +-1.54831	an unintentional	0.05332709 +-0.8493397	analysis has	0.00272119 +-2.280704	and ,	-0.007080218 +-2.280704	and all-around	0.05332709 +-2.280704	and blood	0.05332709 +-2.280704	and briffa	-0.005168174 +-2.280704	and even	0.05332709 +-2.280704	and got	0.05332709 +-2.280704	and hantemirov	0.09388901 +-2.280704	and he	0.06152429 +-2.280704	and i've	0.05332709 +-2.280704	and it	-0.006728992 +-2.280704	and most	0.008061528 +-2.280704	and outright	0.05332709 +-2.280704	and perhaps	-0.0004517734 +-2.280704	and politics	0.008061528 +-2.280704	and potential	0.01894335 +-2.280704	and principalities	0.05332709 +-2.280704	and sat	0.05332709 +-2.280704	and science	0.1193421 +-1.615741	and shiyatov	0.05332708 +-2.280704	and temperature	0.05332709 +-2.280704	and that	-0.008505944 +-1.615741	and the	-0.005814605 +-2.280704	and they	0.00272119 +-0.8493397	anti-divine powers	0.05332709 +-0.8493397	any journal	0.01894335 +-0.8493397	approach to	-0.01011507 +-0.8493397	archive the	-0.01198488 +-0.8493397	are to	-0.01011507 +-1.15037	arkive down	0.05332709 +-1.15037	arkive under	0.05332709 +-1.326461	article ,	-0.007080218 +-1.326461	article .	
-0.004888296 +-1.326461	article on	-0.005168174 +-1.694438	as a	-0.01011507 +-0.9487888	as ca	0.1451325 +-1.694438	as compared	0.05332709 +-1.694438	as follows:	0.05332709 +-1.694438	as it	0.0164606 +-1.694438	as noted	0.05332709 +-0.8493397	asked for	-0.002554279 +-1.326461	at a	-0.01011507 +-1.326461	at precisely	0.05332709 +-1.326461	at the	-0.01198488 +-1.15037	attention ,	0.05896524 +-1.15037	attention .	-0.0114856 +-0.8493397	available ,	-0.008075343 +-0.8493397	average ,	-0.01187418 +-0.8493397	away )	0.03209379 +-0.8493397	ayers and	-0.01063527 +-0.8493397	b ,	-0.01187418 +-0.8493397	back-and-forth yesterday	0.05332709 +-0.8493397	bailie .	-0.0114856 +-1.15037	be happening	0.01894335 +-1.15037	be included	0.05332709 +-0.8493397	because so	0.05332709 +-1.4514	been an	-0.0004517734 +-1.4514	been concerned	0.05332709 +-1.4514	been done	0.05332709 +-1.4514	been projected	0.05332709 +-0.8493397	before ,	-0.01187418 +-1.15037	begin in	-0.009490006 +-1.15037	begin with	-0.007295175 +-0.8493397	being true	0.05332709 +-1.326461	between ring	0.05332709 +-0.580812	between the	-0.06704012 +-1.4514	bias ,	-0.007080218 +-1.4514	bias introduced	0.05332709 +-1.4514	bias towards	0.05332709 +-1.4514	bias would	0.08231446 +-0.8493397	biased selection	0.1193421 +-0.8493397	biblical passage	0.05332709 +-0.8493397	bill ayers	0.05332709 +-0.8493397	blade was	-0.0004517734 +-0.8493397	blood ,	0.05896524 +-0.8493397	bob ?	0.01894335 +-0.8493397	book was	-0.0004517734 +-1.087467	briffa 2000	0.05332708 +-1.75243	briffa 2006	0.05332709 +-1.75243	briffa asked	0.05332709 +-1.75243	briffa et	0.2044696 +-1.75243	briffa to	-0.01011507 +-1.75243	briffa used	-0.0004517734 +-1.15037	briffa's own	0.05332709 +-1.15037	briffa's yamal	-0.005168174 +-1.890732	but ,	-0.01187418 +-1.890732	but anti-divine	0.05332709 +-1.890732	but because	0.05332709 +-1.890732	but between	0.1193421 +-1.890732	but given	0.05332709 +-1.145083	but it	-0.0004517762 +-1.890732	but it's	0.00272119 +-1.890732	but the	-0.01198488 +-1.890732	but this	0.009005655 +-1.890732	but to	0.002916232 +-1.694438	by bill	0.05332709 +-1.694438	by gil	0.05332709 +-1.694438	by hantemirov	0.09388901 +-1.694438	by how	0.05332709 +-1.694438	by magnus	0.05332709 +-0.9487888	by the	-0.01105098 +-0.4047208	ca readers	0.05332709 +-1.15037	can combine	0.05332709 +-1.15037	can see	0.1193421 +-0.8493397	case where	0.008061528 +-0.8493397	cast these	0.00272119 +-0.8493397	catch my	0.1193421 +-0.8493397	caught my	0.1193421 +-0.8493397	caveats on	-0.005168174 +-0.8493397	centennial-scale variability	0.01894335 +-0.8493397	cf .	
-0.0114856 +-0.8493397	change with	-0.007295175 +-0.8493397	changing what	-0.004049858 +-0.8493397	characterizes northern	0.05332709 +-0.8493397	checked earlier	0.05332709 +-1.75243	chronology ,	-0.01187418 +-1.75243	chronology also	0.01894335 +-1.75243	chronology briffa	-0.005168174 +-1.75243	chronology has	0.00272119 +-1.75243	chronology in	-0.009490006 +-1.75243	chronology method	0.00272119 +-1.75243	chronology was	-0.0004517734 +-1.75243	chronology with	-0.007295175 +-0.8493397	church for	-0.002554279 +-0.8493397	cocaine for	-0.002554279 +-0.8493397	collection does	0.05332709 +-0.8493397	combination with	0.05785327 +-0.8493397	combine the	-0.01198488 +-0.8493397	combined with	0.05785327 +-0.8493397	comment by	-0.004049858 +-0.8493397	commentary on	0.03209379 +-0.8493397	commenter remarked	0.05332709 +-0.8493397	comments catch	0.05332709 +-0.8493397	compared to	0.02102831 +-0.8493397	concerned about	0.00272119 +-0.8493397	concrete "	-0.008505944 +-0.8493397	connection with	-0.007295175 +-1.15037	conservatives said	0.05332709 +-1.15037	conservatives were	-0.002554279 +-0.8493397	considered "	-0.008505944 +-0.8493397	consists ,	-0.01187418 +-0.8493397	constructing a	-0.01011507 +-1.15037	control !	0.01894335 +-1.15037	control the	-0.01198488 +-1.326461	cores ,	-0.008075343 +-1.326461	cores .	-0.004888296 +-1.326461	cores were	0.04819728 +-0.8493397	corridor method	0.00272119 +-1.15037	crack about	0.00272119 +-1.15037	crack cocaine	0.05332709 +-0.8493397	crossroads .	-0.0114856 +-0.7057508	cru population	0.07636014 +-1.4514	cru selection	0.008061528 +-1.4514	cru staunchly	0.05332709 +-0.8493397	darkness and	-0.01063527 +-1.803582	data (	-0.005168174 +-1.057933	data .	-0.0100497 +-1.803582	data policy	0.05332709 +-1.803582	data remained	0.05332709 +-1.803582	data set	0.05332709 +-1.803582	data used	0.04797027 +-1.803582	data was	-0.0004517734 +-1.803582	data were	-0.002554279 +-1.15037	day politics	0.008061528 +-1.15037	day to	-0.01011507 +-1.15037	days .	0.08907277 +-1.15037	days ago	0.05332709 +-0.8493397	debt ,	-0.007080218 +-0.8493397	decline is	-0.008505944 +-0.8493397	deep into	0.01894335 +-0.8493397	deeper principles	0.05332709 +-0.8493397	delete a	0.0001907796 +-0.8493397	derived from	0.008061528 +-0.8493397	described in	-0.009490006 +-1.15037	did not	-0.006728992 +-1.15037	did they	0.00272119 +-1.15037	difference .	
0.08907277 +-1.15037	difference between	0.1193421 +-1.15037	different aging	0.05332709 +-1.15037	different data	-0.006035987 +-0.4047208	divergence problem	0.1451325 +-1.15037	do and	-0.002266581 +-1.15037	do indeed	0.05332709 +-0.8493397	does not	0.0164606 +-0.8493397	doing exactly	0.05332709 +-0.8493397	don't really	0.05332709 +-0.8493397	done without	0.05332709 +-0.8493397	doubt what	-0.004049858 +-0.8493397	down to	-0.01011507 +-0.8493397	due just	0.00272119 +-0.8493397	earlier this	-0.009059753 +-0.8493397	editors finally	0.008061528 +-0.8493397	energy ,	0.05896524 +-0.8493397	enormous hs	0.05332709 +-0.4047208	et al	0.05332709 +-0.8493397	even probable	0.05332709 +-0.8493397	every subsequent	0.05332709 +-0.8493397	exactly what	-0.004049858 +-0.8493397	exception to	0.02102831 +-0.8493397	excluding khadyta	0.2044696 +-0.8493397	expect from	0.008061528 +-0.8493397	extension and	-0.01063527 +-0.8493397	factors ,	0.05896524 +-0.8493397	fantasy had	-0.0004517734 +-0.8493397	far more	0.008061528 +-1.326461	few at	0.008061528 +-0.580812	few days	0.05332709 +-1.326461	finally available	0.05332709 +-1.326461	finally placed	0.05332709 +-1.326461	finally seized	0.05332709 +-0.8493397	first ,	-0.01187418 +-0.8493397	flesh and	-0.01063527 +-0.8493397	following: </s> +-0.8493397	follows: </s> +-1.627491	for all	0.01894335 +-1.627491	for an	-0.0004517734 +-1.627491	for excluding	0.05332709 +-1.627491	for him	0.01894335 +-1.627491	for paleoclimatologists	0.05332709 +-1.627491	for we	0.05332709 +-0.8493397	forests .	-0.004888296 +-1.326461	from 200–400	0.05332709 +-1.326461	from a	-0.01011507 +-1.326461	from someone	0.05332709 +-0.8493397	fully thinking	0.05332709 +-1.326461	further ahead	0.05332709 +-1.326461	further along	0.05332709 +-1.326461	further away	0.05332709 +-0.8493397	furthermore ,	-0.007080218 +-0.8493397	future .	-0.0114856 +-0.8493397	generating script	0.05332709 +-0.4047208	get the	-0.06704012 +-0.8493397	ghastly tendency	0.05332709 +-0.8493397	ghostwritten by	-0.004049858 +-0.8493397	gil bailie	0.05332709 +-0.8493397	given the	-0.01198488 +-0.8493397	going to	-0.01011507 +-0.8493397	got used	0.04797027 +-0.4047208	great idea	0.05332709 +-0.8493397	growing .	
0.08907277 +-0.8493397	grows more	0.008061528 +-0.8026608	had a	-0.007295178 +-1.54831	had been	0.00272119 +-1.54831	had in	-0.009490006 +-1.54831	had jurisdiction	0.05332709 +-0.6614985	hantemirov and	-0.5914098 +-1.15037	happening deep	0.05332709 +-1.15037	happening right	0.00272119 +-0.8493397	happens today	0.01894335 +-0.8493397	hard to	-0.01011507 +-0.8493397	hardly know	0.00272119 +-1.4514	has a	-0.01011507 +-1.4514	has always	0.01894335 +-1.4514	has only	0.05332709 +-1.4514	has the	-0.01198488 +-0.8493397	hate to	-0.01011507 +-1.627491	have an	-0.0004517734 +-0.881842	have been	0.01894334 +-1.627491	have relied	0.05332709 +-1.627491	have similarly	0.05332709 +-1.627491	have the	-0.01198488 +-0.8493397	haven't read	0.05332709 +-0.8026608	he is	-0.004049861 +-1.54831	he made	0.05332709 +-1.54831	he would	0.00272119 +-1.54831	he wrote	0.05332709 +-0.8493397	here prove	0.05332709 +-0.8493397	highly possible	0.05332709 +-1.15037	him hate	0.05332709 +-1.15037	him to	0.002916232 +-1.326461	his comments	0.05332709 +-1.326461	his initial	0.05332709 +-1.326461	his precipitous	0.05332709 +-0.8493397	how their	0.05332709 +-0.4047208	however ,	-0.01082908 +-0.8493397	hs blade	0.05332709 +-0.8493397	humanity at	0.008061528 +-1.803582	i can	0.01894335 +-1.803582	i checked	0.05332709 +-1.803582	i had	0.06152429 +-1.803582	i hardly	0.05332709 +-1.803582	i haven't	0.05332709 +-1.803582	i know	0.00272119 +-1.803582	i noticed	0.05332709 +-1.803582	i skimmed	0.05332709 +-1.803582	i stumbled	0.05332709 +-0.8493397	i'd love	0.05332709 +-0.8493397	i've provided	0.05332709 +-1.15037	idea ,	-0.01187418 +-1.15037	idea .	-0.0114856 +-1.15037	if it	-0.006728992 +-1.15037	if the	-0.01198488 +-0.8493397	illusion and	-0.01063527 +-0.8493397	immense energy	0.05332709 +-0.8493397	impact on	-0.005168174 +-0.8493397	important impact	0.05332709 +-1.358963	in a	-0.007295178 +-2.104612	in any	0.05332709 +-2.104612	in briffa	0.02412629 +-2.104612	in briffa's	0.01894335 +-2.104612	in combination	0.05332709 +-2.104612	in connection	0.05332709 +-2.104612	in hantemirov	0.09388901 +-2.104612	in mind	0.05332709 +-2.104612	in one	0.008061528 +-2.104612	in passing	0.05332709 +-2.104612	in response	0.05332709 +-2.104612	in rev	0.05332709 +-2.104612	in terms	0.05332709 +-1.358963	in the	-0.007650165 +-2.104612	in this	-0.009059753 +-2.104612	in virtually	0.05332709 +-0.8493397	included with	0.05785327 +-1.15037	including ,	-0.01187418 +-1.15037	including the	-0.007702977 +-0.8493397	indeed see	0.1193421 +-0.8493397	individual series	0.008061528 +-0.8493397	information finally	0.008061528 +-0.8493397	inhomogeneities ,	0.05896524 +-0.8493397	initial use	0.2044696 +-0.4047208	instead of	0.01149127 +-0.8493397	interannual variability	0.01894335 +-1.15037	into him	0.01894335 +-1.15037	into the	-0.01198488 +-0.8493397	introduced by	-0.004049858 +-1.995468	is ,	-0.007080218 +-1.995468	is always	0.01894335 +-1.995468	is considered	0.05332709 +-1.995468	is derived	0.05332709 +-1.995468	is doing	0.05332709 +-1.995468	is happening	0.01894335 +-1.995468	is highly	0.05332709 +-1.995468	is measured	0.05332709 +-1.995468	is no	0.05332709 +-1.995468	is not	-0.006728992 +-1.995468	is related	0.05332709 +-1.995468	is that	-0.008505944 +-1.995468	is the	-0.01198488 +-1.995468	is within	0.05332709 +-1.84934	it grows	0.05332709 +-1.84934	it has	0.00272119 +-1.184377	it is	0.0004524188 +-1.84934	it just	0.00272119 +-1.84934	it looks	0.05332709 +-1.84934	it originated	0.05332709 +-1.84934	it was	-0.0004517734 +-1.84934	it yet	0.05332709 +-1.4514	it's 
like	0.01894335 +-1.4514	it's much	0.01894335 +-1.4514	it's not	-0.006728992 +-1.4514	it's very	0.01894335 +-0.8493397	its enormous	0.05332709 +-1.15037	journal (	-0.005168174 +-1.15037	journal article	0.008061528 +-0.8493397	jurisdiction .	-0.004888296 +-1.4514	just between	0.008061528 +-1.4514	just keeps	0.05332709 +-1.4514	just one	0.008061528 +-1.4514	just to	0.02102831 +-0.8493397	kaufman et	0.2044696 +-0.8493397	keeps growing	0.05332709 +-0.4047208	khadyta river	0.1451325 +-1.4514	know !	0.01894335 +-0.7057508	know ,	-0.007021053 +-1.4514	know where	0.008061528 +-0.8493397	larch sample	0.05332709 +-1.15037	larches .	0.08907277 +-1.15037	larches were	0.04819728 +-0.8493397	large-scale "	0.01222976 +-1.15037	like crack	0.01894335 +-1.15037	like trying	0.05332709 +-0.8493397	limited size	0.05332709 +-0.8493397	living larches	0.01894335 +-0.8493397	longest and	-0.01063527 +-0.8493397	looking up	0.01894335 +-0.8493397	looks relevant	0.05332709 +-0.8493397	love to	-0.01011507 +-0.8493397	made that	-0.008505944 +-0.4047208	mag )	0.002721187 +-0.8493397	magnitude of	-0.009287588 +-0.8493397	magnus .	-0.0114856 +-0.8493397	makes the	-0.01198488 +-0.8493397	many multiproxy	0.05332709 +-0.8493397	may well	0.05332709 +-0.8493397	mean chronology	-0.005168174 +-0.8493397	measured by	0.0389223 +-0.4047208	measurement data	0.0009555696 +-1.4514	method "	-0.008505944 +-1.4514	method .	-0.004888296 +-1.4514	method that	-0.008505944 +-1.4514	method which	0.00272119 +-0.8493397	methodology warn	0.05332709 +-0.8493397	mind when	0.008061528 +-0.8493397	mix religion	0.05332709 +-1.326461	more "	-0.008505944 +-1.326461	more it	0.0164606 +-1.326461	more slowly	0.01894335 +-0.8493397	morning i	-0.006035987 +-1.326461	most recent	0.05332709 +-1.326461	most recently	0.2044696 +-1.326461	most sensitive	0.05332709 +-1.15037	much further	0.008061528 +-1.15037	much illusion	0.05332709 +-0.8493397	multi-parters ,	-0.01187418 +-0.8493397	multiproxy studies	0.05332709 +-0.8493397	mundane politics	0.008061528 +-0.580812	my attention	0.05332709 +-1.326461	my ghastly	0.05332709 +-0.8493397	national debt	0.05332709 +-0.8493397	naughtiness .	
-0.0114856 +-0.8493397	nettle ,	-0.01187418 +-0.8493397	never properly	0.05332709 +-0.8493397	next ,	-0.008075343 +-0.8493397	no doubt	0.05332709 +-0.8493397	non-robustness observed	0.05332709 +-0.8493397	northern forests	0.05332709 +-1.84934	not be	0.01894335 +-1.84934	not due	0.05332709 +-1.84934	not going	0.05332709 +-1.184377	not have	0.07243546 +-1.84934	not just	0.00272119 +-1.84934	not preserve	0.05332709 +-1.84934	not struggling	0.05332709 +-1.84934	not using	0.01894335 +-0.8493397	noted before	0.05332709 +-0.8493397	noticed that	0.04168737 +-0.8493397	notwithstanding these	0.00272119 +-0.8493397	now ,	0.05896524 +-1.15037	obama ,	-0.007080218 +-1.15037	obama is	-0.008505944 +-0.8493397	observed here	0.05332709 +-2.079789	of 17	0.05332709 +-2.079789	of a	-0.01011507 +-2.079789	of being	0.05332709 +-2.079789	of commentary	0.05332709 +-2.079789	of darkness	0.05332709 +-2.079789	of deeper	0.05332709 +-2.079789	of his	0.008061528 +-2.079789	of interannual	0.05332709 +-2.079789	of mundane	0.05332709 +-2.079789	of old	0.01894335 +-1.33414	of older	0.03455187 +-2.079789	of reposting	0.05332709 +-2.079789	of subfossil	0.008061528 +-1.33414	of the	-0.06704012 +-2.079789	of this	-0.009059753 +-1.15037	old living	0.05332709 +-1.15037	old trees	0.00272119 +-0.6614985	older trees	0.03579502 +-0.8493397	oldie ,	-0.008075343 +-1.006781	on a	-0.007295178 +-1.75243	on average	0.05332709 +-1.75243	on many	0.05332709 +-1.75243	on rcs	0.008061528 +-1.75243	on the	-0.007702977 +-1.006781	on this	-0.005168174 +-1.326461	one .	-0.0114856 +-1.326461	one approach	0.05332709 +-1.326461	one oldie	0.05332709 +-0.8493397	online .	-0.0114856 +-0.8493397	only taken	0.05332709 +-0.8493397	or real	0.05332709 +-0.8493397	originated with	-0.007295175 +-0.8493397	osborn and	-0.01063527 +-0.8493397	out (	-0.005168174 +-0.8493397	outright fantasy	0.05332709 +-0.8493397	own caveats	0.05332709 +-0.8493397	paleoclimatologists and	-0.01063527 +-0.8493397	passage i	-0.006035987 +-0.8493397	passing and	-0.01063527 +-0.8493397	path "	-0.008505944 +-0.8493397	patterns in	0.006514465 +-0.8493397	paul had	-0.0004517734 +-0.8493397	people that	-0.008505944 +-0.8833473	perhaps the	-0.01011507 +-1.54831	perhaps there's	0.05332709 +-1.54831	perhaps they	0.00272119 +-0.4047208	phil trans	0.05332709 +-0.8493397	picked cores	0.008061528 +-0.8493397	piece by	-0.004049858 +-0.8493397	place .	-0.0114856 +-0.8493397	placed online	0.05332709 +-0.8493397	play on	0.03209379 +-0.8493397	point that	-0.008505944 +-0.8493397	policy )	-0.005168174 +-1.326461	politics ,	-0.01187418 +-1.326461	politics .	-0.004888296 +-1.326461	politics are	0.05332709 +-0.8026608	population .	
-0.0100497 +-1.54831	population as	-0.004049858 +-1.54831	population consists	0.05332709 +-1.54831	population instead	0.2044696 +-0.8493397	position that	0.04168737 +-0.8493397	possible and	-0.01063527 +-1.15037	potential bias	0.00272119 +-1.15037	potential unrepresentativeness	0.05332709 +-0.8493397	power to	-0.01011507 +-0.8493397	powers and	-0.01063527 +-0.8493397	precipitous decline	0.05332709 +-0.8493397	precisely the	-0.007702977 +-0.8493397	predictable factors	0.05332709 +-0.8493397	presented this	0.009005655 +-0.8493397	preserve centennial-scale	0.05332709 +-0.8493397	previous journal	0.01894335 +-0.8493397	principalities of	-0.009287588 +-0.8493397	principles in	0.006514465 +-0.8493397	prior selection	0.1193421 +-0.8493397	probable that	0.04168737 +-0.4047208	problem "	-0.004049861 +-0.8493397	projected into	0.01894335 +-0.8493397	properly published	0.05332709 +-0.8493397	prove out	0.05332709 +-0.8493397	provide the	-0.01198488 +-0.8493397	provided a	-0.01011507 +-0.8493397	provocative thought	0.05332709 +-0.8493397	published in	-0.009490006 +-0.8493397	push at	0.008061528 +-1.326461	rcs chronology	-0.005168174 +-1.326461	rcs method	0.00272119 +-1.326461	rcs methodology	0.05332709 +-0.8493397	react to	0.002916232 +-0.8493397	read it	-0.006728992 +-1.15037	readers also	0.01894335 +-1.15037	readers know	0.08231446 +-0.8493397	reading ,	-0.01187418 +-0.8493397	real )	-0.005168174 +-0.8493397	really react	0.05332709 +-0.8493397	realm of	-0.009287588 +-1.15037	reason for	-0.002554279 +-1.15037	reason why	0.05332709 +-0.8493397	recent one	0.008061528 +-0.4047208	recently ,	-0.01082908 +-0.8493397	reconstruction .	-0.0114856 +-0.8493397	refusal in	-0.009490006 +-0.8493397	refused to	-0.01011507 +-0.8493397	related to	-0.01011507 +-0.8493397	relevant ,	-0.008075343 +-0.8493397	relied on	0.03209379 +-0.8493397	religion and	-0.01063527 +-0.8493397	remained unarchived	0.05332709 +-0.8493397	remarked that	-0.008505944 +-0.8493397	reposting just	0.00272119 +-0.8493397	requiring briffa	-0.005168174 +-0.8493397	response to	0.02102831 +-0.8493397	resulting yamal	0.02412629 +-0.8493397	rev .	-0.0114856 +-1.4514	right .	-0.0114856 +-1.4514	right now	0.05332709 +-1.4514	right place	0.05332709 +-1.4514	right time	0.01894335 +-0.8493397	ring widths	0.05332709 +-0.8493397	ring-width series	0.1193421 +-0.4047208	river ,	-0.01082908 +-0.8493397	said he	-0.0004517734 +-0.8493397	same bias	0.00272119 +-0.8493397	sample should	0.05332709 +-0.8493397	sat in	-0.009490006 +-1.4514	schweingruber data	-0.006035987 +-0.7864373	schweingruber population	0.09172077 +-0.8493397	schweingruber's khadyta	0.2044696 +-0.580812	science (	-0.02724335 +-1.326461	science article	0.008061528 +-0.8493397	script )	0.03209379 +-1.326461	see ,	-0.008075343 +-0.580812	see the	-0.01105098 +-0.8493397	seized the	-0.01198488 +-1.15037	selected .	-0.004888296 +-1.15037	selected on	0.03209379 +-1.326461	selection is	-0.008505944 +-0.580812	selection of	0.01149127 +-0.8493397	sensitive series	0.1193421 +-0.8493397	sensitivity is	-0.008505944 +-0.580812	series ,	-0.01082908 +-1.326461	series of	-0.009287588 +-0.8493397	set in	-0.009490006 +-0.8493397	several things	0.01894335 +-0.8493397	shadow play	0.05332709 +-1.15037	shadows .	
-0.0114856 +-1.15037	shadows of	-0.009287588 +-1.326461	shiyatov 2002	0.05332709 +-1.326461	shiyatov themselves	0.01894335 +-1.326461	shiyatov would	0.08231446 +-0.8493397	should not	-0.006728992 +-0.8493397	similar schweingruber	0.00272119 +-0.8493397	similarly affected	0.05332709 +-0.8493397	since this	-0.009059753 +-0.8493397	size and	-0.01063527 +-0.8493397	skimmed this	-0.009059753 +-1.15037	slowly ,	-0.01187418 +-1.15037	slowly get	0.2044696 +-0.8493397	small push	0.05332709 +-0.8493397	so much	0.01894335 +-0.8493397	some reason	0.01894335 +-0.8493397	someone whose	0.05332709 +-0.8493397	start today	0.01894335 +-0.8493397	staunchly refused	0.05332709 +-0.8493397	struggling against	0.01894335 +-0.8493397	studies that	-0.008505944 +-1.15037	study ,	-0.01187418 +-1.15037	study .	0.08907277 +-0.8493397	stumbled upon	0.05332709 +-1.326461	subfossil collection	0.05332709 +-1.326461	subfossil data	0.02685598 +-1.326461	subfossil larches	0.01894335 +-0.8493397	subsequent study	0.01894335 +-0.8493397	subset in	-0.009490006 +-0.8493397	success .	-0.0114856 +-0.8493397	supplement .	0.08907277 +-0.8493397	supplemented by	0.0389223 +-0.8493397	surface ,	-0.008075343 +-0.8493397	take an	-0.0004517734 +-0.8493397	taken a	0.0001907796 +-1.15037	taymir data	-0.006035987 +-1.15037	taymir supplement	0.05332709 +-0.8493397	temperature ,	0.05896524 +-0.8493397	tendency to	-0.01011507 +-0.8493397	terms of	-0.009287588 +-0.8493397	than the	-0.008591395 +-1.995468	that "	-0.008505944 +-1.995468	that cast	0.05332709 +-1.995468	that characterizes	0.05332709 +-1.995468	that have	-0.002554279 +-1.995468	that he	0.06152429 +-1.995468	that his	0.008061528 +-0.9275748	that the	0.03271748 +-1.995468	that they	0.00272119 +-1.995468	that voted	0.05332709 +-1.995468	that way	0.05332709 +-1.995468	that wise	0.05332709 +-2.668884	the "	-0.008505944 +-1.923235	the 12	0.05332709 +-2.668884	the addition	0.05332709 +-1.923235	the arkive	0.05332709 +-2.668884	the back-and-forth	0.05332709 +-2.668884	the biased	0.05332709 +-2.668884	the biblical	0.05332709 +-2.668884	the chronology	-0.005168174 +-1.923235	the conservatives	0.05332709 +-2.668884	the crossroads	0.05332709 +-2.003921	the cru	0.0632299 +-2.668884	the data	0.02685598 +-2.668884	the day	0.01894335 +-2.668884	the difference	0.01894335 +-2.668884	the far	0.05332709 +-2.668884	the following:	0.05332709 +-2.668884	the further	0.008061528 +-2.668884	the future	0.05332709 +-2.668884	the information	0.05332709 +-2.668884	the large-scale	0.05332709 +-2.668884	the longest	0.05332709 +-2.668884	the magnitude	0.05332709 +-2.668884	the measurement	0.2044696 +-2.668884	the more	0.008061528 +-2.668884	the most	0.008061528 +-2.668884	the multi-parters	0.05332709 +-2.668884	the national	0.05332709 +-2.668884	the nettle	0.05332709 +-2.668884	the non-robustness	0.05332709 +-2.668884	the path	0.05332709 +-2.668884	the people	0.05332709 +-2.668884	the phil	0.2044696 +-2.668884	the point	0.05332709 +-2.668884	the position	0.05332709 +-2.668884	the previous	0.05332709 +-2.668884	the rcs	0.008061528 +-2.668884	the realm	0.05332709 +-2.668884	the resulting	0.05332709 +-1.923235	the right	0.01894334 +-2.668884	the same	0.05332709 +-2.003921	the schweingruber	-0.5245172 +-2.668884	the shadows	0.01894335 +-2.668884	the subfossil	0.008061528 +-1.923235	the taymir	0.05332709 +-1.923235	the trouble	0.1451325 +-1.923235	the two	0.1451325 +-2.668884	the use	0.2044696 +-2.668884	the usual	0.05332709 +-2.668884	the very	0.01894335 +-2.668884	the virtue	0.05332709 +-1.120574	the yamal	0.02719982 
+-0.8493397	their cores	0.008061528 +-1.15037	themselves ,	-0.01187418 +-1.15037	themselves were	-0.002554279 +-0.8493397	there's some	0.05332709 +-1.4514	these data	-0.006035987 +-1.4514	these shadows	0.01894335 +-1.4514	these warnings	0.05332709 +-1.4514	these were	-0.002554279 +-1.4514	they can	0.01894335 +-1.4514	they don't	0.05332709 +-1.4514	they expect	0.05332709 +-1.4514	they themselves	0.01894335 +-1.15037	things caught	0.05332709 +-1.15037	things that	-0.008505944 +-0.8493397	think up	0.01894335 +-0.8493397	thinking through	0.05332709 +-2.05346	this analysis	0.05332709 +-2.05346	this article	0.008061528 +-2.05346	this bias	0.00272119 +-1.307811	this chronology	0.002721187 +-2.05346	this difference	0.01894335 +-1.307811	this is	-0.004049861 +-2.05346	this method	0.00272119 +-2.05346	this morning	0.05332709 +-2.05346	this piece	0.05332709 +-2.05346	this refusal	0.05332709 +-2.05346	this study	0.01894335 +-2.05346	this subset	0.05332709 +-2.05346	this will	0.01894335 +-2.05346	this year	0.01894335 +-1.15037	those "	-0.008505944 +-1.15037	those years	0.05332709 +-0.8493397	thought .	-0.0114856 +-0.8493397	thousand ,	0.05896524 +-0.8493397	through the	-0.01198488 +-1.15037	time ,	-0.008075343 +-1.15037	time and	-0.002266581 +-2.191762	to about	0.00272119 +-2.191762	to admit	0.05332709 +-2.191762	to archive	0.05332709 +-1.446113	to begin	0.05332709 +-2.191762	to change	0.05332709 +-2.191762	to constructing	0.05332709 +-2.191762	to control	0.01894335 +-2.191762	to day	0.01894335 +-2.191762	to different	0.01894335 +-2.191762	to get	0.2044696 +-2.191762	to mix	0.05332709 +-2.191762	to provide	0.05332709 +-2.191762	to start	0.05332709 +-1.123869	to the	-0.005761562 +-2.191762	to think	0.05332709 +-2.191762	to those	0.01894335 +-1.446113	to what	0.005001867 +-1.15037	today .	-0.0114856 +-1.15037	today would	0.00272119 +-0.8493397	took the	-0.01198488 +-0.8493397	towards older	0.09388901 +-1.15037	trans b	0.05332709 +-1.15037	trans editors	0.05332709 +-1.4514	trees .	-0.0114856 +-1.4514	trees an	-0.0004517734 +-1.4514	trees described	0.05332709 +-1.4514	trees than	0.05332709 +-0.4047208	trouble with	-0.03998877 +-0.8493397	true ,	-0.01187418 +-0.8493397	trying to	-0.01011507 +-0.4047208	two versions	0.05332709 +-0.8493397	unarchived .	-0.004888296 +-0.8493397	under control	0.01894335 +-0.8493397	unintentional bias	0.00272119 +-0.8493397	unrepresentativeness of	0.007685009 +-0.8493397	until recently	0.2044696 +-0.8493397	unveiled: humanity	0.05332709 +-1.15037	up a	-0.01011507 +-1.15037	up the	-0.01198488 +-0.8493397	upon this	-0.009059753 +-0.4047208	use of	-0.005627823 +-1.54831	used by	-0.004049858 +-0.8833473	used in	0.01371272 +-1.54831	used the	-0.01198488 +-1.15037	using .	0.08907277 +-1.15037	using the	-0.008591395 +-0.8493397	usual predictable	0.05332709 +-0.8493397	valid reason	0.01894335 +-1.15037	variability .	-0.004888296 +-1.15037	variability and	-0.01063527 +-1.15037	versions .	
0.08907277 +-1.15037	versions is	-0.008505944 +-1.15037	very hard	0.05332709 +-1.15037	very limited	0.05332709 +-0.8493397	violence unveiled:	0.05332709 +-0.8493397	virtually every	0.05332709 +-0.8493397	virtue of	-0.009287588 +-0.8493397	voted for	-0.002554279 +-0.8493397	warn against	0.01894335 +-0.8493397	warnings ,	-0.01187418 +-1.54831	was finally	0.008061528 +-1.54831	was ghostwritten	0.05332709 +-1.54831	was like	0.01894335 +-1.54831	was never	0.05332709 +-1.54831	was used	0.04797027 +-0.8493397	way slowly	0.01894335 +-0.8493397	we do	0.01894335 +-0.8493397	well have	0.04819728 +-1.627491	were not	-0.006728992 +-1.627491	were right	0.00272119 +-0.881842	were selected	0.05332709 +-1.627491	were supplemented	0.05332709 +-1.627491	were the	-0.01198488 +-1.694438	what a	-0.01011507 +-1.694438	what did	0.01894335 +-1.694438	what happens	0.05332709 +-1.694438	what is	-0.008505944 +-1.694438	what paul	0.05332709 +-1.694438	what the	-0.007702977 +-1.694438	what will	0.01894335 +-0.8493397	what's your	0.01894335 +-1.326461	when combined	0.05332709 +-1.326461	when he	-0.0004517734 +-1.326461	when i	-0.006035987 +-1.326461	where it's	0.00272119 +-1.326461	where sensitivity	0.05332709 +-1.326461	where to	0.002916232 +-1.4514	which ,	-0.01187418 +-1.4514	which did	0.01894335 +-1.4514	which had	0.06152429 +-1.4514	which makes	0.05332709 +-1.326461	while including	0.01894335 +-1.326461	while looking	0.05332709 +-1.326461	while the	0.02129733 +-0.8493397	whose book	0.05332709 +-0.8493397	why schweingruber's	0.05332709 +-0.8493397	widths and	-0.01063527 +-1.15037	will be	0.01894335 +-1.15037	will have	-0.002554279 +-0.8493397	wise crack	0.01894335 +-1.890732	with .	-0.004888296 +-1.890732	with a	-0.01011507 +-1.890732	with briffa	0.02412629 +-1.890732	with its	0.05332709 +-1.145083	with obama	0.05332709 +-1.890732	with osborn	0.05332709 +-0.8228394	with the	0.02898683 +-0.8493397	within your	0.01894335 +-0.8493397	without fully	0.05332709 +-0.8493397	worth reading	0.05332709 +-1.4514	would do	0.01894335 +-0.7057508	would not	-0.04287655 +-1.4514	would take	0.05332709 +-0.8493397	wright's church	0.05332709 +-0.8493397	wrote the	-0.01198488 +-1.087467	yamal chronology	0.01075652 +-1.75243	yamal data	-0.006035987 +-1.75243	yamal larch	0.05332709 +-1.75243	yamal measurement	0.2044696 +-1.75243	yamal reconstruction	0.05332709 +-1.75243	yamal subfossil	0.008061528 +-1.15037	year ,	-0.008075343 +-1.15037	year old	0.01894335 +-0.8493397	years ?	0.01894335 +-0.8493397	yes ,	-0.01187418 +-0.8493397	yesterday about	0.00272119 +-0.8493397	yet ,	0.05896524 +-0.8493397	you see	0.008061528 +-1.15037	your great	0.2044696 +-1.15037	your power	0.05332709 + +\3-grams: +-1.533073	control ! as +-1.533073	know ! instead +-1.533073	. " i'd +-1.533073	? " </s> +-1.533073	a " divergence +-1.533073	concrete " ( +-1.533073	considered " success +-1.533073	large-scale " divergence +-1.533073	method " used +-1.533073	more " concrete +-1.533073	path " as +-1.834103	problem " - +-1.834103	problem " that +-1.533073	that " the +-1.533073	the " corridor +-1.533073	those " further +-1.533073	. ' </s> +-1.533073	<s> ' yes +-1.533073	" ( or +-1.533073	. ( while +-1.533073	al ( phil +-1.533073	data ( in +-1.533073	journal ( which +-1.533073	out ( and +-0.8145491	science ( mag +-1.533073	. 
) </s> +-1.533073	2008 ) and +-1.533073	away ) , +-1.834103	mag ) acquiesced +-1.834103	mag ) took +-1.533073	policy ) had +-1.533073	real ) things +-1.533073	script ) , +-1.834103	) , it's +-1.834103	) , this +-1.533073	actually , all +-1.533073	ago , i +-1.533073	and , when +-1.533073	article , it +-1.533073	attention , but +-1.533073	available , this +-1.533073	average , of +-1.533073	b , 2008 +-1.533073	before , briffa +-1.533073	bias , when +-1.533073	blood , but +-1.533073	but , notwithstanding +-1.533073	chronology , 224 +-1.533073	consists , on +-1.533073	cores , this +-1.533073	debt , which +-1.533073	energy , but +-1.533073	factors , but +-1.533073	first , a +-1.533073	furthermore , it +-1.834103	however , as +-1.834103	however , using +-1.533073	idea , bob +-1.533073	including , most +-1.533073	inhomogeneities , but +-1.533073	is , it's +-1.834103	know , the +-1.834103	know , until +-1.533073	multi-parters , delete +-1.533073	nettle , requiring +-1.533073	next , i +-1.533073	now , but +-1.533073	obama , which +-1.533073	oldie , i +-1.533073	politics , he +-1.533073	reading , cf +-1.834103	recently , cru +-1.834103	recently , kaufman +-1.533073	relevant , and +-1.834103	river , while +-1.834103	river , yamal +-1.533073	see , the +-1.834103	series , from +-1.834103	series , where +-1.533073	slowly , is +-1.533073	study , including +-1.533073	surface , and +-1.533073	temperature , but +-1.533073	themselves , since +-1.533073	thousand , but +-1.533073	time , and +-1.533073	true , for +-1.533073	warnings , his +-1.533073	which , if +-1.533073	year , the +-1.533073	yes , perhaps +-1.533073	yet , but +-1.533073	" - not +-1.533073	2006 . while +-1.533073	2009 . </s> +-1.533073	article . however +-1.533073	attention . first +-1.533073	bailie . i +-1.533073	cf . violence +-1.533073	cores . briffa +-1.533073	crossroads . ) +-1.834103	data . as +-1.834103	data . but +-1.533073	days . </s> +-1.533073	difference . </s> +-1.533073	forests . however +-1.533073	future . changing +-1.533073	growing . </s> +-1.533073	idea . what's +-1.533073	jurisdiction . briffa +-1.533073	larches . </s> +-1.533073	magnus . actually +-1.533073	method . this +-1.533073	naughtiness . ( +-1.533073	one . in +-1.533073	online . with +-1.533073	place . ' +-1.533073	politics . this +-1.834103	population . it +-1.834103	population . the +-1.533073	reconstruction . science +-1.533073	rev . wright's +-1.533073	right . what +-1.533073	selected . these +-1.533073	shadows . and +-1.533073	study . </s> +-1.533073	success . " +-1.533073	supplement . </s> +-1.533073	thought . furthermore +-1.533073	today . several +-1.533073	trees . perhaps +-1.533073	unarchived . a +-1.533073	using . </s> +-1.533073	variability . these +-1.533073	versions . </s> +-1.533073	with . a +-1.834103	the 12 cores +-1.834103	the 12 picked +-1.533073	of 17 ring-width +-2.010194	briffa 2000 and +-2.010194	briffa 2000 may +-2.010194	briffa 2000 presented +-1.533073	shiyatov 2002 as +-1.533073	briffa 2006 . +-1.533073	, 2008 ) +-1.533073	al 2009 . +-1.533073	from 200–400 year +-1.533073	, 224 individual +-1.533073	bob ? i +-1.533073	years ? " +-1.533073	, a comment +-1.834103	. a commenter +-1.834103	. 
a few +-1.533073	about a thousand +-1.533073	as a shadow +-1.533073	at a time +-1.533073	constructing a mean +-1.533073	delete a few +-1.533073	from a prior +-1.834103	had a different +-1.834103	had a great +-1.533073	has a " +-1.834103	in a case +-1.834103	in a science +-1.533073	of a similar +-1.834103	on a rcs +-1.834103	on a surface +-1.533073	provided a generating +-1.533073	taken a few +-1.533073	up a valid +-1.533073	what a provocative +-1.533073	with a small +-1.533073	concerned about potential +-1.533073	crack about not +-1.533073	to about a +-1.533073	yesterday about my +-1.533073	) acquiesced in +-1.533073	. actually , +-1.533073	the addition of +-1.533073	to admit that +-1.533073	similarly affected the +-1.533073	struggling against flesh +-1.533073	warn against inhomogeneities +-1.533073	different aging patterns +-1.533073	days ago , +-1.533073	further ahead you +-1.834103	et al ( +-1.834103	et al 2009 +-1.533073	, all of +-1.533073	for all those +-1.533073	and all-around naughtiness +-1.533073	further along the +-1.533073	chronology also has +-1.533073	readers also know +-1.533073	has always been +-1.533073	is always worth +-1.533073	been an exception +-1.533073	for an extension +-1.533073	have an important +-1.533073	take an immense +-1.533073	trees an unintentional +-1.533073	this analysis has +-1.533073	( and i've +-1.533073	) and the +-2.010194	, and he +-2.010194	, and that +-2.010194	, and they +-1.533073	. and perhaps +-1.533073	2000 and science +-1.533073	ayers and sat +-1.533073	darkness and all-around +-1.533073	do and the +-1.533073	extension and , +-1.533073	flesh and blood +-0.1249387	hantemirov and shiyatov +-1.533073	illusion and outright +-1.533073	longest and most +-1.533073	osborn and briffa +-1.533073	paleoclimatologists and got +-1.533073	passing and it +-1.533073	possible and even +-1.533073	powers and principalities +-1.533073	religion and politics +-1.533073	size and potential +-1.533073	time and the +-1.533073	variability and hantemirov +-1.533073	widths and temperature +-1.533073	but anti-divine powers +-1.533073	in any journal +-1.533073	one approach to +-1.533073	to archive the +-1.533073	politics are to +-1.834103	the arkive down +-1.834103	the arkive under +-1.533073	journal article . +-1.533073	science article , +-1.533073	this article on +-1.533073	! as it +-1.533073	" as a +-1.533073	, as ca +-1.533073	. as noted +-1.533073	2002 as follows: +-1.533073	<s> as ca +-1.533073	population as compared +-1.533073	briffa asked for +-1.533073	few at a +-1.533073	humanity at the +-1.533073	push at precisely +-1.834103	my attention , +-1.834103	my attention . +-1.533073	finally available , +-1.533073	on average , +-1.533073	further away ) +-1.533073	bill ayers and +-1.533073	trans b , +-1.533073	the back-and-forth yesterday +-1.533073	gil bailie . +-1.533073	not be included +-1.533073	will be happening +-1.533073	but because so +-1.533073	always been an +-1.533073	had been projected +-1.834103	have been concerned +-1.834103	have been done +-1.533073	noted before , +-1.834103	to begin in +-1.834103	to begin with +-1.533073	of being true +-1.533073	but between the +-1.533073	difference between the +-1.533073	just between ring +-1.533073	potential bias introduced +-1.533073	same bias towards +-1.533073	this bias would +-1.533073	unintentional bias , +-1.533073	the biased selection +-1.533073	the biblical passage +-1.533073	by bill ayers +-1.533073	hs blade was +-1.533073	and blood , +-1.533073	, bob ? 
+-1.533073	whose book was +-1.533073	, briffa asked +-1.834103	. briffa 2000 +-1.834103	. briffa used +-1.533073	and briffa 2006 +-1.533073	chronology briffa et +-1.533073	in briffa 2000 +-1.533073	requiring briffa to +-1.533073	with briffa 2000 +-1.533073	<s> briffa's own +-1.533073	in briffa's yamal +-2.487315	, but , +-2.487315	, but anti-divine +-2.487315	, but because +-2.487315	, but between +-1.467762	, but it +-2.487315	, but the +-2.487315	, but this +-2.487315	, but to +-1.533073	. but given +-1.533073	<s> but it's +-1.533073	comment by magnus +-1.533073	ghostwritten by bill +-1.533073	introduced by how +-1.533073	measured by the +-1.533073	piece by gil +-1.533073	supplemented by the +-1.533073	used by hantemirov +-0.8145491	as ca readers +-1.533073	i can combine +-1.533073	they can see +-1.533073	a case where +-1.533073	that cast these +-1.533073	comments catch my +-1.533073	things caught my +-1.533073	own caveats on +-1.533073	preserve centennial-scale variability +-1.533073	, cf . +-1.533073	to change with +-1.533073	. changing what +-1.533073	that characterizes northern +-1.533073	i checked earlier +-1.533073	mean chronology , +-1.533073	rcs chronology method +-1.533073	the chronology briffa +-1.834103	this chronology also +-1.834103	this chronology in +-2.010194	yamal chronology has +-2.010194	yamal chronology was +-2.010194	yamal chronology with +-1.533073	wright's church for +-1.533073	crack cocaine for +-1.533073	subfossil collection does +-1.533073	in combination with +-1.533073	can combine the +-1.533073	when combined with +-1.533073	a comment by +-1.533073	of commentary on +-1.533073	a commenter remarked +-1.533073	his comments catch +-1.533073	as compared to +-1.533073	been concerned about +-1.533073	" concrete " +-1.533073	in connection with +-1.834103	the conservatives said +-1.834103	the conservatives were +-1.533073	is considered " +-1.533073	population consists , +-1.533073	to constructing a +-1.533073	to control the +-1.533073	under control ! +-1.533073	12 cores . +-1.533073	picked cores , +-1.533073	their cores were +-1.533073	" corridor method +-1.533073	like crack cocaine +-1.533073	wise crack about +-1.533073	the crossroads . +-1.533073	, cru staunchly +-0.9906404	the cru population +-2.010194	the cru selection +-1.533073	of darkness and +-1.533073	different data policy +-1.834103	measurement data remained +-1.834103	measurement data used +-1.533073	schweingruber data set +-1.533073	subfossil data . +-1.533073	taymir data ( +-1.533073	the data . +-1.533073	these data were +-1.533073	yamal data was +-1.533073	the day to +-1.533073	to day politics +-1.834103	few days . +-1.834103	few days ago +-1.533073	national debt , +-1.533073	precipitous decline is +-1.533073	happening deep into +-1.533073	of deeper principles +-1.533073	, delete a +-1.533073	is derived from +-1.533073	trees described in +-1.533073	what did they +-1.533073	which did not +-1.533073	the difference between +-1.533073	this difference . 
+-1.533073	a different data +-1.533073	to different aging +-0.8145491	" divergence problem +-1.533073	we do indeed +-1.533073	would do and +-1.533073	collection does not +-1.533073	is doing exactly +-1.533073	they don't really +-1.533073	been done without +-1.533073	no doubt what +-1.533073	arkive down to +-1.533073	not due just +-1.533073	checked earlier this +-1.533073	trans editors finally +-1.533073	immense energy , +-1.533073	its enormous hs +-1.533073	briffa et al +-1.533073	kaufman et al +-1.533073	and even probable +-1.533073	virtually every subsequent +-1.533073	doing exactly what +-1.533073	an exception to +-1.533073	for excluding khadyta +-1.533073	they expect from +-1.533073	an extension and +-1.533073	predictable factors , +-1.533073	outright fantasy had +-1.533073	the far more +-2.010194	a few at +-0.9906404	a few days +-1.533073	editors finally seized +-1.533073	information finally available +-1.533073	was finally placed +-1.533073	. first , +-1.533073	against flesh and +-1.533073	the following: </s> +-1.533073	as follows: </s> +-1.533073	, for we +-1.533073	asked for an +-1.533073	church for all +-1.533073	cocaine for paleoclimatologists +-1.533073	reason for excluding +-1.533073	voted for him +-1.533073	northern forests . +-1.533073	, from 200–400 +-1.533073	derived from a +-1.533073	expect from someone +-1.533073	without fully thinking +-1.533073	" further along +-1.533073	much further away +-1.533073	the further ahead +-1.533073	. furthermore , +-1.533073	the future . +-1.533073	a generating script +-1.533073	slowly get the +-1.533073	to get the +-1.533073	my ghastly tendency +-1.533073	was ghostwritten by +-1.533073	by gil bailie +-1.533073	but given the +-1.533073	not going to +-1.533073	and got used +-1.533073	a great idea +-1.533073	your great idea +-1.533073	keeps growing . +-1.533073	it grows more +-1.533073	) had jurisdiction +-1.533073	fantasy had been +-1.533073	i had a +-1.533073	paul had in +-1.533073	which had a +-1.533073	and hantemirov and +-1.533073	by hantemirov and +-1.533073	in hantemirov and +-1.533073	be happening deep +-1.533073	is happening right +-1.533073	what happens today +-1.533073	very hard to +-1.533073	i hardly know +-1.533073	also has a +-1.533073	analysis has only +-1.533073	chronology has always +-1.533073	it has the +-1.533073	him hate to +-2.010194	not have been +-2.010194	not have similarly +-2.010194	not have the +-1.533073	that have relied +-1.533073	well have been +-1.533073	will have an +-1.533073	i haven't read +-1.533073	, he wrote +-1.533073	and he is +-1.533073	said he would +-1.533073	that he is +-1.533073	when he made +-1.533073	observed here prove +-1.533073	is highly possible +-1.533073	for him hate +-1.533073	into him to +-1.533073	, his initial +-1.533073	of his comments +-1.533073	that his precipitous +-1.533073	by how their +-0.8145491	. however , +-1.533073	enormous hs blade +-1.533073	unveiled: humanity at +-2.010194	, i can +-2.010194	, i noticed +-2.010194	, i skimmed +-1.533073	. i haven't +-1.533073	<s> i hardly +-1.533073	? i know +-1.533073	morning i had +-1.533073	passage i stumbled +-1.533073	when i checked +-1.533073	" i'd love +-1.533073	and i've provided +-1.834103	great idea , +-1.834103	great idea . +-1.533073	, if it +-1.533073	<s> if the +-1.533073	much illusion and +-1.533073	an immense energy +-1.533073	important impact on +-1.533073	an important impact +-1.533073	( in a +-1.533073	. 
in response +-1.533073	<s> in one +-1.533073	acquiesced in this +-1.533073	begin in terms +-1.533073	chronology in passing +-1.533073	described in hantemirov +-1.533073	had in mind +-1.533073	patterns in the +-1.533073	principles in the +-1.533073	published in any +-1.533073	refusal in connection +-1.533073	sat in rev +-1.533073	set in combination +-1.533073	subset in briffa +-2.010194	used in a +-2.010194	used in briffa's +-2.010194	used in virtually +-1.533073	be included with +-1.533073	, including , +-1.533073	while including the +-1.533073	do indeed see +-1.533073	224 individual series +-1.533073	the information finally +-1.533073	against inhomogeneities , +-1.533073	his initial use +-1.533073	! instead of +-1.533073	population instead of +-1.533073	of interannual variability +-1.533073	deep into the +-1.533073	projected into him +-1.533073	bias introduced by +-1.533073	, is considered +-1.533073	decline is not +-1.834103	he is always +-1.834103	he is doing +-2.010194	it is , +-2.010194	it is highly +-2.010194	it is within +-1.533073	obama is that +-1.533073	selection is derived +-1.533073	sensitivity is measured +-1.834103	this is no +-1.834103	this is the +-1.533073	versions is related +-1.533073	what is happening +-1.834103	, it has +-1.834103	, it originated +-1.533073	. it is +-1.533073	and it was +-1.533073	as it is +-1.834103	but it just +-1.834103	but it looks +-1.533073	if it grows +-1.533073	more it is +-1.533073	read it yet +-1.834103	, it's like +-1.834103	, it's very +-1.533073	but it's not +-1.533073	where it's much +-1.533073	with its enormous +-1.533073	any journal article +-1.533073	previous journal ( +-1.533073	had jurisdiction . +-1.533073	due just to +-1.533073	it just keeps +-1.533073	not just between +-1.533073	reposting just one +-1.533073	, kaufman et +-1.533073	just keeps growing +-1.533073	excluding khadyta river +-1.533073	schweingruber's khadyta river +-1.533073	also know , +-1.533073	hardly know where +-1.533073	i know ! +-1.533073	readers know , +-1.533073	yamal larch sample +-1.533073	living larches . +-1.533073	subfossil larches were +-1.533073	the large-scale " +-1.533073	it's like trying +-1.533073	was like crack +-1.533073	very limited size +-1.533073	old living larches +-1.533073	the longest and +-1.533073	while looking up +-1.533073	it looks relevant +-1.533073	i'd love to +-1.533073	he made that +-0.8145491	( mag ) +-1.533073	the magnitude of +-1.533073	by magnus . +-1.533073	which makes the +-1.533073	on many multiproxy +-1.533073	2000 may well +-1.533073	a mean chronology +-1.533073	is measured by +-1.533073	the measurement data +-1.533073	yamal measurement data +-1.533073	chronology method that +-1.533073	corridor method " +-1.533073	rcs method . +-1.533073	this method which +-1.533073	rcs methodology warn +-1.533073	in mind when +-1.533073	to mix religion +-1.533073	far more " +-1.533073	grows more slowly +-1.533073	the more it +-1.533073	this morning i +-1.533073	, most recently +-1.533073	and most sensitive +-1.533073	the most recent +-1.533073	it's much further +-1.533073	so much illusion +-1.533073	the multi-parters , +-1.533073	many multiproxy studies +-1.533073	of mundane politics +-1.533073	about my ghastly +-1.533073	catch my attention +-1.533073	caught my attention +-1.533073	the national debt +-1.533073	all-around naughtiness . 
+-1.533073	the nettle , +-1.533073	was never properly +-1.533073	<s> next , +-1.533073	is no doubt +-1.533073	the non-robustness observed +-1.533073	characterizes northern forests +-1.533073	- not just +-1.533073	about not struggling +-1.533073	did not preserve +-1.533073	does not have +-1.533073	is not due +-1.533073	it's not going +-1.533073	should not be +-1.533073	were not using +-0.8145491	would not have +-1.533073	as noted before +-1.533073	i noticed that +-1.533073	, notwithstanding these +-1.533073	right now , +-1.834103	with obama , +-1.834103	with obama is +-1.533073	non-robustness observed here +-1.533073	, of older +-1.533073	addition of 17 +-1.533073	all of his +-1.834103	instead of reposting +-1.834103	instead of the +-1.533073	magnitude of interannual +-1.533073	principalities of darkness +-1.533073	realm of mundane +-1.834103	selection of old +-1.834103	selection of older +-1.533073	series of subfossil +-1.533073	shadows of deeper +-1.533073	terms of commentary +-1.533073	unrepresentativeness of the +-1.834103	use of a +-1.834103	use of this +-1.533073	virtue of being +-1.533073	of old trees +-1.533073	year old living +-0.8145491	of older trees +-1.533073	towards older trees +-1.533073	one oldie , +-1.533073	, on average +-1.533073	article on the +-1.533073	caveats on rcs +-1.533073	commentary on this +-1.533073	impact on many +-1.533073	play on a +-1.533073	relied on this +-1.533073	selected on a +-1.533073	in one approach +-1.533073	just one oldie +-1.533073	recent one . +-1.533073	placed online . +-1.533073	has only taken +-1.533073	( or real +-1.533073	it originated with +-1.533073	with osborn and +-1.533073	prove out ( +-1.533073	and outright fantasy +-1.533073	briffa's own caveats +-1.533073	for paleoclimatologists and +-1.533073	biblical passage i +-1.533073	in passing and +-1.533073	the path " +-1.533073	aging patterns in +-1.533073	what paul had +-1.533073	the people that +-1.533073	, perhaps the +-1.533073	. perhaps the +-1.834103	<s> perhaps the +-1.834103	<s> perhaps there's +-1.533073	and perhaps they +-1.533073	( phil trans +-1.533073	the phil trans +-1.533073	12 picked cores +-1.533073	this piece by +-1.533073	right place . +-1.533073	finally placed online +-1.533073	shadow play on +-1.533073	the point that +-1.533073	data policy ) +-1.533073	and politics , +-1.533073	day politics are +-1.533073	mundane politics . +-1.834103	cru population . +-1.834103	cru population consists +-2.010194	schweingruber population . 
+-2.010194	schweingruber population as +-2.010194	schweingruber population instead +-1.533073	the position that +-1.533073	highly possible and +-1.533073	about potential bias +-1.533073	and potential unrepresentativeness +-1.533073	your power to +-1.533073	anti-divine powers and +-1.533073	his precipitous decline +-1.533073	at precisely the +-1.533073	usual predictable factors +-1.533073	2000 presented this +-1.533073	not preserve centennial-scale +-1.533073	the previous journal +-1.533073	and principalities of +-1.533073	deeper principles in +-1.533073	a prior selection +-1.533073	even probable that +-0.8145491	divergence problem " +-1.533073	been projected into +-1.533073	never properly published +-1.533073	here prove out +-1.533073	to provide the +-1.533073	i've provided a +-1.533073	a provocative thought +-1.533073	properly published in +-1.533073	small push at +-1.533073	a rcs chronology +-1.533073	on rcs methodology +-1.533073	the rcs method +-1.533073	really react to +-1.533073	haven't read it +-1.834103	ca readers also +-1.834103	ca readers know +-1.533073	worth reading , +-1.533073	or real ) +-1.533073	don't really react +-1.533073	the realm of +-1.533073	some reason why +-1.533073	valid reason for +-1.533073	most recent one +-1.533073	most recently , +-1.533073	until recently , +-1.533073	yamal reconstruction . +-1.533073	this refusal in +-1.533073	staunchly refused to +-1.533073	is related to +-1.533073	looks relevant , +-1.533073	have relied on +-1.533073	mix religion and +-1.533073	data remained unarchived +-1.533073	commenter remarked that +-1.533073	of reposting just +-1.533073	, requiring briffa +-1.533073	in response to +-1.533073	the resulting yamal +-1.533073	in rev . +-1.533073	happening right now +-1.834103	the right place +-1.834103	the right time +-1.533073	were right . +-1.533073	between ring widths +-1.533073	17 ring-width series +-0.8145491	khadyta river , +-1.533073	conservatives said he +-1.533073	the same bias +-1.533073	larch sample should +-1.533073	and sat in +-1.533073	similar schweingruber data +-0.1249387	the schweingruber population +-1.533073	why schweingruber's khadyta +-1.533073	. science ( +-1.533073	a science article +-1.533073	and science ( +-1.533073	generating script ) +-1.533073	can see the +-1.533073	indeed see the +-1.533073	you see , +-1.533073	finally seized the +-1.834103	were selected . +-1.834103	were selected on +-1.533073	biased selection of +-1.533073	cru selection is +-1.533073	prior selection of +-1.533073	most sensitive series +-1.533073	where sensitivity is +-1.533073	individual series of +-1.533073	ring-width series , +-1.533073	sensitive series , +-1.533073	data set in +-1.533073	. several things +-1.533073	a shadow play +-1.533073	the shadows of +-1.533073	these shadows . +-2.010194	and shiyatov 2002 +-2.010194	and shiyatov themselves +-2.010194	and shiyatov would +-1.533073	sample should not +-1.533073	a similar schweingruber +-1.533073	have similarly affected +-1.533073	, since this +-1.533073	limited size and +-1.533073	i skimmed this +-1.533073	more slowly , +-1.533073	way slowly get +-1.533073	a small push +-1.533073	because so much +-1.533073	there's some reason +-1.533073	from someone whose +-1.533073	to start today +-1.533073	cru staunchly refused +-1.533073	not struggling against +-1.533073	multiproxy studies that +-1.533073	subsequent study , +-1.533073	this study . 
+-1.533073	i stumbled upon +-1.533073	of subfossil larches +-1.533073	the subfossil collection +-1.533073	yamal subfossil data +-1.533073	every subsequent study +-1.533073	this subset in +-1.533073	" success . +-1.533073	taymir supplement . +-1.533073	were supplemented by +-1.533073	a surface , +-1.533073	would take an +-1.533073	only taken a +-1.834103	the taymir data +-1.834103	the taymir supplement +-1.533073	and temperature , +-1.533073	ghastly tendency to +-1.533073	in terms of +-1.533073	trees than the +-1.533073	" that characterizes +-1.533073	admit that the +-1.533073	and that way +-1.533073	is that he +-1.533073	made that wise +-1.533073	method that they +-1.533073	noticed that the +-1.533073	people that voted +-1.533073	point that his +-1.533073	position that the +-1.533073	probable that the +-1.533073	remarked that " +-1.533073	studies that have +-1.533073	things that cast +-1.533073	" the trouble +-2.010194	, the more +-2.010194	, the resulting +-2.010194	, the yamal +-1.533073	. the cru +-1.834103	<s> the subfossil +-1.834103	<s> the yamal +-1.533073	affected the " +-1.533073	along the path +-2.010194	and the people +-2.010194	and the phil +-2.010194	and the right +-1.533073	archive the data +-1.533073	at the crossroads +-0.8145491	between the two +-1.533073	but the further +-1.834103	by the addition +-1.834103	by the magnitude +-1.533073	combine the multi-parters +-1.533073	control the national +-0.8145491	get the arkive +-1.533073	given the use +-1.533073	has the virtue +-1.533073	have the same +-1.533073	if the non-robustness +-1.834103	in the realm +-1.834103	in the schweingruber +-1.533073	including the taymir +-1.533073	into the future +-1.533073	is the most +-1.533073	makes the point +-0.8145491	of the 12 +-1.533073	on the trouble +-2.010194	perhaps the biased +-2.010194	perhaps the day +-2.010194	perhaps the difference +-1.533073	precisely the right +-1.533073	provide the measurement +-1.834103	see the far +-1.834103	see the shadows +-1.533073	seized the nettle +-1.533073	than the schweingruber +-2.135133	that the conservatives +-2.135133	that the cru +-2.135133	that the previous +-2.135133	that the yamal +-1.533073	through the very +-2.135133	to the back-and-forth +-2.135133	to the cru +-2.135133	to the large-scale +-2.135133	to the usual +-1.533073	took the position +-1.533073	up the biblical +-1.533073	used the chronology +-1.533073	using the schweingruber +-1.533073	were the longest +-1.533073	what the conservatives +-1.533073	while the yamal +-2.135133	with the information +-2.135133	with the rcs +-2.135133	with the taymir +-2.135133	with the yamal +-1.533073	wrote the following: +-1.533073	how their cores +-1.533073	shiyatov themselves , +-1.533073	they themselves were +-1.533073	perhaps there's some +-1.834103	. these data +-1.834103	. these were +-1.533073	cast these shadows +-1.533073	notwithstanding these warnings +-1.533073	and they can +-1.533073	did they expect +-1.533073	perhaps they don't +-1.533073	that they themselves +-1.533073	) things that +-1.533073	several things caught +-1.533073	to think up +-1.533073	fully thinking through +-2.010194	, this analysis +-2.010194	, this chronology +-2.010194	, this will +-1.834103	. this bias +-1.834103	. 
this is +-1.533073	<s> this morning +-1.533073	but this is +-1.533073	earlier this year +-1.533073	in this refusal +-1.533073	of this subset +-1.834103	on this difference +-1.834103	on this study +-1.533073	presented this chronology +-1.533073	since this method +-1.533073	skimmed this article +-1.533073	upon this piece +-1.533073	all those years +-1.533073	to those " +-1.533073	provocative thought . +-1.533073	a thousand , +-1.533073	thinking through the +-1.533073	a time , +-1.533073	right time and +-1.533073	approach to constructing +-1.533073	are to those +-1.533073	briffa to archive +-1.533073	but to what +-1.533073	compared to the +-1.533073	day to day +-1.533073	down to about +-1.533073	exception to the +-1.533073	going to start +-1.533073	hard to think +-1.533073	hate to admit +-1.533073	him to begin +-1.533073	just to the +-1.533073	love to get +-1.533073	power to change +-1.533073	react to what +-1.533073	refused to provide +-1.533073	related to different +-1.533073	response to the +-1.533073	tendency to mix +-1.533073	trying to control +-1.533073	where to begin +-1.533073	happens today would +-1.533073	start today . +-1.533073	) took the +-1.533073	bias towards older +-1.834103	phil trans b +-1.834103	phil trans editors +-1.533073	old trees described +-2.010194	older trees . +-2.010194	older trees an +-2.010194	older trees than +-0.8145491	the trouble with +-1.533073	being true , +-1.533073	like trying to +-0.8145491	the two versions +-1.533073	remained unarchived . +-1.533073	arkive under control +-1.533073	an unintentional bias +-1.533073	potential unrepresentativeness of +-1.533073	, until recently +-1.533073	violence unveiled: humanity +-1.533073	looking up the +-1.533073	think up a +-1.533073	stumbled upon this +-1.533073	initial use of +-1.533073	the use of +-1.533073	" used by +-1.533073	briffa used the +-1.533073	data used in +-1.533073	got used in +-1.533073	was used in +-1.533073	, using the +-1.533073	not using . +-1.533073	the usual predictable +-1.533073	a valid reason +-1.533073	centennial-scale variability and +-1.533073	interannual variability . +-1.834103	two versions . +-1.834103	two versions is +-1.533073	it's very hard +-1.533073	the very limited +-1.533073	. violence unveiled: +-1.533073	in virtually every +-1.533073	the virtue of +-1.533073	that voted for +-1.533073	methodology warn against +-1.533073	these warnings , +-1.533073	blade was like +-1.533073	book was ghostwritten +-1.533073	chronology was used +-1.533073	data was finally +-1.533073	it was never +-1.533073	that way slowly +-1.533073	for we do +-1.533073	may well have +-1.533073	conservatives were right +-1.533073	cores were selected +-1.533073	data were supplemented +-1.533073	larches were selected +-1.533073	themselves were not +-1.533073	these were the +-1.533073	. what did +-1.533073	<s> what a +-1.533073	changing what happens +-1.533073	doubt what paul +-1.533073	exactly what the +-1.834103	to what is +-1.834103	to what will +-1.533073	. what's your +-1.834103	, when combined +-1.834103	, when i +-1.533073	mind when he +-1.533073	, where sensitivity +-1.533073	case where it's +-1.533073	know where to +-1.533073	( which had +-1.834103	, which , +-1.834103	, which makes +-1.533073	method which did +-1.533073	( while looking +-1.533073	, while including +-1.533073	. while the +-1.533073	someone whose book +-1.533073	reason why schweingruber's +-1.533073	ring widths and +-1.533073	this will have +-1.533073	what will be +-1.533073	that wise crack +-1.533073	. with the +-1.533073	begin with . 
+-1.533073	change with a +-1.533073	chronology with its +-1.533073	combination with the +-1.533073	combined with the +-1.533073	connection with osborn +-1.533073	included with the +-1.533073	originated with briffa +-0.8145491	trouble with obama +-1.533073	is within your +-1.533073	done without fully +-1.533073	always worth reading +-1.533073	bias would not +-1.533073	he would do +-1.533073	shiyatov would not +-1.533073	today would take +-1.533073	. wright's church +-1.533073	he wrote the +-1.533073	, yamal larch +-1.533073	briffa's yamal reconstruction +-1.533073	resulting yamal chronology +-1.212489	the yamal chronology +-2.232043	the yamal data +-2.232043	the yamal measurement +-2.232043	the yamal subfossil +-1.533073	200–400 year old +-1.533073	this year , +-1.533073	those years ? +-1.533073	' yes , +-1.533073	back-and-forth yesterday about +-1.533073	it yet , +-1.533073	ahead you see +-1.533073	what's your great +-1.533073	within your power + +\end\ diff --git a/decoder/test_data/grammar.prune b/decoder/test_data/grammar.prune new file mode 100644 index 00000000..4ebcb509 --- /dev/null +++ b/decoder/test_data/grammar.prune @@ -0,0 +1,196 @@ +[PHRASE] ||| [PHRASE,1] haus ||| [PHRASE,1] house ||| 1.86183 0 0 0 0.0211892 +[PHRASE] ||| [PHRASE,1] haus ist ||| is [PHRASE,1] house ||| 2.58883 0.311249 0 0.348455 0.0211893 +[PHRASE] ||| [PHRASE,1] haus gibt ||| is [PHRASE,1] house ||| 2.56863 0.291046 0 0.258278 0.0211893 +[PHRASE] ||| [PHRASE,1] ein haus ist ||| [PHRASE,1] is a house ||| 3.16286 0 0 0.576934 0.0211893 +[PHRASE] ||| [PHRASE,1] ist ||| [PHRASE,1] is ||| 2.94101 0 0.676694 0.348455 0 +[PHRASE] ||| [PHRASE,1] ist ||| is [PHRASE,1] ||| 2.36698 0.649056 0.102662 0.348455 0 +[PHRASE] ||| [PHRASE,1] klein ist ||| [PHRASE,1] is small ||| 2.58883 0.124939 0 0.78211 0 +[PHRASE] ||| [PHRASE,1] maus ||| [PHRASE,1] mouse ||| 2.09592 0 0 0 0 +[PHRASE] ||| [PHRASE,1] maus gibt ||| is [PHRASE,1] mouse ||| 2.44865 0 0 0.258278 0 +[PHRASE] ||| [PHRASE,1] kleines ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.579784 0 +[PHRASE] ||| [PHRASE,1] kleines haus ||| [PHRASE,1] small house ||| 3.24204 0 0 0.579784 0.0211893 +[PHRASE] ||| [PHRASE,1] kleines haus gibt ||| is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893 +[PHRASE] ||| [PHRASE,1] kleine ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.500602 0 +[PHRASE] ||| [PHRASE,1] kleine maus ||| [PHRASE,1] small mouse ||| 3.24204 0 0 0.500602 0 +[PHRASE] ||| [PHRASE,1] kleine maus gibt ||| is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0 +[PHRASE] ||| [PHRASE,1] gelb ||| [PHRASE,1] yellow ||| 2.63998 0 0 0 0 +[PHRASE] ||| [PHRASE,1] gelb haus ||| [PHRASE,1] yellow house ||| 3.24204 0 0 0 0.0211893 +[PHRASE] ||| [PHRASE,1] gelb haus gibt ||| is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893 +[PHRASE] ||| [PHRASE,1] gelb maus ||| [PHRASE,1] yellow mouse ||| 3.24204 0 0 0 0 +[PHRASE] ||| [PHRASE,1] gelb maus gibt ||| is [PHRASE,1] yellow mouse ||| 3.30899 0 0 0.258278 0 +[PHRASE] ||| [PHRASE,1] gibt ||| is [PHRASE,1] ||| 1.82827 0.110339 0 0.258278 0 +[PHRASE] ||| haus ||| small yellow mouse house ||| 2.46389 0.845098 1.30103 0.278754 1.34341 +[PHRASE] ||| haus ||| house ||| Phrase_0=1.18514 Phrase_2=0.0222764 Phrase_4=0.0211893 +[PHRASE] ||| haus [PHRASE,1] ||| house [PHRASE,1] ||| 2.2878 0 0 0 0.0211893 +[PHRASE] ||| haus ist ||| house is ||| 2.46389 0 0 0.348455 0.0211893 +[PHRASE] ||| haus klein ist ||| house is small ||| 2.2878 0 0 0.78211 0.0211893 +[PHRASE] ||| ein ||| a ||| Phrase_0=1.34995 Phrase_1=0.228479 
Phrase_3=0.228479 +[PHRASE] ||| ein [PHRASE,1] ||| a [PHRASE,1] ||| 2.03792 0.290035 0 0.228479 0 +[PHRASE] ||| ein [PHRASE,1] haus ||| a [PHRASE,1] house ||| 2.94101 0 0 0.228479 0.0211893 +[PHRASE] ||| ein [PHRASE,1] haus gibt ||| is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893 +[PHRASE] ||| ein [PHRASE,1] ist ||| is a [PHRASE,1] ||| 2.58883 0.535113 0 0.576934 0 +[PHRASE] ||| ein [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.56863 0.51491 0 0.486757 0 +[PHRASE] ||| ein haus ||| a house ||| 1.76492 0 0.0791813 0.228479 0.0211893 +[PHRASE] ||| ein haus ||| a small house ||| 2.46389 0.30103 0.778151 0.507233 1.34341 +[PHRASE] ||| ein haus ist ||| is a house ||| 2.76492 0.477121 0 0.576934 0.0211893 +[PHRASE] ||| ein haus gibt ||| is a house ||| 2.46389 0.176091 0.176091 0.486757 0.0211893 +[PHRASE] ||| ein haus gibt ||| is a small house ||| 2.76492 0.39794 0.477121 0.765511 1.34341 +[PHRASE] ||| ein kleines ||| a small ||| 1.86183 0.243038 0 0.808263 0 +[PHRASE] ||| ein kleines [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.808263 0 +[PHRASE] ||| ein kleines [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0 +[PHRASE] ||| ein kleines haus ||| a small house ||| 2.46389 0.30103 0 0.808263 0.0211893 +[PHRASE] ||| ein kleines haus ist ||| is a small house ||| 2.76492 0.39794 0 1.15672 0.0211893 +[PHRASE] ||| ein kleines haus gibt ||| is a small house ||| 3.06595 0.69897 0 1.06654 0.0211893 +[PHRASE] ||| ein kleines gelb ||| a small yellow ||| 2.94101 0.30103 0 0.808263 0 +[PHRASE] ||| ein kleines gelb haus ||| a small yellow house ||| 3.24204 0 0 0.808263 0.0211893 +[PHRASE] ||| ein kleines gelb haus gibt ||| is a small yellow house ||| 3.30899 0 0 1.06654 0.0211893 +[PHRASE] ||| ein gelb ||| a yellow ||| 1.98677 0.221849 0 0.228479 0 +[PHRASE] ||| ein gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.228479 0 +[PHRASE] ||| ein gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0 +[PHRASE] ||| ein gelb haus ||| a yellow house ||| 2.63998 0 0 0.228479 0.0211893 +[PHRASE] ||| ein gelb haus ist ||| is a yellow house ||| 3.06595 0.30103 0 0.576934 0.0211893 +[PHRASE] ||| ein gelb haus gibt ||| is a yellow house ||| 3.06595 0.30103 0 0.486757 0.0211893 +[PHRASE] ||| ein gelb kleines ||| a yellow small ||| 2.94101 0.30103 0 0.808263 0 +[PHRASE] ||| ein gelb kleines haus ||| a yellow small house ||| 3.24204 0 0 0.808263 0.0211893 +[PHRASE] ||| ein gelb kleines haus gibt ||| is a yellow small house ||| 3.30899 0 0 1.06654 0.0211893 +[PHRASE] ||| ist ||| is ||| 1.34995 0.348455 0 0.348455 0 +[PHRASE] ||| klein ||| small ||| 1.61879 0.410174 0 0.433656 0 +[PHRASE] ||| klein [PHRASE,1] ||| [PHRASE,1] small ||| 3.06595 0.564271 0 0.433656 0 +[PHRASE] ||| klein [PHRASE,1] ist ||| [PHRASE,1] is small ||| 3.06595 0.60206 0 0.78211 0 +[PHRASE] ||| klein ist ||| is small ||| 1.68574 0 0 0.78211 0 +[PHRASE] ||| klein das [PHRASE,1] ||| the [PHRASE,1] small ||| 3.06595 0 0 0.433656 0.30103 +[PHRASE] ||| klein das haus ist ||| the house is small ||| 3.06595 0.477121 0 0.78211 0.322219 +[PHRASE] ||| maus ||| mouse ||| 1.50965 0 0 0 0 +[PHRASE] ||| maus [PHRASE,1] ||| mouse [PHRASE,1] ||| 2.94101 0 0 0 0 +[PHRASE] ||| maus [PHRASE,1] ist ||| mouse is [PHRASE,1] ||| 2.94101 0 0 0.348455 0 +[PHRASE] ||| maus ein haus ist ||| mouse is a house ||| 2.94101 0 0 0.576934 0.0211893 +[PHRASE] ||| kleines ||| small ||| 1.76492 0.556302 0 0.579784 0 +[PHRASE] ||| kleines [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.579784 0 +[PHRASE] 
||| kleines haus ||| small house ||| 1.86183 0.243038 0 0.579784 0.0211893 +[PHRASE] ||| kleines gelb ||| small yellow ||| 2.46389 0.30103 0 0.579784 0 +[PHRASE] ||| kleines gelb haus ||| small yellow house ||| 2.94101 0 0 0.579784 0.0211893 +[PHRASE] ||| kleine ||| small ||| 1.68574 0.477121 0 0.500602 0 +[PHRASE] ||| kleine [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.500602 0 +[PHRASE] ||| kleine haus ||| small house ||| 2.16286 0.544068 0 0.500602 0.0211893 +[PHRASE] ||| kleine maus ||| small mouse ||| 1.98677 0 0 0.500602 0 +[PHRASE] ||| kleine gelb ||| small yellow ||| 2.46389 0.30103 0 0.500602 0 +[PHRASE] ||| kleine gelb maus ||| small yellow mouse ||| 2.94101 0 0 0.500602 0 +[PHRASE] ||| gelb ||| yellow ||| 1.61879 0 0 0 0 +[PHRASE] ||| gelb [PHRASE,1] ||| yellow [PHRASE,1] ||| 2.63998 0 0 0 0 +[PHRASE] ||| gelb haus ||| yellow house ||| 1.98677 0 0 0 0.0211893 +[PHRASE] ||| gelb maus ||| yellow mouse ||| 2.16286 0 0 0 0 +[PHRASE] ||| gelb kleines ||| yellow small ||| 2.46389 0.30103 0 0.579784 0 +[PHRASE] ||| gelb kleines haus ||| yellow small house ||| 2.94101 0 0 0.579784 0.0211893 +[PHRASE] ||| gelb kleine ||| yellow small ||| 2.46389 0.30103 0 0.500602 0 +[PHRASE] ||| gelb kleine maus ||| yellow small mouse ||| 2.94101 0 0 0.500602 0 +[PHRASE] ||| eine ||| a ||| 1.50965 0.38818 0 0.38818 0 +[PHRASE] ||| eine [PHRASE,1] ||| a [PHRASE,1] ||| 2.0602 0.312311 0 0.38818 0 +[PHRASE] ||| eine [PHRASE,1] maus ||| a [PHRASE,1] mouse ||| 2.94101 0 0 0.38818 0 +[PHRASE] ||| eine [PHRASE,1] maus gibt ||| is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0 +[PHRASE] ||| eine [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.44865 0.394934 0 0.646458 0 +[PHRASE] ||| eine maus ||| a mouse ||| 1.98677 0 0 0.38818 0 +[PHRASE] ||| eine maus [PHRASE,1] ||| a mouse [PHRASE,1] ||| 3.16286 0 0 0.38818 0 +[PHRASE] ||| eine maus [PHRASE,1] ist ||| a mouse is [PHRASE,1] ||| 3.16286 0 0 0.736635 0 +[PHRASE] ||| eine maus ein haus ist ||| a mouse is a house ||| 3.16286 0 0 0.965114 0.0211893 +[PHRASE] ||| eine maus gibt ||| is a mouse ||| 2.46389 0 0 0.646458 0 +[PHRASE] ||| eine kleine ||| a small ||| 1.98677 0.367977 0 0.888783 0 +[PHRASE] ||| eine kleine [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.888783 0 +[PHRASE] ||| eine kleine [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0 +[PHRASE] ||| eine kleine maus ||| a small mouse ||| 2.63998 0 0 0.888783 0 +[PHRASE] ||| eine kleine maus gibt ||| is a small mouse ||| 2.76492 0 0 1.14706 0 +[PHRASE] ||| eine kleine gelb ||| a small yellow ||| 2.94101 0.30103 0 0.888783 0 +[PHRASE] ||| eine kleine gelb maus ||| a small yellow mouse ||| 3.24204 0 0 0.888783 0 +[PHRASE] ||| eine kleine gelb maus gibt ||| is a small yellow mouse ||| 3.30899 0 0 1.14706 0 +[PHRASE] ||| eine gelb ||| a yellow ||| 2.16286 0.39794 0 0.38818 0 +[PHRASE] ||| eine gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.38818 0 +[PHRASE] ||| eine gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0 +[PHRASE] ||| eine gelb maus ||| a yellow mouse ||| 2.94101 0 0 0.38818 0 +[PHRASE] ||| eine gelb maus gibt ||| is a yellow mouse ||| 3.06595 0 0 0.646458 0 +[PHRASE] ||| eine gelb kleine ||| a yellow small ||| 2.94101 0.30103 0 0.888783 0 +[PHRASE] ||| eine gelb kleine maus ||| a yellow small mouse ||| 3.24204 0 0 0.888783 0 +[PHRASE] ||| eine gelb kleine maus gibt ||| is a yellow small mouse ||| 3.30899 0 0 1.14706 0 +[PHRASE] ||| eine gruen ||| a green ||| 2.46389 0 0 0.38818 0 +[PHRASE] ||| eine gruen maus ||| 
a green mouse ||| 2.94101 0 0 0.38818 0 +[PHRASE] ||| gruen ||| green ||| 2.16286 0 0 0 0 +[PHRASE] ||| gruen maus ||| green mouse ||| 2.46389 0 0 0 0 +[PHRASE] ||| tages ||| day ||| 2.46389 0 0 0 0 +[PHRASE] ||| gibt ||| is ||| 1.25977 0.258278 0 0.258278 0 +[PHRASE] ||| meins ||| mine ||| 2.16286 0 0 0 0 +[PHRASE] ||| meins [PHRASE,1] ||| mine [PHRASE,1] ||| 2.76492 0 0 0 0 +[PHRASE] ||| meins ist ||| is mine ||| 2.46389 0 0 0.348455 0 +[PHRASE] ||| meins klein ist ||| mine is small ||| 2.76492 0 0 0.78211 0 +[PHRASE] ||| geld ||| money ||| 1.98677 0 0 0 0 +[PHRASE] ||| geld ist ||| is money ||| 2.46389 0.30103 0 0.348455 0 +[PHRASE] ||| geld gibt ||| is money ||| 2.46389 0.30103 0 0.258278 0 +[PHRASE] ||| keins ||| none ||| 1.98677 0 0 0 0 +[PHRASE] ||| keins [PHRASE,1] ||| none [PHRASE,1] ||| 2.76492 0 0 0 0 +[PHRASE] ||| keins klein ist ||| none is small ||| 2.76492 0 0 0.78211 0 +[PHRASE] ||| keins gibt ||| is none ||| 2.46389 0 0 0.258278 0 +[PHRASE] ||| dem haeuschen ||| of control ||| 2.46389 0 0 0.681241 0.425969 +[PHRASE] ||| eines ||| one ||| 2.46389 0.30103 0 0.30103 0 +[PHRASE] ||| eines tages ||| one day ||| 2.46389 0 0 0.30103 0 +[PHRASE] ||| eins ||| one ||| 2.46389 0.30103 0 0.30103 0 +[PHRASE] ||| aus ||| out ||| 2.46389 0 0.477121 0 0.221849 +[PHRASE] ||| aus ||| out of ||| 2.16286 0 0.176091 0.0791812 0.619789 +[PHRASE] ||| aus [PHRASE,1] ||| out [PHRASE,1] ||| 2.76492 0 0.367977 0 0.221849 +[PHRASE] ||| aus [PHRASE,1] ||| out of [PHRASE,1] ||| 2.63998 0 0.243038 0.0791812 0.619789 +[PHRASE] ||| aus ein ||| out of a ||| 2.46389 0 0 0.307661 0.619789 +[PHRASE] ||| aus ein haus ||| out of a house ||| 2.94101 0 0 0.307661 0.640978 +[PHRASE] ||| aus dem haeuschen ||| out of control ||| 2.76492 0 0 0.681241 0.647817 +[PHRASE] ||| aus das ||| out of the ||| 2.46389 0 0 0.0791812 0.920819 +[PHRASE] ||| aus das haus ||| out of the house ||| 2.94101 0 0 0.0791812 0.942008 +[PHRASE] ||| das ||| the ||| 1.76492 0 0.30103 0 0.30103 +[PHRASE] ||| das ||| that ||| 1.76492 0 0.30103 0 0.30103 +[PHRASE] ||| das [PHRASE,1] ||| the [PHRASE,1] ||| 2.39695 0 0.41972 0 0.30103 +[PHRASE] ||| das [PHRASE,1] ||| that [PHRASE,1] ||| 2.18514 0 0.207913 0 0.30103 +[PHRASE] ||| das [PHRASE,1] haus ist ||| that is [PHRASE,1] house ||| 2.86183 0 0 0.348455 0.322219 +[PHRASE] ||| das [PHRASE,1] ist ||| that is [PHRASE,1] ||| 2.86183 0 0 0.348455 0.30103 +[PHRASE] ||| das haus ||| the house ||| 1.86183 0 0 0 0.322219 +[PHRASE] ||| das haus [PHRASE,1] ||| the house [PHRASE,1] ||| 2.76492 0 0 0 0.322219 +[PHRASE] ||| das haus ist ||| the house is ||| 2.94101 0 0 0.348455 0.322219 +[PHRASE] ||| das haus klein ist ||| the house is small ||| 2.76492 0.176091 0 0.78211 0.322219 +[PHRASE] ||| das ein [PHRASE,1] ist ||| that is a [PHRASE,1] ||| 2.86183 0 0 0.576934 0.30103 +[PHRASE] ||| das ein kleines haus ist ||| that is a small house ||| 3.16286 0 0 1.15672 0.322219 +[PHRASE] ||| das ein gelb haus ist ||| that is a yellow house ||| 3.16286 0 0 0.576934 0.322219 +[PHRASE] ||| das klein ist ||| that is small ||| 2.76492 0 0 0.78211 0.30103 +[PHRASE] ||| das kleine ||| the small ||| 2.46389 0 0 0.500602 0.30103 +[PHRASE] ||| das kleine haus ||| the small house ||| 2.94101 0 0 0.500602 0.322219 +[PHRASE] ||| das meins ist ||| that is mine ||| 2.76492 0 0 0.348455 0.30103 +[PHRASE] ||| das geld ist ||| that is money ||| 2.76492 0 0 0.348455 0.30103 +[PHRASE] ||| es ||| there ||| 1.25977 0 0 0 0 +[PHRASE] ||| es [PHRASE,1] ||| there [PHRASE,1] ||| 1.83672 0 0 0 0 +[PHRASE] ||| es [PHRASE,1] haus gibt ||| there 
is [PHRASE,1] house ||| 2.62775 0 0 0.258278 0.0211893 +[PHRASE] ||| es [PHRASE,1] maus gibt ||| there is [PHRASE,1] mouse ||| 2.5166 0 0 0.258278 0 +[PHRASE] ||| es [PHRASE,1] kleines haus gibt ||| there is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893 +[PHRASE] ||| es [PHRASE,1] kleine maus gibt ||| there is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0 +[PHRASE] ||| es [PHRASE,1] gelb haus gibt ||| there is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893 +[PHRASE] ||| es [PHRASE,1] gelb maus gibt ||| there is [PHRASE,1] yellow mouse ||| 3.30899 0 0 0.258278 0 +[PHRASE] ||| es [PHRASE,1] gibt ||| there is [PHRASE,1] ||| 1.9536 0 0 0.258278 0 +[PHRASE] ||| es ein [PHRASE,1] haus gibt ||| there is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893 +[PHRASE] ||| es ein [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.62775 0.360151 0 0.486757 0 +[PHRASE] ||| es ein haus gibt ||| there is a house ||| 2.63998 0 0.176091 0.486757 0.0211893 +[PHRASE] ||| es ein haus gibt ||| there is a small house ||| 2.94101 0.20412 0.477121 0.765511 1.34341 +[PHRASE] ||| es ein kleines [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0 +[PHRASE] ||| es ein kleines haus gibt ||| there is a small house ||| 3.16286 0.425969 0 1.06654 0.0211893 +[PHRASE] ||| es ein gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0 +[PHRASE] ||| es ein gelb haus gibt ||| there is a yellow house ||| 3.16286 0 0 0.486757 0.0211893 +[PHRASE] ||| es eine [PHRASE,1] maus gibt ||| there is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0 +[PHRASE] ||| es eine [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.5166 0.249001 0 0.646458 0 +[PHRASE] ||| es eine maus gibt ||| there is a mouse ||| 2.63998 0 0 0.646458 0 +[PHRASE] ||| es eine kleine [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0 +[PHRASE] ||| es eine kleine maus gibt ||| there is a small mouse ||| 2.86183 0 0 1.14706 0 +[PHRASE] ||| es eine gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0 +[PHRASE] ||| es eine gelb maus gibt ||| there is a yellow mouse ||| 3.16286 0 0 0.646458 0 +[PHRASE] ||| es geld gibt ||| there is money ||| 2.76492 0 0 0.258278 0 +[PHRASE] ||| es keins gibt ||| there is none ||| 2.76492 0 0 0.258278 0 +[PHRASE] ||| dieses ||| this ||| 1.98677 0 0 0 0 +[PHRASE] ||| dieses [PHRASE,1] ||| this [PHRASE,1] ||| 2.56995 0 0 0 0 +[PHRASE] ||| dieses [PHRASE,1] haus ist ||| this is [PHRASE,1] house ||| 3.16286 0 0 0.348455 0.0211893 +[PHRASE] ||| dieses [PHRASE,1] ist ||| this is [PHRASE,1] ||| 3.16286 0 0 0.348455 0 +[PHRASE] ||| dieses haus ||| this house ||| 2.46389 0 0 0 0.0211893 +[PHRASE] ||| dieses haus [PHRASE,1] ||| this house [PHRASE,1] ||| 3.06595 0 0 0 0.0211893 +[PHRASE] ||| dieses haus klein ist ||| this house is small ||| 3.06595 0 0 0.78211 0.0211893 +[PHRASE] ||| dieses ein [PHRASE,1] ist ||| this is a [PHRASE,1] ||| 3.16286 0 0 0.576934 0 +[PHRASE] ||| dieses ein kleines haus ist ||| this is a small house ||| 3.16286 0 0 1.15672 0.0211893 +[PHRASE] ||| dieses kleine ||| this small ||| 2.46389 0 0 0.500602 0 +[PHRASE] ||| dieses kleine haus ||| this small house ||| 2.94101 0 0 0.500602 0.0211893 diff --git a/decoder/test_data/small.json.gz b/decoder/test_data/small.json.gz Binary files differnew file mode 100644 index 00000000..892ba360 --- /dev/null +++ b/decoder/test_data/small.json.gz diff --git a/decoder/test_data/test_2gram.lm.gz b/decoder/test_data/test_2gram.lm.gz Binary files 
differ
new file mode 100644
index 00000000..aafa7274
--- /dev/null
+++ b/decoder/test_data/test_2gram.lm.gz
diff --git a/decoder/test_data/weights b/decoder/test_data/weights
new file mode 100644
index 00000000..ea70229c
--- /dev/null
+++ b/decoder/test_data/weights
@@ -0,0 +1,8 @@
+# hiero
+WordPenalty -0.387029
+LanguageModel 0.253195
+PhraseModel_0 0.142926
+PhraseModel_1 0.465119
+PhraseModel_2 0.079503
+CNPosteriorProbability 0.09259
+Inf -inf
diff --git a/decoder/test_data/weights.gt b/decoder/test_data/weights.gt
new file mode 100644
index 00000000..08931049
--- /dev/null
+++ b/decoder/test_data/weights.gt
@@ -0,0 +1,4 @@
+Phrase_0 1.0
+Phrase_1 0.5
+Phrase_2 0.3
+Phrase_3 0.2
diff --git a/decoder/timing_stats.cc b/decoder/timing_stats.cc
new file mode 100644
index 00000000..85b95de5
--- /dev/null
+++ b/decoder/timing_stats.cc
@@ -0,0 +1,24 @@
+#include "timing_stats.h"
+
+#include <iostream>
+
+using namespace std;
+
+map<string, TimerInfo> Timer::stats;
+
+Timer::Timer(const string& timername) : start_t(clock()), cur(stats[timername]) {}
+
+Timer::~Timer() {
+  ++cur.calls;
+  const clock_t end_t = clock();
+  // clock() returns ticks; divide by CLOCKS_PER_SEC to get seconds
+  const double elapsed = static_cast<double>(end_t - start_t) / CLOCKS_PER_SEC;
+  cur.total_time += elapsed;
+}
+
+void Timer::Summarize() {
+  for (map<string, TimerInfo>::iterator it = stats.begin(); it != stats.end(); ++it) {
+    cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n";
+  }
+  stats.clear();
+}
+
diff --git a/decoder/timing_stats.h b/decoder/timing_stats.h
new file mode 100644
index 00000000..0a9f7656
--- /dev/null
+++ b/decoder/timing_stats.h
@@ -0,0 +1,25 @@
+#ifndef _TIMING_STATS_H_
+#define _TIMING_STATS_H_
+
+#include <ctime>  // clock_t
+#include <string>
+#include <map>
+
+struct TimerInfo {
+  int calls;
+  double total_time;
+  TimerInfo() : calls(), total_time() {}
+};
+
+struct Timer {
+  Timer(const std::string& timername);
+  ~Timer();
+  static void Summarize();
+ private:
+  static std::map<std::string, TimerInfo> stats;
+  clock_t start_t;
+  TimerInfo& cur;
+  Timer(const Timer& other);
+  const Timer& operator=(const Timer& other);
+};
+
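+// A usage sketch (the call site below is hypothetical): a named Timer
+// accumulates CPU time for its enclosing scope into a global per-name
+// table, and Summarize() reports and clears all entries:
+//   { Timer t("decode"); /* ... work ... */ }  // destructor records time
+//   Timer::Summarize();  // prints e.g. "decode: 1.5 secs (1 calls)"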
+#endif
diff --git a/decoder/translator.cc b/decoder/translator.cc
new file mode 100644
index 00000000..e6c282e1
--- /dev/null
+++ b/decoder/translator.cc
@@ -0,0 +1,57 @@
+#include "translator.h"
+
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+Translator::~Translator() {}
+
+void Translator::ProcessMarkupHints(const map<string, string>& kv) {
+  if (state_ != kUninitialized) {
+    cerr << "Translator::ProcessMarkupHints in wrong state: " << state_ << endl;
+    abort();
+  }
+  ProcessMarkupHintsImpl(kv);
+  state_ = kReadyToTranslate;
+}
+
+bool Translator::Translate(const std::string& src,
+                 SentenceMetadata* smeta,
+                 const std::vector<double>& weights,
+                 Hypergraph* minus_lm_forest) {
+  if (state_ == kUninitialized) {
+    cerr << "Translator::Translate(...) must not be in uninitialized state!\n";
+    abort();
+  }
+  const bool result = TranslateImpl(src, smeta, weights, minus_lm_forest);
+  state_ = kTranslated;
+  return result;
+}
+
+void Translator::SentenceComplete() {
+  if (state_ != kTranslated) {
+    cerr << "Translator::SentenceComplete in unexpected state: " << state_ << endl;
+    // not fatal
+  }
+  SentenceCompleteImpl();
+  state_ = kUninitialized;  // return to start state
+}
+
+// this may be overridden by translators that want to accept
+// metadata
+void Translator::ProcessMarkupHintsImpl(const map<string, string>& kv) {
+  int unprocessed = kv.size() - kv.count("id");
+  if (unprocessed > 0) {
+    cerr << "Sentence markup contains unprocessed data:\n";
+    for (map<string, string>::const_iterator it = kv.begin(); it != kv.end(); ++it) {
+      if (it->first == "id") continue;
+      cerr << "  KEY[" << it->first << "] --> " << it->second << endl;
+    }
+    abort();
+  }
+}
+
+void Translator::SentenceCompleteImpl() {}
+
diff --git a/decoder/translator.h b/decoder/translator.h
new file mode 100644
index 00000000..6b0a02e4
--- /dev/null
+++ b/decoder/translator.h
@@ -0,0 +1,82 @@
+#ifndef _TRANSLATOR_H_
+#define _TRANSLATOR_H_
+
+#include <string>
+#include <vector>
+#include <map>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+class Hypergraph;
+class SentenceMetadata;
+
+// Workflow: for each sentence to be translated
+//   1) call ProcessMarkupHints(markup)
+//   2) call Translate(...)
+//   3) call SentenceComplete()
+class Translator {
+ public:
+  Translator() : state_(kUninitialized) {}
+  virtual ~Translator();
+  // returns true if goal reached, false otherwise
+  // minus_lm_forest will contain the unpruned forest. the
+  // feature values from the phrase table / grammar / etc
+  // should be in the forest already - the "late" features
+  // should not just copy values that are available without
+  // any context or computation.
+  // SentenceMetadata contains information about the sentence,
+  // but it is an input/output parameter since the Translator
+  // is also responsible for setting the value of src_len.
+  bool Translate(const std::string& src,
+                 SentenceMetadata* smeta,
+                 const std::vector<double>& weights,
+                 Hypergraph* minus_lm_forest);
+
+  // This is called before Translate(...) with the sentence-
+  // level markup passed in. This can be used to set sentence-
+  // specific behavior of the translator.
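+  // A call-sequence sketch (names here are illustrative only):
+  //   translator->ProcessMarkupHints(markup);
+  //   translator->Translate(src, &smeta, weights, &forest);
+  //   translator->SentenceComplete();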
+  void ProcessMarkupHints(const std::map<std::string, std::string>& kv);
+
+  // Free any sentence-specific resources
+  void SentenceComplete();
+ protected:
+  virtual bool TranslateImpl(const std::string& src,
+                             SentenceMetadata* smeta,
+                             const std::vector<double>& weights,
+                             Hypergraph* minus_lm_forest) = 0;
+  virtual void ProcessMarkupHintsImpl(const std::map<std::string, std::string>& kv);
+  virtual void SentenceCompleteImpl();
+ private:
+  enum State { kUninitialized, kReadyToTranslate, kTranslated };
+  State state_;
+};
+
+class SCFGTranslatorImpl;
+class SCFGTranslator : public Translator {
+ public:
+  SCFGTranslator(const boost::program_options::variables_map& conf);
+ protected:
+  bool TranslateImpl(const std::string& src,
+                 SentenceMetadata* smeta,
+                 const std::vector<double>& weights,
+                 Hypergraph* minus_lm_forest);
+  void ProcessMarkupHintsImpl(const std::map<std::string, std::string>& kv);
+  void SentenceCompleteImpl();
+ private:
+  boost::shared_ptr<SCFGTranslatorImpl> pimpl_;
+};
+
+class FSTTranslatorImpl;
+class FSTTranslator : public Translator {
+ public:
+  FSTTranslator(const boost::program_options::variables_map& conf);
+ private:
+  bool TranslateImpl(const std::string& src,
+                 SentenceMetadata* smeta,
+                 const std::vector<double>& weights,
+                 Hypergraph* minus_lm_forest);
+  boost::shared_ptr<FSTTranslatorImpl> pimpl_;
+};
+
+#endif
diff --git a/decoder/tromble_loss.cc b/decoder/tromble_loss.cc
new file mode 100644
index 00000000..9ebd8ab1
--- /dev/null
+++ b/decoder/tromble_loss.cc
@@ -0,0 +1,309 @@
+#include "tromble_loss.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/circular_buffer.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/range/iterator_range.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/unordered_map.hpp>
+
+#include <cmath>
+#include <cstring>  // memset
+#include <fstream>
+#include <vector>
+
+#include "sentence_metadata.h"
+#include "trule.h"
+#include "tdict.h"
+
+using namespace std;
+
+namespace {
+
+typedef unsigned char GramCount;
+
+struct RefCounts {
+  GramCount max;
+  std::vector<GramCount> refs;
+  size_t length;
+};
+
+typedef boost::unordered_map<std::vector<WordID>, size_t, boost::hash<std::vector<WordID> > > NGramMap;
+
+// Take all the n-grams in the references and stuff them into ngrams.
+void MakeNGramMapFromReferences(const vector<vector<WordID> > &references,
+                                int n,
+                                vector<RefCounts> *counts,
+                                NGramMap *ngrams) {
+  ngrams->clear();
+  std::pair<vector<WordID>, size_t> insert_me;
+  vector<WordID> &ngram = insert_me.first;
+  ngram.reserve(n);
+  size_t &id = insert_me.second;
+  id = 0;
+  for (int refi = 0; refi < references.size(); ++refi) {
+    const vector<WordID>& ref = references[refi];
+    const int s = ref.size();
+    for (int j=0; j<s; ++j) {
+      const int remaining = s-j;
+      const int k = (n < remaining ? n : remaining);
+      ngram.clear();
+      for (int i = 0; i < k; ++i) {
+        ngram.push_back(ref[j + i]);
+        std::pair<NGramMap::iterator, bool> ret(ngrams->insert(insert_me));
+        if (ret.second) {
+          counts->resize(id + 1);
+          RefCounts &ref_counts = counts->back();
+          ref_counts.max = 1;
+          ref_counts.refs.resize(references.size());
+          ref_counts.refs[refi] = 1;
+          ref_counts.length = ngram.size();
+          ++id;
+        } else {
+          RefCounts &ref_counts = (*counts)[ret.first->second];
+          ref_counts.max = std::max(ref_counts.max, ++ref_counts.refs[refi]);
+        }
+      }
+    }
+  }
+}
+
+struct MutableState {
+  MutableState(void *from, size_t n) : length(reinterpret_cast<size_t*>(from)), left(reinterpret_cast<WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<GramCount *>(right + n - 1)) {}
+  size_t *length;
+  WordID *left, *right;
+  GramCount *counts;
+  static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
+};
+
+struct ConstState {
+  ConstState(const void *from, size_t n) : length(reinterpret_cast<const size_t*>(from)), left(reinterpret_cast<const WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<const GramCount *>(right + n - 1)) {}
+  const size_t *length;
+  const WordID *left, *right;
+  const GramCount *counts;
+  static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
+};
+
+template <class T> struct CompatibleHashRange : public std::unary_function<const boost::iterator_range<T> &, size_t> {
+  size_t operator()(const boost::iterator_range<T> &range) const {
+    return boost::hash_range(range.begin(), range.end());
+  }
+};
+
+template <class T> struct CompatibleEqualsRange : public std::binary_function<const boost::iterator_range<T> &, const std::vector<WordID> &, size_t> {
+  size_t operator()(const boost::iterator_range<T> &range, const std::vector<WordID> &vec) const {
+    return boost::algorithm::equals(range, vec);
+  }
+  size_t operator()(const std::vector<WordID> &vec, const boost::iterator_range<T> &range) const {
+    return boost::algorithm::equals(range, vec);
+  }
+};
+
+// NOTE: the lookup below is disabled (#if 0), so AddWord is currently a no-op.
+void AddWord(const boost::circular_buffer<WordID> &segment, size_t min_length, const NGramMap &ref_grams, GramCount *counters) {
+  typedef boost::circular_buffer<WordID>::const_iterator BufferIt;
+  typedef boost::iterator_range<BufferIt> SegmentRange;
+  if (segment.size() < min_length) return;
+#if 0
+  CompatibleHashRange<BufferIt> hasher;
+  CompatibleEqualsRange<BufferIt> equals;
+  for (BufferIt seg_start(segment.end() - min_length); ; --seg_start) {
+    NGramMap::const_iterator found = ref_grams.find(SegmentRange(seg_start, segment.end()));
+    if (found == ref_grams.end()) break;
+    ++counters[found->second];
+    if (seg_start == segment.begin()) break;
+  }
+#endif
+}
+
+} // namespace
+
+class TrombleLossComputerImpl {
+ public:
+  explicit TrombleLossComputerImpl(const std::string &params) : star_(TD::Convert("<{STAR}>")) {
+    typedef boost::tokenizer<boost::char_separator<char> > Tokenizer;
+    // Argument parsing
+    std::string ref_file_name;
+    Tokenizer tok(params, boost::char_separator<char>(" "));
+    Tokenizer::iterator i = tok.begin();
+    if (i == tok.end()) {
+      std::cerr << "TrombleLossComputer needs a reference file name." << std::endl;
+      exit(1);
+    }
+    ref_file_name = *i++;
+    if (i == tok.end()) {
+      std::cerr << "TrombleLossComputer needs to know how many references there are." << std::endl;
+      exit(1);
+    }
+    num_refs_ = boost::lexical_cast<unsigned int>(*i++);
+    for (; i != tok.end(); ++i) {
+      thetas_.push_back(boost::lexical_cast<double>(*i));
+    }
+    if (thetas_.empty()) {
+      std::cerr << "TrombleLossComputer is pointless with no weight on n-grams." << std::endl;
+      exit(1);
+    }
+
+    // Read references file.
+    std::ifstream ref_file(ref_file_name.c_str());
+    if (!ref_file) {
+      std::cerr << "Could not open TrombleLossComputer file " << ref_file_name << std::endl;
+      exit(1);
+    }
+    std::string ref;
+    vector<vector<WordID> > references(num_refs_);
+    bound_ngram_id_ = 0;
+    for (unsigned int sentence = 0; ref_file; ++sentence) {
+      for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
+        if (!getline(ref_file, ref)) {
+          if (refidx == 0) break;
+          std::cerr << "Short read of " << refidx << " references for sentence " << sentence << std::endl;
+          exit(1);
+        }
+        TD::ConvertSentence(ref, &references[refidx]);
+      }
+      ref_ids_.resize(sentence + 1);
+      ref_counts_.resize(sentence + 1);
+      MakeNGramMapFromReferences(references, thetas_.size(), &ref_counts_.back(), &ref_ids_.back());
+      bound_ngram_id_ = std::max(bound_ngram_id_, ref_ids_.back().size());
+    }
+  }
+
+  size_t StateSize() const {
+    // n-1 boundary words plus counts for n-grams currently rendered as bytes even though most would fit in bits.
+    // Also, this is cached by higher up classes so no need to cache here.
+    return MutableState::Size(thetas_.size(), bound_ngram_id_);
+  }
+
+  double Traversal(
+      const SentenceMetadata &smeta,
+      const TRule &rule,
+      const vector<const void*> &ant_contexts,
+      void *out_context) const {
+    // TODO: get refs from sentence metadata.
+    // This will require resizable features.
+    if (smeta.GetSentenceID() >= ref_ids_.size()) {
+      std::cerr << "Sentence ID " << smeta.GetSentenceID() << " doesn't have references; there are only " << ref_ids_.size() << " references." << std::endl;
+      exit(1);
+    }
+    const NGramMap &ngrams = ref_ids_[smeta.GetSentenceID()];
+    MutableState out_state(out_context, thetas_.size());
+    memset(out_state.counts, 0, bound_ngram_id_ * sizeof(GramCount));
+    boost::circular_buffer<WordID> history(thetas_.size());
+    std::vector<const void*>::const_iterator ant_context = ant_contexts.begin();
+    *out_state.length = 0;
+    size_t pushed = 0;
+    const size_t keep = thetas_.size() - 1;
+    for (vector<WordID>::const_iterator rhs = rule.e().begin(); rhs != rule.e().end(); ++rhs) {
+      if (*rhs < 1) {
+        assert(ant_context != ant_contexts.end());
+        // Constituent
+        ConstState rhs_state(*ant_context, thetas_.size());
+        *out_state.length += *rhs_state.length;
+        {
+          GramCount *accum = out_state.counts;
+          for (const GramCount *c = rhs_state.counts; c != rhs_state.counts + ngrams.size(); ++c, ++accum) {
+            *accum += *c;
+          }
+        }
+        const WordID *w = rhs_state.left;
+        bool long_constit = true;
+        for (size_t i = 1; i <= keep; ++i, ++w) {
+          if (*w == star_) {
+            long_constit = false;
+            break;
+          }
+          history.push_back(*w);
+          if (++pushed == keep) {
+            std::copy(history.begin(), history.end(), out_state.left);
+          }
+          // Now i is the length of the history coming from this constituent.  So it needs at least i+1 words to have a cross-child add.
+          AddWord(history, i + 1, ngrams, out_state.counts);
+        }
+        // If the constituent is shorter than thetas_.size(), then the
+        // constituent's left is the entire constituent, so history is already
+        // correct.  Otherwise, the constituent's right boundary words become
+        // the history.
+        if (long_constit) {
+          history.assign(thetas_.size(), rhs_state.right, rhs_state.right + keep);
+        }
+        ++ant_context;
+      } else {
+        // Word
+        ++*out_state.length;
+        history.push_back(*rhs);
+        if (++pushed == keep) {
+          std::copy(history.begin(), history.end(), out_state.left);
+        }
+        AddWord(history, 1, ngrams, out_state.counts);
+      }
+    }
+    // Fill in the left and right boundary words of the state.
+    if (pushed < keep) {
+      std::copy(history.begin(), history.end(), out_state.left);
+      for (WordID *i = out_state.left + pushed; i != out_state.left + keep; ++i) {
+        *i = star_;
+      }
+      std::copy(out_state.left, out_state.left + keep, out_state.right);
+    } else if (pushed == keep) {
+      std::copy(history.begin(), history.end(), out_state.right);
+    } else if ((pushed > keep) && !history.empty()) {
+      std::copy(history.begin() + 1, history.end(), out_state.right);
+    }
+    std::vector<RefCounts>::const_iterator ref_info = ref_counts_[smeta.GetSentenceID()].begin();
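+    // The remainder is a clipped, BLEU-style n-gram match count (the file
+    // name suggests Tromble et al.'s linear corpus BLEU; read this as a
+    // sketch of that loss): each accumulated hypothesis n-gram count is
+    // clipped to its maximum count in any reference, matches are tallied
+    // per reference and per n-gram order, each reference is scored as
+    //   sum_j thetas_[j] * matches[j] / (length - j)
+    // (a weighted match fraction over the (j+1)-grams), and the best
+    // reference's score is returned.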
+    // Clip the counts and count matches.
+    // Indexed by reference then by length.
+    std::vector<std::vector<unsigned int> > matches(num_refs_, std::vector<unsigned int>(thetas_.size()));
+    for (GramCount *c = out_state.counts; c != out_state.counts + ngrams.size(); ++c, ++ref_info) {
+      *c = std::min(*c, ref_info->max);
+      if (*c) {
+        for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
+          assert(ref_info->length >= 1);
+          assert(ref_info->length - 1 < thetas_.size());
+          matches[refidx][ref_info->length - 1] += std::min(*c, ref_info->refs[refidx]);
+        }
+      }
+    }
+    double best_score = 0.0;
+    for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
+      double score = 0.0;
+      for (unsigned int j = 0; j < std::min(*out_state.length, thetas_.size()); ++j) {
+        score += thetas_[j] * static_cast<double>(matches[refidx][j]) / static_cast<double>(*out_state.length - j);
+      }
+      best_score = std::max(best_score, score);
+    }
+    return best_score;
+  }
+
+ private:
+  unsigned int num_refs_;
+  // Indexed by sentence id.
+  std::vector<NGramMap> ref_ids_;
+  // Then by id from ref_ids_.
+  std::vector<std::vector<RefCounts> > ref_counts_;
+
+  // thetas_[0] is the weight for 1-grams
+  std::vector<double> thetas_;
+
+  // All ngram ids in ref_ids_ are < this value.
+  size_t bound_ngram_id_;
+
+  const WordID star_;
+};
+
+TrombleLossComputer::TrombleLossComputer(const std::string &params) :
+    boost::base_from_member<PImpl>(new TrombleLossComputerImpl(params)),
+    FeatureFunction(boost::base_from_member<PImpl>::member->StateSize()),
+    fid_(FD::Convert("TrombleLossComputer")) {}
+
+TrombleLossComputer::~TrombleLossComputer() {}
+
+void TrombleLossComputer::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const {
+  (void) estimated_features;
+  const double loss = boost::base_from_member<PImpl>::member->Traversal(smeta, *edge.rule_, ant_contexts, out_context);
+  features->set_value(fid_, loss);
+}
diff --git a/decoder/tromble_loss.h b/decoder/tromble_loss.h
new file mode 100644
index 00000000..599a2d54
--- /dev/null
+++ b/decoder/tromble_loss.h
@@ -0,0 +1,40 @@
+#ifndef _TROMBLE_LOSS_H_
+#define _TROMBLE_LOSS_H_
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+#include <boost/utility/base_from_member.hpp>
+
+#include "ff.h"
+#include "wordid.h"
+
+// this may not be the most elegant way to implement this computation, but since we
+// may need cube pruning and state splitting, we reuse the feature detector framework.
+// the loss is then stored in a feature #0 (which is guaranteed to have weight 0 and
+// never be a "real" feature).
+class TrombleLossComputerImpl;
+class TrombleLossComputer : private boost::base_from_member<boost::scoped_ptr<TrombleLossComputerImpl> >, public FeatureFunction {
+ private:
+  typedef boost::scoped_ptr<TrombleLossComputerImpl> PImpl;
+  typedef FeatureFunction Base;
+
+ public:
+  // String parameters are ref.txt num_ref weight1 weight2 ... weightn
+  // where ref.txt contains references, one per line, with num_ref references per sentence.
+  // The weights are the weight on each length n-gram.
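+  // A hypothetical example (all values made up for illustration): the
+  // parameter string "refs.txt 2 0.5 0.3 0.2" reads refs.txt with two
+  // references per sentence and weights 1-, 2- and 3-gram matches by
+  // 0.5, 0.3 and 0.2 respectively.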
+  explicit TrombleLossComputer(const std::string &params);
+
+  ~TrombleLossComputer();
+
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const;
+ private:
+  const int fid_;
+};
+
+#endif
diff --git a/decoder/trule.cc b/decoder/trule.cc
new file mode 100644
index 00000000..505839c7
--- /dev/null
+++ b/decoder/trule.cc
@@ -0,0 +1,242 @@
+#include "trule.h"
+
+#include <sstream>
+
+#include "stringlib.h"
+#include "tdict.h"
+
+using namespace std;
+
+static WordID ConvertTrgString(const string& w) {
+  int len = w.size();
+  WordID id = 0;
+  // [X,0] or [0]
+  // for target rules, we ignore the category, just keep the index
+  if (len > 2 && w[0]=='[' && w[len-1]==']' && w[len-2] > '0' && w[len-2] <= '9' &&
+      (len == 3 || (len > 4 && w[len-3] == ','))) {
+    id = w[len-2] - '0';
+    id = 1 - id;
+  } else {
+    id = TD::Convert(w);
+  }
+  return id;
+}
+
+static WordID ConvertSrcString(const string& w, bool mono = false) {
+  int len = w.size();
+  // [X,0]
+  // for source rules, we keep the category and ignore the index (source rules are
+  // always numbered 1, 2, 3...)
+  if (mono) {
+    if (len > 2 && w[0]=='[' && w[len-1]==']') {
+      if (len > 4 && w[len-3] == ',') {
+        cerr << "[ERROR] Monolingual rules must not have non-terminal indices:\n  "
+             << w << endl;
+        exit(1);
+      }
+      // TODO check that source indices go 1,2,3,etc.
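+      // The category between the brackets is interned and negated, so a
+      // non-terminal like "[X]" is stored as -TD::Convert("X"); terminals
+      // keep positive ids, which makes the two distinguishable by sign.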
+      return TD::Convert(w.substr(1, len-2)) * -1;
+    } else {
+      return TD::Convert(w);
+    }
+  } else {
+    if (len > 4 && w[0]=='[' && w[len-1]==']' && w[len-3] == ',' && w[len-2] > '0' && w[len-2] <= '9') {
+      return TD::Convert(w.substr(1, len-4)) * -1;
+    } else {
+      return TD::Convert(w);
+    }
+  }
+}
+
+static WordID ConvertLHS(const string& w) {
+  if (w[0] == '[') {
+    int len = w.size();
+    if (len < 3) { cerr << "Format error: " << w << endl; exit(1); }
+    return TD::Convert(w.substr(1, len-2)) * -1;
+  } else {
+    return TD::Convert(w) * -1;
+  }
+}
+
+TRule* TRule::CreateRuleSynchronous(const std::string& rule) {
+  TRule* res = new TRule;
+  if (res->ReadFromString(rule, true, false)) return res;
+  cerr << "[ERROR] Failed to create rule from: " << rule << endl;
+  delete res;
+  return NULL;
+}
+
+TRule* TRule::CreateRulePhrasetable(const string& rule) {
+  // TODO make this faster
+  // TODO add configuration for default NT type
+  if (rule[0] == '[') {
+    cerr << "Phrasetable rules shouldn't have a LHS / non-terminals:\n  " << rule << endl;
+    return NULL;
+  }
+  TRule* res = new TRule("[X] ||| " + rule, true, false);
+  if (res->Arity() != 0) {
+    cerr << "Phrasetable rules should have arity 0:\n  " << rule << endl;
+    delete res;
+    return NULL;
+  }
+  return res;
+}
+
+TRule* TRule::CreateRuleMonolingual(const string& rule) {
+  return new TRule(rule, false, true);
+}
+
+bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
+  e_.clear();
+  f_.clear();
+  scores_.clear();
+
+  string w;
+  istringstream is(line);
+  int format = CountSubstrings(line, "|||");
+  if (strict && format < 2) {
+    cerr << "Bad rule format in strict mode:\n" << line << endl;
+    return false;
+  }
+  if (format >= 2 || (mono && format == 1)) {
+    while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); }
+    while(is>>w && w!="|||") { f_.push_back(ConvertSrcString(w, mono)); }
+    if (!mono) {
+      while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); }
+    }
+    int fv = 0;
+    if (is) {
+      string ss;
+      getline(is, ss);
+      //cerr << "L: " << ss << endl;
+      int start = 0;
+      const int len = ss.size();
+      while (start < len) {
+        while(start < len && (ss[start] == ' ' || ss[start] == ';'))
+          ++start;
+        if (start == len) break;
+        int end = start + 1;
+        while(end < len && (ss[end] != '=' && ss[end] != ' ' && ss[end] != ';'))
+          ++end;
+        if (end == len || ss[end] == ' ' || ss[end] == ';') {
+          //cerr << "PROC: '" << ss.substr(start, end - start) << "'\n";
+          // non-named features
+          if (end != len) { ss[end] = 0; }
+          string fname = "PhraseModel_X";
+          if (fv > 9) { cerr << "Too many phrasetable scores - use named format\n"; abort(); }
+          fname[12]='0' + fv;
+          ++fv;
+          // if the feature set is frozen, this may return zero, indicating an
+          // undefined feature
+          const int fid = FD::Convert(fname);
+          if (fid)
+            scores_.set_value(fid, atof(&ss[start]));
+          //cerr << "F: " << fname << " VAL=" << scores_.value(FD::Convert(fname)) << endl;
+        } else {
+          const int fid = FD::Convert(ss.substr(start, end - start));
+          start = end + 1;
+          end = start + 1;
+          while(end < len && (ss[end] != ' ' && ss[end] != ';'))
+            ++end;
+          if (end < len) { ss[end] = 0; }
+          assert(start < len);
+          
if (fid) +            scores_.set_value(fid, atof(&ss[start])); +          //cerr << "F: " << FD::Convert(fid) << " VAL=" << scores_.value(fid) << endl; +        } +        start = end + 1; +      } +    } +  } else if (format == 1) { +    while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } +    while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } +    f_ = e_; +    int x = ConvertLHS("[X]"); +    for (int i = 0; i < f_.size(); ++i) +      if (f_[i] <= 0) { f_[i] = x; } +  } else { +    cerr << "F: " << format << endl; +    cerr << "[ERROR] Don't know how to read:\n" << line << endl; +  } +  if (mono) { +    e_ = f_; +    int ci = 0; +    for (int i = 0; i < e_.size(); ++i) +      if (e_[i] < 0) +        e_[i] = ci--; +  } +  ComputeArity(); +  return SanityCheck(); +} + +bool TRule::SanityCheck() const { +  vector<int> used(f_.size(), 0); +  int ac = 0; +  for (int i = 0; i < e_.size(); ++i) { +    int ind = e_[i]; +    if (ind > 0) continue; +    ind = -ind; +    if ((++used[ind]) != 1) { +      cerr << "[ERROR] e-side variable index " << (ind+1) << " used more than once!\n"; +      return false; +    } +    ac++; +  } +  if (ac != Arity()) { +    cerr << "[ERROR] e-side arity mismatches f-side\n"; +    return false; +  } +  return true; +} + +void TRule::ComputeArity() { +  int min = 1; +  for (vector<WordID>::const_iterator i = e_.begin(); i != e_.end(); ++i) +    if (*i < min) min = *i; +  arity_ = 1 - min; +} + +static string AnonymousStrVar(int i) { +  string res("[v]"); +  if(!(i <= 0 && i >= -8)) { +    cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; +    abort(); +  } +  res[1] = '1' - i; +  return res; +} + +string TRule::AsString(bool verbose) const { +  ostringstream os; +  int idx = 0; +  if (lhs_ && verbose) { +    os << '[' << TD::Convert(lhs_ * -1) << "] |||"; +    for (int i = 0; i < f_.size(); ++i) { +      const WordID& w = f_[i]; +      if (w < 0) { +        int wi = w * -1; +        ++idx; +        os << " [" << TD::Convert(wi) << ',' << idx << ']'; +      } else { +        os << ' ' << TD::Convert(w); +      } +    } +    os << " ||| "; +  } +  if (idx > 9) { +    cerr << "Too many non-terminals!\n partial: " << os.str() << endl; +    exit(1); +  } +  for (int i =0; i<e_.size(); ++i) { +    if (i) os << ' '; +    const WordID& w = e_[i]; +    if (w < 1) +      os << AnonymousStrVar(w); +    else +      os << TD::Convert(w); +  } +  if (!scores_.empty() && verbose) { +    os << " ||| " << scores_; +  } +  return os.str(); +} diff --git a/decoder/trule.h b/decoder/trule.h new file mode 100644 index 00000000..7fb92924 --- /dev/null +++ b/decoder/trule.h @@ -0,0 +1,145 @@ +#ifndef _RULE_H_ +#define _RULE_H_ + +#include <algorithm> +#include <vector> +#include <cassert> +#include <boost/shared_ptr.hpp> + +#include "sparse_vector.h" +#include "wordid.h" + +class TRule; +typedef boost::shared_ptr<TRule> TRulePtr; + +struct NTSizeSummaryStatistics { +  NTSizeSummaryStatistics(int arity) : means(arity), vars(arity) {} +  std::vector<float> means; +  std::vector<float> vars; +}; + +// Translation rule +class TRule { + public: +  TRule() : lhs_(0), prev_i(-1), prev_j(-1) { } +  TRule(WordID lhs, const WordID* src, int src_size, const WordID* trg, int trg_size, const int* feat_ids, const double* feat_vals, int feat_size, int arity) : +      e_(trg, trg + trg_size), f_(src, src + src_size), lhs_(lhs), arity_(arity), prev_i(-1), prev_j(-1) { +    for (int i = 0; i < feat_size; ++i) +      scores_.set_value(feat_ids[i], feat_vals[i]); +  } + +  
explicit TRule(const std::vector<WordID>& e) : e_(e), lhs_(0), prev_i(-1), prev_j(-1) {}
+  TRule(const std::vector<WordID>& e, const std::vector<WordID>& f, const WordID& lhs) :
+    e_(e), f_(f), lhs_(lhs), prev_i(-1), prev_j(-1) {}
+
+  // deprecated - this will be private soon
+  explicit TRule(const std::string& text, bool strict = false, bool mono = false) : prev_i(-1), prev_j(-1) {
+    ReadFromString(text, strict, mono);
+  }
+
+  // deprecated, use lexer
+  // make a rule from a hiero-like rule table, e.g.
+  //    [X] ||| [X,1] DE [X,2] ||| [X,2] of the [X,1]
+  // if misformatted, returns NULL
+  static TRule* CreateRuleSynchronous(const std::string& rule);
+
+  // deprecated, use lexer
+  // make a rule from a phrasetable entry (i.e., one that has no LHS type), e.g.:
+  //    el gato ||| the cat ||| Feature_2=0.34
+  static TRule* CreateRulePhrasetable(const std::string& rule);
+
+  // deprecated, use lexer
+  // make a rule from a non-synchronous CFG representation, e.g.:
+  //    [LHS] ||| term1 [NT] term2 [OTHER_NT] [YET_ANOTHER_NT]
+  static TRule* CreateRuleMonolingual(const std::string& rule);
+
+  static TRule* CreateLexicalRule(const WordID& src, const WordID& trg) {
+    return new TRule(src, trg);
+  }
+
+  void ESubstitute(const std::vector<const std::vector<WordID>* >& var_values,
+                   std::vector<WordID>* result) const {
+    int vc = 0;
+    result->clear();
+    for (std::vector<WordID>::const_iterator i = e_.begin(); i != e_.end(); ++i) {
+      const WordID& c = *i;
+      if (c < 1) {
+        ++vc;
+        const std::vector<WordID>& var_value = *var_values[-c];
+        std::copy(var_value.begin(),
+                  var_value.end(),
+                  std::back_inserter(*result));
+      } else {
+        result->push_back(c);
+      }
+    }
+    assert(vc == var_values.size());
+  }
+
+  void FSubstitute(const std::vector<const std::vector<WordID>* >& var_values,
+                   std::vector<WordID>* result) const {
+    int vc = 0;
+    result->clear();
+    for (std::vector<WordID>::const_iterator i = f_.begin(); i != f_.end(); ++i) {
+      const WordID& c = *i;
+      if (c < 1) {
+        const std::vector<WordID>& var_value = *var_values[vc++];
+        std::copy(var_value.begin(),
+                  var_value.end(),
+                  std::back_inserter(*result));
+      } else {
+        result->push_back(c);
+      }
+    }
+    assert(vc == var_values.size());
+  }
+
+  bool ReadFromString(const std::string& line, bool strict = false, bool monolingual = false);
+
+  bool Initialized() const { return e_.size(); }
+
+  std::string AsString(bool verbose = true) const;
+
+  static TRule DummyRule() {
+    TRule res;
+    res.e_.resize(1, 0);
+    return res;
+  }
+
+  const std::vector<WordID>& f() const { return f_; }
+  const std::vector<WordID>& e() const { return e_; }
+
+  int EWords() const { return ELength() - Arity(); }
+  int FWords() const { return FLength() - Arity(); }
+  int FLength() const { return f_.size(); }
+  int ELength() const { return e_.size(); }
+  int Arity() const { return arity_; }
+  bool IsUnary() const { return (Arity() == 1) && (f_.size() == 1); }
+  const SparseVector<double>& GetFeatureValues() const { return scores_; }
+  double Score(int i) const { return scores_[i]; }
+  WordID GetLHS() const { return lhs_; }
+  void ComputeArity();
+
+  // 0 = first variable, -1 = second variable, -2 = third ...
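+  // A worked example (a sketch; quoted words stand for their interned
+  // positive ids): the synchronous rule
+  //    [X] ||| [X,1] DE [X,2] ||| [X,2] of the [X,1]
+  // yields e_ = { -1, "of", "the", 0 } and
+  // f_ = { -TD::Convert("X"), "DE", -TD::Convert("X") }.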
+  std::vector<WordID> e_; +  // < 0: *-1 = encoding of category of variable +  std::vector<WordID> f_; +  WordID lhs_; +  SparseVector<double> scores_; +   +  char arity_; +  TRulePtr parent_rule_;  // usually NULL, except when doing constrained decoding + +  // this is only used when doing synchronous parsing +  short int prev_i; +  short int prev_j; + +  // may be null +  boost::shared_ptr<NTSizeSummaryStatistics> nt_size_summary_; + + private: +  TRule(const WordID& src, const WordID& trg) : e_(1, trg), f_(1, src), lhs_(), arity_(), prev_i(), prev_j() {} +  bool SanityCheck() const; +}; + +#endif diff --git a/decoder/trule_test.cc b/decoder/trule_test.cc new file mode 100644 index 00000000..02a70764 --- /dev/null +++ b/decoder/trule_test.cc @@ -0,0 +1,65 @@ +#include "trule.h" + +#include <gtest/gtest.h> +#include <cassert> +#include <iostream> +#include "tdict.h" + +using namespace std; + +class TRuleTest : public testing::Test { + protected: +  virtual void SetUp() { } +  virtual void TearDown() { } +}; + +TEST_F(TRuleTest,TestFSubstitute) { +  TRule r1("[X] ||| ob [X,1] [X,2] sah . ||| whether [X,1] saw [X,2] . ||| 0.99"); +  TRule r2("[X] ||| ich ||| i ||| 1.0"); +  TRule r3("[X] ||| ihn ||| him ||| 1.0"); +  vector<const vector<WordID>*> ants; +  vector<WordID> res2; +  r2.FSubstitute(ants, &res2); +  assert(TD::GetString(res2) == "ich"); +  vector<WordID> res3; +  r3.FSubstitute(ants, &res3); +  assert(TD::GetString(res3) == "ihn"); +  ants.push_back(&res2); +  ants.push_back(&res3); +  vector<WordID> res; +  r1.FSubstitute(ants, &res); +  cerr << TD::GetString(res) << endl; +  assert(TD::GetString(res) == "ob ich ihn sah ."); +} + +TEST_F(TRuleTest,TestPhrasetableRule) { +  TRulePtr t(TRule::CreateRulePhrasetable("gato ||| cat ||| PhraseModel_0=-23.2;Foo=1;Bar=12")); +  cerr << t->AsString() << endl; +  assert(t->scores_.num_active() == 3); +}; + + +TEST_F(TRuleTest,TestMonoRule) { +  TRulePtr m(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3]")); +  assert(m->Arity() == 3); +  cerr << m->AsString() << endl; +  TRulePtr m2(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3] ||| Feature1=0.23")); +  assert(m2->Arity() == 3); +  cerr << m2->AsString() << endl; +  EXPECT_FLOAT_EQ(m2->scores_.value(FD::Convert("Feature1")), 0.23); +} + +TEST_F(TRuleTest,TestRuleR) { +  TRule t6; +  t6.ReadFromString("[X] ||| den [X,1] sah [X,2] . ||| [X,2] saw the [X,1] . 
||| 0.12321 0.23232 0.121"); +  cerr << "TEXT: " << t6.AsString() << endl; +  EXPECT_EQ(t6.Arity(), 2); +  EXPECT_EQ(t6.e_[0], -1); +  EXPECT_EQ(t6.e_[3], 0); +} + +int main(int argc, char** argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} + diff --git a/decoder/ttables.cc b/decoder/ttables.cc new file mode 100644 index 00000000..2ea960f0 --- /dev/null +++ b/decoder/ttables.cc @@ -0,0 +1,31 @@ +#include "ttables.h" + +#include <cassert> + +#include "dict.h" + +using namespace std; +using namespace std::tr1; + +void TTable::DeserializeProbsFromText(std::istream* in) { +  int c = 0; +  while(*in) { +    string e; +    string f; +    double p; +    (*in) >> e >> f >> p; +    if (e.empty()) break; +    ++c; +    ttable[TD::Convert(e)][TD::Convert(f)] = prob_t(p); +  } +  cerr << "Loaded " << c << " translation parameters.\n"; +} + +void TTable::SerializeHelper(string* out, const Word2Word2Double& o) { +  assert(!"not implemented"); +} + +void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) { +  assert(!"not implemented"); +} + diff --git a/decoder/ttables.h b/decoder/ttables.h new file mode 100644 index 00000000..3ffc238a --- /dev/null +++ b/decoder/ttables.h @@ -0,0 +1,87 @@ +#ifndef _TTABLES_H_ +#define _TTABLES_H_ + +#include <iostream> +#include <map> + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +class TTable { + public: +  TTable() {} +  typedef std::map<WordID, double> Word2Double; +  typedef std::map<WordID, Word2Double> Word2Word2Double; +  inline const prob_t prob(const int& e, const int& f) const { +    const Word2Word2Double::const_iterator cit = ttable.find(e); +    if (cit != ttable.end()) { +      const Word2Double& cpd = cit->second; +      const Word2Double::const_iterator it = cpd.find(f); +      if (it == cpd.end()) return prob_t(0.00001); +      return prob_t(it->second); +    } else { +      return prob_t(0.00001); +    } +  } +  inline void Increment(const int& e, const int& f) { +    counts[e][f] += 1.0; +  } +  inline void Increment(const int& e, const int& f, double x) { +    counts[e][f] += x; +  } +  void Normalize() { +    ttable.swap(counts); +    for (Word2Word2Double::iterator cit = ttable.begin(); +         cit != ttable.end(); ++cit) { +      double tot = 0; +      Word2Double& cpd = cit->second; +      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) +        tot += it->second; +      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) +        it->second /= tot; +    } +    counts.clear(); +  } +  // adds counts from another TTable - probabilities remain unchanged +  TTable& operator+=(const TTable& rhs) { +    for (Word2Word2Double::const_iterator it = rhs.counts.begin(); +         it != rhs.counts.end(); ++it) { +      const Word2Double& cpd = it->second; +      Word2Double& tgt = counts[it->first]; +      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { +        tgt[j->first] += j->second; +      } +    } +    return *this; +  } +  void ShowTTable() { +    for (Word2Word2Double::iterator it = ttable.begin(); it != ttable.end(); ++it) { +      Word2Double& cpd = it->second; +      for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) { +        std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; +      } +    } +  } +  void ShowCounts() { +    for (Word2Word2Double::iterator it = counts.begin(); it != counts.end(); ++it) { +      Word2Double& cpd = it->second; +    
+  // adds counts from another TTable - probabilities remain unchanged
+  TTable& operator+=(const TTable& rhs) {
+    for (Word2Word2Double::const_iterator it = rhs.counts.begin();
+         it != rhs.counts.end(); ++it) {
+      const Word2Double& cpd = it->second;
+      Word2Double& tgt = counts[it->first];
+      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
+        tgt[j->first] += j->second;
+      }
+    }
+    return *this;
+  }
+  void ShowTTable() {
+    for (Word2Word2Double::iterator it = ttable.begin(); it != ttable.end(); ++it) {
+      Word2Double& cpd = it->second;
+      for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) {
+        std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
+      }
+    }
+  }
+  void ShowCounts() {
+    for (Word2Word2Double::iterator it = counts.begin(); it != counts.end(); ++it) {
+      Word2Double& cpd = it->second;
+      for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) {
+        std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
+      }
+    }
+  }
+  void DeserializeProbsFromText(std::istream* in);
+  void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); }
+  void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); }
+  void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); }
+  void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); }
+ private:
+  static void SerializeHelper(std::string*, const Word2Word2Double& o);
+  static void DeserializeHelper(const std::string&, Word2Word2Double* o);
+ public:
+  Word2Word2Double ttable;
+  Word2Word2Double counts;
+};
+
+#endif
diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc
new file mode 100644
index 00000000..82b2ce6d
--- /dev/null
+++ b/decoder/viterbi.cc
@@ -0,0 +1,39 @@
+#include "viterbi.h"
+
+#include <vector>
+#include "hg.h"
+
+using namespace std;
+
+string ViterbiETree(const Hypergraph& hg) {
+  vector<WordID> tmp;
+  const prob_t p = Viterbi<vector<WordID>, ETreeTraversal, prob_t, EdgeProb>(hg, &tmp);
+  return TD::GetString(tmp);
+}
+
+string ViterbiFTree(const Hypergraph& hg) {
+  vector<WordID> tmp;
+  const prob_t p = Viterbi<vector<WordID>, FTreeTraversal, prob_t, EdgeProb>(hg, &tmp);
+  return TD::GetString(tmp);
+}
+
+prob_t ViterbiESentence(const Hypergraph& hg, vector<WordID>* result) {
+  return Viterbi<vector<WordID>, ESentenceTraversal, prob_t, EdgeProb>(hg, result);
+}
+
+prob_t ViterbiFSentence(const Hypergraph& hg, vector<WordID>* result) {
+  return Viterbi<vector<WordID>, FSentenceTraversal, prob_t, EdgeProb>(hg, result);
+}
+
+int ViterbiELength(const Hypergraph& hg) {
+  int len = -1;
+  Viterbi<int, ELengthTraversal, prob_t, EdgeProb>(hg, &len);
+  return len;
+}
+
+int ViterbiPathLength(const Hypergraph& hg) {
+  int len = -1;
+  Viterbi<int, PathLengthTraversal, prob_t, EdgeProb>(hg, &len);
+  return len;
+}
+
diff --git a/decoder/viterbi.h b/decoder/viterbi.h
new file mode 100644
index 00000000..8f7534a9
--- /dev/null
+++ b/decoder/viterbi.h
@@ -0,0 +1,142 @@
+#ifndef _VITERBI_H_
+#define _VITERBI_H_
+
+#include <vector>
+#include "prob.h"
+#include "hg.h"
+#include "tdict.h"
+
+// Traversal must implement:
+//  void operator()(const Hypergraph::Edge& edge,
+//                  const std::vector<const T*>& ants, T* result) const;
template<typename T, typename Traversal, typename WeightType, typename WeightFunction>
+WeightType Viterbi(const Hypergraph& hg,
+                   T* result,
+                   const Traversal& traverse = Traversal(),
+                   const WeightFunction& weight = WeightFunction()) {
+  const int num_nodes = hg.nodes_.size();
+  std::vector<T> vit_result(num_nodes);
+  std::vector<WeightType> vit_weight(num_nodes, WeightType::Zero());
+
+  for (int i = 0; i < num_nodes; ++i) {
+    const Hypergraph::Node& cur_node = hg.nodes_[i];
+    WeightType* const cur_node_best_weight = &vit_weight[i];
+    T*          const cur_node_best_result = &vit_result[i];
+
+    const int num_in_edges = cur_node.in_edges_.size();
+    if (num_in_edges == 0) {
+      *cur_node_best_weight = WeightType(1);
+      continue;
+    }
+    for (int j = 0; j < num_in_edges; ++j) {
+      const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+      WeightType score = weight(edge);
+      std::vector<const T*> ants(edge.tail_nodes_.size());
+      for (int k = 0; k < edge.tail_nodes_.size(); ++k) {
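+        // multiply in the best inside weight of each antecedent (tail) node
+        // and collect pointers to the antecedents' best partial results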
+        const int tail_node_index = edge.tail_nodes_[k];
+        score *= vit_weight[tail_node_index];
+        ants[k] = &vit_result[tail_node_index];
+      }
+      if (*cur_node_best_weight < score) {
+        *cur_node_best_weight = score;
+        traverse(edge, ants, cur_node_best_result);
+      }
+    }
+  }
+  // nodes_ is assumed to be topologically ordered, with the goal node last
+  std::swap(*result, vit_result.back());
+  return vit_weight.back();
+}
+
+struct PathLengthTraversal {
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const int*>& ants,
+                  int* result) const {
+    (void) edge;
+    *result = 1;
+    for (int i = 0; i < ants.size(); ++i) *result += *ants[i];
+  }
+};
+
+struct ESentenceTraversal {
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const std::vector<WordID>*>& ants,
+                  std::vector<WordID>* result) const {
+    edge.rule_->ESubstitute(ants, result);
+  }
+};
+
+struct ELengthTraversal {
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const int*>& ants,
+                  int* result) const {
+    *result = edge.rule_->ELength() - edge.rule_->Arity();
+    for (int i = 0; i < ants.size(); ++i) *result += *ants[i];
+  }
+};
+
+struct FSentenceTraversal {
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const std::vector<WordID>*>& ants,
+                  std::vector<WordID>* result) const {
+    edge.rule_->FSubstitute(ants, result);
+  }
+};
+
+// creates strings of the form (S (X the man) (X said (X he (X would (X go)))))
+struct ETreeTraversal {
+  ETreeTraversal() : left("("), space(" "), right(")") {}
+  const std::string left;
+  const std::string space;
+  const std::string right;
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const std::vector<WordID>*>& ants,
+                  std::vector<WordID>* result) const {
+    std::vector<WordID> tmp;
+    edge.rule_->ESubstitute(ants, &tmp);
+    const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1);
+    if (cat == "Goal")
+      result->swap(tmp);
+    else
+      TD::ConvertSentence(left + cat + space + TD::GetString(tmp) + right,
+                          result);
+  }
+};
+
+struct FTreeTraversal {
+  FTreeTraversal() : left("("), space(" "), right(")") {}
+  const std::string left;
+  const std::string space;
+  const std::string right;
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const std::vector<WordID>*>& ants,
+                  std::vector<WordID>* result) const {
+    std::vector<WordID> tmp;
+    edge.rule_->FSubstitute(ants, &tmp);
+    const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1);
+    if (cat == "Goal")
+      result->swap(tmp);
+    else
+      TD::ConvertSentence(left + cat + space + TD::GetString(tmp) + right,
+                          result);
+  }
+};
+
+struct ViterbiPathTraversal {
+  void operator()(const Hypergraph::Edge& edge,
+                  const std::vector<const std::vector<const Hypergraph::Edge*>* >& ants,
+                  std::vector<const Hypergraph::Edge*>* result) const {
+    result->clear();
+    for (int i = 0; i < ants.size(); ++i)
+      for (int j = 0; j < ants[i]->size(); ++j)
+        result->push_back((*ants[i])[j]);
+    result->push_back(&edge);
+  }
+};
+
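+// Usage sketch: the best derivation can be recovered as an edge sequence by
+// pairing ViterbiPathTraversal with the EdgeProb weight functor (the same
+// functor the convenience wrappers declared below use internally):
+//
+//   std::vector<const Hypergraph::Edge*> best_path;
+//   const prob_t score = Viterbi<std::vector<const Hypergraph::Edge*>,
+//       ViterbiPathTraversal, prob_t, EdgeProb>(hg, &best_path);
+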
+prob_t ViterbiESentence(const Hypergraph& hg, std::vector<WordID>* result);
+std::string ViterbiETree(const Hypergraph& hg);
+prob_t ViterbiFSentence(const Hypergraph& hg, std::vector<WordID>* result);
+std::string ViterbiFTree(const Hypergraph& hg);
+int ViterbiELength(const Hypergraph& hg);
+int ViterbiPathLength(const Hypergraph& hg);
+
+#endif
diff --git a/decoder/weights.cc b/decoder/weights.cc
new file mode 100644
index 00000000..84647585
--- /dev/null
+++ b/decoder/weights.cc
@@ -0,0 +1,77 @@
+#include "weights.h"
+
+#include <cassert>  // assert
+#include <cmath>    // isnan
+#include <cstdlib>  // strtod, abort
+#include <sstream>
+
+#include "fdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+  cerr << "Reading weights from " << filename << endl;
+  ReadFile in_file(filename);
+  istream& in = *in_file.stream();
+  assert(in);
+  int weight_count = 0;
+  bool fl = false;
+  while (in) {
+    double val = 0;
+    string buf;
+    getline(in, buf);
+    if (buf.size() == 0) continue;
+    if (buf[0] == '#') continue;
+    // accept both "name value" and "name=value"
+    for (int i = 0; i < buf.size(); ++i)
+      if (buf[i] == '=') buf[i] = ' ';
+    int start = 0;
+    while(start < buf.size() && buf[start] == ' ') ++start;
+    int end = start;  // scan from 'start' so leading spaces can't leave end before it
+    while(end < buf.size() && buf[end] != ' ') ++end;
+    int fid = FD::Convert(buf.substr(start, end - start));
+    while(end < buf.size() && buf[end] == ' ') ++end;
+    val = strtod(&buf.c_str()[end], NULL);
+    if (isnan(val)) {
+      cerr << FD::Convert(fid) << " has weight NaN!\n";
+      abort();
+    }
+    if (wv_.size() <= fid)
+      wv_.resize(fid + 1);
+    wv_[fid] = val;
+    if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
+    ++weight_count;
+    if (weight_count %   50000 == 0) { cerr << '.' << flush; fl = true; }
+    if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; }
+  }
+  if (fl) { cerr << endl; }
+  cerr << "Loaded " << weight_count << " feature weights\n";
+}
+
+void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const {
+  WriteFile out(fname);
+  ostream& o = *out.stream();
+  assert(o);
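+  // 17 significant decimal digits are enough to round-trip an IEEE-754
+  // double exactly through text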
+  o.precision(17);
+  const int num_feats = FD::NumFeats();
+  for (int i = 1; i < num_feats; ++i) {
+    const double val = (i < wv_.size() ? wv_[i] : 0.0);
+    if (hide_zero_value_features && val == 0.0) continue;
+    o << FD::Convert(i) << ' ' << val << endl;
+  }
+}
+
+void Weights::InitVector(std::vector<double>* w) const {
+  *w = wv_;
+}
+
+void Weights::InitSparseVector(SparseVector<double>* w) const {
+  for (int i = 1; i < wv_.size(); ++i) {
+    const double& weight = wv_[i];
+    if (weight) w->set_value(i, weight);
+  }
+}
+
+void Weights::InitFromVector(const std::vector<double>& w) {
+  wv_ = w;
+  if (wv_.size() > FD::NumFeats())
+    cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
+  wv_.resize(FD::NumFeats(), 0);
+}
diff --git a/decoder/weights.h b/decoder/weights.h
new file mode 100644
index 00000000..f19aa3ce
--- /dev/null
+++ b/decoder/weights.h
@@ -0,0 +1,21 @@
+#ifndef _WEIGHTS_H_
+#define _WEIGHTS_H_
+
+#include <string>
+#include <map>
+#include <vector>
+#include "sparse_vector.h"
+
+class Weights {
+ public:
+  Weights() {}
+  void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
+  void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const;
+  void InitVector(std::vector<double>* w) const;
+  void InitSparseVector(SparseVector<double>* w) const;
+  void InitFromVector(const std::vector<double>& w);
+ private:
+  std::vector<double> wv_;
+};
+
+#endif
diff --git a/decoder/weights_test.cc b/decoder/weights_test.cc
new file mode 100644
index 00000000..aa6b3db2
--- /dev/null
+++ b/decoder/weights_test.cc
@@ -0,0 +1,28 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "weights.h"
+#include "tdict.h"
+#include "hg.h"
+
+using namespace std;
+
+class WeightsTest : public testing::Test {
+ protected:
+  virtual void SetUp() { }
+  virtual void TearDown() { }
+};
+
+TEST_F(WeightsTest,Load) {
+  Weights w;
+  w.InitFromFile("test_data/weights");
+  w.WriteToFile("-");
+}
+
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/decoder/wordid.h b/decoder/wordid.h
new file mode 100644
index 00000000..fb50bcc1
--- /dev/null
+++ b/decoder/wordid.h
@@ -0,0 +1,6 @@
+#ifndef _WORD_ID_H_
+#define _WORD_ID_H_
+
+typedef int WordID;
+
+#endif
