summary | refs | log | tree | commit | diff
path: root/decoder
diff options
context:
space:
mode:
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 05:12:27 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 05:12:27 +0000
commit: 0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree: 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /decoder
parent: 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
-rw-r--r--decoder/JSON_parser.c1012
-rw-r--r--decoder/JSON_parser.h152
-rw-r--r--decoder/Makefile.am84
-rw-r--r--decoder/aligner.cc319
-rw-r--r--decoder/aligner.h27
-rw-r--r--decoder/apply_models.cc426
-rw-r--r--decoder/apply_models.h20
-rw-r--r--decoder/array2d.h172
-rw-r--r--decoder/bottom_up_parser.cc302
-rw-r--r--decoder/bottom_up_parser.h27
-rw-r--r--decoder/cdec.cc592
-rw-r--r--decoder/cdec_ff.cc32
-rw-r--r--decoder/csplit.cc173
-rw-r--r--decoder/csplit.h30
-rw-r--r--decoder/dict.h43
-rw-r--r--decoder/dict_test.cc50
-rw-r--r--decoder/earley_composer.cc726
-rw-r--r--decoder/earley_composer.h29
-rw-r--r--decoder/exp_semiring.h71
-rw-r--r--decoder/fdict.cc129
-rw-r--r--decoder/fdict.h31
-rw-r--r--decoder/ff.cc137
-rw-r--r--decoder/ff.h152
-rw-r--r--decoder/ff_csplit.cc225
-rw-r--r--decoder/ff_csplit.h39
-rw-r--r--decoder/ff_factory.cc35
-rw-r--r--decoder/ff_factory.h39
-rw-r--r--decoder/ff_lm.cc454
-rw-r--r--decoder/ff_lm.h55
-rw-r--r--decoder/ff_tagger.cc96
-rw-r--r--decoder/ff_tagger.h51
-rw-r--r--decoder/ff_test.cc64
-rw-r--r--decoder/ff_wordalign.cc445
-rw-r--r--decoder/ff_wordalign.h196
-rw-r--r--decoder/filelib.cc22
-rw-r--r--decoder/filelib.h70
-rw-r--r--decoder/forest_writer.cc23
-rw-r--r--decoder/forest_writer.h16
-rw-r--r--decoder/freqdict.cc29
-rw-r--r--decoder/freqdict.h20
-rw-r--r--decoder/fst_translator.cc91
-rw-r--r--decoder/grammar.cc148
-rw-r--r--decoder/grammar.h89
-rw-r--r--decoder/grammar_test.cc59
-rw-r--r--decoder/gzstream.cc165
-rw-r--r--decoder/gzstream.h121
-rw-r--r--decoder/hg.cc588
-rw-r--r--decoder/hg.h247
-rw-r--r--decoder/hg_intersect.cc160
-rw-r--r--decoder/hg_intersect.h13
-rw-r--r--decoder/hg_io.cc673
-rw-r--r--decoder/hg_io.h39
-rw-r--r--decoder/hg_test.cc455
-rw-r--r--decoder/inside_outside.h112
-rw-r--r--decoder/json_parse.cc50
-rw-r--r--decoder/json_parse.h58
-rw-r--r--decoder/kbest.h208
-rw-r--r--decoder/lattice.cc62
-rw-r--r--decoder/lattice.h46
-rw-r--r--decoder/lexalign.cc129
-rw-r--r--decoder/lexalign.h18
-rw-r--r--decoder/lextrans.cc119
-rw-r--r--decoder/lextrans.h18
-rw-r--r--decoder/logval.h157
-rw-r--r--decoder/logval_test.cc73
-rw-r--r--decoder/maxtrans_blunsom.cc287
-rw-r--r--decoder/parser_test.cc35
-rw-r--r--decoder/phrasebased_translator.cc206
-rw-r--r--decoder/phrasebased_translator.h18
-rw-r--r--decoder/phrasetable_fst.cc141
-rw-r--r--decoder/phrasetable_fst.h34
-rw-r--r--decoder/prob.h8
-rw-r--r--decoder/rule_lexer.h13
-rw-r--r--decoder/rule_lexer.l269
-rw-r--r--decoder/sampler.h136
-rw-r--r--decoder/scfg_translator.cc132
-rw-r--r--decoder/sentence_metadata.h47
-rw-r--r--decoder/small_vector.h187
-rw-r--r--decoder/small_vector_test.cc129
-rw-r--r--decoder/sparse_vector.cc98
-rw-r--r--decoder/sparse_vector.h274
-rw-r--r--decoder/stringlib.cc98
-rw-r--r--decoder/stringlib.h101
-rw-r--r--decoder/tagger.cc112
-rw-r--r--decoder/tagger.h17
-rw-r--r--decoder/tdict.cc49
-rw-r--r--decoder/tdict.h30
-rw-r--r--decoder/test_data/dummy.3gram.lm2645
-rw-r--r--decoder/test_data/grammar.prune196
-rw-r--r--decoder/test_data/small.json.gzbin0 -> 1561 bytes
-rw-r--r--decoder/test_data/test_2gram.lm.gzbin0 -> 587 bytes
-rw-r--r--decoder/test_data/weights8
-rw-r--r--decoder/test_data/weights.gt4
-rw-r--r--decoder/timing_stats.cc24
-rw-r--r--decoder/timing_stats.h25
-rw-r--r--decoder/translator.cc57
-rw-r--r--decoder/translator.h82
-rw-r--r--decoder/tromble_loss.cc309
-rw-r--r--decoder/tromble_loss.h40
-rw-r--r--decoder/trule.cc242
-rw-r--r--decoder/trule.h145
-rw-r--r--decoder/trule_test.cc65
-rw-r--r--decoder/ttables.cc31
-rw-r--r--decoder/ttables.h87
-rw-r--r--decoder/viterbi.cc39
-rw-r--r--decoder/viterbi.h142
-rw-r--r--decoder/weights.cc77
-rw-r--r--decoder/weights.h21
-rw-r--r--decoder/weights_test.cc28
-rw-r--r--decoder/wordid.h6
110 files changed, 17207 insertions, 0 deletions
diff --git a/decoder/JSON_parser.c b/decoder/JSON_parser.c
new file mode 100644
index 00000000..175b7cc9
--- /dev/null
+++ b/decoder/JSON_parser.c
@@ -0,0 +1,1012 @@
+/* JSON_parser.c */
+
+/* 2007-08-24 */
+
+/*
+Copyright (c) 2005 JSON.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+The Software shall be used for Good, not Evil.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+/*
+ Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009.
+
+ For the added features the license above applies also.
+
+ Changelog:
+ 2009-05-17
+ Incorporated benrudiak@googlemail.com fix for UTF16 decoding.
+
+ 2009-05-14
+ Fixed float parsing bug related to a locale being set that didn't
+ use '.' as decimal point character (charles@transmissionbt.com).
+
+ 2008-10-14
+ Renamed states.IN to states.IT to avoid a name clash with the IN macro
+ defined in windef.h (alexey.pelykh@gmail.com)
+
+ 2008-07-19
+ Removed some duplicate code & debugging variable (charles@transmissionbt.com)
+
+ 2008-05-28
+ Made JSON_value structure ansi C compliant. This bug was reported by
+ trisk@acm.jhu.edu
+
+ 2008-05-20
+ Fixed bug reported by charles@transmissionbt.com where the switching
+ from static to dynamic parse buffer did not copy the static parse
+ buffer's content.
+*/
+
+
+
+#include <assert.h>
+#include <ctype.h>
+#include <float.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+
+#include "JSON_parser.h"
+
+#ifdef _MSC_VER
+# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */
+# pragma warning(disable:4996) // unsecure sscanf
+# endif
+#endif
+
+
+#define true 1
+#define false 0
+#define __ -1 /* the universal error code */
+
+/* values chosen so that the object size is approx equal to one page (4K) */
+#ifndef JSON_PARSER_STACK_SIZE
+# define JSON_PARSER_STACK_SIZE 128
+#endif
+
+#ifndef JSON_PARSER_PARSE_BUFFER_SIZE
+# define JSON_PARSER_PARSE_BUFFER_SIZE 3500
+#endif
+
+typedef unsigned short UTF16;
+
+struct JSON_parser_struct { /* complete state of one parser instance */
+ JSON_parser_callback callback; /* invoked for each parsed element; may be NULL (every call site checks) */
+ void* ctx; /* passed unchanged as the first argument of callback */
+ signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; /* state: current row of state_transition_table; type: JSON_T_* of the value being collected */
+ UTF16 utf16_high_surrogate; /* pending high surrogate from a \uXXXX escape, 0 when none */
+ long depth; /* maximum nesting depth, or -1 for an unbounded (growable) stack */
+ long top; /* index of the top mode in stack, -1 when the stack is empty */
+ signed char* stack; /* mode stack; points at static_stack or at a heap block */
+ long stack_capacity; /* number of entries stack can hold */
+ char decimal_point; /* decimal point of the current locale, read via localeconv() */
+ char* parse_buffer; /* NUL-terminated text of the token being collected; static_parse_buffer or heap */
+ size_t parse_buffer_capacity; /* size of parse_buffer in chars */
+ size_t parse_buffer_count; /* chars stored in parse_buffer, excluding the terminating NUL */
+ size_t comment_begin_offset; /* NOTE(review): not referenced anywhere in this file as shown */
+ signed char static_stack[JSON_PARSER_STACK_SIZE]; /* embedded stack used until more room is needed */
+ char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; /* embedded parse buffer used until it has to grow */
+};
+
+#define COUNTOF(x) (sizeof(x)/sizeof(x[0]))
+
+/*
+ Characters are mapped into these character classes. This allows for
+ a significant reduction in the size of the state transition table.
+*/
+
+
+
+enum classes {
+ C_SPACE, /* space */
+ C_WHITE, /* other whitespace */
+ C_LCURB, /* { */
+ C_RCURB, /* } */
+ C_LSQRB, /* [ */
+ C_RSQRB, /* ] */
+ C_COLON, /* : */
+ C_COMMA, /* , */
+ C_QUOTE, /* " */
+ C_BACKS, /* \ */
+ C_SLASH, /* / */
+ C_PLUS, /* + */
+ C_MINUS, /* - */
+ C_POINT, /* . */
+ C_ZERO , /* 0 */
+ C_DIGIT, /* 123456789 */
+ C_LOW_A, /* a */
+ C_LOW_B, /* b */
+ C_LOW_C, /* c */
+ C_LOW_D, /* d */
+ C_LOW_E, /* e */
+ C_LOW_F, /* f */
+ C_LOW_L, /* l */
+ C_LOW_N, /* n */
+ C_LOW_R, /* r */
+ C_LOW_S, /* s */
+ C_LOW_T, /* t */
+ C_LOW_U, /* u */
+ C_ABCDF, /* ABCDF */
+ C_E, /* E */
+ C_ETC, /* everything else */
+ C_STAR, /* * */
+ NR_CLASSES
+};
+
+static int ascii_class[128] = {
+/*
+ This array maps the 128 ASCII characters into character classes.
+ The remaining Unicode characters should be mapped to C_ETC.
+ Non-whitespace control characters are errors.
+*/
+ __, __, __, __, __, __, __, __,
+ __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __,
+ __, __, __, __, __, __, __, __,
+ __, __, __, __, __, __, __, __,
+
+ C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH,
+ C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT,
+ C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+
+ C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC,
+
+ C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC,
+ C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC,
+ C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC
+};
+
+
+/*
+ The state codes.
+*/
+enum states {
+ GO, /* start */
+ OK, /* ok */
+ OB, /* object */
+ KE, /* key */
+ CO, /* colon */
+ VA, /* value */
+ AR, /* array */
+ ST, /* string */
+ ES, /* escape */
+ U1, /* u1 */
+ U2, /* u2 */
+ U3, /* u3 */
+ U4, /* u4 */
+ MI, /* minus */
+ ZE, /* zero */
+ IT, /* integer */
+ FR, /* fraction */
+ E1, /* e */
+ E2, /* ex */
+ E3, /* exp */
+ T1, /* tr */
+ T2, /* tru */
+ T3, /* true */
+ F1, /* fa */
+ F2, /* fal */
+ F3, /* fals */
+ F4, /* false */
+ N1, /* nu */
+ N2, /* nul */
+ N3, /* null */
+ C1, /* / */
+ C2, /* / * */
+ C3, /* * */
+ FX, /* *.* *eE* */
+ D1, /* high surrogate decoded; expecting the \ that starts the second UTF-16 escape */
+ D2, /* \ of the second UTF-16 escape seen; expecting u */
+ NR_STATES
+};
+
+enum actions
+{
+ CB = -10, /* comment begin */
+ CE = -11, /* comment end */
+ FA = -12, /* false */
+ TR = -13, /* true */
+ NU = -14, /* null */
+ DE = -15, /* double detected by exponent e E */
+ DF = -16, /* double detected by fraction . */
+ SB = -17, /* string begin */
+ MX = -18, /* integer detected by minus */
+ ZX = -19, /* integer detected by zero */
+ IX = -20, /* integer detected by 1-9 */
+ EX = -21, /* next char is escaped */
+ UC = -22 /* Unicode character read */
+};
+
+
+static int state_transition_table[NR_STATES][NR_CLASSES] = {
+/*
+ The state transition table takes the current state and the current symbol,
+ and returns either a new state or an action. An action is represented as a
+ negative number. A JSON text is accepted if at the end of the text the
+ state is OK and if the mode is MODE_DONE.
+
+ white 1-9 ABCDF etc
+ space | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | * */
+/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__},
+/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__},
+/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST},
+/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__},
+/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__},
+/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__},
+/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__},
+/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__},
+/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__},
+/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__},
+/*e E1*/ {__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__},
+/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__},
+/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__},
+/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__},
+/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__},
+/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__},
+/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__},
+/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__},
+/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__},
+/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2},
+/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3},
+/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3},
+/*_. FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__},
+/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__},
+/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__},
+};
+
+
+/*
+ These modes can be pushed on the stack.
+*/
+enum modes {
+ MODE_ARRAY = 1,
+ MODE_DONE = 2,
+ MODE_KEY = 3,
+ MODE_OBJECT = 4
+};
+
+static int
+push(JSON_parser jc, int mode)
+{
+/*
+    Push a mode onto the stack.
+
+    When jc->depth is negative the stack is unbounded and is doubled on demand
+    (moving from the embedded static_stack to the heap on first growth).
+    Otherwise at most jc->depth entries may be stacked.
+
+    Returns true on success. Returns false if the depth limit would be
+    exceeded or if memory for a larger stack cannot be obtained; in both
+    cases jc->top, jc->stack and jc->stack_capacity are left unchanged, so
+    jc->top never indexes past the end of the stack.
+*/
+    const long new_top = jc->top + 1;
+
+    if (jc->depth < 0) {
+        if (new_top >= jc->stack_capacity) {
+            const long new_capacity = jc->stack_capacity * 2;
+            const size_t bytes_to_allocate = (size_t)new_capacity * sizeof(jc->static_stack[0]);
+            signed char* new_stack;
+
+            if (jc->stack == &jc->static_stack[0]) {
+                new_stack = (signed char*)malloc(bytes_to_allocate);
+                if (new_stack == NULL) {
+                    return false;
+                }
+                memcpy(new_stack, jc->static_stack, sizeof(jc->static_stack));
+            } else {
+                /* on failure realloc leaves the old block valid; keep it in jc->stack
+                   so that delete_JSON_parser can still free it */
+                new_stack = (signed char*)realloc(jc->stack, bytes_to_allocate);
+                if (new_stack == NULL) {
+                    return false;
+                }
+            }
+
+            jc->stack = new_stack;
+            jc->stack_capacity = new_capacity;
+        }
+    } else if (new_top >= jc->depth) {
+        return false;
+    }
+
+    jc->top = new_top;
+    jc->stack[new_top] = (signed char)mode;
+    return true;
+}
+
+
+static int
+pop(JSON_parser jc, int mode)
+{
+/*
+    Remove the top entry of the mode stack, but only if it equals the mode
+    the caller expects. An empty stack or a different top mode yields false
+    and leaves the stack untouched.
+*/
+    const int stack_is_empty = jc->top < 0;
+
+    if (!stack_is_empty && jc->stack[jc->top] == mode) {
+        jc->top -= 1;
+        return true;
+    }
+
+    return false;
+}
+
+
+#define parse_buffer_clear(jc) \
+ do {\
+ jc->parse_buffer_count = 0;\
+ jc->parse_buffer[0] = 0;\
+ } while (0)
+
+#define parse_buffer_pop_back_char(jc)\
+ do {\
+ assert(jc->parse_buffer_count >= 1);\
+ --jc->parse_buffer_count;\
+ jc->parse_buffer[jc->parse_buffer_count] = 0;\
+ } while (0)
+
+void delete_JSON_parser(JSON_parser jc)
+{
+/*
+    Release a parser object. A NULL argument is ignored. The mode stack and
+    the parse buffer are freed only when they no longer point at the arrays
+    embedded in the parser object itself.
+*/
+    if (jc == NULL) {
+        return;
+    }
+
+    if (jc->stack != jc->static_stack) {
+        free((void*)jc->stack);
+    }
+
+    if (jc->parse_buffer != jc->static_parse_buffer) {
+        free((void*)jc->parse_buffer);
+    }
+
+    free((void*)jc);
+}
+
+
+JSON_parser
+new_JSON_parser(JSON_config* config)
+{
+/*
+    new_JSON_parser starts the checking process by constructing a JSON_parser
+    object.
+
+    config may be NULL, in which case the defaults set by init_JSON_config
+    are used. config->depth restricts the level of maximum nesting: a
+    positive value is a hard limit, zero is treated as 1, and a negative
+    value requests a stack that grows on demand.
+
+    Returns the new parser, or NULL if memory could not be allocated.
+    Release it with delete_JSON_parser.
+
+    To continue the process, call JSON_parser_char for each character in the
+    JSON text, and then call JSON_parser_done to obtain the final result.
+    All parser state is kept in the returned object.
+*/
+
+    int depth = 0;
+    JSON_config default_config;
+
+    JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct));
+
+    if (jc == NULL) {
+        return NULL;
+    }
+
+    memset(jc, 0, sizeof(*jc));
+
+
+    /* initialize configuration */
+    init_JSON_config(&default_config);
+
+    /* set to default configuration if none was provided */
+    if (config == NULL) {
+        config = &default_config;
+    }
+
+    depth = config->depth;
+
+    /* We need to be able to push at least one object */
+    if (depth == 0) {
+        depth = 1;
+    }
+
+    jc->state = GO;
+    jc->top = -1;
+
+    /* Do we want non-bound stack? */
+    if (depth > 0) {
+        jc->stack_capacity = depth;
+        jc->depth = depth;
+        if (depth <= (int)COUNTOF(jc->static_stack)) {
+            jc->stack = &jc->static_stack[0];
+        } else {
+            jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0]));
+            if (jc->stack == NULL) {
+                /* nothing else has been allocated yet, so only the object itself is released */
+                free((void*)jc);
+                return NULL;
+            }
+        }
+    } else {
+        jc->stack_capacity = COUNTOF(jc->static_stack);
+        jc->depth = -1;
+        jc->stack = &jc->static_stack[0];
+    }
+
+    /* set parser to start */
+    push(jc, MODE_DONE);
+
+    /* set up the parse buffer */
+    jc->parse_buffer = &jc->static_parse_buffer[0];
+    jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer);
+    parse_buffer_clear(jc);
+
+    /* set up callback, comment & float handling */
+    jc->callback = config->callback;
+    jc->ctx = config->callback_ctx;
+    jc->allow_comments = config->allow_comments != 0;
+    jc->handle_floats_manually = config->handle_floats_manually != 0;
+
+    /* set up decimal point */
+    jc->decimal_point = *localeconv()->decimal_point;
+
+    return jc;
+}
+
+static int grow_parse_buffer(JSON_parser jc)
+{
+/*
+    Double the capacity of the parse buffer, moving it from the embedded
+    static_parse_buffer to the heap on first growth.
+
+    Returns true on success. Returns false if the allocation fails; the
+    existing buffer, its contents and its recorded capacity are then left
+    untouched.
+*/
+    const size_t new_capacity = jc->parse_buffer_capacity * 2;
+    const size_t bytes_to_allocate = new_capacity * sizeof(jc->parse_buffer[0]);
+    char* new_buffer;
+
+    if (jc->parse_buffer == &jc->static_parse_buffer[0]) {
+        new_buffer = (char*)malloc(bytes_to_allocate);
+        if (new_buffer == NULL) {
+            return false;
+        }
+        memcpy(new_buffer, jc->static_parse_buffer, jc->parse_buffer_count);
+    } else {
+        /* on failure realloc leaves the old block valid, so jc->parse_buffer stays usable */
+        new_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate);
+        if (new_buffer == NULL) {
+            return false;
+        }
+    }
+
+    jc->parse_buffer = new_buffer;
+    jc->parse_buffer_capacity = new_capacity;
+    return true;
+}
+
+/*
+    Append c to the parse buffer and keep it NUL-terminated, growing the
+    buffer first when there is no room for both. If growing fails, the
+    enclosing function returns false; this macro may therefore only be used
+    inside functions that return int.
+*/
+#define parse_buffer_push_back_char(jc, c)\
+    do {\
+        if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity && !grow_parse_buffer(jc)) return false;\
+        jc->parse_buffer[jc->parse_buffer_count++] = c;\
+        jc->parse_buffer[jc->parse_buffer_count] = 0;\
+    } while (0)
+
+#define assert_is_non_container_type(jc) \
+ assert( \
+ jc->type == JSON_T_NULL || \
+ jc->type == JSON_T_FALSE || \
+ jc->type == JSON_T_TRUE || \
+ jc->type == JSON_T_FLOAT || \
+ jc->type == JSON_T_INTEGER || \
+ jc->type == JSON_T_STRING)
+
+
+static int parse_parse_buffer(JSON_parser jc)
+{ /* Report the pending scalar value (if any) to the callback, then clear the parse buffer. Returns false only if the callback returns zero. */
+ if (jc->callback) {
+ JSON_value value, *arg = NULL;
+
+ if (jc->type != JSON_T_NONE) { /* JSON_T_NONE: no value is pending, nothing to report */
+ assert_is_non_container_type(jc);
+
+ switch(jc->type) { /* types without a case here (null, true, false) are reported with arg == NULL */
+ case JSON_T_FLOAT:
+ arg = &value;
+ if (jc->handle_floats_manually) { /* hand the raw text to the callback instead of converting it */
+ value.vu.str.value = jc->parse_buffer;
+ value.vu.str.length = jc->parse_buffer_count;
+ } else {
+ /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/
+
+ /* not checking with end pointer b/c there may be trailing ws */
+ value.vu.float_value = strtold(jc->parse_buffer, NULL);
+ }
+ break;
+ case JSON_T_INTEGER:
+ arg = &value;
+ sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); /* return value is not checked */
+ break;
+ case JSON_T_STRING:
+ arg = &value;
+ value.vu.str.value = jc->parse_buffer; /* points into the parse buffer, which is cleared below after the callback returns */
+ value.vu.str.length = jc->parse_buffer_count;
+ break;
+ }
+
+ if (!(*jc->callback)(jc->ctx, jc->type, arg)) {
+ return false; /* callback asked to stop; the buffer is left as it is */
+ }
+ }
+ }
+
+ parse_buffer_clear(jc);
+
+ return true;
+}
+
+#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
+#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00)
+#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
+static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+static int decode_unicode_char(JSON_parser jc)
+{ /* Replace the \uXXXX escape at the end of the parse buffer with its UTF-8 encoding. Returns false on an unpaired surrogate. */
+ int i;
+ unsigned uc = 0;
+ char* p;
+ int trail_bytes;
+
+ assert(jc->parse_buffer_count >= 6); /* backslash, 'u' and four hex digits */
+
+ p = &jc->parse_buffer[jc->parse_buffer_count - 4]; /* first of the four hex digits */
+
+ for (i = 12; i >= 0; i -= 4, ++p) { /* most significant nibble first */
+ unsigned x = *p;
+
+ if (x >= 'a') {
+ x -= ('a' - 10);
+ } else if (x >= 'A') {
+ x -= ('A' - 10);
+ } else {
+ x &= ~0x30u; /* '0'..'9' are 0x30..0x39; clearing these bits leaves 0..9 */
+ }
+
+ assert(x < 16);
+
+ uc |= x << i;
+ }
+
+ /* clear UTF-16 char from buffer */
+ jc->parse_buffer_count -= 6;
+ jc->parse_buffer[jc->parse_buffer_count] = 0;
+
+ /* attempt decoding ... */
+ if (jc->utf16_high_surrogate) {
+ if (IS_LOW_SURROGATE(uc)) {
+ uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc);
+ trail_bytes = 3; /* combined code point is encoded as four UTF-8 bytes */
+ jc->utf16_high_surrogate = 0;
+ } else {
+ /* high surrogate without a following low surrogate */
+ return false;
+ }
+ } else {
+ if (uc < 0x80) {
+ trail_bytes = 0;
+ } else if (uc < 0x800) {
+ trail_bytes = 1;
+ } else if (IS_HIGH_SURROGATE(uc)) {
+ /* save the high surrogate and wait for the low surrogate */
+ jc->utf16_high_surrogate = uc;
+ return true; /* nothing is appended to the buffer yet */
+ } else if (IS_LOW_SURROGATE(uc)) {
+ /* low surrogate without a preceding high surrogate */
+ return false;
+ } else {
+ trail_bytes = 2;
+ }
+ }
+
+ jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); /* lead byte */
+
+ for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { /* continuation bytes, six payload bits each */
+ jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80);
+ }
+
+ jc->parse_buffer[jc->parse_buffer_count] = 0;
+
+ return true;
+}
+
+static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char)
+{
+/*
+    Resolve the character that follows a backslash inside a string. The
+    backslash already stored in the parse buffer is dropped and replaced by
+    the character the escape stands for. For 'u' the two characters "\u" are
+    stored again so that the four hex digits that follow can be decoded
+    later. Returns false for an unknown escape character.
+*/
+    char unescaped;
+
+    jc->escaped = 0;
+
+    /* remove the backslash */
+    parse_buffer_pop_back_char(jc);
+
+    if (next_char == 'u') {
+        parse_buffer_push_back_char(jc, '\\');
+        parse_buffer_push_back_char(jc, 'u');
+        return true;
+    }
+
+    switch (next_char) {
+        case 'b':  unescaped = '\b'; break;
+        case 'f':  unescaped = '\f'; break;
+        case 'n':  unescaped = '\n'; break;
+        case 'r':  unescaped = '\r'; break;
+        case 't':  unescaped = '\t'; break;
+        case '"':  unescaped = '"';  break;
+        case '\\': unescaped = '\\'; break;
+        case '/':  unescaped = '/';  break;
+        default:
+            return false;
+    }
+
+    parse_buffer_push_back_char(jc, unescaped);
+    return true;
+}
+
+#define add_char_to_parse_buffer(jc, next_char, next_class) \
+ do { \
+ if (jc->escaped) { \
+ if (!add_escaped_char_to_parse_buffer(jc, next_char)) \
+ return false; \
+ } else if (!jc->comment) { \
+ if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \
+ parse_buffer_push_back_char(jc, (char)next_char); \
+ } \
+ } \
+ } while (0)
+
+
+#define assert_type_isnt_string_null_or_bool(jc) \
+ assert(jc->type != JSON_T_FALSE); \
+ assert(jc->type != JSON_T_TRUE); \
+ assert(jc->type != JSON_T_NULL); \
+ assert(jc->type != JSON_T_STRING)
+
+
+int
+JSON_parser_char(JSON_parser jc, int next_char)
+{
+/*
+ After calling new_JSON_parser, call this function for each character (or
+ partial character) in your JSON text. It can accept UTF-8, UTF-16, or
+ UTF-32. It returns true if things are looking ok so far. If it rejects the
+ text, it returns false.
+*/
+ int next_class, next_state;
+
+/*
+ Determine the character's class.
+*/
+ if (next_char < 0) {
+ return false;
+ }
+ if (next_char >= 128) {
+ next_class = C_ETC;
+ } else {
+ next_class = ascii_class[next_char];
+ if (next_class <= __) {
+ return false;
+ }
+ }
+
+ add_char_to_parse_buffer(jc, next_char, next_class);
+
+/*
+ Get the next state from the state transition table.
+*/
+ next_state = state_transition_table[jc->state][next_class];
+ if (next_state >= 0) {
+/*
+ Change the state.
+*/
+ jc->state = next_state;
+ } else {
+/*
+ Or perform one of the actions.
+*/
+ switch (next_state) {
+/* Unicode character */
+ case UC:
+ if(!decode_unicode_char(jc)) {
+ return false;
+ }
+ /* check if we need to read a second UTF-16 char */
+ if (jc->utf16_high_surrogate) {
+ jc->state = D1;
+ } else {
+ jc->state = ST;
+ }
+ break;
+/* escaped char */
+ case EX:
+ jc->escaped = 1;
+ jc->state = ES;
+ break;
+/* integer detected by minus */
+ case MX:
+ jc->type = JSON_T_INTEGER;
+ jc->state = MI;
+ break;
+/* integer detected by zero */
+ case ZX:
+ jc->type = JSON_T_INTEGER;
+ jc->state = ZE;
+ break;
+/* integer detected by 1-9 */
+ case IX:
+ jc->type = JSON_T_INTEGER;
+ jc->state = IT;
+ break;
+
+/* floating point number detected by exponent*/
+ case DE:
+ assert_type_isnt_string_null_or_bool(jc);
+ jc->type = JSON_T_FLOAT;
+ jc->state = E1;
+ break;
+
+/* floating point number detected by fraction */
+ case DF:
+ assert_type_isnt_string_null_or_bool(jc);
+ if (!jc->handle_floats_manually) {
+/*
+ Some versions of strtod (which underlies sscanf) don't support converting
+ C-locale formatted floating point values.
+*/
+ assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.');
+ jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point;
+ }
+ jc->type = JSON_T_FLOAT;
+ jc->state = FX;
+ break;
+/* string begin " */
+ case SB:
+ parse_buffer_clear(jc);
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_STRING;
+ jc->state = ST;
+ break;
+
+/* n */
+ case NU:
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_NULL;
+ jc->state = N1;
+ break;
+/* f */
+ case FA:
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_FALSE;
+ jc->state = F1;
+ break;
+/* t */
+ case TR:
+ assert(jc->type == JSON_T_NONE);
+ jc->type = JSON_T_TRUE;
+ jc->state = T1;
+ break;
+
+/* closing comment */
+ case CE:
+ jc->comment = 0;
+ assert(jc->parse_buffer_count == 0);
+ assert(jc->type == JSON_T_NONE);
+ jc->state = jc->before_comment_state;
+ break;
+
+/* opening comment */
+ case CB:
+ if (!jc->allow_comments) {
+ return false;
+ }
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ assert(jc->parse_buffer_count == 0);
+ assert(jc->type != JSON_T_STRING);
+ switch (jc->stack[jc->top]) {
+ case MODE_ARRAY:
+ case MODE_OBJECT:
+ switch(jc->state) {
+ case VA:
+ case AR:
+ jc->before_comment_state = jc->state;
+ break;
+ default:
+ jc->before_comment_state = OK;
+ break;
+ }
+ break;
+ default:
+ jc->before_comment_state = jc->state;
+ break;
+ }
+ jc->type = JSON_T_NONE;
+ jc->state = C1;
+ jc->comment = 1;
+ break;
+/* empty } */
+ case -9:
+ parse_buffer_clear(jc);
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) {
+ return false;
+ }
+ if (!pop(jc, MODE_KEY)) {
+ return false;
+ }
+ jc->state = OK;
+ break;
+
+/* } */ case -8:
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) {
+ return false;
+ }
+ if (!pop(jc, MODE_OBJECT)) {
+ return false;
+ }
+ jc->type = JSON_T_NONE;
+ jc->state = OK;
+ break;
+
+/* ] */ case -7:
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) {
+ return false;
+ }
+ if (!pop(jc, MODE_ARRAY)) {
+ return false;
+ }
+
+ jc->type = JSON_T_NONE;
+ jc->state = OK;
+ break;
+
+/* { */ case -6:
+ parse_buffer_pop_back_char(jc);
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) {
+ return false;
+ }
+ if (!push(jc, MODE_KEY)) {
+ return false;
+ }
+ assert(jc->type == JSON_T_NONE);
+ jc->state = OB;
+ break;
+
+/* [ */ case -5:
+ parse_buffer_pop_back_char(jc);
+ if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) {
+ return false;
+ }
+ if (!push(jc, MODE_ARRAY)) {
+ return false;
+ }
+ assert(jc->type == JSON_T_NONE);
+ jc->state = AR;
+ break;
+
+/* string end " */ case -4:
+ parse_buffer_pop_back_char(jc);
+ switch (jc->stack[jc->top]) {
+ case MODE_KEY:
+ assert(jc->type == JSON_T_STRING);
+ jc->type = JSON_T_NONE;
+ jc->state = CO;
+
+ if (jc->callback) {
+ JSON_value value;
+ value.vu.str.value = jc->parse_buffer;
+ value.vu.str.length = jc->parse_buffer_count;
+ if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) {
+ return false;
+ }
+ }
+ parse_buffer_clear(jc);
+ break;
+ case MODE_ARRAY:
+ case MODE_OBJECT:
+ assert(jc->type == JSON_T_STRING);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ jc->type = JSON_T_NONE;
+ jc->state = OK;
+ break;
+ default:
+ return false;
+ }
+ break;
+
+/* , */ case -3:
+ parse_buffer_pop_back_char(jc);
+ if (!parse_parse_buffer(jc)) {
+ return false;
+ }
+ switch (jc->stack[jc->top]) {
+ case MODE_OBJECT:
+/*
+ A comma causes a flip from object mode to key mode.
+*/
+ if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) {
+ return false;
+ }
+ assert(jc->type != JSON_T_STRING);
+ jc->type = JSON_T_NONE;
+ jc->state = KE;
+ break;
+ case MODE_ARRAY:
+ assert(jc->type != JSON_T_STRING);
+ jc->type = JSON_T_NONE;
+ jc->state = VA;
+ break;
+ default:
+ return false;
+ }
+ break;
+
+/* : */ case -2:
+/*
+ A colon causes a flip from key mode to object mode.
+*/
+ parse_buffer_pop_back_char(jc);
+ if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) {
+ return false;
+ }
+ assert(jc->type == JSON_T_NONE);
+ jc->state = VA;
+ break;
+/*
+ Bad action.
+*/
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
+
+int
+JSON_parser_done(JSON_parser jc)
+{
+/*
+    Report whether the text fed so far is a complete JSON text: the parser
+    must be in state OK and MODE_DONE must be on top of the mode stack
+    (it is popped when that is the case).
+*/
+    if (jc->state != OK) {
+        return false;
+    }
+
+    return pop(jc, MODE_DONE);
+}
+
+
+int JSON_parser_is_legal_white_space_string(const char* s)
+{
+/*
+    Return true if every character of s belongs to the C_SPACE or C_WHITE
+    class of ascii_class (an empty string qualifies). Return false for a
+    NULL pointer or for any character outside the 7-bit ASCII range.
+*/
+    const char* cursor;
+
+    if (s == NULL) {
+        return false;
+    }
+
+    for (cursor = s; *cursor != '\0'; ++cursor) {
+        const int code = *cursor;
+        int cls;
+
+        if (code < 0 || code >= 128) {
+            return false;
+        }
+
+        cls = ascii_class[code];
+
+        if (!(cls == C_SPACE || cls == C_WHITE)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+
+void init_JSON_config(JSON_config* config)
+{
+/*
+    Reset a configuration to its defaults: every field zeroed, except depth,
+    which is set to JSON_PARSER_STACK_SIZE - 1. A NULL pointer is ignored.
+*/
+    if (config == NULL) {
+        return;
+    }
+
+    memset(config, 0, sizeof(*config));
+    config->depth = JSON_PARSER_STACK_SIZE - 1;
+}
diff --git a/decoder/JSON_parser.h b/decoder/JSON_parser.h
new file mode 100644
index 00000000..ceb5b24b
--- /dev/null
+++ b/decoder/JSON_parser.h
@@ -0,0 +1,152 @@
+#ifndef JSON_PARSER_H
+#define JSON_PARSER_H
+
+/* JSON_parser.h */
+
+
+#include <stddef.h>
+
+/* Windows DLL stuff */
+#ifdef _WIN32
+# ifdef JSON_PARSER_DLL_EXPORTS
+# define JSON_PARSER_DLL_API __declspec(dllexport)
+# else
+# define JSON_PARSER_DLL_API __declspec(dllimport)
+# endif
+#else
+# define JSON_PARSER_DLL_API
+#endif
+
+ /* Determine the integer type used to parse non-floating point numbers */
+#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1
+typedef long long JSON_int_t;
+#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld"
+#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld"
+#else
+typedef long JSON_int_t;
+#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld"
+#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld"
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /* Event/value types handed to JSON_parser_callback through its `type`
+  * argument (JSON_T_NONE is never passed to the callback). The numeric
+  * value of each enumerator is noted beside it.
+  */
+ typedef enum
+ {
+ JSON_T_NONE = 0,
+ JSON_T_ARRAY_BEGIN, /* 1 */
+ JSON_T_ARRAY_END, /* 2 */
+ JSON_T_OBJECT_BEGIN, /* 3 */
+ JSON_T_OBJECT_END, /* 4 */
+ JSON_T_INTEGER, /* 5 */
+ JSON_T_FLOAT, /* 6 */
+ JSON_T_NULL, /* 7 */
+ JSON_T_TRUE, /* 8 */
+ JSON_T_FALSE, /* 9 */
+ JSON_T_STRING, /* 10 */
+ JSON_T_KEY, /* 11 */
+ JSON_T_MAX /* 12 (last enumerator; not a value type) */
+ } JSON_type;
+ /* Payload passed to JSON_parser_callback together with a JSON_type.
+  * The members share storage (union), so only one of them is meaningful
+  * for a given event.
+  */
+ typedef struct JSON_value_struct {
+ union {
+ JSON_int_t integer_value; /* presumably filled for JSON_T_INTEGER -- confirm in JSON_parser.c */
+
+ long double float_value; /* presumably filled for JSON_T_FLOAT -- confirm in JSON_parser.c */
+
+ struct {
+ const char* value; /* string data; documented below as zero-terminated */
+ size_t length; /* length of value as counted by the parser */
+ } str;
+ } vu;
+ } JSON_value;
+
+typedef struct JSON_parser_struct* JSON_parser;
+
+/*! \brief JSON parser callback
+
+ \param ctx The pointer passed to new_JSON_parser.
+ \param type An element of JSON_type but not JSON_T_NONE.
+ \param value A representation of the parsed value. This parameter is NULL for
+ JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END,
+ JSON_T_NULL, JSON_T_TRUE, and JSON_T_FALSE. String values are always returned
+ as zero-terminated C strings.
+
+ \return Non-zero if parsing should continue, else zero.
+*/
+typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value);
+
+
+/*! \brief The structure used to configure a JSON parser object
+
+ \param depth If negative, the parser can parse arbitrary levels of JSON, otherwise
+ the depth is the limit
+ \param Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity.
+ \param Callback context. This parameter may be NULL.
+ \param depth. Specifies the levels of nested JSON to allow. Negative numbers yield unlimited nesting.
+ \param allowComments. To allow C style comments in JSON, set to non-zero.
+ \param handleFloatsManually. To decode floating point numbers manually set this parameter to non-zero.
+
+ \return The parser object.
+*/
+typedef struct {
+ JSON_parser_callback callback;
+ void* callback_ctx;
+ int depth;
+ int allow_comments;
+ int handle_floats_manually;
+} JSON_config;
+
+
+/*! \brief Initializes the JSON parser configuration structure to default values.
+
+ The default configuration is
+ - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see JSON_parser.c)
+ - no parsing, just checking for JSON syntax
+ - no comments
+
+ \param config. Used to configure the parser.
+*/
+JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config);
+
+/*! \brief Create a JSON parser object
+
+ \param config. Used to configure the parser. Set to NULL to use the default configuration.
+ See init_JSON_config
+
+ \return The parser object.
+*/
+JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config);
+
+/*! \brief Destroy a previously created JSON parser object. */
+JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc);
+
+/*! \brief Parse a character.
+
+ \return Non-zero if all characters passed to this function so far form valid JSON.
+*/
+JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char);
+
+/*! \brief Finalize parsing.
+
+ Call this method once after all input characters have been consumed.
+
+ \return Non-zero, if all parsed characters are valid JSON, zero otherwise.
+*/
+JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc);
+
+/*! \brief Determine if a given string is valid JSON white space
+
+ \return Non-zero if the string is valid, zero otherwise.
+*/
+JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* JSON_PARSER_H */
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
new file mode 100644
index 00000000..a385197c
--- /dev/null
+++ b/decoder/Makefile.am
@@ -0,0 +1,84 @@
+ # The decoder executable (the only bin_ program, i.e. the only one installed).
+ bin_PROGRAMS = cdec
+
+ # Unit-test programs; noinst_ means they are built but never installed.
+ # They are only declared when the HAVE_GTEST conditional is true.
+ if HAVE_GTEST
+ noinst_PROGRAMS = \
+ dict_test \
+ weights_test \
+ trule_test \
+ hg_test \
+ ff_test \
+ logval_test \
+ parser_test \
+ grammar_test \
+ small_vector_test
+ endif
+
+ # Sources compiled directly into the cdec driver; everything else comes from libcdec.a.
+ cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc
+ # One <name>_SOURCES / <name>_LDADD pair per unit test. A per-program _LDADD
+ # replaces the global LDADD below, which is why libcdec.a is repeated here.
+ small_vector_test_SOURCES = small_vector_test.cc
+ small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ parser_test_SOURCES = parser_test.cc
+ parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ dict_test_SOURCES = dict_test.cc
+ dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ ff_test_SOURCES = ff_test.cc
+ ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ grammar_test_SOURCES = grammar_test.cc
+ grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ hg_test_SOURCES = hg_test.cc
+ hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ trule_test_SOURCES = trule_test.cc
+ trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ weights_test_SOURCES = weights_test.cc
+ weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+ logval_test_SOURCES = logval_test.cc
+ # NOTE(review): logval_test is the only test that does not link libcdec.a -- confirm this is intended.
+ logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+
+ # Default link input for programs without their own _LDADD (here: cdec).
+ LDADD = libcdec.a
+
+ AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS)
+ # NOTE(review): -lz names a library, not a linker flag; Automake expects
+ # libraries in LDADD/LIBS so that they follow the objects on the link line -- confirm.
+ AM_LDFLAGS = -lz
+
+ # Regenerate the rule lexer from its lex specification.
+ # NOTE(review): -s -CF -8 are presumably flex options (no default rule, full
+ # fast tables, 8-bit scanner) -- confirm against the flex manual. The recipe
+ # line below must start with a TAB in the real Makefile.am.
+ rule_lexer.cc: rule_lexer.l
+ $(LEX) -s -CF -8 -o$@ $<
+
+ # Static convenience library (not installed) linked into cdec and the tests.
+ noinst_LIBRARIES = libcdec.a
+
+ # All decoder sources except the driver files listed in cdec_SOURCES.
+ # rule_lexer.cc is generated from rule_lexer.l by the rule above;
+ # JSON_parser.c is the only C (not C++) source in the list.
+ libcdec_a_SOURCES = \
+ rule_lexer.cc \
+ fst_translator.cc \
+ csplit.cc \
+ translator.cc \
+ scfg_translator.cc \
+ hg.cc \
+ hg_io.cc \
+ hg_intersect.cc \
+ viterbi.cc \
+ lattice.cc \
+ aligner.cc \
+ gzstream.cc \
+ apply_models.cc \
+ earley_composer.cc \
+ phrasetable_fst.cc \
+ sparse_vector.cc \
+ trule.cc \
+ filelib.cc \
+ stringlib.cc \
+ fdict.cc \
+ tdict.cc \
+ weights.cc \
+ ttables.cc \
+ ff.cc \
+ ff_lm.cc \
+ ff_wordalign.cc \
+ ff_csplit.cc \
+ ff_tagger.cc \
+ tromble_loss.cc \
+ freqdict.cc \
+ lexalign.cc \
+ lextrans.cc \
+ tagger.cc \
+ bottom_up_parser.cc \
+ phrasebased_translator.cc \
+ JSON_parser.c \
+ json_parse.cc \
+ grammar.cc
diff --git a/decoder/aligner.cc b/decoder/aligner.cc
new file mode 100644
index 00000000..bad97b74
--- /dev/null
+++ b/decoder/aligner.cc
@@ -0,0 +1,319 @@
+#include "aligner.h"
+
+#include "array2d.h"
+#include "hg.h"
+#include "sentence_metadata.h"
+#include "inside_outside.h"
+#include "viterbi.h"
+#include <set>
+
+using namespace std;
+
+ // True iff x is one of the ASCII characters '0' through '9'.
+ static bool is_digit(char x) {
+   return !(x < '0' || x > '9');
+ }
+
+ // Parses a Pharaoh-format alignment ("0-0 1-2 ...") into a boolean grid.
+ // If the line contains " ||| ", only the text after the last such separator
+ // is parsed; parsing stops at the first '\n' or '\r'. The grid has
+ // dimensions (largest x + 1) by (largest y + 1) and cell (x, y) is set for
+ // every "x-y" point. A point whose two numbers are not joined by '-' makes
+ // the function print the offending line to stderr and abort().
+ boost::shared_ptr<Array2D<bool> > AlignerTools::ReadPharaohAlignmentGrid(const string& al) {
+   const size_t sep = al.rfind(" ||| ");
+   int cur = 0;
+   if (sep != string::npos) { cur = sep + 5; }
+
+   // Single parse: remember every point and track the largest coordinates,
+   // then size the grid and replay the points into it.
+   vector<pair<int, int> > points;
+   int widest = 0;
+   int tallest = 0;
+   while (cur < al.size() && al[cur] != '\n' && al[cur] != '\r') {
+     int src = 0;
+     for (; cur < al.size() && is_digit(al[cur]); ++cur)
+       src = src * 10 + (al[cur] - '0');
+     if (src > widest) widest = src;
+     assert(cur < al.size());
+     if (al[cur] != '-') {
+       cerr << "BAD ALIGNMENT: " << al << endl;
+       abort();
+     }
+     ++cur;
+     int trg = 0;
+     for (; cur < al.size() && is_digit(al[cur]); ++cur)
+       trg = trg * 10 + (al[cur] - '0');
+     if (trg > tallest) tallest = trg;
+     points.push_back(make_pair(src, trg));
+     while (cur < al.size() && al[cur] == ' ') ++cur;
+   }
+
+   boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(widest + 1, tallest + 1));
+   for (size_t p = 0; p < points.size(); ++p)
+     (*grid)(points[p].first, points[p].second) = true;
+   return grid;
+ }
+
+ // Writes every set cell of the grid to *out as "i-j", with single spaces
+ // between points (i walks the width in the outer loop, j the height in the
+ // inner loop), then ends the line with endl. An empty alignment produces
+ // just the line terminator.
+ void AlignerTools::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) {
+   ostream& os = *out;
+   bool wrote_any = false;
+   for (int col = 0; col < alignment.width(); ++col) {
+     for (int row = 0; row < alignment.height(); ++row) {
+       if (!alignment(col, row)) continue;
+       if (wrote_any) os << ' ';
+       wrote_any = true;
+       os << col << '-' << row;
+     }
+   }
+   os << endl;
+ }
+ /* Fills (*src_cov)[i] with the positions covered by the terminals of edge i,
+  * using the span indices stored on the edge (prev_i_ / prev_j_).
+  *   g       - hypergraph whose edges are inspected
+  *   src_cov - output; cleared and resized to g.edges_.size()
+  * Edges whose rule has no E or no F words, and edges with prev_i_ or i_
+  * equal to -1, are left with an empty set.
+  */
+ // used with lexical models since they may not fully generate the
+ // source string
+ void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g,
+ vector<set<int> >* src_cov) {
+ src_cov->clear();
+ src_cov->resize(g.edges_.size());
+
+ for (int i = 0; i < g.edges_.size(); ++i) {
+ const Hypergraph::Edge& edge = g.edges_[i];
+ set<int>& cov = (*src_cov)[i];
+ // no words
+ if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0)
+ continue;
+ // aligned to NULL (crf ibm variant only)
+ if (edge.prev_i_ == -1 || edge.i_ == -1)
+ continue;
+ assert(edge.j_ >= 0);
+ assert(edge.prev_j_ >= 0);
+ if (edge.Arity() == 0) { // no tail nodes: the whole [prev_i_, prev_j_) span is covered
+ for (int k = edge.prev_i_; k < edge.prev_j_; ++k)
+ cov.insert(k);
+ } else {
+ // note: this code, which handles mixed NT and terminal
+ // rules assumes that nodes uniquely define a src and trg
+ // span.
+ int k = edge.prev_i_; // current position inside the span
+ int j = 0; // current symbol inside the rule
+ const vector<WordID>& f = edge.rule_->e(); // rules are inverted
+ while (k < edge.prev_j_) {
+ if (f[j] > 0) { // positive symbol: treated as a terminal occupying position k
+ cov.insert(k);
+ // cerr << "src: " << k << endl;
+ ++k;
+ ++j;
+ } else { // non-positive symbol: -f[j] indexes into tail_nodes_
+ const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]];
+ assert(tailnode.in_edges_.size() > 0);
+ // any edge will do:
+ const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()];
+ //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span
+ k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span
+ ++j;
+ }
+ }
+ }
+ }
+ }
+ /* Recursive helper: walks the incoming edges of node_id and records, for
+  * each edge, the positions taken by the positive (terminal) symbols of
+  * rule_->e(), counting from span_start.
+  *   g          - hypergraph being walked
+  *   node_id    - node whose incoming edges are processed
+  *   span_start - position assigned to the first symbol of each edge
+  *   spans      - memo indexed by node id: position just past that node's
+  *                span, or a negative value if not computed yet
+  *   src_cov    - output, indexed by edge id
+  * Returns the position reached after the last incoming edge processed, or
+  * -1 when the node has no incoming edges.
+  * NOTE(review): the memo is keyed by node only, so this presumably assumes
+  * every node is reached with a single span_start -- confirm.
+  */
+ int SourceEdgeCoveragesUsingTree(const Hypergraph& g,
+ int node_id,
+ int span_start,
+ vector<int>* spans,
+ vector<set<int> >* src_cov) {
+ const Hypergraph::Node& node = g.nodes_[node_id];
+ int k = -1;
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const int edge_id = node.in_edges_[i];
+ const Hypergraph::Edge& edge = g.edges_[edge_id];
+ set<int>& cov = (*src_cov)[edge_id];
+ const vector<WordID>& f = edge.rule_->e(); // rules are inverted
+ int j = 0;
+ k = span_start; // every incoming edge restarts at the same position
+ while (j < f.size()) {
+ if (f[j] > 0) { // positive symbol: occupies position k
+ cov.insert(k);
+ ++k;
+ ++j;
+ } else { // non-positive symbol: -f[j] indexes into tail_nodes_
+ const int tail_node_id = edge.tail_nodes_[-f[j]];
+ int &right_edge = (*spans)[tail_node_id];
+ if (right_edge < 0) // not memoized yet: recurse and store the result
+ right_edge = SourceEdgeCoveragesUsingTree(g, tail_node_id, k, spans, src_cov);
+ k = right_edge; // continue after the sub-span
+ ++j;
+ }
+ }
+ }
+ return k;
+ }
+
+ // Entry point for the tree-based source coverage computation: resets
+ // *src_cov to one empty set per edge of g and starts the recursive walk at
+ // the last node of g, position 0, with every per-node memo entry set to -1.
+ void SourceEdgeCoveragesUsingTree(const Hypergraph& g,
+                                   vector<set<int> >* src_cov) {
+   src_cov->assign(g.edges_.size(), set<int>());
+   vector<int> right_edges(g.nodes_.size(), -1);
+   const int root = g.nodes_.size() - 1;
+   SourceEdgeCoveragesUsingTree(g, root, 0, &right_edges, src_cov);
+ }
+ /* Recursive helper mirroring SourceEdgeCoveragesUsingTree, but walking
+  * rule_->f() and taking tail nodes in order of appearance (ntc counter)
+  * instead of indexing them through the symbol value.
+  *   g          - hypergraph being walked
+  *   node_id    - node whose incoming edges are processed
+  *   span_start - position assigned to the first symbol of each edge
+  *   spans      - memo indexed by node id: position just past that node's
+  *                span, or a negative value if not computed yet
+  *   trg_cov    - output, indexed by edge id
+  * Returns the position reached after the last incoming edge processed, or
+  * -1 when the node has no incoming edges.
+  */
+ int TargetEdgeCoveragesUsingTree(const Hypergraph& g,
+ int node_id,
+ int span_start,
+ vector<int>* spans,
+ vector<set<int> >* trg_cov) {
+ const Hypergraph::Node& node = g.nodes_[node_id];
+ int k = -1;
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const int edge_id = node.in_edges_[i];
+ const Hypergraph::Edge& edge = g.edges_[edge_id];
+ set<int>& cov = (*trg_cov)[edge_id];
+ int ntc = 0; // number of non-positive symbols seen so far on this edge
+ const vector<WordID>& e = edge.rule_->f(); // rules are inverted
+ int j = 0;
+ k = span_start; // every incoming edge restarts at the same position
+ while (j < e.size()) {
+ if (e[j] > 0) { // positive symbol: occupies position k
+ cov.insert(k);
+ ++k;
+ ++j;
+ } else { // non-positive symbol: consumes the next tail node in order
+ const int tail_node_id = edge.tail_nodes_[ntc];
+ ++ntc;
+ int &right_edge = (*spans)[tail_node_id];
+ if (right_edge < 0) // not memoized yet: recurse and store the result
+ right_edge = TargetEdgeCoveragesUsingTree(g, tail_node_id, k, spans, trg_cov);
+ k = right_edge; // continue after the sub-span
+ ++j;
+ }
+ }
+ // cerr << "node=" << node_id << ": k=" << k << endl;
+ }
+ return k;
+ }
+
+ // Entry point for the tree-based target coverage computation: resets
+ // *trg_cov to one empty set per edge of g and starts the recursive walk at
+ // the last node of g, position 0, with every per-node memo entry set to -1.
+ void TargetEdgeCoveragesUsingTree(const Hypergraph& g,
+                                   vector<set<int> >* trg_cov) {
+   trg_cov->assign(g.edges_.size(), set<int>());
+   vector<int> right_edges(g.nodes_.size(), -1);
+   const int root = g.nodes_.size() - 1;
+   TargetEdgeCoveragesUsingTree(g, root, 0, &right_edges, trg_cov);
+ }
+
+ // Weight functor passed to InsideOutside: maps an edge to a sparse vector
+ // holding a single entry, keyed by the edge id, whose value is the edge
+ // probability.
+ struct TransitionEventWeightFunction {
+   inline SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const {
+     SparseVector<prob_t> indicator;
+     indicator.set_value(e.id_, e.edge_prob_);
+     return indicator;
+   }
+ };
+ /* Writes one line "<source> ||| <target> ||| <alignment points>" to *out
+  * for the derivations contained in in_g.
+  *   src_lattice, trg_lattice - input and reference; if either is not a plain
+  *                              sentence, MAP alignment is disabled and the
+  *                              Viterbi derivation is used instead
+  *   in_g                     - hypergraph to align; never modified
+  *   out                      - destination stream; when it is &cout the raw
+  *                              posterior matrix and grid also go to cerr
+  *   map_instead_of_viterbi   - use edge posteriors rather than the single
+  *                              Viterbi derivation
+  *   edges                    - if non-NULL, forwarded to
+  *                              CreateViterbiHypergraph to select the edges
+  */
+ // this code is rather complicated since it must deal with generating alignments
+ // when lattices are specified as input as well as with models that do not generate
+ // full sentence pairs (like lexical alignment models)
+ void AlignerTools::WriteAlignment(const Lattice& src_lattice,
+ const Lattice& trg_lattice,
+ const Hypergraph& in_g,
+ ostream* out,
+ bool map_instead_of_viterbi,
+ const vector<bool>* edges) {
+ bool fix_up_src_spans = false;
+ const Hypergraph* g = &in_g; // points at in_g, or at a temporary Viterbi copy created below
+ if (!src_lattice.IsSentence() ||
+ !trg_lattice.IsSentence()) {
+ if (map_instead_of_viterbi) {
+ cerr << " Lattice alignment: using Viterbi instead of MAP alignment\n";
+ }
+ map_instead_of_viterbi = false;
+ fix_up_src_spans = !src_lattice.IsSentence();
+ }
+ if (!map_instead_of_viterbi || edges) {
+ Hypergraph* new_hg = in_g.CreateViterbiHypergraph(edges); // owned here; released by the delete further down
+ for (int i = 0; i < new_hg->edges_.size(); ++i)
+ new_hg->edges_[i].edge_prob_ = prob_t::One();
+ g = new_hg;
+ }
+
+ vector<prob_t> edge_posteriors(g->edges_.size(), prob_t::Zero());
+ vector<WordID> trg_sent;
+ vector<WordID> src_sent;
+ if (fix_up_src_spans) { // lattice input: take the source words from the derivation
+ ViterbiESentence(*g, &src_sent);
+ } else { // sentence input: take the first arc label at each position
+ src_sent.resize(src_lattice.size());
+ for (int i = 0; i < src_sent.size(); ++i)
+ src_sent[i] = src_lattice[i][0].label;
+ }
+
+ ViterbiFSentence(*g, &trg_sent);
+
+ if (edges || !map_instead_of_viterbi) { // single derivation: every edge counts fully
+ for (int i = 0; i < edge_posteriors.size(); ++i)
+ edge_posteriors[i] = prob_t::One();
+ } else { // MAP: per-edge value from inside-outside divided by the returned z
+ SparseVector<prob_t> posts;
+ const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, TransitionEventWeightFunction>(*g, &posts);
+ for (int i = 0; i < edge_posteriors.size(); ++i)
+ edge_posteriors[i] = posts[i] / z;
+ }
+ vector<set<int> > src_cov(g->edges_.size());
+ vector<set<int> > trg_cov(g->edges_.size());
+ TargetEdgeCoveragesUsingTree(*g, &trg_cov);
+
+ if (fix_up_src_spans)
+ SourceEdgeCoveragesUsingTree(*g, &src_cov);
+ else
+ SourceEdgeCoveragesUsingParseIndices(*g, &src_cov);
+
+ // figure out the src and reference size;
+ int src_size = src_sent.size();
+ int ref_size = trg_sent.size();
+ Array2D<prob_t> align(src_size, ref_size, prob_t::Zero());
+ for (int c = 0; c < g->edges_.size(); ++c) { // add each edge's weight to every (src, trg) pair it covers
+ const prob_t& p = edge_posteriors[c];
+ const set<int>& srcs = src_cov[c];
+ const set<int>& trgs = trg_cov[c];
+ for (set<int>::const_iterator si = srcs.begin();
+ si != srcs.end(); ++si) {
+ for (set<int>::const_iterator ti = trgs.begin();
+ ti != trgs.end(); ++ti) {
+ align(*si, *ti) += p; // NOTE(review): no bounds check of *si / *ti against src_size / ref_size
+ }
+ }
+ }
+ if (g != &in_g) { delete g; g = NULL; } // free the temporary Viterbi hypergraph, if one was made
+
+ prob_t threshold(0.9); // overwritten for every column while use_soft_threshold is true
+ const bool use_soft_threshold = true; // TODO configure
+
+ Array2D<bool> grid(src_size, ref_size, false);
+ for (int j = 0; j < ref_size; ++j) {
+ if (use_soft_threshold) { // threshold becomes the maximum of column j
+ threshold = prob_t::Zero();
+ for (int i = 0; i < src_size; ++i)
+ if (align(i, j) > threshold) threshold = align(i, j);
+ //threshold *= prob_t(0.99);
+ }
+ for (int i = 0; i < src_size; ++i)
+ grid(i, j) = align(i, j) >= threshold;
+ }
+ if (out == &cout) {
+ // TODO need to do some sort of verbose flag
+ cerr << align << endl;
+ cerr << grid << endl;
+ }
+ (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| ";
+ SerializePharaohFormat(grid, out);
+ }; // NOTE(review): stray ';' after the function body
+
diff --git a/decoder/aligner.h b/decoder/aligner.h
new file mode 100644
index 00000000..cd159119
--- /dev/null
+++ b/decoder/aligner.h
@@ -0,0 +1,27 @@
+ #ifndef _ALIGNER_H_
+ #define _ALIGNER_H_
+
+ #include <cstddef>
+ #include <string>
+ #include <iostream>
+ #include <vector>
+ #include <boost/shared_ptr.hpp>
+ #include "array2d.h"
+ #include "lattice.h"
+
+ class Hypergraph;
+ class SentenceMetadata;
+
+ // Static helpers for reading, writing and extracting word alignments.
+ struct AlignerTools {
+   // Parses a Pharaoh-format alignment line ("0-0 1-2 ...") into a grid in
+   // which cell (x, y) is true for every "x-y" point.
+   static boost::shared_ptr<Array2D<bool> > ReadPharaohAlignmentGrid(const std::string& al);
+
+   // Writes every true cell (i, j) of the grid to *out as "i-j", separated by
+   // spaces, followed by a line terminator.
+   static void SerializePharaohFormat(const Array2D<bool>& alignment, std::ostream* out);
+
+   // assumption: g contains derivations of input/ref and
+   // ONLY input/ref.
+   // if edges is non-NULL, the alignment corresponding to the edge rules will be written
+   static void WriteAlignment(const Lattice& src,
+                              const Lattice& ref,
+                              const Hypergraph& g,
+                              std::ostream* out,
+                              bool map_instead_of_viterbi = true,
+                              const std::vector<bool>* edges = NULL);
+ };
+
+ #endif
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
new file mode 100644
index 00000000..2908005f
--- /dev/null
+++ b/decoder/apply_models.cc
@@ -0,0 +1,426 @@
+#include "apply_models.h"
+
+#include <vector>
+#include <algorithm>
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+
+#include <boost/functional/hash.hpp>
+
+#include "hg.h"
+#include "ff.h"
+
+using namespace std;
+using namespace std::tr1;
+
+struct Candidate;
+typedef SmallVector JVector;
+typedef vector<Candidate*> CandidateHeap;
+typedef vector<Candidate*> CandidateList;
+
+// default vector size (* sizeof string is memory used)
+static const size_t kRESERVE_NUM_NODES = 500000ul;
+
+ // life cycle: candidates are created, placed on the heap
+ // and retrieved by their estimated cost, when they're
+ // retrieved, they're incorporated into the +LM hypergraph
+ // where they also know the head node index they are
+ // attached to. After they are added to the +LM hypergraph
+ // vit_prob_ and est_prob_ fields may be updated as better
+ // derivations are found (this happens since the successor's
+ // of derivation d may have a better score- they are
+ // explored lazily). However, the updates don't happen
+ // when a candidate is in the heap so maintaining the heap
+ // property is not an issue.
+ struct Candidate {
+   int node_index_;                   // -1 until incorporated
+                                      // into the +LM forest
+   const Hypergraph::Edge* in_edge_;  // in -LM forest
+   Hypergraph::Edge out_edge_;        // edge to be added to the +LM forest
+   string state_;                     // state string produced by the models for this item
+   const JVector j_;                  // for each tail, index into that tail node's candidate list
+   prob_t vit_prob_;                  // these are fixed until the cand
+                                      // is popped, then they may be updated
+   prob_t est_prob_;
+
+   // Builds a fully scored candidate for edge e with antecedent choice j.
+   Candidate(const Hypergraph::Edge& e,
+             const JVector& j,
+             const Hypergraph& out_hg,
+             const vector<CandidateList>& D,
+             const vector<string>& node_states,
+             const SentenceMetadata& smeta,
+             const ModelSet& models,
+             bool is_goal) :
+       node_index_(-1),
+       in_edge_(&e),
+       j_(j) {
+     InitializeCandidate(out_hg, smeta, D, node_states, models, is_goal);
+   }
+
+   // used to query uniqueness
+   // (only in_edge_ and j_ are meaningful; node_index_ is still set to -1 so
+   // that IsIncorporatedIntoHypergraph() and operator<< never read an
+   // indeterminate value)
+   Candidate(const Hypergraph::Edge& e,
+             const JVector& j) : node_index_(-1), in_edge_(&e), j_(j) {}
+
+   bool IsIncorporatedIntoHypergraph() const {
+     return node_index_ >= 0;
+   }
+
+   // Copies the -LM edge into out_edge_, points its tails at the +LM nodes
+   // selected by j_, lets the models add their features, and derives
+   // vit_prob_ (edge prob times antecedent vit probs) and est_prob_
+   // (vit_prob_ times the models' estimate).
+   void InitializeCandidate(const Hypergraph& out_hg,
+                            const SentenceMetadata& smeta,
+                            const vector<vector<Candidate*> >& D,
+                            const vector<string>& node_states,
+                            const ModelSet& models,
+                            const bool is_goal) {
+     const Hypergraph::Edge& in_edge = *in_edge_;
+     out_edge_.rule_ = in_edge.rule_;
+     out_edge_.feature_values_ = in_edge.feature_values_;
+     out_edge_.i_ = in_edge.i_;
+     out_edge_.j_ = in_edge.j_;
+     out_edge_.prev_i_ = in_edge.prev_i_;
+     out_edge_.prev_j_ = in_edge.prev_j_;
+     Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_;
+     tail.resize(j_.size());
+     prob_t p = prob_t::One();
+     // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl;
+     for (int i = 0; i < tail.size(); ++i) {
+       // antecedent chosen for tail i: the j_[i]-th candidate of that tail node
+       const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]];
+       assert(ant.IsIncorporatedIntoHypergraph());
+       tail[i] = ant.node_index_;
+       p *= ant.vit_prob_;
+     }
+     prob_t edge_estimate = prob_t::One();
+     if (is_goal) {
+       assert(tail.size() == 1);
+       const string& ant_state = node_states[tail.front()];
+       models.AddFinalFeatures(ant_state, &out_edge_);
+     } else {
+       models.AddFeaturesToEdge(smeta, out_hg, node_states, &out_edge_, &state_, &edge_estimate);
+     }
+     vit_prob_ = out_edge_.edge_prob_ * p;
+     est_prob_ = vit_prob_ * edge_estimate;
+   }
+ };
+
+ // Debug printer for a candidate: shows whether it is still pending or which
+ // +LM node it was attached to, the id of its -LM edge, its j vector, and the
+ // logs of its Viterbi and estimated probabilities.
+ ostream& operator<<(ostream& os, const Candidate& cand) {
+   os << "CAND[";
+   if (cand.IsIncorporatedIntoHypergraph())
+     os << "+LM_node=" << cand.node_index_;
+   else
+     os << "PENDING ";
+   os << " edge=" << cand.in_edge_->id_ << " j=<";
+   for (int dim = 0; dim < cand.j_.size(); ++dim) {
+     if (dim != 0) os << " ";
+     os << cand.j_[dim];
+   }
+   os << "> vit=" << log(cand.vit_prob_) << " est=" << log(cand.est_prob_);
+   os << ']';
+   return os;
+ }
+
+ // Heap ordering on est_prob_: with the std heap algorithms this keeps the
+ // candidate with the largest estimate at the front.
+ struct HeapCandCompare {
+   bool operator()(const Candidate* lhs, const Candidate* rhs) const {
+     const bool lhs_is_worse = lhs->est_prob_ < rhs->est_prob_;
+     return lhs_is_worse;
+   }
+ };
+
+ // Sort predicate producing descending est_prob_ order (best candidate first).
+ struct EstProbSorter {
+   bool operator()(const Candidate* lhs, const Candidate* rhs) const {
+     const bool lhs_is_better = lhs->est_prob_ > rhs->est_prob_;
+     return lhs_is_better;
+   }
+ };
+
+ // The same <edge, j> pair can be reached along several paths when j has
+ // more than one dimension (going NW in Manhattan you can go north then
+ // west, or west then north). This hash covers exactly the fields that
+ // identify such a pair -- the id of the -LM edge and every entry of j_ --
+ // so duplicates can be detected with a hash set.
+ struct CandidateUniquenessHash {
+   size_t operator()(const Candidate* c) const {
+     size_t h = 5381;
+     h = (h * 33) ^ c->in_edge_->id_;
+     const int dims = c->j_.size();
+     for (int d = 0; d < dims; ++d)
+       h = (h * 33) ^ c->j_[d];
+     return h;
+   }
+ };
+
+ // Equality matching CandidateUniquenessHash: two candidates are the same
+ // when they refer to the same -LM edge object and carry equal j vectors.
+ struct CandidateUniquenessEquals {
+   bool operator()(const Candidate* a, const Candidate* b) const {
+     if (a->in_edge_ != b->in_edge_) return false;
+     return a->j_ == b->j_;
+   }
+ };
+
+typedef unordered_set<const Candidate*, CandidateUniquenessHash, CandidateUniquenessEquals> UniqueCandidateSet;
+typedef unordered_map<string, Candidate*, boost::hash<string> > State2Node;
+
+ // Applies a ModelSet to a -LM forest with cube pruning: for every node of
+ // the input forest (visited in index order) at most pop_limit candidates are
+ // popped from a best-first heap and incorporated into the output (+LM)
+ // forest, merging candidates that share the same model state.
+ class CubePruningRescorer {
+
+ public:
+   // m, sm, i and *o are held by reference and must outlive the rescorer.
+   CubePruningRescorer(const ModelSet& m,
+                       const SentenceMetadata& sm,
+                       const Hypergraph& i,
+                       int pop_limit,
+                       Hypergraph* o) :
+       models(m),
+       smeta(sm),
+       in(i),
+       out(*o),
+       D(in.nodes_.size()),
+       pop_limit_(pop_limit) {
+     cerr << " Applying feature functions (cube pruning, pop_limit = " << pop_limit_ << ')' << endl;
+     node_states_.reserve(kRESERVE_NUM_NODES);
+   }
+
+   // Rescores the whole forest; the last node of `in` is treated as the goal.
+   // Afterwards everything unreachable from the best goal node is pruned from
+   // `out` and all candidates are freed.
+   void Apply() {
+     int num_nodes = in.nodes_.size();
+     int goal_id = num_nodes - 1;
+     int pregoal = goal_id - 1;
+     int every = 1;
+     if (num_nodes > 100) every = 10;
+     assert(in.nodes_[pregoal].out_edges_.size() == 1);
+     cerr << " ";
+     for (int i = 0; i < in.nodes_.size(); ++i) {
+       if (i % every == 0) cerr << '.';
+       KBest(i, i == goal_id);
+     }
+     cerr << endl;
+     cerr << " Best path: " << log(D[goal_id].front()->vit_prob_)
+          << "\t" << log(D[goal_id].front()->est_prob_) << endl;
+     out.PruneUnreachable(D[goal_id].front()->node_index_);
+     FreeAll();
+   }
+
+  private:
+   // Deletes every candidate still referenced from D and empties D.
+   void FreeAll() {
+     for (int i = 0; i < D.size(); ++i) {
+       CandidateList& D_i = D[i];
+       for (int j = 0; j < D_i.size(); ++j)
+         delete D_i[j];
+     }
+     D.clear();
+   }
+
+   // Adds item's edge to the output forest and attaches it to the node that
+   // represents item's state, creating that node on first use. If another
+   // candidate already represents the state, item goes on the freelist.
+   void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) {
+     Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_);
+     new_edge->feature_values_ = item->out_edge_.feature_values_;
+     new_edge->edge_prob_ = item->out_edge_.edge_prob_;
+     new_edge->i_ = item->out_edge_.i_;
+     new_edge->j_ = item->out_edge_.j_;
+     new_edge->prev_i_ = item->out_edge_.prev_i_;
+     new_edge->prev_j_ = item->out_edge_.prev_j_;
+     Candidate*& o_item = (*s2n)[item->state_];
+     if (!o_item) o_item = item;
+
+     int& node_id = o_item->node_index_;
+     if (node_id < 0) {
+       Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_);
+       node_states_.push_back(item->state_);
+       node_id = new_node->id_;
+     }
+     Hypergraph::Node* node = &out.nodes_[node_id];
+     out.ConnectEdgeToHeadNode(new_edge, node);
+
+     // update candidate if we have a better derivation
+     // note: the difference between the vit score and the estimated
+     // score is the same for all items with a common residual DP
+     // state
+     if (item->vit_prob_ > o_item->vit_prob_) {
+       assert(o_item->state_ == item->state_);  // sanity check!
+       o_item->est_prob_ = item->est_prob_;
+       o_item->vit_prob_ = item->vit_prob_;
+     }
+     if (item != o_item) freelist->push_back(item);
+   }
+
+   // Expands input node vert_index: seeds the heap with the first candidate
+   // of every incoming edge, pops up to pop_limit_ candidates, and stores the
+   // surviving per-state candidates in D[vert_index], best estimate first.
+   void KBest(const int vert_index, const bool is_goal) {
+     // cerr << "KBest(" << vert_index << ")\n";
+     CandidateList& D_v = D[vert_index];
+     assert(D_v.empty());
+     const Hypergraph::Node& v = in.nodes_[vert_index];
+     // cerr << " has " << v.in_edges_.size() << " in-coming edges\n";
+     const vector<int>& in_edges = v.in_edges_;
+     CandidateHeap cand;
+     CandidateList freelist;
+     cand.reserve(in_edges.size());
+     UniqueCandidateSet unique_cands;
+     for (int i = 0; i < in_edges.size(); ++i) {
+       const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
+       const JVector j(edge.tail_nodes_.size(), 0);
+       cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
+       // The insertion must happen outside assert(): with NDEBUG the assert
+       // expression is not evaluated and the set would stay empty.
+       const bool inserted = unique_cands.insert(cand.back()).second;
+       assert(inserted);  // these should all be unique!
+       (void) inserted;
+     }
+     // cerr << " making heap of " << cand.size() << " candidates\n";
+     make_heap(cand.begin(), cand.end(), HeapCandCompare());
+     State2Node state2node;   // "buf" in Figure 2
+     int pops = 0;
+     while(!cand.empty() && pops < pop_limit_) {
+       pop_heap(cand.begin(), cand.end(), HeapCandCompare());
+       Candidate* item = cand.back();
+       cand.pop_back();
+       // cerr << "POPPED: " << *item << endl;
+       PushSucc(*item, is_goal, &cand, &unique_cands);
+       IncorporateIntoPlusLMForest(item, &state2node, &freelist);
+       ++pops;
+     }
+     D_v.resize(state2node.size());
+     int c = 0;
+     for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i)
+       D_v[c++] = i->second;
+     sort(D_v.begin(), D_v.end(), EstProbSorter());
+     // cerr << " expanded to " << D_v.size() << " nodes\n";
+
+     // candidates that were never popped
+     for (int i = 0; i < cand.size(); ++i)
+       delete cand[i];
+     // freelist is necessary since even after an item merged, it still stays in
+     // the unique set so it can't be deleted til now
+     for (int i = 0; i < freelist.size(); ++i)
+       delete freelist[i];
+   }
+
+   // Pushes the neighbours of item (one step further along each dimension of
+   // j) onto the heap, skipping <edge, j> pairs that were already created.
+   void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) {
+     CandidateHeap& cand = *pcand;
+     for (int i = 0; i < item.j_.size(); ++i) {
+       JVector j = item.j_;
+       ++j[i];
+       if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) {
+         Candidate query_unique(*item.in_edge_, j);
+         if (cs->count(&query_unique) == 0) {
+           Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
+           cand.push_back(new_cand);
+           push_heap(cand.begin(), cand.end(), HeapCandCompare());
+           // insert into uniqueness set; done outside assert() so that it
+           // also happens in NDEBUG builds
+           const bool inserted = cs->insert(new_cand).second;
+           assert(inserted);  // sanity check
+           (void) inserted;
+         }
+       }
+     }
+   }
+
+   const ModelSet& models;
+   const SentenceMetadata& smeta;
+   const Hypergraph& in;
+   Hypergraph& out;
+
+   vector<CandidateList> D;      // maps nodes in in-HG to the
+                                 // equivalent nodes (many due to state
+                                 // splits) in the out-HG.
+   vector<string> node_states_;  // for each node in the out-HG what is
+                                 // its q function value?
+   const int pop_limit_;
+ };
+ /* Applies a ModelSet to a forest without pruning: for every edge of the
+  * input forest, one output edge is created for every combination of output
+  * nodes of its tails, and output head nodes are shared between edges that
+  * produce the same model state string.
+  */
+ struct NoPruningRescorer {
+ NoPruningRescorer(const ModelSet& m, const SentenceMetadata &sm, const Hypergraph& i, Hypergraph* o) :
+ models(m),
+ smeta(sm),
+ in(i),
+ out(*o),
+ nodemap(i.nodes_.size()) {
+ cerr << " Rescoring forest (full intersection)\n";
+ node_states_.reserve(kRESERVE_NUM_NODES);
+ }
+
+ typedef unordered_map<string, int, boost::hash<string> > State2NodeIndex; // state string -> (out node id + 1)
+
+ void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, State2NodeIndex* state2node) {
+ const int arity = in_edge.Arity();
+ Hypergraph::TailNodeVector ends(arity); // number of out-HG nodes available for each tail
+ for (int i = 0; i < arity; ++i)
+ ends[i] = nodemap[in_edge.tail_nodes_[i]].size();
+
+ Hypergraph::TailNodeVector tail_iter(arity, 0); // odometer over all tail combinations
+ bool done = false;
+ while (!done) {
+ Hypergraph::TailNodeVector tail(arity);
+ for (int i = 0; i < arity; ++i)
+ tail[i] = nodemap[in_edge.tail_nodes_[i]][tail_iter[i]];
+ Hypergraph::Edge* new_edge = out.AddEdge(in_edge.rule_, tail);
+ new_edge->feature_values_ = in_edge.feature_values_;
+ new_edge->i_ = in_edge.i_;
+ new_edge->j_ = in_edge.j_;
+ new_edge->prev_i_ = in_edge.prev_i_;
+ new_edge->prev_j_ = in_edge.prev_j_;
+ string head_state; // stays empty for goal edges
+ if (is_goal) {
+ assert(tail.size() == 1);
+ const string& ant_state = node_states_[tail.front()];
+ models.AddFinalFeatures(ant_state, new_edge);
+ } else {
+ prob_t edge_estimate; // this is a full intersection, so we disregard this
+ models.AddFeaturesToEdge(smeta, out, node_states_, new_edge, &head_state, &edge_estimate);
+ }
+ int& head_plus1 = (*state2node)[head_state]; // 0 (the default-inserted value) means "no node yet"
+ if (!head_plus1) {
+ head_plus1 = out.AddNode(in_edge.rule_->GetLHS())->id_ + 1;
+ node_states_.push_back(head_state);
+ nodemap[in_edge.head_node_].push_back(head_plus1 - 1);
+ }
+ const int head_index = head_plus1 - 1;
+ out.ConnectEdgeToHeadNode(new_edge->id_, head_index);
+
+ int ii = 0;
+ for (; ii < arity; ++ii) { // advance the odometer, carrying into the next tail on wrap-around
+ ++tail_iter[ii];
+ if (tail_iter[ii] < ends[ii]) break;
+ tail_iter[ii] = 0;
+ }
+ done = (ii == arity); // every position wrapped: all combinations visited
+ }
+ }
+
+ void ProcessOneNode(const int node_num, const bool is_goal) {
+ State2NodeIndex state2node; // fresh per input node: states are only merged within one node
+ const Hypergraph::Node& node = in.nodes_[node_num];
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const Hypergraph::Edge& edge = in.edges_[node.in_edges_[i]];
+ ExpandEdge(edge, is_goal, &state2node);
+ }
+ }
+
+ void Apply() {
+ int num_nodes = in.nodes_.size();
+ int goal_id = num_nodes - 1; // the last input node is treated as the goal
+ int pregoal = goal_id - 1;
+ int every = 1; // progress dot frequency
+ if (num_nodes > 100) every = 10;
+ assert(in.nodes_[pregoal].out_edges_.size() == 1);
+ cerr << " ";
+ for (int i = 0; i < in.nodes_.size(); ++i) {
+ if (i % every == 0) cerr << '.';
+ ProcessOneNode(i, i == goal_id);
+ }
+ cerr << endl;
+ }
+
+ private:
+ const ModelSet& models;
+ const SentenceMetadata& smeta;
+ const Hypergraph& in;
+ Hypergraph& out;
+
+ vector<vector<int> > nodemap; // in-HG node id -> ids of the out-HG nodes created for it
+ vector<string> node_states_; // for each node in the out-HG what is
+ // its q function value?
+ };
+
+ // Applies the feature functions in `models` to the forest `in` and writes
+ // the rescored forest to *out.
+ //   config.algorithm == 1: cube pruning with config.pop_limit pops per node
+ //                          (lowered to 30 when the limit exceeds 100 and the
+ //                          forest has more than 80000 nodes)
+ //   config.algorithm == 0: full intersection, no pruning
+ // Any other algorithm value prints an error and exits the process.
+ void ApplyModelSet(const Hypergraph& in,
+                    const SentenceMetadata& smeta,
+                    const ModelSet& models,
+                    const IntersectionConfiguration& config,
+                    Hypergraph* out) {
+   // TODO special handling when all models are stateless
+   if (config.algorithm == 1) {
+     int pl = config.pop_limit;
+     if (pl > 100 && in.nodes_.size() > 80000) {
+       // lower the limit first so the message reports the value actually used
+       pl = 30;
+       cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n";
+     }
+     CubePruningRescorer ma(models, smeta, in, pl, out);
+     ma.Apply();
+   } else if (config.algorithm == 0) {
+     NoPruningRescorer ma(models, smeta, in, out);
+     ma.Apply();
+   } else {
+     cerr << "Don't understand intersection algorithm " << config.algorithm << endl;
+     exit(1);
+   }
+   out->is_linear_chain_ = in.is_linear_chain_;  // TODO remove when this is computed
+                                                 // automatically
+ }
+
diff --git a/decoder/apply_models.h b/decoder/apply_models.h
new file mode 100644
index 00000000..d6d8b34a
--- /dev/null
+++ b/decoder/apply_models.h
@@ -0,0 +1,20 @@
+#ifndef _APPLY_MODELS_H_
+#define _APPLY_MODELS_H_
+
+struct ModelSet;
+struct Hypergraph;
+struct SentenceMetadata;
+
+ // Immutable settings for ApplyModelSet: which rescoring algorithm to run
+ // and how aggressively to prune.
+ struct IntersectionConfiguration {
+   // 0 = full intersection, 1 = cube pruning
+   const int algorithm;
+   // max number of pops off the heap at each node
+   const int pop_limit;
+
+   IntersectionConfiguration(int alg, int k)
+       : algorithm(alg),
+         pop_limit(k) {}
+ };
+
+void ApplyModelSet(const Hypergraph& in,
+ const SentenceMetadata& smeta,
+ const ModelSet& models,
+ const IntersectionConfiguration& config,
+ Hypergraph* out);
+
+#endif
diff --git a/decoder/array2d.h b/decoder/array2d.h
new file mode 100644
index 00000000..e63eda0d
--- /dev/null
+++ b/decoder/array2d.h
@@ -0,0 +1,172 @@
+#ifndef ARRAY2D_H_
+#define ARRAY2D_H_
+
+#include <iostream>
+#include <algorithm>
+#include <cassert>
+#include <vector>
+#include <string>
+
+// Dense two-dimensional array backed by a single std::vector<T>.
+// Element (i, j) is stored at linear index i + j * width(), with
+// 0 <= i < width() and 0 <= j < height().
+template<typename T>
+class Array2D {
+ public:
+  typedef typename std::vector<T>::reference reference;
+  typedef typename std::vector<T>::const_reference const_reference;
+  typedef typename std::vector<T>::iterator iterator;
+  typedef typename std::vector<T>::const_iterator const_iterator;
+  Array2D() : width_(0), height_(0) {}
+  // Creates a w x h array with every element initialized to d.
+  Array2D(int w, int h, const T& d = T()) :
+    width_(w), height_(h), data_(w*h, d) {}
+  Array2D(const Array2D& rhs) :
+    width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {}
+  bool empty() const { return data_.empty(); }
+  // Resizes the backing vector to w*h elements; new slots are copies of d.
+  // Existing elements keep their linear index, not their (i, j) position.
+  void resize(int w, int h, const T& d = T()) {
+    data_.resize(w * h, d);
+    width_ = w;
+    height_ = h;
+  }
+  const Array2D& operator=(const Array2D& rhs) {
+    data_ = rhs.data_;
+    width_ = rhs.width_;
+    height_ = rhs.height_;
+    return *this;
+  }
+  // Overwrites every element with v; dimensions are unchanged.
+  void fill(const T& v) { data_.assign(data_.size(), v); }
+  int width() const { return width_; }
+  int height() const { return height_; }
+  reference operator()(int i, int j) {
+    return data_[offset(i, j)];
+  }
+  void clear() { data_.clear(); width_=0; height_=0; }
+  const_reference operator()(int i, int j) const {
+    return data_[offset(i, j)];
+  }
+  // [begin_col(j), end_col(j)) covers the width() consecutive elements
+  // (0, j) ... (width()-1, j).
+  iterator begin_col(int j) {
+    return data_.begin() + offset(0,j);
+  }
+  const_iterator begin_col(int j) const {
+    return data_.begin() + offset(0,j);
+  }
+  iterator end_col(int j) {
+    return data_.begin() + offset(0,j) + width_;
+  }
+  const_iterator end_col(int j) const {
+    return data_.begin() + offset(0,j) + width_;
+  }
+  iterator end() { return data_.end(); }
+  const_iterator end() const { return data_.end(); }
+
+  // Element-wise compound operators. Each returns *this (the original
+  // versions fell off the end of a non-void function, which is undefined
+  // behaviour). Plain loops are used instead of std::transform with
+  // std::bind2nd so the header needs neither <functional> nor the binders
+  // that were removed in C++17.
+  const Array2D<T>& operator*=(const T& x) {
+    for (iterator it = data_.begin(); it != data_.end(); ++it)
+      *it = *it * x;
+    return *this;
+  }
+  const Array2D<T>& operator/=(const T& x) {
+    for (iterator it = data_.begin(); it != data_.end(); ++it)
+      *it = *it / x;
+    return *this;
+  }
+  // Adds m element by element; m must have the same dimensions.
+  const Array2D<T>& operator+=(const Array2D<T>& m) {
+    assert(m.width_ == width_ && m.height_ == height_);
+    iterator dst = data_.begin();
+    for (const_iterator src = m.data_.begin(); src != m.data_.end(); ++src, ++dst)
+      *dst = *dst + *src;
+    return *this;
+  }
+  // Subtracts m element by element; m must have the same dimensions.
+  // The operands are (*this) - m; the previous std::transform call computed
+  // m - (*this).
+  const Array2D<T>& operator-=(const Array2D<T>& m) {
+    assert(m.width_ == width_ && m.height_ == height_);
+    iterator dst = data_.begin();
+    for (const_iterator src = m.data_.begin(); src != m.data_.end(); ++src, ++dst)
+      *dst = *dst - *src;
+    return *this;
+  }
+
+ private:
+  // Maps (i, j) to its index in data_; both coordinates are range-checked in
+  // debug builds.
+  inline int offset(int i, int j) const {
+    assert(i >= 0 && i < width_);
+    assert(j >= 0 && j < height_);
+    return i + j * width_;
+  }
+
+  int width_;
+  int height_;
+
+  std::vector<T> data_;
+};
+
+// Array * scalar: copies `l` and applies Array2D::operator*= with `scalar`.
+template <typename T>
+Array2D<T> operator*(const Array2D<T>& l, const T& scalar) {
+  Array2D<T> scaled(l);
+  scaled *= scalar;
+  return scaled;
+}
+
+// Scalar * array: copies `l` and applies Array2D::operator*= with `scalar`.
+template <typename T>
+Array2D<T> operator*(const T& scalar, const Array2D<T>& l) {
+  Array2D<T> scaled(l);
+  scaled *= scalar;
+  return scaled;
+}
+
+// Array / scalar: copies `l` and applies Array2D::operator/= with `scalar`.
+template <typename T>
+Array2D<T> operator/(const Array2D<T>& l, const T& scalar) {
+  Array2D<T> quotient(l);
+  quotient /= scalar;
+  return quotient;
+}
+
+// Array + array: copies `l` and applies Array2D::operator+= with `r`.
+template <typename T>
+Array2D<T> operator+(const Array2D<T>& l, const Array2D<T>& r) {
+  Array2D<T> sum(l);
+  sum += r;
+  return sum;
+}
+
+// Array - array: copies `l` and applies Array2D::operator-= with `r`.
+template <typename T>
+Array2D<T> operator-(const Array2D<T>& l, const Array2D<T>& r) {
+  Array2D<T> diff(l);
+  diff -= r;
+  return diff;
+}
+
+// Generic printer: one output line per first index i, with each element
+// m(i, j) preceded by a tab.
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const Array2D<T>& m) {
+  const int n_first = m.width();
+  const int n_second = m.height();
+  for (int a = 0; a < n_first; ++a) {
+    for (int b = 0; b < n_second; ++b) {
+      os << '\t' << m(a, b);
+    }
+    os << '\n';
+  }
+  return os;
+}
+
+// Printer for boolean grids: '*' for true and '.' for false, framed by the
+// last decimal digit of each index (ruler line above and below, index digit
+// at both ends of every line).
+inline std::ostream& operator<<(std::ostream& os, const Array2D<bool>& m) {
+  const int n_first = m.width();
+  const int n_second = m.height();
+
+  // top ruler
+  os << ' ';
+  for (int b = 0; b < n_second; ++b) {
+    os << (b%10);
+  }
+  os << "\n";
+
+  for (int a = 0; a < n_first; ++a) {
+    os << (a%10);
+    for (int b = 0; b < n_second; ++b) {
+      if (m(a, b))
+        os << '*';
+      else
+        os << '.';
+    }
+    os << (a%10) << "\n";
+  }
+
+  // bottom ruler
+  os << ' ';
+  for (int b = 0; b < n_second; ++b) {
+    os << (b%10);
+  }
+  os << "\n";
+  return os;
+}
+
+// Printer for grids of bit vectors: every cell is written as a run of '*'
+// (true) and '.' (false). The ruler lines put a tab after each index digit;
+// each body line ends with a single tab followed by its index digit.
+inline std::ostream& operator<<(std::ostream& os, const Array2D<std::vector<bool> >& m) {
+  const int n_first = m.width();
+  const int n_second = m.height();
+
+  // top ruler
+  os << ' ';
+  for (int b = 0; b < n_second; ++b) {
+    os << (b%10) << "\t";
+  }
+  os << "\n";
+
+  for (int a = 0; a < n_first; ++a) {
+    os << (a%10);
+    for (int b = 0; b < n_second; ++b) {
+      const std::vector<bool>& bits = m(a, b);
+      for (std::vector<bool>::size_type k = 0; k < bits.size(); ++k) {
+        os << (bits[k] ? '*' : '.');
+      }
+    }
+    os << "\t";
+    os << (a%10) << "\n";
+  }
+
+  // bottom ruler
+  os << ' ';
+  for (int b = 0; b < n_second; ++b) {
+    os << (b%10) << "\t";
+  }
+  os << "\n";
+  return os;
+}
+
+#endif
+
diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc
new file mode 100644
index 00000000..dd54a606
--- /dev/null
+++ b/decoder/bottom_up_parser.cc
@@ -0,0 +1,302 @@
+#include "bottom_up_parser.h"
+
+#include <iostream>
+#include <map>
+
+#include "hg.h"
+#include "array2d.h"
+#include "tdict.h"
+
+using namespace std;
+
+// Counters for the number of active and passive items created by the parser.
+struct ParserStats {
+  int active_items;
+  int passive_items;
+
+  ParserStats() : active_items(0), passive_items(0) {}
+
+  // Sets both counters back to zero.
+  void Reset() {
+    active_items = 0;
+    passive_items = 0;
+  }
+
+  // Writes both counters to stderr.
+  void Report() {
+    cerr << " ACTIVE ITEMS: " << active_items << "\tPASSIVE ITEMS: " << passive_items << endl;
+  }
+
+  // The span arguments are accepted but not used by the counters.
+  void NotifyActive(int, int) { active_items += 1; }
+  void NotifyPassive(int, int) { passive_items += 1; }
+};
+
+// File-wide statistics object updated by the charts below.
+ParserStats stats;
+
+class ActiveChart;
+// Chart of completed ("passive") items. chart_(i,j) holds the ids of the
+// forest nodes derived for cell (i,j); Parse() fills the chart and adds the
+// corresponding nodes and edges to the output forest.
+// NOTE(review): the class owns the ActiveChart objects in act_chart_ (deleted
+// in the destructor) but does not disable copying.
+class PassiveChart {
+ public:
+ // Sizes the charts to (input.size()+1) x (input.size()+1) and allocates one
+ // ActiveChart per grammar. `forest` receives the nodes and edges.
+ PassiveChart(const string& goal,
+ const vector<GrammarPtr>& grammars,
+ const Lattice& input,
+ Hypergraph* forest);
+ ~PassiveChart();
+
+ // Node ids stored in cell (i,j).
+ inline const vector<int>& operator()(int i, int j) const { return chart_(i,j); }
+ // Runs the parser; returns GoalFound().
+ bool Parse();
+ inline int size() const { return chart_.width(); }
+ inline bool GoalFound() const { return goal_idx_ >= 0; }
+ inline int GetGoalIndex() const { return goal_idx_; }
+
+ private:
+ // Calls ApplyRule for every rule in `rules`.
+ void ApplyRules(const int i,
+ const int j,
+ const RuleBin* rules,
+ const Hypergraph::TailNodeVector& tail,
+ const float lattice_cost);
+
+ // Adds one edge for rule `r` over (i,j) and connects it to the node for the
+ // rule's LHS in that cell, creating the node if it does not exist yet.
+ void ApplyRule(const int i,
+ const int j,
+ const TRulePtr& r,
+ const Hypergraph::TailNodeVector& ant_nodes,
+ const float lattice_cost);
+
+ // Applies unary rules to the nodes already stored in cell (i,j).
+ void ApplyUnaryRules(const int i, const int j);
+
+ const vector<GrammarPtr>& grammars_;
+ const Lattice& input_;
+ Hypergraph* forest_;
+ Array2D<vector<int> > chart_; // chart_(i,j) is the list of nodes derived spanning i,j
+ typedef map<int, int> Cat2NodeMap;
+ // nodemap_(i,j) maps a rule LHS category to the id of its node in cell (i,j)
+ Array2D<Cat2NodeMap> nodemap_;
+ // one heap-allocated ActiveChart per grammar, same order as grammars_
+ vector<ActiveChart*> act_chart_;
+ const WordID goal_cat_; // category that is being searched for at [0,n]
+ TRulePtr goal_rule_;
+ int goal_idx_; // index of goal node, if found
+ // feature id for "LatticeCost"
+ const int lc_fid_;
+
+ static WordID kGOAL; // [Goal]
+};
+
+// Set to TD::Convert("Goal") * -1 by the first PassiveChart constructed.
+WordID PassiveChart::kGOAL = 0;
+
+// Chart of active items for one grammar. Each cell (i,j) holds ActiveItems:
+// a position in the grammar (GrammarIter), the tail nodes collected so far,
+// and the lattice cost accumulated so far.
+class ActiveChart {
+ public:
+ // The chart is square, with the same side length as the passive chart.
+ ActiveChart(const Hypergraph* hg, const PassiveChart& psv_chart) :
+ hg_(hg),
+ act_chart_(psv_chart.size(), psv_chart.size()), psv_chart_(psv_chart) {}
+
+ struct ActiveItem {
+ ActiveItem(const GrammarIter* g, const Hypergraph::TailNodeVector& a, float lcost) :
+ gptr_(g), ant_nodes_(a), lattice_cost(lcost) {}
+ // Item with no tail nodes and zero lattice cost.
+ explicit ActiveItem(const GrammarIter* g) :
+ gptr_(g), ant_nodes_(), lattice_cost(0.0) {}
+
+ // If the grammar position can be extended with `symbol`, appends to
+ // *out_cell a new item with the same tail nodes and src_cost added to the
+ // lattice cost; otherwise does nothing.
+ void ExtendTerminal(int symbol, float src_cost, vector<ActiveItem>* out_cell) const {
+ const GrammarIter* ni = gptr_->Extend(symbol);
+ if (ni) {
+ stats.NotifyActive(-1,-1); // TRACKING STATS
+ out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost));
+ }
+ }
+ // If the grammar position can be extended with the category of forest node
+ // `node_index`, appends to *out_cell a new item whose tail nodes are this
+ // item's tail nodes followed by node_index; the lattice cost is unchanged.
+ void ExtendNonTerminal(const Hypergraph* hg, int node_index, vector<ActiveItem>* out_cell) const {
+ int symbol = hg->nodes_[node_index].cat_;
+ const GrammarIter* ni = gptr_->Extend(symbol);
+ if (!ni) return;
+ stats.NotifyActive(-1,-1); // TRACKING STATS
+ Hypergraph::TailNodeVector na(ant_nodes_.size() + 1);
+ for (int i = 0; i < ant_nodes_.size(); ++i)
+ na[i] = ant_nodes_[i];
+ na[ant_nodes_.size()] = node_index;
+ out_cell->push_back(ActiveItem(ni, na, lattice_cost));
+ }
+
+ const GrammarIter* gptr_;
+ Hypergraph::TailNodeVector ant_nodes_;
+ float lattice_cost; // TODO? use SparseVector<double>
+ };
+
+ inline const vector<ActiveItem>& operator()(int i, int j) const { return act_chart_(i,j); }
+ // Puts an item at the grammar root into every diagonal cell (i,i) for which
+ // g.HasRuleForSpan(i,i,0) holds.
+ void SeedActiveChart(const Grammar& g) {
+ int size = act_chart_.width();
+ for (int i = 0; i < size; ++i)
+ if (g.HasRuleForSpan(i,i,0))
+ act_chart_(i,i).push_back(ActiveItem(g.GetRoot()));
+ }
+
+ // Extends every item in cell (i,k) with every passive node in cell (k,j);
+ // the resulting items are appended to cell (i,j).
+ void ExtendActiveItems(int i, int k, int j) {
+ //cerr << " LOOK(" << i << "," << k << ") for completed items in (" << k << "," << j << ")\n";
+ vector<ActiveItem>& cell = act_chart_(i,j);
+ const vector<ActiveItem>& icell = act_chart_(i,k);
+ const vector<int>& idxs = psv_chart_(k, j);
+ //if (!idxs.empty()) { cerr << "FOUND IN (" << k << "," << j << ")\n"; }
+ for (vector<ActiveItem>::const_iterator di = icell.begin(); di != icell.end(); ++di) {
+ for (vector<int>::const_iterator ni = idxs.begin(); ni != idxs.end(); ++ni) {
+ di->ExtendNonTerminal(hg_, *ni, &cell);
+ }
+ }
+ }
+
+ // Step 1: for every split point k with i < k < j, extend items in (i,k)
+ // with passive nodes in (k,j). Step 2: for every arc in input[j-1], extend
+ // the items in (i,j-1) with the arc's label and cost; results go to cell
+ // (i, j + dist2next - 1).
+ void AdvanceDotsForAllItemsInCell(int i, int j, const vector<vector<LatticeArc> >& input) {
+ //cerr << "ADVANCE(" << i << "," << j << ")\n";
+ for (int k=i+1; k < j; ++k)
+ ExtendActiveItems(i, k, j);
+
+ const vector<LatticeArc>& out_arcs = input[j-1];
+ for (vector<LatticeArc>::const_iterator ai = out_arcs.begin();
+ ai != out_arcs.end(); ++ai) {
+ const WordID& f = ai->label;
+ const double& c = ai->cost;
+ const int& len = ai->dist2next;
+ //VLOG(1) << "F: " << TD::Convert(f) << endl;
+ const vector<ActiveItem>& ec = act_chart_(i, j-1);
+ for (vector<ActiveItem>::const_iterator di = ec.begin(); di != ec.end(); ++di)
+ di->ExtendTerminal(f, c, &act_chart_(i, j + len - 1));
+ }
+ }
+
+ private:
+ const Hypergraph* hg_;
+ Array2D<vector<ActiveItem> > act_chart_;
+ const PassiveChart& psv_chart_;
+};
+
+// Sets up empty charts sized to the input, builds the rule that rewrites the
+// goal category to [Goal], and creates one ActiveChart per grammar.
+PassiveChart::PassiveChart(const string& goal,
+                           const vector<GrammarPtr>& grammars,
+                           const Lattice& input,
+                           Hypergraph* forest) :
+    grammars_(grammars),
+    input_(input),
+    forest_(forest),
+    chart_(input.size()+1, input.size()+1),
+    nodemap_(input.size()+1, input.size()+1),
+    goal_cat_(TD::Convert(goal) * -1),
+    goal_rule_(new TRule("[Goal] ||| [" + goal + ",1] ||| [" + goal + ",1]")),
+    goal_idx_(-1),
+    lc_fid_(FD::Convert("LatticeCost")) {
+  // one slot per grammar, each filled with its own active chart
+  act_chart_.resize(grammars_.size());
+  for (vector<ActiveChart*>::iterator slot = act_chart_.begin();
+       slot != act_chart_.end(); ++slot) {
+    *slot = new ActiveChart(forest, *this);
+  }
+  // kGOAL is shared by all charts and is looked up only once
+  if (kGOAL == 0) {
+    kGOAL = TD::Convert("Goal") * -1;
+  }
+  cerr << " Goal category: [" << goal << ']' << endl;
+}
+
+// Adds an edge for rule `r` over (i,j) with tail nodes `ant_nodes`, then
+// attaches it to the node for the rule's LHS in that cell. A missing node is
+// created first; a new [Goal] node is remembered in goal_idx_, any other new
+// node is appended to chart_(i,j).
+void PassiveChart::ApplyRule(const int i,
+                             const int j,
+                             const TRulePtr& r,
+                             const Hypergraph::TailNodeVector& ant_nodes,
+                             const float lattice_cost) {
+  stats.NotifyPassive(i,j);  // TRACKING STATS
+
+  Hypergraph::Edge* edge = forest_->AddEdge(r, ant_nodes);
+  edge->prev_i_ = r->prev_i;
+  edge->prev_j_ = r->prev_j;
+  edge->i_ = i;
+  edge->j_ = j;
+  edge->feature_values_ = r->GetFeatureValues();
+  if (lattice_cost) {
+    edge->feature_values_.set_value(lc_fid_, lattice_cost);
+  }
+
+  Cat2NodeMap& cats_in_cell = nodemap_(i,j);
+  const bool lhs_is_goal = (r->GetLHS() == kGOAL);
+  const Cat2NodeMap::iterator known = cats_in_cell.find(r->GetLHS());
+  Hypergraph::Node* head = NULL;
+  if (known != cats_in_cell.end()) {
+    // the category already has a node in this cell: reuse it
+    head = &forest_->nodes_[known->second];
+  } else {
+    head = forest_->AddNode(r->GetLHS());
+    cats_in_cell[r->GetLHS()] = head->id_;
+    if (!lhs_is_goal) {
+      chart_(i,j).push_back(head->id_);
+    } else {
+      assert(goal_idx_ == -1);
+      goal_idx_ = head->id_;
+    }
+  }
+  forest_->ConnectEdgeToHeadNode(edge, head);
+}
+
+// Applies each rule stored in `rules`, in index order, over (i,j) with the
+// given tail nodes and lattice cost.
+void PassiveChart::ApplyRules(const int i,
+                              const int j,
+                              const RuleBin* rules,
+                              const Hypergraph::TailNodeVector& tail,
+                              const float lattice_cost) {
+  const int rule_count = rules->GetNumRules();
+  int idx = 0;
+  while (idx < rule_count) {
+    ApplyRule(i, j, rules->GetIthRule(idx), tail, lattice_cost);
+    ++idx;
+  }
+}
+
+// For every grammar that has rules for this span, applies each unary rule
+// whose RHS is the category of a node in chart_(i,j).
+// ApplyRule() can append new node ids to chart_(i,j) while this runs. Because
+// `nodes` is a reference and nodes.size() is re-read on every iteration, the
+// nodes added during the loop are visited as well.
+void PassiveChart::ApplyUnaryRules(const int i, const int j) {
+ const vector<int>& nodes = chart_(i,j); // reference is important!
+ for (int gi = 0; gi < grammars_.size(); ++gi) {
+ if (!grammars_[gi]->HasRuleForSpan(i,j,input_.Distance(i,j))) continue;
+ for (int di = 0; di < nodes.size(); ++di) {
+ const WordID& cat = forest_->nodes_[nodes[di]].cat_;
+ const vector<TRulePtr>& unaries = grammars_[gi]->GetUnaryRulesForRHS(cat);
+ for (int ri = 0; ri < unaries.size(); ++ri) {
+ // cerr << "At (" << i << "," << j << "): applying " << unaries[ri]->AsString() << endl;
+ // single tail node; unary rules are applied with lattice cost 0
+ const Hypergraph::TailNodeVector ant(1, nodes[di]);
+ ApplyRule(i, j, unaries[ri], ant, 0); // may update nodes
+ }
+ }
+ }
+}
+
+// Fills the charts bottom-up, by increasing length l = j - i, and builds the
+// forest. Returns true iff a [Goal] node was created. Progress is written to
+// stderr as one '.' per length.
+bool PassiveChart::Parse() {
+ // pre-size the forest storage based on the input length
+ forest_->nodes_.reserve(input_.size() * input_.size() * 2);
+ forest_->edges_.reserve(input_.size() * input_.size() * 1000); // TODO: reservation??
+ goal_idx_ = -1;
+ for (int gi = 0; gi < grammars_.size(); ++gi)
+ act_chart_[gi]->SeedActiveChart(*grammars_[gi]);
+
+ cerr << " ";
+ for (int l=1; l<input_.size()+1; ++l) {
+ cerr << '.';
+ for (int i=0; i<input_.size() + 1 - l; ++i) {
+ int j = i + l;
+ // 1) advance active items into this cell and apply the rules of every
+ // item whose grammar position has rules attached
+ for (int gi = 0; gi < grammars_.size(); ++gi) {
+ const Grammar& g = *grammars_[gi];
+ if (g.HasRuleForSpan(i, j, input_.Distance(i, j))) {
+ act_chart_[gi]->AdvanceDotsForAllItemsInCell(i, j, input_);
+
+ const vector<ActiveChart::ActiveItem>& cell = (*act_chart_[gi])(i,j);
+ for (vector<ActiveChart::ActiveItem>::const_iterator ai = cell.begin();
+ ai != cell.end(); ++ai) {
+ const RuleBin* rules = (ai->gptr_->GetRules());
+ if (!rules) continue;
+ ApplyRules(i, j, rules, ai->ant_nodes_, ai->lattice_cost);
+ }
+ }
+ }
+ // 2) unary rules over the nodes just added to this cell
+ ApplyUnaryRules(i,j);
+
+ // 3) extend the seed items in (i,i) with the nodes now in (i,j)
+ for (int gi = 0; gi < grammars_.size(); ++gi) {
+ const Grammar& g = *grammars_[gi];
+ // deal with non-terminals that were just proved
+ if (g.HasRuleForSpan(i, j, input_.Distance(i,j)))
+ act_chart_[gi]->ExtendActiveItems(i, i, j);
+ }
+ }
+ // Goal check: every node of category goal_cat_ in cell (0, input size)
+ // gets a goal_rule_ edge above it.
+ // NOTE(review): this scan runs once per length l; presumably the cell is
+ // only populated on the final iteration -- confirm.
+ const vector<int>& dh = chart_(0, input_.size());
+ for (int di = 0; di < dh.size(); ++di) {
+ const Hypergraph::Node& node = forest_->nodes_[dh[di]];
+ if (node.cat_ == goal_cat_) {
+ Hypergraph::TailNodeVector ant(1, node.id_);
+ ApplyRule(0, input_.size(), goal_rule_, ant, 0);
+ }
+ }
+ }
+ cerr << endl;
+
+ // NOTE(review): pruning starts from the last node in the forest rather than
+ // from goal_idx_; this assumes the goal node was added last -- TODO confirm.
+ if (GoalFound())
+ forest_->PruneUnreachable(forest_->nodes_.size() - 1);
+ return GoalFound();
+}
+
+// Frees the ActiveChart objects allocated by the constructor.
+PassiveChart::~PassiveChart() {
+  for (vector<ActiveChart*>::iterator it = act_chart_.begin();
+       it != act_chart_.end(); ++it) {
+    delete *it;
+  }
+}
+
+// Stores copies of the goal symbol and of the grammar list for later Parse()
+// calls.
+ExhaustiveBottomUpParser::ExhaustiveBottomUpParser(
+ const string& goal_sym,
+ const vector<GrammarPtr>& grammars) :
+ goal_sym_(goal_sym),
+ grammars_(grammars) {}
+
+// Parses `input` into `forest` with a fresh PassiveChart. The global parser
+// statistics are reset before the parse and reported afterwards. Returns
+// whatever PassiveChart::Parse() returns.
+bool ExhaustiveBottomUpParser::Parse(const Lattice& input,
+                                     Hypergraph* forest) const {
+  stats.Reset();
+  PassiveChart passive(goal_sym_, grammars_, input, forest);
+  const bool goal_reached = passive.Parse();
+  stats.Report();
+  return goal_reached;
+}
diff --git a/decoder/bottom_up_parser.h b/decoder/bottom_up_parser.h
new file mode 100644
index 00000000..546bfb54
--- /dev/null
+++ b/decoder/bottom_up_parser.h
@@ -0,0 +1,27 @@
+#ifndef _BOTTOM_UP_PARSER_H_
+#define _BOTTOM_UP_PARSER_H_
+
+#include <vector>
+#include <string>
+
+#include "lattice.h"
+#include "grammar.h"
+
+class Hypergraph;
+
+// Bottom-up chart parser over a lattice for a fixed goal symbol and a fixed
+// list of grammars (both copied at construction).
+class ExhaustiveBottomUpParser {
+ public:
+ ExhaustiveBottomUpParser(const std::string& goal_sym,
+ const std::vector<GrammarPtr>& grammars);
+
+ // returns true if goal reached spanning the full input
+ // forest contains the full (i.e., unpruned) parse forest
+ bool Parse(const Lattice& input,
+ Hypergraph* forest) const;
+
+ private:
+ const std::string goal_sym_;
+ const std::vector<GrammarPtr> grammars_;
+};
+
+#endif
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
new file mode 100644
index 00000000..dbf32cb3
--- /dev/null
+++ b/decoder/cdec.cc
@@ -0,0 +1,592 @@
+#include <iostream>
+#include <fstream>
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "timing_stats.h"
+#include "translator.h"
+#include "phrasebased_translator.h"
+#include "aligner.h"
+#include "stringlib.h"
+#include "forest_writer.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "sampler.h"
+#include "sparse_vector.h"
+#include "tagger.h"
+#include "lextrans.h"
+#include "lexalign.h"
+#include "csplit.h"
+#include "weights.h"
+#include "tdict.h"
+#include "ff.h"
+#include "ff_factory.h"
+#include "hg_intersect.h"
+#include "apply_models.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "inside_outside.h"
+#include "exp_semiring.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+using namespace std::tr1;
+using boost::shared_ptr;
+namespace po = boost::program_options;
+
+// some globals ...
+// Shared random number generator; main() allocates it only when
+// --max_translation_sample is given.
+boost::shared_ptr<RandomNumberGenerator<boost::mt19937> > rng;
+static const double kMINUS_EPSILON = -1e-6; // don't be too strict
+
+// Declared here, defined in other translation units.
+namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); }
+namespace NgramCache { void Clear(); }
+
+// Writes the program name/version banner to stderr.
+void ShowBanner() {
+  static const char kBanner[] = "cdec v1.0 (c) 2009-2010 by Chris Dyer\n";
+  cerr << kBanner;
+}
+
+// Copies every (feature id, value) entry of `src` into `*trg` via set_value;
+// entries already in `*trg` under other ids are left alone.
+void ConvertSV(const SparseVector<prob_t>& src, SparseVector<double>* trg) {
+  SparseVector<prob_t>::const_iterator entry = src.begin();
+  while (entry != src.end()) {
+    trg->set_value(entry->first, entry->second);
+    ++entry;
+  }
+}
+
+// Parses the command line (and, if --config is given, the configuration file)
+// into `*conf`.
+// Terminates the process with exit(1) when:
+//   * the configuration file cannot be opened,
+//   * --list_feature_functions is given (after printing the list),
+//   * --help is given or --formalism is missing (after printing usage),
+//   * --formalism names an unknown formalism.
+// Values given on the command line are stored first; the configuration file
+// is stored second.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("formalism,f",po::value<string>(),"Decoding formalism; values include SCFG, FST, PB, LexTrans (lexical translation model, also disc training), CSplit (compound splitting), Tagger (sequence labeling), LexAlign (alignment only, or EM training)")
+        ("input,i",po::value<string>()->default_value("-"),"Source file")
+        ("grammar,g",po::value<vector<string> >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)")
+        ("weights,w",po::value<string>(),"Feature weights file")
+        ("no_freeze_feature_set,Z", "Do not freeze feature set after reading feature weights file")
+        ("feature_function,F",po::value<vector<string> >()->composing(), "Additional feature function(s) (-L for list)")
+        ("list_feature_functions,L","List available feature functions")
+        ("add_pass_through_rules,P","Add rules to translate OOV words as themselves")
+        ("k_best,k",po::value<int>(),"Extract the k best derivations")
+        ("unique_k_best,r", "Unique k-best translation list")
+        ("aligner,a", "Run as a word/phrase aligner (src & ref required)")
+        ("intersection_strategy,I",po::value<string>()->default_value("cube_pruning"), "Intersection strategy for incorporating finite-state features; values include Cube_pruning, Full")
+        ("cubepruning_pop_limit,K",po::value<int>()->default_value(200), "Max number of pops from the candidate heap at each node")
+        ("goal",po::value<string>()->default_value("S"),"Goal symbol (SCFG & FST)")
+        ("scfg_extra_glue_grammar", po::value<string>(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)")
+        ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. by default the SCFG decoder adds Hiero glue rules)")
+        ("scfg_default_nt,d",po::value<string>()->default_value("X"),"Default non-terminal symbol in SCFG")
+        ("scfg_max_span_limit,S",po::value<int>()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)")
+        ("show_tree_structure", "Show the Viterbi derivation structure")
+        ("show_expected_length", "Show the expected translation length under the model")
+        ("show_partition,z", "Compute and show the partition (inside score)")
+        ("show_cfg_search_space", "Show the search space as a CFG")
+        ("beam_prune", po::value<double>(), "Prune paths from +LM forest")
+        ("lexalign_use_null", "Support source-side null words in lexical translation")
+        ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
+        ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
+        ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
+        ("extract_rules", po::value<string>(), "Extract the rules used in translation (de-duped) to this file")
+        ("graphviz","Show (constrained) translation forest in GraphViz format")
+        ("max_translation_beam,x", po::value<int>(), "Beam approximation to get max translation from the chart")
+        ("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
+        ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
+        ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
+        ("crf_uniform_empirical", "If there are multple references use (i.e., lattice) a uniform distribution rather than posterior weighting a la EM")
+        ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
+        ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
+        ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
+        ("forest_output,O",po::value<string>(),"Directory to write forests to")
+        ("minimal_forests,m","Write minimal forests (excludes Rule information). Such forests can be used for ML/MAP training, but not rescoring, etc.");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config,c", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  // The config file accepts only `opts`; the command line also accepts `clo`.
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    const string cfg = (*conf)["config"].as<string>();
+    cerr << "Configuration file: " << cfg << endl;
+    ifstream config(cfg.c_str());
+    // An unreadable file used to be parsed as an empty configuration, which
+    // silently dropped every setting the user expected to be applied.
+    if (!config) {
+      cerr << "Error: could not open configuration file " << cfg << endl;
+      exit(1);
+    }
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("list_feature_functions")) {
+    cerr << "Available feature functions (specify with -F):\n";
+    global_ff_registry->DisplayList();
+    cerr << endl;
+    exit(1);
+  }
+
+  if (conf->count("help") || conf->count("formalism") == 0) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+
+  const string formalism = LowercaseString((*conf)["formalism"].as<string>());
+  if (formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign") {
+    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lextrans', 'lexalign', or 'tagger'\n";
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// TODO move out of cdec into some sampling decoder file
+// Samples one incoming edge of node `n` from ss[n], recursively samples a
+// yield for each tail node of that edge (in tail order), and substitutes those
+// yields into the edge's rule to produce `*out`.
+void SampleRecurse(const Hypergraph& hg, const vector<SampleSet>& ss, int n, vector<WordID>* out) {
+  const SampleSet& choices = ss[n];
+  int picked = rng->SelectSample(choices);
+  const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[picked]];
+
+  // Both vectors are sized up front, so the addresses taken below stay valid.
+  vector<vector<WordID> > child_yields(edge.tail_nodes_.size());
+  vector<const vector<WordID>*> child_ptrs(child_yields.size());
+  for (int t = 0; t < child_yields.size(); ++t) {
+    SampleRecurse(hg, ss, edge.tail_nodes_[t], &child_yields[t]);
+    child_ptrs[t] = &child_yields[t];
+  }
+  edge.rule_->ESubstitute(child_ptrs, out);
+}
+
+// Orders (count, translation) pairs by descending count; the string is not
+// compared.
+struct SampleSort {
+  bool operator()(const pair<int,string>& lhs, const pair<int,string>& rhs) const {
+    return rhs.first < lhs.first;
+  }
+};
+
+// TODO move out of cdec into some sampling decoder file
+// Draws `samples` derivations from `hg` (after PushWeightsToGoal), counts how
+// often each distinct translation string occurs, and prints to stdout:
+//   k != 0: up to k lines "count ||| translation", most frequent first
+//   k == 0: only the most frequent translation
+// Nothing is printed to stdout when no sample was drawn.
+void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) {
+  unordered_map<string, int, boost::hash<string> > m;
+  hg->PushWeightsToGoal();
+  const int num_nodes = hg->nodes_.size();
+  // one sampling distribution over incoming edges per node
+  vector<SampleSet> ss(num_nodes);
+  for (int i = 0; i < num_nodes; ++i) {
+    SampleSet& s = ss[i];
+    const vector<int>& in_edges = hg->nodes_[i].in_edges_;
+    for (int j = 0; j < in_edges.size(); ++j) {
+      s.add(hg->edges_[in_edges[j]].edge_prob_);
+    }
+  }
+  for (int i = 0; i < samples; ++i) {
+    vector<WordID> yield;
+    SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield);
+    const string trans = TD::GetString(yield);
+    ++m[trans];
+  }
+  vector<pair<int, string> > dist;
+  for (unordered_map<string, int, boost::hash<string> >::iterator i = m.begin();
+       i != m.end(); ++i) {
+    dist.push_back(make_pair(i->second, i->first));
+  }
+  sort(dist.begin(), dist.end(), SampleSort());
+  // With no samples there is no translation to report; indexing dist[0]
+  // would read past the end of an empty vector.
+  if (dist.empty()) {
+    cerr << "  No samples drawn; no max translation available.\n";
+    return;
+  }
+  if (k) {
+    // There may be fewer than k distinct translations, so stop at the end of
+    // dist instead of indexing past it.
+    const int available = static_cast<int>(dist.size());
+    for (int i = 0; i < k && i < available; ++i)
+      cout << dist[i].first << " ||| " << dist[i].second << endl;
+  } else {
+    cout << dist[0].second << endl;
+  }
+}
+
+// TODO decoder output should probably be moved to another file
+// Prints up to k derivations of the last node of `forest` to stdout, one per
+// line, as "sent_id ||| yield ||| feature values ||| log(score)". With
+// `unique` set, the KBest::FilterUnique filter is used. Output stops early
+// when LazyKthBest returns no further derivation.
+void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) {
+  cerr << "In kbest\n";
+  if (!unique) {
+    typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> AllDerivations;
+    AllDerivations kbest(forest, k);
+    for (int rank = 0; rank < k; ++rank) {
+      const AllDerivations::Derivation* d =
+        kbest.LazyKthBest(forest.nodes_.size() - 1, rank);
+      if (!d) break;
+      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
+           << d->feature_values << " ||| " << log(d->score) << endl;
+    }
+  } else {
+    typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique> UniqueDerivations;
+    UniqueDerivations kbest(forest, k);
+    for (int rank = 0; rank < k; ++rank) {
+      const UniqueDerivations::Derivation* d =
+        kbest.LazyKthBest(forest.nodes_.size() - 1, rank);
+      if (!d) break;
+      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
+           << d->feature_values << " ||| " << log(d->score) << endl;
+    }
+  }
+}
+
+// Edge weight function: the rule's ELength() minus its Arity().
+struct ELengthWeightFunction {
+  double operator()(const Hypergraph::Edge& edge) const {
+    return edge.rule_->ELength() - edge.rule_->Arity();
+  }
+};
+
+
+// Hashes a rule pointer by the address it holds, so two pointers hash alike
+// exactly when they refer to the same rule object.
+struct TRPHash {
+  size_t operator()(const TRulePtr& rule) const {
+    return reinterpret_cast<size_t>(rule.get());
+  }
+};
+// Writes the rule of every edge in `hg` to `*os`, one per line, skipping any
+// rule object that has already been written. The set of written rules is
+// static, so it persists across calls.
+static void ExtractRulesDedupe(const Hypergraph& hg, ostream* os) {
+  static unordered_set<TRulePtr, TRPHash> written;
+  for (int e = 0; e < hg.edges_.size(); ++e) {
+    const TRulePtr& rule = hg.edges_[e].rule_;
+    const bool first_time = written.insert(rule).second;
+    if (!first_time) continue;
+    (*os) << rule->AsString() << endl;
+  }
+}
+
+void register_feature_functions();
+
+int main(int argc, char** argv) {
+ global_ff_registry.reset(new FFRegistry);
+ register_feature_functions();
+ ShowBanner();
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const bool write_gradient = conf.count("cll_gradient");
+ const bool feature_expectations = conf.count("feature_expectations");
+ if (write_gradient && feature_expectations) {
+ cerr << "You can only specify --gradient or --feature_expectations, not both!\n";
+ exit(1);
+ }
+ const bool output_training_vector = (write_gradient || feature_expectations);
+
+ boost::shared_ptr<Translator> translator;
+ const string formalism = LowercaseString(conf["formalism"].as<string>());
+ const bool csplit_preserve_full_word = conf.count("csplit_preserve_full_word");
+ if (csplit_preserve_full_word &&
+ (formalism != "csplit" || !conf.count("beam_prune"))) {
+ cerr << "--csplit_preserve_full_word should only be "
+ << "used with csplit AND --beam_prune!\n";
+ exit(1);
+ }
+ const bool csplit_output_plf = conf.count("csplit_output_plf");
+ if (csplit_output_plf && formalism != "csplit") {
+ cerr << "--csplit_output_plf should only be used with csplit!\n";
+ exit(1);
+ }
+
+ // load feature weights (and possibly freeze feature set)
+ vector<double> feature_weights;
+ Weights w;
+ if (conf.count("weights")) {
+ w.InitFromFile(conf["weights"].as<string>());
+ feature_weights.resize(FD::NumFeats());
+ w.InitVector(&feature_weights);
+ if (!conf.count("no_freeze_feature_set")) {
+ cerr << "Freezing feature set (use --no_freeze_feature_set to change)." << endl;
+ FD::Freeze();
+ }
+ }
+
+ // set up translation back end
+ if (formalism == "scfg")
+ translator.reset(new SCFGTranslator(conf));
+ else if (formalism == "fst")
+ translator.reset(new FSTTranslator(conf));
+ else if (formalism == "pb")
+ translator.reset(new PhraseBasedTranslator(conf));
+ else if (formalism == "csplit")
+ translator.reset(new CompoundSplit(conf));
+ else if (formalism == "lextrans")
+ translator.reset(new LexicalTrans(conf));
+ else if (formalism == "lexalign")
+ translator.reset(new LexicalAlign(conf));
+ else if (formalism == "tagger")
+ translator.reset(new Tagger(conf));
+ else
+ assert(!"error");
+
+ // set up additional scoring features
+ vector<shared_ptr<FeatureFunction> > pffs;
+ vector<const FeatureFunction*> late_ffs;
+ if (conf.count("feature_function") > 0) {
+ const vector<string>& add_ffs = conf["feature_function"].as<vector<string> >();
+ for (int i = 0; i < add_ffs.size(); ++i) {
+ string ff, param;
+ SplitCommandAndParam(add_ffs[i], &ff, &param);
+ cerr << "Feature: " << ff;
+ if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n";
+ else cerr << " (no config parameters)\n";
+ shared_ptr<FeatureFunction> pff = global_ff_registry->Create(ff, param);
+ if (!pff) { exit(1); }
+ // TODO check that multiple features aren't trying to set the same fid
+ pffs.push_back(pff);
+ late_ffs.push_back(pff.get());
+ }
+ }
+ ModelSet late_models(feature_weights, late_ffs);
+ int palg = 1;
+ if (LowercaseString(conf["intersection_strategy"].as<string>()) == "full") {
+ palg = 0;
+ cerr << "Using full intersection (no pruning).\n";
+ }
+ const IntersectionConfiguration inter_conf(palg, conf["cubepruning_pop_limit"].as<int>());
+
+ const int sample_max_trans = conf.count("max_translation_sample") ?
+ conf["max_translation_sample"].as<int>() : 0;
+ if (sample_max_trans)
+ rng.reset(new RandomNumberGenerator<boost::mt19937>);
+ const bool aligner_mode = conf.count("aligner");
+ const bool minimal_forests = conf.count("minimal_forests");
+ const bool graphviz = conf.count("graphviz");
+ const bool encode_b64 = conf["vector_format"].as<string>() == "b64";
+ const bool kbest = conf.count("k_best");
+ const bool unique_kbest = conf.count("unique_k_best");
+ const bool crf_uniform_empirical = conf.count("crf_uniform_empirical");
+ shared_ptr<WriteFile> extract_file;
+ if (conf.count("extract_rules"))
+ extract_file.reset(new WriteFile(conf["extract_rules"].as<string>()));
+
+ int combine_size = conf["combine_size"].as<int>();
+ if (combine_size < 1) combine_size = 1;
+ const string input = conf["input"].as<string>();
+ cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl;
+ ReadFile in_read(input);
+ istream *in = in_read.stream();
+ assert(*in);
+
+ SparseVector<prob_t> acc_vec; // accumulate gradient
+ double acc_obj = 0; // accumulate objective
+ int g_count = 0; // number of gradient pieces computed
+ int sent_id = -1; // line counter
+
+ while(*in) {
+ NgramCache::Clear(); // clear ngram cache for remote LM (if used)
+ Timer::Summarize();
+ ++sent_id;
+ string buf;
+ getline(*in, buf);
+ if (buf.empty()) continue;
+ map<string, string> sgml;
+ ProcessAndStripSGML(&buf, &sgml);
+ if (sgml.find("id") != sgml.end())
+ sent_id = atoi(sgml["id"].c_str());
+
+ cerr << "\nINPUT: ";
+ if (buf.size() < 100)
+ cerr << buf << endl;
+ else {
+ size_t x = buf.rfind(" ", 100);
+ if (x == string::npos) x = 100;
+ cerr << buf.substr(0, x) << " ..." << endl;
+ }
+ cerr << " id = " << sent_id << endl;
+ string to_translate;
+ Lattice ref;
+ ParseTranslatorInputLattice(buf, &to_translate, &ref);
+ const bool has_ref = ref.size() > 0;
+ SentenceMetadata smeta(sent_id, ref);
+ const bool hadoop_counters = (write_gradient);
+ Hypergraph forest; // -LM forest
+ translator->ProcessMarkupHints(sgml);
+ Timer t("Translation");
+ const bool translation_successful =
+ translator->Translate(to_translate, &smeta, feature_weights, &forest);
+ translator->SentenceComplete();
+ if (!translation_successful) {
+ cerr << " NO PARSE FOUND.\n";
+ if (hadoop_counters)
+ cerr << "reporter:counter:UserCounters,FParseFailed,1" << endl;
+ cout << endl << flush;
+ continue;
+ }
+ cerr << " -LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
+ cerr << " -LM forest (paths): " << forest.NumberOfPaths() << endl;
+ if (conf.count("show_expected_length")) {
+ const PRPair<double, double> res =
+ Inside<PRPair<double, double>,
+ PRWeightFunction<double, EdgeProb, double, ELengthWeightFunction> >(forest);
+ cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl;
+ }
+ if (conf.count("show_partition")) {
+ const prob_t z = Inside<prob_t, EdgeProb>(forest);
+ cerr << " -LM partition log(Z): " << log(z) << endl;
+ }
+ if (extract_file)
+ ExtractRulesDedupe(forest, extract_file->stream());
+ vector<WordID> trans;
+ const prob_t vs = ViterbiESentence(forest, &trans);
+ cerr << " -LM Viterbi: " << TD::GetString(trans) << endl;
+ if (conf.count("show_tree_structure"))
+ cerr << " -LM tree: " << ViterbiETree(forest) << endl;;
+ cerr << " -LM Viterbi: " << log(vs) << endl;
+
+ bool has_late_models = !late_models.empty();
+ if (has_late_models) {
+ forest.Reweight(feature_weights);
+ forest.SortInEdgesByEdgeWeights();
+ Hypergraph lm_forest;
+ ApplyModelSet(forest,
+ smeta,
+ late_models,
+ inter_conf,
+ &lm_forest);
+ forest.swap(lm_forest);
+ forest.Reweight(feature_weights);
+ trans.clear();
+ ViterbiESentence(forest, &trans);
+ cerr << " +LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
+ cerr << " +LM forest (paths): " << forest.NumberOfPaths() << endl;
+ cerr << " +LM Viterbi: " << TD::GetString(trans) << endl;
+ }
+ if (conf.count("beam_prune")) {
+ vector<bool> preserve_mask(forest.edges_.size(), false);
+ if (csplit_preserve_full_word)
+ preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true;
+ forest.BeamPruneInsideOutside(1.0, false, conf["beam_prune"].as<double>(), &preserve_mask);
+ cerr << " Pruned forest (paths): " << forest.NumberOfPaths() << endl;
+ }
+
+ if (conf.count("forest_output") && !has_ref) {
+ ForestWriter writer(conf["forest_output"].as<string>(), sent_id);
+ if (FileExists(writer.fname_)) {
+ cerr << " Unioning...\n";
+ Hypergraph new_hg;
+ {
+ ReadFile rf(writer.fname_);
+ bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg);
+ assert(succeeded);
+ }
+ new_hg.Union(forest);
+ bool succeeded = writer.Write(new_hg, minimal_forests);
+ assert(succeeded);
+ } else {
+ bool succeeded = writer.Write(forest, minimal_forests);
+ assert(succeeded);
+ }
+ }
+
+ if (sample_max_trans) {
+ MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as<int>() : 0);
+ } else {
+ if (kbest) {
+ DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest);
+ } else if (csplit_output_plf) {
+ cout << HypergraphIO::AsPLF(forest, false) << endl;
+ } else {
+ if (!graphviz && !has_ref) {
+ cout << TD::GetString(trans) << endl << flush;
+ }
+ }
+ }
+
+ const int max_trans_beam_size = conf.count("max_translation_beam") ?
+ conf["max_translation_beam"].as<int>() : 0;
+ if (max_trans_beam_size) {
+ Hack::MaxTrans(forest, max_trans_beam_size);
+ continue;
+ }
+
+ if (graphviz && !has_ref) forest.PrintGraphviz();
+
+ // the following are only used if write_gradient is true!
+ SparseVector<prob_t> full_exp, ref_exp, gradient;
+ double log_z = 0, log_ref_z = 0;
+ if (write_gradient) {
+ const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &full_exp);
+ log_z = log(z);
+ full_exp /= z;
+ }
+ if (conf.count("show_cfg_search_space"))
+ HypergraphIO::WriteAsCFG(forest);
+ if (has_ref) {
+ if (HG::Intersect(ref, &forest)) {
+ cerr << " Constr. forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
+ cerr << " Constr. forest (paths): " << forest.NumberOfPaths() << endl;
+ if (crf_uniform_empirical) {
+ cerr << " USING UNIFORM WEIGHTS\n";
+ for (int i = 0; i < forest.edges_.size(); ++i)
+ forest.edges_[i].edge_prob_=prob_t::One();
+ } else {
+ forest.Reweight(feature_weights);
+ cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl;
+ }
+ if (hadoop_counters)
+ cerr << "reporter:counter:UserCounters,SentencePairsParsed,1" << endl;
+ if (conf.count("show_partition")) {
+ const prob_t z = Inside<prob_t, EdgeProb>(forest);
+ cerr << " Contst. partition log(Z): " << log(z) << endl;
+ }
+ //DumpKBest(sent_id, forest, 1000);
+ if (conf.count("forest_output")) {
+ ForestWriter writer(conf["forest_output"].as<string>(), sent_id);
+ if (FileExists(writer.fname_)) {
+ cerr << " Unioning...\n";
+ Hypergraph new_hg;
+ {
+ ReadFile rf(writer.fname_);
+ bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg);
+ assert(succeeded);
+ }
+ new_hg.Union(forest);
+ bool succeeded = writer.Write(new_hg, minimal_forests);
+ assert(succeeded);
+ } else {
+ bool succeeded = writer.Write(forest, minimal_forests);
+ assert(succeeded);
+ }
+ }
+ if (aligner_mode && !output_training_vector)
+ AlignerTools::WriteAlignment(smeta.GetSourceLattice(), smeta.GetReference(), forest, &cout);
+ if (write_gradient) {
+ const prob_t ref_z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp);
+ ref_exp /= ref_z;
+ if (crf_uniform_empirical) {
+ log_ref_z = ref_exp.dot(feature_weights);
+ } else {
+ log_ref_z = log(ref_z);
+ }
+ //cerr << " MODEL LOG Z: " << log_z << endl;
+ //cerr << " EMPIRICAL LOG Z: " << log_ref_z << endl;
+ if ((log_z - log_ref_z) < kMINUS_EPSILON) {
+ cerr << "DIFF. ERR! log_z < log_ref_z: " << log_z << " " << log_ref_z << endl;
+ exit(1);
+ }
+ assert(!isnan(log_ref_z));
+ ref_exp -= full_exp;
+ acc_vec += ref_exp;
+ acc_obj += (log_z - log_ref_z);
+ }
+ if (feature_expectations) {
+ const prob_t z =
+ InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp);
+ ref_exp /= z;
+ acc_obj += log(z);
+ acc_vec += ref_exp;
+ }
+
+ if (output_training_vector) {
+ acc_vec.clear_value(0);
+ ++g_count;
+ if (g_count % combine_size == 0) {
+ if (encode_b64) {
+ cout << "0\t";
+ SparseVector<double> dav; ConvertSV(acc_vec, &dav);
+ B64::Encode(acc_obj, dav, &cout);
+ cout << endl << flush;
+ } else {
+ cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush;
+ }
+ acc_vec.clear();
+ acc_obj = 0;
+ }
+ }
+ if (conf.count("graphviz")) forest.PrintGraphviz();
+ } else {
+ cerr << " REFERENCE UNREACHABLE.\n";
+ if (write_gradient) {
+ if (hadoop_counters)
+ cerr << "reporter:counter:UserCounters,EFParseFailed,1" << endl;
+ cout << endl << flush;
+ }
+ }
+ }
+ }
+ if (output_training_vector && !acc_vec.empty()) {
+ if (encode_b64) {
+ cout << "0\t";
+ SparseVector<double> dav; ConvertSV(acc_vec, &dav);
+ B64::Encode(acc_obj, dav, &cout);
+ cout << endl << flush;
+ } else {
+ cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush;
+ }
+ }
+}
+
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
new file mode 100644
index 00000000..d0b93795
--- /dev/null
+++ b/decoder/cdec_ff.cc
@@ -0,0 +1,32 @@
+#include <boost/shared_ptr.hpp>
+
+#include "ff.h"
+#include "ff_lm.h"
+#include "ff_csplit.h"
+#include "ff_wordalign.h"
+#include "ff_tagger.h"
+#include "ff_factory.h"
+
+boost::shared_ptr<FFRegistry> global_ff_registry;
+
+// Registers a factory for each built-in feature function with the global
+// registry, under the name given as the first argument.
+void register_feature_functions() {
+  FFRegistry& registry = *global_ff_registry;
+  registry.Register("LanguageModel", new FFFactory<LanguageModel>);
+#ifdef HAVE_RANDLM
+  registry.Register("RandLM", new FFFactory<LanguageModelRandLM>);
+#endif
+  registry.Register("WordPenalty", new FFFactory<WordPenalty>);
+  registry.Register("SourceWordPenalty", new FFFactory<SourceWordPenalty>);
+  registry.Register("ArityPenalty", new FFFactory<ArityPenalty>);
+  registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
+  registry.Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);
+  registry.Register("MarkovJump", new FFFactory<MarkovJump>);
+  registry.Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>);
+  registry.Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>);
+  registry.Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>);
+  registry.Register("AlignerResults", new FFFactory<AlignerResults>);
+  registry.Register("CSplit_BasicFeatures", new FFFactory<BasicCSplitFeatures>);
+  registry.Register("CSplit_ReverseCharLM", new FFFactory<ReverseCharLMCSplitFeature>);
+  registry.Register("Tagger_BigramIdentity", new FFFactory<Tagger_BigramIdentity>);
+  registry.Register("LexicalPairIdentity", new FFFactory<LexicalPairIdentity>);
+}
+
diff --git a/decoder/csplit.cc b/decoder/csplit.cc
new file mode 100644
index 00000000..b1a30fb0
--- /dev/null
+++ b/decoder/csplit.cc
@@ -0,0 +1,173 @@
+#include "csplit.h"
+
+#include <iostream>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "hg.h"
+#include "tdict.h"
+#include "grammar.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+/* Builds the hypergraph ("trellis") of candidate segmentations of one word.
+ * Nodes stand for boundaries between characters; each edge between two of
+ * them carries the characters of that span as a single token.
+ */ struct CompoundSplitImpl {
+ CompoundSplitImpl(const boost::program_options::variables_map& conf) : // conf is accepted but not read in this constructor
+ fugen_elements_(true), // TODO configure
+ min_size_(3), // fewest characters a segment may span
+ kXCAT(TD::Convert("X")*-1), // category given to every trellis node (negated word id)
+ kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")), // rule on the tail-less edge into the source node
+ kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")), // copied per segment; position 1 of f_/e_ is overwritten
+ kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")), // rule on the final edge into the goal node
+ kFUGEN_S(FD::Convert("FugS")), // feature id set when a trailing "es"/"s" is dropped
+ kFUGEN_N(FD::Convert("FugN")) {} // feature id set when the "n" of a trailing "en" is dropped
+
+ void PasteTogetherStrings(const vector<string>& chars, // concatenates chars[i..j) into *yield
+ const int i,
+ const int j,
+ string* yield) {
+ int size = 0;
+ for (int k=i; k<j; ++k) // first pass: total byte length, so yield is resized once
+ size += chars[k].size();
+ yield->resize(size);
+ int cur = 0;
+ for (int k=i; k<j; ++k) { // second pass: copy the bytes in order
+ const string& cs = chars[k];
+ for (int l = 0; l < cs.size(); ++l)
+ (*yield)[cur++] = cs[l];
+ }
+ }
+
+ void BuildTrellis(const vector<string>& chars, // adds the segmentation trellis for chars to *forest
+ Hypergraph* forest) {
+ vector<int> nodes(chars.size()+1, -1); // node id per character boundary; -1 means no node there
+ nodes[0] = forest->AddNode(kXCAT)->id_; // source
+ const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; // edge with an empty tail
+ forest->ConnectEdgeToHeadNode(left_rule, nodes[0]);
+
+ const int max_split_ = max(static_cast<int>(chars.size()) - min_size_ + 1, 1); // one past the last interior boundary that gets a node
+ cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl;
+ for (int i = min_size_; i < max_split_; ++i) // boundaries with at least min_size_ characters on each side
+ nodes[i] = forest->AddNode(kXCAT)->id_;
+ assert(nodes.back() == -1); // NOTE(review): fails when chars is empty, because nodes.back() is then the source slot
+ nodes.back() = forest->AddNode(kXCAT)->id_; // sink
+
+ for (int i = 0; i < max_split_; ++i) { // i: boundary where a segment starts
+ if (nodes[i] < 0) continue;
+ const int start = min(i + min_size_, static_cast<int>(chars.size())); // first end boundary considered for this start
+ for (int j = start; j <= chars.size(); ++j) { // j: boundary where the segment ends
+ if (nodes[j] < 0) continue;
+ string yield;
+ PasteTogetherStrings(chars, i, j, &yield);
+ // cerr << "[" << i << "," << j << "] " << yield << endl;
+ TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE));
+ rule->e_[1] = rule->f_[1] = TD::Convert(yield); // the segment is the token on both sides of the rule
+ // cerr << rule->AsString() << endl;
+ int edge = forest->AddEdge(
+ rule,
+ Hypergraph::TailNodeVector(1, nodes[i]))->id_;
+ forest->ConnectEdgeToHeadNode(edge, nodes[j]);
+ forest->edges_[edge].i_ = i; // record the character span on the edge
+ forest->edges_[edge].j_ = j;
+
+ // handle "fugenelemente" here
+ // don't delete "fugenelemente" at the end of words
+ if (fugen_elements_ && j != chars.size()) {
+ const int len = yield.size(); // length in bytes, not characters
+ string alt; // segment with the linking element removed; stays empty if none applies
+ int fid = 0;
+ if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { // trailing "es": drop both letters
+ alt = yield.substr(0, len - 2);
+ fid = kFUGEN_S;
+ } else if (len > (min_size_ + 1) && yield[len-1] == 's') { // trailing "s": drop it
+ alt = yield.substr(0, len - 1);
+ fid = kFUGEN_S;
+ } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { // trailing "en": drop only the "n"
+ alt = yield.substr(0, len - 1);
+ fid = kFUGEN_N;
+ }
+ if (alt.size()) {
+ TRulePtr altrule = TRulePtr(new TRule(*rule));
+ altrule->e_[1] = TD::Convert(alt); // only the e_ side changes; f_ keeps the full segment
+ // cerr << altrule->AsString() << endl;
+ int edge = forest->AddEdge( // shadows the outer `edge`; a second edge over the same span
+ altrule,
+ Hypergraph::TailNodeVector(1, nodes[i]))->id_;
+ forest->ConnectEdgeToHeadNode(edge, nodes[j]);
+ forest->edges_[edge].feature_values_.set_value(fid, 1.0);
+ forest->edges_[edge].i_ = i;
+ forest->edges_[edge].j_ = j;
+ }
+ }
+ }
+ }
+
+ // add goal rule
+ Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); // last index in nodes_; presumably the sink added above - TODO confirm AddNode appends
+ Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+ Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+ forest->ConnectEdgeToHeadNode(hg_edge, goal);
+ }
+ private:
+ const bool fugen_elements_; // whether alternative edges with linking elements removed are added
+ const int min_size_; // minimum segment length in characters
+ const WordID kXCAT;
+ const TRulePtr kWORDBREAK_RULE;
+ const TRulePtr kTEMPLATE_RULE;
+ const TRulePtr kGOAL_RULE;
+ const int kFUGEN_S;
+ const int kFUGEN_N;
+};
+
+CompoundSplit::CompoundSplit(const boost::program_options::variables_map& conf) : // forwards conf to the implementation object
+ pimpl_(new CompoundSplitImpl(conf)) {} // pimpl_ takes ownership of the new CompoundSplitImpl
+
+// Splits `in` into one string per character and stores them in *out
+// (replacing its previous contents). The byte length of each character is
+// whatever UTF8Len() reports for the character's first byte.
+//
+// A reported length below 1 is treated as malformed input and aborts the
+// program. The check is explicit rather than an assert() so that builds
+// with NDEBUG cannot spin forever on a byte that never advances the cursor.
+static void SplitUTF8String(const string& in, vector<string>* out) {
+  out->clear();
+  out->reserve(in.size());  // upper bound: one character per byte
+  size_t i = 0;
+  while (i < in.size()) {
+    const int len = UTF8Len(in[i]);
+    if (len < 1) {
+      cerr << " BAD INPUT: invalid UTF-8 byte at offset " << i << " in: " << in << endl;
+      abort();
+    }
+    // substr() clamps at the end of the string if the last character is cut short
+    out->push_back(in.substr(i, len));
+    i += len;
+  }
+}
+
+// Segments a single word: splits `input` into characters, appends one arc per
+// character to the source lattice in `smeta`, builds the segmentation trellis
+// in `forest` and reweights it with `weights`.
+//
+// Aborts if `input` contains a space. Returns false, leaving `smeta` and
+// `forest` untouched, if `input` contains no characters; returns true
+// otherwise.
+bool CompoundSplit::TranslateImpl(const string& input,
+                                  SentenceMetadata* smeta,
+                                  const vector<double>& weights,
+                                  Hypergraph* forest) {
+  if (input.find(" ") != string::npos) {
+    cerr << " BAD INPUT: " << input << "\n CompoundSplit expects single words\n";
+    abort();
+  }
+  vector<string> in;
+  SplitUTF8String(input, &in);
+  if (in.empty()) {
+    // BuildTrellis keeps one slot per character boundary; with no characters
+    // the source and the sink would share slot 0 and its assertion fails.
+    cerr << " BAD INPUT: CompoundSplit received an empty word\n";
+    return false;
+  }
+  smeta->SetSourceLength(in.size()); // TODO do utf8 or something
+  for (size_t i = 0; i < in.size(); ++i)
+    smeta->src_lattice_.push_back(vector<LatticeArc>(1, LatticeArc(TD::Convert(in[i]), 0.0, 1)));
+  pimpl_->BuildTrellis(in, forest);
+  forest->Reweight(weights);
+  return true;
+}
+
+// Returns the index of the edge leaving node 0 whose end position j_ is the
+// largest; ties keep the first such edge in out_edges_ order. The forest must
+// have at least one node, and node 0 at least one outgoing edge (asserted).
+int CompoundSplit::GetFullWordEdgeIndex(const Hypergraph& forest) {
+  assert(forest.nodes_.size() > 0);
+  // Bind by reference: the edge list is only read, so copying it is wasted work.
+  const vector<int>& out_edges = forest.nodes_[0].out_edges_;
+  int max_edge = -1;
+  int max_j = -1;
+  for (size_t i = 0; i < out_edges.size(); ++i) {
+    const int j = forest.edges_[out_edges[i]].j_;
+    if (j > max_j) {
+      max_j = j;
+      max_edge = out_edges[i];
+    }
+  }
+  assert(max_edge >= 0);
+  assert(static_cast<size_t>(max_edge) < forest.edges_.size());
+  return max_edge;
+}
+
diff --git a/decoder/csplit.h b/decoder/csplit.h
new file mode 100644
index 00000000..82ed23fc
--- /dev/null
+++ b/decoder/csplit.h
@@ -0,0 +1,30 @@
+#ifndef _CSPLIT_H_
+#define _CSPLIT_H_
+
+#include "translator.h"
+#include "lattice.h"
+
+// this "translator" takes single words (with NO SPACES) and segments
+// them using the approach described in:
+//
+// C. Dyer. (2009) Using a maximum entropy model to build segmentation
+// lattices for MT. In Proceedings of NAACL HLT 2009.
+// note, an extra word space marker # is inserted at the left edge of
+// the forest!
+struct CompoundSplitImpl;
+struct CompoundSplit : public Translator { // the word-segmenting "translator" described in the comment above
+ CompoundSplit(const boost::program_options::variables_map& conf); // conf is handed to the implementation object
+ bool TranslateImpl(const std::string& input, // input: one word, no spaces
+ SentenceMetadata* smeta, // receives the source length and per-character lattice
+ const std::vector<double>& weights, // used to reweight the finished forest
+ Hypergraph* forest); // output: the segmentation forest
+
+ // given a forest generated by CompoundSplit::Translate,
+ // find the edge representing the unsegmented form
+ static int GetFullWordEdgeIndex(const Hypergraph& forest);
+
+ private:
+ boost::shared_ptr<CompoundSplitImpl> pimpl_; // implementation is defined in csplit.cc
+};
+
+#endif
diff --git a/decoder/dict.h b/decoder/dict.h
new file mode 100644
index 00000000..72e82e6d
--- /dev/null
+++ b/decoder/dict.h
@@ -0,0 +1,43 @@
+#ifndef DICT_H_
+#define DICT_H_
+
+#include <cassert>
+#include <cstring>
+#include <tr1/unordered_map>
+#include <string>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+
+#include "wordid.h"
+
+// Two-way mapping between strings and positive integer ids. Ids are handed
+// out in order of first appearance, starting at 1; id 0 is reserved and
+// converts to the placeholder string "<bad0>".
+class Dict {
+  typedef std::tr1::unordered_map<std::string, WordID, boost::hash<std::string> > Map;
+ public:
+  Dict() : b0_("<bad0>") { words_.reserve(1000); }
+
+  // Number of words stored, which is also the largest id handed out so far.
+  inline int max() const { return words_.size(); }
+
+  // Returns the id of `word`. An unknown word is added and given the next id,
+  // unless `frozen` is true, in which case 0 is returned and nothing is added.
+  inline WordID Convert(const std::string& word, bool frozen = false) {
+    Map::iterator hit = d_.find(word);
+    if (hit != d_.end())
+      return hit->second;
+    if (frozen)
+      return 0;
+    words_.push_back(word);
+    const WordID fresh_id = words_.size();
+    d_[word] = fresh_id;
+    return fresh_id;
+  }
+
+  // Returns the string stored for `id`; id 0 yields the placeholder.
+  inline const std::string& Convert(const WordID& id) const {
+    if (id != 0) {
+      assert(id <= words_.size());
+      return words_[id-1];
+    }
+    return b0_;
+  }
+
+  // Forgets every word; ids start again from 1 afterwards.
+  void clear() {
+    words_.clear();
+    d_.clear();
+  }
+ private:
+  const std::string b0_;            // string returned for id 0
+  std::vector<std::string> words_;  // words_[id-1] is the string for id
+  Map d_;                           // string -> id
+};
+
+#endif
diff --git a/decoder/dict_test.cc b/decoder/dict_test.cc
new file mode 100644
index 00000000..694877fa
--- /dev/null
+++ b/decoder/dict_test.cc
@@ -0,0 +1,50 @@
+#include "dict.h"
+
+#include "fdict.h"
+
+#include <iostream>
+#include <gtest/gtest.h>
+#include <cassert>
+#include "filelib.h"
+
+#include "tdict.h"
+
+using namespace std;
+
+class DTest : public testing::Test { // fixture used by the TEST_F cases in this file; it holds no state
+ public:
+ DTest() {}
+ protected:
+ virtual void SetUp() { } // nothing to prepare
+ virtual void TearDown() { } // nothing to release
+};
+
+// Distinct words get distinct ids, a repeated word gets its earlier id back,
+// and ids convert back to the words they came from.
+TEST_F(DTest, Convert) {
+  Dict dict;
+  const WordID foo_id = dict.Convert("foo");
+  const WordID bar_id = dict.Convert("bar");
+  const std::string foo_again = "foo";
+  const WordID foo_id_again = dict.Convert(foo_again);
+  EXPECT_NE(foo_id, bar_id);
+  EXPECT_EQ(foo_id, foo_id_again);
+  EXPECT_EQ(dict.Convert(foo_id), "foo");
+  EXPECT_EQ(dict.Convert(bar_id), "bar");
+}
+
+// Feature names map to positive ids and back, and FD::Escape changes both
+// "=" and ";".
+TEST_F(DTest, FDictTest) {
+  const int first_id = FD::Convert("First");
+  EXPECT_GT(first_id, 0);
+  EXPECT_EQ(FD::Convert(first_id), "First");
+
+  const string escaped_equals = FD::Escape("=");
+  cerr << escaped_equals << endl;
+  EXPECT_NE(escaped_equals, "=");
+
+  const string escaped_semicolon = FD::Escape(";");
+  cerr << escaped_semicolon << endl;
+  EXPECT_NE(escaped_semicolon, ";");
+}
+
+int main(int argc, char** argv) { // entry point of the test binary
+ testing::InitGoogleTest(&argc, argv); // hands the command line to gtest
+ return RUN_ALL_TESTS(); // the exit status is whatever RUN_ALL_TESTS() yields
+}
+
diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc
new file mode 100644
index 00000000..f6a01e52
--- /dev/null
+++ b/decoder/earley_composer.cc
@@ -0,0 +1,726 @@
+#include "earley_composer.h"
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <queue>
+#include <tr1/unordered_set>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "phrasetable_fst.h"
+#include "sparse_vector.h"
+#include "tdict.h"
+#include "hg.h"
+
+using boost::shared_ptr;
+namespace po = boost::program_options;
+using namespace std;
+using namespace std::tr1;
+
+// Define the following macro if you want to see lots of debugging output
+// when you run the chart parser
+#undef DEBUG_CHART_PARSER
+
+// A few constants used by the chart parser ///////////////
+static const int kMAX_NODES = 2000000;
+static const string kPHRASE_STRING = "X";
+static bool constants_need_init = true;
+static WordID kUNIQUE_START;
+static WordID kPHRASE;
+static TRulePtr kX1X2;
+static TRulePtr kX1;
+static WordID kEPS;
+static TRulePtr kEPSRule;
+
+// Sets up the file-level parser constants on the first call; every later call
+// returns immediately. The assignments are kept in their original order.
+static void InitializeConstants() {
+  if (!constants_need_init)
+    return;
+  kPHRASE = TD::Convert(kPHRASE_STRING) * -1;
+  kUNIQUE_START = TD::Convert("S") * -1;
+  kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]"));
+  kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]"));
+  kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>"));
+  kEPS = TD::Convert("<eps>");
+  constants_need_init = false;
+}
+////////////////////////////////////////////////////////////
+
+// One position ("dot") inside the grammar's rules. Outgoing arcs are kept in
+// two maps: tptr for symbols >= 0 and ntptr for symbols < 0.
+class EGrammarNode {
+  friend bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest);
+  friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g);
+ public:
+#ifdef DEBUG_CHART_PARSER
+  string hint;
+#endif
+  EGrammarNode() : is_some_rule_complete(false), is_root(false) {}
+
+  const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; }
+  const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; }
+  bool HasNonTerminals() const { return !ntptr.empty(); }
+  bool HasTerminals() const { return !tptr.empty(); }
+
+  // True if some rule ends here: either flagged explicitly, or there is
+  // nothing left to extend with.
+  bool RuleCompletes() const {
+    return is_some_rule_complete || !GrammarContinues();
+  }
+
+  // True if at least one symbol can still follow this position.
+  bool GrammarContinues() const {
+    return HasNonTerminals() || HasTerminals();
+  }
+
+  bool IsRoot() const { return is_root; }
+
+  // these are the features associated with the rule from the start
+  // node up to this point. If you use these features, you must
+  // not Extend() this rule.
+  const SparseVector<double>& GetCFGProductionFeatures() const {
+    return input_features;
+  }
+
+  // Follows the arc labelled `t` and returns the node it leads to, or NULL
+  // if there is no such arc. Negative labels are looked up among the
+  // non-terminals, all others among the terminals.
+  const EGrammarNode* Extend(const WordID& t) const {
+    const map<WordID, EGrammarNode>& arcs = (t < 0) ? ntptr : tptr;
+    const map<WordID, EGrammarNode>::const_iterator next = arcs.find(t);
+    if (next != arcs.end())
+      return &next->second;
+    return NULL;
+  }
+
+ private:
+  map<WordID, EGrammarNode> tptr;
+  map<WordID, EGrammarNode> ntptr;
+  SparseVector<double> input_features;
+  bool is_some_rule_complete;
+  bool is_root;
+};
+typedef map<WordID, EGrammarNode> EGrammar; // indexed by the rule LHS
+
+// edges are immutable once created; each one is a chart item for category `cat` between FST states q and r, with `dot` as the grammar position
+struct Edge {
+#ifdef DEBUG_CHART_PARSER
+ static int id_count; // running counter used to number edges in debug builds
+ const int id; // debug-only identifier; the query constructors set it to 0
+#endif
+ const WordID cat; // lhs side of rule proved/being proved
+ const EGrammarNode* const dot; // dot position
+ const FSTNode* const q; // start of span
+ const FSTNode* const r; // end of span
+ const Edge* const active_parent; // back pointer, NULL for PREDICT items
+ const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items
+ const TargetPhraseSet* const tps; // translations
+ shared_ptr<SparseVector<double> > features; // features from CFG rule
+
+ bool IsPassive() const {
+ // when a rule is completed, this value will be set
+ return static_cast<bool>(features);
+ }
+ bool IsActive() const { return !IsPassive(); }
+ bool IsInitial() const { // true when the edge has neither parent
+ return !(active_parent || passive_parent);
+ }
+ bool IsCreatedByScan() const { // one parent (the active one) and the dot is not at a root node
+ return active_parent && !passive_parent && !dot->IsRoot();
+ }
+ bool IsCreatedByPredict() const { // decided solely by the dot being at a root node
+ return dot->IsRoot();
+ }
+ bool IsCreatedByComplete() const { // both parents are present
+ return active_parent && passive_parent;
+ }
+
+ // constructor for PREDICT
+ Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r) : // empty span (q == r), no parents
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {}
+ Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r, const Edge* act_parent) : // empty span, remembers the predicting edge
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps(NULL) {}
+
+ // constructors for SCAN
+ Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, // no features: the new edge is active
+ const Edge* act_par, const TargetPhraseSet* translations) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {}
+
+ Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, // with features: the new edge is passive
+ const Edge* act_par, const TargetPhraseSet* translations,
+ const SparseVector<double>& feats) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations),
+ features(new SparseVector<double>(feats)) {} // feats is copied
+
+ // constructors for COMPLETE
+ Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, // no features: the new edge is active
+ const Edge* act_par, const Edge *pas_par) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL) {
+ assert(pas_par->IsPassive());
+ assert(act_par->IsActive());
+ }
+
+ Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, // with features: the new edge is passive
+ const Edge* act_par, const Edge *pas_par, const SparseVector<double>& feats) :
+#ifdef DEBUG_CHART_PARSER
+ id(++id_count),
+#endif
+ cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL),
+ features(new SparseVector<double>(feats)) { // feats is copied
+ assert(pas_par->IsPassive());
+ assert(act_par->IsActive());
+ }
+
+ // constructor for COMPLETE query
+ Edge(const FSTNode* _r) : // lookup key only: just r is set
+#ifdef DEBUG_CHART_PARSER
+ id(0),
+#endif
+ cat(0), dot(NULL), q(NULL),
+ r(_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {}
+ // constructor for MERGE query
+ Edge(const FSTNode* _q, int) : // lookup key only: just q is set; the unnamed int merely selects this overload
+#ifdef DEBUG_CHART_PARSER
+ id(0),
+#endif
+ cat(0), dot(NULL), q(_q),
+ r(NULL), active_parent(NULL), passive_parent(NULL), tps(NULL) {}
+};
+#ifdef DEBUG_CHART_PARSER
+int Edge::id_count = 0;
+#endif
+
+// Writes a bracketed one-line description of an edge: an identifier (the id
+// in debug builds, the address otherwise), its span, category, dot,
+// active/passive state, how it was created and, if present, its translations.
+ostream& operator<<(ostream& os, const Edge& e) {
+  const char* how_created = "PREDICT";
+  if (e.IsCreatedByScan()) {
+    how_created = "SCAN";
+  } else if (e.IsCreatedByComplete()) {
+    how_created = "COMPLETE";
+  }
+
+  os << "[";
+#ifdef DEBUG_CHART_PARSER
+  os << '(' << e.id << ") ";
+#else
+  os << '(' << &e << ") ";
+#endif
+  os << "q=" << e.q << ", r=" << e.r;
+  os << ", cat=" << TD::Convert(e.cat*-1) << ", dot=" << e.dot;
+#ifdef DEBUG_CHART_PARSER
+  os << e.dot->hint;
+#endif
+  os << (e.IsActive() ? ", Active" : ", Passive");
+  os << ", " << how_created;
+#ifdef DEBUG_CHART_PARSER
+  if (e.active_parent)
+    os << ", act.parent=(" << e.active_parent->id << ')';
+  if (e.passive_parent)
+    os << ", psv.parent=(" << e.passive_parent->id << ')';
+#endif
+  if (e.tps)
+    os << ", tps=" << e.tps;
+  os << ']';
+  return os;
+}
+
+// Records one way in which an edge was built from an active and a passive
+// edge.
+struct Traversal {
+  const Edge* const edge;     // result from the active / passive combination
+  const Edge* const active;   // the active edge that was extended
+  const Edge* const passive;  // the passive edge it was extended with
+
+  Traversal(const Edge* result, const Edge* act, const Edge* pas)
+      : edge(result), active(act), passive(pas) {}
+};
+
+// Hashes a traversal on its two parent pointers plus the active/passive
+// state of the resulting edge.
+struct UniqueTraversalHash {
+  size_t operator()(const Traversal* t) const {
+    const size_t active_bits = reinterpret_cast<size_t>(t->active);
+    const size_t passive_bits = reinterpret_cast<size_t>(t->passive);
+    const size_t result_is_active = t->edge->IsActive();
+    size_t h = 5381;
+    h = ((h << 5) + h) ^ active_bits;
+    h = ((h << 5) + h) ^ passive_bits;
+    h = ((h << 5) + h) ^ result_is_active;
+    return h;
+  }
+};
+
+// Equality predicate matching UniqueTraversalHash: two traversals are the
+// same if they join the same active and passive edges and their resulting
+// edges agree on being active or passive.
+// Returns bool, like the other *Equals functors in this file (it used to
+// return size_t).
+struct UniqueTraversalEquals {
+  bool operator()(const Traversal* a, const Traversal* b) const {
+    return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive());
+  }
+};
+
+// Hash used to detect duplicate edges. Active edges are hashed on
+// (dot, q, r, cat) plus a constant offset; passive edges on (q, r, cat)
+// only, since the dot is irrelevant for them.
+struct UniqueEdgeHash {
+  size_t operator()(const Edge* e) const {
+    const bool active = e->IsActive();
+    size_t h = 5381;
+    if (active)
+      h = ((h << 5) + h) ^ reinterpret_cast<size_t>(e->dot);
+    h = ((h << 5) + h) ^ reinterpret_cast<size_t>(e->q);
+    h = ((h << 5) + h) ^ reinterpret_cast<size_t>(e->r);
+    h = ((h << 5) + h) ^ static_cast<size_t>(e->cat);
+    if (active)
+      h += 13;
+    return h;
+  }
+};
+
+// Equality predicate matching UniqueEdgeHash: edges must agree on being
+// active or passive, on category and on span; active edges must also share
+// the same dot.
+struct UniqueEdgeEquals {
+  bool operator()(const Edge* a, const Edge* b) const {
+    const bool active = a->IsActive();
+    if (active != b->IsActive())
+      return false;
+    const bool same_cat_and_span = (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r);
+    if (!active)
+      return same_cat_and_span;
+    return same_cat_and_span && (a->dot == b->dot);
+  }
+};
+
+// Hashes an edge on the end of its span (r) only.
+struct REdgeHash {
+  size_t operator()(const Edge* e) const {
+    const size_t seed = 5381;
+    return ((seed << 5) + seed) ^ reinterpret_cast<size_t>(e->r);
+  }
+};
+
+// Two edges compare equal when their spans end at the same FST node.
+struct REdgeEquals {
+  bool operator()(const Edge* a, const Edge* b) const {
+    const bool same_end = (a->r == b->r);
+    return same_end;
+  }
+};
+
+// Hashes an edge on the start of its span (q) only.
+struct QEdgeHash {
+  size_t operator()(const Edge* e) const {
+    const size_t seed = 5381;
+    return ((seed << 5) + seed) ^ reinterpret_cast<size_t>(e->q);
+  }
+};
+
+// Two edges compare equal when their spans start at the same FST node.
+struct QEdgeEquals {
+  bool operator()(const Edge* a, const Edge* b) const {
+    const bool same_start = (a->q == b->q);
+    return same_start;
+  }
+};
+
+// First-in, first-out list of edges waiting to be processed.
+struct EdgeQueue {
+  queue<const Edge*> q;
+
+  EdgeQueue() {}
+
+  // Drops every pending edge (the edges themselves are not deleted).
+  void clear() {
+    while (HasWork())
+      q.pop();
+  }
+
+  bool HasWork() const { return !q.empty(); }
+
+  // Removes and returns the oldest pending edge; only call when HasWork().
+  const Edge* Next() {
+    const Edge* const oldest = q.front();
+    q.pop();
+    return oldest;
+  }
+
+  void AddEdge(const Edge* s) { q.push(s); }
+};
+
+class EarleyComposerImpl {
+ public:
+ EarleyComposerImpl(WordID start_cat, const FSTNode& q_0) : start_cat_(start_cat), q_0_(&q_0) {} // keeps the start category and the address of q_0, so q_0 has to outlive this object
+
+ // Runs the chart parser over grammar g, starting from start_cat_ at q_0_,
+ // and writes the result into *forest.
+ // returns false if the intersection is empty (goal_node is still NULL
+ // when both agendas run dry). Every edge and index is released before
+ // returning.
+ bool Compose(const EGrammar& g, Hypergraph* forest) {
+   goal_node = NULL;
+   EGrammar::const_iterator sit = g.find(start_cat_);
+   forest->ReserveNodes(kMAX_NODES);
+   assert(sit != g.end());
+   Edge* init = new Edge(start_cat_, &sit->second, q_0_);
+   // Seed the chart outside of assert(): the argument of assert() is not
+   // evaluated when NDEBUG is defined, which would leave both agendas empty
+   // and leak init.
+   const bool init_is_new = IncorporateNewEdge(init);
+   assert(init_is_new);
+   (void) init_is_new;  // unused when assert() compiles to nothing
+   while (exp_agenda.HasWork() || agenda.HasWork()) {
+     // exp_agenda is drained completely before the next agenda item is taken
+     while(exp_agenda.HasWork()) {
+       const Edge* edge = exp_agenda.Next();
+       FinishEdge(edge, forest);
+     }
+     if (agenda.HasWork()) {
+       const Edge* edge = agenda.Next();
+#ifdef DEBUG_CHART_PARSER
+       cerr << "processing (" << edge->id << ')' << endl;
+#endif
+       if (edge->IsActive()) {
+         if (edge->dot->HasTerminals())
+           DoScan(edge);
+         if (edge->dot->HasNonTerminals()) {
+           DoMergeWithPassives(edge);
+           DoPredict(edge, g);
+         }
+       } else {
+         DoComplete(edge);
+       }
+     }
+   }
+   if (goal_node) {
+     forest->PruneUnreachable(goal_node->id_);
+     forest->EpsilonRemove(kEPS);
+   }
+   FreeAll();
+   return goal_node != NULL;
+ }
+
+ // Deletes every edge and traversal owned by this object, then empties the
+ // agendas and all lookup structures.
+ void FreeAll() {
+   // owned heap objects
+   for (size_t k = 0; k < free_list_.size(); ++k)
+     delete free_list_[k];
+   free_list_.clear();
+   for (size_t k = 0; k < traversal_free_list_.size(); ++k)
+     delete traversal_free_list_[k];
+   traversal_free_list_.clear();
+
+   // containers that only held pointers to the objects deleted above
+   all_traversals.clear();
+   exp_agenda.clear();
+   agenda.clear();
+   tps2node.clear();
+   edge2node.clear();
+   all_edges.clear();
+   passive_edges.clear();
+   active_edges.clear();
+ }
+
+ ~EarleyComposerImpl() { // releases whatever edges and traversals are still held
+ FreeAll();
+ }
+
+ // returns the total number of edges created during composition, counted as the current size of free_list_ (which FreeAll() empties)
+ int EdgesCreated() const {
+ return free_list_.size();
+ }
+
+ private:
+ void DoScan(const Edge* edge) { // SCAN: advance edge over every grammar terminal that the FST can also read from edge->r
+ // here, we assume that the FST will potentially have many more outgoing
+ // edges than the grammar, which will be just a couple. If you want to
+ // efficiently handle the case where both are relatively large, this code
+ // will need to change how the intersection is done. The best general
+ // solution would probably be the Baeza-Yates double binary search.
+
+ const EGrammarNode* dot = edge->dot;
+ const FSTNode* r = edge->r;
+ const map<WordID, EGrammarNode>& terms = dot->GetTerminals();
+ for (map<WordID, EGrammarNode>::const_iterator git = terms.begin(); // iterate the grammar side, probe the FST side
+ git != terms.end(); ++git) {
+ const FSTNode* next_r = r->Extend(git->first);
+ if (!next_r) continue; // the FST cannot read this terminal from r
+ const EGrammarNode* next_dot = &git->second;
+ const bool grammar_continues = next_dot->GrammarContinues();
+ const bool rule_completes = next_dot->RuleCompletes();
+ assert(grammar_continues || rule_completes);
+ const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
+ // create up to 4 new edges!
+ if (next_r->HasOutgoingNonEpsilonEdges()) { // are there further symbols in the FST?
+ const TargetPhraseSet* translations = NULL; // mid-phrase: nothing to emit yet
+ if (rule_completes) // passive edge (it carries features)
+ IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations, input_features));
+ if (grammar_continues) // active edge (no features)
+ IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations));
+ }
+ if (next_r->HasData()) { // indicates a loop back to q_0 in the FST
+ const TargetPhraseSet* translations = next_r->GetTranslations();
+ if (rule_completes) // passive edge ending at q_0_
+ IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations, input_features));
+ if (grammar_continues) // active edge ending at q_0_
+ IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations));
+ }
+ }
+ }
+
+ // PREDICT: for every non-terminal that may follow the dot of `edge`, start
+ // a new empty-span edge at edge->r from the root of that non-terminal's
+ // rules. Non-terminals without any rules are reported and skipped.
+ void DoPredict(const Edge* edge, const EGrammar& g) {
+   typedef map<WordID, EGrammarNode>::const_iterator NonTermIt;
+   const map<WordID, EGrammarNode>& expected = edge->dot->GetNonTerminals();
+   for (NonTermIt nt = expected.begin(); nt != expected.end(); ++nt) {
+     const WordID lhs = nt->first;
+     const EGrammar::const_iterator rules = g.find(lhs);
+     if (rules == g.end()) {
+       cerr << "[ERROR] Can't find any grammar rules with a LHS of type "
+            << TD::Convert(-1*lhs) << '!' << endl;
+       continue;
+     }
+     assert(edge->IsActive());
+     const EGrammarNode* const rule_root = &rules->second;
+     IncorporateNewEdge(new Edge(lhs, rule_root, edge->r, edge));
+   }
+ }
+
+ // COMPLETE: `passive` has just been proved. Every active edge whose span
+ // ends where `passive` starts, and whose dot can advance over the proved
+ // category, yields up to two new edges reaching to the end of `passive`.
+ void DoComplete(const Edge* passive) {
+#ifdef DEBUG_CHART_PARSER
+   cerr << " complete: " << *passive << endl;
+#endif
+   typedef unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator ActiveIt;
+   const WordID proved_cat = passive->cat;
+   const FSTNode* const span_end = passive->r;
+   // lookup key: an edge whose r is the start of the passive edge
+   const Edge probe(passive->q);
+   const pair<ActiveIt, ActiveIt> waiting = active_edges.equal_range(&probe);
+   for (ActiveIt cur = waiting.first; cur != waiting.second; ++cur) {
+     const Edge* active = *cur;
+#ifdef DEBUG_CHART_PARSER
+     cerr << " pos: " << *active << endl;
+#endif
+     const EGrammarNode* advanced = active->dot->Extend(proved_cat);
+     if (advanced == NULL)
+       continue;
+     const SparseVector<double>& rule_feats = advanced->GetCFGProductionFeatures();
+     // add up to 2 rules
+     if (advanced->RuleCompletes()) {
+       IncorporateNewEdge(new Edge(active->cat, advanced, active->q, span_end, active, passive, rule_feats));
+     }
+     if (advanced->GrammarContinues()) {
+       IncorporateNewEdge(new Edge(active->cat, advanced, active->q, span_end, active, passive));
+     }
+   }
+ }
+
+ // Given an active edge whose dot can be followed by non-terminals, look up
+ // the already-known passive edges matching a probe built from active->r and
+ // advance the active edge over each passive edge's category where the
+ // grammar allows it. Each advance enqueues up to two new edges.
+ void DoMergeWithPassives(const Edge* active) {
+   assert(active->IsActive());
+   assert(active->dot->HasNonTerminals());
+#ifdef DEBUG_CHART_PARSER
+   cerr << " merge active with passives: ACT=" << *active << endl;
+#endif
+   typedef unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator PassiveIter;
+   const Edge probe(active->r, 1);
+   const pair<PassiveIter, PassiveIter> range = passive_edges.equal_range(&probe);
+   for (PassiveIter cand = range.first; cand != range.second; ++cand) {
+     const Edge* passive = *cand;
+     const EGrammarNode* next_dot = active->dot->Extend(passive->cat);
+     if (next_dot) {
+       const FSTNode* next_r = passive->r;
+       const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures();
+       // completed rule: the new edge carries the production's features
+       if (next_dot->RuleCompletes())
+         IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features));
+       // rule prefix that can still grow: no features attached yet
+       if (next_dot->GrammarContinues())
+         IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive));
+     }
+   }
+ }
+
+ // Takes ownership of `edge` (it is recorded in free_list_) and, unless the
+ // same (edge, active parent, passive parent) traversal has been seen
+ // before, puts it on exp_agenda.
+ // Returns false for a repeated traversal, true otherwise.
+ bool IncorporateNewEdge(Edge* edge) {
+   free_list_.push_back(edge);
+   const bool has_both_parents = edge->passive_parent && edge->active_parent;
+   if (has_both_parents) {
+     Traversal* trav = new Traversal(edge, edge->active_parent, edge->passive_parent);
+     // recorded before the lookup so the object is owned even when it is a duplicate
+     traversal_free_list_.push_back(trav);
+     if (all_traversals.find(trav) != all_traversals.end())
+       return false;
+     all_traversals.insert(trav);
+   }
+   exp_agenda.AddEdge(edge);
+   return true;
+ }
+
+ // Registers `edge` in the chart indexes and the agenda the first time it is
+ // seen, and always adds its derivation to the translation forest `hg`.
+ // Returns true when the edge was not already in all_edges.
+ bool FinishEdge(const Edge* edge, Hypergraph* hg) {
+   const bool is_new = (all_edges.find(edge) == all_edges.end());
+   if (!is_new) {
+#ifdef DEBUG_CHART_PARSER
+     cerr << *edge << " is NOT NEW.\n";
+#endif
+   } else {
+#ifdef DEBUG_CHART_PARSER
+     cerr << *edge << " is NEW\n";
+#endif
+     all_edges.insert(edge);
+     if (edge->IsPassive()) passive_edges.insert(edge);
+     if (edge->IsActive()) active_edges.insert(edge);
+     agenda.AddEdge(edge);
+   }
+   AddEdgeToTranslationForest(edge, hg);
+   return is_new;
+ }
+
+ // build the translation forest
+ // Adds one hyperedge to `hg` describing how `edge` was derived:
+ //  - a node holding the target phrase set (created once per edge->tps),
+ //  - a head node for `edge` (created once per distinct edge),
+ //  - a hyperedge whose tail depends on how the edge was created
+ //    (predict: no tail; scan: active parent [+ phrase node];
+ //    complete: active parent + passive parent).
+ // Also records goal_node when `edge` is a passive start-category edge that
+ // begins and ends in q_0_.
+ void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) {
+ assert(hg->nodes_.size() < kMAX_NODES);
+ Hypergraph::Node* tps = NULL;
+ // first add any target language rules
+ if (edge->tps) {
+ // nodes are memoized by the address of the target phrase set
+ Hypergraph::Node*& node = tps2node[(size_t)edge->tps];
+ if (!node) {
+ // cerr << "Creating phrases for " << edge->tps << endl;
+ const vector<TRulePtr>& rules = edge->tps->GetRules();
+ node = hg->AddNode(kPHRASE);
+ for (int i = 0; i < rules.size(); ++i) {
+ Hypergraph::Edge* hg_edge = hg->AddEdge(rules[i], Hypergraph::TailNodeVector());
+ hg_edge->feature_values_ += rules[i]->GetFeatureValues();
+ hg->ConnectEdgeToHeadNode(hg_edge, node);
+ }
+ }
+ tps = node;
+ }
+ // operator[] default-inserts a NULL entry the first time an edge is seen
+ Hypergraph::Node*& head_node = edge2node[edge];
+ if (!head_node)
+ head_node = hg->AddNode(kPHRASE);
+ if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_0_ && edge->IsPassive()) {
+ assert(goal_node == NULL || goal_node == head_node);
+ goal_node = head_node;
+ }
+ Hypergraph::TailNodeVector tail;
+ SparseVector<double> extra;
+ if (edge->IsCreatedByPredict()) {
+ // extra.set_value(FD::Convert("predict"), 1);
+ } else if (edge->IsCreatedByScan()) {
+ tail.push_back(edge2node[edge->active_parent]->id_);
+ if (tps) {
+ tail.push_back(tps->id_);
+ }
+ //extra.set_value(FD::Convert("scan"), 1);
+ } else if (edge->IsCreatedByComplete()) {
+ tail.push_back(edge2node[edge->active_parent]->id_);
+ tail.push_back(edge2node[edge->passive_parent]->id_);
+ //extra.set_value(FD::Convert("complete"), 1);
+ } else {
+ assert(!"unexpected edge type!");
+ }
+ //cerr << head_node->id_ << "<--" << *edge << endl;
+
+#ifdef DEBUG_CHART_PARSER
+ for (int i = 0; i < tail.size(); ++i)
+ if (tail[i] == head_node->id_) {
+ cerr << "ERROR: " << *edge << "\n i=" << i << endl;
+ if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; }
+ if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; }
+ assert(!"self-loop found!");
+ }
+#endif
+ // pick the glue rule by tail arity
+ // NOTE(review): hg_edge stays NULL if tail.size() > 2; the branches above
+ // never push more than two entries, so this is not reachable from here.
+ Hypergraph::Edge* hg_edge = NULL;
+ if (tail.size() == 0) {
+ hg_edge = hg->AddEdge(kEPSRule, tail);
+ } else if (tail.size() == 1) {
+ hg_edge = hg->AddEdge(kX1, tail);
+ } else if (tail.size() == 2) {
+ hg_edge = hg->AddEdge(kX1X2, tail);
+ }
+ if (edge->features)
+ hg_edge->feature_values_ += *edge->features;
+ hg_edge->feature_values_ += extra;
+ hg->ConnectEdgeToHeadNode(hg_edge, head_node);
+ }
+
+ // head node of the passive start-category edge spanning q_0_..q_0_, once found
+ Hypergraph::Node* goal_node;
+ // edges produced by IncorporateNewEdge
+ EdgeQueue exp_agenda;
+ // edges accepted as new by FinishEdge
+ EdgeQueue agenda;
+ // target-phrase-set address -> forest node holding its rules
+ unordered_map<size_t, Hypergraph::Node*> tps2node;
+ // chart edge -> forest node representing it
+ unordered_map<const Edge*, Hypergraph::Node*, UniqueEdgeHash, UniqueEdgeEquals> edge2node;
+ // traversals already seen by IncorporateNewEdge
+ unordered_set<const Traversal*, UniqueTraversalHash, UniqueTraversalEquals> all_traversals;
+ // every distinct edge seen by FinishEdge
+ unordered_set<const Edge*, UniqueEdgeHash, UniqueEdgeEquals> all_edges;
+ // lookup indexes queried by DoMergeWithPassives / DoComplete
+ unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals> passive_edges;
+ unordered_multiset<const Edge*, REdgeHash, REdgeEquals> active_edges;
+ // every Edge / Traversal handed to IncorporateNewEdge is recorded here
+ vector<Edge*> free_list_;
+ vector<Traversal*> traversal_free_list_;
+ const WordID start_cat_;
+ const FSTNode* const q_0_;
+};
+
+#ifdef DEBUG_CHART_PARSER
+// Returns the part of rule string `r` between the first " |||" separator
+// (skipping the separator and the following space) and the last " |||".
+// With a single separator, everything after it is returned. A string with
+// no separator is returned unchanged (previously npos + 5 wrapped around and
+// produced a garbage slice or threw std::out_of_range).
+static string TrimRule(const string& r) {
+  const size_t first = r.find(" |||");
+  if (first == string::npos) return r;
+  const size_t start = first + 5;
+  if (start >= r.size()) return string();
+  const size_t end = r.rfind(" |||");
+  if (end <= start) return r.substr(start);
+  return r.substr(start, end - start);
+}
+#endif
+
+// Parses one textual rule "[LHS] ||| rhs [||| features]" and inserts it into
+// the grammar trie `g`. Lines that do not start with '[' or contain no
+// " ||| " separator are reported on stderr and ignored.
+void AddGrammarRule(const string& r, EGrammar* g) {
+ const size_t pos = r.find(" ||| ");
+ if (pos == string::npos || r[0] != '[') {
+ cerr << "Bad rule: " << r << endl;
+ return;
+ }
+ // a second separator, if present, introduces the feature string
+ const size_t rpos = r.rfind(" ||| ");
+ string feats;
+ string rs = r;
+ if (rpos != pos) {
+ feats = r.substr(rpos + 5);
+ rs = r.substr(0, rpos);
+ }
+ string rhs = rs.substr(pos + 5);
+ // TRule is given "LHS ||| rhs ||| rhs ||| feats": the rhs is repeated
+ string trule = rs + " ||| " + rhs + " ||| " + feats;
+ TRule tr(trule);
+#ifdef DEBUG_CHART_PARSER
+ string hint_last_rule;
+#endif
+ // operator[] creates the root node for this LHS on first use
+ EGrammarNode* cur = &(*g)[tr.GetLHS()];
+ cur->is_root = true;
+ // walk / extend the trie one source symbol at a time; negative symbols
+ // follow the non-terminal map, all others the terminal map
+ for (int i = 0; i < tr.FLength(); ++i) {
+ WordID sym = tr.f()[i];
+#ifdef DEBUG_CHART_PARSER
+ hint_last_rule = TD::Convert(sym < 0 ? -sym : sym);
+ cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString());
+#endif
+ if (sym < 0)
+ cur = &cur->ntptr[sym];
+ else
+ cur = &cur->tptr[sym];
+ }
+#ifdef DEBUG_CHART_PARSER
+ cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString());
+#endif
+ // mark the final node as the end of a rule and store the rule's features
+ cur->is_some_rule_complete = true;
+ cur->input_features = tr.GetFeatureValues();
+}
+
+// Releases the implementation object allocated in the constructor.
+EarleyComposer::~EarleyComposer() {
+ delete pimpl_;
+}
+
+// Builds a composer over the FST rooted at `fst`.
+// `fst` is dereferenced here, so it must not be NULL.
+EarleyComposer::EarleyComposer(const FSTNode* fst) {
+ InitializeConstants();
+ pimpl_ = new EarleyComposerImpl(kUNIQUE_START, *fst);
+}
+
+// Converts the source forest into a grammar trie (one rule per hyperedge,
+// with non-terminals replaced by the category of the corresponding tail
+// node) and runs the composition, writing the result to `trg_forest`.
+bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) {
+ // first, convert the src forest into an EGrammar
+ EGrammar g;
+ const int nedges = src_forest.edges_.size();
+ const int nnodes = src_forest.nodes_.size();
+ vector<int> cats(nnodes);
+ // hard-wired to false: node categories are taken from the forest rather
+ // than being synthesized as CAT_<node index>
+ bool assign_cats = false;
+ for (int i = 0; i < nnodes; ++i)
+ if (assign_cats) {
+ cats[i] = TD::Convert("CAT_" + boost::lexical_cast<string>(i)) * -1;
+ } else {
+ cats[i] = src_forest.nodes_[i].cat_;
+ }
+ // construct the grammar
+ for (int i = 0; i < nedges; ++i) {
+ const Hypergraph::Edge& edge = src_forest.edges_[i];
+ const vector<WordID>& src = edge.rule_->f();
+ EGrammarNode* cur = &g[cats[edge.head_node_]];
+ cur->is_root = true;
+ // ntc counts the non-terminals consumed so far; the k-th non-terminal in
+ // the rule's source side is mapped to the k-th tail node of the edge
+ int ntc = 0;
+ for (int j = 0; j < src.size(); ++j) {
+ WordID sym = src[j];
+ if (sym <= 0) {
+ sym = cats[edge.tail_nodes_[ntc]];
+ ++ntc;
+ cur = &cur->ntptr[sym];
+ } else {
+ cur = &cur->tptr[sym];
+ }
+ }
+ cur->is_some_rule_complete = true;
+ cur->input_features = edge.feature_values_;
+ }
+ // the start category must have exactly one outgoing symbol
+ // (note: g[...] creates an empty entry if the category is absent)
+ EGrammarNode& goal_rule = g[kUNIQUE_START];
+ assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) ||
+ (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1));
+
+ return pimpl_->Compose(g, trg_forest);
+}
+
+// Reads one grammar rule per non-empty line from `in`, builds the grammar
+// trie from them and runs the composition into `trg_forest`.
+bool EarleyComposer::Compose(istream* in, Hypergraph* trg_forest) {
+  EGrammar g;
+  for (string line; getline(*in, line); ) {
+    if (!line.empty())
+      AddGrammarRule(line, &g);
+  }
+
+  return pimpl_->Compose(g, trg_forest);
+}
diff --git a/decoder/earley_composer.h b/decoder/earley_composer.h
new file mode 100644
index 00000000..9f786bf6
--- /dev/null
+++ b/decoder/earley_composer.h
@@ -0,0 +1,29 @@
+#ifndef _EARLEY_COMPOSER_H_
+#define _EARLEY_COMPOSER_H_
+
+#include <iostream>
+
+class EarleyComposerImpl;
+class FSTNode;
+class Hypergraph;
+
+// Composes a source-side grammar (given as a forest or as a rule file) with
+// a phrase table FST, producing a translation forest. The implementation is
+// hidden behind a pimpl that this object owns.
+class EarleyComposer {
+ public:
+  ~EarleyComposer();
+  // phrasetable_root must not be NULL (it is dereferenced on construction)
+  EarleyComposer(const FSTNode* phrasetable_root);
+  bool Compose(const Hypergraph& src_forest, Hypergraph* trg_forest);
+
+  // reads the grammar from a file. There must be a single top-level
+  // S -> X rule. Anything else is possible. Format is:
+  // [S] ||| [SS,1]
+  // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3
+  // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8
+  // [NP] ||| [DET,1] [N,2] ||| Feature3=2
+  // ...
+  bool Compose(std::istream* grammar_file, Hypergraph* trg_forest);
+
+ private:
+  // Not copyable: pimpl_ is deleted in the destructor, so an implicit
+  // member-wise copy would free it twice. Declared but never defined.
+  EarleyComposer(const EarleyComposer&);
+  EarleyComposer& operator=(const EarleyComposer&);
+
+  EarleyComposerImpl* pimpl_;
+};
+
+#endif
diff --git a/decoder/exp_semiring.h b/decoder/exp_semiring.h
new file mode 100644
index 00000000..f91beee4
--- /dev/null
+++ b/decoder/exp_semiring.h
@@ -0,0 +1,71 @@
+#ifndef _EXP_SEMIRING_H_
+#define _EXP_SEMIRING_H_
+
+#include <iostream>
+
+// this file implements the first-order expectation semiring described
+// in Li & Eisner (EMNLP 2009)
+
+// requirements:
+// RType * RType ==> RType
+// PType * PType ==> PType
+// RType * PType ==> RType
+// good examples:
+// PType scalar, RType vector
+// BAD examples:
+// PType vector, RType scalar
+// Element <p, r> of the first-order expectation semiring.
+//   addition:        <p1 + p2, r1 + r2>
+//   multiplication:  <p1 * p2, p1 * r2 + p2 * r1>
+template <typename PType, typename RType>
+struct PRPair {
+  PType p;
+  RType r;
+
+  PRPair() : p(), r() {}
+  // Inside algorithm requires that T(0) and T(1)
+  // return the 0 and 1 values of the semiring
+  explicit PRPair(double x) : p(x), r() {}
+  PRPair(const PType& p, const RType& r) : p(p), r(r) {}
+
+  PRPair& operator+=(const PRPair& rhs) {
+    p += rhs.p;
+    r += rhs.r;
+    return *this;
+  }
+
+  PRPair& operator*=(const PRPair& rhs) {
+    // r has to be updated first: it needs the old value of p
+    r = (rhs.r * p) + (rhs.p * r);
+    p *= rhs.p;
+    return *this;
+  }
+};
+
+// Prints a pair as "<p, r>".
+template <typename P, typename R>
+std::ostream& operator<<(std::ostream& o, const PRPair<P,R>& x) {
+  o << '<' << x.p;
+  o << ", " << x.r << '>';
+  return o;
+}
+
+// Semiring addition, expressed through PRPair::operator+=.
+template <typename P, typename R>
+const PRPair<P,R> operator+(const PRPair<P,R>& a, const PRPair<P,R>& b) {
+  PRPair<P,R> sum(a);
+  sum += b;
+  return sum;
+}
+
+// Semiring multiplication, expressed through PRPair::operator*=.
+template <typename P, typename R>
+const PRPair<P,R> operator*(const PRPair<P,R>& a, const PRPair<P,R>& b) {
+  PRPair<P,R> product(a);
+  product *= b;
+  return product;
+}
+
+// Edge weight functor for the expectation semiring: combines a P-valued and
+// an R-valued edge weight function into one returning <p, r * p>.
+template <typename P, typename PWeightFunction, typename R, typename RWeightFunction>
+struct PRWeightFunction {
+  const PWeightFunction pweight;
+  const RWeightFunction rweight;
+
+  explicit PRWeightFunction(const PWeightFunction& pwf = PWeightFunction(),
+                            const RWeightFunction& rwf = RWeightFunction()) :
+      pweight(pwf), rweight(rwf) {}
+
+  PRPair<P,R> operator()(const Hypergraph::Edge& e) const {
+    const P edge_p = pweight(e);
+    const R edge_r = rweight(e);
+    return PRPair<P,R>(edge_p, edge_r * edge_p);
+  }
+};
+
+#endif
diff --git a/decoder/fdict.cc b/decoder/fdict.cc
new file mode 100644
index 00000000..7e1b0e1f
--- /dev/null
+++ b/decoder/fdict.cc
@@ -0,0 +1,129 @@
+#include "fdict.h"
+
+#include <string>
+
+using namespace std;
+
+Dict FD::dict_;
+bool FD::frozen_ = false;
+
+// Interprets code[0] and code[1] as two hexadecimal digits (either case) and
+// returns their value (0..255), or -1 if either character is not a hex digit.
+// code[1] is only read when code[0] is a valid digit.
+static int HexPairValue(const char * code) {
+  int value = 0;
+  for (int k = 0; k < 2; ++k) {
+    const int c = code[k];
+    int nibble;
+    if (c >= '0' && c <= '9')
+      nibble = c - '0';
+    else if (c >= 'A' && c <= 'F')
+      nibble = c - 'A' + 10;
+    else if (c >= 'a' && c <= 'f')
+      nibble = c - 'a' + 10;
+    else
+      return -1;
+    value = (value << 4) + nibble;
+  }
+  return value;
+}
+
+// Decodes the NUL-terminated URL-encoded string `source` into `dest`:
+// '+' becomes a space, "%XX" becomes the byte with hex value XX, and a '%'
+// not followed by two hex digits becomes '?'. `dest` is NUL-terminated and
+// must have room for strlen(source) + 1 bytes (the output is never longer
+// than the input). Returns the number of bytes written, excluding the NUL.
+int UrlDecode(const char *source, char *dest)
+{
+  char * const begin = dest;
+
+  for (; *source; ++source) {
+    const char c = *source;
+    if (c == '+') {
+      *dest++ = ' ';
+      continue;
+    }
+    if (c != '%') {
+      *dest++ = c;
+      continue;
+    }
+    // only look at the two following characters if both exist
+    const int value = (source[1] && source[2]) ? HexPairValue(source + 1) : -1;
+    if (value >= 0) {
+      *dest++ = value;
+      source += 2;  // skip the two hex digits; the loop skips the '%'
+    } else {
+      *dest++ = '?';
+    }
+  }
+
+  *dest = 0;
+  return dest - begin;
+}
+
+// URL-encodes the NUL-terminated string `source` into `dest`, writing at
+// most `max` bytes including the terminating NUL. A space becomes '+'; the
+// characters = : ; , _ | % become "%XX" (upper-case hex); everything else
+// is copied. Encoding stops early when the next character would not fit.
+// Returns the number of bytes written, excluding the NUL (0 if max == 0, in
+// which case nothing is written at all).
+//
+// Fixes over the previous version: the size limit is actually enforced (the
+// old length counter was never incremented and `max - 4` underflowed for
+// small `max`), and the return value is the positive length rather than
+// `start - dest`.
+int UrlEncode(const char *source, char *dest, unsigned max) {
+  static const char digits[] = "0123456789ABCDEF";
+  if (max == 0)
+    return 0;
+
+  char *out = dest;
+  unsigned room = max - 1;  // bytes still available, NUL excluded
+  for (; *source; ++source) {
+    const unsigned char ch = (unsigned char)*source;
+    bool escape = false;
+    switch (ch) {
+      case '=': case ':': case ';': case ',':
+      case '_': case '|': case '%':
+        escape = true;
+        break;
+      default:
+        break;
+    }
+    const unsigned need = escape ? 3u : 1u;
+    if (room < need)
+      break;
+    if (ch == ' ') {
+      *out++ = '+';
+    } else if (escape) {
+      *out++ = '%';
+      *out++ = digits[(ch >> 4) & 0x0F];
+      *out++ = digits[ ch & 0x0F];
+    } else {
+      *out++ = *source;
+    }
+    room -= need;
+  }
+  *out = 0;
+  return static_cast<int>(out - dest);
+}
+
+// Returns the URL-decoded form of `encoded` (see UrlDecode). As before, the
+// result ends at the first NUL byte produced by decoding.
+//
+// The scratch buffer is a std::vector, so it is released on every path and
+// there is no unchecked malloc(). Decoding never yields more bytes than it
+// consumes, so size() + 1 bytes are sufficient.
+std::string UrlDecodeString(const std::string & encoded) {
+  std::vector<char> buf(encoded.size() + 1);
+  UrlDecode(encoded.c_str(), &buf[0]);
+  return std::string(&buf[0]);
+}
+
+// Returns the URL-encoded form of `decoded` (see UrlEncode).
+//
+// Each input byte expands to at most three output bytes; the buffer keeps
+// the original 3 * length + 3 sizing. It is a std::vector, so it is released
+// on every path and there is no unchecked malloc().
+std::string UrlEncodeString(const std::string & decoded) {
+  std::vector<char> buf(decoded.length() * 3 + 3);
+  UrlEncode(decoded.c_str(), &buf[0], buf.size());
+  return std::string(&buf[0]);
+}
+
+// Escapes `s` by URL-encoding it (see UrlEncodeString).
+string FD::Escape(const string& s) {
+ return UrlEncodeString(s);
+}
+
diff --git a/decoder/fdict.h b/decoder/fdict.h
new file mode 100644
index 00000000..c4236580
--- /dev/null
+++ b/decoder/fdict.h
@@ -0,0 +1,31 @@
+#ifndef _FDICT_H_
+#define _FDICT_H_
+
+#include <string>
+#include <vector>
+#include "dict.h"
+
+// Process-wide feature dictionary: maps feature names to integer ids and
+// back through a single static Dict.
+struct FD {
+ // once the FD is frozen, new features not already in the
+ // dictionary will return 0
+ static void Freeze() {
+ frozen_ = true;
+ }
+ // one more than the largest id currently in the dictionary
+ static inline int NumFeats() {
+ return dict_.max() + 1;
+ }
+ // name -> id; the frozen flag is forwarded to Dict::Convert
+ static inline WordID Convert(const std::string& s) {
+ return dict_.Convert(s, frozen_);
+ }
+ // id -> name
+ static inline const std::string& Convert(const WordID& w) {
+ return dict_.Convert(w);
+ }
+ // Escape any string to a form that can be used as the name
+ // of a weight in a weights file
+ static std::string Escape(const std::string& s);
+ static Dict dict_;
+ private:
+ static bool frozen_;
+};
+
+#endif
diff --git a/decoder/ff.cc b/decoder/ff.cc
new file mode 100644
index 00000000..61f4f0b6
--- /dev/null
+++ b/decoder/ff.cc
@@ -0,0 +1,137 @@
+#include "ff.h"
+
+#include "tdict.h"
+#include "hg.h"
+
+using namespace std;
+
+// Out-of-line definition of the virtual destructor; nothing to release.
+FeatureFunction::~FeatureFunction() {}
+
+
+// Default implementation: adds no features. The casts only silence
+// unused-parameter warnings.
+void FeatureFunction::FinalTraversalFeatures(const void* ant_state,
+ SparseVector<double>* features) const {
+ (void) ant_state;
+ (void) features;
+}
+
+// Hiero and Joshua use log_10(e) as the value, so I do too.
+// The feature takes no configuration; a non-empty `param` only triggers a
+// warning on stderr.
+WordPenalty::WordPenalty(const string& param) :
+    fid_(FD::Convert("WordPenalty")),
+    value_(-1.0 / log(10)) {
+  if (param.empty())
+    return;
+  cerr << "Warning WordPenalty ignoring parameter: " << param << endl;
+}
+
+// Sets the WordPenalty feature to value_ times the rule's EWords() count.
+// No state or estimated features are produced.
+void WordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                        const Hypergraph::Edge& edge,
+                                        const std::vector<const void*>& ant_states,
+                                        SparseVector<double>* features,
+                                        SparseVector<double>* estimated_features,
+                                        void* state) const {
+  // unused by this stateless feature
+  (void) smeta;
+  (void) ant_states;
+  (void) state;
+  (void) estimated_features;
+  const double penalty = edge.rule_->EWords() * value_;
+  features->set_value(fid_, penalty);
+}
+
+// Same constant as WordPenalty (-1 / ln 10). The feature takes no
+// configuration; a non-empty `param` only triggers a warning on stderr.
+SourceWordPenalty::SourceWordPenalty(const string& param) :
+    fid_(FD::Convert("SourceWordPenalty")),
+    value_(-1.0 / log(10)) {
+  if (param.empty())
+    return;
+  cerr << "Warning SourceWordPenalty ignoring parameter: " << param << endl;
+}
+
+// Sets the SourceWordPenalty feature to value_ times the rule's FWords()
+// count. No state or estimated features are produced.
+void SourceWordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                              const Hypergraph::Edge& edge,
+                                              const std::vector<const void*>& ant_states,
+                                              SparseVector<double>* features,
+                                              SparseVector<double>* estimated_features,
+                                              void* state) const {
+  // unused by this stateless feature
+  (void) smeta;
+  (void) ant_states;
+  (void) state;
+  (void) estimated_features;
+  const double penalty = edge.rule_->FWords() * value_;
+  features->set_value(fid_, penalty);
+}
+
+// Registers the ten features Arity_0 .. Arity_9, one per supported edge
+// arity. `param` is not used.
+ArityPenalty::ArityPenalty(const std::string& param) :
+    value_(-1.0 / log(10)) {
+  const string prefix = "Arity_";
+  for (int digit = 0; digit < 10; ++digit) {
+    fids_[digit] = FD::Convert(prefix + static_cast<char>('0' + digit));
+  }
+}
+
+// Fires the Arity_<n> feature for the edge's arity n with the fixed weight
+// value_. Edges whose arity has no registered feature (outside the fids_
+// table) fire nothing; previously such an arity indexed past the end of
+// fids_.
+void ArityPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                         const Hypergraph::Edge& edge,
+                                         const std::vector<const void*>& ant_states,
+                                         SparseVector<double>* features,
+                                         SparseVector<double>* estimated_features,
+                                         void* state) const {
+  (void) smeta;
+  (void) ant_states;
+  (void) state;
+  (void) estimated_features;
+  const size_t kNumArities = sizeof(fids_) / sizeof(fids_[0]);
+  // a negative arity converts to a huge value and is rejected as well
+  const size_t arity = edge.Arity();
+  if (arity >= kNumArities)
+    return;
+  features->set_value(fids_[arity], value_);
+}
+
+// Stores the models and weights and lays the models' state blocks out back
+// to back: model_state_pos_[m] is the byte offset of model m's state and
+// state_size_ is the total number of state bytes.
+ModelSet::ModelSet(const vector<double>& w, const vector<const FeatureFunction*>& models) :
+    models_(models),
+    weights_(w),
+    state_size_(0),
+    model_state_pos_(models.size()) {
+  int offset = 0;
+  for (size_t m = 0; m < models_.size(); ++m) {
+    model_state_pos_[m] = offset;
+    offset += models_[m]->NumBytesContext();
+  }
+  state_size_ = offset;
+}
+
+// Runs every model over `edge`:
+//  - `context` is resized to state_size_ zero bytes and each stateful model
+//    writes its state into its own slice of it;
+//  - each stateful model receives, per tail node, a pointer to its slice of
+//    that node's state string in `node_states`;
+//  - feature values accumulate in edge->feature_values_, from which
+//    edge->edge_prob_ is set;
+//  - if combination_cost_estimate is non-NULL it receives the weighted
+//    estimated features.
+//
+// Changes over the previous version: the state buffer is zero-filled with
+// string::assign instead of memset on &(*context)[0] (which indexed an empty
+// string when no model has state), and the inner loop no longer reuses the
+// outer loop's index name.
+void ModelSet::AddFeaturesToEdge(const SentenceMetadata& smeta,
+                                 const Hypergraph& hg,
+                                 const vector<string>& node_states,
+                                 Hypergraph::Edge* edge,
+                                 string* context,
+                                 prob_t* combination_cost_estimate) const {
+  context->assign(static_cast<size_t>(state_size_), '\0');
+  SparseVector<double> est_vals;  // only computed if combination_cost_estimate is non-NULL
+  if (combination_cost_estimate) *combination_cost_estimate = prob_t::One();
+  const size_t num_tails = edge->tail_nodes_.size();
+  for (size_t m = 0; m < models_.size(); ++m) {
+    const FeatureFunction& ff = *models_[m];
+    void* cur_ff_context = NULL;
+    // stays all-NULL for stateless models
+    vector<const void*> ants(num_tails);
+    if (ff.NumBytesContext() > 0) {
+      const int spos = model_state_pos_[m];
+      cur_ff_context = &(*context)[spos];
+      for (size_t t = 0; t < num_tails; ++t) {
+        ants[t] = &node_states[edge->tail_nodes_[t]][spos];
+      }
+    }
+    ff.TraversalFeatures(smeta, *edge, ants, &edge->feature_values_, &est_vals, cur_ff_context);
+  }
+  if (combination_cost_estimate)
+    combination_cost_estimate->logeq(est_vals.dot(weights_));
+  edge->edge_prob_.logeq(edge->feature_values_.dot(weights_));
+}
+
+// Lets every model score the residual state of the single antecedent of a
+// goal edge, then recomputes edge->edge_prob_ from the feature values.
+// Stateless models are called with a NULL state pointer.
+void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge) const {
+  assert(1 == edge->rule_->Arity());
+
+  for (size_t m = 0; m < models_.size(); ++m) {
+    const FeatureFunction& ff = *models_[m];
+    const void* ant_state = NULL;
+    if (ff.NumBytesContext() > 0)
+      ant_state = &state[model_state_pos_[m]];
+    ff.FinalTraversalFeatures(ant_state, &edge->feature_values_);
+  }
+  edge->edge_prob_.logeq(edge->feature_values_.dot(weights_));
+}
+
diff --git a/decoder/ff.h b/decoder/ff.h
new file mode 100644
index 00000000..630b3208
--- /dev/null
+++ b/decoder/ff.h
@@ -0,0 +1,152 @@
+#ifndef _FF_H_
+#define _FF_H_
+
+#include <vector>
+
+#include "fdict.h"
+#include "hg.h"
+
+class SentenceMetadata;
+class FeatureFunction; // see definition below
+
+// if you want to develop a new feature, inherit from this class and
+// override TraversalFeaturesImpl(...). If it's a feature that returns /
+// depends on context, you may also need to implement
+// FinalTraversalFeatures(...)
+class FeatureFunction {
+ public:
+ // stateless by default (state size 0)
+ FeatureFunction() : state_size_() {}
+ explicit FeatureFunction(int state_size) : state_size_(state_size) {}
+ virtual ~FeatureFunction();
+
+ // returns the number of bytes of context that this feature function will
+ // (maximally) use. By default, 0 ("stateless" models in Hiero/Joshua).
+ // NOTE: this value is fixed for the instance of your class, you cannot
+ // use different amounts of memory for different nodes in the forest.
+ inline int NumBytesContext() const { return state_size_; }
+
+ // Compute the feature values and (if this applies) the estimates of the
+ // feature values when this edge is incorporated into a larger context.
+ // Non-virtual entry point: forwards to TraversalFeaturesImpl.
+ inline void TraversalFeatures(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_state) const {
+ TraversalFeaturesImpl(smeta, edge, ant_contexts,
+ features, estimated_features, out_state);
+ // TODO it's easy for careless feature function developers to overwrite
+ // the end of their state and clobber someone else's memory. These bugs
+ // will be horrendously painful to track down. There should be some
+ // optional strict mode that's enforced here that adds some kind of
+ // barrier between the blocks reserved for the residual contexts
+ }
+
+ // if there's some state left when you transition to the goal state, score
+ // it here. For example, the language model computes the cost of adding
+ // <s> and </s>.
+ virtual void FinalTraversalFeatures(const void* residual_state,
+ SparseVector<double>* final_features) const;
+
+ protected:
+ // context is a pointer to a buffer of size NumBytesContext() that the
+ // feature function can write its state to. It's up to the feature function
+ // to determine how much space it needs and to determine how to encode its
+ // residual contextual information since it is OPAQUE to all clients outside
+ // of the particular FeatureFunction class. There is one exception:
+ // equality of the contents (i.e., memcmp) is required to determine whether
+ // two states can be combined.
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const = 0;
+
+ // !!! ONLY call this from subclass *CONSTRUCTORS* !!!
+ // NOTE(review): takes a size_t but stores it in an int member.
+ void SetStateSize(size_t state_size) {
+ state_size_ = state_size;
+ }
+
+ private:
+ int state_size_;
+};
+
+// word penalty feature, for each word on the E side of a rule,
+// add value_
+class WordPenalty : public FeatureFunction {
+ public:
+ // param is ignored (a warning is printed if it is non-empty)
+ WordPenalty(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+ const int fid_; // id of the "WordPenalty" feature
+ const double value_; // per-word contribution
+};
+
+// source-side counterpart of WordPenalty: the feature value is value_ times
+// the rule's FWords() count
+class SourceWordPenalty : public FeatureFunction {
+ public:
+ // param is ignored (a warning is printed if it is non-empty)
+ SourceWordPenalty(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+ const int fid_; // id of the "SourceWordPenalty" feature
+ const double value_; // per-word contribution
+};
+
+// fires one indicator-style feature (Arity_0 .. Arity_9) per edge, selected
+// by the edge's arity, with the constant weight value_
+class ArityPenalty : public FeatureFunction {
+ public:
+ ArityPenalty(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+ int fids_[10]; // feature ids indexed by arity; only arities 0-9 are covered
+ const double value_;
+};
+
+// this class is a set of FeatureFunctions that can be used to score, rescore,
+// etc. a (translation?) forest
+class ModelSet {
+ public:
+ // empty model set with no state
+ ModelSet() : state_size_(0) {}
+
+ // the FeatureFunction pointers are stored as given; this class never
+ // deletes them
+ ModelSet(const std::vector<double>& weights,
+ const std::vector<const FeatureFunction*>& models);
+
+ // sets edge->feature_values_ and edge->edge_prob_
+ // NOTE: edge must not necessarily be in hg.edges_ but its TAIL nodes
+ // must be.
+ void AddFeaturesToEdge(const SentenceMetadata& smeta,
+ const Hypergraph& hg,
+ const std::vector<std::string>& node_states,
+ Hypergraph::Edge* edge,
+ std::string* residual_context,
+ prob_t* combination_cost_estimate = NULL) const;
+
+ // scores the residual state of a goal edge's single antecedent
+ void AddFinalFeatures(const std::string& residual_context,
+ Hypergraph::Edge* edge) const;
+
+ bool empty() const { return models_.empty(); }
+ private:
+ std::vector<const FeatureFunction*> models_;
+ std::vector<double> weights_;
+ int state_size_; // total bytes of state over all models
+ std::vector<int> model_state_pos_; // byte offset of each model's state
+};
+
+#endif
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
new file mode 100644
index 00000000..658603e4
--- /dev/null
+++ b/decoder/ff_csplit.cc
@@ -0,0 +1,225 @@
+#include "ff_csplit.h"
+
+#include <set>
+#include <cstring>
+
+#include "Vocab.h"
+#include "Ngram.h"
+
+#include "sentence_metadata.h"
+#include "lattice.h"
+#include "tdict.h"
+#include "freqdict.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "tdict.h"
+
+using namespace std;
+
+// Implementation object behind BasicCSplitFeatures: holds the feature ids,
+// a frequency dictionary and an optional set of "bad" words.
+struct BasicCSplitFeaturesImpl {
+ // param is "freqdict.txt [badwords.txt]"; any other argument count aborts
+ // the process
+ BasicCSplitFeaturesImpl(const string& param) :
+ word_count_(FD::Convert("WordCount")),
+ letters_sq_(FD::Convert("LettersSq")),
+ letters_sqrt_(FD::Convert("LettersSqrt")),
+ in_dict_(FD::Convert("InDict")),
+ short_(FD::Convert("Short")),
+ long_(FD::Convert("Long")),
+ oov_(FD::Convert("OOV")),
+ short_range_(FD::Convert("ShortRange")),
+ high_freq_(FD::Convert("HighFreq")),
+ med_freq_(FD::Convert("MedFreq")),
+ freq_(FD::Convert("Freq")),
+ fl1_(FD::Convert("FreqLen1")),
+ fl2_(FD::Convert("FreqLen2")),
+ bad_(FD::Convert("Bad")) {
+ vector<string> argv;
+ int argc = SplitOnWhitespace(param, &argv);
+ if (argc != 1 && argc != 2) {
+ cerr << "Expected: freqdict.txt [badwords.txt]\n";
+ abort();
+ }
+ freq_dict_.Load(argv[0]);
+ if (argc == 2) {
+ // whitespace-separated list of bad words
+ ReadFile rf(argv[1]);
+ istream& in = *rf.stream();
+ while(in) {
+ string badword;
+ in >> badword;
+ // a failed read at end of input leaves badword empty
+ if (badword.empty()) continue;
+ bad_words_.insert(TD::Convert(badword));
+ }
+ }
+ }
+
+ void TraversalFeaturesImpl(const Hypergraph::Edge& edge,
+ SparseVector<double>* features) const;
+
+ // feature ids, named after the feature strings registered above
+ const int word_count_;
+ const int letters_sq_;
+ const int letters_sqrt_;
+ const int in_dict_;
+ const int short_;
+ const int long_;
+ const int oov_;
+ const int short_range_;
+ const int high_freq_;
+ const int med_freq_;
+ const int freq_;
+ const int fl1_;
+ const int fl2_;
+ const int bad_;
+ FreqDict freq_dict_;
+ set<WordID> bad_words_;
+};
+
+// Forwards `param` to the implementation object, which parses it.
+BasicCSplitFeatures::BasicCSplitFeatures(const string& param) :
+ pimpl_(new BasicCSplitFeaturesImpl(param)) {}
+
+// Computes the length- and frequency-based features for the word produced
+// by `edge` and stores them in `features`.
+void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
+ const Hypergraph::Edge& edge,
+ SparseVector<double>* features) const {
+ features->set_value(word_count_, 1.0);
+ // span length (edge.j_ - edge.i_), squared and square-rooted
+ features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_));
+ features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_));
+ // NOTE(review): reads e_[1]; the caller only guarantees Arity() != 0 and
+ // EWords() == 1, so presumably e_[0] is a variable -- confirm.
+ const WordID word = edge.rule_->e_[1];
+ const char* sword = TD::Convert(word);
+ const int len = strlen(sword);
+ // count UTF-8 characters rather than bytes
+ int cur = 0;
+ int chars = 0;
+ while(cur < len) {
+ cur += UTF8Len(sword[cur]);
+ ++chars;
+ }
+
+ // these are corrections that attempt to make chars
+ // more like a phoneme count than a letter count, they
+ // are only really meaningful for german and should
+ // probably be gotten rid of
+ bool has_sch = strstr(sword, "sch");
+ bool has_ch = (!has_sch && strstr(sword, "ch"));
+ bool has_ie = strstr(sword, "ie");
+ bool has_zw = strstr(sword, "zw");
+ if (has_sch) chars -= 2;
+ if (has_ch) --chars;
+ if (has_ie) --chars;
+ if (has_zw) --chars;
+
+ // a lookup result of 0 is treated as out-of-vocabulary; such words get a
+ // fixed fallback value of 99 for the features below
+ float freq = freq_dict_.LookUp(word);
+ if (freq) {
+ features->set_value(freq_, freq);
+ features->set_value(in_dict_, 1.0);
+ } else {
+ features->set_value(oov_, 1.0);
+ freq = 99.0f;
+ }
+ if (bad_words_.count(word) != 0)
+ features->set_value(bad_, 1.0);
+ if (chars < 5)
+ features->set_value(short_, 1.0);
+ if (chars > 10)
+ features->set_value(long_, 1.0);
+ if (freq < 7.0f)
+ features->set_value(high_freq_, 1.0);
+ if (freq > 8.0f && freq < 10.f)
+ features->set_value(med_freq_, 1.0);
+ if (freq < 10.0f && chars < 5)
+ features->set_value(short_range_, 1.0);
+
+ // i don't understand these features, but they really help!
+ // NOTE(review): chars is not checked against 0 before the division.
+ features->set_value(fl1_, sqrt(chars * freq));
+ features->set_value(fl2_, freq / chars);
+}
+
+// Stateless wrapper: delegates to the implementation object, but only for
+// edges with at least one tail node whose rule has exactly one E-side word.
+void BasicCSplitFeatures::TraversalFeaturesImpl(
+    const SentenceMetadata& smeta,
+    const Hypergraph::Edge& edge,
+    const std::vector<const void*>& ant_contexts,
+    SparseVector<double>* features,
+    SparseVector<double>* estimated_features,
+    void* out_context) const {
+  (void) smeta;
+  (void) ant_contexts;
+  (void) out_context;
+  (void) estimated_features;
+  const bool applicable = edge.Arity() != 0 && edge.rule_->EWords() == 1;
+  if (applicable)
+    pimpl_->TraversalFeaturesImpl(edge, features);
+}
+
+// Implementation object behind ReverseCharLMCSplitFeature: wraps a 5-gram
+// language model read from the file named by `param`.
+struct ReverseCharLMCSplitFeatureImpl {
+ ReverseCharLMCSplitFeatureImpl(const string& param) :
+ order_(5),
+ vocab_(*TD::dict_),
+ ngram_(vocab_, order_) {
+ kBOS = vocab_.getIndex("<s>");
+ kEOS = vocab_.getIndex("</s>");
+ File file(param.c_str(), "r", 0);
+ assert(file);
+ cerr << "Reading " << order_ << "-gram LM from " << param << endl;
+ ngram_.read(file);
+ }
+
+ // Fills the context buffer with up to order_-1 labels taken from the first
+ // arc of each lattice position starting at `start`, padded in front with
+ // kBOS, and returns ngram_.wordProb(kEOS, context).
+ // Writes to the member buffer `sc` on every call.
+ double LeftPhonotacticProb(const Lattice& inword, const int start) {
+ const int end = inword.size();
+ for (int i = 0; i < order_; ++i)
+ sc[i] = kBOS;
+ // number of lattice positions that will be copied into the context
+ int sp = min(end - start, order_ - 1);
+ // cerr << "[" << start << "," << sp << "]\n";
+ int ci = (order_ - sp - 1);
+ int wi = start;
+ while (sp > 0) {
+ sc[ci] = inword[wi][0].label;
+ // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl;
+ ++wi;
+ ++ci;
+ --sp;
+ }
+ // cerr << " END ci=" << ci << endl;
+ // terminate the context array
+ sc[ci] = Vocab_None;
+ const double startprob = ngram_.wordProb(kEOS, sc);
+ // cerr << " PROB=" << startprob << endl;
+ return startprob;
+ }
+ private:
+ const int order_;
+ Vocab& vocab_;
+ VocabIndex kBOS;
+ VocabIndex kEOS;
+ Ngram ngram_;
+ VocabIndex sc[80]; // scratch context buffer used by LeftPhonotacticProb
+};
+
+// `param` is passed to the implementation object, which uses it as the path
+// of the language model file.
+ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) :
+ pimpl_(new ReverseCharLMCSplitFeatureImpl(param)),
+ fid_(FD::Convert("RevCharLM")) {}
+
+// For edges with exactly one tail node whose rule has exactly one E-side
+// word, sets the RevCharLM feature to the value LeftPhonotacticProb returns
+// for the source lattice at position edge.i_. Stateless.
+void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
+ const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const {
+ (void) ant_contexts;
+ (void) estimated_features;
+ (void) out_context;
+
+ if (edge.Arity() != 1) return;
+ if (edge.rule_->EWords() != 1) return;
+ const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
+ features->set_value(fid_, lpp);
+ // disabled experiment below; it refers to names (chars, sword, freq, ...)
+ // that are not declared in this function
+#if 0
+ WordID neighbor_word = 0;
+ const WordID word = edge.rule_->e_[1];
+ if (chars > 4 && (sword[0] == 's' || sword[0] == 'n')) {
+ neighbor_word = TD::Convert(string(&sword[1]));
+ }
+ if (neighbor_word) {
+ float nfreq = freq_dict_.LookUp(neighbor_word);
+ cerr << "COMPARE: " << TD::Convert(word) << " & " << TD::Convert(neighbor_word) << endl;
+ if (!nfreq) nfreq = 99.0f;
+ features->set_value(fdoes_deletion_help_, (freq - nfreq));
+ }
+#endif
+}
+
diff --git a/decoder/ff_csplit.h b/decoder/ff_csplit.h
new file mode 100644
index 00000000..c1cfb64b
--- /dev/null
+++ b/decoder/ff_csplit.h
@@ -0,0 +1,39 @@
+#ifndef _FF_CSPLIT_H_
+#define _FF_CSPLIT_H_
+
+#include <boost/shared_ptr.hpp>
+
+#include "ff.h"
+
+class BasicCSplitFeaturesImpl;  // forward declaration only; the definition is not part of this header
+class BasicCSplitFeatures : public FeatureFunction {  // feature function that keeps its implementation behind a shared pointer
+ public:
+ BasicCSplitFeatures(const std::string& param);  // param: configuration string; its format is not specified in this header
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,  // per-edge hook declared virtual; presumably overrides FeatureFunction's (confirm in ff.h)
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ boost::shared_ptr<BasicCSplitFeaturesImpl> pimpl_;  // shared handle to the implementation object
+};
+
+class ReverseCharLMCSplitFeatureImpl;  // forward declaration only; the definition is not part of this header
+class ReverseCharLMCSplitFeature : public FeatureFunction {  // feature function that reports a single feature value (id fid_) computed by its implementation
+ public:
+ ReverseCharLMCSplitFeature(const std::string& param);  // param is forwarded to the implementation's constructor
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,  // per-edge hook declared virtual; presumably overrides FeatureFunction's (confirm in ff.h)
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ boost::shared_ptr<ReverseCharLMCSplitFeatureImpl> pimpl_;  // shared handle to the implementation object
+ const int fid_;  // feature id under which the value is stored
+};
+
+#endif
diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc
new file mode 100644
index 00000000..1854e0bb
--- /dev/null
+++ b/decoder/ff_factory.cc
@@ -0,0 +1,35 @@
+#include "ff_factory.h"
+
+#include "ff.h"
+
+using boost::shared_ptr;
+using namespace std;
+
+// Out-of-line definition of the virtual destructor; there is nothing to release.
+FFFactoryBase::~FFFactoryBase() {
+}
+
+// Writes the name of every registered feature function to stderr, one per line,
+// in the map's key order.
+void FFRegistry::DisplayList() const {
+  typedef map<string, shared_ptr<FFFactoryBase> >::const_iterator RegIter;
+  RegIter cur = reg_.begin();
+  const RegIter last = reg_.end();
+  while (cur != last) {
+    cerr << " " << cur->first << endl;
+    ++cur;
+  }
+}
+
+// Looks up the factory registered under `ffname` and asks it to build a feature
+// function from `param`. Returns an empty pointer (after printing a message to
+// stderr) when no factory with that name is registered.
+shared_ptr<FeatureFunction> FFRegistry::Create(const string& ffname, const string& param) const {
+  typedef map<string, shared_ptr<FFFactoryBase> >::const_iterator RegIter;
+  const RegIter found = reg_.find(ffname);
+  if (found == reg_.end()) {
+    cerr << "I don't know how to create feature " << ffname << endl;
+    return shared_ptr<FeatureFunction>();
+  }
+  return found->second->Create(param);
+}
+
+// Stores `factory` under `ffname`; the registry's shared_ptr takes ownership of it.
+// Registering a name twice is fatal: a message is printed and the process aborts.
+void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) {
+  const bool already_registered = reg_.count(ffname) > 0;
+  if (already_registered) {
+    cerr << "Duplicate registration of FeatureFunction with name " << ffname << "!\n";
+    abort();
+  }
+  shared_ptr<FFFactoryBase>& slot = reg_[ffname];
+  slot.reset(factory);
+}
+
diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h
new file mode 100644
index 00000000..bc586567
--- /dev/null
+++ b/decoder/ff_factory.h
@@ -0,0 +1,39 @@
+#ifndef _FF_FACTORY_H_
+#define _FF_FACTORY_H_
+
+#include <iostream>
+#include <string>
+#include <map>
+
+#include <boost/shared_ptr.hpp>
+
+class FeatureFunction;
+class FFRegistry;
+class FFFactoryBase;
+extern boost::shared_ptr<FFRegistry> global_ff_registry;
+
+class FFRegistry {  // name -> factory table used to create feature functions by name
+ friend int main(int argc, char** argv);  // the constructor is private, so only main (and FFFactoryBase) can create a registry
+ friend class FFFactoryBase;
+ public:
+ boost::shared_ptr<FeatureFunction> Create(const std::string& ffname, const std::string& param) const;  // builds the feature registered as ffname, passing it param
+ void DisplayList() const;  // lists the registered names
+ void Register(const std::string& ffname, FFFactoryBase* factory);  // factory is stored in a shared_ptr, i.e. ownership passes to the registry
+ private:
+ FFRegistry() {}
+ std::map<std::string, boost::shared_ptr<FFFactoryBase> > reg_;  // registered factories keyed by feature name
+};
+
+struct FFFactoryBase {  // abstract factory interface; NOTE(review): forward-declared with the `class` key earlier in this header
+ virtual ~FFFactoryBase();  // virtual so factories can be destroyed through a base pointer
+ virtual boost::shared_ptr<FeatureFunction> Create(const std::string& param) const = 0;  // builds one feature function from its parameter string
+};
+
+template<class FF>  // FF must be constructible from a const std::string&
+class FFFactory : public FFFactoryBase {  // factory that creates feature functions of type FF
+ boost::shared_ptr<FeatureFunction> Create(const std::string& param) const {  // private here (class default), but callable through the FFFactoryBase interface
+ return boost::shared_ptr<FeatureFunction>(new FF(param));  // the returned shared_ptr owns the new object
+ }
+};
+
+#endif
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
new file mode 100644
index 00000000..a1dc8b81
--- /dev/null
+++ b/decoder/ff_lm.cc
@@ -0,0 +1,454 @@
+#include "ff_lm.h"
+
+#include <sstream>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+#include <boost/shared_ptr.hpp>
+
+#include "tdict.h"
+#include "Vocab.h"
+#include "Ngram.h"
+#include "hg.h"
+#include "stringlib.h"
+
+#ifdef HAVE_RANDLM
+#include "RandLM.h"
+#endif
+
+using namespace std;
+
+// Process-wide memo of probabilities, organised as a trie keyed by word id.
+namespace NgramCache {
+  // One trie node: children keyed by word id plus the value stored at this node.
+  struct Cache {
+    Cache() : prob() {}  // prob starts at 0
+    map<WordID, Cache> tree;
+    float prob;
+  };
+  static Cache cache_;
+  // Drops every cached entry below the root.
+  void Clear() {
+    cache_.tree.clear();
+  }
+}
+
+// Blocking TCP client for a remote language-model server. A query is sent as
+// the text line "prob <word> <context words...>\n"; the reply is read as raw
+// bytes (at most 6) and its first sizeof(float) bytes are taken as the
+// probability. Answers are memoised in the NgramCache trie.
+struct LMClient {
+
+  // `host` is "hostname" or "hostname:port"; the port defaults to 6666.
+  // Aborts when the socket cannot be created or the host cannot be resolved,
+  // and exits when connecting still fails after repeated attempts.
+  LMClient(const char* host) : port(6666) {
+    strcpy(request_buffer, "prob ");
+    // Split host and port on a private copy: `host` points at const storage
+    // owned by the caller and must not be written to.
+    string hostname(host);
+    const string::size_type colon = hostname.find(':');
+    if (colon != string::npos) {
+      port = atoi(hostname.c_str() + colon + 1);
+      hostname.erase(colon);
+    }
+    sock = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock < 0) {
+      cerr << "Error: socket()\n";
+      abort();
+    }
+    hp = gethostbyname(hostname.c_str());
+    if (hp == NULL) {
+      cerr << "unknown host " << hostname << endl;
+      abort();
+    }
+    bzero((char *)&server, sizeof(server));
+    bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+    server.sin_family = hp->h_addrtype;
+    server.sin_port = htons(port);
+
+    int errors = 0;
+    while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
+      cerr << "Error: connect()\n";
+      sleep(1);
+      errors++;
+      if (errors > 3) exit(1);
+    }
+    cerr << "Connected to LM on " << hostname << " on port " << port << endl;
+  }
+
+  // Returns the server's answer for `word` given `context`, an array of word
+  // ids terminated by a value <= 0. A cached non-zero answer is returned
+  // without contacting the server.
+  float wordProb(int word, int* context) {
+    NgramCache::Cache* cur = &NgramCache::cache_;
+    int i = 0;
+    while (context[i] > 0) {
+      cur = &cur->tree[context[i++]];
+    }
+    cur = &cur->tree[word];
+    if (cur->prob) { return cur->prob; }
+
+    // Build the request after the fixed "prob " prefix (5 characters).
+    i = 0;
+    int pos = TD::AppendString(word, 5, 16000, request_buffer);
+    while (context[i] > 0) {
+      assert(pos < 15995);
+      request_buffer[pos] = ' ';
+      ++pos;
+      pos = TD::AppendString(context[i], pos, 16000, request_buffer);
+      ++i;
+    }
+    assert(pos < 15999);
+    request_buffer[pos] = '\n';
+    ++pos;
+    request_buffer[pos] = 0;
+    if (write(sock, request_buffer, pos) < 0) {
+      cerr << "Error: write()\n";
+      exit(1);
+    }
+
+    // Collect up to 6 reply bytes. Every read() result is examined, so a reply
+    // that arrives in several pieces is accumulated correctly and a failed
+    // read is retried (at most 6 failures, one second apart) before giving up.
+    int errors = 0;
+    int cnt = 0;
+    while (cnt < 6) {
+      const int r = read(sock, &res[cnt], 6 - cnt);
+      if (r < 0) {
+        errors++; sleep(1);
+        cerr << "Error: read()\n";
+        if (errors > 5) exit(1);
+      } else if (r == 0 || res[cnt] == '\n') {
+        // End of stream, or the newly received chunk starts with a newline.
+        break;
+      } else {
+        cnt += r;
+      }
+    }
+    // Copy the bytes out instead of dereferencing a cast char pointer, which
+    // would violate alignment and aliasing rules.
+    float prob = 0.0f;
+    memcpy(&prob, res, sizeof(prob));
+    cur->prob = prob;
+    return cur->prob;
+  }
+
+ private:
+  int sock, port;
+  struct hostent *hp;
+  struct sockaddr_in server;
+  char res[8];
+  char request_buffer[16000];
+};
+
+class LanguageModelImpl {  // n-gram scorer behind the LM feature functions: asks a local Ngram or, if configured, a remote LMClient
+ public:
+ explicit LanguageModelImpl(int order) :  // sets up constants only: no model file is read and client_ stays null
+ ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),  // state_size_ is the offset of the last byte of a state blob
+ floor_(-100.0),  // lower bound applied to every looked-up probability
+ client_(),
+ kSTART(TD::Convert("<s>")),
+ kSTOP(TD::Convert("</s>")),
+ kUNKNOWN(TD::Convert("<unk>")),
+ kNONE(-1),  // terminator stored after the last word in buffer_
+ kSTAR(TD::Convert("<{STAR}>")) {}  // marker word written between the two parts of a state
+
+ LanguageModelImpl(int order, const string& f) :  // f is either "lm://host[:port]" (remote server) or the path of a model file
+ ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
+ floor_(-100.0),
+ client_(NULL),
+ kSTART(TD::Convert("<s>")),
+ kSTOP(TD::Convert("</s>")),
+ kUNKNOWN(TD::Convert("<unk>")),
+ kNONE(-1),
+ kSTAR(TD::Convert("<{STAR}>")) {
+ if (f.find("lm://") == 0) {  // prefix match: everything after "lm://" is handed to LMClient
+ client_ = new LMClient(f.substr(5).c_str());
+ } else {
+ File file(f.c_str(), "r", 0);
+ assert(file);  // the only check that the file opened; compiled out under NDEBUG
+ cerr << "Reading " << order_ << "-gram LM from " << f << endl;
+ ngram_.read(file, false);
+ }
+ }
+
+ virtual ~LanguageModelImpl() {
+ delete client_;  // client_ is null unless an lm:// source was given
+ }
+
+ inline int StateSize(const void* state) const {  // number of words stored in a state: kept in the blob's last byte
+ return *(static_cast<const char*>(state) + state_size_);
+ }
+
+ inline void SetStateSize(int size, void* state) const {  // stores the word count in the blob's last byte (narrowed to char)
+ *(static_cast<char*>(state) + state_size_) = size;
+ }
+
+ virtual double WordProb(int word, int* context) {  // score of word given context; virtual so subclasses can substitute another model
+ return client_ ?
+ client_->wordProb(word, context)
+ : ngram_.wordProb(word, (VocabIndex*)context);
+ }
+
+ inline double LookupProbForBufferContents(int i) {  // scores buffer_[i] with buffer_[i+1...] as its context, clamped below at floor_
+// int k = i; cerr << "P("; while(buffer_[k] > 0) { std::cerr << TD::Convert(buffer_[k++]) << " "; }
+ double p = WordProb(buffer_[i], &buffer_[i+1]);
+ if (p < floor_) p = floor_;
+// cerr << ")=" << p << endl;
+ return p;
+ }
+
+ string DebugStateToString(const void* state) const {  // renders a state as "[ w1 w2 ... ]" for debugging
+ int len = StateSize(state);
+ const int* astate = reinterpret_cast<const int*>(state);  // the blob starts with `len` word ids
+ string res = "[";
+ for (int i = 0; i < len; ++i) {
+ res += " ";
+ res += TD::Convert(astate[i]);
+ }
+ res += " ]";
+ return res;
+ }
+
+ inline double ProbNoRemnant(int i, int len) {  // sums scores while walking buffer_ from index i down to 0
+ int edge = len;  // index of the most recent boundary (kSTAR or a non-positive entry); starts past the end
+ bool flag = true;  // true unless the most recent boundary was a kSTAR marker
+ double sum = 0.0;
+ while (i >= 0) {
+ if (buffer_[i] == kSTAR) {
+ edge = i;
+ flag = false;
+ } else if (buffer_[i] <= 0) {
+ edge = i;
+ flag = true;
+ } else {
+ if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART)))  // a kSTART sitting at index len-1 is not scored through the flag path
+ sum += LookupProbForBufferContents(i);
+ }
+ --i;
+ }
+ return sum;
+ }
+
+ double EstimateProb(const vector<WordID>& phrase) {  // score for a bare phrase
+ int len = phrase.size();
+ buffer_.resize(len + 1);
+ buffer_[len] = kNONE;
+ int i = len - 1;
+ for (int j = 0; j < len; ++j,--i)  // buffer_ receives the words in reverse order
+ buffer_[i] = phrase[j];
+ return ProbNoRemnant(len - 1, len);
+ }
+
+ double EstimateProb(const void* state) {  // same computation, taking the words from a state blob
+ int len = StateSize(state);
+ // cerr << "residual len: " << len << endl;
+ buffer_.resize(len + 1);
+ buffer_[len] = kNONE;
+ const int* astate = reinterpret_cast<const int*>(state);
+ int i = len - 1;
+ for (int j = 0; j < len; ++j,--i)
+ buffer_[i] = astate[j];
+ return ProbNoRemnant(len - 1, len);
+ }
+
+ double FinalTraversalCost(const void* state) {  // scores the state's words with kSTART placed before and kSTOP after them
+ int slen = StateSize(state);
+ int len = slen + 2;  // room for the two sentence-boundary words
+ // cerr << "residual len: " << len << endl;
+ buffer_.resize(len + 1);
+ buffer_[len] = kNONE;
+ buffer_[len-1] = kSTART;  // highest index holds the first word because the buffer is reversed
+ const int* astate = reinterpret_cast<const int*>(state);
+ int i = len - 2;
+ for (int j = 0; j < slen; ++j,--i)
+ buffer_[i] = astate[j];
+ buffer_[i] = kSTOP;
+ assert(i == 0);
+ return ProbNoRemnant(len - 1, len);
+ }
+
+ double LookupWords(const TRule& rule, const vector<const void*>& ant_states, void* vstate) {  // scores the rule applied to its antecedents; writes the resulting state to vstate when it is non-null
+ int len = rule.ELength() - rule.Arity();  // presumably the number of terminal symbols on the rule's target side - TODO confirm
+ for (int i = 0; i < ant_states.size(); ++i)
+ len += StateSize(ant_states[i]);
+ buffer_.resize(len + 1);
+ buffer_[len] = kNONE;
+ int i = len - 1;
+ const vector<WordID>& e = rule.e();
+ for (int j = 0; j < e.size(); ++j) {
+ if (e[j] < 1) {  // non-positive entry: splice in the words of antecedent state number -e[j]
+ const int* astate = reinterpret_cast<const int*>(ant_states[-e[j]]);
+ int slen = StateSize(astate);
+ for (int k = 0; k < slen; ++k)
+ buffer_[i--] = astate[k];
+ } else {
+ buffer_[i--] = e[j];
+ }
+ }
+
+ double sum = 0.0;
+ int* remnant = reinterpret_cast<int*>(vstate);
+ int j = 0;  // number of ints written to remnant so far
+ i = len - 1;
+ int edge = len;  // index of the most recent kSTAR seen, or len if none yet
+
+ while (i >= 0) {
+ if (buffer_[i] == kSTAR) {
+ edge = i;
+ } else if (edge-i >= order_) {  // at least order_ positions past the last boundary: score it
+ sum += LookupProbForBufferContents(i);
+ } else if (edge == len && remnant) {  // unscored words before any kSTAR are copied into the new state
+ remnant[j++] = buffer_[i];
+ }
+ --i;
+ }
+ if (!remnant) return sum;
+
+ if (edge != len || len >= order_) {  // a kSTAR was seen or the span is at least order_ long: append kSTAR plus trailing words
+ remnant[j++] = kSTAR;
+ if (order_-1 < edge) edge = order_-1;  // keep at most order_-1 trailing words
+ for (int i = edge-1; i >= 0; --i)
+ remnant[j++] = buffer_[i];
+ }
+
+ SetStateSize(j, vstate);
+ return sum;
+ }
+
+ static int OrderToStateSize(int order) {  // bytes per state: (order-1)*2+1 word ids plus one length byte
+ return ((order-1) * 2 + 1) * sizeof(WordID) + 1;
+ }
+
+ protected:
+ Ngram ngram_;  // local model; only loaded when a file path was given
+ vector<WordID> buffer_;  // scratch word sequence, stored in reverse order and ended by kNONE
+ const int order_;
+ const int state_size_;
+ const double floor_;
+ private:
+ LMClient* client_;  // owned; non-null only for lm:// sources
+
+ public:
+ const WordID kSTART;
+ const WordID kSTOP;
+ const WordID kUNKNOWN;
+ const WordID kNONE;
+ const WordID kSTAR;
+};
+
+// Parses `param`, sizes the per-node state for the chosen order and loads the
+// model. Accepted forms are "filename", "-o N filename" and "filename -o N";
+// the order defaults to 3. Any other form is fatal: the parameter string is
+// reported on stderr and the process aborts, instead of going on to construct
+// the model with an empty filename.
+LanguageModel::LanguageModel(const string& param) :
+    fid_(FD::Convert("LanguageModel")) {
+  vector<string> argv;
+  const int argc = SplitOnWhitespace(param, &argv);
+  int order = 3;
+  // TODO add support for -n FeatureName
+  string filename;
+  if (argc < 1) { cerr << "LanguageModel requires a filename, minimally!\n"; abort(); }
+  else if (argc == 1) { filename = argv[0]; }
+  else if (argc == 3 && argv[0] == "-o") {
+    order = atoi(argv[1].c_str());
+    filename = argv[2];
+  } else if (argc == 3 && argv[1] == "-o") {
+    order = atoi(argv[2].c_str());
+    filename = argv[0];
+  }
+  if (filename.empty()) {
+    // Covers a wrong token count as well as three tokens with no "-o".
+    cerr << "Don't understand 'LanguageModel " << param << "'\n";
+    abort();
+  }
+  SetStateSize(LanguageModelImpl::OrderToStateSize(order));
+  pimpl_ = new LanguageModelImpl(order, filename);
+}
+
+// Releases the implementation object allocated by the constructor.
+LanguageModel::~LanguageModel() {
+  LanguageModelImpl* const owned = pimpl_;
+  delete owned;
+}
+
+// Returns the implementation's textual rendering of `state`.
+string LanguageModel::DebugStateToString(const void* state) const {
+  const string rendered = pimpl_->DebugStateToString(state);
+  return rendered;
+}
+
+// Scores the edge's rule against the antecedent states (which also writes the
+// new state into `state`), then records an estimate computed from that state.
+void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                          const Hypergraph::Edge& edge,
+                                          const vector<const void*>& ant_states,
+                                          SparseVector<double>* features,
+                                          SparseVector<double>* estimated_features,
+                                          void* state) const {
+  (void) smeta;
+  // LookupWords has to run first: EstimateProb reads the state it fills in.
+  const double edge_score = pimpl_->LookupWords(*edge.rule_, ant_states, state);
+  features->set_value(fid_, edge_score);
+  const double estimate = pimpl_->EstimateProb(state);
+  estimated_features->set_value(fid_, estimate);
+}
+
+// Stores the implementation's final traversal cost for `ant_state` under this
+// feature's id.
+void LanguageModel::FinalTraversalFeatures(const void* ant_state,
+                                           SparseVector<double>* features) const {
+  const double final_cost = pimpl_->FinalTraversalCost(ant_state);
+  features->set_value(fid_, final_cost);
+}
+
+#ifdef HAVE_RANDLM
+// LanguageModelImpl variant that obtains probabilities from a RandLM model.
+// Word ids are translated to RandLM ids through a table built at construction.
+struct RandLMImpl : public LanguageModelImpl {
+  // `rlm` is stored in a shared_ptr, so this object takes ownership of it.
+  RandLMImpl(int order, randlm::RandLM* rlm) :
+      LanguageModelImpl(order),
+      rlm_(rlm),
+      oov_(rlm->getWordID(rlm->getOOV())),
+      rb_(1000, oov_) {
+    typedef map<randlm::Word, randlm::WordID>::const_iterator VocabIter;
+    // Pass 1: convert each RandLM vocabulary word and remember the largest id seen.
+    map<int, randlm::WordID> by_local_id;
+    int largest_id = 0;
+    for (VocabIter entry = rlm->vocabStart(); entry != rlm->vocabEnd(); ++entry) {
+      const int local_id = TD::Convert(entry->first);
+      by_local_id[local_id] = entry->second;
+      if (largest_id < local_id) largest_id = local_id;
+    }
+    // Pass 2: flatten into a dense table; ids without an entry map to OOV.
+    cdec2randlm_.resize(largest_id + 1, oov_);
+    map<int, randlm::WordID>::iterator slot = by_local_id.begin();
+    while (slot != by_local_id.end()) {
+      cdec2randlm_[slot->first] = slot->second;
+      ++slot;
+    }
+    by_local_id.clear();
+  }
+
+  // Table lookup; ids beyond the table translate to the OOV id.
+  inline randlm::WordID Convert2RandLM(int w) {
+    if (w < cdec2randlm_.size()) return cdec2randlm_[w];
+    return oov_;
+  }
+
+  // Fills rb_ backwards from index order_: the word goes last and each context
+  // entry (read until a value <= 0, or until index 1 is reached) goes in front
+  // of it. The filled slice is then passed to RandLM.
+  virtual double WordProb(int word, int* context) {
+    int start = order_;
+    rb_[start] = Convert2RandLM(word);
+    int ngram_len = 1;
+    for (const int* ctx = context; start > 1 && *ctx > 0; ++ctx) {
+      rb_[--start] = Convert2RandLM(*ctx);
+      ++ngram_len;
+    }
+    int found;
+    const void* finalState = 0;
+    //cerr << "I = " << i << endl;
+    return rlm_->getProb(&rb_[start], ngram_len, &found, &finalState);
+  }
+ private:
+  boost::shared_ptr<randlm::RandLM> rlm_;
+  randlm::WordID oov_;
+  vector<randlm::WordID> cdec2randlm_;
+  vector<randlm::WordID> rb_;
+};
+
+// Parses `param`, sizes the per-node state for the chosen order and loads the
+// RandLM model. Accepted forms are "filename", "-o N filename" and
+// "filename -o N"; the order defaults to 3. Any other form is fatal: the
+// parameter string is reported on stderr and the process aborts, instead of
+// going on to initialise RandLM with an empty filename.
+LanguageModelRandLM::LanguageModelRandLM(const string& param) :
+    fid_(FD::Convert("RandLM")) {
+  vector<string> argv;
+  const int argc = SplitOnWhitespace(param, &argv);
+  int order = 3;
+  // TODO add support for -n FeatureName
+  string filename;
+  if (argc < 1) { cerr << "RandLM requires a filename, minimally!\n"; abort(); }
+  else if (argc == 1) { filename = argv[0]; }
+  else if (argc == 3 && argv[0] == "-o") {
+    order = atoi(argv[1].c_str());
+    filename = argv[2];
+  } else if (argc == 3 && argv[1] == "-o") {
+    order = atoi(argv[2].c_str());
+    filename = argv[0];
+  }
+  if (filename.empty()) {
+    // Covers a wrong token count as well as three tokens with no "-o".
+    cerr << "Don't understand 'RandLM " << param << "'\n";
+    abort();
+  }
+  SetStateSize(LanguageModelImpl::OrderToStateSize(order));
+  int cache_MB = 200; // increase cache size
+  randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB);
+  assert(rlm != NULL);
+  pimpl_ = new RandLMImpl(order, rlm);
+}
+
+// Releases the implementation object allocated by the constructor.
+LanguageModelRandLM::~LanguageModelRandLM() {
+  LanguageModelImpl* const owned = pimpl_;
+  delete owned;
+}
+
+// Scores the edge's rule against the antecedent states (which also writes the
+// new state into `state`), then records an estimate computed from that state.
+void LanguageModelRandLM::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                const Hypergraph::Edge& edge,
+                                                const vector<const void*>& ant_states,
+                                                SparseVector<double>* features,
+                                                SparseVector<double>* estimated_features,
+                                                void* state) const {
+  (void) smeta;
+  // LookupWords has to run first: EstimateProb reads the state it fills in.
+  const double edge_score = pimpl_->LookupWords(*edge.rule_, ant_states, state);
+  features->set_value(fid_, edge_score);
+  const double estimate = pimpl_->EstimateProb(state);
+  estimated_features->set_value(fid_, estimate);
+}
+
+// Stores the implementation's final traversal cost for `ant_state` under this
+// feature's id.
+void LanguageModelRandLM::FinalTraversalFeatures(const void* ant_state,
+                                                 SparseVector<double>* features) const {
+  const double final_cost = pimpl_->FinalTraversalCost(ant_state);
+  features->set_value(fid_, final_cost);
+}
+
+#endif
+
diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h
new file mode 100644
index 00000000..45fc1da7
--- /dev/null
+++ b/decoder/ff_lm.h
@@ -0,0 +1,55 @@
+#ifndef _LM_FF_H_
+#define _LM_FF_H_
+
+#include <vector>
+#include <string>
+
+#include "hg.h"
+#include "ff.h"
+#include "config.h"
+
+class LanguageModelImpl;
+
+class LanguageModel : public FeatureFunction {  // feature function backed by a LanguageModelImpl
+ public:
+ // param = "filename.lm [-o n]"
+ LanguageModel(const std::string& param);
+ ~LanguageModel();  // declared because the class holds pimpl_ as a raw pointer
+ virtual void FinalTraversalFeatures(const void* context,  // adds the feature value for a finished derivation, given its state
+ SparseVector<double>* features) const;
+ std::string DebugStateToString(const void* state) const;  // human-readable rendering of a state blob
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,  // per-edge scoring hook
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ const int fid_;  // feature id used for every value this class reports
+ mutable LanguageModelImpl* pimpl_;  // raw pointer to the implementation; NOTE(review): no copy control is declared here, so copying this class would duplicate the pointer unless FeatureFunction forbids copying
+};
+
+#ifdef HAVE_RANDLM
+class LanguageModelRandLM : public FeatureFunction {  // RandLM-backed counterpart of LanguageModel; only compiled when HAVE_RANDLM is defined
+ public:
+ // param = "filename.lm [-o n]"
+ LanguageModelRandLM(const std::string& param);
+ ~LanguageModelRandLM();  // declared because the class holds pimpl_ as a raw pointer
+ virtual void FinalTraversalFeatures(const void* context,  // adds the feature value for a finished derivation, given its state
+ SparseVector<double>* features) const;
+ std::string DebugStateToString(const void* state) const;  // NOTE(review): ff_lm.cc does not appear to define this member for LanguageModelRandLM - confirm before calling it
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,  // per-edge scoring hook
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ const int fid_;  // feature id used for every value this class reports
+ mutable LanguageModelImpl* pimpl_;  // raw pointer to the implementation, typed as the base class
+};
+#endif
+
+#endif
diff --git a/decoder/ff_tagger.cc b/decoder/ff_tagger.cc
new file mode 100644
index 00000000..7a9d1def
--- /dev/null
+++ b/decoder/ff_tagger.cc
@@ -0,0 +1,96 @@
+#include "ff_tagger.h"
+
+#include "tdict.h"
+#include "sentence_metadata.h"
+
+#include <sstream>
+
+using namespace std;
+
+// `param` is not used. The base class is constructed with sizeof(WordID),
+// matching the single WordID this feature keeps as its context.
+Tagger_BigramIdentity::Tagger_BigramIdentity(const std::string& param)
+    : FeatureFunction(sizeof(WordID)) {
+}
+
+// Sets the indicator feature for the pair (left, right) to 1.0. The feature id
+// is created on first use and cached in fmap_. right == 0 selects the unigram
+// name "Uni:<left>"; otherwise the name is "Bi:<left>_<right>", with a negative
+// left shown as BOS and a negative right shown as EOS.
+void Tagger_BigramIdentity::FireFeature(const WordID& left,
+                                        const WordID& right,
+                                        SparseVector<double>* features) const {
+  int& cached_fid = fmap_[left][right];
+  if (cached_fid == 0) {
+    ostringstream name;
+    if (right != 0) {
+      name << "Bi:";
+      if (left >= 0) { name << TD::Convert(left); } else { name << "BOS"; }
+      name << '_';
+      if (right >= 0) { name << TD::Convert(right); } else { name << "EOS"; }
+    } else {
+      name << "Uni:" << TD::Convert(left);
+    }
+    cached_fid = FD::Convert(name.str());
+  }
+  features->set_value(cached_fid, 1.0);
+}
+
+// Arity 0: the context becomes the rule's first target symbol and its unigram
+// feature fires. Arity 2: the bigram of the two antecedent contexts fires, plus
+// a begin marker when the edge covers [0,2) and an end marker when it covers
+// [0, source length); the context becomes the right antecedent's. Edges of any
+// other arity are left untouched.
+void Tagger_BigramIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                  const Hypergraph::Edge& edge,
+                                                  const std::vector<const void*>& ant_contexts,
+                                                  SparseVector<double>* features,
+                                                  SparseVector<double>* estimated_features,
+                                                  void* context) const {
+  WordID& out_context = *static_cast<WordID*>(context);
+  const int arity = edge.Arity();
+  switch (arity) {
+    case 0: {
+      out_context = edge.rule_->e_[0];
+      FireFeature(out_context, 0, features);
+      break;
+    }
+    case 2: {
+      WordID left = *static_cast<const WordID*>(ant_contexts[0]);
+      WordID right = *static_cast<const WordID*>(ant_contexts[1]);
+      if (edge.i_ == 0 && edge.j_ == 2) {
+        FireFeature(-1, left, features);
+      }
+      FireFeature(left, right, features);
+      if (edge.i_ == 0 && edge.j_ == smeta.GetSourceLength()) {
+        FireFeature(right, -1, features);
+      }
+      out_context = right;
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+// `param` is not used; the base class is default-constructed.
+LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {
+}
+
+// Sets the indicator feature "Id:<src>:<trg>" to 1.0. The feature id is created
+// on first use and cached in fmap_ under the unescaped pair. Before the name is
+// built, the symbols "=", ";" and "," are replaced by __EQ, __SC and __CO.
+void LexicalPairIdentity::FireFeature(WordID src,
+                                      WordID trg,
+                                      SparseVector<double>* features) const {
+  int& cached_fid = fmap_[src][trg];
+  if (cached_fid == 0) {
+    // Escape table, filled on the first call that needs it.
+    static map<WordID, WordID> escape;
+    if (escape.empty()) {
+      escape[TD::Convert("=")] = TD::Convert("__EQ");
+      escape[TD::Convert(";")] = TD::Convert("__SC");
+      escape[TD::Convert(",")] = TD::Convert("__CO");
+    }
+    const map<WordID, WordID>::const_iterator src_esc = escape.find(src);
+    if (src_esc != escape.end()) src = src_esc->second;
+    const map<WordID, WordID>::const_iterator trg_esc = escape.find(trg);
+    if (trg_esc != escape.end()) trg = trg_esc->second;
+    ostringstream name;
+    name << "Id:" << TD::Convert(src) << ':' << TD::Convert(trg);
+    cached_fid = FD::Convert(name.str());
+  }
+  features->set_value(cached_fid, 1.0);
+}
+
+// Fires one pair feature for every combination of a positive symbol in the
+// rule's e_ vector with a positive symbol in its f_ vector. Non-positive
+// entries on either side are skipped.
+void LexicalPairIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                const Hypergraph::Edge& edge,
+                                                const std::vector<const void*>& ant_contexts,
+                                                SparseVector<double>* features,
+                                                SparseVector<double>* estimated_features,
+                                                void* context) const {
+  typedef vector<WordID>::const_iterator SymIter;
+  const vector<WordID>& trg_syms = edge.rule_->e_;
+  const vector<WordID>& src_syms = edge.rule_->f_;
+  for (SymIter e = trg_syms.begin(); e != trg_syms.end(); ++e) {
+    if (*e > 0) {
+      for (SymIter f = src_syms.begin(); f != src_syms.end(); ++f) {
+        if (*f > 0) {
+          FireFeature(*f, *e, features);
+        }
+      }
+    }
+  }
+}
+
+
diff --git a/decoder/ff_tagger.h b/decoder/ff_tagger.h
new file mode 100644
index 00000000..41c3ee5b
--- /dev/null
+++ b/decoder/ff_tagger.h
@@ -0,0 +1,51 @@
+#ifndef _FF_TAGGER_H_
+#define _FF_TAGGER_H_
+
+#include <map>
+#include "ff.h"
+
+typedef std::map<WordID, int> Class2FID;  // second symbol -> cached feature id
+typedef std::map<WordID, Class2FID> Class2Class2FID;  // first symbol -> (second symbol -> cached feature id)
+
+// the reason this is a "tagger" feature is that it assumes that
+// the sequence unfolds from left to right, which means it doesn't
+// have to split states based on left context.
+// fires unigram features as well
+class Tagger_BigramIdentity : public FeatureFunction {  // fires indicator features for symbol unigrams and bigrams
+ public:
+ Tagger_BigramIdentity(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,  // per-edge hook
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+ void FireFeature(const WordID& left,  // sets the feature for the pair (left, right)
+ const WordID& right,
+ SparseVector<double>* features) const;
+ mutable Class2Class2FID fmap_;  // cache of feature ids; mutable so the const FireFeature can fill it
+};
+
+// for each pair of symbols co-occurring in a lexicalized rule, fire
+// a feature (mostly used for tagging, but could be used for any model)
+class LexicalPairIdentity : public FeatureFunction {  // fires an indicator feature for each (source symbol, target symbol) pair of a rule
+ public:
+ LexicalPairIdentity(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,  // per-edge hook
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+ void FireFeature(WordID src,  // sets the feature for the pair (src, trg)
+ WordID trg,
+ SparseVector<double>* features) const;
+ mutable Class2Class2FID fmap_;  // cache of feature ids; mutable so the const FireFeature can fill it
+};
+
+
+#endif
diff --git a/decoder/ff_test.cc b/decoder/ff_test.cc
new file mode 100644
index 00000000..9e640517
--- /dev/null
+++ b/decoder/ff_test.cc
@@ -0,0 +1,64 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "hg.h"
+#include "ff_lm.h"
+#include "ff.h"
+#include "trule.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+// Language models shared by every test; set up by the first fixture constructed.
+LanguageModel* lm_ = NULL;
+LanguageModel* lm3_ = NULL;
+
+// Test fixture: loads the two test language models once and provides sentence
+// metadata built from id 0 and an empty lattice.
+class FFTest : public testing::Test {
+ public:
+  FFTest() : smeta(0,Lattice()) {
+    if (lm_ != NULL) return;  // models were already loaded by an earlier fixture
+    static LanguageModel slm("-o 2 ./test_data/test_2gram.lm.gz");
+    lm_ = &slm;
+    static LanguageModel slm3("./test_data/dummy.3gram.lm -o 3");
+    lm3_ = &slm3;
+  }
+ protected:
+  virtual void SetUp() {
+  }
+  virtual void TearDown() {
+  }
+  SentenceMetadata smeta;
+};
+
+TEST_F(FFTest, LM3) {  // checks the state the 3-gram model produces for three rules of increasing complexity
+ int x = lm3_->NumBytesContext();  // size in bytes of one state blob
+ Hypergraph::Edge edge1;
+ edge1.rule_.reset(new TRule("[X] ||| x y ||| one ||| 1.0 -2.4 3.0"));  // no antecedents
+ Hypergraph::Edge edge2;
+ edge2.rule_.reset(new TRule("[X] ||| [X,1] a ||| [X,1] two ||| 1.0 -2.4 3.0"));  // antecedent followed by one word
+ Hypergraph::Edge edge3;
+ edge3.rule_.reset(new TRule("[X] ||| [X,1] a ||| zero [X,1] two ||| 1.0 -2.4 3.0"));  // antecedent with a word on each side
+ vector<const void*> ants1;
+ string state(x, '\0');  // zero-filled buffer used as the output state
+ SparseVector<double> feats;
+ SparseVector<double> est;
+ lm3_->TraversalFeatures(smeta, edge1, ants1, &feats, &est, (void *)&state[0]);
+ cerr << "returned " << feats << endl;
+ cerr << edge1.feature_values_ << endl;
+ cerr << lm3_->DebugStateToString((const void*)&state[0]) << endl;
+ EXPECT_EQ("[ one ]", lm3_->DebugStateToString((const void*)&state[0]));
+ ants1.push_back((const void*)&state[0]);  // the first result becomes the antecedent for the next two edges
+ string state2(x, '\0');
+ lm3_->TraversalFeatures(smeta, edge2, ants1, &feats, &est, (void *)&state2[0]);
+ cerr << lm3_->DebugStateToString((const void*)&state2[0]) << endl;
+ EXPECT_EQ("[ one two ]", lm3_->DebugStateToString((const void*)&state2[0]));
+ string state3(x, '\0');
+ lm3_->TraversalFeatures(smeta, edge3, ants1, &feats, &est, (void *)&state3[0]);
+ cerr << lm3_->DebugStateToString((const void*)&state3[0]) << endl;
+ EXPECT_EQ("[ zero one <{STAR}> one two ]", lm3_->DebugStateToString((const void*)&state3[0]));  // three words exceed what a trigram state keeps whole, so the marker appears
+}
+
+// Test runner entry point: hands the command line to Google Test, runs every
+// registered test and returns the framework's status.
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
new file mode 100644
index 00000000..669aa530
--- /dev/null
+++ b/decoder/ff_wordalign.cc
@@ -0,0 +1,445 @@
+#include "ff_wordalign.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+#include <cmath>
+
+#include "stringlib.h"
+#include "sentence_metadata.h"
+#include "hg.h"
+#include "fdict.h"
+#include "aligner.h"
+#include "tdict.h" // Blunsom hack
+#include "filelib.h" // Blunsom hack
+
+static const int MAX_SENTENCE_SIZE = 100;  // size of the per-length feature-id tables built by the constructors in this file
+
+using namespace std;
+
+// Pre-computes a feature id for every cell [flen][si][ti] with
+// 1 <= flen < MAX_SENTENCE_SIZE, 0 <= si < flen and 0 <= ti < MAX_SENTENCE_SIZE.
+// Cells outside that range are never written here. `param` is not used.
+Model2BinaryFeatures::Model2BinaryFeatures(const string& param) :
+    fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
+  for (int flen = 1; flen < MAX_SENTENCE_SIZE; ++flen) {
+    for (int si = 0; si < flen; ++si) {
+      for (int ti = 0; ti < MAX_SENTENCE_SIZE; ++ti) {
+        // si < flen is guaranteed by the loop bound, so each visited cell gets a real id.
+        ostringstream name;
+        name << "M2FL:" << flen << ":TI:" << ti << "_SI:" << si;
+        fids_[flen][si][ti] = FD::Convert(name.str());
+      }
+    }
+  }
+}
+
+// Fires the pre-computed indicator feature selected by (source length, edge.i_,
+// edge.prev_i_). Nothing fires when either index is -1.
+void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                 const Hypergraph::Edge& edge,
+                                                 const vector<const void*>& ant_states,
+                                                 SparseVector<double>* features,
+                                                 SparseVector<double>* estimated_features,
+                                                 void* state) const {
+  // if the source word is either null or the generated word
+  // has no position in the reference
+  const bool has_both_positions = (edge.i_ != -1) && (edge.prev_i_ != -1);
+  if (!has_both_positions) {
+    return;
+  }
+
+  assert(smeta.GetTargetLength() > 0);
+  const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_];
+  features->set_value(fid, 1.0);
+}
+
+
+// With an empty `param` only the plain "RelativeSentencePosition" feature is
+// used. Otherwise `param` names a file with one line of word classes per
+// sentence: each non-empty line is stored in pos_, and one "RelPos_FC:<class>"
+// feature id is created for every distinct class.
+RelativeSentencePosition::RelativeSentencePosition(const string& param) :
+    fid_(FD::Convert("RelativeSentencePosition")) {
+  if (!param.empty()) {
+    cerr << " Loading word classes from " << param << endl;
+    condition_on_fclass_ = true;
+    ReadFile rf(param);
+    istream& in = *rf.stream();
+    set<WordID> classes;
+    while(in) {
+      string line;
+      getline(in, line);
+      if (line.empty()) continue;
+      vector<WordID> v;
+      TD::ConvertSentence(line, &v);
+      pos_.push_back(v);
+      classes.insert(v.begin(), v.end());
+    }
+    // Create the per-class features once, after the whole file has been read,
+    // rather than re-deriving every id after each input line.
+    for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) {
+      ostringstream os;
+      os << "RelPos_FC:" << TD::Convert(*i);
+      fids_[*i] = FD::Convert(os.str());
+    }
+  } else {
+    condition_on_fclass_ = false;
+  }
+}
+
+// Feature value: |edge.i_ / source length - edge.prev_i_ / target length|.
+// It is stored under fid_ and, when class conditioning is on, also under the
+// feature id of the class found at position edge.i_ of the current sentence.
+void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                     const Hypergraph::Edge& edge,
+                                                     const vector<const void*>& ant_states,
+                                                     SparseVector<double>* features,
+                                                     SparseVector<double>* estimated_features,
+                                                     void* state) const {
+  // if the source word is either null or the generated word
+  // has no position in the reference
+  const bool has_both_positions = (edge.i_ != -1) && (edge.prev_i_ != -1);
+  if (!has_both_positions) {
+    return;
+  }
+
+  assert(smeta.GetTargetLength() > 0);
+  const double val = fabs(static_cast<double>(edge.i_) / smeta.GetSourceLength() -
+                          static_cast<double>(edge.prev_i_) / smeta.GetTargetLength());
+  features->set_value(fid_, val);
+  if (!condition_on_fclass_) {
+    return;
+  }
+  assert(smeta.GetSentenceID() < pos_.size());
+  const vector<WordID>& sentence_classes = pos_[smeta.GetSentenceID()];
+  const WordID cur_fclass = sentence_classes[edge.i_];
+  const int class_fid = fids_.find(cur_fclass)->second;
+  features->set_value(class_fid, val);
+}
+
+// Reads one line of source classes per sentence from the file named by `param`
+// into pos_, then creates a feature id for every combination of source length
+// (1..MAX_SENTENCE_SIZE-1), class, and jump in [-length, length].
+MarkovJumpFClass::MarkovJumpFClass(const string& param) :
+    FeatureFunction(1),
+    fids_(MAX_SENTENCE_SIZE) {
+  cerr << " MarkovJumpFClass" << endl;
+  cerr << "Reading source POS tags from " << param << endl;
+  ReadFile rf(param);
+  istream& in = *rf.stream();
+  set<WordID> classes;
+  while (in) {
+    string line;
+    getline(in, line);
+    if (!line.empty()) {
+      vector<WordID> tags;
+      TD::ConvertSentence(line, &tags);
+      pos_.push_back(tags);
+      classes.insert(tags.begin(), tags.end());
+    }
+  }
+  cerr << " (" << pos_.size() << " lines)\n";
+  cerr << " Classes: " << classes.size() << endl;
+  typedef set<WordID>::iterator ClassIter;
+  for (int flen = 1; flen < MAX_SENTENCE_SIZE; ++flen) {
+    map<WordID, map<int, int> >& by_class = fids_[flen];
+    for (ClassIter cls = classes.begin(); cls != classes.end(); ++cls) {
+      map<int, int>& by_jump = by_class[*cls];
+      for (int jump = -flen; jump <= flen; ++jump) {
+        ostringstream name;
+        name << "Jump_FL:" << flen << "_FC:" << TD::Convert(*cls) << "_J:" << jump;
+        by_jump[jump] = FD::Convert(name.str());
+      }
+    }
+  }
+}
+
+// Fires the indicator feature selected by the source length, the class stored
+// for position cur_src_pos of the current sentence, and the jump size
+// (cur_src_pos - prev_src_pos). The table lookups are not checked for misses.
+void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta,
+                                   int prev_src_pos,
+                                   int cur_src_pos,
+                                   SparseVector<double>* features) const {
+  assert(smeta.GetSentenceID() < pos_.size());
+  const int jumpsize = cur_src_pos - prev_src_pos;
+  const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos];
+  const map<WordID, map<int, int> >& by_class = fids_[smeta.GetSourceLength()];
+  const map<int, int>& by_jump = by_class.find(cur_fclass)->second;
+  const int fid = by_jump.find(jumpsize)->second;
+  features->set_value(fid, 1.0);
+}
+
+// Not implemented yet: the one-byte state is decoded but no feature is set.
+void MarkovJumpFClass::FinalTraversalFeatures(const void* context,
+                                              SparseVector<double>* features) const {
+  const unsigned char* state_byte = static_cast<const unsigned char*>(context);
+  int left_index = *state_byte;
+// int right_index = cur_flen;
+  // TODO
+}
+
+void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,  // propagates a one-byte position state up the forest and fires a jump feature on binary edges
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_states,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* state) const {
+ unsigned char& dpstate = *((unsigned char*)state);  // the state is read and written as a single byte
+ if (edge.Arity() == 0) {
+ dpstate = static_cast<unsigned int>(edge.i_);  // narrowed to one byte on assignment, so -1 would be stored as 255
+ } else if (edge.Arity() == 1) {
+ dpstate = *((unsigned char*)ant_states[0]);  // unary edge: pass the antecedent's state through
+ } else if (edge.Arity() == 2) {
+ int left_index = *((unsigned char*)ant_states[0]);
+ int right_index = *((unsigned char*)ant_states[1]);
+ if (right_index == -1)  // NOTE(review): right_index comes from an unsigned char (0..255), so this comparison is never true - confirm the intended sentinel
+ dpstate = static_cast<unsigned int>(left_index);
+ else
+ dpstate = static_cast<unsigned int>(right_index);
+// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index];
+// cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl;
+// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
+// features->set_value(fid, 1.0);
+ FireFeature(smeta, left_index, right_index, features);  // called for every binary edge, whatever right_index holds
+ }
+}
+
+// std::vector<std::map<int, int> > flen2jump2fid_;
+// `param` must be exactly "-b" or "+b"; anything else prints a message and
+// exits. "+b" turns on binary indicator features, one per (source length, jump)
+// pair, which are created here. "-b" keeps the single real-valued "MarkovJump"
+// feature.
+MarkovJump::MarkovJump(const string& param) :
+    FeatureFunction(1),
+    fid_(FD::Convert("MarkovJump")),
+    binary_params_(false) {
+  cerr << " MarkovJump";
+  vector<string> argv;
+  const int argc = SplitOnWhitespace(param, &argv);
+  const bool well_formed = (argc == 1) && (argv[0] == "-b" || argv[0] == "+b");
+  if (!well_formed) {
+    cerr << "MarkovJump: expected parameters to be -b or +b\n";
+    exit(1);
+  }
+  binary_params_ = argv[0] == "+b";
+  if (!binary_params_) {
+    cerr << " (Blunsom & Cohn definition)";
+  } else {
+    flen2jump2fid_.resize(MAX_SENTENCE_SIZE);
+    for (int flen = 1; flen < MAX_SENTENCE_SIZE; ++flen) {
+      map<int, int>& jump2fid = flen2jump2fid_[flen];
+      for (int jump = -flen; jump <= flen; ++jump) {
+        ostringstream name;
+        name << "Jump:FLen:" << flen << "_J:" << jump;
+        jump2fid[jump] = FD::Convert(name.str());
+      }
+    }
+  }
+  cerr << endl;
+}
+
+// TODO handle NULLs according to Och 2000
+void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,  // propagates a one-byte position state and fires jump features
+ const Hypergraph::Edge& edge,
+ const vector<const void*>& ant_states,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* state) const {
+ unsigned char& dpstate = *((unsigned char*)state);  // the state is read and written as a single byte
+ const int flen = smeta.GetSourceLength();
+ if (edge.Arity() == 0) {
+ dpstate = static_cast<unsigned int>(edge.i_);  // narrowed to one byte on assignment, so -1 would be stored as 255
+ if (edge.prev_i_ == 0) {  // edge.prev_i_ is 0: a jump of edge.i_ + 1 is recorded (binary mode only)
+ if (binary_params_) {
+ // NULL will be tricky
+ // TODO initial state distribution, not normal jumps
+ const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;  // lookup is not checked for a miss
+ features->set_value(fid, 1.0);
+ }
+ } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {  // edge.prev_i_ is the last target index: a jump of flen - edge.i_ is recorded
+ // NULL will be tricky
+ if (binary_params_) {
+ int jumpsize = flen - edge.i_;
+ const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
+ features->set_value(fid, 1.0);
+ }
+ }
+ } else if (edge.Arity() == 1) {
+ dpstate = *((unsigned char*)ant_states[0]);  // unary edge: pass the antecedent's state through
+ } else if (edge.Arity() == 2) {
+ int left_index = *((unsigned char*)ant_states[0]);
+ int right_index = *((unsigned char*)ant_states[1]);
+ if (right_index == -1)  // NOTE(review): right_index comes from an unsigned char (0..255), so this comparison is never true - confirm the intended sentinel
+ dpstate = static_cast<unsigned int>(left_index);
+ else
+ dpstate = static_cast<unsigned int>(right_index);
+ const int jumpsize = right_index - left_index;
+
+ if (binary_params_) {
+ const int fid = flen2jump2fid_[flen].find(jumpsize)->second;  // lookup is not checked for a miss
+ features->set_value(fid, 1.0);
+ } else {
+ features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def
+ }
+ } else {
+ assert(!"something really unexpected is happening");  // arities above 2 are not supported
+ }
+}
+
+// state: POS of src word used, number of trg words generated
+// Loads one sequence of source POS tags per non-empty line of the file named
+// by param; empty lines are skipped and do not create an entry in pos_.
+SourcePOSBigram::SourcePOSBigram(const std::string& param) :
+    FeatureFunction(sizeof(WordID) + sizeof(int)) {
+  cerr << "Reading source POS tags from " << param << endl;
+  ReadFile rf(param);
+  istream& in = *rf.stream();
+  string line;
+  while (getline(in, line)) {
+    if (line.empty()) continue;
+    vector<WordID> tags;
+    TD::ConvertSentence(line, &tags);
+    pos_.push_back(tags);
+  }
+  cerr << " (" << pos_.size() << " lines)\n";
+}
+
+// Fires the closing bigram (tag, EOS) for the final state, and additionally
+// (BOS, tag) when the stored word count is exactly 1.
+// The state layout is a WordID followed by an int word count.
+void SourcePOSBigram::FinalTraversalFeatures(const void* context,
+                                             SparseVector<double>* features) const {
+  const WordID last_tag = *static_cast<const WordID*>(context);
+  const int word_count = static_cast<const int*>(context)[1];
+  if (word_count == 1)
+    FireFeature(-1, last_tag, features);
+  FireFeature(last_tag, -1, features);
+}
+
+// Sets the "SP:<left>_<right>" feature to 1.0, caching its id in fmap_.
+// Negative tags are rendered as BOS (left) / EOS (right). A cached value of 0
+// means "not looked up yet"; when FD::Convert returns 0 the entry is stored as
+// -1 and the feature is never fired.
+void SourcePOSBigram::FireFeature(WordID left,
+                                  WordID right,
+                                  SparseVector<double>* features) const {
+  int& cached = fmap_[left][right];
+  if (cached == 0) {
+    ostringstream name;
+    name << "SP:";
+    if (left < 0)
+      name << "BOS";
+    else
+      name << TD::Convert(left);
+    name << '_';
+    if (right < 0)
+      name << "EOS";
+    else
+      name << TD::Convert(right);
+    const int converted = FD::Convert(name.str());
+    cached = (converted == 0) ? -1 : converted;
+  }
+  if (cached >= 0)
+    features->set_value(cached, 1.0);
+}
+
+// Propagates the (last source POS tag, target word count) state up the forest
+// and fires POS bigram features when two antecedents are combined.
+//   arity 0: state is the POS tag at source position edge.i_ plus the rule's
+//            target word count (asserted to be 1).
+//   arity 2: fires the (left, right) bigram, plus (BOS, left) when both
+//            antecedents cover one word each; the new state is the right tag
+//            and the summed word count.
+// Edges of any other arity leave the state and the features untouched.
+void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                            const Hypergraph::Edge& edge,
+                                            const std::vector<const void*>& ant_contexts,
+                                            SparseVector<double>* features,
+                                            SparseVector<double>* estimated_features,
+                                            void* context) const {
+  WordID& out_context = *static_cast<WordID*>(context);
+  int& out_word_count = *(static_cast<int*>(context) + 1);
+  const int arity = edge.Arity();
+  if (arity == 0) {
+    assert(smeta.GetSentenceID() < pos_.size());
+    const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()];
+    assert(edge.i_ < pos_sent.size());
+    out_context = pos_sent[edge.i_];
+    out_word_count = edge.rule_->EWords();
+    assert(out_word_count == 1); // this is only defined for lex translation!
+    // revisit this if you want to translate into null words
+  } else if (arity == 2) {
+    WordID left = *static_cast<const WordID*>(ant_contexts[0]);
+    WordID right = *static_cast<const WordID*>(ant_contexts[1]);
+    int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
+    // The right word count comes from the second antecedent's state; reading
+    // it from ant_contexts[0] would simply duplicate left_wc.
+    int right_wc = *(static_cast<const int*>(ant_contexts[1]) + 1);
+    if (left_wc == 1 && right_wc == 1)
+      FireFeature(-1, left, features);
+    FireFeature(left, right, features);
+    out_word_count = left_wc + right_wc;
+    out_context = right;
+  }
+}
+
+// Expects "<FeatureName> <file.pharaoh>": registers the feature name and loads
+// one alignment grid per line of the Pharaoh-format file.
+AlignerResults::AlignerResults(const std::string& param) :
+    cur_sent_(-1),
+    cur_grid_(NULL) {
+  vector<string> args;
+  const int num_args = SplitOnWhitespace(param, &args);
+  if (num_args != 2) {
+    cerr << "Required format: AlignerResults [FeatureName] [file.pharaoh]\n";
+    exit(1);
+  }
+  const string& feature_name = args[0];
+  const string& alignment_file = args[1];
+  cerr << " feature: " << feature_name << "\talignments: " << alignment_file << endl;
+  fid_ = FD::Convert(feature_name);
+  ReadFile rf(alignment_file);
+  istream& in = *rf.stream();
+  int num_loaded = 0;
+  string line;
+  while (getline(in, line)) {
+    ++num_loaded;
+    is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line));
+  }
+  cerr << " Loaded " << num_loaded << " refs\n";
+}
+
+// Fires fid_ with value 1.0 when the point (edge.prev_i_, edge.i_) is set in
+// the alignment grid loaded for the current sentence. Edges where either index
+// is -1, or where the point falls outside the grid, fire nothing.
+void AlignerResults::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                           const Hypergraph::Edge& edge,
+                                           const vector<const void*>& ant_states,
+                                           SparseVector<double>* features,
+                                           SparseVector<double>* estimated_features,
+                                           void* state) const {
+  if (edge.i_ == -1 || edge.prev_i_ == -1)
+    return;
+
+  // Switch the cached grid whenever a different sentence is being scored.
+  if (cur_sent_ != smeta.GetSentenceID()) {
+    assert(smeta.HasReference());
+    cur_sent_ = smeta.GetSentenceID();
+    assert(cur_sent_ < is_aligned_.size());
+    cur_grid_ = is_aligned_[cur_sent_].get();
+  }
+
+  const int src_pos = edge.i_;       // source side (f)
+  const int trg_pos = edge.prev_i_;  // target side (e)
+  const Array2D<bool>& grid = *cur_grid_;
+  if (!(src_pos < grid.height())) return;
+  if (!(trg_pos < grid.width())) return;
+  if (grid(trg_pos, src_pos))
+    features->set_value(fid_, 1.0);
+}
+
+// Loads one reference sentence per line of the file named by param.
+// The state is (100 / 8) + 1 bytes and the feature is named "NotRef".
+BlunsomSynchronousParseHack::BlunsomSynchronousParseHack(const string& param) :
+    FeatureFunction((100 / 8) + 1),
+    fid_(FD::Convert("NotRef")),
+    cur_sent_(-1) {
+  ReadFile rf(param);
+  istream& in = *rf.stream();
+  int num_loaded = 0;
+  string line;
+  while (getline(in, line)) {
+    ++num_loaded;
+    refs_.push_back(vector<WordID>());
+    TD::ConvertSentence(line, &refs_.back());
+  }
+  cerr << " Loaded " << num_loaded << " refs\n";
+}
+
+// Checks whether the target yield of this edge occurs in the reference of the
+// current sentence. If it does, the covered reference positions are recorded
+// in the state bit mask; if it does not, the fid_ feature is set to 1 and the
+// state is left untouched. Edges with an antecedent whose state is all zero
+// bytes return immediately without firing anything.
+void BlunsomSynchronousParseHack::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const vector<const void*>& ant_states,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* state) const {
+ // On a sentence change, rebuild cur_map_: every contiguous sub-sequence of
+ // the reference is mapped to a start index (a repeated phrase keeps the
+ // index assigned last, i.e. its last occurrence).
+ if (cur_sent_ != smeta.GetSentenceID()) {
+ // assert(smeta.HasReference());
+ cur_sent_ = smeta.GetSentenceID();
+ assert(cur_sent_ < refs_.size());
+ cur_ref_ = &refs_[cur_sent_];
+ cur_map_.clear();
+ for (int i = 0; i < cur_ref_->size(); ++i) {
+ vector<WordID> phrase;
+ for (int j = i; j < cur_ref_->size(); ++j) {
+ phrase.push_back((*cur_ref_)[j]);
+ cur_map_[phrase] = i;
+ }
+ }
+ }
+ //cerr << edge.rule_->AsString() << endl;
+ // An antecedent with an empty mask was not matched against the reference.
+ for (int i = 0; i < ant_states.size(); ++i) {
+ if (DoesNotBelong(ant_states[i])) {
+ //cerr << " ant " << i << " does not belong\n";
+ return;
+ }
+ }
+ // Recover each antecedent's word string from its mask so the rule's target
+ // side can be instantiated.
+ vector<vector<WordID> > ants(ant_states.size());
+ vector<const vector<WordID>* > pants(ant_states.size());
+ for (int i = 0; i < ant_states.size(); ++i) {
+ AppendAntecedentString(ant_states[i], &ants[i]);
+ //cerr << " ant[" << i << "]: " << ((int)*(static_cast<const unsigned char*>(ant_states[i]))) << " " << TD::GetString(ants[i]) << endl;
+ pants[i] = &ants[i];
+ }
+ vector<WordID> yield;
+ edge.rule_->ESubstitute(pants, &yield);
+ //cerr << "YIELD: " << TD::GetString(yield) << endl;
+ Vec2Int::iterator it = cur_map_.find(yield);
+ if (it == cur_map_.end()) {
+ features->set_value(fid_, 1);
+ //cerr << " BAD!\n";
+ return;
+ }
+ // it->second is the start index stored for this yield in cur_map_.
+ SetStateMask(it->second, it->second + yield.size(), state);
+}
+
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
new file mode 100644
index 00000000..c44ad26b
--- /dev/null
+++ b/decoder/ff_wordalign.h
@@ -0,0 +1,196 @@
+#ifndef _FF_WORD_ALIGN_H_
+#define _FF_WORD_ALIGN_H_
+
+#include "ff.h"
+#include "array2d.h"
+
+#include <boost/multi_array.hpp>
+
+// Feature function declared here; its constructor and TraversalFeaturesImpl
+// are defined outside this header.
+class RelativeSentencePosition : public FeatureFunction {
+ public:
+ RelativeSentencePosition(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ const int fid_;
+ // NOTE(review): presumably selects the per-class features in fids_ -- confirm
+ // against the constructor.
+ bool condition_on_fclass_;
+ // NOTE(review): looks like one word-class sequence per sentence -- confirm.
+ std::vector<std::vector<WordID> > pos_;
+ std::map<WordID, int> fids_; // fclass -> fid
+};
+
+// Feature function declared here; its constructor and TraversalFeaturesImpl
+// are defined outside this header.
+class Model2BinaryFeatures : public FeatureFunction {
+ public:
+ Model2BinaryFeatures(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ // Three-dimensional table of feature ids.
+ // NOTE(review): the meaning of the three axes is not visible here -- confirm
+ // against the constructor.
+ boost::multi_array<int, 3> fids_;
+};
+
+// Jump-size feature function. The constructor accepts "-b" or "+b"; "+b"
+// selects one binary feature per (source length, jump) pair, otherwise the
+// single fid_ feature is used.
+class MarkovJump : public FeatureFunction {
+ public:
+ MarkovJump(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ // Id of the "MarkovJump" feature.
+ const int fid_;
+ // True when constructed with "+b".
+ bool binary_params_;
+ // Indexed by source length; each map goes from jump size to feature id.
+ // Only filled when binary_params_ is true.
+ std::vector<std::map<int, int> > flen2jump2fid_;
+};
+
+// Feature function declared here; its member functions are defined outside
+// this header.
+class MarkovJumpFClass : public FeatureFunction {
+ public:
+ MarkovJumpFClass(const std::string& param);
+ virtual void FinalTraversalFeatures(const void* context,
+ SparseVector<double>* features) const;
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+
+ // Helper taking a previous and a current source position.
+ // NOTE(review): presumably fires the feature for the jump between the two
+ // positions -- confirm against the definition.
+ void FireFeature(const SentenceMetadata& smeta,
+ int prev_src_pos,
+ int cur_src_pos,
+ SparseVector<double>* features) const;
+
+ private:
+ std::vector<std::map<WordID, std::map<int, int> > > fids_; // flen -> fclass -> jumpsize -> fid
+ // NOTE(review): looks like one word-class sequence per sentence -- confirm.
+ std::vector<std::vector<WordID> > pos_;
+};
+
+// Two-level cache used by SourcePOSBigram: left tag -> right tag -> feature id.
+typedef std::map<WordID, int> Class2FID;
+typedef std::map<WordID, Class2FID> Class2Class2FID;
+// Fires "SP:<left>_<right>" features over bigrams of source POS tags. The
+// state holds a WordID (last tag) followed by an int (target word count).
+class SourcePOSBigram : public FeatureFunction {
+ public:
+ // param names the file of POS tag sequences, one sentence per line.
+ SourcePOSBigram(const std::string& param);
+ virtual void FinalTraversalFeatures(const void* context,
+ SparseVector<double>* features) const;
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+ // The definition names these parameters left and right; a negative value is
+ // rendered as BOS (first argument) or EOS (second argument).
+ void FireFeature(WordID src,
+ WordID trg,
+ SparseVector<double>* features) const;
+ // mutable because FireFeature is const but fills the cache lazily.
+ mutable Class2Class2FID fmap_;
+ // POS tag sequence of each source sentence, indexed by sentence id.
+ std::vector<std::vector<WordID> > pos_;
+};
+
+// Fires a single feature on edges whose alignment point is present in a
+// per-sentence grid loaded from a Pharaoh-format alignment file.
+class AlignerResults : public FeatureFunction {
+ public:
+ // param has the form "<FeatureName> <file.pharaoh>".
+ AlignerResults(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ int fid_;
+ // One alignment grid per line of the input file.
+ std::vector<boost::shared_ptr<Array2D<bool> > > is_aligned_;
+ // Sentence id and grid cached by the const TraversalFeaturesImpl.
+ mutable int cur_sent_;
+ const Array2D<bool> mutable* cur_grid_;
+};
+
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include <cassert>
+// Feature function that fires fid_ when the target yield of an edge is not a
+// contiguous sub-sequence of the reference. The state is a bit mask over
+// reference positions: bit (p % 8) of byte (p / 8) marks position p.
+class BlunsomSynchronousParseHack : public FeatureFunction {
+ public:
+ // param names the file of reference sentences, one per line.
+ BlunsomSynchronousParseHack(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ // Returns true when every byte of the state is zero.
+ inline bool DoesNotBelong(const void* state) const {
+ for (int i = 0; i < NumBytesContext(); ++i) {
+ if (*(static_cast<const unsigned char*>(state) + i)) return false;
+ }
+ return true;
+ }
+
+ // Appends to yield the reference words covered by the first run of set bits
+ // in state. The state must contain at least one set bit (asserted).
+ inline void AppendAntecedentString(const void* state, std::vector<WordID>* yield) const {
+ int i = 0;
+ int ind = 0;
+ // Skip whole zero bytes; ind tracks the reference position of bit 0 of byte i.
+ while (i < NumBytesContext() && !(*(static_cast<const unsigned char*>(state) + i))) { ++i; ind += 8; }
+ // std::cerr << i << " " << NumBytesContext() << std::endl;
+ assert(i != NumBytesContext());
+ assert(ind < cur_ref_->size());
+ int cur = *(static_cast<const unsigned char*>(state) + i);
+ int comp = 1;
+ // Advance to the first set bit inside byte i.
+ while (comp < 256 && (comp & cur) == 0) { comp <<= 1; ++ind; }
+ assert(ind < cur_ref_->size());
+ assert(comp < 256);
+ // Copy words while consecutive bits stay set, moving on to the next byte
+ // whenever comp runs past bit 7.
+ // NOTE(review): after ++i the next byte is read without checking i against
+ // NumBytesContext(); this looks safe only while the top bit of the last
+ // byte is never set -- confirm.
+ do {
+ assert(ind < cur_ref_->size());
+ yield->push_back((*cur_ref_)[ind]);
+ ++ind;
+ comp <<= 1;
+ if (comp == 256) {
+ comp = 1;
+ ++i;
+ cur = *(static_cast<const unsigned char*>(state) + i);
+ }
+ } while (comp & cur);
+ }
+
+ // Sets the bits for positions [start, end) in state; other bits are kept.
+ inline void SetStateMask(int start, int end, void* state) const {
+ assert((end / 8) < NumBytesContext());
+ int i = 0;
+ int comp = 1;
+ // Walk to the byte index / bit value that correspond to position start.
+ for (int j = 0; j < start; ++j) {
+ comp <<= 1;
+ if (comp == 256) {
+ ++i;
+ comp = 1;
+ }
+ }
+ //std::cerr << "SM: " << i << "\n";
+ for (int j = start; j < end; ++j) {
+ *(static_cast<unsigned char*>(state) + i) |= comp;
+ //std::cerr << " " << comp << "\n";
+ comp <<= 1;
+ if (comp == 256) {
+ ++i;
+ comp = 1;
+ }
+ }
+ //std::cerr << " MASK: " << ((int)*(static_cast<unsigned char*>(state))) << "\n";
+ }
+
+ const int fid_;
+ // The members below are mutable because they are refreshed per sentence from
+ // the const TraversalFeaturesImpl.
+ mutable int cur_sent_;
+ typedef std::tr1::unordered_map<std::vector<WordID>, int, boost::hash<std::vector<WordID> > > Vec2Int;
+ // Reference phrase -> start index, for the current sentence.
+ mutable Vec2Int cur_map_;
+ const std::vector<WordID> mutable * cur_ref_;
+ // All reference sentences, indexed by sentence id.
+ mutable std::vector<std::vector<WordID> > refs_;
+};
+
+#endif
diff --git a/decoder/filelib.cc b/decoder/filelib.cc
new file mode 100644
index 00000000..79ad2847
--- /dev/null
+++ b/decoder/filelib.cc
@@ -0,0 +1,22 @@
+#include "filelib.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+using namespace std;
+
+// Returns true when stat() succeeds on fn.
+bool FileExists(const std::string& fn) {
+  struct stat file_info;
+  return stat(fn.c_str(), &file_info) == 0;
+}
+
+// Returns true when dir names an existing directory.
+// The stat() result is checked before st_mode is read, so the structure is
+// never inspected uninitialised. S_ISDIR compares the whole file-type field;
+// a plain "& S_IFDIR" bit test can also match other file types whose type
+// code shares that bit (e.g. block devices on Linux).
+bool DirectoryExists(const string& dir) {
+  struct stat status;
+  if (stat(dir.c_str(), &status) != 0) return false;
+  return S_ISDIR(status.st_mode);
+}
+
diff --git a/decoder/filelib.h b/decoder/filelib.h
new file mode 100644
index 00000000..03c22b0d
--- /dev/null
+++ b/decoder/filelib.h
@@ -0,0 +1,70 @@
+#ifndef _FILELIB_H_
+#define _FILELIB_H_
+
+#include <cassert>
+#include <string>
+#include <iostream>
+#include <cstdlib>
+#include "gzstream.h"
+
+bool FileExists(const std::string& file_name);
+bool DirectoryExists(const std::string& dir_name);
+
+// reads from standard in if filename is -
+// uncompresses if file ends with .gz
+// otherwise, reads from a normal file
+// Input stream selected by file name. The stream is owned by this object and
+// deleted in the destructor, except for std::cin (filename "-").
+// The constructor aborts when the file does not exist or cannot be opened.
+class ReadFile {
+ public:
+  ReadFile(const std::string& filename) :
+    no_delete_on_exit_(filename == "-"),
+    in_(no_delete_on_exit_ ? static_cast<std::istream*>(&std::cin) :
+      (EndsWith(filename, ".gz") ?
+        static_cast<std::istream*>(new igzstream(filename.c_str())) :
+        static_cast<std::istream*>(new std::ifstream(filename.c_str())))) {
+    if (!no_delete_on_exit_ && !FileExists(filename)) {
+      std::cerr << "File does not exist: " << filename << std::endl;
+      abort();
+    }
+    if (!*in_) {
+      std::cerr << "Failed to open " << filename << std::endl;
+      abort();
+    }
+  }
+  ~ReadFile() {
+    if (!no_delete_on_exit_) delete in_;
+  }
+
+  // The returned stream stays valid only while this ReadFile is alive.
+  inline std::istream* stream() { return in_; }
+
+ private:
+  // Not copyable: a copy would share in_ and delete it a second time.
+  // Declared but intentionally left undefined.
+  ReadFile(const ReadFile&);
+  ReadFile& operator=(const ReadFile&);
+
+  // True when f is strictly longer than suf and ends with it.
+  static bool EndsWith(const std::string& f, const std::string& suf) {
+    return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size());
+  }
+  // Declared before in_ because in_'s initializer reads it.
+  const bool no_delete_on_exit_;
+  std::istream* const in_;
+};
+
+// Output stream selected by file name:
+//   - "-" writes to std::cout (not owned, never deleted)
+//   - names ending in ".gz" are written through ogzstream
+//   - anything else is written through std::ofstream
+// The stream is flushed, and deleted if owned, in the destructor. Unlike
+// ReadFile, the constructor does not check that the file could be opened.
+class WriteFile {
+ public:
+  WriteFile(const std::string& filename) :
+    no_delete_on_exit_(filename == "-"),
+    out_(no_delete_on_exit_ ? static_cast<std::ostream*>(&std::cout) :
+      (EndsWith(filename, ".gz") ?
+        static_cast<std::ostream*>(new ogzstream(filename.c_str())) :
+        static_cast<std::ostream*>(new std::ofstream(filename.c_str())))) {}
+  ~WriteFile() {
+    (*out_) << std::flush;
+    if (!no_delete_on_exit_) delete out_;
+  }
+
+  // The returned stream stays valid only while this WriteFile is alive.
+  inline std::ostream* stream() { return out_; }
+
+ private:
+  // Not copyable: a copy would share out_ and delete it a second time.
+  // Declared but intentionally left undefined.
+  WriteFile(const WriteFile&);
+  WriteFile& operator=(const WriteFile&);
+
+  // True when f is strictly longer than suf and ends with it.
+  static bool EndsWith(const std::string& f, const std::string& suf) {
+    return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size());
+  }
+  // Declared before out_ because out_'s initializer reads it.
+  const bool no_delete_on_exit_;
+  std::ostream* const out_;
+};
+
+#endif
diff --git a/decoder/forest_writer.cc b/decoder/forest_writer.cc
new file mode 100644
index 00000000..a9117d18
--- /dev/null
+++ b/decoder/forest_writer.cc
@@ -0,0 +1,23 @@
+#include "forest_writer.h"
+
+#include <iostream>
+
+#include <boost/lexical_cast.hpp>
+
+#include "filelib.h"
+#include "hg_io.h"
+#include "hg.h"
+
+using namespace std;
+
+// The output file is <path>/<num>.json.gz; nothing is written until Write().
+ForestWriter::ForestWriter(const std::string& path, int num) :
+ fname_(path + '/' + boost::lexical_cast<string>(num) + ".json.gz"), used_(false) {}
+
+// Serialises forest as JSON into fname_ and returns the result reported by
+// HypergraphIO::WriteToJSON. May be called only once per ForestWriter
+// (asserted through used_).
+bool ForestWriter::Write(const Hypergraph& forest, bool minimal_rules) {
+  assert(!used_);
+  used_ = true;
+  cerr << " Writing forest to " << fname_ << endl;
+  WriteFile out_file(fname_);
+  const bool written =
+      HypergraphIO::WriteToJSON(forest, minimal_rules, out_file.stream());
+  return written;
+}
+
diff --git a/decoder/forest_writer.h b/decoder/forest_writer.h
new file mode 100644
index 00000000..819a8940
--- /dev/null
+++ b/decoder/forest_writer.h
@@ -0,0 +1,16 @@
+#ifndef _FOREST_WRITER_H_
+#define _FOREST_WRITER_H_
+
+#include <string>
+
+class Hypergraph;
+
+// Writes one hypergraph to <path>/<num>.json.gz.
+struct ForestWriter {
+ ForestWriter(const std::string& path, int num);
+ // Writes forest to fname_; asserts that it has not been called before.
+ bool Write(const Hypergraph& forest, bool minimal_rules);
+
+ // Full output path, fixed at construction.
+ const std::string fname_;
+ // Set to true by the first call to Write().
+ bool used_;
+};
+
+#endif
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
new file mode 100644
index 00000000..9e25d346
--- /dev/null
+++ b/decoder/freqdict.cc
@@ -0,0 +1,29 @@
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include "freqdict.h"
+#include "tdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+// Reads whitespace-separated "word count" pairs from fname into counts_.
+// A token starting with '#' begins a comment that runs to the end of its
+// line. Loading stops at the first word whose count is missing or malformed;
+// that word is not stored.
+void FreqDict::Load(const std::string& fname) {
+  cerr << "Reading word frequencies: " << fname << endl;
+  ReadFile rf(fname);
+  istream& ifs = *rf.stream();
+  int cc = 0;
+  std::string word;
+  while (ifs >> word) {
+    if (word[0] == '#') {
+      // Discard the remainder of the comment line so that its words are not
+      // mistaken for a "word count" pair.
+      std::string rest_of_line;
+      getline(ifs, rest_of_line);
+      continue;
+    }
+    double count = 0;
+    if (!(ifs >> count)) {
+      cerr << "FreqDict: missing or malformed count for '" << word
+           << "' in " << fname << endl;
+      assert(!"malformed frequency file");
+      break;
+    }
+    assert(count > 0.0); // use -log(f)
+    counts_[TD::Convert(word)] = count;
+    ++cc;
+    if (cc % 10000 == 0) { std::cerr << "."; }
+  }
+  std::cerr << "\n";
+  std::cerr << "Loaded " << cc << " words\n";
+}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
new file mode 100644
index 00000000..9acf0c33
--- /dev/null
+++ b/decoder/freqdict.h
@@ -0,0 +1,20 @@
+#ifndef _FREQDICT_H_
+#define _FREQDICT_H_
+
+#include <map>
+#include <string>
+#include "wordid.h"
+
+// Table of per-word counts keyed by WordID.
+class FreqDict {
+ public:
+  // Fills the table from the file fname.
+  void Load(const std::string& fname);
+  // Returns the stored count for word, or 0 when the word is not in the table.
+  float LookUp(const WordID& word) const {
+    const std::map<WordID,float>::const_iterator hit = counts_.find(word);
+    if (hit != counts_.end())
+      return hit->second;
+    return 0;
+  }
+ private:
+  std::map<WordID, float> counts_;
+};
+
+#endif
diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc
new file mode 100644
index 00000000..38dbd717
--- /dev/null
+++ b/decoder/fst_translator.cc
@@ -0,0 +1,91 @@
+#include "translator.h"
+
+#include <sstream>
+#include <boost/shared_ptr.hpp>
+
+#include "sentence_metadata.h"
+#include "filelib.h"
+#include "hg.h"
+#include "hg_io.h"
+#include "earley_composer.h"
+#include "phrasetable_fst.h"
+#include "tdict.h"
+
+using namespace std;
+
+// Implementation behind FSTTranslator: composes the input (a JSON source
+// hypergraph or a plain sentence) with a phrase-table FST using the Earley
+// composer, then attaches a [Goal] node to the resulting forest.
+struct FSTTranslatorImpl {
+  // Reads "goal", "grammar" and the optional "add_pass_through_rules" flag
+  // from conf and loads the text phrase table.
+  FSTTranslatorImpl(const boost::program_options::variables_map& conf) :
+      goal_sym(conf["goal"].as<string>()),
+      kGOAL_RULE(new TRule("[Goal] ||| [" + goal_sym + ",1] ||| [1]")),
+      kGOAL(TD::Convert("Goal") * -1),
+      add_pass_through_rules(conf.count("add_pass_through_rules")) {
+    fst.reset(LoadTextPhrasetable(conf["grammar"].as<vector<string> >()));
+    ec.reset(new EarleyComposer(fst.get()));
+  }
+
+  // Translates input into forest and reweights it with weights.
+  // Input starting with {"rules" is parsed as a JSON hypergraph; anything else
+  // is wrapped in a one-rule dummy grammar. Returns true when composition
+  // succeeded, false when it failed or the JSON input could not be read.
+  bool Translate(const string& input,
+                 const vector<double>& weights,
+                 Hypergraph* forest) {
+    bool composed = false;
+    if (input.find("{\"rules\"") == 0) {
+      istringstream is(input);
+      Hypergraph src_cfg_hg;
+      // The read must happen outside assert(): an assert argument is not
+      // evaluated when NDEBUG is defined.
+      const bool read_ok = HypergraphIO::ReadFromJSON(&is, &src_cfg_hg);
+      assert(read_ok);
+      if (!read_ok) return false;
+      if (add_pass_through_rules) {
+        SparseVector<double> feats;
+        feats.set_value(FD::Convert("PassThrough"), 1);
+        for (int i = 0; i < src_cfg_hg.edges_.size(); ++i) {
+          const vector<WordID>& f = src_cfg_hg.edges_[i].rule_->f_;
+          for (int j = 0; j < f.size(); ++j) {
+            // Only positive symbols get a pass-through translation.
+            if (f[j] > 0) {
+              fst->AddPassThroughTranslation(f[j], feats);
+            }
+          }
+        }
+      }
+      composed = ec->Compose(src_cfg_hg, forest);
+    } else {
+      const string dummy_grammar("[" + goal_sym + "] ||| " + input + " ||| TOP=1");
+      cerr << " Dummy grammar: " << dummy_grammar << endl;
+      istringstream is(dummy_grammar);
+      if (add_pass_through_rules) {
+        vector<WordID> words;
+        TD::ConvertSentence(input, &words);
+        SparseVector<double> feats;
+        feats.set_value(FD::Convert("PassThrough"), 1);
+        for (int i = 0; i < words.size(); ++i)
+          fst->AddPassThroughTranslation(words[i], feats);
+      }
+      composed = ec->Compose(&is, forest);
+    }
+    if (composed) {
+      // Attach a new Goal node above the last node of the composed forest.
+      Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+      Hypergraph::Node* goal = forest->AddNode(kGOAL);
+      Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+      forest->ConnectEdgeToHeadNode(hg_edge, goal);
+      forest->Reweight(weights);
+    }
+    // Pass-through entries are per input; remove them before the next call.
+    if (add_pass_through_rules)
+      fst->ClearPassThroughTranslations();
+    return composed;
+  }
+
+  // Member order matters: kGOAL_RULE's initializer reads goal_sym.
+  const string goal_sym;
+  const TRulePtr kGOAL_RULE;
+  const WordID kGOAL;  // TD::Convert("Goal") * -1
+  const bool add_pass_through_rules;
+  boost::shared_ptr<EarleyComposer> ec;
+  boost::shared_ptr<FSTNode> fst;
+};
+
+// Creates the implementation object from conf.
+FSTTranslator::FSTTranslator(const boost::program_options::variables_map& conf) :
+ pimpl_(new FSTTranslatorImpl(conf)) {}
+
+// Records a source length of 0 in smeta and forwards the translation request
+// to the implementation object.
+bool FSTTranslator::TranslateImpl(const string& input,
+ SentenceMetadata* smeta,
+ const vector<double>& weights,
+ Hypergraph* minus_lm_forest) {
+ smeta->SetSourceLength(0); // don't know how to compute this
+ return pimpl_->Translate(input, weights, minus_lm_forest);
+}
+
diff --git a/decoder/grammar.cc b/decoder/grammar.cc
new file mode 100644
index 00000000..5eb7887d
--- /dev/null
+++ b/decoder/grammar.cc
@@ -0,0 +1,148 @@
+#include "grammar.h"
+
+#include <algorithm>
+#include <utility>
+#include <map>
+
+#include "rule_lexer.h"
+#include "filelib.h"
+#include "tdict.h"
+
+using namespace std;
+
+const vector<TRulePtr> Grammar::NO_RULES;
+
+// Out-of-line empty destructors for the abstract base types in grammar.h.
+RuleBin::~RuleBin() {}
+GrammarIter::~GrammarIter() {}
+Grammar::~Grammar() {}
+
+// Default span filter: every span is accepted, so none of the arguments is
+// consulted.
+bool Grammar::HasRuleForSpan(int /*i*/, int /*j*/, int /*distance*/) const {
+  return true; // always true by default
+}
+
+// RuleBin backed by a plain vector of rules that is filled through AddRule.
+struct TextRuleBin : public RuleBin {
+  int GetNumRules() const { return rules_.size(); }
+  TRulePtr GetIthRule(int i) const { return rules_[i]; }
+  void AddRule(TRulePtr t) { rules_.push_back(t); }
+  // Arity of the first stored rule; the bin must not be empty.
+  int Arity() const { return rules_.front()->Arity(); }
+  // Prints every stored rule to stderr, one per line.
+  void Dump() const {
+    for (vector<TRulePtr>::const_iterator it = rules_.begin();
+         it != rules_.end(); ++it)
+      cerr << (*it)->AsString() << endl;
+  }
+ private:
+  vector<TRulePtr> rules_;
+};
+
+// One node of the rule trie: children are keyed by the next source symbol and
+// rb_ holds the rules that end at this node (NULL when there are none).
+// The node deletes rb_ in its destructor.
+struct TextGrammarNode : public GrammarIter {
+  TextGrammarNode() : rb_(NULL) {}
+  ~TextGrammarNode() { delete rb_; }
+
+  // Returns the child reached by symbol, or NULL when there is no such child.
+  const GrammarIter* Extend(int symbol) const {
+    const map<WordID, TextGrammarNode>::const_iterator child = tree_.find(symbol);
+    if (child != tree_.end())
+      return &child->second;
+    return NULL;
+  }
+
+  // Returns the rule bin of this node; may be NULL.
+  const RuleBin* GetRules() const {
+    return rb_;
+  }
+
+  map<WordID, TextGrammarNode> tree_;
+  TextRuleBin* rb_;
+};
+
+// Private implementation of TextGrammar: just the root of the rule trie.
+struct TGImpl {
+ TextGrammarNode root_;
+};
+
+// Both constructors start with a maximum span of 10 and an empty rule trie.
+TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {}
+// Additionally loads the rules found in file.
+TextGrammar::TextGrammar(const string& file) :
+ max_span_(10),
+ pimpl_(new TGImpl) {
+ ReadFromFile(file);
+}
+
+// Returns the root of the rule trie; the pointer is owned by this grammar.
+const GrammarIter* TextGrammar::GetRoot() const {
+ return &pimpl_->root_;
+}
+
+// Adds rule to the grammar. Unary rules are stored in the unary tables only
+// (indexed by their first source symbol); every other rule is inserted into
+// the trie along its source side, creating nodes and the rule bin on demand.
+void TextGrammar::AddRule(const TRulePtr& rule) {
+  if (rule->IsUnary()) {
+    rhs2unaries_[rule->f().front()].push_back(rule);
+    unaries_.push_back(rule);
+    return;
+  }
+  TextGrammarNode* node = &pimpl_->root_;
+  for (int pos = 0; pos < rule->f_.size(); ++pos) {
+    const WordID symbol = rule->f_[pos];
+    node = &node->tree_[symbol];
+  }
+  if (!node->rb_)
+    node->rb_ = new TextRuleBin;
+  node->rb_->AddRule(rule);
+}
+
+// Callback passed to RuleLexer::ReadRules: extra is the TextGrammar that
+// receives new_rule.
+static void AddRuleHelper(const TRulePtr& new_rule, void* extra) {
+ static_cast<TextGrammar*>(extra)->AddRule(new_rule);
+}
+
+// Reads rules from filename and adds each of them to this grammar.
+void TextGrammar::ReadFromFile(const string& filename) {
+  ReadFile rule_file(filename);
+  istream* rule_stream = rule_file.stream();
+  RuleLexer::ReadRules(rule_stream, &AddRuleHelper, this);
+}
+
+// Accepts a span when distance does not exceed max_span_; the span end points
+// themselves are not consulted.
+bool TextGrammar::HasRuleForSpan(int /*i*/, int /*j*/, int distance) const {
+  return distance <= max_span_;
+}
+
+// Reads the glue rules from file, exactly like a TextGrammar.
+GlueGrammar::GlueGrammar(const string& file) : TextGrammar(file) {}
+
+// Builds the two standard glue rules for the given non-terminals:
+//   [goal] ||| [default,1] ||| [default,1]
+//   [goal] ||| [goal,1] [default,2] ||| [goal,1] [default,2] ||| Glue=1
+GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) {
+  const string lhs = "[" + goal_nt + "]";
+  const string single = "[" + default_nt + ",1]";
+  const string pair = "[" + goal_nt + ",1] [" + default_nt + ",2]";
+
+  TRulePtr stop_glue(new TRule(lhs + " ||| " + single + " ||| " + single));
+  TRulePtr glue(new TRule(lhs + " ||| " + pair + " ||| " + pair + " ||| Glue=1"));
+
+  AddRule(stop_glue);
+  AddRule(glue);
+}
+
+// Glue rules are offered only for spans that start at position 0.
+bool GlueGrammar::HasRuleForSpan(int i, int /*j*/, int /*distance*/) const {
+  return i == 0;
+}
+
+// Adds one identity rule "[cat] ||| w ||| w ||| PassThrough=1" for every arc
+// label w of the input lattice and records, for each start position, the end
+// positions reachable by a single arc.
+PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat) :
+    has_rule_(input.size() + 1) {
+  const string lhs = "[" + cat + "] ||| ";
+  for (int start = 0; start < input.size(); ++start) {
+    const vector<LatticeArc>& arcs = input[start];
+    for (int a = 0; a < arcs.size(); ++a) {
+      const LatticeArc& arc = arcs[a];
+      const int end = arc.dist2next + start;
+      has_rule_[start].insert(end);
+      const string& word = TD::Convert(arc.label);
+      TRulePtr rule(new TRule(lhs + word + " ||| " + word + " ||| PassThrough=1"));
+      AddRule(rule);
+    }
+  }
+}
+
+// For i == j: true when any arc leaves position i. Otherwise: true when an
+// arc recorded at position i ends exactly at j. distance is not consulted.
+bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const {
+  const set<int>& ends = has_rule_[i];
+  if (i == j)
+    return !ends.empty();
+  return ends.count(j) != 0;
+}
diff --git a/decoder/grammar.h b/decoder/grammar.h
new file mode 100644
index 00000000..46886d3a
--- /dev/null
+++ b/decoder/grammar.h
@@ -0,0 +1,89 @@
+#ifndef GRAMMAR_H_
+#define GRAMMAR_H_
+
+#include <vector>
+#include <map>
+#include <set>
+#include <boost/shared_ptr.hpp>
+#include <string>
+
+#include "lattice.h"
+#include "trule.h"
+
+// Abstract collection of rules stored at one grammar node.
+struct RuleBin {
+ virtual ~RuleBin();
+ // Number of rules in the bin.
+ virtual int GetNumRules() const = 0;
+ // Returns the i-th rule of the bin.
+ virtual TRulePtr GetIthRule(int i) const = 0;
+ virtual int Arity() const = 0;
+};
+
+// Abstract position inside a grammar's rule structure.
+struct GrammarIter {
+ virtual ~GrammarIter();
+ // Rules available at this position; the text grammar in grammar.cc returns
+ // NULL when there are none.
+ virtual const RuleBin* GetRules() const = 0;
+ // Position reached by consuming symbol; the text grammar in grammar.cc
+ // returns NULL when no rule continues with that symbol.
+ virtual const GrammarIter* Extend(int symbol) const = 0;
+};
+
+// Abstract grammar interface: a rule structure reachable through GetRoot()
+// plus separately stored unary rules that subclasses must fill in.
+struct Grammar {
+  typedef std::map<WordID, std::vector<TRulePtr> > Cat2Rules;
+  // Returned by GetUnaryRulesForRHS when no rule matches.
+  static const std::vector<TRulePtr> NO_RULES;
+
+  virtual ~Grammar();
+  virtual const GrammarIter* GetRoot() const = 0;
+  // The default implementation returns true for every span.
+  virtual bool HasRuleForSpan(int i, int j, int distance) const;
+  // const-qualified so the name can also be read through a const Grammar.
+  const std::string GetGrammarName() const { return grammar_name_; }
+  void SetGrammarName(std::string n) { grammar_name_ = n; }
+
+  // get every unary rule stored in this grammar
+  inline const std::vector<TRulePtr>& GetAllUnaryRules() const {
+    return unaries_;
+  }
+
+  // get all the unary rules that rewrite category cat
+  // (NO_RULES when there are none)
+  inline const std::vector<TRulePtr>& GetUnaryRulesForRHS(const WordID& cat) const {
+    Cat2Rules::const_iterator found = rhs2unaries_.find(cat);
+    if (found == rhs2unaries_.end())
+      return NO_RULES;
+    else
+      return found->second;
+  }
+
+ protected:
+  Cat2Rules rhs2unaries_; // these must be filled in by subclasses!
+  std::vector<TRulePtr> unaries_;
+  std::string grammar_name_;
+};
+
+typedef boost::shared_ptr<Grammar> GrammarPtr;
+
+class TGImpl;
+// Grammar whose rules are added one by one or read from a text file.
+struct TextGrammar : public Grammar {
+ TextGrammar();
+ // Loads the rules found in file.
+ TextGrammar(const std::string& file);
+ // Sets the largest distance for which HasRuleForSpan returns true.
+ void SetMaxSpan(int m) { max_span_ = m; }
+
+ virtual const GrammarIter* GetRoot() const;
+ void AddRule(const TRulePtr& rule);
+ void ReadFromFile(const std::string& filename);
+ virtual bool HasRuleForSpan(int i, int j, int distance) const;
+ // NOTE(review): no definition of GetUnaryRules appears in grammar.cc --
+ // confirm whether it is defined elsewhere or the declaration is stale.
+ const std::vector<TRulePtr>& GetUnaryRules(const WordID& cat) const;
+
+ private:
+ int max_span_;
+ boost::shared_ptr<TGImpl> pimpl_;
+
+};
+
+// Grammar of glue rules; HasRuleForSpan only accepts spans starting at 0.
+struct GlueGrammar : public TextGrammar {
+ // read glue grammar from file
+ explicit GlueGrammar(const std::string& file);
+ // builds the two default glue rules from the given non-terminal names
+ GlueGrammar(const std::string& goal_nt, const std::string& default_nt); // "S", "X"
+ virtual bool HasRuleForSpan(int i, int j, int distance) const;
+};
+
+// Grammar with one identity rule per arc label of the input lattice.
+struct PassThroughGrammar : public TextGrammar {
+ // cat is the left-hand-side category used for every generated rule.
+ PassThroughGrammar(const Lattice& input, const std::string& cat);
+ virtual bool HasRuleForSpan(int i, int j, int distance) const;
+ private:
+ // has_rule_[i] holds the end positions j of the arcs that start at i.
+ std::vector<std::set<int> > has_rule_; // index by [i][j]
+};
+
+#endif
diff --git a/decoder/grammar_test.cc b/decoder/grammar_test.cc
new file mode 100644
index 00000000..62b8f958
--- /dev/null
+++ b/decoder/grammar_test.cc
@@ -0,0 +1,59 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "trule.h"
+#include "tdict.h"
+#include "grammar.h"
+#include "bottom_up_parser.h"
+#include "ff.h"
+#include "weights.h"
+
+using namespace std;
+
+// Test fixture: loads the weights from test_data/weights.gt once per test.
+class GrammarTest : public testing::Test {
+ public:
+ GrammarTest() {
+ wts.InitFromFile("test_data/weights.gt");
+ }
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+ Weights wts;
+};
+
+// Smoke test: three rules can be constructed and added to an empty
+// TextGrammar. There are no assertions beyond running to completion.
+TEST_F(GrammarTest,TestTextGrammar) {
+  vector<double> weights;
+  vector<const FeatureFunction*> feature_functions;
+  ModelSet models(weights, feature_functions);
+
+  TextGrammar grammar;
+  TRulePtr abc_upper(new TRule("[X] ||| a b c ||| A B C ||| 0.1 0.2 0.3", true));
+  TRulePtr abc_digits(new TRule("[X] ||| a b c ||| 1 2 3 ||| 0.2 0.3 0.4", true));
+  TRulePtr abcd_upper(new TRule("[X] ||| a b c d ||| A B C D ||| 0.1 0.2 0.3", true));
+  cerr << abc_upper->AsString() << endl;
+  grammar.AddRule(abc_upper);
+  grammar.AddRule(abc_digits);
+  grammar.AddRule(abcd_upper);
+}
+
+// Loads test_data/grammar.prune, parses the two-word lattice "ein haus" with
+// it and prints the resulting forest in Graphviz format. There are no
+// assertions beyond running to completion.
+TEST_F(GrammarTest,TestTextGrammarFile) {
+  GrammarPtr grammar(new TextGrammar("./test_data/grammar.prune"));
+  vector<GrammarPtr> grammars(1, grammar);
+
+  LatticeArc ein_arc(TD::Convert("ein"), 0.0, 1);
+  LatticeArc haus_arc(TD::Convert("haus"), 0.0, 1);
+  Lattice lattice(2);
+  lattice[0].push_back(ein_arc);
+  lattice[1].push_back(haus_arc);
+
+  Hypergraph forest;
+  ExhaustiveBottomUpParser parser("PHRASE", grammars);
+  parser.Parse(lattice, &forest);
+  forest.PrintGraphviz();
+}
+
+// Test entry point: lets Google Test consume its command-line flags, then runs
+// every registered test.
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/decoder/gzstream.cc b/decoder/gzstream.cc
new file mode 100644
index 00000000..9703e6ad
--- /dev/null
+++ b/decoder/gzstream.cc
@@ -0,0 +1,165 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// ============================================================================
+//
+// File : gzstream.C
+// Revision : $Revision: 1.1 $
+// Revision_date : $Date: 2006/03/30 04:05:52 $
+// Author(s) : Deepak Bandyopadhyay, Lutz Kettner
+//
+// Standard streambuf implementation following Nicolai Josuttis, "The
+// Standard C++ Library".
+// ============================================================================
+
+#include "gzstream.h"
+#include <iostream>
+#include <cstring>
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See header file for user classes.
+// ----------------------------------------------------------------------------
+
+// --------------------------------------
+// class gzstreambuf:
+// --------------------------------------
+
+ // Opens the gzip file `name` for reading or for writing.
+ // Returns this on success and a null pointer when the buffer is already
+ // open, when the requested mode is unsupported (ate, app, or in combined
+ // with out), or when gzopen() fails.
+ gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {
+     gzstreambuf* const failure = 0;
+     if ( is_open())
+         return failure;
+     mode = open_mode;
+
+     const bool wants_read  = (mode & std::ios::in) != 0;
+     const bool wants_write = (mode & std::ios::out) != 0;
+     // no append nor read/write mode
+     if ((mode & std::ios::ate) || (mode & std::ios::app)
+         || (wants_read && wants_write))
+         return failure;
+
+     // Assemble the mode string passed to gzopen(): "rb", "wb", or just "b"
+     // when neither direction was requested.
+     char fmode[10];
+     int len = 0;
+     if ( wants_read)
+         fmode[len++] = 'r';
+     else if ( wants_write)
+         fmode[len++] = 'w';
+     fmode[len++] = 'b';
+     fmode[len] = '\0';
+
+     file = gzopen( name, fmode);
+     if (file == 0)
+         return failure;
+     opened = 1;
+     return this;
+ }
+
+ // Flushes pending output and closes the underlying gz file.
+ // Returns this when the buffer was open and gzclose() reported Z_OK,
+ // otherwise a null pointer. The result of sync() is not checked.
+ gzstreambuf * gzstreambuf::close() {
+     if ( ! is_open())
+         return (gzstreambuf*)0;
+     sync();
+     opened = 0;
+     const bool closed_cleanly = ( gzclose( file) == Z_OK);
+     return closed_cleanly ? this : (gzstreambuf*)0;
+ }
+
+ // Refills the get area from the gz file when it has been exhausted.
+ // Up to four already-consumed characters are preserved in front of the new
+ // data as a putback area. Returns the next available character (as an
+ // unsigned char value), or EOF when the buffer is not open for input or
+ // gzread() returns no data.
+ int gzstreambuf::underflow() { // used for input buffer only
+     if ( gptr() && ( gptr() < egptr()))
+         return * reinterpret_cast<unsigned char *>( gptr());
+
+     if ( ! (mode & std::ios::in) || ! opened)
+         return EOF;
+     // Josuttis' implementation of inbuf
+     int n_putback = gptr() - eback();
+     if ( n_putback > 4)
+         n_putback = 4;
+     // memmove, not memcpy: after short reads the characters being saved can
+     // overlap the putback slot they are copied into (e.g. bytes [3,5) moved
+     // to [2,4)), and memcpy on overlapping ranges is undefined behaviour.
+     memmove( buffer + (4 - n_putback), gptr() - n_putback, n_putback);
+
+     int num = gzread( file, buffer+4, bufferSize-4);
+     if (num <= 0) // ERROR or EOF
+         return EOF;
+
+     // reset buffer pointers
+     setg( buffer + (4 - n_putback),   // beginning of putback area
+           buffer + 4,                 // read position
+           buffer + 4 + num);          // end of buffer
+
+     // return next character
+     return * reinterpret_cast<unsigned char *>( gptr());
+ }
+
+ // Writes everything currently held in the put area to the gz file.
+ // Kept separate from overflow() and sync() so both can share it.
+ // Returns the number of bytes written, or EOF if gzwrite() wrote a
+ // different amount than requested.
+ int gzstreambuf::flush_buffer() {
+     const int pending = pptr() - pbase();
+     const int written = gzwrite( file, pbase(), pending);
+     if ( written != pending)
+         return EOF;
+     // Rewind the put pointer to the start of the buffer.
+     pbump( -pending);
+     return pending;
+ }
+
+ // Called when the put area is full (output buffers only): stores c in the
+ // spare slot at the end of the buffer (unless c is EOF) and flushes.
+ // Returns c on success, EOF when the buffer is not open for output or the
+ // flush fails.
+ int gzstreambuf::overflow( int c) {
+     const bool writable = ( mode & std::ios::out) && opened;
+     if ( ! writable)
+         return EOF;
+     if (c != EOF) {
+         *pptr() = c;
+         pbump(1);
+     }
+     return ( flush_buffer() == EOF) ? EOF : c;
+ }
+
+ // Flushes any buffered output. Returns 0 on success (including when there
+ // is nothing to flush) and -1 when the flush fails.
+ int gzstreambuf::sync() {
+     // Uses flush_buffer() rather than overflow( EOF), which caused improper
+     // behavior with std::endl and flush() (bug reported by Vincent Ricard).
+     const bool has_pending = pptr() && pptr() > pbase();
+     if ( has_pending && flush_buffer() == EOF)
+         return -1;
+     return 0;
+ }
+
+// --------------------------------------
+// class gzstreambase:
+// --------------------------------------
+
+ // Attaches the embedded gzstreambuf to this stream and opens `name` with the
+ // given mode; a failed open sets badbit (see open()).
+ gzstreambase::gzstreambase( const char* name, int mode) {
+ init( &buf);
+ open( name, mode);
+ }
+
+ // Closes the buffer on destruction; the result of close() is ignored.
+ gzstreambase::~gzstreambase() {
+ buf.close();
+ }
+
+ // Opens `name` through the embedded buffer; sets badbit on this stream when
+ // the buffer reports failure.
+ void gzstreambase::open( const char* name, int open_mode) {
+     const bool ok = ( buf.open( name, open_mode) != 0);
+     if ( ! ok)
+         clear( rdstate() | std::ios::badbit);
+ }
+
+ // Closes the embedded buffer if it is open; sets badbit when closing fails.
+ // Does nothing when the buffer is not open.
+ void gzstreambase::close() {
+     if ( buf.is_open() && ! buf.close())
+         clear( rdstate() | std::ios::badbit);
+ }
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+// ============================================================================
+// EOF //
diff --git a/decoder/gzstream.h b/decoder/gzstream.h
new file mode 100644
index 00000000..ad9785fd
--- /dev/null
+++ b/decoder/gzstream.h
@@ -0,0 +1,121 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// ============================================================================
+//
+// File : gzstream.h
+// Revision : $Revision: 1.1 $
+// Revision_date : $Date: 2006/03/30 04:05:52 $
+// Author(s) : Deepak Bandyopadhyay, Lutz Kettner
+//
+// Standard streambuf implementation following Nicolai Josuttis, "The
+// Standard C++ Library".
+// ============================================================================
+
+#ifndef GZSTREAM_H
+#define GZSTREAM_H 1
+
+// standard C++ with new header file names and std:: namespace
+#include <iostream>
+#include <fstream>
+#include <zlib.h>
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See below for user classes.
+// ----------------------------------------------------------------------------
+
+ // std::streambuf that reads from or writes to a gzip file through zlib's
+ // gz* functions. A single internal buffer serves as either the get area
+ // (with a 4-byte putback region at its front) or the put area; one object
+ // is used for input or for output, never both.
+ class gzstreambuf : public std::streambuf {
+ private:
+     static const int bufferSize = 47+256;    // size of data buff
+     // totals 512 bytes under g++ for igzstream at the end.
+
+     gzFile           file;               // file handle for compressed file
+     char             buffer[bufferSize]; // data buffer
+     char             opened;             // open/close state of stream
+     int              mode;               // I/O mode
+
+     int flush_buffer();
+ public:
+     // file and mode are initialized as well as opened: underflow() and
+     // overflow() read mode before they check opened, so leaving it
+     // indeterminate on a never-opened buffer would be undefined behaviour.
+     gzstreambuf() : file(0), opened(0), mode(0) {
+         // Put area: one slot is held back so overflow() can store its
+         // argument before flushing.
+         setp( buffer, buffer + (bufferSize-1));
+         setg( buffer + 4,     // beginning of putback area
+               buffer + 4,     // read position
+               buffer + 4);    // end position
+         // ASSERT: both input & output capabilities will not be used together
+     }
+     int is_open() { return opened; }
+     gzstreambuf* open( const char* name, int open_mode);
+     gzstreambuf* close();
+     ~gzstreambuf() { close(); }
+
+     virtual int     overflow( int c = EOF);
+     virtual int     underflow();
+     virtual int     sync();
+ };
+
+ // Common base of igzstream and ogzstream: owns the gzstreambuf and exposes
+ // open()/close(), which report failure by setting badbit on the stream.
+ class gzstreambase : virtual public std::ios {
+ protected:
+ gzstreambuf buf;
+ public:
+ // Attaches buf to the stream without opening a file.
+ gzstreambase() { init(&buf); }
+ gzstreambase( const char* name, int open_mode);
+ ~gzstreambase();
+ void open( const char* name, int open_mode);
+ void close();
+ // Returns the concrete buffer type rather than std::streambuf*.
+ gzstreambuf* rdbuf() { return &buf; }
+ };
+
+// ----------------------------------------------------------------------------
+// User classes. Use igzstream and ogzstream analogously to ifstream and
+// ofstream respectively. They read and write files based on the gz*
+// function interface of the zlib. Files are compatible with gzip compression.
+// ----------------------------------------------------------------------------
+
+ // Input stream over a gzip file; the mode defaults to std::ios::in.
+ class igzstream : public gzstreambase, public std::istream {
+ public:
+ igzstream() : std::istream( &buf) {}
+ igzstream( const char* name, int open_mode = std::ios::in)
+ : gzstreambase( name, open_mode), std::istream( &buf) {}
+ // Both bases declare rdbuf(); forward explicitly to the gzstreambase one.
+ gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
+ void open( const char* name, int open_mode = std::ios::in) {
+ gzstreambase::open( name, open_mode);
+ }
+ };
+
+ // Output stream over a gzip file; the mode defaults to std::ios::out.
+ class ogzstream : public gzstreambase, public std::ostream {
+ public:
+ ogzstream() : std::ostream( &buf) {}
+ ogzstream( const char* name, int mode = std::ios::out)
+ : gzstreambase( name, mode), std::ostream( &buf) {}
+ // Both bases declare rdbuf(); forward explicitly to the gzstreambase one.
+ gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
+ void open( const char* name, int open_mode = std::ios::out) {
+ gzstreambase::open( name, open_mode);
+ }
+ };
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+#endif // GZSTREAM_H
+// ============================================================================
+// EOF //
+
diff --git a/decoder/hg.cc b/decoder/hg.cc
new file mode 100644
index 00000000..b56f1246
--- /dev/null
+++ b/decoder/hg.cc
@@ -0,0 +1,588 @@
+#include "hg.h"
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <set>
+#include <map>
+#include <iostream>
+
+#include "viterbi.h"
+#include "inside_outside.h"
+#include "tdict.h"
+
+using namespace std;
+
+ // Returns the result of the Inside algorithm run over this hypergraph with
+ // TransitionCountWeightFunction as the edge weight, as a double.
+ double Hypergraph::NumberOfPaths() const {
+   const double path_count = Inside<double, TransitionCountWeightFunction>(*this);
+   return path_count;
+ }
+
+ // Weight function mapping an edge to a sparse vector with a single entry:
+ // key = the edge's id_, value = the edge's edge_prob_ raised to the power
+ // scale_. Used by ComputeEdgePosteriors.
+ struct ScaledTransitionEventWeightFunction {
+ ScaledTransitionEventWeightFunction(double alpha) : scale_(alpha) {}
+ inline SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const {
+ SparseVector<prob_t> result;
+ result.set_value(e.id_, e.edge_prob_.pow(scale_));
+ return result;
+ }
+ const double scale_;
+ };
+
+ // Wrapper around prob_t whose "addition" keeps the larger of the two values
+ // and whose "multiplication" multiplies them, so that running the
+ // inside/outside code with this type yields best-path scores.
+ struct TropicalValue {
+ TropicalValue() : v_() {}
+ // Only the values 0 and 1 are accepted; they map to prob_t::Zero() and
+ // prob_t::One(). Any other value aborts the process.
+ explicit TropicalValue(int v) {
+ if (v == 0) v_ = prob_t::Zero();
+ else if (v == 1) v_ = prob_t::One();
+ else { cerr << "Bad value in TropicalValue(int).\n"; abort(); }
+ }
+ explicit TropicalValue(const prob_t& v) : v_(v) {}
+ // "Sum" = max of the two values.
+ inline TropicalValue& operator+=(const TropicalValue& o) {
+ if (v_ < o.v_) v_ = o.v_;
+ return *this;
+ }
+ // "Product" = ordinary product of the wrapped values.
+ inline TropicalValue& operator*=(const TropicalValue& o) {
+ v_ *= o.v_;
+ return *this;
+ }
+ inline bool operator==(const TropicalValue& o) const { return v_ == o.v_; }
+ prob_t v_;
+ };
+
+ // Weight function that wraps an edge's edge_prob_ in a TropicalValue.
+ struct ViterbiWeightFunction {
+ inline TropicalValue operator()(const Hypergraph::Edge& e) const {
+ return TropicalValue(e.edge_prob_);
+ }
+ };
+
+ // Weight function mapping an edge to a sparse vector with a single entry:
+ // key = the edge's id_, value = its edge_prob_ wrapped in a TropicalValue.
+ // Used by ComputeBestPathThroughEdges.
+ struct ViterbiTransitionEventWeightFunction {
+ inline SparseVector<TropicalValue> operator()(const Hypergraph::Edge& e) const {
+ SparseVector<TropicalValue> result;
+ result.set_value(e.id_, TropicalValue(e.edge_prob_));
+ return result;
+ }
+ };
+
+
+ // Runs inside-outside with edge weights edge_prob_^scale and stores, for
+ // every edge i, the resulting per-edge value in (*posts)[i]. posts is resized
+ // to the number of edges. The values are whatever InsideOutside accumulates
+ // under the edge's id; they are not divided by the returned total here.
+ // Returns the value computed by InsideOutside for the whole hypergraph.
+ prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector<prob_t>* posts) const {
+   const ScaledEdgeProb weight(scale);
+   const ScaledTransitionEventWeightFunction w2(scale);
+   SparseVector<prob_t> pv;
+   // Keep the total as prob_t: squeezing it through a plain double (as was
+   // done previously) can underflow to zero or overflow on large forests.
+   const prob_t inside = InsideOutside<prob_t,
+                   ScaledEdgeProb,
+                   SparseVector<prob_t>,
+                   ScaledTransitionEventWeightFunction>(*this, &pv, weight, w2);
+   posts->resize(edges_.size());
+   for (int i = 0; i < edges_.size(); ++i)
+     (*posts)[i] = prob_t(pv.value(i));
+   return inside;
+ }
+
+ // Runs inside-outside over TropicalValue (max instead of sum) and stores the
+ // resulting per-edge score in (*post)[i] for every edge i; post is resized to
+ // the number of edges. Returns the score InsideOutside computes for the
+ // whole hypergraph.
+ prob_t Hypergraph::ComputeBestPathThroughEdges(vector<prob_t>* post) const {
+   typedef SparseVector<TropicalValue> EdgeScores;
+   EdgeScores best_through_edge;
+   const TropicalValue viterbi_weight =
+       InsideOutside<TropicalValue,
+                     ViterbiWeightFunction,
+                     EdgeScores,
+                     ViterbiTransitionEventWeightFunction>(*this, &best_through_edge);
+   post->resize(edges_.size());
+   for (int e = 0; e < edges_.size(); ++e) {
+     (*post)[e] = best_through_edge.value(e).v_;
+   }
+   return viterbi_weight.v_;
+ }
+
+ // Replaces each edge's edge_prob_ by its value from ComputeEdgePosteriors
+ // divided by the sum of those values over all out-edges of the same node,
+ // node by node.
+ void Hypergraph::PushWeightsToSource(double scale) {
+   vector<prob_t> posts;
+   ComputeEdgePosteriors(scale, &posts);
+   for (int ni = 0; ni < nodes_.size(); ++ni) {
+     const vector<int>& outgoing = nodes_[ni].out_edges_;
+     // Normalizer: total over this node's outgoing edges.
+     prob_t z = prob_t::Zero();
+     for (int k = 0; k < outgoing.size(); ++k)
+       z += posts[outgoing[k]];
+     for (int k = 0; k < outgoing.size(); ++k) {
+       const int ei = outgoing[k];
+       edges_[ei].edge_prob_ = posts[ei] / z;
+     }
+   }
+ }
+
+ // Replaces each edge's edge_prob_ by its value from ComputeEdgePosteriors
+ // divided by the sum of those values over all in-edges of the same node,
+ // node by node.
+ void Hypergraph::PushWeightsToGoal(double scale) {
+   vector<prob_t> posts;
+   ComputeEdgePosteriors(scale, &posts);
+   for (int ni = 0; ni < nodes_.size(); ++ni) {
+     const vector<int>& incoming = nodes_[ni].in_edges_;
+     // Normalizer: total over this node's incoming edges.
+     prob_t z = prob_t::Zero();
+     for (int k = 0; k < incoming.size(); ++k)
+       z += posts[incoming[k]];
+     for (int k = 0; k < incoming.size(); ++k) {
+       const int ei = incoming[k];
+       edges_[ei].edge_prob_ = posts[ei] / z;
+     }
+   }
+ }
+
+ // Weight function returning 0.0 for edges flagged in the prune mask and 1.0
+ // for all others. Only a reference to the mask is stored, so the vector must
+ // outlive this object.
+ struct EdgeExistsWeightFunction {
+ EdgeExistsWeightFunction(const std::vector<bool>& prunes) : prunes_(prunes) {}
+ double operator()(const Hypergraph::Edge& edge) const {
+ return (prunes_[edge.id_] ? 0.0 : 1.0);
+ }
+ private:
+ const vector<bool>& prunes_;
+ };
+
+ // Removes every edge i for which prune_edge[i] is true, then re-sorts the
+ // hypergraph from the last node (treated as the goal), which also drops
+ // whatever becomes unreachable. prune_edge must have one entry per edge.
+ // When run_inside_algorithm is true, edges with a tail node that has no
+ // remaining derivation are pruned as well, and if the goal itself is no
+ // longer derivable the hypergraph is reset to a single default node with no
+ // edges.
+ void Hypergraph::PruneEdges(const std::vector<bool>& prune_edge, bool run_inside_algorithm) {
+ assert(prune_edge.size() == edges_.size());
+ vector<bool> filtered = prune_edge;
+
+ if (run_inside_algorithm) {
+ const EdgeExistsWeightFunction wf(prune_edge);
+ // use double, not bool since vector<bool> causes problems with the Inside algorithm.
+ // I don't know a good c++ way to resolve this short of template specialization which
+ // I dislike. If you know of a better way that doesn't involve specialization,
+ // fix this!
+ vector<double> reachable;
+ // With 0/1 edge weights a positive inside score means at least one
+ // derivation survives the pruning.
+ bool goal_derivable = (0 < Inside<double, EdgeExistsWeightFunction>(*this, &reachable, wf));
+ if (!goal_derivable) {
+ edges_.clear();
+ nodes_.clear();
+ nodes_.push_back(Node());
+ return;
+ }
+
+ assert(reachable.size() == nodes_.size());
+ for (int i = 0; i < edges_.size(); ++i) {
+ bool prune = prune_edge[i];
+ if (!prune) {
+ // An unpruned edge is still useless if any of its tails has a zero
+ // inside score.
+ const Edge& edge = edges_[i];
+ for (int j = 0; j < edge.tail_nodes_.size(); ++j) {
+ if (!reachable[edge.tail_nodes_[j]]) {
+ prune = true;
+ break;
+ }
+ }
+ }
+ filtered[i] = prune;
+ }
+ }
+
+ TopologicallySortNodesAndEdges(nodes_.size() - 1, &filtered);
+ }
+
+ // Prunes the hypergraph down to roughly `density` edges per edge-length unit
+ // of the Viterbi path (as reported by ViterbiPathLength).
+ //   scale                  exponent applied to edge weights when
+ //                          use_sum_prod_semiring is true
+ //   use_sum_prod_semiring  true: score edges with ComputeEdgePosteriors;
+ //                          false: score them with ComputeBestPathThroughEdges
+ //   density                must be >= 1.0
+ //   preserve_mask          optional; edges flagged true are never pruned
+ // Edges scoring strictly below the rnum-th best score are removed via
+ // PruneEdges; nothing happens when the edge count is already within budget.
+ void Hypergraph::DensityPruneInsideOutside(const double scale,
+                                            const bool use_sum_prod_semiring,
+                                            const double density,
+                                            const vector<bool>* preserve_mask) {
+   assert(density >= 1.0);
+   const int plen = ViterbiPathLength(*this);
+   const int num_edges = edges_.size();
+   const int rnum = min(num_edges, static_cast<int>(density * static_cast<double>(plen)));
+   if (rnum == num_edges) {
+     cerr << "No pruning required: density already sufficient\n";
+     return;
+   }
+   vector<prob_t> io(edges_.size());
+   if (use_sum_prod_semiring)
+     ComputeEdgePosteriors(scale, &io);
+   else
+     ComputeBestPathThroughEdges(&io);
+   assert(edges_.size() == io.size());
+   // Partially order a copy so that position rnum holds the rnum-th best score.
+   vector<prob_t> sorted = io;
+   nth_element(sorted.begin(), sorted.begin() + rnum, sorted.end(), greater<prob_t>());
+   // Keep the cutoff as prob_t (as BeamPruneInsideOutside does): narrowing it
+   // to double could underflow to zero and silently disable pruning.
+   const prob_t cutoff = sorted[rnum];
+   vector<bool> prune(edges_.size());
+   for (int i = 0; i < num_edges; ++i) {
+     prune[i] = (io[i] < cutoff);
+     if (preserve_mask && (*preserve_mask)[i]) prune[i] = false;
+   }
+   PruneEdges(prune);
+ }
+
+ // Beam-prunes the hypergraph: every edge whose score is below
+ // best_score * exp(-alpha) is removed via PruneEdges.
+ //   scale                  exponent applied to edge weights when
+ //                          use_sum_prod_semiring is true; must be > 0
+ //   use_sum_prod_semiring  true: score edges with ComputeEdgePosteriors;
+ //                          false: score them with ComputeBestPathThroughEdges
+ //   alpha                  beam width; must be > 0
+ //   preserve_mask          optional; edges flagged true are never pruned
+ void Hypergraph::BeamPruneInsideOutside(
+     const double scale,
+     const bool use_sum_prod_semiring,
+     const double alpha,
+     const vector<bool>* preserve_mask) {
+   assert(alpha > 0.0);
+   assert(scale > 0.0);
+   vector<prob_t> io(edges_.size());
+   if (use_sum_prod_semiring)
+     ComputeEdgePosteriors(scale, &io);
+   else
+     ComputeBestPathThroughEdges(&io);
+   assert(edges_.size() == io.size());
+   prob_t best;  // initializes to zero
+   for (int i = 0; i < io.size(); ++i)
+     if (io[i] > best) best = io[i];
+   const prob_t aprob(exp(-alpha));
+   const prob_t cutoff = best * aprob;
+   vector<bool> prune(edges_.size());
+   for (int i = 0; i < io.size(); ++i) {
+     const bool below_beam = (io[i] < cutoff);
+     const bool preserved = preserve_mask && (*preserve_mask)[i];
+     prune[i] = below_beam && !preserved;
+   }
+   PruneEdges(prune);
+ }
+
+ // Writes the hypergraph to stderr as a Graphviz digraph. Each edge becomes a
+ // rectangular box A_<n> labelled with its rule, probability and features,
+ // with arrows from its tail nodes into the box and from the box to its head
+ // node. When an edge has more than one tail, each tail arrow is labelled with
+ // a number derived from the rule's e_ side.
+ void Hypergraph::PrintGraphviz() const {
+   int ei = 0;
+   cerr << "digraph G {\n rankdir=LR;\n nodesep=.05;\n";
+   for (vector<Edge>::const_iterator it = edges_.begin();
+        it != edges_.end(); ++it) {
+     const Edge& edge = *it;
+     ++ei;
+     static const string none = "<null>";
+     string rule = (edge.rule_ ? edge.rule_->AsString(false) : none);
+
+     cerr << " A_" << ei << " [label=\"" << rule << " p=" << edge.edge_prob_
+          << " F:" << edge.feature_values_
+          << "\" shape=\"rect\"];\n";
+     Hypergraph::TailNodeVector indorder(edge.tail_nodes_.size(), 0);
+     // The rule may be absent (it is printed as "<null>" above), so only read
+     // its e_ side when it exists, and never write past the number of tails.
+     if (edge.rule_) {
+       int ntc = 0;
+       for (int k = 0; k < edge.rule_->e_.size() && ntc < edge.tail_nodes_.size(); ++k) {
+         // Non-positive e_ entries are taken to be variable references.
+         if (edge.rule_->e_[k] <= 0) indorder[ntc++] = 1 + (-1 * edge.rule_->e_[k]);
+       }
+     }
+     for (int t = 0; t < edge.tail_nodes_.size(); ++t) {
+       cerr << " " << edge.tail_nodes_[t] << " -> A_" << ei;
+       if (edge.tail_nodes_.size() > 1) {
+         cerr << " [label=\"" << indorder[t] << "\"]";
+       }
+       cerr << ";\n";
+     }
+     cerr << " A_" << ei << " -> " << edge.head_node_ << ";\n";
+   }
+   for (vector<Node>::const_iterator ni = nodes_.begin();
+        ni != nodes_.end(); ++ni) {
+     // Negative cat_ values are printed by name; others get an empty label.
+     cerr << " " << ni->id_ << "[label=\"" << (ni->cat_ < 0 ? TD::Convert(ni->cat_ * -1) : "")
+          << " n=" << ni->id_
+          << "\"];\n";
+   }
+   cerr << "}\n";
+ }
+
+ // Merges `other` into this hypergraph. The last node of each hypergraph is
+ // treated as its goal; other's goal is identified with this one's, all of
+ // other's remaining nodes and all of its edges are appended (with indices
+ // shifted), and the result is topologically re-sorted.
+ // Unioning with itself or with an empty hypergraph is a no-op; if this
+ // hypergraph is empty it simply becomes a copy of other's nodes and edges.
+ void Hypergraph::Union(const Hypergraph& other) {
+   if (&other == this) return;
+   // Nothing to merge. Without this guard ogoal would be -1 and the resize
+   // below would drop this hypergraph's own goal node.
+   if (other.nodes_.empty()) return;
+   if (nodes_.empty()) { nodes_ = other.nodes_; edges_ = other.edges_; return; }
+   int noff = nodes_.size();
+   int eoff = edges_.size();
+   int ogoal = other.nodes_.size() - 1;
+   int cgoal = noff - 1;
+   // keep a single goal node, so add nodes.size - 1
+   nodes_.resize(nodes_.size() + ogoal);
+   // add all edges
+   edges_.resize(edges_.size() + other.edges_.size());
+
+   for (int i = 0; i < ogoal; ++i) {
+     const Node& on = other.nodes_[i];
+     Node& cn = nodes_[i + noff];
+     cn.id_ = i + noff;
+     // Carry the category over, as CreateViterbiHypergraph does.
+     cn.cat_ = on.cat_;
+     cn.in_edges_.resize(on.in_edges_.size());
+     for (int j = 0; j < on.in_edges_.size(); ++j)
+       cn.in_edges_[j] = on.in_edges_[j] + eoff;
+
+     cn.out_edges_.resize(on.out_edges_.size());
+     for (int j = 0; j < on.out_edges_.size(); ++j)
+       cn.out_edges_[j] = on.out_edges_[j] + eoff;
+   }
+
+   for (int i = 0; i < other.edges_.size(); ++i) {
+     const Edge& oe = other.edges_[i];
+     Edge& ce = edges_[i + eoff];
+     ce.id_ = i + eoff;
+     ce.rule_ = oe.rule_;
+     ce.feature_values_ = oe.feature_values_;
+     // Keep the edge score and span information instead of leaving the
+     // merged edge with default values.
+     ce.edge_prob_ = oe.edge_prob_;
+     ce.i_ = oe.i_;
+     ce.j_ = oe.j_;
+     ce.prev_i_ = oe.prev_i_;
+     ce.prev_j_ = oe.prev_j_;
+     if (oe.head_node_ == ogoal) {
+       // Edges into other's goal are redirected to the shared goal node.
+       ce.head_node_ = cgoal;
+       nodes_[cgoal].in_edges_.push_back(ce.id_);
+     } else {
+       ce.head_node_ = oe.head_node_ + noff;
+     }
+     ce.tail_nodes_.resize(oe.tail_nodes_.size());
+     for (int j = 0; j < oe.tail_nodes_.size(); ++j)
+       ce.tail_nodes_[j] = oe.tail_nodes_[j] + noff;
+   }
+
+   TopologicallySortNodesAndEdges(cgoal);
+ }
+
+ // Re-sorts the hypergraph from goal_node_id without an edge prune mask; the
+ // sort discards every node it does not reach from that goal.
+ void Hypergraph::PruneUnreachable(int goal_node_id) {
+ TopologicallySortNodesAndEdges(goal_node_id, NULL);
+ }
+
+ // NOT IMPLEMENTED: validates goal_node_id and then aborts the process.
+ // A negative goal_node_id is interpreted relative to the end of nodes_
+ // (e.g. -1 is the last node).
+ void Hypergraph::RemoveNoncoaccessibleStates(int goal_node_id) {
+ if (goal_node_id < 0) goal_node_id += nodes_.size();
+ assert(goal_node_id >= 0);
+ assert(goal_node_id < nodes_.size());
+
+ // TODO finish implementation
+ abort();
+ }
+
+ // Saved position of the iterative depth-first search in
+ // TopologicallySortNodesAndEdges: the node being expanded, the index into
+ // that node's in_edges_, and the index into that edge's tail_nodes_ at which
+ // to resume.
+ struct DFSContext {
+ int node;
+ int edge_iter;
+ int tail_iter;
+ DFSContext(int n, int e, int t) : node(n), edge_iter(e), tail_iter(t) {}
+ };
+
+ // DFS node states: WHITE = not yet visited, GRAY = being expanded,
+ // BLACK = finished.
+ enum ColorType { WHITE, GRAY, BLACK };
+
+ // Predicate that is true for objects whose id_ is -1, i.e. those marked for
+ // removal by TopologicallySortNodesAndEdges.
+ template <class T>
+ struct BadId {
+ bool operator()(const T& obj) const { return obj.id_ == -1; }
+ };
+
+ // Orders objects by ascending id_.
+ template <class T>
+ struct IdCompare {
+ bool operator()(const T& a, const T& b) { return a.id_ < b.id_; }
+ };
+
+ // Renumbers nodes in the order an iterative depth-first search from
+ // goal_index finishes them (following in-edges to their tail nodes), so
+ // every tail node precedes the head of any edge it feeds and the goal ends
+ // up last. Nodes the search never reaches are deleted, as are edges that are
+ // flagged in prune_edges (when given) or that only enter unreached nodes.
+ // Unless HG_EDGES_TOPO_SORTED is defined, surviving edges keep their
+ // original relative order and are merely renumbered compactly.
+ // Aborts the process if a cycle is found.
+ void Hypergraph::TopologicallySortNodesAndEdges(int goal_index,
+ const vector<bool>* prune_edges) {
+ // figure out which nodes are reachable from the goal
+ // reloc_node / reloc_edge map old index -> new index; -1 means "remove".
+ vector<int> reloc_node(nodes_.size(), -1);
+ vector<int> reloc_edge(edges_.size(), -1);
+ vector<ColorType> color(nodes_.size(), WHITE);
+ vector<DFSContext> stack;
+ stack.reserve(nodes_.size());
+ stack.push_back(DFSContext(goal_index, 0, 0));
+ int node_count = 0;
+ int edge_count = 0;
+ while(!stack.empty()) {
+ // Resume the most recently suspended node; its fields are copied out
+ // before the entry is popped.
+ const DFSContext& p = stack.back();
+ int cur_ni = p.node;
+ int edge_i = p.edge_iter;
+ int tail_i = p.tail_iter;
+ stack.pop_back();
+ const Node* cur_node = &nodes_[cur_ni];
+ int edge_end = cur_node->in_edges_.size();
+ while (edge_i != edge_end) {
+ const Edge& cur_edge = edges_[cur_node->in_edges_[edge_i]];
+ const int tail_end = cur_edge.tail_nodes_.size();
+ // Move to the next in-edge once all tails are handled, or right away
+ // if this edge is pruned.
+ if ((tail_end == tail_i) || (prune_edges && (*prune_edges)[cur_edge.id_])) {
+ ++edge_i;
+ tail_i = 0;
+ continue;
+ }
+ const int tail_ni = cur_edge.tail_nodes_[tail_i];
+ const int tail_color = color[tail_ni];
+ if (tail_color == WHITE) {
+ // Descend: remember where to continue in the current node, then
+ // start expanding the tail node.
+ stack.push_back(DFSContext(cur_ni, edge_i, ++tail_i));
+ cur_ni = tail_ni;
+ cur_node = &nodes_[cur_ni];
+ color[cur_ni] = GRAY;
+ edge_i = 0;
+ edge_end = cur_node->in_edges_.size();
+ tail_i = 0;
+ } else if (tail_color == BLACK) {
+ ++tail_i;
+ } else if (tail_color == GRAY) {
+ // this can happen if, e.g., it is possible to rederive
+ // a single cell in the CKY chart via a cycle.
+ cerr << "Detected forbidden cycle in HG:\n";
+ cerr << " " << cur_edge.rule_->AsString() << endl;
+ while(!stack.empty()) {
+ const DFSContext& p = stack.back();
+ cerr << " " << edges_[nodes_[p.node].in_edges_[p.edge_iter]].rule_->AsString() << endl;
+ stack.pop_back();
+ }
+ abort();
+ }
+ }
+ // All in-edges of cur_ni are done: finish the node and give it, and its
+ // unpruned in-edges, the next free indices.
+ color[cur_ni] = BLACK;
+ reloc_node[cur_ni] = node_count++;
+ if (prune_edges) {
+ for (int i = 0; i < edge_end; ++i) {
+ int ei = cur_node->in_edges_[i];
+ if (!(*prune_edges)[ei])
+ reloc_edge[cur_node->in_edges_[i]] = edge_count++;
+ }
+ } else {
+ for (int i = 0; i < edge_end; ++i)
+ reloc_edge[cur_node->in_edges_[i]] = edge_count++;
+ }
+ }
+ #ifndef HG_EDGES_TOPO_SORTED
+ // Discard the DFS-order edge numbers: renumber the surviving edges in
+ // their original order instead.
+ int ec = 0;
+ for (int i = 0; i < reloc_edge.size(); ++i) {
+ int& cp = reloc_edge[i];
+ if (cp >= 0) { cp = ec++; }
+ }
+ #endif
+
+ #if 0
+ cerr << "TOPO:";
+ for (int i = 0; i < reloc_node.size(); ++i)
+ cerr << " " << reloc_node[i];
+ cerr << endl;
+ cerr << "EDGE:";
+ for (int i = 0; i < reloc_edge.size(); ++i)
+ cerr << " " << reloc_edge[i];
+ cerr << endl;
+ #endif
+ // Nothing to do if every node and edge keeps its index.
+ bool no_op = true;
+ for (int i = 0; i < reloc_node.size() && no_op; ++i)
+ if (reloc_node[i] != i) no_op = false;
+ for (int i = 0; i < reloc_edge.size() && no_op; ++i)
+ if (reloc_edge[i] != i) no_op = false;
+ if (no_op) return;
+ // Rewrite ids and edge references in place; references to removed edges
+ // are dropped from the in/out lists.
+ for (int i = 0; i < reloc_node.size(); ++i) {
+ Node& node = nodes_[i];
+ node.id_ = reloc_node[i];
+ int c = 0;
+ for (int j = 0; j < node.in_edges_.size(); ++j) {
+ const int new_index = reloc_edge[node.in_edges_[j]];
+ if (new_index >= 0)
+ node.in_edges_[c++] = new_index;
+ }
+ node.in_edges_.resize(c);
+ c = 0;
+ for (int j = 0; j < node.out_edges_.size(); ++j) {
+ const int new_index = reloc_edge[node.out_edges_[j]];
+ if (new_index >= 0)
+ node.out_edges_[c++] = new_index;
+ }
+ node.out_edges_.resize(c);
+ }
+ for (int i = 0; i < reloc_edge.size(); ++i) {
+ Edge& edge = edges_[i];
+ edge.id_ = reloc_edge[i];
+ edge.head_node_ = reloc_node[edge.head_node_];
+ for (int j = 0; j < edge.tail_nodes_.size(); ++j)
+ edge.tail_nodes_[j] = reloc_node[edge.tail_nodes_[j]];
+ }
+ // Physically remove everything whose new id is -1, then put the survivors
+ // at the positions their new ids name.
+ edges_.erase(remove_if(edges_.begin(), edges_.end(), BadId<Edge>()), edges_.end());
+ nodes_.erase(remove_if(nodes_.begin(), nodes_.end(), BadId<Node>()), nodes_.end());
+ sort(nodes_.begin(), nodes_.end(), IdCompare<Node>());
+ #ifndef HG_EDGES_TOPO_SORTED
+ sort(edges_.begin(), edges_.end(), IdCompare<Edge>());
+ #endif
+ }
+
+ // Shared rule objects, created on the first call to EpsilonRemove.
+ TRulePtr Hypergraph::kEPSRule;
+ TRulePtr Hypergraph::kUnaryRule;
+
+ // Removes edges that have no tails and whose rule's f_ side is exactly the
+ // single symbol `eps`. Nodes left without in-edges then have their out-edges
+ // rewritten: arity-2 edges become unary edges over the other tail, all
+ // others become new <eps> edges, in which case the procedure recurses until
+ // no new <eps> edge is created.
+ // NOTE(review): edges rewritten to kEPSRule all share one rule object whose
+ // f_[0]/e_[0] are overwritten with `eps` on each rewrite - confirm callers
+ // never mix different eps symbols.
+ void Hypergraph::EpsilonRemove(WordID eps) {
+ if (!kEPSRule) {
+ kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>"));
+ kUnaryRule.reset(new TRule("[X] ||| [X,1] ||| [X,1]"));
+ }
+ // Pass 1: mark epsilon edges for deletion, moving their features onto the
+ // head node's out-edges where that is possible.
+ vector<bool> kill(edges_.size(), false);
+ for (int i = 0; i < edges_.size(); ++i) {
+ const Edge& edge = edges_[i];
+ if (edge.tail_nodes_.empty() &&
+ edge.rule_->f_.size() == 1 &&
+ edge.rule_->f_[0] == eps) {
+ kill[i] = true;
+ if (!edge.feature_values_.empty()) {
+ Node& node = nodes_[edge.head_node_];
+ if (node.in_edges_.size() != 1) {
+ cerr << "[WARNING] <eps> edge with features going into non-empty node - can't promote\n";
+ // this *probably* means that there are multiple derivations of the
+ // same sequence via different paths through the input forest
+ // this needs to be investigated and fixed
+ } else {
+ for (int j = 0; j < node.out_edges_.size(); ++j)
+ edges_[node.out_edges_[j]].feature_values_ += edge.feature_values_;
+ // cerr << "PROMOTED " << edge.feature_values_ << endl;
+ }
+ }
+ }
+ }
+ bool created_eps = false;
+ PruneEdges(kill);
+ // Pass 2: repair edges that consume a node which no longer has any
+ // incoming edge.
+ for (int i = 0; i < nodes_.size(); ++i) {
+ const Node& node = nodes_[i];
+ if (node.in_edges_.empty()) {
+ for (int j = 0; j < node.out_edges_.size(); ++j) {
+ Edge& edge = edges_[node.out_edges_[j]];
+ if (edge.rule_->Arity() == 2) {
+ assert(edge.rule_->f_.size() == 2);
+ assert(edge.rule_->e_.size() == 2);
+ edge.rule_ = kUnaryRule;
+ int cur = node.id_;
+ int t = -1;
+ assert(edge.tail_nodes_.size() == 2);
+ // Keep the tail that is not the emptied node.
+ for (int i = 0; i < 2; ++i) if (edge.tail_nodes_[i] != cur) { t = edge.tail_nodes_[i]; }
+ assert(t != -1);
+ edge.tail_nodes_.resize(1);
+ edge.tail_nodes_[0] = t;
+ } else {
+ edge.rule_ = kEPSRule;
+ edge.rule_->f_[0] = eps;
+ edge.rule_->e_[0] = eps;
+ edge.tail_nodes_.clear();
+ created_eps = true;
+ }
+ }
+ }
+ }
+ // Pruning with an all-false mask removes no edge by itself but re-sorts the
+ // graph and drops whatever is no longer reachable or derivable.
+ vector<bool> k2(edges_.size(), false);
+ PruneEdges(k2);
+ if (created_eps) EpsilonRemove(eps);
+ }
+
+ // Comparator over edge indices: a comes before b when edge a has the larger
+ // edge_prob_. Holds a reference to the hypergraph whose edges_ it indexes.
+ struct EdgeWeightSorter {
+ const Hypergraph& hg;
+ EdgeWeightSorter(const Hypergraph& h) : hg(h) {}
+ bool operator()(int a, int b) const {
+ return hg.edges_[a].edge_prob_ > hg.edges_[b].edge_prob_;
+ }
+ };
+
+ // Reorders every node's in_edges_ list so that edges with a larger
+ // edge_prob_ come first.
+ void Hypergraph::SortInEdgesByEdgeWeights() {
+   const EdgeWeightSorter by_descending_prob(*this);
+   for (vector<Node>::iterator n = nodes_.begin(); n != nodes_.end(); ++n) {
+     sort(n->in_edges_.begin(), n->in_edges_.end(), by_descending_prob);
+   }
+ }
+
+ // Builds a new hypergraph containing only the edges returned by the Viterbi
+ // traversal of this one, together with the nodes those edges touch.
+ // If `edges` is given it must have one entry per edge and is handed to
+ // EdgeSelectEdgeWeightFunction; otherwise plain EdgeProb is used.
+ // The result is allocated with new and returned as a raw pointer, so the
+ // caller is responsible for deleting it.
+ Hypergraph* Hypergraph::CreateViterbiHypergraph(const vector<bool>* edges) const {
+ vector<const Edge*> vit_edges;
+ if (edges) {
+ assert(edges->size() == edges_.size());
+ Viterbi<vector<const Edge*>, ViterbiPathTraversal, prob_t, EdgeSelectEdgeWeightFunction>(*this, &vit_edges, ViterbiPathTraversal(), EdgeSelectEdgeWeightFunction(*edges));
+ } else {
+ Viterbi<vector<const Edge*>, ViterbiPathTraversal, prob_t, EdgeProb>(*this, &vit_edges);
+ }
+ // Number the nodes in the order their head edge appears. The assert
+ // requires each edge's tails to have been numbered already, i.e. it relies
+ // on vit_edges listing an edge after the edges deriving its tails.
+ map<int, int> old2new_node;
+ int num_new_nodes = 0;
+ for (int i = 0; i < vit_edges.size(); ++i) {
+ const Edge& edge = *vit_edges[i];
+ for (int j = 0; j < edge.tail_nodes_.size(); ++j)
+ assert(old2new_node.count(edge.tail_nodes_[j]) > 0);
+ if (old2new_node.count(edge.head_node_) == 0) {
+ old2new_node[edge.head_node_] = num_new_nodes;
+ ++num_new_nodes;
+ }
+ }
+ Hypergraph* out = new Hypergraph(num_new_nodes, vit_edges.size(), is_linear_chain_);
+ // Copy node categories and assign the new ids.
+ for (map<int, int>::iterator it = old2new_node.begin();
+ it != old2new_node.end(); ++it) {
+ const Node& old_node = nodes_[it->first];
+ Node& new_node = out->nodes_[it->second];
+ new_node.cat_ = old_node.cat_;
+ new_node.id_ = it->second;
+ }
+
+ // Copy each selected edge wholesale, then rewrite its id and node
+ // references and register it with its new head and tail nodes.
+ for (int i = 0; i < vit_edges.size(); ++i) {
+ const Edge& old_edge = *vit_edges[i];
+ Edge& new_edge = out->edges_[i];
+ new_edge = old_edge;
+ new_edge.id_ = i;
+ const int new_head_node = old2new_node[old_edge.head_node_];
+ new_edge.head_node_ = new_head_node;
+ out->nodes_[new_head_node].in_edges_.push_back(i);
+ for (int j = 0; j < old_edge.tail_nodes_.size(); ++j) {
+ const int new_tail_node = old2new_node[old_edge.tail_nodes_[j]];
+ new_edge.tail_nodes_[j] = new_tail_node;
+ out->nodes_[new_tail_node].out_edges_.push_back(i);
+ }
+ }
+ return out;
+ }
+
diff --git a/decoder/hg.h b/decoder/hg.h
new file mode 100644
index 00000000..8d056358
--- /dev/null
+++ b/decoder/hg.h
@@ -0,0 +1,247 @@
+#ifndef _HG_H_
+#define _HG_H_
+
+#include <string>
+#include <vector>
+
+#include "small_vector.h"
+#include "sparse_vector.h"
+#include "wordid.h"
+#include "trule.h"
+#include "prob.h"
+
+// if you define this, edges_ will be sorted
+// (normally, just nodes_ are), but this can be quite
+// slow
+#undef HG_EDGES_TOPO_SORTED
+
+// class representing an acyclic hypergraph
+//  - edges have 1 head, 0..n tails
+//  - nodes and edges refer to each other by position in nodes_ / edges_
+class Hypergraph {
+ public:
+  Hypergraph() : is_linear_chain_(false) {}
+
+  // SmallVector is a fast, small vector<int> implementation for sizes <= 2
+  typedef SmallVector TailNodeVector;
+
+  // TODO get rid of cat_?
+  struct Node {
+    Node() : id_(), cat_() {}
+    int id_; // equal to this object's position in the nodes_ vector
+    WordID cat_;  // non-terminal category if <0, 0 if not set
+    std::vector<int> in_edges_;   // contents refer to positions in edges_
+    std::vector<int> out_edges_;  // contents refer to positions in edges_
+  };
+
+  // TODO get rid of edge_prob_? (can be computed on the fly as the dot
+  // product of the weight vector and the feature values)
+  struct Edge {
+    // head_node_ and id_ start at -1 (like the span fields) so that an edge
+    // that has not been wired into a graph yet never carries an
+    // indeterminate index
+    Edge() : head_node_(-1), id_(-1), i_(-1), j_(-1), prev_i_(-1), prev_j_(-1) {}
+    inline int Arity() const { return tail_nodes_.size(); }
+    int head_node_;               // refers to a position in nodes_
+    TailNodeVector tail_nodes_;   // contents refer to positions in nodes_
+    TRulePtr rule_;
+    SparseVector<double> feature_values_;
+    prob_t edge_prob_;             // dot product of weights and feat_values
+    int id_;   // equal to this object's position in the edges_ vector
+
+    // span info. typically, i_ and j_ refer to indices in the source sentence
+    // if a synchronous parse has been executed i_ and j_ will refer to indices
+    // in the target sentence / lattice and prev_i_ prev_j_ will refer to
+    // positions in the source.  Note: it is up to the translator implementation
+    // to properly set these values.  For some models (like the Forest-input
+    // phrase based model) it may not be straightforward to do.  if these values
+    // are not properly set, most things will work but alignment and any features
+    // that depend on them will be broken.
+    short int i_;
+    short int j_;
+    short int prev_i_;
+    short int prev_j_;
+  };
+
+  // exchanges nodes, edges and the linear-chain flag with other
+  void swap(Hypergraph& other) {
+    other.nodes_.swap(nodes_);
+    std::swap(is_linear_chain_, other.is_linear_chain_);
+    other.edges_.swap(edges_);
+  }
+
+  // resizes nodes_ and (re)assigns every node's id_ to its position
+  void ResizeNodes(int size) {
+    nodes_.resize(size);
+    for (int i = 0; i < size; ++i) nodes_[i].id_ = i;
+  }
+
+  // reserves space in the nodes vector to prevent memory locations
+  // from changing (and, if e is non-zero, in the edges vector too)
+  void ReserveNodes(size_t n, size_t e = 0) {
+    nodes_.reserve(n);
+    if (e) edges_.reserve(e);
+  }
+
+  // appends an edge with the given rule and tails and registers it as an
+  // outgoing edge of each tail node; the head is NOT set here (see
+  // ConnectEdgeToHeadNode). The returned pointer refers into edges_.
+  Edge* AddEdge(const TRulePtr& rule, const TailNodeVector& tail) {
+    edges_.push_back(Edge());
+    Edge* edge = &edges_.back();
+    edge->rule_ = rule;
+    edge->tail_nodes_ = tail;
+    edge->id_ = edges_.size() - 1;
+    for (int i = 0; i < edge->tail_nodes_.size(); ++i)
+      nodes_[edge->tail_nodes_[i]].out_edges_.push_back(edge->id_);
+    return edge;
+  }
+
+  // appends a node with category cat; the returned pointer refers into nodes_
+  Node* AddNode(const WordID& cat) {
+    nodes_.push_back(Node());
+    nodes_.back().cat_ = cat;
+    nodes_.back().id_ = nodes_.size() - 1;
+    return &nodes_.back();
+  }
+
+  void ConnectEdgeToHeadNode(const int edge_id, const int head_id) {
+    edges_[edge_id].head_node_ = head_id;
+    nodes_[head_id].in_edges_.push_back(edge_id);
+  }
+
+  // TODO remove this - use the version that takes indices
+  void ConnectEdgeToHeadNode(Edge* edge, Node* head) {
+    edge->head_node_ = head->id_;
+    head->in_edges_.push_back(edge->id_);
+  }
+
+  // merge the goal node from other with this goal node
+  void Union(const Hypergraph& other);
+
+  void PrintGraphviz() const;
+
+  // compute the total number of paths in the forest
+  double NumberOfPaths() const;
+
+  // BEWARE. this assumes that the source and target language
+  // strings are identical and that there are no loops.
+  // It assumes a bunch of other things about where the
+  // epsilons will be.  It tries to assert failure if you
+  // break these assumptions, but it may not.
+  // TODO - make this work
+  void EpsilonRemove(WordID eps);
+
+  // multiply the weights vector by the edge feature vector
+  // (inner product) to set the edge probabilities
+  template <typename V>
+  void Reweight(const V& weights) {
+    for (int i = 0; i < edges_.size(); ++i) {
+      Edge& e = edges_[i];
+      e.edge_prob_.logeq(e.feature_values_.dot(weights));
+    }
+  }
+
+  // computes inside and outside scores for each
+  // edge in the hypergraph
+  // alpha->size = edges_.size = beta->size
+  // returns inside prob of goal node
+  prob_t ComputeEdgePosteriors(double scale,
+                               std::vector<prob_t>* posts) const;
+
+  // find the score of the very best path passing through each edge
+  prob_t ComputeBestPathThroughEdges(std::vector<prob_t>* posts) const;
+
+  // create a new hypergraph consisting only of the nodes / edges
+  // in the Viterbi derivation of this hypergraph
+  // if edges is set, use the EdgeSelectEdgeWeightFunction
+  Hypergraph* CreateViterbiHypergraph(const std::vector<bool>* edges = NULL) const;
+
+  // move weights as near to the source as possible, resulting in a
+  // stochastic automaton.  ONLY FUNCTIONAL FOR *LATTICES*.
+  // See M. Mohri and M. Riley. A Weight Pushing Algorithm for Large
+  //   Vocabulary Speech Recognition. 2001.
+  // the log semiring (NOT tropical) is used
+  void PushWeightsToSource(double scale = 1.0);
+  // same, except weights are pushed to the goal, works for HGs,
+  // not just lattices
+  void PushWeightsToGoal(double scale = 1.0);
+
+  void SortInEdgesByEdgeWeights();
+
+  void PruneUnreachable(int goal_node_id); // DEPRECATED
+
+  void RemoveNoncoaccessibleStates(int goal_node_id = -1);
+
+  // remove edges from the hypergraph if prune_edge[edge_id] is true
+  // TODO need to investigate why this shouldn't be run for the forest trans
+  // case.  To investigate, change false to true and see where ftrans crashes
+  void PruneEdges(const std::vector<bool>& prune_edge, bool run_inside_algorithm = false);
+
+  // if you don't know, use_sum_prod_semiring should be false
+  void DensityPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double density,
+                                 const std::vector<bool>* preserve_mask = NULL);
+
+  // prunes any edge whose score on the best path taking that edge is more than alpha away
+  // from the score of the global best path (or the highest edge posterior)
+  void BeamPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double alpha,
+                              const std::vector<bool>* preserve_mask = NULL);
+
+  // drops all nodes and edges; is_linear_chain_ is left as it was
+  void clear() {
+    nodes_.clear();
+    edges_.clear();
+  }
+
+  inline size_t NumberOfEdges() const { return edges_.size(); }
+  inline size_t NumberOfNodes() const { return nodes_.size(); }
+  inline bool empty() const { return nodes_.empty(); }
+
+  // linear chains can be represented in a number of ways in a hypergraph,
+  // we define them to consist only of lexical translations and monotonic rules
+  inline bool IsLinearChain() const { return is_linear_chain_; }
+  bool is_linear_chain_;
+
+  // nodes_ is sorted in topological order
+  std::vector<Node> nodes_;
+  // edges_ is not guaranteed to be in any particular order
+  std::vector<Edge> edges_;
+
+  // reorder nodes_ so they are in topological order
+  // source nodes at 0 sink nodes at size-1
+  void TopologicallySortNodesAndEdges(int goal_idx,
+                                      const std::vector<bool>* prune_edges = NULL);
+ private:
+  // pre-sized graph: num_nodes default nodes and num_edges default edges
+  Hypergraph(int num_nodes, int num_edges, bool is_lc) : is_linear_chain_(is_lc), nodes_(num_nodes), edges_(num_edges) {}
+
+  static TRulePtr kEPSRule;
+  static TRulePtr kUnaryRule;
+};
+
+// common WeightFunctions, map an edge -> WeightType
+// for generic Viterbi/Inside algorithms
+// weight function: an edge's weight is simply its stored probability
+struct EdgeProb {
+  inline const prob_t& operator()(const Hypergraph::Edge& edge) const {
+    return edge.edge_prob_;
+  }
+};
+
+// weight function driven by a per-edge boolean mask (indexed by edge id):
+// masked-in edges weigh One(), all others Zero(). The mask is held by
+// reference.
+struct EdgeSelectEdgeWeightFunction {
+  EdgeSelectEdgeWeightFunction(const std::vector<bool>& v) : v_(v) {}
+  inline prob_t operator()(const Hypergraph::Edge& e) const {
+    const bool selected = v_[e.id_];
+    if (!selected) return prob_t::Zero();
+    return prob_t::One();
+  }
+ private:
+  const std::vector<bool>& v_;
+};
+
+// weight function: the edge probability raised to the power alpha
+struct ScaledEdgeProb {
+  ScaledEdgeProb(const double& alpha) : alpha_(alpha) {}
+  inline prob_t operator()(const Hypergraph::Edge& e) const {
+    const prob_t& base = e.edge_prob_;
+    return base.pow(alpha_);
+  }
+  const double alpha_;
+};
+
+// see Li (2010), Section 3.2.2-- this is 'x_e = p_e*r_e'
+// i.e. every feature value of the edge, multiplied by the edge probability
+struct EdgeFeaturesAndProbWeightFunction {
+  inline const SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const {
+    typedef SparseVector<double>::const_iterator FeatIter;
+    SparseVector<prob_t> scaled;
+    FeatIter cur = e.feature_values_.begin();
+    while (cur != e.feature_values_.end()) {
+      scaled.set_value(cur->first, prob_t(cur->second) * e.edge_prob_);
+      ++cur;
+    }
+    return scaled;
+  }
+};
+
+// weight function that ignores the edge and always yields 1.0
+struct TransitionCountWeightFunction {
+  inline double operator()(const Hypergraph::Edge& /*unused*/) const {
+    return 1.0;
+  }
+};
+
+#endif
diff --git a/decoder/hg_intersect.cc b/decoder/hg_intersect.cc
new file mode 100644
index 00000000..02ff752e
--- /dev/null
+++ b/decoder/hg_intersect.cc
@@ -0,0 +1,160 @@
+#include "hg_intersect.h"
+
+#include <vector>
+#include <tr1/unordered_map>
+#include <boost/lexical_cast.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "tdict.h"
+#include "hg.h"
+#include "trule.h"
+#include "wordid.h"
+#include "bottom_up_parser.h"
+
+using boost::lexical_cast;
+using namespace std::tr1;
+using namespace std;
+
+// Predicate used to discard rules before intersecting with a target.
+// The constructor records every word sequence of up to max_phrase_size
+// words that occurs in the target (plus "<s>"); operator() returns true
+// (= remove the rule) when some run of terminals on the rule's e side was
+// not recorded. If any target position has more than one arc, nothing is
+// ever filtered.
+struct RuleFilter {
+  unordered_map<vector<WordID>, bool, boost::hash<vector<WordID> > > exists_;
+  bool true_lattice;
+  RuleFilter(const Lattice& target, int max_phrase_size) {
+    true_lattice = false;
+    const int target_len = static_cast<int>(target.size());
+    for (int start = 0; start < target.size(); ++start) {
+      vector<WordID> ngram;
+      const int stop = min(target_len, start + max_phrase_size);
+      for (int pos = start; pos < stop; ++pos) {
+        if (target[pos].size() > 1) {
+          // more than one alternative here: this is a real lattice
+          true_lattice = true;
+          break;
+        }
+        ngram.push_back(target[pos][0].label);
+        exists_[ngram] = true;
+      }
+    }
+    const vector<WordID> start_symbol(1, TD::Convert("<s>"));
+    exists_[start_symbol] = true;
+  }
+  bool operator()(const TRule& r) const {
+    // TODO do some smarter filtering for lattices
+    if (true_lattice) return false;  // don't filter "true lattice" input
+    const vector<WordID>& e = r.e();
+    for (int first = 0; first < e.size(); ++first) {
+      if (e[first] <= 0) continue;
+      vector<WordID> run;
+      // grow the run of terminals starting at `first` one word at a time
+      for (int last = first; last < e.size() && e[last] > 0; ++last) {
+        run.push_back(e[last]);
+        if (exists_.find(run) == exists_.end()) return true;
+      }
+    }
+    return false;
+  }
+};
+
+// Intersection shortcut for a linear-chain forest and a single-path target.
+// Every arity-0 edge is compared against the target word at the edge's
+// prev_i_ position: edges whose (single) e-side word differs are pruned, the
+// others get their spans exchanged and their rule replaced by a copy with
+// the e and f sides swapped. Returns true iff every target position was
+// matched by at least one surviving edge.
+static bool FastLinearIntersect(const Lattice& target, Hypergraph* hg) {
+  cerr << "  Fast linear-chain intersection...\n";
+  vector<bool> prune(hg->edges_.size(), false);
+  set<int> cov;
+  // cache: original rule -> its inverted copy, so edges that share a rule
+  // also share the inverted rule
+  map<const TRule*, TRulePtr> inverted_rules;
+  // keeps the original rules alive so the raw-pointer cache keys stay unique
+  vector<TRulePtr> originals;
+  for (int i = 0; i < prune.size(); ++i) {
+    Hypergraph::Edge& edge = hg->edges_[i];
+    if (edge.Arity() != 0) continue;
+    const int trg_index = edge.prev_i_;
+    assert(trg_index >= 0 && trg_index < target.size());
+    const WordID trg = target[trg_index][0].label;
+    assert(edge.rule_->EWords() == 1);
+    TRulePtr& inv_rule = inverted_rules[edge.rule_.get()];
+    if (!inv_rule) {
+      originals.push_back(edge.rule_);
+      inv_rule.reset(new TRule(*edge.rule_));
+      inv_rule->e_.swap(inv_rule->f_);
+    }
+    prune[i] = (edge.rule_->e_[0] != trg);
+    if (!prune[i]) {
+      cov.insert(trg_index);
+      swap(edge.prev_i_, edge.i_);
+      swap(edge.prev_j_, edge.j_);
+      // assign rather than swap: swapping would store the ORIGINAL rule in
+      // the cache, and the next edge sharing this rule would then be handed
+      // the un-inverted rule
+      edge.rule_ = inv_rule;
+    }
+  }
+  hg->PruneEdges(prune, true);
+  return (cov.size() == target.size());
+}
+
+// Restricts *hg to the derivations whose output matches target.
+// Linear-chain forests with a sentence target take the fast path; otherwise
+// the forest is turned into a grammar (one non-terminal per node, source and
+// target sides exchanged) and the target is parsed with it. On success *hg is
+// replaced by the resulting forest and true is returned. Note that *hg has
+// already been pruned by the rule filter even when false is returned.
+bool HG::Intersect(const Lattice& target, Hypergraph* hg) {
+  // there are a number of faster algorithms available for restricted
+  // classes of hypergraph and/or target.
+  if (hg->IsLinearChain() && target.IsSentence())
+    return FastLinearIntersect(target, hg);
+
+  vector<bool> rem(hg->edges_.size(), false);
+  const RuleFilter filter(target, 15);   // TODO make configurable
+  for (int i = 0; i < rem.size(); ++i)
+    rem[i] = filter(*hg->edges_[i].rule_);
+  hg->PruneEdges(rem, true);
+
+  const int nedges = hg->edges_.size();
+  const int nnodes = hg->nodes_.size();
+
+  // nothing left to intersect with; this also keeps cats.back() below from
+  // being evaluated on an empty vector
+  if (nnodes == 0) return false;
+
+  TextGrammar* g = new TextGrammar;
+  GrammarPtr gp(g);
+  vector<int> cats(nnodes);
+  // each node in the translation forest becomes a "non-terminal" in the new
+  // grammar, create the labels here
+  const string kSEP = "_";
+  for (int i = 0; i < nnodes; ++i) {
+    const char* pstr = "CAT";
+    if (hg->nodes_[i].cat_ < 0)
+      pstr = TD::Convert(-hg->nodes_[i].cat_);
+    cats[i] = TD::Convert(pstr + kSEP + lexical_cast<string>(i)) * -1;
+  }
+
+  // construct the grammar
+  for (int i = 0; i < nedges; ++i) {
+    const Hypergraph::Edge& edge = hg->edges_[i];
+    const vector<WordID>& tgt = edge.rule_->e();
+    const vector<WordID>& src = edge.rule_->f();
+    TRulePtr rule(new TRule);
+    rule->prev_i = edge.i_;
+    rule->prev_j = edge.j_;
+    rule->lhs_ = cats[edge.head_node_];
+    vector<WordID>& f = rule->f_;
+    vector<WordID>& e = rule->e_;
+    f.resize(tgt.size());   // swap source and target, since the parser
+    e.resize(src.size());   // parses using the source side!
+    Hypergraph::TailNodeVector tn(edge.tail_nodes_.size());
+    int ntc = 0;
+    for (int j = 0; j < tgt.size(); ++j) {
+      const WordID& cur = tgt[j];
+      if (cur > 0) {
+        f[j] = cur;
+      } else {
+        // non-positive symbol: remember it (in order of appearance) and put
+        // the label of the tail node it indexes on the new source side
+        tn[ntc++] = cur;
+        f[j] = cats[edge.tail_nodes_[-cur]];
+      }
+    }
+    ntc = 0;
+    for (int j = 0; j < src.size(); ++j) {
+      const WordID& cur = src[j];
+      if (cur > 0) {
+        e[j] = cur;
+      } else {
+        e[j] = tn[ntc++];
+      }
+    }
+    rule->scores_ = edge.feature_values_;
+    rule->parent_rule_ = edge.rule_;
+    rule->ComputeArity();
+    //cerr << "ADD: " << rule->AsString() << endl;
+
+    g->AddRule(rule);
+  }
+  g->SetMaxSpan(target.size() + 1);
+  // the goal symbol is the label of the last node
+  const string& new_goal = TD::Convert(cats.back() * -1);
+  vector<GrammarPtr> grammars(1, gp);
+  Hypergraph tforest;
+  ExhaustiveBottomUpParser parser(new_goal, grammars);
+  if (!parser.Parse(target, &tforest))
+    return false;
+  else
+    hg->swap(tforest);
+  return true;
+}
+
diff --git a/decoder/hg_intersect.h b/decoder/hg_intersect.h
new file mode 100644
index 00000000..826bdaae
--- /dev/null
+++ b/decoder/hg_intersect.h
@@ -0,0 +1,13 @@
+#ifndef _HG_INTERSECT_H_
+#define _HG_INTERSECT_H_
+
+#include <vector>
+
+#include "lattice.h"
+
+class Hypergraph;
+struct HG {  // namespace-like holder for hypergraph-level operations
+  static bool Intersect(const Lattice& target, Hypergraph* hg);  // intersects *hg with target in place; false when no intersection could be built
+};
+
+#endif
diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
new file mode 100644
index 00000000..5161931d
--- /dev/null
+++ b/decoder/hg_io.cc
@@ -0,0 +1,673 @@
+#include "hg_io.h"
+
+#include <sstream>
+#include <iostream>
+
+#include <boost/lexical_cast.hpp>
+
+#include "tdict.h"
+#include "json_parse.h"
+#include "hg.h"
+
+using namespace std;
+
+// JSON event handler that rebuilds a Hypergraph from its serialized form.
+// It is a state machine driven by HandleJSONEvent(): `state` says which part
+// of the document is being read, and the per-node / per-edge scratch members
+// (cat, in_edges, tail, feats, spans, cur_rule) accumulate values until the
+// enclosing JSON object closes, at which point CreateNode / CreateEdge adds
+// the finished item to the graph.
+struct HGReader : public JSONParser {
+  HGReader(Hypergraph* g) : rp("[X] ||| "), state(-1), hg(*g), nodes_needed(true), edges_needed(true) {
+    nodes = 0; edges = 0;
+    // give every scratch member a defined value before the first event
+    fid = 0; spanc = 0; rule_id = 0; is_sorted = false;
+    ResetSpans();
+  }
+
+  // -1 matches the default span values of a freshly constructed edge
+  void ResetSpans() { spans[0] = spans[1] = spans[2] = spans[3] = -1; }
+
+  // adds a node with category cat ("X" if empty) and makes it the head of
+  // each listed edge; aborts if an edge index has not been read yet
+  void CreateNode(const string& cat, const vector<int>& in_edges) {
+    WordID c = TD::Convert("X") * -1;
+    if (!cat.empty()) c = TD::Convert(cat) * -1;
+    Hypergraph::Node* node = hg.AddNode(c);
+    for (int i = 0; i < in_edges.size(); ++i) {
+      if (in_edges[i] >= hg.edges_.size()) {
+        cerr << "JSONParser: in_edges[" << i << "]=" << in_edges[i]
+             << ", but hg only has " << hg.edges_.size() << " edges!\n";
+        abort();
+      }
+      hg.ConnectEdgeToHeadNode(&hg.edges_[in_edges[i]], node);
+    }
+  }
+  // adds an edge; *feats is swapped into the edge (and so left holding the
+  // edge's previous, empty, feature vector); spans come from the scratch array
+  void CreateEdge(const TRulePtr& rule, SparseVector<double>* feats, const SmallVector& tail) {
+    Hypergraph::Edge* edge = hg.AddEdge(rule, tail);
+    feats->swap(edge->feature_values_);
+    edge->i_ = spans[0];
+    edge->j_ = spans[1];
+    edge->prev_i_ = spans[2];
+    edge->prev_j_ = spans[3];
+  }
+
+  // returns false to stop parsing on an unexpected key or value
+  bool HandleJSONEvent(int type, const JSON_value* value) {
+    switch(state) {
+    case -1:
+      assert(type == JSON_T_OBJECT_BEGIN);
+      state = 0;
+      break;
+    case 0:
+      if (type == JSON_T_OBJECT_END) {
+        //cerr << "HG created\n";  // TODO, signal some kind of callback
+      } else if (type == JSON_T_KEY) {
+        string val = value->vu.str.value;
+        if (val == "features") { assert(fdict.empty()); state = 1; }
+        else if (val == "is_sorted") { state = 3; }
+        else if (val == "rules") { assert(rules.empty()); state = 4; }
+        else if (val == "node") { state = 8; }
+        else if (val == "edges") { state = 13; }
+        else { cerr << "Unexpected key: " << val << endl; return false; }
+      }
+      break;
+
+    // features
+    case 1:
+      if(type == JSON_T_NULL) { state = 0; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 2;
+      break;
+    case 2:
+      if(type == JSON_T_ARRAY_END) { state = 0; break; }
+      assert(type == JSON_T_STRING);
+      fdict.push_back(FD::Convert(value->vu.str.value));
+      assert(fdict.back() > 0);
+      break;
+
+    // is_sorted
+    case 3:
+      assert(type == JSON_T_TRUE || type == JSON_T_FALSE);
+      is_sorted = (type == JSON_T_TRUE);
+      if (!is_sorted) { cerr << "[WARNING] is_sorted flag is ignored\n"; }
+      state = 0;
+      break;
+
+    // rules: a flat array alternating rule id (state 5) and rule string (state 6)
+    case 4:
+      if(type == JSON_T_NULL) { state = 0; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 5;
+      break;
+    case 5:
+      if(type == JSON_T_ARRAY_END) { state = 0; break; }
+      assert(type == JSON_T_INTEGER);
+      state = 6;
+      rule_id = value->vu.integer_value;
+      break;
+    case 6:
+      assert(type == JSON_T_STRING);
+      rules[rule_id] = TRulePtr(new TRule(value->vu.str.value));
+      state = 5;
+      break;
+
+    // Nodes
+    case 8:
+      assert(type == JSON_T_OBJECT_BEGIN);
+      ++nodes;
+      in_edges.clear();
+      cat.clear();
+      state = 9; break;
+    case 9:
+      if (type == JSON_T_OBJECT_END) {
+        //cerr << "Creating NODE\n";
+        CreateNode(cat, in_edges);
+        state = 0; break;
+      }
+      assert(type == JSON_T_KEY);
+      cur_key = value->vu.str.value;
+      if (cur_key == "cat") { assert(cat.empty()); state = 10; break; }
+      if (cur_key == "in_edges") { assert(in_edges.empty()); state = 11; break; }
+      cerr << "Syntax error: unexpected key " << cur_key << " in node specification.\n";
+      return false;
+    case 10:
+      assert(type == JSON_T_STRING || type == JSON_T_NULL);
+      // only a string event carries a string payload; for null the category
+      // simply stays empty (CreateNode then falls back to "X")
+      if (type == JSON_T_STRING) cat = value->vu.str.value;
+      state = 9; break;
+    case 11:
+      if (type == JSON_T_NULL) { state = 9; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 12; break;
+    case 12:
+      if (type == JSON_T_ARRAY_END) { state = 9; break; }
+      assert(type == JSON_T_INTEGER);
+      //cerr << "in_edges: " << value->vu.integer_value << endl;
+      in_edges.push_back(value->vu.integer_value);
+      break;
+
+    //   "edges": [ { "tail": null, "feats" : [0,1.63,1,-0.54], "rule": 12},
+    //         { "tail": null, "feats" : [0,0.87,1,0.02], "spans":[1,2,3,4], "rule": 17},
+    //         { "tail": [0], "feats" : [1,2.3,2,15.3,"ExtraFeature",1.2], "rule": 13}]
+    case 13:
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 14;
+      break;
+    case 14:
+      if (type == JSON_T_ARRAY_END) { state = 0; break; }
+      assert(type == JSON_T_OBJECT_BEGIN);
+      //cerr << "New edge\n";
+      ++edges;
+      cur_rule.reset(); feats.clear(); tail.clear();
+      // "spans" is optional: without this reset an edge lacking the key
+      // would inherit the previous edge's spans
+      ResetSpans();
+      state = 15; break;
+    case 15:
+      if (type == JSON_T_OBJECT_END) {
+        CreateEdge(cur_rule, &feats, tail);
+        state = 14; break;
+      }
+      assert(type == JSON_T_KEY);
+      cur_key = value->vu.str.value;
+      //cerr << "edge key " << cur_key << endl;
+      if (cur_key == "rule") { assert(!cur_rule); state = 16; break; }
+      if (cur_key == "spans") { assert(!cur_rule); state = 22; break; }
+      if (cur_key == "feats") { assert(feats.empty()); state = 17; break; }
+      if (cur_key == "tail") { assert(tail.empty()); state = 20; break; }
+      cerr << "Unexpected key " << cur_key << " in edge specification\n";
+      return false;
+    case 16:    // edge.rule
+      if (type == JSON_T_INTEGER) {
+        int rule_id = value->vu.integer_value;
+        if (rules.find(rule_id) == rules.end()) {
+          // rules list must come before the edge definitions!
+          cerr << "Rule_id " << rule_id << " given but only loaded " << rules.size() << " rules\n";
+          return false;
+        }
+        cur_rule = rules[rule_id];
+      } else if (type == JSON_T_STRING) {
+        cur_rule.reset(new TRule(value->vu.str.value));
+      } else {
+        cerr << "Rule must be either a rule id or a rule string" << endl;
+        return false;
+      }
+      // cerr << "Edge: rule=" << cur_rule->AsString() << endl;
+      state = 15;
+      break;
+    case 17:      // edge.feats
+      if (type == JSON_T_NULL) { state = 15; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 18; break;
+    case 18:
+      // feature id: either an index into the "features" table or a name
+      if (type == JSON_T_ARRAY_END) { state = 15; break; }
+      if (type != JSON_T_INTEGER && type != JSON_T_STRING) {
+        cerr << "Unexpected feature id type\n"; return false;
+      }
+      if (type == JSON_T_INTEGER) {
+        fid = value->vu.integer_value;
+        assert(fid < fdict.size());
+        fid = fdict[fid];
+      } else if (type == JSON_T_STRING) {
+        fid = FD::Convert(value->vu.str.value);
+      } else { abort(); }
+      state = 19;
+      break;
+    case 19:
+      {
+        // feature value belonging to the id read in state 18
+        assert(type == JSON_T_INTEGER || type == JSON_T_FLOAT);
+        double val = (type == JSON_T_INTEGER ? static_cast<double>(value->vu.integer_value) :
+	                                       strtod(value->vu.str.value, NULL));
+        feats.set_value(fid, val);
+        state = 18;
+        break;
+      }
+    case 20:     // edge.tail
+      if (type == JSON_T_NULL) { state = 15; break; }
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 21; break;
+    case 21:
+      if (type == JSON_T_ARRAY_END) { state = 15; break; }
+      assert(type == JSON_T_INTEGER);
+      tail.push_back(value->vu.integer_value);
+      break;
+    case 22:     // edge.spans
+      assert(type == JSON_T_ARRAY_BEGIN);
+      state = 23;
+      ResetSpans();
+      spanc = 0;
+      break;
+    case 23:
+      if (type == JSON_T_ARRAY_END) { state = 15; break; }
+      assert(type == JSON_T_INTEGER);
+      assert(spanc < 4);
+      spans[spanc] = value->vu.integer_value;
+      ++spanc;
+    }
+    return true;
+  }
+  string rp;
+  string cat;               // category of the node being read
+  SmallVector tail;         // tail nodes of the edge being read
+  vector<int> in_edges;     // incoming edges of the node being read
+  TRulePtr cur_rule;        // rule of the edge being read
+  map<int, TRulePtr> rules; // rule id -> rule, from the "rules" section
+  vector<int> fdict;        // position in the "features" section -> feature id
+  SparseVector<double> feats;
+  int state;
+  int fid;
+  int nodes;                // number of node objects seen
+  int edges;                // number of edge objects seen
+  int spans[4];             // i, j, prev_i, prev_j of the edge being read
+  int spanc;                // how many span values have been read so far
+  string cur_key;
+  Hypergraph& hg;
+  int rule_id;
+  bool nodes_needed;
+  bool edges_needed;
+  bool is_sorted;
+};
+
+// Empties *hg, then fills it from the JSON document on *in; the result of
+// the parse is returned.
+bool HypergraphIO::ReadFromJSON(istream* in, Hypergraph* hg) {
+  hg->clear();
+  HGReader json_reader(hg);
+  const bool parsed = json_reader.Parse(in);
+  return parsed;
+}
+
+// Writes rule r to *out as an escaped string; a rule without a left-hand
+// side gets the "[X] ||| " prefix written first.
+static void WriteRule(const TRule& r, ostream* out) {
+  const bool has_lhs = r.lhs_;
+  if (!has_lhs) {
+    (*out) << "[X] ||| ";
+  }
+  JSONParser::WriteEscapedString(r.AsString(), out);
+}
+
+bool HypergraphIO::WriteToJSON(const Hypergraph& hg, bool remove_rules, ostream* out) {  // serializes hg as one JSON object on *out; always returns true
+  map<const TRule*, int> rid;  // rule object -> id used in the output
+  ostream& o = *out;
+  rid[NULL] = 0;  // id 0 is taken by the NULL entry, so real rules are numbered from 1
+  o << '{';
+  if (!remove_rules) {
+    o << "\"rules\":[";  // flat array alternating id, rule string
+    for (int i = 0; i < hg.edges_.size(); ++i) {
+      const TRule* r = hg.edges_[i].rule_.get();
+      int &id = rid[r];
+      if (!id) {  // first time this rule object is seen
+        id=rid.size() - 1;
+        if (id > 1) o << ',';
+        o << id << ',';
+        WriteRule(*r, &o);
+      };
+    }
+    o << "],";
+  }
+  const bool use_fdict = FD::NumFeats() < 1000;  // with few features, write a name table once and refer to features by index
+  if (use_fdict) {
+    o << "\"features\":[";
+    for (int i = 1; i < FD::NumFeats(); ++i) {
+      o << (i==1 ? "":",") << '"' << FD::Convert(i) << '"';
+    }
+    o << "],";
+  }
+  vector<int> edgemap(hg.edges_.size(), -1);  // edges may be in non-topo order
+  int edge_count = 0;  // position of the next edge in output order
+  for (int i = 0; i < hg.nodes_.size(); ++i) {
+    const Hypergraph::Node& node = hg.nodes_[i];
+    if (i > 0) { o << ","; }
+    o << "\"edges\":[";  // a node's incoming edges are written right before the node itself
+    for (int j = 0; j < node.in_edges_.size(); ++j) {
+      const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+      edgemap[edge.id_] = edge_count;
+      ++edge_count;
+      o << (j == 0 ? "" : ",") << "{";
+
+      o << "\"tail\":[";
+      for (int k = 0; k < edge.tail_nodes_.size(); ++k) {
+        o << (k > 0 ? "," : "") << edge.tail_nodes_[k];
+      }
+      o << "],";
+
+      o << "\"spans\":[" << edge.i_ << "," << edge.j_ << "," << edge.prev_i_ << "," << edge.prev_j_ << "],";
+
+      o << "\"feats\":[";  // flat array alternating feature id (or name), value
+      bool first = true;
+      for (SparseVector<double>::const_iterator it = edge.feature_values_.begin(); it != edge.feature_values_.end(); ++it) {
+        if (!it->second) continue;   // don't write features that have a zero value
+        if (!it->first) continue;    // if the feature set was frozen this might happen
+        if (!first) o << ',';
+        if (use_fdict)
+          o << (it->first - 1);  // index into the "features" array, which starts at feature 1
+        else
+	  o << '"' << FD::Convert(it->first) << '"';
+        o << ',' << it->second;
+        first = false;
+      }
+      o << "]";
+      if (!remove_rules) { o << ",\"rule\":" << rid[edge.rule_.get()]; }
+      o << "}";
+    }
+    o << "],";
+
+    o << "\"node\":{\"in_edges\":[";
+    for (int j = 0; j < node.in_edges_.size(); ++j) {
+      int mapped_edge = edgemap[node.in_edges_[j]];  // edge index as renumbered in output order
+      assert(mapped_edge >= 0);
+      o << (j == 0 ? "" : ",") << mapped_edge;
+    }
+    o << "]";
+    if (node.cat_ < 0) { o << ",\"cat\":\"" << TD::Convert(node.cat_ * -1) << '"'; }
+    o << "}";
+  }
+  o << "}\n";
+  return true;
+}
+
+// needs_escape[c] is true for the 7-bit characters that Escape() prefixes
+// with a backslash
+bool needs_escape[128];
+// Resets the table, then marks single quote and backslash.
+void InitEscapes() {
+  for (size_t i = 0; i < 128; ++i)
+    needs_escape[i] = false;
+  needs_escape[static_cast<size_t>('\\')] = true;
+  needs_escape[static_cast<size_t>('\'')] = true;
+}
+
+// Returns s with a backslash inserted before every character flagged in
+// needs_escape; s itself is returned when nothing needs escaping.
+// Relies on the needs_escape table having been filled by InitEscapes().
+string HypergraphIO::Escape(const string& s) {
+  size_t extra = 0;
+  for (int i = 0; i < s.size(); ++i) {
+    const unsigned char ch = s[i];
+    if (ch < 128 && needs_escape[ch]) ++extra;
+  }
+  if (extra == 0) return s;
+  const size_t escaped_len = s.size() + extra;
+  string res;
+  res.reserve(escaped_len);
+  for (int i = 0; i < s.size(); ++i) {
+    const unsigned char ch = s[i];
+    if (ch < 128 && needs_escape[ch])
+      res += '\\';
+    res += static_cast<char>(ch);
+  }
+  assert(res.size() == escaped_len);
+  return res;
+}
+
+string HypergraphIO::AsPLF(const Hypergraph& hg, bool include_global_parentheses) {  // renders hg as a PLF string, one alternatives block per node except the last
+  static bool first = true;
+  if (first) { InitEscapes(); first = false; }  // fill the escape table on the first call only
+  if (hg.nodes_.empty()) return "()";
+  ostringstream os;
+  if (include_global_parentheses) os << '(';
+  static const string EPS="*EPS*";  // label written for edges whose rule has no output word at e_[1]
+  for (int i = 0; i < hg.nodes_.size()-1; ++i) {
+    if (hg.nodes_[i].out_edges_.empty()) abort();  // every node but the last must have an outgoing edge
+    const bool last_node = (i == hg.nodes_.size() - 2);
+    const int out_edges_size = hg.nodes_[i].out_edges_.size();
+    // compound splitter adds an extra goal transition which we suppress with
+    // the following conditional
+    if (!last_node || out_edges_size != 1 ||
+         hg.edges_[hg.nodes_[i].out_edges_[0]].rule_->EWords() == 1) {
+      os << '(';
+      for (int j = 0; j < out_edges_size; ++j) {
+        const Hypergraph::Edge& e = hg.edges_[hg.nodes_[i].out_edges_[j]];
+        const string output = e.rule_->e_.size() ==2 ? Escape(TD::Convert(e.rule_->e_[1])) : EPS;
+        double prob = log(e.edge_prob_);
+        if (isinf(prob)) { prob = -9e20; }  // replace an infinite log-prob by a large finite value
+        if (isnan(prob)) { prob = 0; }
+        os << "('" << output << "'," << prob << "," << e.head_node_ - i << "),";  // ('word',log-prob,distance to head node)
+      }
+      os << "),";
+    }
+  }
+  if (include_global_parentheses) os << ')';
+  return os.str();
+}
+
+namespace PLF {
+
+const string chars = "'\\";  // the two characters getEscapedString treats specially
+const char& quote = chars[0];  // string delimiter: single quote
+const char& slash = chars[1];  // escape character: backslash
+
+// safe get: the character at position c, or 0 when c lies outside the string
+inline char get(const std::string& in, int c) {
+  const bool in_range = (c >= 0) && (c < static_cast<int>(in.size()));
+  if (!in_range) return 0;
+  return in[static_cast<size_t>(c)];
+}
+
+// consume whitespace: advances c past any run of blanks (' ' only)
+inline void eatws(const std::string& in, int& c) {
+  for (; get(in,c) == ' '; ++c) {}
+}
+
+// from 'foo' return foo
+// Skips leading blanks, reads a single-quoted token starting at position c
+// and undoes backslash escapes. On return c is just past the closing quote
+// and any blanks that follow it. Returns "ERROR" when the token does not
+// start with a quote. An empty token ('') yields an empty string.
+std::string getEscapedString(const std::string& in, int &c)
+{
+  eatws(in,c);
+  if (get(in,c++) != quote) return "ERROR";
+  std::string res;
+  // test each character BEFORE appending it, so that a closing quote that
+  // immediately follows the opening one ends the token instead of being
+  // stepped over
+  while (c < (int)in.size()) {
+    const char cur = get(in,c++);
+    if (cur == quote) break;                       // closing quote consumed
+    if (cur == slash) { res += get(in,c++); }      // escaped character, taken literally
+    else { res += cur; }
+  }
+  eatws(in,c);
+  return res;
+}
+
+// basically atof: reads the token up to the next blank, ')' or ',' and
+// converts it; aborts if the token is empty
+float getFloat(const std::string& in, int &c)
+{
+  eatws(in,c);
+  std::string token;
+  for (; c < (int)in.size(); ++c) {
+    const char ch = get(in,c);
+    if (ch == ' ' || ch == ')' || ch == ',') break;
+    token += ch;
+  }
+  eatws(in,c);
+  if (token.empty()) {
+    cerr << "Syntax error while reading number! col=" << c << endl;
+    abort();
+  }
+  return atof(token.c_str());
+}
+
+// basically atoi: reads the token up to the next blank, ')' or ',' and
+// converts it (an empty token is passed to atoi unchanged)
+int getInt(const std::string& in, int &c)
+{
+  eatws(in,c);
+  std::string token;
+  for (; c < (int)in.size(); ++c) {
+    const char ch = get(in,c);
+    if (ch == ' ' || ch == ')' || ch == ',') break;
+    token += ch;
+  }
+  eatws(in,c);
+  return atoi(token.c_str());
+}
+
+// maximum number of nodes permitted
+#define MAX_NODES 100000000
+// parse ('foo', 0.23)
+// Reads one alternative at position c and adds it to *hg as an edge from
+// cur_node. When more than one number follows the word, the last one is the
+// link length (how many nodes ahead the edge ends); otherwise the edge ends
+// at cur_node + 1. The remaining numbers become features Feature_0,
+// Feature_1, ... Malformed input is reported on cerr and aborts, in every
+// build configuration.
+void ReadPLFEdge(const std::string& in, int &c, int cur_node, Hypergraph* hg) {
+  if (get(in,c++) != '(') {
+    cerr << "PCN/PLF parse error: expected ( at start of cn alt block\n";
+    abort();
+  }
+  vector<WordID> ewords(2, 0);
+  ewords[1] = TD::Convert(getEscapedString(in,c));
+  TRulePtr r(new TRule(ewords));
+  r->ComputeArity();
+  // cerr << "RULE: " << r->AsString() << endl;
+  if (get(in,c++) != ',') {
+    cerr << in << endl;
+    cerr << "PCN/PLF parse error: expected , after string\n";
+    abort();
+  }
+  int cnNext = 1;
+  std::vector<float> probs;
+  probs.push_back(getFloat(in,c));
+  while (get(in,c) == ',') {
+    c++;
+    float val = getFloat(in,c);
+    probs.push_back(val);
+    // cerr << val << endl; //REMO
+  }
+  //if we read more than one prob, this was a lattice, last item was column increment
+  if (probs.size()>1) {
+    const float link = probs.back();
+    probs.pop_back();
+    // validate before converting: a negative or NaN value must not be cast
+    // to an integer type
+    if (!(link >= 1) || link >= MAX_NODES) {
+      cerr << link << endl;
+      cerr << "PCN/PLF parse error: bad link length at last element of cn alt block\n";
+      abort();
+    }
+    cnNext = static_cast<int>(link);
+  }
+  if (get(in,c++) != ')') {
+    cerr << "PCN/PLF parse error: expected ) at end of cn alt block\n";
+    abort();
+  }
+  eatws(in,c);
+  // prevent malicious PLFs from using all the memory
+  if (cur_node >= MAX_NODES || cnNext >= MAX_NODES - cur_node) {
+    cerr << "PCN/PLF parse error: too many nodes\n";
+    abort();
+  }
+  Hypergraph::TailNodeVector tail(1, cur_node);
+  Hypergraph::Edge* edge = hg->AddEdge(r, tail);
+  //cerr << "  <--" << cur_node << endl;
+  const int head_node = cur_node + cnNext;
+  if (hg->nodes_.size() < (head_node + 1)) { hg->ResizeNodes(head_node + 1); }
+  hg->ConnectEdgeToHeadNode(edge, &hg->nodes_[head_node]);
+  for (int i = 0; i < probs.size(); ++i)
+    edge->feature_values_.set_value(FD::Convert("Feature_" + boost::lexical_cast<string>(i)), probs[i]);
+}
+
+// parse (('foo', 0.23), ('bar', 0.77))
+// Reads the block of alternatives leaving cur_node, adding one edge per
+// alternative; nodes_ is grown so that cur_node exists. `line` is only used
+// in the error message.
+void ReadPLFNode(const std::string& in, int &c, int cur_node, int line, Hypergraph* hg) {
+  //cerr << "PLF READING NODE " << cur_node << endl;
+  if (hg->nodes_.size() < (cur_node + 1)) { hg->ResizeNodes(cur_node + 1); }
+  if (get(in,c++) != '(') { cerr << line << ": Syntax error 1\n"; abort(); }
+  eatws(in,c);
+  for (;;) {
+    if (c > (int)in.size()) return;
+    const char here = get(in,c);
+    if (here == ')') {
+      // end of the block
+      c++;
+      eatws(in,c);
+      return;
+    }
+    if (here == ',') {
+      if (get(in,c+1) == ')') {
+        // trailing comma right before the end of the block
+        c+=2;
+        eatws(in,c);
+        return;
+      }
+      // separator between two alternatives
+      c++;
+      eatws(in,c);
+    }
+    ReadPLFEdge(in, c, cur_node, hg);
+  }
+}
+
+} // namespace PLF
+
+// Replaces *hg with the lattice described by the PLF string `in`: one node
+// per alternatives block plus the final node created by the last edges.
+// `line` is only used in error messages. The empty PLF "()" produces an
+// empty hypergraph.
+void HypergraphIO::ReadFromPLF(const std::string& in, Hypergraph* hg, int line) {
+  hg->clear();
+  int c = 0;
+  int cur_node = 0;
+  if (PLF::get(in,c++) != '(') { cerr << line << ": Syntax error!\n"; abort(); }
+  while (1) {
+    if (c > (int)in.size()) { break; }
+    if (PLF::get(in,c) == ')') {
+      c++;
+      PLF::eatws(in,c);
+      break;
+    }
+    if (PLF::get(in,c) == ',' && PLF::get(in,c+1) == ')') {
+      c+=2;
+      PLF::eatws(in,c);
+      break;
+    }
+    if (PLF::get(in,c) == ',') { c++; PLF::eatws(in,c); }
+    PLF::ReadPLFNode(in, c, cur_node, line, hg);
+    ++cur_node;
+  }
+  // every node but the last was read as a block; with no nodes at all
+  // size() - 1 would wrap around (unsigned), so treat that case separately
+  if (hg->nodes_.empty())
+    assert(cur_node == 0);
+  else
+    assert(cur_node == static_cast<int>(hg->nodes_.size()) - 1);
+}
+
+// Parses the PLF string and stores it in *pl as a lattice: one column per
+// node except the last, one arc per outgoing edge. Each arc gets the word at
+// e_[1] of the edge's rule as label, the edge's Feature_0 value as cost and
+// the distance to the edge's head node as dist2next. An empty PLF produces
+// an empty lattice.
+void HypergraphIO::PLFtoLattice(const string& plf, Lattice* pl) {
+  Lattice& l = *pl;
+  Hypergraph g;
+  ReadFromPLF(plf, &g, 0);
+  // guard the subtraction: with no nodes, size() - 1 would be -1 and the
+  // resize below would request an enormous lattice
+  const int num_nodes = g.nodes_.empty() ? 0 : static_cast<int>(g.nodes_.size()) - 1;
+  l.resize(num_nodes);
+  for (int i = 0; i < num_nodes; ++i) {
+    vector<LatticeArc>& alts = l[i];
+    const Hypergraph::Node& node = g.nodes_[i];
+    const int num_alts = node.out_edges_.size();
+    alts.resize(num_alts);
+    for (int j = 0; j < num_alts; ++j) {
+      const Hypergraph::Edge& edge = g.edges_[node.out_edges_[j]];
+      alts[j].label = edge.rule_->e_[1];
+      alts[j].cost = edge.feature_values_.value(FD::Convert("Feature_0"));
+      alts[j].dist2next = edge.head_node_ - node.id_;
+    }
+  }
+}
+
+// Prints hg on stdout as a grammar, one rule per edge. Each node becomes a
+// non-terminal named <category or CAT>_<node index>; the source and target
+// sides of every rule are exchanged on the way out.
+void HypergraphIO::WriteAsCFG(const Hypergraph& hg) {
+  // each node in the translation forest becomes a "non-terminal" in the new
+  // grammar, create the labels here
+  const string kSEP = "_";
+  vector<int> node_label(hg.nodes_.size());
+  for (int n = 0; n < hg.nodes_.size(); ++n) {
+    const char* prefix = "CAT";
+    if (hg.nodes_[n].cat_ < 0)
+      prefix = TD::Convert(-hg.nodes_[n].cat_);
+    node_label[n] = TD::Convert(prefix + kSEP + boost::lexical_cast<string>(n)) * -1;
+  }
+
+  for (int k = 0; k < hg.edges_.size(); ++k) {
+    const Hypergraph::Edge& edge = hg.edges_[k];
+    const vector<WordID>& old_e = edge.rule_->e();
+    const vector<WordID>& old_f = edge.rule_->f();
+    TRulePtr out_rule(new TRule);
+    out_rule->prev_i = edge.i_;
+    out_rule->prev_j = edge.j_;
+    out_rule->lhs_ = node_label[edge.head_node_];
+    vector<WordID>& new_f = out_rule->f_;
+    vector<WordID>& new_e = out_rule->e_;
+    new_f.resize(old_e.size());   // swap source and target, since the parser
+    new_e.resize(old_f.size());   // parses using the source side!
+    // non-terminal symbols of the old e side, in order of appearance
+    Hypergraph::TailNodeVector nt_symbols(edge.tail_nodes_.size());
+    int nt_count = 0;
+    for (int p = 0; p < old_e.size(); ++p) {
+      const WordID& sym = old_e[p];
+      if (sym <= 0) {
+        nt_symbols[nt_count++] = sym;
+        new_f[p] = node_label[edge.tail_nodes_[-sym]];
+      } else {
+        new_f[p] = sym;
+      }
+    }
+    nt_count = 0;
+    for (int p = 0; p < old_f.size(); ++p) {
+      const WordID& sym = old_f[p];
+      if (sym <= 0) {
+        new_e[p] = nt_symbols[nt_count++];
+      } else {
+        new_e[p] = sym;
+      }
+    }
+    out_rule->scores_ = edge.feature_values_;
+    out_rule->parent_rule_ = edge.rule_;
+    out_rule->ComputeArity();
+    cout << out_rule->AsString() << endl;
+  }
+}
+
+namespace B64 {
+
+// cb64: 6-bit value -> base64 character
+// cd64: (character - 43) -> encoded 6-bit value, '$' marking characters that
+//       carry no value (see b64decode for how entries are unpacked)
+static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq";
+
+// Writes the four base64 characters for one input group to *os. Only the
+// first `len` (1..3) bytes of `in` are valid; missing bytes count as zero
+// and the corresponding output positions are '='.
+static void encodeblock(const unsigned char* in, std::ostream* os, int len) {
+  // copy into a zero-padded buffer so that bytes beyond `len` are never
+  // read and cannot leak into the encoded output
+  unsigned char b[3] = {0, 0, 0};
+  for (int i = 0; i < len && i < 3; ++i) b[i] = in[i];
+  char out[4];
+  out[0] = cb64[ b[0] >> 2 ];
+  out[1] = cb64[ ((b[0] & 0x03) << 4) | ((b[1] & 0xf0) >> 4) ];
+  out[2] = (len > 1 ? cb64[ ((b[1] & 0x0f) << 2) | ((b[2] & 0xc0) >> 6) ] : '=');
+  out[3] = (len > 2 ? cb64[ b[2] & 0x3f ] : '=');
+  os->write(out, 4);
+}
+
+// Writes the base64 encoding of data[0..size) to *out, '='-padded to a
+// multiple of four characters.
+void b64encode(const char* data, const size_t size, std::ostream* out) {
+  size_t cur = 0;
+  while(cur < size) {
+    const size_t left = size - cur;
+    const int len = (left < 3 ? static_cast<int>(left) : 3);
+    encodeblock(reinterpret_cast<const unsigned char*>(&data[cur]), out, len);
+    cur += len;
+  }
+}
+
+// turns four 6-bit values into three bytes
+static void decodeblock(const unsigned char* in, unsigned char* out) {
+  out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4);
+  out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2);
+  out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]);
+}
+
+// Decodes insize base64 characters from data into out, writing at most
+// outsize bytes. Returns false (with a message on cerr) on a character
+// outside the table's range, on input whose length is not a multiple of
+// four, or when input remains after out is full.
+bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) {
+  size_t cur = 0;
+  size_t ocur = 0;
+  unsigned char in[4];
+  unsigned char block[3];
+  while(cur < insize) {
+    if (insize - cur < 4) {
+      std::cerr << "B64 decode error: truncated input at offset " << cur << std::endl;
+      return false;
+    }
+    if (ocur >= outsize) {
+      std::cerr << "B64 decode error: output buffer too small at offset " << cur << std::endl;
+      return false;
+    }
+    for (int i = 0; i < 4; ++i) {
+      unsigned char v = data[cur];
+      v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]);
+      if (!v) {
+        std::cerr << "B64 decode error at offset " << cur << " offending character: " << static_cast<int>(data[cur]) << std::endl;
+        return false;
+      }
+      v = (unsigned char) ((v == '$') ? '\0' : v - 61);
+      if (v) in[i] = v - 1; else in[i] = 0;
+      ++cur;
+    }
+    // decode into a scratch block and copy only what fits: the last group
+    // of padded input stands for fewer than three bytes
+    decodeblock(in, block);
+    for (int i = 0; i < 3 && ocur < outsize; ++i)
+      out[ocur++] = static_cast<char>(block[i]);
+  }
+  return true;
+}
+}
+
diff --git a/decoder/hg_io.h b/decoder/hg_io.h
new file mode 100644
index 00000000..7162106e
--- /dev/null
+++ b/decoder/hg_io.h
@@ -0,0 +1,39 @@
+#ifndef _HG_IO_H_
+#define _HG_IO_H_
+
+#include <iostream>
+
+#include "lattice.h"
+class Hypergraph;
+
+ struct HypergraphIO {
+
+ // The JSON format is basically a list of nodes and edges in topological order:
+ // any edge you read, you must have already read its tail nodes;
+ // any node you read, you must have already read its incoming edges.
+ // This may make writing a bit more challenging if your forest is not
+ // topologically sorted (but that probably doesn't happen very often),
+ // but it makes reading much more memory efficient.
+ // See test_data/small.json.gz for an example encoding.
+ static bool ReadFromJSON(std::istream* in, Hypergraph* out);
+
+ // If remove_rules is set, the hypergraph is serialized without rule information
+ // (so it only contains structure and feature information).
+ static bool WriteToJSON(const Hypergraph& hg, bool remove_rules, std::ostream* out);
+
+ static void WriteAsCFG(const Hypergraph& hg); // NOTE(review): presumably prints one rule per edge to stdout; confirm in hg_io.cc
+
+ // Serialization utilities for PLF (the "((('word',cost,dist),...)" text format).
+ static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0);
+ // Returns the PLF string representation (undefined behavior on non-lattices).
+ static std::string AsPLF(const Hypergraph& hg, bool include_global_parentheses = true);
+ static void PLFtoLattice(const std::string& plf, Lattice* pl);
+ static std::string Escape(const std::string& s); // PLF helper
+ };
+
+ namespace B64 {
+ // Decodes insize base64 characters from data into out (capacity outsize bytes).
+ // Returns false if a character outside the decoding table's range is met.
+ bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize);
+ // Writes the '='-padded base64 encoding of the size bytes at data to *out.
+ void b64encode(const char* data, const size_t size, std::ostream* out);
+ }
+
+#endif
diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc
new file mode 100644
index 00000000..51934ad1
--- /dev/null
+++ b/decoder/hg_test.cc
@@ -0,0 +1,455 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "tdict.h"
+
+#include "json_parse.h"
+#include "filelib.h"
+#include "hg.h"
+#include "hg_io.h"
+#include "hg_intersect.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "inside_outside.h"
+
+using namespace std;
+
+ // Test fixture for the hypergraph tests. It keeps no state; it only offers
+ // builders that fill a Hypergraph from JSON text embedded in this file.
+ class HGTest : public testing::Test {
+  protected:
+   // No per-test setup or cleanup is needed.
+   virtual void SetUp() { }
+   virtual void TearDown() { }
+
+   // Forest builders (each parses an inline JSON description into *hg).
+   void CreateHG(Hypergraph* hg) const;
+   void CreateHG_int(Hypergraph* hg) const;
+   void CreateHG_tiny(Hypergraph* hg) const;
+   void CreateHGBalanced(Hypergraph* hg) const;
+
+   // Lattice-shaped builders.
+   void CreateLatticeHG(Hypergraph* hg) const;
+   void CreateTinyLatticeHG(Hypergraph* hg) const;
+ };
+
+ // Fills *hg with a small lattice-shaped forest (3 nodes, 4 edges) read from
+ // the inline JSON below; a gtest failure is recorded if parsing fails.
+ void HGTest::CreateTinyLatticeHG(Hypergraph* hg) const {
+   const string forest_json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] b\",4,\"[X] ||| [1] B'\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.2],\"rule\":1},{\"tail\":[0],\"feats\":[0,-0.6],\"rule\":2}],\"node\":{\"in_edges\":[0,1]},\"edges\":[{\"tail\":[1],\"feats\":[0,-0.1],\"rule\":3},{\"tail\":[1],\"feats\":[0,-0.9],\"rule\":4}],\"node\":{\"in_edges\":[2,3]}}";
+   istringstream forest_stream(forest_json);
+   EXPECT_TRUE(HypergraphIO::ReadFromJSON(&forest_stream, hg));
+ }
+
+ // Fills *hg with a lattice-shaped forest (4 nodes, 8 edges) read from the
+ // inline JSON below; a gtest failure is recorded if parsing fails.
+ void HGTest::CreateLatticeHG(Hypergraph* hg) const {
+   const string forest_json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] A A\",4,\"[X] ||| [1] b\",5,\"[X] ||| [1] c\",6,\"[X] ||| [1] B C\",7,\"[X] ||| [1] A B C\",8,\"[X] ||| [1] CC\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[2,-0.3],\"rule\":1},{\"tail\":[0],\"feats\":[2,-0.6],\"rule\":2},{\"tail\":[0],\"feats\":[2,-1.7],\"rule\":3}],\"node\":{\"in_edges\":[0,1,2]},\"edges\":[{\"tail\":[1],\"feats\":[2,-0.5],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[2],\"feats\":[2,-0.6],\"rule\":5},{\"tail\":[1],\"feats\":[2,-0.8],\"rule\":6},{\"tail\":[0],\"feats\":[2,-0.01],\"rule\":7},{\"tail\":[2],\"feats\":[2,-0.8],\"rule\":8}],\"node\":{\"in_edges\":[4,5,6,7]}}";
+   istringstream forest_stream(forest_json);
+   EXPECT_TRUE(HypergraphIO::ReadFromJSON(&forest_stream, hg));
+ }
+
+ // Fills *hg with a minimal forest (2 nodes, 3 edges) read from the inline
+ // JSON below; a gtest failure is recorded if parsing fails.
+ void HGTest::CreateHG_tiny(Hypergraph* hg) const {
+   const string forest_json = "{\"rules\":[1,\"[X] ||| <s>\",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}";
+   istringstream forest_stream(forest_json);
+   EXPECT_TRUE(HypergraphIO::ReadFromJSON(&forest_stream, hg));
+ }
+
+ // Fills *hg with the forest used by the intersection and pruning tests
+ // (2 nodes with categories "X" and "Goal", 4 edges), read from inline JSON.
+ void HGTest::CreateHG_int(Hypergraph* hg) const {
+   const string forest_json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}";
+   istringstream forest_stream(forest_json);
+   EXPECT_TRUE(HypergraphIO::ReadFromJSON(&forest_stream, hg));
+ }
+
+ // Fills *hg with the main test forest (6 nodes, 8 edges) read from the
+ // inline JSON below; a gtest failure is recorded if parsing fails.
+ void HGTest::CreateHG(Hypergraph* hg) const {
+   string forest_json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}";
+   istringstream forest_stream(forest_json);
+   EXPECT_TRUE(HypergraphIO::ReadFromJSON(&forest_stream, hg));
+ }
+
+ // Fills *hg with a feature-less forest (9 nodes, 13 edges) in which most
+ // nodes combine two children in both orders; read from the inline JSON below.
+ void HGTest::CreateHGBalanced(Hypergraph* hg) const {
+   const string forest_json = "{\"rules\":[1,\"[X] ||| i\",2,\"[X] ||| a\",3,\"[X] ||| b\",4,\"[X] ||| [1] [2]\",5,\"[X] ||| [1] [2]\",6,\"[X] ||| c\",7,\"[X] ||| d\",8,\"[X] ||| [1] [2]\",9,\"[X] ||| [1] [2]\",10,\"[X] ||| [1] [2]\",11,\"[X] ||| [1] [2]\",12,\"[X] ||| [1] [2]\",13,\"[X] ||| [1] [2]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[1,2],\"feats\":[],\"rule\":4},{\"tail\":[2,1],\"feats\":[],\"rule\":5}],\"node\":{\"in_edges\":[3,4]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":6}],\"node\":{\"in_edges\":[5]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":7}],\"node\":{\"in_edges\":[6]},\"edges\":[{\"tail\":[4,5],\"feats\":[],\"rule\":8},{\"tail\":[5,4],\"feats\":[],\"rule\":9}],\"node\":{\"in_edges\":[7,8]},\"edges\":[{\"tail\":[3,6],\"feats\":[],\"rule\":10},{\"tail\":[6,3],\"feats\":[],\"rule\":11}],\"node\":{\"in_edges\":[9,10]},\"edges\":[{\"tail\":[7,0],\"feats\":[],\"rule\":12},{\"tail\":[0,7],\"feats\":[],\"rule\":13}],\"node\":{\"in_edges\":[11,12]}}";
+   istringstream forest_stream(forest_json);
+   EXPECT_TRUE(HypergraphIO::ReadFromJSON(&forest_stream, hg));
+ }
+
+ // Checks the Viterbi translation/score and the scaled inside score of the
+ // tiny forest under the weights f1=0.4, f2=0.8.
+ TEST_F(HGTest,Controlled) {
+   Hypergraph forest;
+   CreateHG_tiny(&forest);
+
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 0.4);
+   weights.set_value(FD::Convert("f2"), 0.8);
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   prob_t best_prob = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "prob: " << best_prob << "\n";
+   EXPECT_FLOAT_EQ(-80.839996, log(best_prob));
+   EXPECT_EQ("X <s>", TD::GetString(best_yield));
+
+   forest.PrintGraphviz();
+   // Inside score with every edge probability raised to the power 0.6.
+   prob_t scaled_inside = Inside<prob_t, ScaledEdgeProb>(forest, NULL, ScaledEdgeProb(0.6));
+   EXPECT_FLOAT_EQ(-47.8577, log(scaled_inside));
+ }
+
+ // Unions the tiny forest with the main forest and checks that
+ //  * the Viterbi derivation of the union equals that of the main forest,
+ //  * after flipping the sign of f2 the best translation is "Z <s>", and
+ //  * the k-best list of the union has the expected size and score spread.
+ TEST_F(HGTest,Union) {
+   Hypergraph hg1;
+   Hypergraph hg2;
+   CreateHG_tiny(&hg1);
+   CreateHG(&hg2);
+   SparseVector<double> wts;
+   wts.set_value(FD::Convert("f1"), 0.4);
+   wts.set_value(FD::Convert("f2"), 1.0);
+   hg1.Reweight(wts);
+   hg2.Reweight(wts);
+   prob_t c1,c2,c3,c4;
+   vector<WordID> t1,t2,t3,t4;
+   c1 = ViterbiESentence(hg1, &t1);
+   c2 = ViterbiESentence(hg2, &t2);
+   int l2 = ViterbiPathLength(hg2);
+   cerr << c1 << "\t" << TD::GetString(t1) << endl;
+   cerr << c2 << "\t" << TD::GetString(t2) << endl;
+   hg1.Union(hg2);
+   hg1.Reweight(wts);
+   c3 = ViterbiESentence(hg1, &t3);
+   int l3 = ViterbiPathLength(hg1);
+   cerr << c3 << "\t" << TD::GetString(t3) << endl;
+   EXPECT_FLOAT_EQ(c2, c3);
+   EXPECT_EQ(TD::GetString(t2), TD::GetString(t3));
+   EXPECT_EQ(l2, l3);
+
+   wts.set_value(FD::Convert("f2"), -1);
+   hg1.Reweight(wts);
+   c4 = ViterbiESentence(hg1, &t4);
+   cerr << c4 << "\t" << TD::GetString(t4) << endl;
+   EXPECT_EQ("Z <s>", TD::GetString(t4));
+   EXPECT_FLOAT_EQ(98.82, log(c4));
+
+   vector<pair<vector<WordID>, prob_t> > list;
+   KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg1, 10);
+   for (int i = 0; i < 10; ++i) {
+     const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+       kbest.LazyKthBest(hg1.nodes_.size() - 1, i);
+     if (!d) break;
+     list.push_back(make_pair(d->yield, d->score));
+   }
+   // The checks below index into the list, so an empty list must stop the
+   // test here instead of dereferencing list[0] / front() / back().
+   ASSERT_FALSE(list.empty());
+   EXPECT_TRUE(list[0].first == t4);
+   EXPECT_FLOAT_EQ(log(list[0].second), log(c4));
+   EXPECT_EQ(6u, list.size());
+   EXPECT_FLOAT_EQ(log(list.back().second / list.front().second), -97.7);
+ }
+
+ // Asks for up to 10 derivations of the main forest and expects exactly 4.
+ TEST_F(HGTest,ControlledKBest) {
+   typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> KBestList;
+
+   Hypergraph forest;
+   CreateHG(&forest);
+   vector<double> weights(2);
+   weights[0] = 0.4;
+   weights[1] = 0.8;
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   prob_t cost = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "cost: " << cost << "\n";
+
+   int best = 0;
+   KBestList kbest(forest, 10);
+   for (int rank = 0; rank < 10; ++rank) {
+     // The goal node is the last node of the forest.
+     const KBestList::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, rank);
+     if (!d) break;
+     cerr << TD::GetString(d->yield) << endl;
+     ++best;
+   }
+   EXPECT_EQ(4, best);
+ }
+
+
+ // Checks the inside score and the best-path-through-edge scores of the tiny
+ // lattice forest against values computed by hand.
+ TEST_F(HGTest,InsideScore) {
+   SparseVector<double> wts;
+   wts.set_value(FD::Convert("f1"), 1.0);
+   Hypergraph hg;
+   CreateTinyLatticeHG(&hg);
+   hg.Reweight(wts);
+   vector<WordID> trans;
+   prob_t cost = ViterbiESentence(hg, &trans);
+   cerr << TD::GetString(trans) << "\n";
+   cerr << "cost: " << cost << "\n";
+   hg.PrintGraphviz();
+   prob_t inside = Inside<prob_t, EdgeProb>(hg);
+   EXPECT_FLOAT_EQ(1.7934048, inside); // computed by hand
+   vector<prob_t> post;
+   inside = hg.ComputeBestPathThroughEdges(&post);
+   EXPECT_FLOAT_EQ(-0.3, log(inside)); // computed by hand
+   // Fatal check: the loop below indexes post[0..3] unconditionally.
+   ASSERT_EQ(4u, post.size());
+   for (int i = 0; i < 4; ++i) {
+     cerr << "edge post: " << log(post[i]) << '\t' << hg.edges_[i].rule_->AsString() << endl;
+   }
+ }
+
+
+ // Smoke test: beam-prunes the lattice forest and prints the Viterbi result
+ // and the graph before and after pruning (nothing is asserted).
+ TEST_F(HGTest,PruneInsideOutside) {
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("Feature_1"), 1.0);
+   Hypergraph forest;
+   CreateLatticeHG(&forest);
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   prob_t cost = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "cost: " << cost << "\n";
+   forest.PrintGraphviz();
+
+   //forest.DensityPruneInsideOutside(0.5, false, 2.0);
+   forest.BeamPruneInsideOutside(0.5, false, 0.5);
+
+   cost = ViterbiESentence(forest, &best_yield);
+   cerr << "Ncst: " << cost << endl;
+   cerr << TD::GetString(best_yield) << "\n";
+   forest.PrintGraphviz();
+ }
+
+ // Smoke test: marks every edge of the lattice forest except edge 6 in the
+ // mask handed to PruneEdges and prints the graph before and after.
+ TEST_F(HGTest,TestPruneEdges) {
+   Hypergraph forest;
+   CreateLatticeHG(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 1.0);
+   forest.Reweight(weights);
+   forest.PrintGraphviz();
+
+   vector<bool> edge_mask(forest.edges_.size(), true);
+   edge_mask[6] = false;
+   forest.PruneEdges(edge_mask);
+
+   cerr << "Pruned:\n";
+   forest.PrintGraphviz();
+ }
+
+ // Expects 4 derivations in the CreateHG_int forest, then intersects the
+ // forest with the two-word lattice "a b" and prints the result.
+ TEST_F(HGTest,TestIntersect) {
+   typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> KBestList;
+
+   Hypergraph forest;
+   CreateHG_int(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 1.0);
+   forest.Reweight(weights);
+   forest.PrintGraphviz();
+
+   int best = 0;
+   KBestList kbest(forest, 10);
+   for (int rank = 0; rank < 10; ++rank) {
+     const KBestList::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, rank);
+     if (!d) break;
+     cerr << TD::GetString(d->yield) << endl;
+     ++best;
+   }
+   EXPECT_EQ(4, best);
+
+   // One arc per position, each advancing by one lattice position.
+   Lattice target(2);
+   target[0].push_back(LatticeArc(TD::Convert("a"), 0.0, 1));
+   target[1].push_back(LatticeArc(TD::Convert("b"), 0.0, 1));
+   HG::Intersect(target, &forest);
+   forest.PrintGraphviz();
+ }
+
+ // Smoke test: passes PruneEdges a mask in which only edges 0 and 1 are set
+ // and prints the graph before and after (see the TODO printed at the end).
+ TEST_F(HGTest,TestPrune2) {
+   Hypergraph forest;
+   CreateHG_int(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 1.0);
+   forest.Reweight(weights);
+   forest.PrintGraphviz();
+
+   vector<bool> edge_mask(forest.edges_.size(), false);
+   edge_mask[0] = true;
+   edge_mask[1] = true;
+   forest.PruneEdges(edge_mask);
+
+   forest.PrintGraphviz();
+   cerr << "TODO: fix this pruning behavior-- the resulting HG should be empty!\n";
+ }
+
+ // Smoke test: reweights the lattice forest with Feature_1 = 0 and prints the
+ // Viterbi translation, its cost and the graph (nothing is asserted).
+ TEST_F(HGTest,Sample) {
+   Hypergraph forest;
+   CreateLatticeHG(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("Feature_1"), 0.0);
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   prob_t cost = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "cost: " << cost << "\n";
+   forest.PrintGraphviz();
+ }
+
+ // Round-trips a PLF string through ReadFromPLF / AsPLF and expects the
+ // serialized form to equal the input.
+ TEST_F(HGTest,PLF) {
+   Hypergraph hg;
+   string inplf = "((('haupt',-2.06655,1),('hauptgrund',-5.71033,2),),(('grund',-1.78709,1),),(('für\\'',0.1,1),),)";
+   HypergraphIO::ReadFromPLF(inplf, &hg);
+   SparseVector<double> wts;
+   wts.set_value(FD::Convert("Feature_0"), 1.0);
+   hg.Reweight(wts);
+   hg.PrintGraphviz();
+   string outplf = HypergraphIO::AsPLF(hg);
+   cerr << " IN: " << inplf << endl;
+   cerr << "OUT: " << outplf << endl;
+   // Use a gtest check rather than assert(): assert() is compiled out under
+   // NDEBUG and aborts the whole test binary instead of reporting a failure.
+   EXPECT_EQ(inplf, outplf);
+ }
+
+ // Smoke test: prints the main forest before and after PushWeightsToGoal()
+ // (nothing is asserted).
+ TEST_F(HGTest,PushWeightsToGoal) {
+   Hypergraph forest;
+   CreateHG(&forest);
+   vector<double> weights(2);
+   weights[0] = 0.4;
+   weights[1] = 0.8;
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   prob_t cost = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "cost: " << cost << "\n";
+
+   forest.PrintGraphviz();
+   forest.PushWeightsToGoal();
+   forest.PrintGraphviz();
+ }
+
+ // Smoke test: enumerates up to 100000 derivations of the balanced forest
+ // with all-zero weights and prints each yield (nothing is asserted).
+ TEST_F(HGTest,TestSpecialKBest) {
+   typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> KBestList;
+
+   Hypergraph forest;
+   CreateHGBalanced(&forest);
+   vector<double> weights(1);
+   weights[0] = 0;
+   forest.Reweight(weights);
+
+   KBestList kbest(forest, 100000);
+   for (int rank = 0; rank < 100000; ++rank) {
+     const KBestList::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, rank);
+     if (!d) break;
+     cerr << TD::GetString(d->yield) << endl;
+   }
+   forest.PrintGraphviz();
+ }
+
+ // Checks the Viterbi translation and log score of the tiny forest under the
+ // weights f1=0.4, f2=0.8.
+ TEST_F(HGTest, TestGenericViterbi) {
+   Hypergraph forest;
+   CreateHG_tiny(&forest);
+
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 0.4);
+   weights.set_value(FD::Convert("f2"), 0.8);
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   const prob_t best_prob = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "prob: " << best_prob << "\n";
+   EXPECT_FLOAT_EQ(-80.839996, log(best_prob));
+   EXPECT_EQ("X <s>", TD::GetString(best_yield));
+ }
+
+ // Checks the inside score and the per-node outside scores of the tiny
+ // lattice forest against values computed by hand.
+ TEST_F(HGTest, TestGenericInside) {
+   Hypergraph hg;
+   CreateTinyLatticeHG(&hg);
+   SparseVector<double> wts;
+   wts.set_value(FD::Convert("f1"), 1.0);
+   hg.Reweight(wts);
+   vector<prob_t> inside;
+   prob_t ins = Inside<prob_t, EdgeProb>(hg, &inside);
+   EXPECT_FLOAT_EQ(1.7934048, ins); // computed by hand
+   vector<prob_t> outside;
+   Outside<prob_t, EdgeProb>(hg, inside, &outside);
+   // Fatal check: the expectations below index outside[0..2] unconditionally.
+   ASSERT_EQ(3u, outside.size());
+   EXPECT_FLOAT_EQ(1.7934048, outside[0]);
+   EXPECT_FLOAT_EQ(1.3114071, outside[1]);
+   EXPECT_FLOAT_EQ(1.0, outside[2]);
+ }
+
+ // Checks the log inside and log outside score of each of the six nodes of
+ // the main forest under the weights f1=0.4, f2=0.8.
+ TEST_F(HGTest,TestGenericInside2) {
+   Hypergraph forest;
+   CreateHG(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 0.4);
+   weights.set_value(FD::Convert("f2"), 0.8);
+   forest.Reweight(weights);
+
+   vector<prob_t> inside, outside;
+   prob_t ins = Inside<prob_t, EdgeProb>(forest, &inside);
+   Outside<prob_t, EdgeProb>(forest, inside, &outside);
+   for (int node = 0; node < forest.nodes_.size(); ++node)
+     cerr << node << "\t" << log(inside[node]) << "\t" << log(outside[node]) << endl;
+
+   // node 0
+   EXPECT_FLOAT_EQ(0, log(inside[0]));
+   EXPECT_FLOAT_EQ(-1.7861683, log(outside[0]));
+   // node 1
+   EXPECT_FLOAT_EQ(-0.4, log(inside[1]));
+   EXPECT_FLOAT_EQ(-1.3861683, log(outside[1]));
+   // node 2
+   EXPECT_FLOAT_EQ(-0.8, log(inside[2]));
+   EXPECT_FLOAT_EQ(-0.986168, log(outside[2]));
+   // node 3
+   EXPECT_FLOAT_EQ(-0.96, log(inside[3]));
+   EXPECT_FLOAT_EQ(-0.8261683, log(outside[3]));
+   // node 4
+   EXPECT_FLOAT_EQ(-1.562512, log(inside[4]));
+   EXPECT_FLOAT_EQ(-0.22365622, log(outside[4]));
+   // node 5 (last node)
+   EXPECT_FLOAT_EQ(-1.7861683, log(inside[5]));
+   EXPECT_FLOAT_EQ(0, log(outside[5]));
+ }
+
+ // Checks the normalized feature expectations of f1 and f2 on the main
+ // forest, as accumulated by InsideOutside.
+ TEST_F(HGTest,TestAddExpectations) {
+   Hypergraph forest;
+   CreateHG(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 0.4);
+   weights.set_value(FD::Convert("f2"), 0.8);
+   forest.Reweight(weights);
+
+   SparseVector<prob_t> feat_exps;
+   // z is the value returned by InsideOutside (the inside score of the forest).
+   prob_t z = InsideOutside<prob_t, EdgeProb,
+                            SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &feat_exps);
+   EXPECT_FLOAT_EQ(-2.5439765, feat_exps[FD::Convert("f1")] / z);
+   EXPECT_FLOAT_EQ(-2.6357865, feat_exps[FD::Convert("f2")] / z);
+   cerr << feat_exps << endl;
+   cerr << "Z=" << z << endl;
+ }
+
+ // Loads test_data/small.json.gz, reweights it with the Model_0..Model_7
+ // weights below and checks the scaled inside score.
+ TEST_F(HGTest, Small) {
+   ReadFile rf("test_data/small.json.gz");
+   Hypergraph hg;
+   // The read must not live inside assert(): with NDEBUG the whole call would
+   // be compiled out and the rest of the test would run on an empty forest.
+   ASSERT_TRUE(HypergraphIO::ReadFromJSON(rf.stream(), &hg));
+   SparseVector<double> wts;
+   wts.set_value(FD::Convert("Model_0"), -2.0);
+   wts.set_value(FD::Convert("Model_1"), -0.5);
+   wts.set_value(FD::Convert("Model_2"), -1.1);
+   wts.set_value(FD::Convert("Model_3"), -1.0);
+   wts.set_value(FD::Convert("Model_4"), -1.0);
+   wts.set_value(FD::Convert("Model_5"), 0.5);
+   wts.set_value(FD::Convert("Model_6"), 0.2);
+   wts.set_value(FD::Convert("Model_7"), -3.0);
+   hg.Reweight(wts);
+   vector<WordID> trans;
+   prob_t cost = ViterbiESentence(hg, &trans);
+   cerr << TD::GetString(trans) << "\n";
+   cerr << "cost: " << cost << "\n";
+   prob_t c2 = Inside<prob_t, ScaledEdgeProb>(hg, NULL, ScaledEdgeProb(0.6));
+   EXPECT_FLOAT_EQ(2.1431036, log(c2));
+ }
+
+ // Checks JSONParser::WriteEscapedString: the output is wrapped in double
+ // quotes and embedded double quotes are backslash-escaped.
+ TEST_F(HGTest, JSONTest) {
+   ostringstream quoted_out;
+   JSONParser::WriteEscapedString("\"I don't know\", she said.", &quoted_out);
+   EXPECT_EQ("\"\\\"I don't know\\\", she said.\"", quoted_out.str());
+
+   ostringstream plain_out;
+   JSONParser::WriteEscapedString("yes", &plain_out);
+   EXPECT_EQ("\"yes\"", plain_out.str());
+ }
+
+ // Smoke test: prints up to 1000 derivations of the main forest together
+ // with their feature vectors (nothing is asserted).
+ TEST_F(HGTest, TestGenericKBest) {
+   typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> KBestList;
+
+   Hypergraph forest;
+   CreateHG(&forest);
+   //CreateHGBalanced(&forest);
+   SparseVector<double> weights;
+   weights.set_value(FD::Convert("f1"), 0.4);
+   weights.set_value(FD::Convert("f2"), 1.0);
+   forest.Reweight(weights);
+
+   vector<WordID> best_yield;
+   prob_t cost = ViterbiESentence(forest, &best_yield);
+   cerr << TD::GetString(best_yield) << "\n";
+   cerr << "cost: " << cost << "\n";
+
+   KBestList kbest(forest, 1000);
+   for (int rank = 0; rank < 1000; ++rank) {
+     const KBestList::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, rank);
+     if (!d) break;
+     cerr << TD::GetString(d->yield) << " F:" << d->feature_values << endl;
+   }
+ }
+
+ // Serializes the main forest to JSON, reads it back and checks that the
+ // number of paths and the j_ / prev_i_ edge fields survive the round trip.
+ TEST_F(HGTest, TestReadWriteHG) {
+   Hypergraph hg,hg2;
+   CreateHG(&hg);
+   hg.edges_.front().j_ = 23;
+   hg.edges_.back().prev_i_ = 99;
+   ostringstream os;
+   // Both calls return a bool; stop here if either step fails rather than
+   // comparing against a half-built (or empty) hg2.
+   ASSERT_TRUE(HypergraphIO::WriteToJSON(hg, false, &os));
+   istringstream is(os.str());
+   ASSERT_TRUE(HypergraphIO::ReadFromJSON(&is, &hg2));
+   ASSERT_FALSE(hg2.edges_.empty());
+   EXPECT_EQ(hg2.NumberOfPaths(), hg.NumberOfPaths());
+   EXPECT_EQ(hg2.edges_.front().j_, 23);
+   EXPECT_EQ(hg2.edges_.back().prev_i_, 99);
+ }
+
+ // Test driver: lets gtest consume its command-line flags, runs every
+ // registered test and returns gtest's status as the process exit code.
+ int main(int argc, char **argv) {
+   testing::InitGoogleTest(&argc, argv);
+   const int status = RUN_ALL_TESTS();
+   return status;
+ }
diff --git a/decoder/inside_outside.h b/decoder/inside_outside.h
new file mode 100644
index 00000000..3c7518f2
--- /dev/null
+++ b/decoder/inside_outside.h
@@ -0,0 +1,112 @@
+#ifndef _INSIDE_H_
+#define _INSIDE_H_
+
+#include <vector>
+#include <algorithm>
+#include "hg.h"
+
+ // Runs the inside algorithm and returns the inside score of the last node.
+ // If result is non-NULL, *result is resized to the number of nodes and
+ // receives the inside score of every node.
+ // Nodes are visited in index order, so every tail node of an edge must have
+ // a smaller index than the edge's head node.
+ // A node without incoming edges gets the multiplicative identity.
+ // For a hypergraph without nodes the additive identity is returned.
+ // NOTE: WeightType() must construct the semiring's additive identity
+ //       WeightType(1) must construct the semiring's multiplicative identity
+ template<typename WeightType, typename WeightFunction>
+ WeightType Inside(const Hypergraph& hg,
+                   std::vector<WeightType>* result = NULL,
+                   const WeightFunction& weight = WeightFunction()) {
+   const int num_nodes = hg.nodes_.size();
+   std::vector<WeightType> dummy;
+   std::vector<WeightType>& inside_score = result ? *result : dummy;
+   inside_score.assign(num_nodes, WeightType());
+   // Guard: calling back() on an empty vector is undefined behavior.
+   if (num_nodes == 0) return WeightType();
+   for (int i = 0; i < num_nodes; ++i) {
+     const Hypergraph::Node& cur_node = hg.nodes_[i];
+     WeightType* const cur_node_inside_score = &inside_score[i];
+     const int num_in_edges = cur_node.in_edges_.size();
+     if (num_in_edges == 0) {
+       *cur_node_inside_score = WeightType(1);
+       continue;
+     }
+     for (int j = 0; j < num_in_edges; ++j) {
+       const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+       // edge weight times the inside scores of all of its tail nodes
+       WeightType score = weight(edge);
+       const int num_tail_nodes = edge.tail_nodes_.size();
+       for (int k = 0; k < num_tail_nodes; ++k) {
+         const int tail_node_index = edge.tail_nodes_[k];
+         score *= inside_score[tail_node_index];
+       }
+       *cur_node_inside_score += score;
+     }
+   }
+   return inside_score.back();
+ }
+
+ // Runs the outside algorithm. inside_score must hold the per-node inside
+ // scores (as filled in by Inside); *result is resized to the number of
+ // nodes and receives the outside score of every node. The last node gets
+ // the multiplicative identity; nodes are processed from last to first.
+ // Nothing is computed for a hypergraph without nodes.
+ template<typename WeightType, typename WeightFunction>
+ void Outside(const Hypergraph& hg,
+              std::vector<WeightType>& inside_score,
+              std::vector<WeightType>* result,
+              const WeightFunction& weight = WeightFunction()) {
+   assert(result);
+   const int num_nodes = hg.nodes_.size();
+   assert(static_cast<int>(inside_score.size()) == num_nodes);
+   std::vector<WeightType>& outside_score = *result;
+   outside_score.assign(num_nodes, WeightType());
+   // Guard: calling back() on an empty vector is undefined behavior.
+   if (num_nodes == 0) return;
+   outside_score.back() = WeightType(1);
+   for (int i = num_nodes - 1; i >= 0; --i) {
+     const Hypergraph::Node& cur_node = hg.nodes_[i];
+     const WeightType& head_node_outside_score = outside_score[i];
+     const int num_in_edges = cur_node.in_edges_.size();
+     for (int j = 0; j < num_in_edges; ++j) {
+       const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+       WeightType head_and_edge_weight = weight(edge);
+       head_and_edge_weight *= head_node_outside_score;
+       const int num_tail_nodes = edge.tail_nodes_.size();
+       for (int k = 0; k < num_tail_nodes; ++k) {
+         const int update_tail_node_index = edge.tail_nodes_[k];
+         WeightType* const tail_outside_score = &outside_score[update_tail_node_index];
+         // Product of the inside scores of all *other* tail positions.
+         // Siblings are told apart by position, not by node index, so an edge
+         // that lists the same node twice still contributes that node's
+         // inside score for the other occurrence.
+         WeightType inside_contribution = WeightType(1);
+         for (int l = 0; l < num_tail_nodes; ++l) {
+           if (l != k)
+             inside_contribution *= inside_score[edge.tail_nodes_[l]];
+         }
+         inside_contribution *= head_and_edge_weight;
+         *tail_outside_score += inside_contribution;
+       }
+     }
+   }
+ }
+
+ // Inside-Outside optimization described in Li and Eisner (EMNLP 2009), Figure 4,
+ // for computing the inside algorithm over expensive semirings (such as
+ // expectations over features).
+ // Runs Inside and Outside with kwf, then accumulates into *result_x, for every
+ // edge, xwf(edge) times the outside score of the edge's head node times the
+ // inside scores of the edge's tail nodes. Returns the value computed by Inside.
+ // NOTE: XType * KType must be valid (and yield XType)
+ // NOTE: This may do things slightly differently than you are used to, please
+ //       read the description in Li and Eisner (2009) carefully!
+ template<typename KType, typename KWeightFunction, typename XType, typename XWeightFunction>
+ KType InsideOutside(const Hypergraph& hg,
+                     XType* result_x,
+                     const KWeightFunction& kwf = KWeightFunction(),
+                     const XWeightFunction& xwf = XWeightFunction()) {
+   const int node_count = hg.nodes_.size();
+   std::vector<KType> inside, outside;
+   const KType total = Inside<KType,KWeightFunction>(hg, &inside, kwf);
+   Outside<KType,KWeightFunction>(hg, inside, &outside, kwf);
+
+   XType& accum = *result_x;
+   accum = XType();  // default constructor is semiring 0
+   for (int head = 0; head < node_count; ++head) {
+     const Hypergraph::Node& node = hg.nodes_[head];
+     const int in_edge_count = node.in_edges_.size();
+     for (int e = 0; e < in_edge_count; ++e) {
+       const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[e]];
+       // kbar_e: outside(head) times the inside scores of all tails of the edge
+       KType kbar_e = outside[head];
+       const int tail_count = edge.tail_nodes_.size();
+       for (int t = 0; t < tail_count; ++t)
+         kbar_e *= inside[edge.tail_nodes_[t]];
+       accum += xwf(edge) * kbar_e;
+     }
+   }
+   return total;
+ }
+
+#endif
diff --git a/decoder/json_parse.cc b/decoder/json_parse.cc
new file mode 100644
index 00000000..f6fdfea8
--- /dev/null
+++ b/decoder/json_parse.cc
@@ -0,0 +1,50 @@
+#include "json_parse.h"
+
+#include <string>
+#include <iostream>
+
+using namespace std;
+
+static const char *json_hex_chars = "0123456789abcdef";
+
+ // Writes in to *out as a double-quoted JSON string.
+ // Backspace, newline, carriage return, tab, '"', '\\' and '/' get their
+ // two-character backslash escapes; any other character below ' ' is written
+ // as \u00XX and a warning is printed to cerr. All other bytes are copied
+ // unchanged, in runs.
+ void JSONParser::WriteEscapedString(const string& in, ostream* out) {
+   ostream& os = *out;
+   const int length = in.size();
+   int run_begin = 0;  // first character not yet written
+   os << '"';
+   for (int i = 0; i < length; ++i) {
+     const unsigned char c = in[i];
+     const char* escape = 0;
+     switch (c) {
+       case '\b': escape = "\\b";  break;
+       case '\n': escape = "\\n";  break;
+       case '\r': escape = "\\r";  break;
+       case '\t': escape = "\\t";  break;
+       case '"':  escape = "\\\""; break;
+       case '\\': escape = "\\\\"; break;
+       case '/':  escape = "\\/";  break;
+       default: break;
+     }
+     const bool is_control = (!escape && c < ' ');
+     if (!escape && !is_control) continue;  // ordinary character: extend the run
+
+     if (is_control)
+       cerr << "Warning, bad character (" << static_cast<int>(c) << ") in string\n";
+     // flush the run of ordinary characters that precedes this one
+     if (i - run_begin > 0)
+       os << in.substr(run_begin, i - run_begin);
+     if (escape)
+       os << escape;
+     else
+       os << "\\u00" << json_hex_chars[c >> 4] << json_hex_chars[c & 0xf];
+     run_begin = i + 1;
+   }
+   if (length - run_begin > 0)
+     os << in.substr(run_begin, length - run_begin);
+   os << '"';
+ }
+
diff --git a/decoder/json_parse.h b/decoder/json_parse.h
new file mode 100644
index 00000000..c3cba954
--- /dev/null
+++ b/decoder/json_parse.h
@@ -0,0 +1,58 @@
+#ifndef _JSON_WRAPPER_H_
+#define _JSON_WRAPPER_H_
+
+#include <iostream>
+#include <cassert>
+#include "JSON_parser.h"
+
+ // C++ wrapper around the C JSON_parser library. Subclasses implement
+ // HandleJSONEvent(), which is invoked for every event the parser reports
+ // while Parse() feeds it the input one character at a time.
+ class JSONParser {
+  public:
+   // Configures the underlying parser (maximum depth 10, comments allowed,
+   // floats handled manually) and registers Callback() as its event handler.
+   JSONParser() {
+     init_JSON_config(&config);
+     // NOTE(review): the member function pointer is reinterpreted as the C
+     // callback type through a union. This relies on the ABI passing `this`
+     // like a leading pointer argument and is not portable C++; a static
+     // trampoline that casts callback_ctx back would be the safe form.
+     hack.mf = &JSONParser::Callback;
+     config.depth = 10;
+     config.callback_ctx = reinterpret_cast<void*>(this);
+     config.callback = hack.cb;
+     config.allow_comments = 1;
+     config.handle_floats_manually = 1;
+     jc = new_JSON_parser(&config);
+   }
+   virtual ~JSONParser() {
+     delete_JSON_parser(jc);
+   }
+   // Feeds the stream to the parser until it is exhausted.
+   // Returns false (after printing the 1-based line and the byte offset to
+   // cerr) on the first syntax error, or if the document is incomplete at
+   // the end of the input; returns true otherwise.
+   bool Parse(std::istream* in) {
+     int count = 0;
+     int lc = 1;
+     for (; in ; ++count) {
+       int next_char = in->get();
+       if (!in->good()) break;
+       if (!JSON_parser_char(jc, next_char)) {
+         std::cerr << "JSON_parser_char: syntax error, line " << lc << " (byte " << count << ")" << std::endl;
+         return false;
+       }
+       // Count the line break only after the character was accepted, so an
+       // error is reported on the line that contains the offending byte.
+       if (next_char == '\n') { ++lc; }
+     }
+     if (!JSON_parser_done(jc)) {
+       std::cerr << "JSON_parser_done: syntax error\n";
+       return false;
+     }
+     return true;
+   }
+   static void WriteEscapedString(const std::string& in, std::ostream* out);
+  protected:
+   // Called for every parser event; return false to abort parsing.
+   virtual bool HandleJSONEvent(int type, const JSON_value* value) = 0;
+  private:
+   // Adapts the bool result of HandleJSONEvent() to the int the C library expects.
+   int Callback(int type, const JSON_value* value) {
+     if (HandleJSONEvent(type, value)) return 1;
+     return 0;
+   }
+   JSON_parser_struct* jc;
+   JSON_config config;
+   typedef int (JSONParser::* MF)(int type, const struct JSON_value_struct* value);
+   union CBHack {
+     JSON_parser_callback cb;
+     MF mf;
+   } hack;
+ };
+
+#endif
diff --git a/decoder/kbest.h b/decoder/kbest.h
new file mode 100644
index 00000000..fcd40fcd
--- /dev/null
+++ b/decoder/kbest.h
@@ -0,0 +1,208 @@
+#ifndef _HG_KBEST_H_
+#define _HG_KBEST_H_
+
+#include <vector>
+#include <utility>
+#include <tr1/unordered_set>
+
+#include <boost/shared_ptr.hpp>
+
+#include "wordid.h"
+#include "hg.h"
+
+namespace KBest {
+ // Default derivation filter: accepts every derivation, i.e. nothing is
+ // removed from the k-best list.
+ struct NoFilter {
+   // Returns true if the derivation should be dropped; never the case here.
+   bool operator()(const std::vector<WordID>& /* yield */) {
+     return false;
+   }
+ };
+
+ // Optional derivation filter: keeps only the first derivation of every
+ // distinct yield string.
+ struct FilterUnique {
+   // yields seen so far
+   std::tr1::unordered_set<std::vector<WordID>, boost::hash<std::vector<WordID> > > unique;
+
+   // Records yield and returns true (drop the derivation) if it was seen before.
+   bool operator()(const std::vector<WordID>& yield) {
+     const bool first_time = unique.insert(yield).second;
+     return !first_time;
+   }
+ };
+
+ // utility class to lazily create the k-best derivations from a forest, uses
+ // the lazy k-best algorithm (Algorithm 3) from Huang and Chiang (IWPT 2005)
+ // Usage: construct with the forest and k, then call LazyKthBest(node, i)
+ // for i = 0, 1, ... until it returns NULL.
+ template<typename T, // yield type (returned by Traversal)
+ typename Traversal,
+ typename DerivationFilter = NoFilter,
+ typename WeightType = prob_t,
+ typename WeightFunction = EdgeProb>
+ struct KBestDerivations {
+ // hg is held by reference; k bounds the initial candidate list of each node
+ KBestDerivations(const Hypergraph& hg,
+ const size_t k,
+ const Traversal& tf = Traversal(),
+ const WeightFunction& wf = WeightFunction()) :
+ traverse(tf), w(wf), g(hg), nds(g.nodes_.size()), k_prime(k) {}
+
+ // frees every Derivation created by CreateDerivation
+ ~KBestDerivations() {
+ for (int i = 0; i < freelist.size(); ++i)
+ delete freelist[i];
+ }
+
+ // one derivation: an edge plus, per tail node, the rank (j) of the
+ // sub-derivation chosen for that tail
+ struct Derivation {
+ Derivation(const Hypergraph::Edge& e,
+ const SmallVector& jv,
+ const WeightType& w,
+ const SparseVector<double>& f) :
+ edge(&e),
+ j(jv),
+ score(w),
+ feature_values(f) {}
+
+ // dummy constructor, just for query
+ Derivation(const Hypergraph::Edge& e,
+ const SmallVector& jv) : edge(&e), j(jv) {}
+
+ T yield;
+ const Hypergraph::Edge* const edge;
+ const SmallVector j;
+ const WeightType score;
+ const SparseVector<double> feature_values;
+ };
+ // ordering for the std heap functions: the highest score ends up on top
+ struct HeapCompare {
+ bool operator()(const Derivation* a, const Derivation* b) const {
+ return a->score < b->score;
+ }
+ };
+ // descending-score ordering, used to select the best initial candidates
+ struct DerivationCompare {
+ bool operator()(const Derivation* a, const Derivation* b) const {
+ return a->score > b->score;
+ }
+ };
+ // hash over the edge id and the rank vector j
+ struct DerivationUniquenessHash {
+ size_t operator()(const Derivation* d) const {
+ size_t x = 5381;
+ x = ((x << 5) + x) ^ d->edge->id_;
+ for (int i = 0; i < d->j.size(); ++i)
+ x = ((x << 5) + x) ^ d->j[i];
+ return x;
+ }
+ };
+ // two derivations are the same if they use the same edge and the same ranks
+ struct DerivationUniquenessEquals {
+ bool operator()(const Derivation* a, const Derivation* b) const {
+ return (a->edge == b->edge) && (a->j == b->j);
+ }
+ };
+ typedef std::vector<Derivation*> CandidateHeap;
+ typedef std::vector<Derivation*> DerivationList;
+ typedef std::tr1::unordered_set<
+ const Derivation*, DerivationUniquenessHash, DerivationUniquenessEquals> UniqueDerivationSet;
+
+ // per-node state of the lazy enumeration
+ struct NodeDerivationState {
+ CandidateHeap cand; // heap of candidates not yet extracted
+ DerivationList D; // derivations extracted so far, in extraction order
+ DerivationFilter filter;
+ UniqueDerivationSet ds; // candidates created by LazyNext, to avoid duplicates
+ explicit NodeDerivationState(const DerivationFilter& f = DerivationFilter()) : filter(f) {}
+ };
+
+ // returns the k-th (0-based) derivation of node v that passes the filter,
+ // or NULL if node v has fewer than k+1 such derivations
+ Derivation* LazyKthBest(int v, int k) {
+ NodeDerivationState& s = GetCandidates(v);
+ CandidateHeap& cand = s.cand;
+ DerivationList& D = s.D;
+ DerivationFilter& filter = s.filter;
+ bool add_next = true;
+ while (D.size() <= k) {
+ // expand the successors of the most recently accepted derivation
+ if (add_next && D.size() > 0) {
+ const Derivation* d = D.back();
+ LazyNext(d, &cand, &s.ds);
+ }
+ add_next = false;
+
+ if (cand.size() > 0) {
+ std::pop_heap(cand.begin(), cand.end(), HeapCompare());
+ Derivation* d = cand.back();
+ cand.pop_back();
+ // collect the yields of the antecedents and build this derivation's yield
+ std::vector<const T*> ants(d->edge->Arity());
+ for (int j = 0; j < ants.size(); ++j)
+ ants[j] = &LazyKthBest(d->edge->tail_nodes_[j], d->j[j])->yield;
+ traverse(*d->edge, ants, &d->yield);
+ if (!filter(d->yield)) {
+ D.push_back(d);
+ add_next = true;
+ }
+ } else {
+ break;
+ }
+ }
+ if (k < D.size()) return D[k]; else return NULL;
+ }
+
+ private:
+ // creates a derivation object with all fields set but the yield
+ // the yield is computed in LazyKthBest before the derivation is added to D
+ // returns NULL if j refers to derivation numbers larger than the
+ // antecedent structure defines
+ Derivation* CreateDerivation(const Hypergraph::Edge& e, const SmallVector& j) {
+ WeightType score = w(e);
+ SparseVector<double> feats = e.feature_values_;
+ for (int i = 0; i < e.Arity(); ++i) {
+ const Derivation* ant = LazyKthBest(e.tail_nodes_[i], j[i]);
+ if (!ant) { return NULL; }
+ score *= ant->score;
+ feats += ant->feature_values;
+ }
+ freelist.push_back(new Derivation(e, j, score, feats));
+ return freelist.back();
+ }
+
+ // returns the state of node v; while both D and cand are empty it (re)builds
+ // the candidate heap from the best derivation of each incoming edge
+ NodeDerivationState& GetCandidates(int v) {
+ NodeDerivationState& s = nds[v];
+ if (!s.D.empty() || !s.cand.empty()) return s;
+
+ const Hypergraph::Node& node = g.nodes_[v];
+ for (int i = 0; i < node.in_edges_.size(); ++i) {
+ const Hypergraph::Edge& edge = g.edges_[node.in_edges_[i]];
+ SmallVector jv(edge.Arity(), 0);
+ Derivation* d = CreateDerivation(edge, jv);
+ assert(d);
+ s.cand.push_back(d);
+ }
+
+ // keep only the k_prime best candidates, then turn them into a heap
+ const int effective_k = std::min(k_prime, s.cand.size());
+ const typename CandidateHeap::iterator kth = s.cand.begin() + effective_k;
+ std::nth_element(s.cand.begin(), kth, s.cand.end(), DerivationCompare());
+ s.cand.resize(effective_k);
+ std::make_heap(s.cand.begin(), s.cand.end(), HeapCompare());
+
+ return s;
+ }
+
+ // pushes onto cand every successor of d (d with one rank incremented) that
+ // exists and has not been created before
+ void LazyNext(const Derivation* d, CandidateHeap* cand, UniqueDerivationSet* ds) {
+ for (int i = 0; i < d->j.size(); ++i) {
+ SmallVector j = d->j;
+ ++j[i];
+ const Derivation* ant = LazyKthBest(d->edge->tail_nodes_[i], j[i]);
+ if (ant) {
+ Derivation query_unique(*d->edge, j);
+ if (ds->count(&query_unique) == 0) {
+ Derivation* new_d = CreateDerivation(*d->edge, j);
+ if (new_d) {
+ cand->push_back(new_d);
+ std::push_heap(cand->begin(), cand->end(), HeapCompare());
+ bool inserted = ds->insert(new_d).second; // insert into uniqueness set
+ assert(inserted);
+ }
+ }
+ }
+ }
+ }
+
+ const Traversal traverse;
+ const WeightFunction w;
+ const Hypergraph& g;
+ std::vector<NodeDerivationState> nds; // one state per node, indexed by node id
+ std::vector<Derivation*> freelist; // owns every Derivation; released in the destructor
+ const size_t k_prime;
+ };
+}
+
+#endif
diff --git a/decoder/lattice.cc b/decoder/lattice.cc
new file mode 100644
index 00000000..e3631e59
--- /dev/null
+++ b/decoder/lattice.cc
@@ -0,0 +1,62 @@
+#include "lattice.h"
+
+#include "tdict.h"
+#include "hg_io.h"
+
+using namespace std;
+
+static const int kUNREACHABLE = 99999999;
+
+ void Lattice::ComputeDistances() {
+   // one more node than there are arc columns
+   const int num_nodes = this->size() + 1;
+   dist_.resize(num_nodes, num_nodes, kUNREACHABLE);
+
+   // every arc is a path of length one to the node it lands on
+   for (int from = 0; from < this->size(); ++from) {
+     const vector<LatticeArc>& arcs = (*this)[from];
+     for (int a = 0; a < arcs.size(); ++a)
+       dist_(from, from + arcs[a].dist2next) = 1;
+   }
+
+   // all-pairs shortest paths: relax every (from, to) through every via node
+   for (int via = 0; via < num_nodes; ++via) {
+     for (int from = 0; from < num_nodes; ++from) {
+       for (int to = 0; to < num_nodes; ++to) {
+         const int through = dist_(from, via) + dist_(via, to);
+         if (through < dist_(from, to))
+           dist_(from, to) = through;
+       }
+     }
+   }
+
+   // scan each row right to left; an entry still marked unreachable takes
+   // the value of the closest reachable entry to its right (if any)
+   for (int from = 0; from < num_nodes; ++from) {
+     int carried = kUNREACHABLE;
+     for (int to = num_nodes - 1; to >= 0; --to) {
+       const int cur = dist_(from, to);
+       if (cur >= kUNREACHABLE)
+         dist_(from, to) = carried;
+       else
+         carried = cur;
+     }
+   }
+   // cerr << dist_ << endl;
+ }
+
+ bool LatticeTools::LooksLikePLF(const string &line) {
+   // cheap syntactic sniff: long enough, and opens with three parentheses
+   // followed by a single quote
+   if (line.size() <= 5) return false;
+   return line.compare(0, 4, "((('") == 0;
+ }
+
+ void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) {
+   // tokenize into word ids, then give every token its own column holding
+   // a single zero-cost arc to the adjacent position
+   vector<WordID> words;
+   TD::ConvertSentence(text, &words);
+   Lattice& lat = *pl;
+   lat.resize(words.size());
+   for (int pos = 0; pos < lat.size(); ++pos) {
+     const LatticeArc arc(words[pos], 0.0, 1);
+     lat[pos].push_back(arc);
+   }
+   lat.is_sentence_ = true;
+ }
+
+ void LatticeTools::ConvertTextOrPLF(const string& text_or_plf, Lattice* pl) {
+   // pick the reader from the shape of the input, then (re)build the
+   // distance table for whatever was read
+   const bool is_plf = LooksLikePLF(text_or_plf);
+   if (!is_plf) {
+     ConvertTextToLattice(text_or_plf, pl);
+   } else {
+     HypergraphIO::PLFtoLattice(text_or_plf, pl);
+   }
+   pl->ComputeDistances();
+ }
+
diff --git a/decoder/lattice.h b/decoder/lattice.h
new file mode 100644
index 00000000..ad4ca50d
--- /dev/null
+++ b/decoder/lattice.h
@@ -0,0 +1,46 @@
+#ifndef __LATTICE_H_
+#define __LATTICE_H_
+
+#include <string>
+#include <vector>
+#include "wordid.h"
+#include "array2d.h"
+
+ class Lattice;  // forward declaration so LatticeTools can name Lattice* before the class is defined
+ struct LatticeTools {  // static helpers that fill a Lattice from an input string
+ static bool LooksLikePLF(const std::string &line);  // true if line has more than 5 chars and starts with ((('
+ static void ConvertTextToLattice(const std::string& text, Lattice* pl);  // one single-arc column per token; marks *pl as a sentence
+ static void ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl);  // dispatches on LooksLikePLF, then computes distances
+ };
+
+ struct LatticeArc {  // one labelled arc leaving a lattice position
+ WordID label;  // word id carried by the arc
+ double cost;
+ int dist2next;  // offset from the arc's start position to the position it ends at
+ LatticeArc() : label(), cost(), dist2next() {}  // value-initializes every field
+ LatticeArc(WordID w, double c, int i) : label(w), cost(c), dist2next(i) {}
+ };
+
+ class Lattice : public std::vector<std::vector<LatticeArc> > {  // element i holds the arcs leaving position i
+ friend void LatticeTools::ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl);  // calls the private ComputeDistances()
+ friend void LatticeTools::ConvertTextToLattice(const std::string& text, Lattice* pl);  // sets is_sentence_
+ public:
+ Lattice() : is_sentence_(false) {}
+ explicit Lattice(size_t t, const std::vector<LatticeArc>& v = std::vector<LatticeArc>()) :  // t columns, each a copy of v
+ std::vector<std::vector<LatticeArc> >(t, v),
+ is_sentence_(false) {}
+ int Distance(int from, int to) const {  // table lookup once a distance table exists
+ if (dist_.empty())
+ return (to - from);  // no table computed: fall back to the index difference
+ return dist_(from, to);
+ }
+ // TODO this should actually be computed based on the contents
+ // of the lattice
+ bool IsSentence() const { return is_sentence_; }  // true only after ConvertTextToLattice filled this lattice
+ private:
+ void ComputeDistances();  // fills dist_; defined in lattice.cc
+ Array2D<int> dist_;  // dist_(i, j): stored distance value for going from position i to j
+ bool is_sentence_;
+ };
+
+#endif
diff --git a/decoder/lexalign.cc b/decoder/lexalign.cc
new file mode 100644
index 00000000..6adb1892
--- /dev/null
+++ b/decoder/lexalign.cc
@@ -0,0 +1,129 @@
+#include "lexalign.h"
+
+#include <iostream>
+
+#include "filelib.h"
+#include "hg.h"
+#include "tdict.h"
+#include "grammar.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+ // Builds the alignment forest for a (source, reference) sentence pair:
+ // one node per reference word whose incoming edges are lexical rules, one
+ // per source word (plus an optional null word), chained with a binary rule.
+ struct LexicalAlignImpl {
+   LexicalAlignImpl(const boost::program_options::variables_map& conf) :
+       use_null(conf.count("lexcrf_use_null") > 0),
+       kXCAT(TD::Convert("X")*-1),
+       kNULL(TD::Convert("<eps>")),
+       kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")),
+       kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) {
+   }
+
+   // Adds the trellis nodes and edges for this sentence pair to *forest.
+   // The reference words come from smeta.GetReference().
+   void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) {
+     const int e_len = smeta.GetTargetLength();
+     assert(e_len > 0);
+     const Lattice& target = smeta.GetReference();
+     const int f_len = lattice.size();
+     // source position -1 stands for the null word when it is enabled
+     const int f_start = (use_null ? -1 : 0);
+     int prev_node_id = -1;
+     for (int i = 0; i < e_len; ++i) {  // for each word in the *target*
+       const WordID& e_i = target[i][0].label;
+       const int new_node_id = forest->AddNode(kXCAT)->id_;
+       int num_srcs = 0;
+       for (int j = f_start; j < f_len; ++j) {  // for each word in the source
+         const WordID src_sym = (j < 0 ? kNULL : lattice[j][0].label);
+         const TRulePtr& rule = LexRule(src_sym, e_i);
+         if (!rule) {
+           cerr << TD::Convert(src_sym) << " does not translate to " << TD::Convert(e_i) << endl;
+           continue;
+         }
+         Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+         // i_/j_ carry the source span, prev_i_/prev_j_ the target span
+         edge->i_ = j;
+         edge->j_ = j+1;
+         edge->prev_i_ = i;
+         edge->prev_j_ = i+1;
+         edge->feature_values_ += edge->rule_->GetFeatureValues();
+         ++num_srcs;
+         forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+       }
+       assert(num_srcs > 0);
+       if (prev_node_id < 0) {
+         // first target word: nothing to concatenate with yet
+         prev_node_id = new_node_id;
+       } else {
+         // glue everything so far to this word with the binary rule
+         const int comb_node_id = forest->AddNode(kXCAT)->id_;
+         Hypergraph::TailNodeVector tail(2, prev_node_id);
+         tail[1] = new_node_id;
+         Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail);
+         forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id);
+         prev_node_id = comb_node_id;
+       }
+     }
+     // the most recently added node covers the whole target; hang the goal off it
+     Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+     Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+     Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+     forest->ConnectEdgeToHeadNode(hg_edge, goal);
+   }
+
+   // Feature id for the word pair (f, e), memoized in f2e2fid.
+   // NOTE(review): the null source word is detected as f == 0, while
+   // BuildTrellis passes kNULL for it - confirm these agree.
+   inline int LexFeatureId(const WordID& f, const WordID& e) {
+     map<int, int>& cache = f2e2fid[f];
+     const map<int, int>::iterator hit = cache.find(e);
+     if (hit != cache.end())
+       return hit->second;
+     int& slot = cache[e];
+     if (f == 0)
+       slot = FD::Convert("Lx:<eps>_" + FD::Escape(TD::Convert(e)));
+     else
+       slot = FD::Convert("Lx:" + FD::Escape(TD::Convert(f)) + "_" + FD::Escape(TD::Convert(e)));
+     return slot;
+   }
+
+   // Lexical rule for (f, e), memoized in f2e2rule; the empty kNULL_PTR is
+   // returned when the pair has feature id 0.
+   inline const TRulePtr& LexRule(const WordID& f, const WordID& e) {
+     const int fid = LexFeatureId(f, e);
+     if (fid == 0) { return kNULL_PTR; }
+     map<int, TRulePtr>& cache = f2e2rule[f];
+     const map<int, TRulePtr>::iterator hit = cache.find(e);
+     if (hit != cache.end())
+       return hit->second;
+     TRulePtr& slot = cache[e];
+     slot.reset(TRule::CreateLexicalRule(f, e));
+     slot->scores_.set_value(fid, 1.0);
+     return slot;
+   }
+
+  private:
+   const bool use_null;
+   const WordID kXCAT;
+   const WordID kNULL;
+   const TRulePtr kBINARY;
+   const TRulePtr kGOAL_RULE;
+   const TRulePtr kNULL_PTR;
+   map<int, map<int, TRulePtr> > f2e2rule;
+   map<int, map<int, int> > f2e2fid;
+   GrammarPtr grammar;
+ };
+
+ LexicalAlign::LexicalAlign(const boost::program_options::variables_map& conf) :  // all work is delegated to the pimpl
+ pimpl_(new LexicalAlignImpl(conf)) {}
+
+ // Reads the source sentence into smeta->src_lattice_, builds the alignment
+ // trellis in *forest and reweights it with `weights`.
+ // Aborts the process if the input is not a plain sentence.
+ // Always returns true when it returns.
+ bool LexicalAlign::TranslateImpl(const string& input,
+                                  SentenceMetadata* smeta,
+                                  const vector<double>& weights,
+                                  Hypergraph* forest) {
+   Lattice& lattice = smeta->src_lattice_;
+   LatticeTools::ConvertTextOrPLF(input, &lattice);
+   if (!lattice.IsSentence()) {
+     // lexical models make independence assumptions
+     // that don't work with lattices or conf nets
+     // (the message used to say "LexicalTrans", which is a different class)
+     cerr << "LexicalAlign: cannot deal with lattice source input!\n";
+     abort();
+   }
+   smeta->SetSourceLength(lattice.size());
+   pimpl_->BuildTrellis(lattice, *smeta, forest);
+   forest->is_linear_chain_ = true;
+   forest->Reweight(weights);
+   return true;
+ }
+
diff --git a/decoder/lexalign.h b/decoder/lexalign.h
new file mode 100644
index 00000000..7ba4fe64
--- /dev/null
+++ b/decoder/lexalign.h
@@ -0,0 +1,18 @@
+#ifndef _LEXALIGN_H_
+#define _LEXALIGN_H_
+
+#include "translator.h"
+#include "lattice.h"
+
+ struct LexicalAlignImpl;  // defined in lexalign.cc
+ struct LexicalAlign : public Translator {  // Translator front end; the implementation lives behind pimpl_
+ LexicalAlign(const boost::program_options::variables_map& conf);
+ bool TranslateImpl(const std::string& input,  // source sentence text
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,  // passed to Hypergraph::Reweight
+ Hypergraph* forest);  // output forest
+ private:
+ boost::shared_ptr<LexicalAlignImpl> pimpl_;
+ };
+
+#endif
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
new file mode 100644
index 00000000..3fcd1a7d
--- /dev/null
+++ b/decoder/lextrans.cc
@@ -0,0 +1,119 @@
+#include "lextrans.h"
+
+#include <iostream>
+
+#include "filelib.h"
+#include "hg.h"
+#include "tdict.h"
+#include "grammar.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+ // Loads a phrase-table grammar at construction and, per sentence, builds a
+ // trellis with one node per target position whose incoming edges are all
+ // rules the grammar offers for each source word (plus an optional null word).
+ struct LexicalTransImpl {
+   LexicalTransImpl(const boost::program_options::variables_map& conf) :
+       use_null(conf.count("lexcrf_use_null") > 0),
+       kXCAT(TD::Convert("X")*-1),
+       kNULL(TD::Convert("<eps>")),
+       kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")),
+       kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) {
+     // exactly one grammar file is supported
+     vector<string> gfiles = conf["grammar"].as<vector<string> >();
+     assert(gfiles.size() == 1);
+     ReadFile rf(gfiles.front());
+     TextGrammar *text_grammar = new TextGrammar;
+     grammar.reset(text_grammar);
+     istream* in = rf.stream();
+     int num_rules = 0;
+     bool dots_pending = false;  // a progress dot was printed without a newline yet
+     while(*in) {
+       string line;
+       getline(*in, line);
+       if (line.empty()) continue;  // blank lines are not counted as rules
+       ++num_rules;
+       TRulePtr rule(TRule::CreateRulePhrasetable(line));
+       text_grammar->AddRule(rule);
+       // progress: a dot every 50k rules, a running total every 2M
+       if (num_rules % 50000 == 0) { cerr << '.'; dots_pending = true; }
+       if (num_rules % 2000000 == 0) { cerr << " [" << num_rules << "]\n"; dots_pending = false; }
+     }
+     if (dots_pending) cerr << endl;
+     cerr << "Loaded " << num_rules << " rules\n";
+   }
+
+   // Adds the trellis for this sentence to *forest. Aborts the process if
+   // the grammar has no entry for some source word.
+   void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) {
+     const int e_len = smeta.GetTargetLength();
+     assert(e_len > 0);
+     const int f_len = lattice.size();
+     // source position -1 stands for the null word when it is enabled
+     const int f_start = (use_null ? -1 : 0);
+     int prev_node_id = -1;
+     for (int i = 0; i < e_len; ++i) {  // for each word in the *target*
+       const int new_node_id = forest->AddNode(kXCAT)->id_;
+       for (int j = f_start; j < f_len; ++j) {  // for each word in the source
+         const WordID src_sym = (j < 0 ? kNULL : lattice[j][0].label);
+         const GrammarIter* gi = grammar->GetRoot()->Extend(src_sym);
+         if (!gi) {
+           cerr << "No translations found for: " << TD::Convert(src_sym) << "\n";
+           abort();
+         }
+         const RuleBin* rules = gi->GetRules();
+         assert(rules);
+         const int num_rules = rules->GetNumRules();
+         for (int k = 0; k < num_rules; ++k) {
+           TRulePtr rule = rules->GetIthRule(k);
+           Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+           // i_/j_ carry the source span, prev_i_/prev_j_ the target span
+           edge->i_ = j;
+           edge->j_ = j+1;
+           edge->prev_i_ = i;
+           edge->prev_j_ = i+1;
+           edge->feature_values_ += edge->rule_->GetFeatureValues();
+           forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+         }
+       }
+       if (prev_node_id < 0) {
+         // first target position: nothing to concatenate with yet
+         prev_node_id = new_node_id;
+       } else {
+         // glue everything so far to this position with the binary rule
+         const int comb_node_id = forest->AddNode(kXCAT)->id_;
+         Hypergraph::TailNodeVector tail(2, prev_node_id);
+         tail[1] = new_node_id;
+         Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail);
+         forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id);
+         prev_node_id = comb_node_id;
+       }
+     }
+     // the most recently added node covers the whole target; hang the goal off it
+     Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+     Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+     Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+     forest->ConnectEdgeToHeadNode(hg_edge, goal);
+   }
+
+  private:
+   const bool use_null;
+   const WordID kXCAT;
+   const WordID kNULL;
+   const TRulePtr kBINARY;
+   const TRulePtr kGOAL_RULE;
+   GrammarPtr grammar;
+ };
+
+ LexicalTrans::LexicalTrans(const boost::program_options::variables_map& conf) :  // constructing the pimpl loads the grammar file
+ pimpl_(new LexicalTransImpl(conf)) {}
+
+ // Reads the source sentence into smeta->src_lattice_, builds the lexical
+ // translation trellis in *forest and reweights it. Aborts on lattice input.
+ bool LexicalTrans::TranslateImpl(const string& input,
+                                  SentenceMetadata* smeta,
+                                  const vector<double>& weights,
+                                  Hypergraph* forest) {
+   Lattice& src = smeta->src_lattice_;
+   LatticeTools::ConvertTextOrPLF(input, &src);
+   const bool plain_sentence = src.IsSentence();
+   if (!plain_sentence) {
+     // lexical models make independence assumptions
+     // that don't work with lattices or conf nets
+     cerr << "LexicalTrans: cannot deal with lattice source input!\n";
+     abort();
+   }
+   smeta->SetSourceLength(src.size());
+   pimpl_->BuildTrellis(src, *smeta, forest);
+   forest->is_linear_chain_ = true;
+   forest->Reweight(weights);
+   return true;
+ }
+
diff --git a/decoder/lextrans.h b/decoder/lextrans.h
new file mode 100644
index 00000000..2d51e7c0
--- /dev/null
+++ b/decoder/lextrans.h
@@ -0,0 +1,18 @@
+#ifndef _LEXTrans_H_
+#define _LEXTrans_H_
+
+#include "translator.h"
+#include "lattice.h"
+
+ struct LexicalTransImpl;  // defined in lextrans.cc
+ struct LexicalTrans : public Translator {  // Translator front end; the implementation lives behind pimpl_
+ LexicalTrans(const boost::program_options::variables_map& conf);
+ bool TranslateImpl(const std::string& input,  // source sentence text
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,  // passed to Hypergraph::Reweight
+ Hypergraph* forest);  // output forest
+ private:
+ boost::shared_ptr<LexicalTransImpl> pimpl_;
+ };
+
+#endif
diff --git a/decoder/logval.h b/decoder/logval.h
new file mode 100644
index 00000000..7099b9be
--- /dev/null
+++ b/decoder/logval.h
@@ -0,0 +1,157 @@
+#ifndef LOGVAL_H_
+#define LOGVAL_H_
+
+#include <iostream>
+#include <cstdlib>
+#include <cmath>
+#include <limits>
+
+ // A real number kept as a sign flag plus the natural log of its magnitude,
+ // so that products of many small values do not underflow.
+ template <typename T>
+ class LogVal {
+  public:
+   // zero: log-magnitude of minus infinity, positive sign
+   LogVal() : s_(false), v_(-std::numeric_limits<T>::infinity()) {}
+   // s_ is declared before v_, so it is already set when v_ is initialized
+   explicit LogVal(double x) : s_(std::signbit(x)), v_(std::log(s_ ? -x : x)) {}
+   static LogVal<T> One() { return LogVal(1); }
+   static LogVal<T> Zero() { return LogVal(); }
+
+   // overwrite with the positive value whose log is v
+   void logeq(const T& v) { s_ = false; v_ = v; }
+
+   LogVal& operator+=(const LogVal& a) {
+     // adding zero changes nothing
+     if (a.v_ == -std::numeric_limits<T>::infinity()) return *this;
+     // factor out the operand with the larger log-magnitude; ties go to a
+     const bool a_is_larger = !(a.v_ < v_);
+     const T hi = a_is_larger ? a.v_ : v_;
+     const T lo = a_is_larger ? v_ : a.v_;
+     if (a.s_ == s_) {
+       v_ = hi + log1p(std::exp(lo - hi));
+     } else {
+       // opposite signs: magnitudes subtract and the larger one decides the sign
+       v_ = hi + log1p(-std::exp(lo - hi));
+       if (a_is_larger) s_ = !s_;
+     }
+     return *this;
+   }
+
+   LogVal& operator*=(const LogVal& a) {
+     s_ = (s_ != a.s_);
+     v_ += a.v_;
+     return *this;
+   }
+
+   LogVal& operator/=(const LogVal& a) {
+     s_ = (s_ != a.s_);
+     v_ -= a.v_;
+     return *this;
+   }
+
+   // subtraction is addition of the negated operand
+   LogVal& operator-=(const LogVal& a) {
+     LogVal negated = a;
+     negated.invert();
+     return *this += negated;
+   }
+
+   // raises a non-negative value to `power` in place; aborts on negatives
+   LogVal& poweq(const T& power) {
+     if (s_) {
+       std::cerr << "poweq(T) not implemented when s_ is true\n";
+       std::abort();
+     }
+     v_ *= power;
+     return *this;
+   }
+
+   // flip the sign
+   void invert() { s_ = !s_; }
+
+   LogVal pow(const T& power) const {
+     LogVal copy = *this;
+     copy.poweq(power);
+     return copy;
+   }
+
+   // back to the linear domain
+   operator T() const {
+     if (s_) return -std::exp(v_); else return std::exp(v_);
+   }
+
+   bool s_;  // true when the value is negative
+   T v_;     // natural log of the magnitude
+ };
+
+ // sum of two values; neither operand is modified
+ template<typename T>
+ LogVal<T> operator+(const LogVal<T>& o1, const LogVal<T>& o2) {
+   LogVal<T> sum = o1;
+   return sum += o2;
+ }
+
+ // product of two values; neither operand is modified
+ template<typename T>
+ LogVal<T> operator*(const LogVal<T>& o1, const LogVal<T>& o2) {
+   LogVal<T> product = o1;
+   return product *= o2;
+ }
+
+ // quotient o1 / o2; neither operand is modified
+ template<typename T>
+ LogVal<T> operator/(const LogVal<T>& o1, const LogVal<T>& o2) {
+   LogVal<T> quotient = o1;
+   return quotient /= o2;
+ }
+
+ // difference o1 - o2; neither operand is modified
+ template<typename T>
+ LogVal<T> operator-(const LogVal<T>& o1, const LogVal<T>& o2) {
+   LogVal<T> difference = o1;
+   return difference -= o2;
+ }
+
+ // natural log of the stored value: the log-magnitude itself for
+ // non-negative values, and whatever log(-1.0) yields for negative ones
+ template<typename T>
+ T log(const LogVal<T>& o) {
+   if (!o.s_) return o.v_;
+   return log(-1.0);
+ }
+
+ template <typename T>
+ LogVal<T> pow(const LogVal<T>& b, const T& e) {  // free-function spelling of b.pow(e)
+ return b.pow(e);
+ }
+
+ // Orders two values as real numbers.
+ //  - different signs: the negative one is the smaller
+ //  - both non-negative: the smaller log-magnitude is the smaller value
+ //  - both negative: the LARGER log-magnitude is the smaller value
+ //    (e.g. -2.4 < -0.3); the previous version compared log-magnitudes the
+ //    same way for both signs and got this case backwards
+ template <typename T>
+ bool operator<(const LogVal<T>& lhs, const LogVal<T>& rhs) {
+   if (lhs.s_ != rhs.s_)
+     return lhs.s_;  // signs differ: lhs is smaller exactly when it is the negative one
+   if (lhs.s_)
+     return rhs.v_ < lhs.v_;
+   return lhs.v_ < rhs.v_;
+ }
+
+#if 0
+template <typename T>
+bool operator<=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
+ return (lhs.v_ <= rhs.v_);
+}
+
+template <typename T>
+bool operator>(const LogVal<T>& lhs, const LogVal<T>& rhs) {
+ return (lhs.v_ > rhs.v_);
+}
+
+template <typename T>
+bool operator>=(const LogVal<T>& lhs, const LogVal<T>& rhs) {
+ return (lhs.v_ >= rhs.v_);
+}
+#endif
+
+ template <typename T>
+ bool operator==(const LogVal<T>& lhs, const LogVal<T>& rhs) {  // exact match of both log-magnitude and sign flag
+ return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_);
+ }
+
+ template <typename T>
+ bool operator!=(const LogVal<T>& lhs, const LogVal<T>& rhs) {  // negation of operator==
+ return !(lhs == rhs);
+ }
+
+#endif
diff --git a/decoder/logval_test.cc b/decoder/logval_test.cc
new file mode 100644
index 00000000..1a23177d
--- /dev/null
+++ b/decoder/logval_test.cc
@@ -0,0 +1,73 @@
+#include "logval.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+ class LogValTest : public testing::Test {  // empty fixture: the cases below need no shared state
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+ };
+
+using namespace std;
+
+ TEST_F(LogValTest,Order) {  // operator< over one negative and two positive values
+ LogVal<double> a(-0.3);
+ LogVal<double> b(0.3);
+ LogVal<double> c(2.4);
+ EXPECT_LT(a,b);
+ EXPECT_LT(b,c);
+ EXPECT_LT(a,c);
+ EXPECT_FALSE(b < a);
+ EXPECT_FALSE(c < a);
+ EXPECT_FALSE(c < b);
+ EXPECT_FALSE(c < c);  // irreflexivity
+ EXPECT_FALSE(b < b);
+ EXPECT_FALSE(a < a);
+ }
+
+ TEST_F(LogValTest,Invert) {  // invert() on 2.4 must equal a value constructed from -2.4
+ LogVal<double> x(-2.4);
+ LogVal<double> y(2.4);
+ y.invert();
+ EXPECT_FLOAT_EQ(x,y);
+ }
+
+ TEST_F(LogValTest,Minus) {  // binary minus and -= agree, in both operand orders
+ LogVal<double> x(12);
+ LogVal<double> y(2);
+ LogVal<double> z1 = x - y;
+ LogVal<double> z2 = x;
+ z2 -= y;
+ EXPECT_FLOAT_EQ(z1, z2);
+ EXPECT_FLOAT_EQ(z1, 10.0);
+ EXPECT_FLOAT_EQ(y - x, -10.0);  // result crosses zero, so the sign flag must flip
+ }
+
+ TEST_F(LogValTest,TestOps) {  // prints several mixed-sign results, then checks that + is commutative
+ LogVal<double> x(-12.12);
+ LogVal<double> y(x);
+ cerr << x << endl;  // streams through the implicit conversion to double
+ cerr << (x*y) << endl;
+ cerr << (x*y + x) << endl;
+ cerr << (x + x*y) << endl;
+ cerr << log1p(-0.5) << endl;
+ LogVal<double> aa(0.2);
+ LogVal<double> bb(-0.3);
+ cerr << (aa + bb) << endl;
+ cerr << (bb + aa) << endl;
+ EXPECT_FLOAT_EQ((aa + bb), (bb + aa));
+ EXPECT_FLOAT_EQ((aa + bb), -0.1);
+ }
+
+ TEST_F(LogValTest,TestSizes) {  // informational only: prints object sizes, asserts nothing
+ cerr << sizeof(LogVal<double>) << endl;
+ cerr << sizeof(LogVal<float>) << endl;
+ cerr << sizeof(void*) << endl;
+ }
+
+ int main(int argc, char** argv) {  // standard gtest entry point
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ }
+
diff --git a/decoder/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc
new file mode 100644
index 00000000..34e175db
--- /dev/null
+++ b/decoder/maxtrans_blunsom.cc
@@ -0,0 +1,287 @@
+#include "apply_models.h"
+
+#include <vector>
+#include <algorithm>
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+
+#include <boost/tuple/tuple.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "tdict.h"
+#include "hg.h"
+#include "ff.h"
+
+using boost::tuple;
+using namespace std;
+using namespace std::tr1;
+
+namespace Hack {
+
+struct Candidate;
+typedef SmallVector JVector;
+typedef vector<Candidate*> CandidateHeap;
+typedef vector<Candidate*> CandidateList;
+
+ // Life cycle: candidates are created, placed on the heap, and retrieved
+ // by their estimated cost. When they are retrieved they are incorporated
+ // into the +LM hypergraph, where they also learn the head node index they
+ // are attached to. After that, the inside_prob_ and est_prob_ fields may
+ // be updated as better derivations are found (this happens because the
+ // successors of a derivation d may have a better score - they are explored
+ // lazily). The updates never happen while a candidate is in the heap, so
+ // maintaining the heap property is not an issue.
+ struct Candidate {
+   int node_index_;                     // -1 until incorporated
+                                        // into the +LM forest
+   const Hypergraph::Edge* in_edge_;    // in -LM forest
+   Hypergraph::Edge out_edge_;
+   vector<WordID> state_;
+   const JVector j_;
+   prob_t inside_prob_;  // these are fixed until the cand
+                         // is popped, then they may be updated
+   prob_t est_prob_;
+
+   // Full candidate: applies edge e to the antecedents selected by j,
+   // which are looked up in D (indexed by tail node, then by rank).
+   Candidate(const Hypergraph::Edge& e,
+             const JVector& j,
+             const vector<CandidateList>& D,
+             bool is_goal) :
+       node_index_(-1),
+       in_edge_(&e),
+       j_(j) {
+     InitializeCandidate(D, is_goal);
+   }
+
+   // Lightweight probe used only to query the uniqueness set; just the
+   // <edge, j> key is meaningful. node_index_ is set to -1 so that the
+   // probe never carries an indeterminate value (it used to be left
+   // uninitialized).
+   Candidate(const Hypergraph::Edge& e,
+             const JVector& j) : node_index_(-1), in_edge_(&e), j_(j) {}
+
+   bool IsIncorporatedIntoHypergraph() const {
+     return node_index_ >= 0;
+   }
+
+   // Fills out_edge_, state_, inside_prob_ and est_prob_ from in_edge_ and
+   // the antecedents D[tail][j_[i]]. Every antecedent must already be
+   // incorporated into the output hypergraph.
+   void InitializeCandidate(const vector<vector<Candidate*> >& D,
+                            const bool is_goal) {
+     const Hypergraph::Edge& in_edge = *in_edge_;
+     out_edge_.rule_ = in_edge.rule_;
+     out_edge_.feature_values_ = in_edge.feature_values_;
+     Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_;
+     tail.resize(j_.size());
+     prob_t p = prob_t::One();
+     // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl;
+     vector<const vector<WordID>* > ants(tail.size());
+     for (int i = 0; i < tail.size(); ++i) {
+       const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]];
+       ants[i] = &ant.state_;
+       assert(ant.IsIncorporatedIntoHypergraph());
+       tail[i] = ant.node_index_;
+       p *= ant.inside_prob_;
+     }
+     // placeholder for an outside estimate; currently always one
+     prob_t edge_estimate = prob_t::One();
+     if (is_goal) {
+       // the goal edge is unary and gets no state of its own
+       assert(tail.size() == 1);
+     } else {
+       // the state is the target string produced by substituting the
+       // antecedents' states into the rule
+       in_edge.rule_->ESubstitute(ants, &state_);
+     }
+     // the edge probability is carried over unchanged in both cases
+     out_edge_.edge_prob_ = in_edge.edge_prob_;
+     inside_prob_ = out_edge_.edge_prob_ * p;
+     est_prob_ = inside_prob_ * edge_estimate;
+   }
+ };
+
+ // Debug rendering of a candidate: attachment status, input edge id,
+ // the j vector, and the logs of both probabilities.
+ ostream& operator<<(ostream& os, const Candidate& cand) {
+   os << "CAND[";
+   if (cand.IsIncorporatedIntoHypergraph())
+     os << "+LM_node=" << cand.node_index_;
+   else
+     os << "PENDING ";
+   os << " edge=" << cand.in_edge_->id_;
+   os << " j=<";
+   for (int i = 0; i < cand.j_.size(); ++i) {
+     if (i != 0) os << " ";
+     os << cand.j_[i];
+   }
+   os << "> vit=" << log(cand.inside_prob_);
+   os << " est=" << log(cand.est_prob_);
+   os << ']';
+   return os;
+ }
+
+ struct HeapCandCompare {  // "less" on est_prob_: with the std heap functions the best estimate ends up on top
+ bool operator()(const Candidate* l, const Candidate* r) const {
+ return l->est_prob_ < r->est_prob_;
+ }
+ };
+
+ // Sorts candidates by descending est_prob_.
+ // Written with operator< (operands swapped) instead of operator>, the same
+ // operator HeapCandCompare uses. NOTE(review): if prob_t is a LogVal, the
+ // only ordering logval.h provides is operator< (operator> is compiled out),
+ // so `>` would silently compare through the implicit conversion to the
+ // linear domain, where very small probabilities can underflow - confirm.
+ struct EstProbSorter {
+   bool operator()(const Candidate* l, const Candidate* r) const {
+     return r->est_prob_ < l->est_prob_;
+   }
+ };
+
+ // The same candidate <edge, j> can be reached more than once when j has
+ // several dimensions (going NW in Manhattan, you can go north then west,
+ // or west then north). This hash, over exactly the fields that identify
+ // a candidate, is what lets the uniqueness set catch such repeats.
+ struct CandidateUniquenessHash {
+   size_t operator()(const Candidate* c) const {
+     // multiply-by-33-then-xor over the edge id and each entry of j
+     size_t h = 5381;
+     h = (h * 33) ^ c->in_edge_->id_;
+     const int dims = c->j_.size();
+     for (int d = 0; d < dims; ++d)
+       h = (h * 33) ^ c->j_[d];
+     return h;
+   }
+ };
+
+ struct CandidateUniquenessEquals {  // equality over the same key the hash uses: input edge pointer and j vector
+ bool operator()(const Candidate* a, const Candidate* b) const {
+ return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_);
+ }
+ };
+
+typedef unordered_set<const Candidate*, CandidateUniquenessHash, CandidateUniquenessEquals> UniqueCandidateSet;
+typedef unordered_map<vector<WordID>, Candidate*, boost::hash<vector<WordID> > > State2Node;
+
+ // Beam search over the input forest that merges candidates producing the
+ // same target string at a node (summing their probabilities) and prints
+ // the best string found below the goal node.
+ class MaxTransBeamSearch {
+
+ public:
+   // i: input forest; pop_limit: maximum heap pops per node; o: output forest
+   MaxTransBeamSearch(const Hypergraph& i, int pop_limit, Hypergraph* o) :
+       in(i),
+       out(*o),
+       D(in.nodes_.size()),
+       pop_limit_(pop_limit) {
+     cerr << " Finding max translation (cube pruning, pop_limit = " << pop_limit_ << ')' << endl;
+   }
+
+   // Processes the nodes in index order (the last node is taken to be the
+   // goal), writes the best translation to stdout, then frees all candidates.
+   void Apply() {
+     int num_nodes = in.nodes_.size();
+     int goal_id = num_nodes - 1;
+     int pregoal = goal_id - 1;
+     assert(in.nodes_[pregoal].out_edges_.size() == 1);
+     cerr << " ";
+     for (int i = 0; i < in.nodes_.size(); ++i) {
+       cerr << '.';
+       KBest(i, i == goal_id);
+     }
+     cerr << endl;
+     // the answer is the best entry of the node feeding the goal edge
+     int best_node = D[goal_id].front()->in_edge_->tail_nodes_.front();
+     Candidate& best = *D[best_node].front();
+     cerr << " Best path: " << log(best.inside_prob_)
+          << "\t" << log(best.est_prob_) << endl;
+     cout << TD::GetString(D[best_node].front()->state_) << endl;
+     FreeAll();
+   }
+
+  private:
+   // deletes every candidate still owned by D
+   void FreeAll() {
+     for (int i = 0; i < D.size(); ++i) {
+       CandidateList& D_i = D[i];
+       for (int j = 0; j < D_i.size(); ++j)
+         delete D_i[j];
+     }
+     D.clear();
+   }
+
+   // Adds item's edge to the output forest. The first item seen with a
+   // given state owns the output node for that state; later items with the
+   // same state are merged into it and queued on freelist for deletion.
+   void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) {
+     Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_);
+     new_edge->feature_values_ = item->out_edge_.feature_values_;
+     new_edge->edge_prob_ = item->out_edge_.edge_prob_;
+     Candidate*& o_item = (*s2n)[item->state_];
+     if (!o_item) o_item = item;
+
+     int& node_id = o_item->node_index_;
+     if (node_id < 0) {
+       Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_);
+       node_id = new_node->id_;
+     }
+     Hypergraph::Node* node = &out.nodes_[node_id];
+     out.ConnectEdgeToHeadNode(new_edge, node);
+
+     if (item != o_item) {
+       assert(o_item->state_ == item->state_);  // sanity check!
+       // same string reached another way: probabilities are summed
+       o_item->est_prob_ += item->est_prob_;
+       o_item->inside_prob_ += item->inside_prob_;
+       freelist->push_back(item);
+     }
+   }
+
+   // Fills D[vert_index] with the merged candidates for that node, best
+   // estimate first, popping at most pop_limit_ candidates from the heap.
+   void KBest(const int vert_index, const bool is_goal) {
+     // cerr << "KBest(" << vert_index << ")\n";
+     CandidateList& D_v = D[vert_index];
+     assert(D_v.empty());
+     const Hypergraph::Node& v = in.nodes_[vert_index];
+     // cerr << " has " << v.in_edges_.size() << " in-coming edges\n";
+     const vector<int>& in_edges = v.in_edges_;
+     CandidateHeap cand;
+     CandidateList freelist;
+     cand.reserve(in_edges.size());
+     UniqueCandidateSet unique_cands;
+     for (int i = 0; i < in_edges.size(); ++i) {
+       const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
+       const JVector j(edge.tail_nodes_.size(), 0);
+       cand.push_back(new Candidate(edge, j, D, is_goal));
+       // The insertion must run in every build, so it is kept outside the
+       // assert: with NDEBUG defined an assert's argument is not evaluated.
+       const bool inserted = unique_cands.insert(cand.back()).second;
+       assert(inserted);  // these should all be unique!
+       (void)inserted;
+     }
+     // cerr << " making heap of " << cand.size() << " candidates\n";
+     make_heap(cand.begin(), cand.end(), HeapCandCompare());
+     State2Node state2node;  // "buf" in Figure 2
+     int pops = 0;
+     while(!cand.empty() && pops < pop_limit_) {
+       pop_heap(cand.begin(), cand.end(), HeapCandCompare());
+       Candidate* item = cand.back();
+       cand.pop_back();
+       // cerr << "POPPED: " << *item << endl;
+       PushSucc(*item, is_goal, &cand, &unique_cands);
+       IncorporateIntoPlusLMForest(item, &state2node, &freelist);
+       ++pops;
+     }
+     D_v.resize(state2node.size());
+     int c = 0;
+     for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i)
+       D_v[c++] = i->second;
+     sort(D_v.begin(), D_v.end(), EstProbSorter());
+     // cerr << " expanded to " << D_v.size() << " nodes\n";
+
+     // whatever was never popped is discarded
+     for (int i = 0; i < cand.size(); ++i)
+       delete cand[i];
+     // freelist is necessary since even after an item merged, it still stays in
+     // the unique set so it can't be deleted til now
+     for (int i = 0; i < freelist.size(); ++i)
+       delete freelist[i];
+   }
+
+   // Pushes the successors of item (one tail rank advanced by one) that
+   // exist and have not been generated before.
+   void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) {
+     CandidateHeap& cand = *pcand;
+     for (int i = 0; i < item.j_.size(); ++i) {
+       JVector j = item.j_;
+       ++j[i];
+       if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) {
+         Candidate query_unique(*item.in_edge_, j);
+         if (cs->count(&query_unique) == 0) {
+           Candidate* new_cand = new Candidate(*item.in_edge_, j, D, is_goal);
+           cand.push_back(new_cand);
+           push_heap(cand.begin(), cand.end(), HeapCandCompare());
+           // insert into uniqueness set; done outside the assert so the
+           // set is also maintained when NDEBUG is defined
+           const bool inserted = cs->insert(new_cand).second;
+           assert(inserted);  // sanity check
+           (void)inserted;
+         }
+       }
+     }
+   }
+
+   const Hypergraph& in;
+   Hypergraph& out;
+
+   vector<CandidateList> D;   // maps nodes in in-HG to the
+                              // equivalent nodes (many due to state
+                              // splits) in the out-HG.
+   const int pop_limit_;
+ };
+
+ // entry point: runs MaxTransBeamSearch on `in` with beam_size as the pop limit; the output forest is a discarded local
+ void MaxTrans(const Hypergraph& in,
+ int beam_size) {
+ Hypergraph out;
+ MaxTransBeamSearch ma(in, beam_size, &out);
+ ma.Apply();
+ }
+
+}
diff --git a/decoder/parser_test.cc b/decoder/parser_test.cc
new file mode 100644
index 00000000..da1fbd89
--- /dev/null
+++ b/decoder/parser_test.cc
@@ -0,0 +1,35 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "hg.h"
+#include "trule.h"
+#include "bottom_up_parser.h"
+#include "tdict.h"
+
+using namespace std;
+
+ class ChartTest : public testing::Test {  // empty fixture: the case below needs no shared state
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+ };
+
+ TEST_F(ChartTest,LanguageModel) {  // smoke test: parses a two-word lattice with an empty grammar, asserts nothing
+ LatticeArc a(TD::Convert("ein"), 0.0, 1);
+ LatticeArc b(TD::Convert("haus"), 0.0, 1);
+ Lattice lattice(2);  // two columns, one arc each
+ lattice[0].push_back(a);
+ lattice[1].push_back(b);
+ Hypergraph forest;
+ GrammarPtr g(new TextGrammar);
+ vector<GrammarPtr> grammars(1, g);
+ ExhaustiveBottomUpParser parser("PHRASE", grammars);
+ parser.Parse(lattice, &forest);
+ }
+
+ int main(int argc, char **argv) {  // standard gtest entry point
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ }
diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc
new file mode 100644
index 00000000..726b3f9a
--- /dev/null
+++ b/decoder/phrasebased_translator.cc
@@ -0,0 +1,206 @@
+#include "phrasebased_translator.h"
+
+#include <queue>
+#include <iostream>
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+
+#include <boost/tuple/tuple.hpp>
+#include <boost/functional/hash.hpp>
+
+#include "sentence_metadata.h"
+#include "tdict.h"
+#include "hg.h"
+#include "filelib.h"
+#include "lattice.h"
+#include "phrasetable_fst.h"
+#include "array2d.h"
+
+using namespace std;
+using namespace std::tr1;
+using namespace boost::tuples;
+
+ // Bit vector recording which source positions are covered, together with
+ // a cached index of the first uncovered position ("first gap").
+ struct Coverage : public std::vector<bool> {
+   // n positions, all set to v. When everything starts out covered there is
+   // no gap, so first_gap is n - the same value Cover() arrives at once it
+   // has filled the last position (it used to be left at 0 in that case).
+   explicit Coverage(int n, bool v = false) : std::vector<bool>(n, v), first_gap(v ? n : 0) {}
+
+   // Marks positions [i, j) as covered. If the span starts at the first
+   // gap, the gap is moved past the span and past any positions after it
+   // that were already covered.
+   void Cover(int i, int j) {
+     std::vector<bool>::iterator it = this->begin() + i;
+     std::vector<bool>::iterator end = this->begin() + j;
+     while (it != end)
+       *it++ = true;
+     if (first_gap == i) {
+       first_gap = j;
+       it = end;
+       // compare against end() BEFORE dereferencing: when the span reaches
+       // the last position, `it` is already one past the end here
+       while (it != this->end() && *it) {
+         ++it;
+         ++first_gap;
+       }
+     }
+   }
+
+   // True if any position in [i, j) is already covered.
+   bool Collides(int i, int j) const {
+     std::vector<bool>::const_iterator it = this->begin() + i;
+     std::vector<bool>::const_iterator end = this->begin() + j;
+     while (it != end)
+       if (*it++) return true;
+     return false;
+   }
+
+   // Index of the first uncovered position, or size() when there is none.
+   int GetFirstGap() const { return first_gap; }
+  private:
+   int first_gap;
+ };
+ struct CoverageHash {  // hashes only the bit vector; the cached first gap is not part of the hash
+ size_t operator()(const Coverage& cov) const {
+ return hasher_(static_cast<const vector<bool>&>(cov));  // view cov as its vector<bool> base for boost::hash
+ }
+ private:
+ boost::hash<vector<bool> > hasher_;
+ };
+ // Debug rendering: '*' for covered, '.' for uncovered, then the first gap.
+ ostream& operator<<(ostream& os, const Coverage& cov) {
+   os << '[';
+   const int n = cov.size();
+   for (int pos = 0; pos < n; ++pos) {
+     const char mark = cov[pos] ? '*' : '.';
+     os << mark;
+   }
+   os << " gap=" << cov.GetFirstGap() << ']';
+   return os;
+ }
+
+typedef unordered_map<Coverage, int, CoverageHash> CoverageNodeMap;
+typedef unordered_set<Coverage, CoverageHash> UniqueCoverageSet;
+
+ // Phrase-based translation into a hypergraph: source phrases are matched
+ // against a phrase-table FST, and partial translations are identified by
+ // which source positions they cover.
+ struct PhraseBasedTranslatorImpl {
+   // Reads the distortion limit and the (single) phrase-table file from conf.
+   PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) :
+       add_pass_through_rules(conf.count("add_pass_through_rules")),
+       max_distortion(conf["pb_max_distortion"].as<int>()),
+       kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)),
+       kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)),
+       kNT_TYPE(TD::Convert("X") * -1) {
+     assert(max_distortion >= 0);
+     vector<string> gfiles = conf["grammar"].as<vector<string> >();
+     assert(gfiles.size() == 1);
+     cerr << "Reading phrasetable from " << gfiles.front() << endl;
+     ReadFile in(gfiles.front());
+     fst.reset(LoadTextPhrasetable(in.stream()));
+   }
+
+   // One search state: a coverage, the source span [i, j) matched so far
+   // on top of it, and the FST node reached by the words of that span.
+   struct State {
+     State(const Coverage& c, int _i, int _j, const FSTNode* q) :
+       coverage(c), i(_i), j(_j), fst(q) {}
+     Coverage coverage;
+     int i;
+     int j;
+     const FSTNode* fst;
+   };
+
+   // we keep track of unique coverages that have been extended since it's
+   // possible to "extend" the same coverage twice, e.g. translate "a b c"
+   // with phrases "a" "b" "a b" and "c". There are two ways to cover "a b"
+   void EnqueuePossibleContinuations(const Coverage& coverage, queue<State>* q, UniqueCoverageSet* ucs) {
+     if (ucs->insert(coverage).second) {
+       // start a new phrase at every uncovered position within the
+       // distortion window that begins at the first gap
+       const int gap = coverage.GetFirstGap();
+       const int end = min(static_cast<int>(coverage.size()), gap + max_distortion + 1);
+       for (int i = gap; i < end; ++i)
+         if (!coverage[i]) q->push(State(coverage, i, i, fst.get()));
+     }
+   }
+
+   // Translates `input` into *minus_lm_forest.
+   // Returns false when no sequence of phrases covers the whole input.
+   bool Translate(const std::string& input,
+                  SentenceMetadata* smeta,
+                  const std::vector<double>& weights,
+                  Hypergraph* minus_lm_forest) {
+     Lattice lattice;
+     LatticeTools::ConvertTextOrPLF(input, &lattice);
+     smeta->SetSourceLength(lattice.size());
+     // NOTE(review): the shift is undefined once max_distortion reaches the
+     // bit width of int; only the assert(max_distortion >= 0) bounds it.
+     size_t est_nodes = lattice.size() * lattice.size() * (1 << max_distortion);
+     minus_lm_forest->ReserveNodes(est_nodes, est_nodes * 100);
+     if (add_pass_through_rules) {
+       SparseVector<double> feats;
+       feats.set_value(FD::Convert("PassThrough"), 1);
+       for (int i = 0; i < lattice.size(); ++i) {
+         const vector<LatticeArc>& arcs = lattice[i];
+         for (int j = 0; j < arcs.size(); ++j) {
+           fst->AddPassThroughTranslation(arcs[j].label, feats);
+           // TODO handle lattice edge features
+         }
+       }
+     }
+     // coverage -> (forest node id + 1); 0 marks "no node", which is also
+     // how the empty coverage (the left edge) is represented
+     CoverageNodeMap c;
+     queue<State> q;
+     UniqueCoverageSet ucs;
+     const Coverage empty_cov(lattice.size(), false);
+     const Coverage goal_cov(lattice.size(), true);
+     EnqueuePossibleContinuations(empty_cov, &q, &ucs);
+     c[empty_cov] = 0;   // have to handle the left edge specially
+     while(!q.empty()) {
+       const State s = q.front();
+       q.pop();
+       // cerr << "(" << s.i << "," << s.j << " ptr=" << s.fst << ") cov=" << s.coverage << endl;
+       if (s.fst->HasData()) {
+         // the span [s.i, s.j) is a complete phrase: emit its translations
+         Coverage new_cov = s.coverage;
+         new_cov.Cover(s.i, s.j);
+         EnqueuePossibleContinuations(new_cov, &q, &ucs);
+         const vector<TRulePtr>& phrases = s.fst->GetTranslations()->GetRules();
+         const int phrase_head_index = minus_lm_forest->AddNode(kNT_TYPE)->id_;
+         for (int i = 0; i < phrases.size(); ++i) {
+           Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector());
+           edge->feature_values_ = edge->rule_->scores_;
+           minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index);
+         }
+         CoverageNodeMap::iterator cit = c.find(s.coverage);
+         assert(cit != c.end());
+         const int tail_node_plus1 = cit->second;
+         if (tail_node_plus1 == 0) {                // left edge
+           c[new_cov] = phrase_head_index + 1;
+         } else {                                   // not left edge
+           int& head_node_plus1 = c[new_cov];
+           if (!head_node_plus1)
+             head_node_plus1 = minus_lm_forest->AddNode(kNT_TYPE)->id_ + 1;
+           Hypergraph::TailNodeVector tail(2, tail_node_plus1 - 1);
+           tail[1] = phrase_head_index;
+           const int concat_edge = minus_lm_forest->AddEdge(kCONCAT_RULE, tail)->id_;
+           minus_lm_forest->ConnectEdgeToHeadNode(concat_edge, head_node_plus1 - 1);
+         }
+       }
+       if (s.j == lattice.size()) continue;
+       // Index the lattice only after the end-of-input check above:
+       // lattice[s.j] with s.j == lattice.size() is out of range.
+       const vector<LatticeArc>& arcs = lattice[s.j];
+       // try to lengthen the current span by one arc
+       for (int l = 0; l < arcs.size(); ++l) {
+         const LatticeArc& arc = arcs[l];
+
+         const FSTNode* next_fst_state = s.fst->Extend(arc.label);
+         const int next_j = s.j + arc.dist2next;
+         if (next_fst_state &&
+             !s.coverage.Collides(s.i, next_j)) {
+           q.push(State(s.coverage, s.i, next_j, next_fst_state));
+         }
+       }
+     }
+     if (add_pass_through_rules)
+       fst->ClearPassThroughTranslations();
+     int pregoal_plus1 = c[goal_cov];
+     if (pregoal_plus1 > 0) {
+       TRulePtr kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [X,1]"));
+       int goal = minus_lm_forest->AddNode(TD::Convert("Goal") * -1)->id_;
+       int gedge = minus_lm_forest->AddEdge(kGOAL_RULE, Hypergraph::TailNodeVector(1, pregoal_plus1 - 1))->id_;
+       minus_lm_forest->ConnectEdgeToHeadNode(gedge, goal);
+       // they are almost topo, but not quite always
+       minus_lm_forest->TopologicallySortNodesAndEdges(goal);
+       minus_lm_forest->Reweight(weights);
+       return true;
+     } else {
+       return false;  // composition failed
+     }
+   }
+
+   const bool add_pass_through_rules;
+   const int max_distortion;
+   TRulePtr kSOURCE_RULE;
+   const TRulePtr kCONCAT_RULE;
+   const WordID kNT_TYPE;
+   boost::shared_ptr<FSTNode> fst;
+ };
+
+ PhraseBasedTranslator::PhraseBasedTranslator(const boost::program_options::variables_map& conf) :  // constructing the pimpl loads the phrase table
+ pimpl_(new PhraseBasedTranslatorImpl(conf)) {}
+
+ bool PhraseBasedTranslator::TranslateImpl(const std::string& input,  // forwards every argument to the pimpl unchanged
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest) {
+ return pimpl_->Translate(input, smeta, weights, minus_lm_forest);  // false when no full coverage was found
+ }
diff --git a/decoder/phrasebased_translator.h b/decoder/phrasebased_translator.h
new file mode 100644
index 00000000..e5e3f8a2
--- /dev/null
+++ b/decoder/phrasebased_translator.h
@@ -0,0 +1,18 @@
+#ifndef _PHRASEBASED_TRANSLATOR_H_
+#define _PHRASEBASED_TRANSLATOR_H_
+
+#include "translator.h"
+
+// Implementation class; only forward-declared here and defined in the .cc file.
+class PhraseBasedTranslatorImpl;
+// Translator whose work is delegated to a PhraseBasedTranslatorImpl (pimpl).
+class PhraseBasedTranslator : public Translator {
+ public:
+ PhraseBasedTranslator(const boost::program_options::variables_map& conf);
+ // Translates `input`, writing the resulting forest to *minus_lm_forest.
+ // Returns the implementation's success flag.
+ bool TranslateImpl(const std::string& input,
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest);
+ private:
+ // shared ownership of the implementation object created in the constructor
+ boost::shared_ptr<PhraseBasedTranslatorImpl> pimpl_;
+};
+
+#endif
diff --git a/decoder/phrasetable_fst.cc b/decoder/phrasetable_fst.cc
new file mode 100644
index 00000000..f421e941
--- /dev/null
+++ b/decoder/phrasetable_fst.cc
@@ -0,0 +1,141 @@
+#include "phrasetable_fst.h"
+
+#include <cassert>
+#include <iostream>
+#include <map>
+
+#include <boost/shared_ptr.hpp>
+
+#include "filelib.h"
+#include "tdict.h"
+
+using boost::shared_ptr;
+using namespace std;
+
+TargetPhraseSet::~TargetPhraseSet() {}
+FSTNode::~FSTNode() {}
+
+// TargetPhraseSet implementation that keeps its rules in a plain vector.
+class TextTargetPhraseSet : public TargetPhraseSet {
+ public:
+ // Appends `rule` to the set.
+ void AddRule(TRulePtr rule) {
+ rules_.push_back(rule);
+ }
+ // Returns the stored rules in the order they were added.
+ const vector<TRulePtr>& GetRules() const {
+ return rules_;
+ }
+
+ private:
+ // all rules must have arity 0
+ vector<TRulePtr> rules_;
+};
+
+// One node of the in-memory phrase-table trie: children are indexed by source
+// word and a node optionally carries the translations of the phrase that ends
+// at it.
+class TextFSTNode : public FSTNode {
+ public:
+ // Translations stored at this node, or NULL when there are none.
+ const TargetPhraseSet* GetTranslations() const { return data.get(); }
+ // True when this node carries translations.
+ bool HasData() const { return (bool)data; }
+ // True when this node has at least one child.
+ bool HasOutgoingNonEpsilonEdges() const { return !ptr.empty(); }
+ // Child reached by source word `t`, or NULL when there is no such child.
+ const FSTNode* Extend(const WordID& t) const {
+ map<WordID, TextFSTNode>::const_iterator it = ptr.find(t);
+ if (it == ptr.end()) return NULL;
+ return &it->second;
+ }
+
+ // Parses one phrase-table line and stores the rule below this node.
+ void AddPhrase(const string& phrase);
+
+ void AddPassThroughTranslation(const WordID& w, const SparseVector<double>& feats);
+ void ClearPassThroughTranslations();
+ private:
+ // words for which a pass-through translation was added at this node
+ vector<WordID> passthroughs;
+ // translations of the phrase ending here (null when there are none)
+ shared_ptr<TargetPhraseSet> data;
+ // children, keyed by the next source word
+ map<WordID, TextFSTNode> ptr;
+};
+
+#ifdef DEBUG_CHART_PARSER
+// Debug helper: returns the text between the first and the last " |||"
+// delimiter of `r`, skipping the single character that follows the first
+// delimiter.  A string without any delimiter is returned unchanged; when only
+// one delimiter is present, everything after it is returned.  Never throws
+// std::out_of_range, even when the delimiter sits at the very end of `r`.
+static std::string TrimRule(const std::string& r) {
+  static const char kDelim[] = " |||";
+  const size_t first = r.find(kDelim);
+  if (first == std::string::npos) return r;  // nothing to trim
+  // skip the delimiter (4 chars) plus the blank after it, clamped to the end
+  size_t start = first + 5;
+  if (start > r.size()) start = r.size();
+  const size_t last = r.rfind(kDelim);
+  if (last <= start) return r.substr(start);  // no second delimiter after start
+  return r.substr(start, last - start);
+}
+#endif
+
+// Parses one phrase-table line into a rule and stores it in the trie node
+// reached by following the rule's source words from this node (nodes are
+// created on demand).
+// A line that TRule::CreateRulePhrasetable rejects (null rule) is skipped; the
+// third rejected line terminates the process.
+void TextFSTNode::AddPhrase(const string& phrase) {
+  TRulePtr rule(TRule::CreateRulePhrasetable(phrase));
+  if (!rule) {
+    static int err = 0;  // number of unparsable lines seen so far
+    ++err;
+    if (err > 2) { cerr << "TOO MANY PHRASETABLE ERRORS\n"; exit(1); }
+    return;
+  }
+
+  // walk along the source side of the rule; operator[] creates missing nodes
+  TextFSTNode* fsa = this;
+  for (int i = 0; i < rule->FLength(); ++i)
+    fsa = &fsa->ptr[rule->f_[i]];
+
+  // the first rule for this source phrase allocates the phrase set
+  if (!fsa->data)
+    fsa->data.reset(new TextTargetPhraseSet);
+  static_cast<TextTargetPhraseSet*>(fsa->data.get())->AddRule(rule);
+}
+
+// Adds an identity translation w -> w (features `feats`) to the child node for
+// `w`, unless that node already carries translations.  Every word handled here
+// is remembered so that ClearPassThroughTranslations can undo the addition.
+void TextFSTNode::AddPassThroughTranslation(const WordID& w, const SparseVector<double>& feats) {
+  TextFSTNode* next = &ptr[w];
+  // A rule is only added when the node for w has no translations of its own.
+  // The node may be brand new, or it may already exist because w starts
+  // longer phrases.  As a result, it is possible that some sentences
+  // won't parse. If this becomes a problem, fix it here.
+  if (!next->data) {
+    TextTargetPhraseSet* tps = new TextTargetPhraseSet;
+    next->data.reset(tps);
+    TRule* rule = new TRule;
+    rule->e_.resize(1, w);
+    rule->f_.resize(1, w);
+    rule->lhs_ = TD::Convert("___PHRASE") * -1;
+    rule->scores_ = feats;
+    rule->arity_ = 0;
+    tps->AddRule(TRulePtr(rule));
+    passthroughs.push_back(w);
+  }
+}
+
+// Removes every pass-through translation added since the last call.
+// Only the translations are dropped; a node is erased from the trie only when
+// it has no children, so phrase-table entries that merely share their first
+// word with a pass-through survive.
+void TextFSTNode::ClearPassThroughTranslations() {
+  for (size_t i = 0; i < passthroughs.size(); ++i) {
+    map<WordID, TextFSTNode>::iterator it = ptr.find(passthroughs[i]);
+    if (it == ptr.end()) continue;
+    it->second.data.reset();
+    if (it->second.ptr.empty()) ptr.erase(it);
+  }
+  passthroughs.clear();
+}
+
+// Feeds every non-empty line of `in` to fst->AddPhrase.  Progress is reported
+// on stderr: a dot every 10000 lines, a running count every 500000 lines and
+// the total at the end.
+static void AddPhrasetableToFST(istream* in, TextFSTNode* fst) {
+  int num_lines = 0;
+  bool dots_pending = false;  // dots printed since the last newline
+  for (string buf; *in; ) {
+    getline(*in, buf);
+    if (buf.empty()) continue;
+    ++num_lines;
+    fst->AddPhrase(buf);
+    if (num_lines % 10000 == 0) {
+      dots_pending = true;
+      cerr << '.' << flush;
+    }
+    if (num_lines % 500000 == 0) {
+      dots_pending = false;
+      cerr << " [" << num_lines << ']' << endl << flush;
+    }
+  }
+  if (dots_pending) cerr << endl;
+  cerr << "Loaded " << num_lines << " source phrases\n";
+}
+
+// Builds a new phrase-table trie from the lines of `in`.  The root is
+// allocated with new and returned as a raw pointer.
+FSTNode* LoadTextPhrasetable(istream* in) {
+ TextFSTNode *fst = new TextFSTNode;
+ AddPhrasetableToFST(in, fst);
+ return fst;
+}
+
+// Builds one phrase-table trie from several files, read in the given order.
+// The root is allocated with new and returned as a raw pointer.
+FSTNode* LoadTextPhrasetable(const vector<string>& filenames) {
+ TextFSTNode* fst = new TextFSTNode;
+ for (int i = 0; i < filenames.size(); ++i) {
+ // rf is scoped to one iteration and supplies the stream for this file
+ ReadFile rf(filenames[i]);
+ cerr << "Reading phrase from " << filenames[i] << endl;
+ AddPhrasetableToFST(rf.stream(), fst);
+ }
+ return fst;
+}
+
+// Placeholder for a binary phrase-table loader; not implemented.
+// Debug builds stop on the assertion.  Builds with assertions disabled report
+// the problem on stderr and return NULL instead of flowing off the end of a
+// non-void function (undefined behaviour).
+FSTNode* LoadBinaryPhrasetable(const string& fname_prefix) {
+  (void) fname_prefix;
+  assert(!"not implemented yet");
+  cerr << "LoadBinaryPhrasetable: not implemented yet\n";
+  return NULL;
+}
+
diff --git a/decoder/phrasetable_fst.h b/decoder/phrasetable_fst.h
new file mode 100644
index 00000000..477de1f7
--- /dev/null
+++ b/decoder/phrasetable_fst.h
@@ -0,0 +1,34 @@
+#ifndef _PHRASETABLE_FST_H_
+#define _PHRASETABLE_FST_H_
+
+#include <vector>
+#include <string>
+
+#include "sparse_vector.h"
+#include "trule.h"
+
+// Abstract read-only view of the rules attached to one phrase-table node.
+class TargetPhraseSet {
+ public:
+ virtual ~TargetPhraseSet();
+ // Returns the rules in this set.
+ virtual const std::vector<TRulePtr>& GetRules() const = 0;
+};
+
+// Abstract state of the phrase table viewed as a finite-state machine over
+// source words.
+class FSTNode {
+ public:
+ virtual ~FSTNode();
+ // Translations available in this state.
+ virtual const TargetPhraseSet* GetTranslations() const = 0;
+ // Whether this state has translations.
+ virtual bool HasData() const = 0;
+ virtual bool HasOutgoingNonEpsilonEdges() const = 0;
+ // State reached by consuming word `t`.  The text implementation in
+ // phrasetable_fst.cc returns NULL when there is no such transition.
+ virtual const FSTNode* Extend(const WordID& t) const = 0;
+
+ // these should only be called on q_0:
+ virtual void AddPassThroughTranslation(const WordID& w, const SparseVector<double>& feats) = 0;
+ virtual void ClearPassThroughTranslations() = 0;
+};
+
+// attn caller: you own the memory
+// Loads and merges the text phrase tables in `filenames` into one FST.
+FSTNode* LoadTextPhrasetable(const std::vector<std::string>& filenames);
+// Loads a text phrase table from an already opened stream.
+FSTNode* LoadTextPhrasetable(std::istream* in);
+// Declared for a binary format; the definition in phrasetable_fst.cc is a stub.
+FSTNode* LoadBinaryPhrasetable(const std::string& fname_prefix);
+
+#endif
diff --git a/decoder/prob.h b/decoder/prob.h
new file mode 100644
index 00000000..bc297870
--- /dev/null
+++ b/decoder/prob.h
@@ -0,0 +1,8 @@
+#ifndef _PROB_H_
+#define _PROB_H_
+
+#include "logval.h"
+
+// probability type used by the decoder: LogVal (from logval.h) over double
+typedef LogVal<double> prob_t;
+
+#endif
diff --git a/decoder/rule_lexer.h b/decoder/rule_lexer.h
new file mode 100644
index 00000000..e5db4018
--- /dev/null
+++ b/decoder/rule_lexer.h
@@ -0,0 +1,13 @@
+#ifndef _RULE_LEXER_H_
+#define _RULE_LEXER_H_
+
+#include <iostream>
+
+#include "trule.h"
+
+// Entry point of the flex-generated grammar-rule scanner.
+struct RuleLexer {
+ // Invoked once per rule read; `extra` is the pointer given to ReadRules.
+ typedef void (*RuleCallback)(const TRulePtr& new_rule, void* extra);
+ // Scans rules from `in` and calls `func(rule, extra)` for each of them.
+ static void ReadRules(std::istream* in, RuleCallback func, void* extra);
+};
+
+#endif
diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l
new file mode 100644
index 00000000..ff8f10b0
--- /dev/null
+++ b/decoder/rule_lexer.l
@@ -0,0 +1,269 @@
+%{
+#include "rule_lexer.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cassert>
+#include "tdict.h"
+#include "fdict.h"
+#include "trule.h"
+
+// ---- scanner state shared between the rule actions below ----
+// line counter used in error messages
+int lex_line = 0;
+// stream the scanner reads from (see YY_INPUT) and the per-rule callback
+std::istream* scfglex_stream = NULL;
+RuleLexer::RuleCallback rule_callback = NULL;
+void* rule_callback_extra = NULL;
+// feature ids used for unnamed feature values; filled in by ReadRules
+std::vector<int> scfglex_phrase_fnames;
+
+// make flex pull its input from scfglex_stream instead of a FILE*
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = scfglex_stream->read(buf, max_size).gcount())
+
+#define YY_SKIP_YYWRAP 1
+// number of rules delivered to the callback so far
+int num_rules = 0;
+// returning 1 tells flex there is no further input after end of file
+int yywrap() { return 1; }
+bool fl = true;
+#define MAX_TOKEN_SIZE 255
+// scratch buffer holding the text of the token being converted
+std::string scfglex_tmp_token(MAX_TOKEN_SIZE, '\0');
+
+// source / target right-hand sides of the rule being read
+#define MAX_RULE_SIZE 48
+WordID scfglex_src_rhs[MAX_RULE_SIZE];
+WordID scfglex_trg_rhs[MAX_RULE_SIZE];
+int scfglex_src_rhs_size;
+int scfglex_trg_rhs_size;
+WordID scfglex_lhs;
+int scfglex_src_arity;
+int scfglex_trg_arity;
+
+// feature ids and values of the rule being read
+#define MAX_FEATS 20
+int scfglex_feat_ids[MAX_FEATS];
+double scfglex_feat_vals[MAX_FEATS];
+int scfglex_num_feats;
+
+// per-non-terminal bookkeeping: scfglex_nt_sanity marks target indices already
+// used, scfglex_src_nts holds the category of each source non-terminal
+#define MAX_ARITY 20
+int scfglex_nt_sanity[MAX_ARITY];
+int scfglex_src_nts[MAX_ARITY];
+float scfglex_nt_size_means[MAX_ARITY];
+float scfglex_nt_size_vars[MAX_ARITY];
+
+
+// Checks that the target non-terminal `nt` carrying the 1-based `index` has
+// the same category as the index-th source non-terminal.  Aborts with a
+// message on stderr when the index does not refer to a source non-terminal or
+// when the categories differ.
+void sanity_check_trg_symbol(WordID nt, int index) {
+  // `index` comes straight from the input (1..99), so validate it before it
+  // is used to subscript scfglex_src_nts
+  if (index < 1 || index > scfglex_src_arity) {
+    std::cerr << "Target index " << index << " exceeds source arity " << scfglex_src_arity << std::endl;
+    abort();
+  }
+  const int src_nt = scfglex_src_nts[index - 1];
+  if (src_nt != nt) {
+    std::cerr << "Target symbol with index " << index << " is of type " << TD::Convert(nt*-1)
+              << " but corresponding source is of type "
+              << TD::Convert(src_nt * -1) << std::endl;
+    abort();
+  }
+}
+
+// Checks a 1-based target non-terminal index: it must not exceed the source
+// arity and must not have been used before in the current rule.  Aborts on a
+// violation; otherwise marks the index as used.
+void sanity_check_trg_index(int index) {
+  if (scfglex_src_arity < index) {
+    std::cerr << "Target index " << index << " exceeds source arity " << scfglex_src_arity << std::endl;
+    abort();
+  }
+  const int slot = index - 1;
+  if (scfglex_nt_sanity[slot] != 0) {
+    std::cerr << "Target index " << index << " used multiple times!" << std::endl;
+    abort();
+  }
+  scfglex_nt_sanity[slot] = 1;
+}
+
+// Clears the per-rule counters (arities, right-hand-side lengths, number of
+// features) before the next rule is read.
+void scfglex_reset() {
+  scfglex_src_rhs_size = scfglex_trg_rhs_size = 0;
+  scfglex_src_arity = scfglex_trg_arity = 0;
+  scfglex_num_feats = 0;
+}
+
+%}
+
+REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf
+NT [\-#$A-Z_:=.",\\][\-#$".A-Z+/=_0-9!:@\\]*
+
+%x LHS_END SRC TRG FEATS FEATVAL ALIGNS
+%%
+
+<INITIAL>[ \t] ;
+
+<INITIAL>\[{NT}\] {
+ // A bracketed non-terminal opens a rule: the text between the brackets is
+ // converted with TD::Convert and stored negated as the rule's left-hand side;
+ // the scanner then waits for the "|||" separator in LHS_END.
+ scfglex_tmp_token.assign(yytext + 1, yyleng - 2);
+ scfglex_lhs = -TD::Convert(scfglex_tmp_token);
+ // std::cerr << scfglex_tmp_token << "\n";
+ BEGIN(LHS_END);
+ }
+
+<SRC>\[{NT}\] {
+    // Un-indexed source non-terminal such as [X].  Its category (negated TD id)
+    // is appended to the source right-hand side and to the list of source
+    // non-terminal categories.
+    if (scfglex_src_rhs_size >= MAX_RULE_SIZE || scfglex_src_arity >= MAX_ARITY) {
+      std::cerr << "Line " << lex_line << ": source side too long (at most " << MAX_RULE_SIZE
+                << " symbols and " << MAX_ARITY << " non-terminals are supported)" << std::endl;
+      abort();
+    }
+    scfglex_tmp_token.assign(yytext + 1, yyleng - 2);
+    scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token);
+    ++scfglex_src_arity;
+    ++scfglex_src_rhs_size;
+    }
+
+<SRC>\[{NT},[1-9][0-9]?\] {
+    // Indexed source non-terminal such as [X,1]; the index has one or two digits.
+    int index = yytext[yyleng - 2] - '0';
+    if (yytext[yyleng - 3] == ',') {
+      scfglex_tmp_token.assign(yytext + 1, yyleng - 4);
+    } else {
+      scfglex_tmp_token.assign(yytext + 1, yyleng - 5);
+      index += 10 * (yytext[yyleng - 3] - '0');
+    }
+    // source indices have to be 1, 2, 3, ... in order of appearance
+    if ((scfglex_src_arity+1) != index) {
+      std::cerr << "Src indices must go in order: expected " << (scfglex_src_arity + 1) << " but got " << index << std::endl;
+      abort();
+    }
+    if (scfglex_src_rhs_size >= MAX_RULE_SIZE || scfglex_src_arity >= MAX_ARITY) {
+      std::cerr << "Line " << lex_line << ": source side too long (at most " << MAX_RULE_SIZE
+                << " symbols and " << MAX_ARITY << " non-terminals are supported)" << std::endl;
+      abort();
+    }
+    scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token);
+    ++scfglex_src_rhs_size;
+    ++scfglex_src_arity;
+    }
+
+<TRG>\[{NT},[1-9][0-9]?\] {
+    // Indexed target non-terminal with a category, such as [X,1].
+    int index = yytext[yyleng - 2] - '0';
+    if (yytext[yyleng - 3] == ',') {
+      scfglex_tmp_token.assign(yytext + 1, yyleng - 4);
+    } else {
+      scfglex_tmp_token.assign(yytext + 1, yyleng - 5);
+      index += 10 * (yytext[yyleng - 3] - '0');
+    }
+    if (scfglex_trg_rhs_size >= MAX_RULE_SIZE) {
+      std::cerr << "Line " << lex_line << ": target side too long (at most " << MAX_RULE_SIZE << " symbols are supported)" << std::endl;
+      abort();
+    }
+    ++scfglex_trg_arity;
+    // std::cerr << "TRG INDEX: " << index << std::endl;
+    // validate the index range first; the symbol check subscripts by index
+    sanity_check_trg_index(index);
+    sanity_check_trg_symbol(-TD::Convert(scfglex_tmp_token), index);
+    // non-terminal with index k is stored as 1 - k (0, -1, -2, ...)
+    scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index;
+    ++scfglex_trg_rhs_size;
+}
+
+<TRG>\[[1-9][0-9]?\] {
+    // Bare target index such as [1]; no category to compare with the source.
+    int index = yytext[yyleng - 2] - '0';
+    if (yyleng == 4) {
+      index += 10 * (yytext[yyleng - 3] - '0');
+    }
+    if (scfglex_trg_rhs_size >= MAX_RULE_SIZE) {
+      std::cerr << "Line " << lex_line << ": target side too long (at most " << MAX_RULE_SIZE << " symbols are supported)" << std::endl;
+      abort();
+    }
+    ++scfglex_trg_arity;
+    sanity_check_trg_index(index);
+    scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index;
+    ++scfglex_trg_rhs_size;
+}
+
+<LHS_END>[ \t] { ; }
+<LHS_END>\|\|\| {
+ // separator after the left-hand side: clear the per-rule counters and start
+ // reading the source side
+ scfglex_reset();
+ BEGIN(SRC);
+ }
+<INITIAL,LHS_END>. {
+ // any other character before the source side is a format error
+ std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl;
+ abort();
+ }
+
+<SRC>\|\|\| {
+    // end of the source side: clear the "target index already used" marks for
+    // the non-terminals just read, then read the target side
+    memset(scfglex_nt_sanity, 0, scfglex_src_arity * sizeof(int));
+    BEGIN(TRG);
+    }
+<SRC>[^ \t]+ {
+    // source terminal
+    if (scfglex_src_rhs_size >= MAX_RULE_SIZE) {
+      std::cerr << "Line " << lex_line << ": source side too long (at most " << MAX_RULE_SIZE << " symbols are supported)" << std::endl;
+      abort();
+    }
+    scfglex_tmp_token.assign(yytext, yyleng);
+    scfglex_src_rhs[scfglex_src_rhs_size] = TD::Convert(scfglex_tmp_token);
+    ++scfglex_src_rhs_size;
+    }
+<SRC>[ \t]+ { ; }
+
+<TRG>\|\|\| {
+    // end of the target side: features follow
+    BEGIN(FEATS);
+    }
+<TRG>[^ \t]+ {
+    // target terminal
+    if (scfglex_trg_rhs_size >= MAX_RULE_SIZE) {
+      std::cerr << "Line " << lex_line << ": target side too long (at most " << MAX_RULE_SIZE << " symbols are supported)" << std::endl;
+      abort();
+    }
+    scfglex_tmp_token.assign(yytext, yyleng);
+    scfglex_trg_rhs[scfglex_trg_rhs_size] = TD::Convert(scfglex_tmp_token);
+    ++scfglex_trg_rhs_size;
+    }
+<TRG>[ \t]+ { ; }
+
+<TRG,FEATS,ALIGNS>\n {
+ // End of line: the rule is complete.  Source and target must contain the
+ // same number of non-terminals.
+ if (scfglex_src_arity != scfglex_trg_arity) {
+ std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n";
+ abort();
+ }
+ // build the rule from the collected buffers and hand it to the callback
+ TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity));
+ rule_callback(rp, rule_callback_extra);
+ // std::cerr << rp->AsString() << std::endl;
+ num_rules++;
+ lex_line++;
+ // progress output: a dot every 50000 rules, the count every 2000000 rules
+ if (num_rules % 50000 == 0) { std::cerr << '.' << std::flush; fl = true; }
+ if (num_rules % 2000000 == 0) { std::cerr << " [" << num_rules << "]\n"; fl = false; }
+ BEGIN(INITIAL);
+ }
+
+<FEATS>[ \t;] { ; }
+<FEATS>[^ \t=;]+= {
+    // named feature "name=value": resolve the name now, read the value next
+    if (scfglex_num_feats >= MAX_FEATS) {
+      std::cerr << "Line " << lex_line << ": too many features (at most " << MAX_FEATS << " are supported)" << std::endl;
+      abort();
+    }
+    scfglex_tmp_token.assign(yytext, yyleng - 1);
+    const int fid = FD::Convert(scfglex_tmp_token);
+    if (fid < 1) {
+      std::cerr << "\nUNWEIGHED FEATURE " << scfglex_tmp_token << std::endl;
+      abort();
+    }
+    scfglex_feat_ids[scfglex_num_feats] = fid;
+    BEGIN(FEATVAL);
+    }
+<FEATS>\|\|\| {
+    // end of the features: alignment points follow
+    BEGIN(ALIGNS);
+    }
+<FEATVAL>{REAL} {
+    // value of the named feature whose id was stored by the rule above
+    scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL);
+    ++scfglex_num_feats;
+    BEGIN(FEATS);
+    }
+<FEATVAL>. {
+    std::cerr << "Line " << lex_line << ": unexpected input in feature value: " << yytext << std::endl;
+    abort();
+    }
+<FEATS>{REAL} {
+    // unnamed feature value: the i-th value takes the i-th id from
+    // scfglex_phrase_fnames
+    if (scfglex_num_feats >= MAX_FEATS ||
+        scfglex_num_feats >= static_cast<int>(scfglex_phrase_fnames.size())) {
+      std::cerr << "Line " << lex_line << ": too many features (at most " << MAX_FEATS << " are supported)" << std::endl;
+      abort();
+    }
+    scfglex_feat_ids[scfglex_num_feats] = scfglex_phrase_fnames[scfglex_num_feats];
+    scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL);
+    ++scfglex_num_feats;
+    }
+<FEATS>. {
+    std::cerr << "Line " << lex_line << " unexpected input in features: " << yytext << std::endl;
+    abort();
+    }
+<ALIGNS>[0-9]+-[0-9]+ {
+ // alignment point "a-b": both numbers are parsed by hand, but the result is
+ // not stored anywhere yet (see the TODO below)
+ int i = 0;
+ int a = 0;
+ int b = 0;
+ // digits before the dash
+ while (i < yyleng) {
+ char c = yytext[i];
+ if (c == '-') break;
+ a *= 10;
+ a += c - '0';
+ ++i;
+ }
+ // skip the dash
+ ++i;
+ // digits after the dash
+ while (i < yyleng) {
+ b *= 10;
+ b += yytext[i] - '0';
+ ++i;
+ }
+ // TODO store alignment points somewhere
+ }
+<ALIGNS>[ \t] ;
+<ALIGNS>. {
+ std::cerr << "Line " << lex_line << ": unexpected input in alignment: " << yytext << std::endl;
+ abort();
+ }
+%%
+
+#include "filelib.h"
+
+// Scans grammar rules from `in`, calling `func(rule, extra)` once per rule.
+// On first use the ids of the default feature names PhraseModel_0 ..
+// PhraseModel_99 (used for unnamed feature values) are looked up.
+// The function may be called repeatedly, once per input stream.
+void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, void* extra) {
+  if (scfglex_phrase_fnames.empty()) {
+    scfglex_phrase_fnames.resize(100);
+    for (size_t i = 0; i < scfglex_phrase_fnames.size(); ++i) {
+      std::ostringstream os;
+      os << "PhraseModel_" << i;
+      scfglex_phrase_fnames[i] = FD::Convert(os.str());
+    }
+  }
+  lex_line = 1;
+  scfglex_stream = in;
+  rule_callback_extra = extra;
+  rule_callback = func;
+  // A scanner that has hit end of file must be restarted before it can read
+  // again.  Input comes from scfglex_stream through YY_INPUT, so no FILE* is
+  // needed.  Also reset the start condition, since a previous input may have
+  // ended in the middle of a line.
+  yyrestart(NULL);
+  BEGIN(INITIAL);
+  yylex();
+}
+
diff --git a/decoder/sampler.h b/decoder/sampler.h
new file mode 100644
index 00000000..e5840f41
--- /dev/null
+++ b/decoder/sampler.h
@@ -0,0 +1,136 @@
+#ifndef SAMPLER_H_
+#define SAMPLER_H_
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real.hpp>
+#include <boost/random/variate_generator.hpp>
+#include <boost/random/normal_distribution.hpp>
+#include <boost/random/poisson_distribution.hpp>
+
+#include "prob.h"
+
+struct SampleSet;
+
+// Wraps a random engine of type RNG together with a U(0,1) generator and
+// offers the sampling helpers used by the decoder.
+template <typename RNG>
+struct RandomNumberGenerator {
+  // Returns a seed read from /dev/urandom.  When that fails, a warning is
+  // printed and the current time is used instead.  The seed is always logged
+  // to stderr.
+  static uint32_t GetTrulyRandomSeed() {
+    uint32_t seed = 0;
+    std::ifstream r("/dev/urandom");
+    if (r) {
+      r.read((char*)&seed,sizeof(uint32_t));
+    }
+    if (r.fail() || !r) {
+      std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl;
+      seed = time(NULL);
+    }
+    std::cerr << "Seeding random number sequence to " << seed << std::endl;
+    return seed;
+  }
+
+  // Seeds the engine with GetTrulyRandomSeed().
+  RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) {
+    uint32_t seed = GetTrulyRandomSeed();
+    m_generator.seed(seed);
+  }
+  // Seeds the engine with `seed`; a seed of 0 requests a truly random seed.
+  explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) {
+    if (!seed) seed = GetTrulyRandomSeed();
+    m_generator.seed(seed);
+  }
+
+  // Chooses between two outcomes with unnormalized scores a and b: returns 0
+  // with probability a/(a+b) and 1 otherwise.  T is the annealing
+  // temperature; for T != 1 both scores are raised to the power 1/T first,
+  // exactly as the SampleSet overload does.
+  size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) {
+    if (T == 1.0) {
+      if (this->next() > (a / (a + b))) return 1; else return 0;
+    }
+    assert(T > 0.0);
+    const prob_t annealing_factor(1.0 / T);
+    const prob_t aa(a.pow(annealing_factor));  // a^(1/T)
+    const prob_t bb(b.pow(annealing_factor));  // b^(1/T)
+    if (this->next() > (aa / (aa + bb))) return 1; else return 0;
+  }
+
+  // T is the annealing temperature, if desired
+  size_t SelectSample(const SampleSet& ss, double T = 1.0);
+
+  // draw a value from U(0,1)
+  double next() {return m_random();}
+
+  // draw a value from a normal distribution
+  // NOTE(review): `var` is passed as the second constructor argument of
+  // boost::normal_distribution, which Boost documents as sigma (the standard
+  // deviation), not the variance.  Confirm what callers expect.
+  double NextNormal(double mean, double var) {
+    return boost::normal_distribution<double>(mean, var)(m_random);
+  }
+
+  // draw a value from a Poisson distribution
+  // lambda must be greater than 0
+  int NextPoisson(int lambda) {
+    return boost::poisson_distribution<int>(lambda)(m_random);
+  }
+
+  // Metropolis-Hastings acceptance test with ratio
+  // a = (p_cur / p_prev) * (q_prev / q_cur).  The move is accepted outright
+  // when log(a) >= 0, otherwise when a uniform draw is below a.
+  bool AcceptMetropolisHastings(const prob_t& p_cur,
+                                const prob_t& p_prev,
+                                const prob_t& q_cur,
+                                const prob_t& q_prev) {
+    const prob_t a = (p_cur / p_prev) * (q_prev / q_cur);
+    if (log(a) >= 0.0) return true;
+    return (prob_t(this->next()) < a);
+  }
+
+ private:
+  boost::uniform_real<> m_dist;
+  RNG m_generator;
+  // m_random holds a reference to m_generator, so it is declared after it
+  boost::variate_generator<RNG&, boost::uniform_real<> > m_random;
+};
+
+typedef RandomNumberGenerator<boost::mt19937> MT19937;
+
+// A list of (unnormalized) scores, one per candidate, from which
+// RandomNumberGenerator::SelectSample draws an index.
+class SampleSet {
+ public:
+ // score of candidate i (no bounds check)
+ const prob_t& operator[](int i) const { return m_scores[i]; }
+ bool empty() const { return m_scores.empty(); }
+ // appends the score of a new candidate
+ void add(const prob_t& s) { m_scores.push_back(s); }
+ void clear() { m_scores.clear(); }
+ size_t size() const { return m_scores.size(); }
+ // public: SelectSample reads the scores directly
+ std::vector<prob_t> m_scores;
+};
+
+// Draws an index from `ss` with probability proportional to its score, or to
+// score^(1/T) when the temperature T differs from 1.  `ss` must not be empty
+// and T must be positive.
+template <typename RNG>
+size_t RandomNumberGenerator<RNG>::SelectSample(const SampleSet& ss, double T) {
+  assert(T > 0.0);
+  const std::vector<prob_t>& scores = ss.m_scores;
+  const size_t n = scores.size();
+  assert(n > 0);
+  if (n == 1) return 0;
+
+  const prob_t inv_temp(1.0 / T);
+  const bool anneal = (inv_temp != prob_t::One());
+
+  // normalization constant: sum of the (possibly annealed) scores
+  prob_t total = prob_t::Zero();
+  if (!anneal) {
+    total = std::accumulate(scores.begin(), scores.end(), prob_t::Zero());
+  } else {
+    for (size_t k = 0; k < n; ++k)
+      total += scores[k].pow(inv_temp);  // p^(1/T)
+  }
+
+  // a point drawn uniformly from [0, total)
+  prob_t threshold(this->next());
+  threshold *= total;
+
+  // walk the running sum until it reaches the drawn point
+  prob_t running = scores[0];
+  if (anneal) running.poweq(inv_temp);
+  size_t idx = 1;
+  while (idx < n && running < threshold) {
+    if (anneal) {
+      running += scores[idx].pow(inv_temp);
+    } else {
+      running += scores[idx];
+    }
+    ++idx;
+  }
+  return idx - 1;
+}
+
+#endif
diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc
new file mode 100644
index 00000000..c215eea6
--- /dev/null
+++ b/decoder/scfg_translator.cc
@@ -0,0 +1,132 @@
+#include "translator.h"
+
+#include <vector>
+
+#include "hg.h"
+#include "grammar.h"
+#include "bottom_up_parser.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+// true while a sentence-specific grammar (pushed by ProcessMarkupHintsImpl) is
+// the last entry of the grammar list; read by SentenceCompleteImpl
+static bool usingSentenceGrammar = false;
+// when true, Translate lists the grammars used for each sentence on stderr
+static bool printGrammarsUsed = false;
+
+// Implementation of the SCFG translator: holds the configured grammars and
+// parses each input with them.
+struct SCFGTranslatorImpl {
+  // Reads the SCFG options from `conf` and assembles the grammar list, in this
+  // order: the text grammars named by "grammar"; the glue grammar, unless
+  // "scfg_no_hiero_glue_grammar" is set; the extra glue grammar, if
+  // "scfg_extra_glue_grammar" is given.
+  SCFGTranslatorImpl(const boost::program_options::variables_map& conf) :
+      max_span_limit(conf["scfg_max_span_limit"].as<int>()),
+      add_pass_through_rules(conf.count("add_pass_through_rules")),
+      goal(conf["goal"].as<string>()),
+      default_nt(conf["scfg_default_nt"].as<string>()) {
+    if (conf.count("grammar")) {
+      vector<string> grammar_files = conf["grammar"].as<vector<string> >();
+      for (vector<string>::iterator f = grammar_files.begin(); f != grammar_files.end(); ++f) {
+        cerr << "Reading SCFG grammar from " << *f << endl;
+        TextGrammar* text_grammar = new TextGrammar(*f);
+        text_grammar->SetMaxSpan(max_span_limit);
+        text_grammar->SetGrammarName(*f);
+        grammars.push_back(GrammarPtr(text_grammar));
+      }
+    }
+    const bool want_glue = !conf.count("scfg_no_hiero_glue_grammar");
+    if (want_glue) {
+      GlueGrammar* glue = new GlueGrammar(goal, default_nt);
+      glue->SetGrammarName("GlueGrammar");
+      grammars.push_back(GrammarPtr(glue));
+      cerr << "Adding glue grammar" << endl;
+    }
+    if (conf.count("scfg_extra_glue_grammar")) {
+      GlueGrammar* extra_glue = new GlueGrammar(conf["scfg_extra_glue_grammar"].as<string>());
+      extra_glue->SetGrammarName("ExtraGlueGrammar");
+      grammars.push_back(GrammarPtr(extra_glue));
+      cerr << "Adding extra glue grammar" << endl;
+    }
+  }
+
+  const int max_span_limit;
+  const bool add_pass_through_rules;
+  const string goal;
+  const string default_nt;
+  vector<GrammarPtr> grammars;
+
+  // Converts `input` (text or PLF) into the source lattice stored in *smeta,
+  // parses it with the configured grammars (plus a pass-through grammar when
+  // requested) and reweights the resulting forest with `weights`.
+  // Returns false when the parser fails.
+  bool Translate(const string& input,
+                 SentenceMetadata* smeta,
+                 const vector<double>& weights,
+                 Hypergraph* forest) {
+    // working copy, so per-sentence additions do not touch `grammars`
+    vector<GrammarPtr> active_grammars(grammars);
+
+    Lattice& lattice = smeta->src_lattice_;
+    LatticeTools::ConvertTextOrPLF(input, &lattice);
+    smeta->SetSourceLength(lattice.size());
+
+    if (add_pass_through_rules) {
+      PassThroughGrammar* pass_through = new PassThroughGrammar(lattice, default_nt);
+      pass_through->SetGrammarName("PassThrough");
+      active_grammars.push_back(GrammarPtr(pass_through));
+      cerr << "Adding pass through grammar" << endl;
+    }
+
+    // optionally list the grammars in effect for this sentence
+    if (printGrammarsUsed) {
+      for (size_t k = 0; k < active_grammars.size(); ++k)
+        cerr << "Using grammar::" << active_grammars[k]->GetGrammarName() << endl;
+    }
+
+    ExhaustiveBottomUpParser parser(goal, active_grammars);
+    if (parser.Parse(lattice, forest)) {
+      forest->Reweight(weights);
+      return true;
+    }
+    return false;
+  }
+};
+
+/*
+Called once from cdec.cc to setup the initial SCFG translation structure backend
+*/
+// All configuration handling happens in the SCFGTranslatorImpl constructor.
+SCFGTranslator::SCFGTranslator(const boost::program_options::variables_map& conf) :
+ pimpl_(new SCFGTranslatorImpl(conf)) {}
+
+/*
+Called for each sentence to perform translation using the SCFG backend
+*/
+bool SCFGTranslator::TranslateImpl(const string& input,
+ SentenceMetadata* smeta,
+ const vector<double>& weights,
+ Hypergraph* minus_lm_forest) {
+
+ // plain delegation; returns false when SCFGTranslatorImpl::Translate fails to parse
+ return pimpl_->Translate(input, smeta, weights, minus_lm_forest);
+}
+
+/*
+Check for grammar pointer in the sentence markup, for use with sentence specific grammars
+ */
+// Looks for a "grammar" entry in the sentence markup.  When present, the named
+// file is loaded as a sentence-specific grammar and appended to the grammar
+// list; usingSentenceGrammar records whether that happened.
+void SCFGTranslator::ProcessMarkupHintsImpl(const map<string, string>& kv) {
+  const map<string,string>::const_iterator hint = kv.find("grammar");
+  if (hint == kv.end()) {
+    usingSentenceGrammar = false;
+    return;
+  }
+
+  const string& grammar_file = hint->second;
+  cerr << "Loading sentence grammar from:" << grammar_file << endl;
+  usingSentenceGrammar = true;
+
+  // build the grammar with the global span limit and register it
+  TextGrammar* sentence_grammar = new TextGrammar(grammar_file);
+  sentence_grammar->SetMaxSpan(pimpl_->max_span_limit);
+  sentence_grammar->SetGrammarName(grammar_file);
+  pimpl_->grammars.push_back(GrammarPtr(sentence_grammar));
+}
+
+// Called after a sentence has been processed: drops the sentence-specific
+// grammar appended by ProcessMarkupHintsImpl, if there was one.
+void SCFGTranslator::SentenceCompleteImpl() {
+  if (!usingSentenceGrammar) return;
+
+  cerr << "Clearing grammar" << endl;
+  pimpl_->grammars.pop_back();
+  // The sentence grammar is gone.  Clear the flag so that a second call, or a
+  // sentence for which the markup hook is not run, cannot pop one of the
+  // permanent grammars.
+  usingSentenceGrammar = false;
+}
+
diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h
new file mode 100644
index 00000000..ef9eb388
--- /dev/null
+++ b/decoder/sentence_metadata.h
@@ -0,0 +1,47 @@
+#ifndef _SENTENCE_METADATA_H_
+#define _SENTENCE_METADATA_H_
+
+#include <cassert>
+#include "lattice.h"
+
+// Per-sentence information shared between the translator and other
+// components: sentence id, source length and lattice, and, when a reference is
+// given, the reference and its length.
+struct SentenceMetadata {
+ // `ref` is kept by pointer (not copied) when it is non-empty, so it must
+ // outlive this object.  An empty `ref` means "no reference".
+ SentenceMetadata(int id, const Lattice& ref) :
+ sent_id_(id),
+ src_len_(-1),
+ has_reference_(ref.size() > 0),
+ trg_len_(ref.size()),
+ ref_(has_reference_ ? &ref : NULL) {}
+
+ // this should be called by the Translator object after
+ // it has parsed the source
+ void SetSourceLength(int sl) { src_len_ = sl; }
+
+ // this should be called if a separate model needs to
+ // specify how long the target sentence should be
+ void SetTargetLength(int tl) {
+ assert(!has_reference_);
+ trg_len_ = tl;
+ }
+ bool HasReference() const { return has_reference_; }
+ // dereferences ref_, which is NULL when HasReference() is false
+ const Lattice& GetReference() const { return *ref_; }
+ // -1 until SetSourceLength has been called
+ int GetSourceLength() const { return src_len_; }
+ int GetTargetLength() const { return trg_len_; }
+ int GetSentenceID() const { return sent_id_; }
+ // this will be empty if the translator accepts non FS input!
+ const Lattice& GetSourceLattice() const { return src_lattice_; }
+
+ private:
+ const int sent_id_;
+ // the following should be set, if possible, by the Translator
+ int src_len_;
+ public:
+ Lattice src_lattice_; // this will only be set if inputs are finite state!
+ private:
+ // you need to be very careful when depending on these values
+ // they will only be set during training / alignment contexts
+ const bool has_reference_;
+ int trg_len_;
+ const Lattice* const ref_;
+};
+
+#endif
diff --git a/decoder/small_vector.h b/decoder/small_vector.h
new file mode 100644
index 00000000..800c1df1
--- /dev/null
+++ b/decoder/small_vector.h
@@ -0,0 +1,187 @@
+#ifndef _SMALL_VECTOR_H_
+#define _SMALL_VECTOR_H_
+
+#include <algorithm> // std::max
+#include <cassert>
+#include <cstddef>   // size_t
+#include <cstring>
+#include <streambuf> // std::max - where to get this?
+
+#define __SV_MAX_STATIC 2
+
+// A vector of int tuned for very short sequences: up to __SV_MAX_STATIC
+// elements are stored inline, longer sequences live in a heap array.
+// Size and capacity are kept in an unsigned char, so a SmallVector can hold at
+// most 255 elements; growing beyond that is a precondition violation that is
+// caught by assertions.
+class SmallVector {
+
+ public:
+  SmallVector() : size_(0) {}
+
+  // s copies of v; s must be smaller than 0x80
+  explicit SmallVector(size_t s, int v = 0) : size_(s) {
+    assert(s < 0x80);
+    if (s <= __SV_MAX_STATIC) {
+      for (size_t i = 0; i < s; ++i) data_.vals[i] = v;
+    } else {
+      capacity_ = s;
+      data_.ptr = new int[s];
+      for (size_t i = 0; i < s; ++i) data_.ptr[i] = v;
+    }
+  }
+
+  SmallVector(const SmallVector& o) : size_(o.size_) {
+    if (size_ <= __SV_MAX_STATIC) {
+      // copy only the slots in use; the others hold no defined value
+      for (size_t i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i];
+    } else {
+      capacity_ = size_;
+      data_.ptr = new int[capacity_];
+      std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int));
+    }
+  }
+
+  const SmallVector& operator=(const SmallVector& o) {
+    if (this == &o) return *this;
+    const bool on_heap = size_ > __SV_MAX_STATIC;
+    if (o.size_ <= __SV_MAX_STATIC) {
+      // the result is stored inline: release our heap array, if any
+      if (on_heap) delete[] data_.ptr;
+      size_ = o.size_;
+      for (size_t i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i];
+    } else {
+      if (!on_heap || capacity_ < o.size_) {
+        // allocate before releasing, so a failed allocation leaves *this intact
+        int* fresh = new int[o.size_];
+        if (on_heap) delete[] data_.ptr;
+        data_.ptr = fresh;
+        capacity_ = o.size_;
+      }
+      size_ = o.size_;
+      std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int));
+    }
+    return *this;
+  }
+
+  ~SmallVector() {
+    if (size_ <= __SV_MAX_STATIC) return;
+    delete[] data_.ptr;
+  }
+
+  void clear() {
+    if (size_ > __SV_MAX_STATIC) {
+      delete[] data_.ptr;
+    }
+    size_ = 0;
+  }
+
+  bool empty() const { return size_ == 0; }
+  size_t size() const { return size_; }
+
+  // Grows the heap array to hold at least min_size elements.  Only valid
+  // while the elements are stored on the heap.
+  inline void ensure_capacity(unsigned char min_size) {
+    assert(min_size > __SV_MAX_STATIC);
+    if (min_size <= capacity_) return;
+    // double the capacity, saturating at what an unsigned char can express
+    unsigned int new_cap = static_cast<unsigned int>(capacity_) << 1;
+    if (new_cap > 0xff) new_cap = 0xff;
+    if (new_cap < min_size) new_cap = min_size;
+    int* tmp = new int[new_cap];
+    std::memcpy(tmp, data_.ptr, capacity_ * sizeof(int));
+    delete[] data_.ptr;
+    data_.ptr = tmp;
+    capacity_ = static_cast<unsigned char>(new_cap);
+  }
+
+  // Moves the inline elements into a freshly allocated heap array.  Only
+  // valid while the elements are stored inline.
+  inline void copy_vals_to_ptr() {
+    capacity_ = __SV_MAX_STATIC * 2;
+    int* tmp = new int[capacity_];
+    for (size_t i = 0; i < size_ && i < __SV_MAX_STATIC; ++i) tmp[i] = data_.vals[i];
+    data_.ptr = tmp;
+  }
+
+  inline void push_back(int v) {
+    assert(size_ < 0xff);  // size_ is an unsigned char
+    if (size_ < __SV_MAX_STATIC) {
+      data_.vals[size_] = v;
+      ++size_;
+      return;
+    } else if (size_ == __SV_MAX_STATIC) {
+      copy_vals_to_ptr();
+    } else if (size_ == capacity_) {
+      ensure_capacity(size_ + 1);
+    }
+    data_.ptr[size_] = v;
+    ++size_;
+  }
+
+  int& back() { return this->operator[](size_ - 1); }
+  const int& back() const { return this->operator[](size_ - 1); }
+  int& front() { return this->operator[](0); }
+  const int& front() const { return this->operator[](0); }
+
+  // Resizes to s elements; new elements are set to v.  s must not exceed 255.
+  void resize(size_t s, int v = 0) {
+    assert(s <= 0xff);
+    if (s <= __SV_MAX_STATIC) {
+      if (size_ > __SV_MAX_STATIC) {
+        // heap -> inline: save the survivors before the array is released
+        int tmp[__SV_MAX_STATIC];
+        for (size_t i = 0; i < s; ++i) tmp[i] = data_.ptr[i];
+        delete[] data_.ptr;
+        for (size_t i = 0; i < s; ++i) data_.vals[i] = tmp[i];
+        size_ = s;
+        return;
+      }
+      // inline -> inline: fill the new slots, if any
+      for (size_t i = size_; i < s; ++i)
+        data_.vals[i] = v;
+      size_ = s;
+      return;
+    }
+    if (size_ <= __SV_MAX_STATIC)
+      copy_vals_to_ptr();
+    if (s > capacity_)
+      ensure_capacity(s);
+    for (size_t i = size_; i < s; ++i)
+      data_.ptr[i] = v;
+    size_ = s;
+  }
+
+  int& operator[](size_t i) {
+    if (size_ <= __SV_MAX_STATIC) return data_.vals[i];
+    return data_.ptr[i];
+  }
+
+  const int& operator[](size_t i) const {
+    if (size_ <= __SV_MAX_STATIC) return data_.vals[i];
+    return data_.ptr[i];
+  }
+
+  bool operator==(const SmallVector& o) const {
+    if (size_ != o.size_) return false;
+    if (size_ <= __SV_MAX_STATIC) {
+      for (size_t i = 0; i < size_; ++i)
+        if (data_.vals[i] != o.data_.vals[i]) return false;
+      return true;
+    } else {
+      for (size_t i = 0; i < size_; ++i)
+        if (data_.ptr[i] != o.data_.ptr[i]) return false;
+      return true;
+    }
+  }
+
+ private:
+  unsigned char capacity_;  // only defined when size_ > __SV_MAX_STATIC
+  unsigned char size_;
+  // inline storage while size_ <= __SV_MAX_STATIC, heap array otherwise
+  union StorageType {
+    int vals[__SV_MAX_STATIC];
+    int* ptr;
+  };
+  StorageType data_;
+
+};
+
+// negation of SmallVector::operator==
+inline bool operator!=(const SmallVector& a, const SmallVector& b) {
+ return !(a==b);
+}
+
+#endif
diff --git a/decoder/small_vector_test.cc b/decoder/small_vector_test.cc
new file mode 100644
index 00000000..84237791
--- /dev/null
+++ b/decoder/small_vector_test.cc
@@ -0,0 +1,129 @@
+#include "small_vector.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cassert>
+#include <vector>
+
+using namespace std;
+
+// Fixture for the SmallVector tests; no shared state is set up or torn down.
+class SVTest : public testing::Test {
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+};
+
+// Exercises vectors that outgrow the inline storage: growth, copying,
+// assignment between inline and heap representations, and resizing.
+// Checks use gtest assertions so that failures are reported by the test runner
+// and are not compiled out by NDEBUG.
+TEST_F(SVTest, LargerThan2) {
+  SmallVector v;
+  SmallVector v2;
+  v.push_back(0);
+  v.push_back(1);
+  v.push_back(2);
+  ASSERT_EQ(v.size(), 3u);
+  EXPECT_EQ(v[2], 2);
+  EXPECT_EQ(v[1], 1);
+  EXPECT_EQ(v[0], 0);
+  v2 = v;
+  SmallVector copy(v);
+  ASSERT_EQ(copy.size(), 3u);
+  EXPECT_EQ(copy[0], 0);
+  EXPECT_EQ(copy[1], 1);
+  EXPECT_EQ(copy[2], 2);
+  EXPECT_TRUE(copy == v2);
+  copy[1] = 99;
+  EXPECT_TRUE(copy != v2);
+  ASSERT_EQ(v2.size(), 3u);
+  EXPECT_EQ(v2[2], 2);
+  EXPECT_EQ(v2[1], 1);
+  EXPECT_EQ(v2[0], 0);
+  v2[0] = -2;
+  v2[1] = -1;
+  v2[2] = 0;
+  EXPECT_EQ(v2[2], 0);
+  EXPECT_EQ(v2[1], -1);
+  EXPECT_EQ(v2[0], -2);
+  // heap <- inline assignment
+  SmallVector v3(1,1);
+  EXPECT_EQ(v3[0], 1);
+  v2 = v3;
+  ASSERT_EQ(v2.size(), 1u);
+  EXPECT_EQ(v2[0], 1);
+  // heap <- smaller heap assignment
+  SmallVector v4(10, 1);
+  ASSERT_EQ(v4.size(), 10u);
+  EXPECT_EQ(v4[5], 1);
+  EXPECT_EQ(v4[9], 1);
+  v4 = v;
+  ASSERT_EQ(v4.size(), 3u);
+  EXPECT_EQ(v4[2], 2);
+  EXPECT_EQ(v4[1], 1);
+  EXPECT_EQ(v4[0], 0);
+  // heap <- larger heap assignment
+  SmallVector v5(10, 2);
+  ASSERT_EQ(v5.size(), 10u);
+  EXPECT_EQ(v5[7], 2);
+  EXPECT_EQ(v5[0], 2);
+  EXPECT_EQ(v.size(), 3u);
+  v = v5;
+  ASSERT_EQ(v.size(), 10u);
+  EXPECT_EQ(v[2], 2);
+  EXPECT_EQ(v[9], 2);
+  // growth by push_back, then shrinking and growing with resize
+  SmallVector cc;
+  for (int i = 0; i < 33; ++i)
+    cc.push_back(i);
+  for (int i = 0; i < 33; ++i)
+    EXPECT_EQ(cc[i], i);
+  cc.resize(20);
+  ASSERT_EQ(cc.size(), 20u);
+  for (int i = 0; i < 20; ++i)
+    EXPECT_EQ(cc[i], i);
+  cc[0]=-1;
+  cc.resize(1, 999);
+  ASSERT_EQ(cc.size(), 1u);
+  EXPECT_EQ(cc[0], -1);
+  cc.resize(99, 99);
+  for (int i = 1; i < 99; ++i) {
+    cerr << i << " " << cc[i] << endl;
+    EXPECT_EQ(cc[i], 99);
+  }
+  cc.clear();
+  EXPECT_EQ(cc.size(), 0u);
+}
+
+// Exercises vectors that fit into the inline storage: construction,
+// comparison, element access, copying, clear and assignment.
+// All checks use gtest assertions, so none of them is compiled out by NDEBUG.
+TEST_F(SVTest, Small) {
+  SmallVector v;
+  SmallVector v1(1,0);
+  SmallVector v2(2,10);
+  SmallVector v1a(2,0);
+  EXPECT_TRUE(v1 != v1a);
+  EXPECT_TRUE(v1 == v1);
+  EXPECT_EQ(v1[0], 0);
+  EXPECT_EQ(v2[1], 10);
+  EXPECT_EQ(v2[0], 10);
+  ++v2[1];
+  --v2[0];
+  EXPECT_EQ(v2[0], 9);
+  EXPECT_EQ(v2[1], 11);
+  SmallVector v3(v2);
+  EXPECT_EQ(v3[0], 9);
+  EXPECT_EQ(v3[1], 11);
+  EXPECT_FALSE(v3.empty());
+  EXPECT_EQ(v3.size(), 2u);
+  v3.clear();
+  EXPECT_TRUE(v3.empty());
+  EXPECT_EQ(v3.size(), 0u);
+  EXPECT_TRUE(v3 != v2);
+  EXPECT_TRUE(v2 != v3);
+  v3 = v2;
+  EXPECT_TRUE(v3 == v2);
+  EXPECT_TRUE(v2 == v3);
+  EXPECT_EQ(v3[0], 9);
+  EXPECT_EQ(v3[1], 11);
+  EXPECT_FALSE(v3.empty());
+  EXPECT_EQ(v3.size(), 2u);
+  // informational: footprint compared with std::vector<int>
+  cerr << sizeof(SmallVector) << endl;
+  cerr << sizeof(vector<int>) << endl;
+}
+
+// Standard gtest entry point: runs every registered test.
+int main(int argc, char** argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
diff --git a/decoder/sparse_vector.cc b/decoder/sparse_vector.cc
new file mode 100644
index 00000000..4035b9ef
--- /dev/null
+++ b/decoder/sparse_vector.cc
@@ -0,0 +1,98 @@
+#include "sparse_vector.h"
+
+#include <iostream>
+#include <cstring>
+
+#include "hg_io.h"
+
+using namespace std;
+
+namespace B64 {
+
+// Serializes an objective value and a sparse feature vector into one binary
+// record and writes it, base64-encoded, to *out.
+//
+// Record layout (native byte order, no padding between fields):
+//   double  objective
+//   int     number of features
+//   for each feature: 1 length byte, the feature name, double value
+//   4-byte magic 0xBAABABBA
+//
+// A feature name's length is stored in a single byte, so every name must be
+// 1..255 bytes long; this is asserted instead of being silently truncated
+// into a record that Decode cannot read back.
+void Encode(double objective, const SparseVector<double>& v, ostream* out) {
+  typedef SparseVector<double>::const_iterator const_iterator;
+  const unsigned int kMagic = 0xBAABABBAu;
+  const int num_feats = v.num_active();
+
+  // First pass: compute the exact record size so the buffer is sized once.
+  size_t tot_size = sizeof(double) + sizeof(int);
+  for (const_iterator it = v.begin(); it != v.end(); ++it) {
+    const string& fname = FD::Convert(it->first);
+    assert(!fname.empty() && fname.size() < 256);
+    tot_size += sizeof(unsigned char) + fname.size() + sizeof(double);
+  }
+  tot_size += sizeof(kMagic);
+
+  // The vector owns the buffer, so nothing leaks on any exit path.
+  vector<char> data(tot_size);
+  char* cur = &data[0];
+
+  // Fields are written with memcpy because their offsets are generally not
+  // aligned for double/int; storing through a cast pointer would be
+  // undefined behaviour.
+  memcpy(cur, &objective, sizeof(objective));
+  cur += sizeof(objective);
+  memcpy(cur, &num_feats, sizeof(num_feats));
+  cur += sizeof(num_feats);
+  for (const_iterator it = v.begin(); it != v.end(); ++it) {
+    const string& fname = FD::Convert(it->first);
+    *cur++ = static_cast<char>(static_cast<unsigned char>(fname.size()));
+    memcpy(cur, fname.data(), fname.size());
+    cur += fname.size();
+    const double val = it->second;
+    memcpy(cur, &val, sizeof(val));
+    cur += sizeof(val);
+  }
+  memcpy(cur, &kMagic, sizeof(kMagic));
+  cur += sizeof(kMagic);
+  assert(static_cast<size_t>(cur - &data[0]) == tot_size);
+
+  b64encode(&data[0], tot_size, out);
+}
+
+// Decodes a base64 record produced by Encode.
+//
+//   objective  receives the stored objective value
+//   v          is cleared, then filled with the decoded features; on failure
+//              it may hold the features decoded before the error
+//   in, size   the base64 text and its length (must be a multiple of 4)
+//
+// Returns false if the text is not valid base64, is too short to hold a
+// record, contains a malformed feature entry, holds fewer features than its
+// header announces, or does not end in the magic number.  Every read is
+// bounds-checked against the decode buffer, so corrupt input cannot cause
+// out-of-range access.
+bool Decode(double* objective, SparseVector<double>* v, const char* in, size_t size) {
+  const unsigned int kMagic = 0xBAABABBAu;
+  // Smallest possible record: objective + feature count + magic.
+  const size_t kMinRecord = sizeof(double) + sizeof(int) + sizeof(kMagic);
+
+  v->clear();
+  if (size % 4 != 0) {
+    cerr << "B64 error - line % 4 != 0\n";
+    return false;
+  }
+  // Upper bound on the decoded length ('=' padding can make the real payload
+  // up to two bytes shorter).  Checked before any subtraction so that short
+  // input cannot wrap around to a huge size.
+  const size_t buf_size = size / 4 * 3;
+  if (buf_size < kMinRecord) { cerr << "SparseVector decoding error: too short!\n"; return false; }
+
+  vector<char> buf(buf_size);  // zero-filled; released on every return path
+  char* data = &buf[0];
+  if (!b64decode(reinterpret_cast<const unsigned char*>(in), size, data, buf_size))
+    return false;
+
+  // Fields sit at unaligned offsets, so they are copied out with memcpy.
+  size_t cur = 0;
+  memcpy(objective, &data[cur], sizeof(double));
+  cur += sizeof(double);
+  int num_feats = 0;
+  memcpy(&num_feats, &data[cur], sizeof(num_feats));
+  cur += sizeof(num_feats);
+
+  // Features may occupy everything except the trailing magic number.
+  const size_t payload_end = buf_size - sizeof(kMagic);
+  int fc = 0;
+  while (fc < num_feats && cur < payload_end) {
+    // The length byte is unsigned: names of 128..255 bytes are valid.
+    const size_t fname_len = static_cast<unsigned char>(data[cur++]);
+    if (fname_len == 0 || fname_len + sizeof(double) > payload_end - cur) {
+      cerr << "SparseVector decoding error: malformed feature entry!\n";
+      return false;
+    }
+    const string fname(&data[cur], fname_len);
+    cur += fname_len;
+    double val = 0;
+    memcpy(&val, &data[cur], sizeof(val));
+    cur += sizeof(val);
+    v->set_value(FD::Convert(fname), val);
+    ++fc;
+  }
+  if (num_feats != fc) {
+    cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n";
+    return false;
+  }
+
+  // cur <= payload_end holds here, so the magic read stays inside the buffer.
+  unsigned int magic = 0;
+  memcpy(&magic, &data[cur], sizeof(magic));
+  if (magic != kMagic) {
+    cerr << "SparseVector decodeding error : magic does not match!\n";
+    return false;
+  }
+  return true;
+}
+
+}
diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h
new file mode 100644
index 00000000..66c9b10d
--- /dev/null
+++ b/decoder/sparse_vector.h
@@ -0,0 +1,274 @@
+#ifndef _SPARSE_VECTOR_H_
+#define _SPARSE_VECTOR_H_
+
+// this is a modified version of code originally written
+// by Phil Blunsom
+
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <tr1/unordered_map>
+#include <valarray>
+#include <vector>
+
+#include "fdict.h"
+
+// Sparse vector of T values keyed by int index (feature id), stored in an
+// ordered std::map.  Indices with no stored entry read as T(0).
+template <typename T>
+class SparseVector {
+  // Every instantiation is a friend of every other one so that the templated
+  // dot(const SparseVector<S>&) can read the other vector's map.
+  template <typename U> friend class SparseVector;
+public:
+  typedef std::map<int, T> MapType;
+  typedef typename std::map<int, T>::const_iterator const_iterator;
+  SparseVector() {}
+
+  // Returns the value stored at index, or T(0) if there is none.  Read-only:
+  // never inserts an entry.
+  const T operator[](int index) const {
+    typename MapType::const_iterator found = values_.find(index);
+    if (found == values_.end())
+      return T(0);
+    else
+      return found->second;
+  }
+
+  // Stores value at index, replacing any previous entry.
+  void set_value(int index, const T &value) {
+    values_[index] = value;
+  }
+
+  // Adds value to the entry at index (creating it if needed) and returns
+  // the new total.
+  T add_value(int index, const T &value) {
+    return values_[index] += value;
+  }
+
+  // Same lookup as operator[]: the stored value, or T(0) if absent.
+  T value(int index) const {
+    typename MapType::const_iterator found = values_.find(index);
+    if (found != values_.end())
+      return found->second;
+    else
+      return T(0);
+  }
+
+  // Writes this vector into the dense array *target: every element is first
+  // set to T(0), then stored entries are copied in index order.  Copying
+  // stops at the first index that does not fit in *target (a negative index
+  // converts to a huge unsigned value and therefore also stops the copy).
+  void store(std::valarray<T>* target) const {
+    // Assign instead of multiplying by zero so NaN/inf elements are cleared.
+    *target = T(0);
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it) {
+      if (static_cast<std::size_t>(it->first) >= target->size()) break;
+      (*target)[it->first] = it->second;
+    }
+  }
+
+  // Largest stored index, or 0 if the vector is empty.
+  int max_index() const {
+    if (values_.empty()) return 0;
+    typename MapType::const_iterator found = values_.end();
+    --found;
+    return found->first;
+  }
+
+  // Dot product with a vector of all ones, i.e. the sum of all stored values.
+  T dot() const {
+    T sum = 0;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      sum += it->second;
+    return sum;
+  }
+
+  // Dot product with another sparse vector; only indices present in both
+  // vectors contribute.
+  template<typename S>
+  S dot(const SparseVector<S> &vec) const {
+    S sum = 0;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it)
+    {
+      // Use the iterator type of the *other* instantiation.
+      typename SparseVector<S>::const_iterator
+        found = vec.values_.find(it->first);
+      if (found != vec.values_.end())
+        sum += it->second * found->second;
+    }
+    return sum;
+  }
+
+  // Dot product with a dense vector; stored indices at or beyond vec.size()
+  // are ignored.
+  template<typename S>
+  S dot(const std::vector<S> &vec) const {
+    S sum = 0;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it)
+    {
+      if (it->first < static_cast<int>(vec.size()))
+        sum += it->second * vec[it->first];
+    }
+    return sum;
+  }
+
+  // Dot product with a raw dense array.  Not range checked: the caller must
+  // guarantee that vec has more than max_index() elements.
+  template<typename S>
+  S dot(const S *vec) const {
+    S sum = 0;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      sum += it->second * vec[it->first];
+    return sum;
+  }
+
+  // Sum of the absolute values of all stored entries.
+  T l1norm() const {
+    using std::fabs;  // still lets ADL pick an overload for non-builtin T
+    T sum = 0;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      sum += fabs(it->second);
+    return sum;
+  }
+
+  // Square root of the sum of squares of all stored entries.
+  T l2norm() const {
+    using std::sqrt;  // still lets ADL pick an overload for non-builtin T
+    T sum = 0;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      sum += it->second * it->second;
+    return sqrt(sum);
+  }
+
+  // Element-wise addition.  An entry whose sum equals T() is removed so the
+  // vector stays sparse.
+  SparseVector<T> &operator+=(const SparseVector<T> &other) {
+    typename MapType::const_iterator it = other.values_.begin();
+    while (it != other.values_.end()) {
+      // Copy the entry and step forward before touching values_: when other
+      // is *this, the erase below would otherwise invalidate the iterator.
+      const int index = it->first;
+      const T delta = it->second;
+      ++it;
+      T v = (values_[index] += delta);
+      if (v == T())
+        values_.erase(index);
+    }
+    return *this;
+  }
+
+  // Element-wise subtraction.  An entry whose difference equals T(0) is
+  // removed so the vector stays sparse.
+  SparseVector<T> &operator-=(const SparseVector<T> &other) {
+    typename MapType::const_iterator it = other.values_.begin();
+    while (it != other.values_.end()) {
+      // Same pattern as operator+=: keeps `v -= v` well defined.
+      const int index = it->first;
+      const T delta = it->second;
+      ++it;
+      T v = (values_[index] -= delta);
+      if (v == T(0))
+        values_.erase(index);
+    }
+    return *this;
+  }
+
+  // Subtracts x from every stored entry (absent entries are untouched).
+  SparseVector<T> &operator-=(const double &x) {
+    for (typename MapType::iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      it->second -= x;
+    return *this;
+  }
+
+  // Adds x to every stored entry (absent entries are untouched).
+  SparseVector<T> &operator+=(const double &x) {
+    for (typename MapType::iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      it->second += x;
+    return *this;
+  }
+
+  // Divides every stored entry by x.
+  SparseVector<T> &operator/=(const T &x) {
+    for (typename MapType::iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      it->second /= x;
+    return *this;
+  }
+
+  // Multiplies every stored entry by x.
+  SparseVector<T> &operator*=(const T& x) {
+    for (typename MapType::iterator
+            it = values_.begin(); it != values_.end(); ++it)
+      it->second *= x;
+    return *this;
+  }
+
+  // Copying versions of the scalar operators above.
+  SparseVector<T> operator+(const double &x) const {
+    SparseVector<T> result = *this;
+    return result += x;
+  }
+
+  SparseVector<T> operator-(const double &x) const {
+    SparseVector<T> result = *this;
+    return result -= x;
+  }
+
+  SparseVector<T> operator/(const double &x) const {
+    SparseVector<T> result = *this;
+    return result /= x;
+  }
+
+  // Writes the vector as "name=value" pairs separated by ';', using
+  // FD::Convert to turn each index into its feature name.  Called by the
+  // free operator<< for SparseVector.
+  std::ostream &operator<<(std::ostream &out) const {
+    bool first = true;
+    for (typename MapType::const_iterator
+            it = values_.begin(); it != values_.end(); ++it) {
+      // by definition feature id 0 is a dummy value
+      if (it->first == 0) continue;
+      out << (first ? "" : ";")
+          << FD::Convert(it->first) << '=' << it->second;
+      first = false;
+    }
+    return out;
+  }
+
+  // Lexicographic ordering over the (index, value) entries in index order;
+  // if one vector is a prefix of the other, the one with fewer entries is
+  // the smaller.
+  bool operator<(const SparseVector<T> &other) const {
+    typename MapType::const_iterator it = values_.begin();
+    typename MapType::const_iterator other_it = other.values_.begin();
+
+    for (; it != values_.end() && other_it != other.values_.end(); ++it, ++other_it)
+    {
+      if (it->first < other_it->first) return true;
+      if (it->first > other_it->first) return false;
+      if (it->second < other_it->second) return true;
+      if (it->second > other_it->second) return false;
+    }
+    return values_.size() < other.values_.size();
+  }
+
+  // Number of stored entries.
+  int num_active() const { return values_.size(); }
+  bool empty() const { return values_.empty(); }
+
+  // Read-only iteration over the stored (index, value) pairs in index order.
+  const_iterator begin() const { return values_.begin(); }
+  const_iterator end() const { return values_.end(); }
+
+  // Removes every entry.
+  void clear() {
+    values_.clear();
+  }
+  // Removes the entry at index, if there is one.
+  void clear_value(int index) {
+    values_.erase(index);
+  }
+
+  // Exchanges the contents of the two vectors.
+  void swap(SparseVector<T>& other) {
+    values_.swap(other.values_);
+  }
+
+private:
+  MapType values_;  // index -> value; only explicitly stored entries
+};
+
+// Element-wise sum of two sparse vectors; neither operand is modified.
+template <typename T>
+SparseVector<T> operator+(const SparseVector<T>& a, const SparseVector<T>& b) {
+  SparseVector<T> sum(a);
+  sum += b;
+  return sum;
+}
+
+// Returns a copy of a with every stored entry multiplied by the double b.
+template <typename T>
+SparseVector<T> operator*(const SparseVector<T>& a, const double& b) {
+  SparseVector<T> scaled(a);
+  scaled *= b;
+  return scaled;
+}
+
+// Returns a copy of a with every stored entry multiplied by the scalar b of
+// the vector's own value type.
+template <typename T>
+SparseVector<T> operator*(const SparseVector<T>& a, const T& b) {
+  SparseVector<T> scaled(a);
+  scaled *= b;
+  return scaled;
+}
+
+// Scalar-on-the-left form: returns a copy of b with every stored entry
+// multiplied by a.
+template <typename T>
+SparseVector<T> operator*(const double& a, const SparseVector<T>& b) {
+  SparseVector<T> scaled(b);
+  scaled *= a;
+  return scaled;
+}
+
+// Stream insertion for SparseVector: forwards to the member operator<<,
+// which does the formatting, and returns the stream it hands back.
+template <typename T>
+std::ostream &operator<<(std::ostream &out, const SparseVector<T> &vec)
+{
+  std::ostream &written = vec.operator<<(out);
+  return written;
+}
+
+// Base64 (de)serialization of an objective value plus a sparse feature
+// vector; both functions are implemented in sparse_vector.cc.
+namespace B64 {
+ // Writes objective and the entries of v, base64-encoded, to *out.
+ void Encode(double objective, const SparseVector<double>& v, std::ostream* out);
+ // Parses size bytes of base64 text at data into *objective and *v (which is
+ // cleared first).  Returns false if decoding failed.
+ bool Decode(double* objective, SparseVector<double>* v, const char* data, size_t size);
+}
+
+#endif
diff --git a/decoder/stringlib.cc b/decoder/stringlib.cc
new file mode 100644
index 00000000..3e52ae87
--- /dev/null
+++ b/decoder/stringlib.cc
@@ -0,0 +1,98 @@
+#include "stringlib.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+#include <iostream>
+#include <map>
+
+#include "lattice.h"
+
+using namespace std;
+
+// Splits a translator input line of the form
+//     source
+//     source ||| reference
+// into its two parts.
+//
+//   line   the raw input line.  If it starts with {"rules": the search for
+//          "|||" begins after the first "}}", so separators inside that
+//          prefix are ignored; a missing "}}" is reported on stderr and
+//          aborts the program.
+//   input  receives the source side (the whole line if there is no "|||").
+//   ref    receives the reference side.  It is cleared whenever "|||" is
+//          found and left untouched otherwise.  May be NULL only if the line
+//          carries no reference text.
+//
+// One space or tab directly before and directly after "|||" is treated as
+// part of the separator.  A line that starts or ends with "|||", or has no
+// whitespace around it, is handled without losing characters or throwing.
+void ParseTranslatorInput(const string& line, string* input, string* ref) {
+  size_t hint = 0;
+  if (line.find("{\"rules\":") == 0) {
+    hint = line.find("}}");
+    if (hint == string::npos) {
+      cerr << "Syntax error: " << line << endl;
+      abort();
+    }
+    hint += 2;
+  }
+  const size_t pos = line.find("|||", hint);
+  if (pos == string::npos) { *input = line; return; }
+  if (ref) ref->clear();
+
+  // Drop at most one separator-adjacent whitespace character on each side,
+  // and only if it really is whitespace.
+  size_t input_end = pos;
+  if (input_end > 0 && (line[input_end - 1] == ' ' || line[input_end - 1] == '\t'))
+    --input_end;
+  size_t ref_begin = pos + 3;
+  if (ref_begin < line.size() && (line[ref_begin] == ' ' || line[ref_begin] == '\t'))
+    ++ref_begin;
+
+  // Extract the reference before writing *input, in case input aliases line.
+  const string rline = line.substr(ref_begin);
+  *input = line.substr(0, input_end);
+  if (rline.size() > 0) {
+    assert(ref);
+    *ref = rline;
+  }
+}
+
+// Same split as ParseTranslatorInput, but the reference side (if any) is
+// converted with LatticeTools::ConvertTextOrPLF and stored in *ref.
+// ref must be non-NULL whenever the line contains reference text; it is not
+// touched when there is none.
+void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) {
+  string ref_text;
+  ParseTranslatorInput(line, input, &ref_text);
+  if (ref_text.empty()) return;
+  assert(ref);
+  LatticeTools::ConvertTextOrPLF(ref_text, ref);
+}
+
+// If *pline starts with a <seg ...> tag (matched case-insensitively), replaces
+// *pline with the trimmed text between the tag's '>' and "</seg>" and stores
+// the tag's attributes in *out as name -> value.
+//
+// Attribute names and values are taken from the lowercased copy of the line,
+// so they are always lowercase; the segment text keeps its original case.
+// A line that does not start with "<seg", or has no '>', is left untouched.
+void ProcessAndStripSGML(string* pline, map<string, string>* out) {
+ map<string, string>& meta = *out;
+ string& line = *pline;
+ string lline = LowercaseString(line);
+ if (lline.find("<seg")!=0) return;
+ size_t close = lline.find(">");
+ if (close == string::npos) return; // error
+ // end may be npos when there is no closing tag; substr below then simply
+ // takes everything up to the end of the line.
+ size_t end = lline.find("</seg>");
+ // seg: the attribute text between "<seg" and '>'.
+ string seg = Trim(lline.substr(4, close-4));
+ string text = line.substr(close+1, end - close - 1);
+ // Normalize `name = value` to `name=value`: delete one space directly
+ // before or after an '=' and rescan from the start after each deletion.
+ for (size_t i = 1; i < seg.size(); i++) {
+ if (seg[i] == '=' && seg[i-1] == ' ') {
+ string less = seg.substr(0, i-1) + seg.substr(i);
+ seg = less; i = 0; continue;
+ }
+ if (seg[i] == '=' && seg[i+1] == ' ') {
+ string less = seg.substr(0, i+1);
+ if (i+2 < seg.size()) less += seg.substr(i+2);
+ seg = less; i = 0; continue;
+ }
+ }
+ line = Trim(text);
+ if (seg == "") return;
+ // Consume attributes one at a time: everything before the first '=' is the
+ // label, what follows is the value, and seg is cut down to the unparsed rest.
+ for (size_t i = 1; i < seg.size(); i++) {
+ if (seg[i] == '=') {
+ string label = seg.substr(0, i);
+ string val = seg.substr(i+1);
+ if (val[0] == '"') {
+ // Quoted value: runs up to the next double quote.
+ val = val.substr(1);
+ size_t close = val.find('"');
+ if (close == string::npos) {
+ // Unterminated quote: report it, stop parsing, and still store the
+ // remainder of the tag as this attribute's value below.
+ cerr << "SGML parse error: missing \"\n";
+ seg = "";
+ i = 0;
+ } else {
+ seg = val.substr(close+1);
+ val = val.substr(0, close);
+ i = 0;
+ }
+ } else {
+ // Unquoted value: runs up to the next space, or to the end of the tag.
+ size_t close = val.find(' ');
+ if (close == string::npos) {
+ seg = "";
+ i = 0;
+ } else {
+ // NOTE(review): unlike the other branches, i is not reset here, so the
+ // scan of the shortened seg resumes at i+1 -- confirm this is intended.
+ seg = val.substr(close+1);
+ val = val.substr(0, close);
+ }
+ }
+ label = Trim(label);
+ seg = Trim(seg);
+ meta[label] = val;
+ }
+ }
+}
+
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
new file mode 100644
index 00000000..76efee8f
--- /dev/null
+++ b/decoder/stringlib.h
@@ -0,0 +1,101 @@
+#ifndef _STRINGLIB_H_
+
+#include <map>
+#include <vector>
+#include <cctype>
+#include <string>
+
+// read line in the form of either:
+// source
+// source ||| target
+// source will be returned as a string, target must be a sentence or
+// a lattice (in PLF format) and will be returned as a Lattice object
+void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
+struct Lattice;
+void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);
+
+// Returns str with every leading and trailing character that occurs in
+// dropChars removed (spaces and tabs by default).  If str consists only of
+// such characters, the result is empty.
+inline const std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
+  const std::string::size_type first_kept = str.find_first_not_of(dropChars);
+  if (first_kept == std::string::npos)
+    return std::string();
+  const std::string::size_type last_kept = str.find_last_not_of(dropChars);
+  return str.substr(first_kept, last_kept - first_kept + 1);
+}
+
+// Splits str at every occurrence of delimiter and stores the non-empty
+// pieces, in order, in *res (which is cleared first).  Consecutive, leading
+// and trailing delimiters produce no empty tokens.
+//
+// Tokens are cut out with substr rather than by overwriting delimiters with
+// '\0' in a copy, so a token containing a NUL byte is kept intact.
+inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) {
+  res->clear();
+  std::string::size_type last = 0;  // start of the token being scanned
+  // i == str.size() acts as a virtual final delimiter that flushes the last
+  // token.
+  for (std::string::size_type i = 0; i <= str.size(); ++i) {
+    if (i == str.size() || str[i] == delimiter) {
+      if (last != i)
+        res->push_back(str.substr(last, i - last));
+      last = i + 1;
+    }
+  }
+}
+
+// Returns a copy of in with every character passed through tolower().
+// Each char is converted to unsigned char first: calling tolower() with a
+// negative value (any byte >= 0x80 where char is signed, e.g. UTF-8 text) is
+// undefined behaviour.
+inline std::string LowercaseString(const std::string& in) {
+  std::string res(in);
+  for (std::string::size_type i = 0; i < res.size(); ++i)
+    res[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(res[i])));
+  return res;
+}
+
+// Counts the non-overlapping occurrences of sub in str, scanning left to
+// right.  An empty sub yields 0; without that guard the search position
+// would never advance and the loop would not terminate.
+inline int CountSubstrings(const std::string& str, const std::string& sub) {
+  if (sub.empty()) return 0;
+  size_t p = 0;
+  int res = 0;
+  while (p < str.size()) {
+    p = str.find(sub, p);
+    if (p == std::string::npos) break;
+    ++res;
+    p += sub.size();  // skip the whole match: occurrences may not overlap
+  }
+  return res;
+}
+
+// Splits in at spaces and tabs and stores the non-empty pieces, in order, in
+// *out (which is cleared first).  Returns the number of pieces.
+inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) {
+  out->clear();
+  const int len = in.size();
+  int tok_begin = 0;  // index where the current token started
+  for (int pos = 0; pos < len; ++pos) {
+    const char c = in[pos];
+    if (c != ' ' && c != '\t') continue;
+    // A separator closes the current token, if it has any characters.
+    if (pos > tok_begin)
+      out->push_back(in.substr(tok_begin, pos - tok_begin));
+    tok_begin = pos + 1;
+  }
+  // Flush the token that runs to the end of the string.
+  if (len > tok_begin)
+    out->push_back(in.substr(tok_begin, len - tok_begin));
+  return out->size();
+}
+
+// Splits in on whitespace: the first word goes to *cmd and the remaining
+// words, joined by single spaces, go to *param.  Both outputs are cleared
+// first and stay empty if in contains no words.
+inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
+  cmd->clear();
+  param->clear();
+  std::vector<std::string> words;
+  SplitOnWhitespace(in, &words);
+  if (words.empty()) return;
+  *cmd = words.front();
+  for (std::vector<std::string>::size_type k = 1; k < words.size(); ++k) {
+    if (k != 1) param->append(" ");
+    param->append(words[k]);
+  }
+}
+
+void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
+
+// Given the first byte of a UTF-8 encoded character, returns how many bytes
+// the character occupies (1-4), or 0 if x is not a valid lead byte.
+// See http://en.wikipedia.org/wiki/UTF-8 for the bit patterns.
+inline unsigned int UTF8Len(unsigned char x) {
+  if ((x & 0x80) == 0x00) return 1;  // 0xxxxxxx
+  if ((x & 0xE0) == 0xC0) return 2;  // 110xxxxx
+  if ((x & 0xF0) == 0xE0) return 3;  // 1110xxxx
+  if ((x & 0xF8) == 0xF0) return 4;  // 11110xxx
+  return 0;                          // continuation byte or invalid lead
+}
+
+#endif
diff --git a/decoder/tagger.cc b/decoder/tagger.cc
new file mode 100644
index 00000000..4dded35f
--- /dev/null
+++ b/decoder/tagger.cc
@@ -0,0 +1,112 @@
+#include "tagger.h"
+
+#include "tdict.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "hg.h"
+#include "wordid.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+// This is a really simple linear chain tagger.
+// You specify a tagset, and it hypothesizes that each word in the
+// input can be tagged with any member of the tagset.
+// There are a couple of sample features implemented in ff_tagger.h/cc
+// One thing to note, that while CRFs typically define the label
+// sequence as corresponding to the hidden states in a trellis,
+// in our model the labels are on edges, but mathematically
+// they are identical.
+//
+// Things to do if you want to make this a "real" tagger:
+// - support dictionaries (for each word, limit the tags considered)
+// - add latent variables - this is really easy to do
+
+// Reads whitespace-separated tag names from file, converts each one with
+// TD::Convert and appends the resulting ids to *tags.  Reports the total
+// size of *tags on stderr when done.
+static void ReadTagset(const string& file, vector<WordID>* tags) {
+  ReadFile rf(file);
+  istream& in(*rf.stream());
+  string tag;
+  // operator>> only succeeds after extracting a non-empty token, so no
+  // separate check for an empty tag is needed.
+  while (in >> tag)
+    tags->push_back(TD::Convert(tag));
+  cerr << "Read " << tags->size() << " labels (tags) from " << file << endl;
+}
+
+// Implementation behind Tagger: holds the tagset and builds the hypergraph
+// in which every input word can be labelled with every tag.
+struct TaggerImpl {
+ // Sets up the category ids and the two fixed rules, then loads the tagset
+ // from the file named by the `tagger_tagset` option.  Prints an error and
+ // exits with status 1 if that option is missing.
+ TaggerImpl(const boost::program_options::variables_map& conf) :
+ kXCAT(TD::Convert("X")*-1),
+ kNULL(TD::Convert("<eps>")),
+ kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")),
+ kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) {
+ if (conf.count("tagger_tagset") == 0) {
+ cerr << "Tagger requires --tagger_tagset FILE!\n";
+ exit(1);
+ }
+ ReadTagset(conf["tagger_tagset"].as<string>(), &tagset_);
+ }
+
+ // Adds to *forest, for each position i of seq: one node with one tail-less
+ // edge per tag (a lexical rule pairing seq[i] with that tag, spanning
+ // [i, i+1)), and, from the second word on, a node whose kBINARY edge joins
+ // everything built so far with the new word's node.  Finally adds a goal
+ // node whose kGOAL_RULE edge has the last previously added node as its tail.
+ // NOTE(review): with an empty seq and an empty forest, nodes_.size() - 1
+ // below would wrap around -- callers presumably pass a non-empty sequence;
+ // confirm.
+ void BuildTrellis(const vector<WordID>& seq, Hypergraph* forest) {
+ int prev_node_id = -1;
+ for (int i = 0; i < seq.size(); ++i) {
+ const WordID& src = seq[i];
+ // Node that collects all taggings of word i.
+ const int new_node_id = forest->AddNode(kXCAT)->id_;
+ for (int k = 0; k < tagset_.size(); ++k) {
+ TRulePtr rule(TRule::CreateLexicalRule(src, tagset_[k]));
+ Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+ edge->i_ = i;
+ edge->j_ = i+1;
+ edge->prev_i_ = i; // we set these for FastLinearIntersect
+ edge->prev_j_ = i+1; // " " "
+ forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+ }
+ if (prev_node_id >= 0) {
+ // Chain: (everything up to word i-1) + (word i) -> new combined node.
+ const int comb_node_id = forest->AddNode(kXCAT)->id_;
+ Hypergraph::TailNodeVector tail(2, prev_node_id);
+ tail[1] = new_node_id;
+ Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail);
+ edge->i_ = 0;
+ edge->j_ = i+1;
+ forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id);
+ prev_node_id = comb_node_id;
+ } else {
+ // First word: there is nothing to combine with yet.
+ prev_node_id = new_node_id;
+ }
+ }
+ // The tail is the node added last, captured before the goal node exists.
+ Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+ Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+ Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+ forest->ConnectEdgeToHeadNode(hg_edge, goal);
+ }
+
+ private:
+ vector<WordID> tagset_; // tag ids read from the tagset file
+ const WordID kXCAT; // negated id of "X", used as the category of every non-goal node
+ const WordID kNULL; // id of "<eps>"; not referenced elsewhere in this struct
+ const TRulePtr kBINARY; // rule joining two adjacent X spans
+ const TRulePtr kGOAL_RULE; // rule rewriting Goal as a single X
+};
+
+// Creates the implementation object from the program options; TaggerImpl's
+// constructor exits the program if `tagger_tagset` is not set.
+Tagger::Tagger(const boost::program_options::variables_map& conf) :
+ pimpl_(new TaggerImpl(conf)) {}
+
+
+// Builds the tagging forest for one input sentence.
+// The input text is converted to a lattice whose every position must hold
+// exactly one arc (asserted); the arc labels form the word sequence handed
+// to TaggerImpl::BuildTrellis.  The source length is recorded in *smeta, and
+// *forest is reweighted with weights and flagged as a linear chain.
+// Always returns true.
+bool Tagger::TranslateImpl(const string& input,
+                           SentenceMetadata* smeta,
+                           const vector<double>& weights,
+                           Hypergraph* forest) {
+  Lattice lattice;
+  LatticeTools::ConvertTextToLattice(input, &lattice);
+  smeta->SetSourceLength(lattice.size());
+
+  vector<WordID> sequence;
+  sequence.reserve(lattice.size());
+  for (int pos = 0; pos < lattice.size(); ++pos) {
+    assert(lattice[pos].size() == 1);
+    sequence.push_back(lattice[pos][0].label);
+  }
+
+  pimpl_->BuildTrellis(sequence, forest);
+  forest->Reweight(weights);
+  forest->is_linear_chain_ = true;
+  return true;
+}
+
diff --git a/decoder/tagger.h b/decoder/tagger.h
new file mode 100644
index 00000000..9ac820d9
--- /dev/null
+++ b/decoder/tagger.h
@@ -0,0 +1,17 @@
+#ifndef _TAGGER_H_
+#define _TAGGER_H_
+
+#include "translator.h"
+
+struct TaggerImpl;
+// Translator that builds a tagging forest for its input; the work is done by
+// TaggerImpl, which is defined in tagger.cc.
+struct Tagger : public Translator {
+ // conf supplies the program options used to construct the TaggerImpl.
+ Tagger(const boost::program_options::variables_map& conf);
+ // Fills *forest for the sentence in input and reweights it with weights.
+ bool TranslateImpl(const std::string& input,
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* forest);
+ private:
+ boost::shared_ptr<TaggerImpl> pimpl_; // implementation object created in the constructor
+};
+
+#endif
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
new file mode 100644
index 00000000..c00d20b8
--- /dev/null
+++ b/decoder/tdict.cc
@@ -0,0 +1,49 @@
+#include "Ngram.h"
+#include "dict.h"
+#include "tdict.h"
+#include "Vocab.h"
+
+using namespace std;
+
+Vocab* TD::dict_ = new Vocab;
+
+static const string empty;
+static const string space = " ";
+
+// Returns the id of word s, as produced by dict_->addWord on its C string.
+WordID TD::Convert(const std::string& s) {
+ return dict_->addWord((VocabString)s.c_str());
+}
+
+// Returns the string that dict_->getWord reports for id w.
+// NOTE(review): the result for an id that was never added is whatever
+// Vocab::getWord returns (presumably NULL) -- confirm before dereferencing.
+const char* TD::Convert(const WordID& w) {
+ return dict_->getWord((VocabIndex)w);
+}
+
+// Replaces the contents of *ids with the ids of strings, converting each
+// element with TD::Convert and keeping the original order.
+void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
+  ids->clear();
+  for (size_t k = 0; k < strings.size(); ++k) {
+    const WordID id = TD::Convert(strings[k]);
+    ids->push_back(id);
+  }
+}
+
+// Converts each id in str back to its word with TD::Convert and returns the
+// words joined by single spaces (empty string for an empty sequence).
+std::string TD::GetString(const std::vector<WordID>& str) {
+  string res;
+  for (size_t k = 0; k < str.size(); ++k) {
+    if (k > 0) res += space;
+    res += TD::Convert(str[k]);
+  }
+  return res;
+}
+
+// Splits sent at spaces and tabs, converts every non-empty token with
+// TD::Convert and stores the resulting ids, in order, in *ids (which is
+// cleared first).
+void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
+  ids->clear();
+  string::size_type tok_begin = 0;
+  // pos == sent.size() acts as a virtual final separator that flushes the
+  // last token.
+  for (string::size_type pos = 0; pos <= sent.size(); ++pos) {
+    const bool at_separator =
+        pos == sent.size() || sent[pos] == ' ' || sent[pos] == '\t';
+    if (!at_separator) continue;
+    if (pos != tok_begin)
+      ids->push_back(Convert(sent.substr(tok_begin, pos - tok_begin)));
+    tok_begin = pos + 1;
+  }
+}
+
diff --git a/decoder/tdict.h b/decoder/tdict.h
new file mode 100644
index 00000000..31f66367
--- /dev/null
+++ b/decoder/tdict.h
@@ -0,0 +1,30 @@
+#ifndef _TDICT_H_
+#define _TDICT_H_
+
+#include <string>
+#include <vector>
+#include "wordid.h"
+
+class Vocab;
+
+// Static interface to the process-wide vocabulary that maps words to
+// WordIDs and back.
+struct TD {
+  static Vocab* dict_;
+  // Tokenizes sent on spaces/tabs and stores the word ids in *ids.
+  static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids);
+  // Converts every string in strings to its id, stored in *ids.
+  static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
+  // Joins the words for the ids in str with single spaces.
+  static std::string GetString(const std::vector<WordID>& str);
+  // Copies the word for w into buffer starting at offset pos, stopping at
+  // the end of the word or at bufsize, whichever comes first.  No
+  // terminating '\0' is written.  Returns the offset just past the last
+  // character copied.
+  static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) {
+    const char* src = TD::Convert(w);
+    char* out = buffer + pos;
+    char* const limit = buffer + bufsize;
+    for (; out < limit && *src; ++out, ++src)
+      *out = *src;
+    return (out - buffer);
+  }
+  // Word -> id.
+  static WordID Convert(const std::string& s);
+  // Id -> word.
+  static const char* Convert(const WordID& w);
+};
+
+#endif
diff --git a/decoder/test_data/dummy.3gram.lm b/decoder/test_data/dummy.3gram.lm
new file mode 100644
index 00000000..ae665284
--- /dev/null
+++ b/decoder/test_data/dummy.3gram.lm
@@ -0,0 +1,2645 @@
+
+\data\
+ngram 1=490
+ngram 2=1023
+ngram 3=1119
+
+\1-grams:
+-2.761928 ! -0.06284945
+-1.91683 " -0.03559465
+-2.761928 ' -0.06057167
+-2.159868 ( -0.07742823
+-2.159868 ) -0.05637721
+-1.292106 , -0.04497077
+-3.062958 - -0.06247065
+-1.429489 . -0.08555528
+-2.761928 12 -0.06473851
+-3.062958 17 -0.06586801
+-2.585837 2000 -0.05520994
+-3.062958 2002 -0.06360606
+-3.062958 2006 -0.0497812
+-3.062958 2008 -0.06322792
+-3.062958 2009 -0.0497812
+-3.062958 200–400 -0.06549184
+-3.062958 224 -0.06586801
+-1.91683 </s>
+-99 <s> -0.0457003
+-2.761928 ? -0.05751594
+-1.720535 a -0.05548429
+-2.460898 about -0.05211611
+-3.062958 acquiesced -0.05942829
+-3.062958 actually -0.04349266
+-3.062958 addition -0.05980976
+-3.062958 admit -0.06095213
+-3.062958 affected -0.04071253
+-2.761928 against -0.06549184
+-3.062958 aging -0.06586801
+-3.062958 ago -0.04349266
+-3.062958 ahead -0.06586801
+-2.761928 al -0.06284945
+-2.761928 all -0.0590465
+-3.062958 all-around -0.06586801
+-3.062958 along -0.04071253
+-2.761928 also -0.06322792
+-2.761928 always -0.06436136
+-2.363988 an -0.06436135
+-3.062958 analysis -0.06473851
+-1.631594 and 0.006203346
+-3.062958 anti-divine -0.06586801
+-3.062958 any -0.06549184
+-3.062958 approach -0.05789908
+-3.062958 archive -0.04071253
+-3.062958 are -0.05789908
+-2.761928 arkive -0.06549184
+-2.585837 article -0.0228177
+-2.21786 as -0.09020901
+-3.062958 asked -0.06398387
+-2.585837 at -0.03145044
+-2.761928 attention -0.02612664
+-3.062958 available -0.04349266
+-3.062958 average -0.04349266
+-3.062958 away -0.06322792
+-3.062958 ayers -0.05597997
+-3.062958 b -0.04349266
+-3.062958 back-and-forth -0.06586801
+-3.062958 bailie -0.0497812
+-2.761928 be -0.06511534
+-3.062958 because -0.06586801
+-2.460898 been -0.06322791
+-3.062958 before -0.04349266
+-2.761928 begin -0.05520995
+-3.062958 being -0.06586801
+-2.585837 between -0.1350269
+-2.460898 bias -0.04111077
+-3.062958 biased -0.06511534
+-3.062958 biblical -0.06586801
+-3.062958 bill -0.06586801
+-3.062958 blade -0.06436136
+-3.062958 blood -0.04349266
+-3.062958 bob -0.06549184
+-3.062958 book -0.06436136
+-2.159868 briffa -0.06804922
+-2.761928 briffa's -0.06284945
+-2.021565 but -0.01525023
+-2.21786 by -0.07600738
+-2.761928 ca -0.2166343
+-2.761928 can -0.06473851
+-3.062958 case -0.06511534
+-3.062958 cast -0.06473851
+-3.062958 catch -0.06511534
+-3.062958 caught -0.06511534
+-3.062958 caveats -0.06322792
+-3.062958 centennial-scale -0.06549184
+-3.062958 cf -0.0497812
+-3.062958 change -0.06209152
+-3.062958 changing -0.06360606
+-3.062958 characterizes -0.06586801
+-3.062958 checked -0.06586801
+-2.159868 chronology -0.02240231
+-3.062958 church -0.06398387
+-3.062958 cocaine -0.06398387
+-3.062958 collection -0.06586801
+-3.062958 combination -0.06209152
+-3.062958 combine -0.04071253
+-3.062958 combined -0.06209152
+-3.062958 comment -0.06360606
+-3.062958 commentary -0.06322792
+-3.062958 commenter -0.06586801
+-3.062958 comments -0.06586801
+-3.062958 compared -0.05789908
+-3.062958 concerned -0.06473851
+-3.062958 concrete -0.06095213
+-3.062958 connection -0.06209152
+-2.761928 conservatives -0.06360606
+-3.062958 considered -0.06095213
+-3.062958 consists -0.04349266
+-3.062958 constructing -0.05789908
+-2.761928 control -0.03991493
+-2.585837 cores -0.0236473
+-3.062958 corridor -0.06473851
+-2.761928 crack -0.06436136
+-3.062958 crossroads -0.0497812
+-2.460898 cru -0.1318786
+-3.062958 darkness -0.05597997
+-2.108715 data -0.06845023
+-2.761928 day -0.05674864
+-2.761928 days -0.04939082
+-3.062958 debt -0.04349266
+-3.062958 decline -0.06095213
+-3.062958 deep -0.06549184
+-3.062958 deeper -0.06586801
+-3.062958 delete -0.05789908
+-3.062958 derived -0.06511534
+-3.062958 described -0.05942829
+-2.761928 did -0.06095213
+-2.761928 difference -0.04860901
+-2.761928 different -0.06247065
+-2.761928 divergence -0.2166343
+-2.761928 do -0.05559513
+-3.062958 does -0.06247065
+-3.062958 doing -0.06586801
+-3.062958 don't -0.06586801
+-3.062958 done -0.06586801
+-3.062958 doubt -0.06360606
+-3.062958 down -0.05789908
+-3.062958 due -0.06473851
+-3.062958 earlier -0.06019088
+-3.062958 editors -0.06511534
+-3.062958 energy -0.04349266
+-3.062958 enormous -0.06586801
+-2.761928 et -0.2166343
+-3.062958 even -0.06586801
+-3.062958 every -0.06586801
+-3.062958 exactly -0.06360606
+-3.062958 exception -0.05789908
+-3.062958 excluding -0.06549184
+-3.062958 expect -0.06511534
+-3.062958 extension -0.05597997
+-3.062958 factors -0.04349266
+-3.062958 fantasy -0.06436136
+-3.062958 far -0.06511534
+-2.585837 few -0.1590744
+-2.585837 finally -0.06511533
+-3.062958 first -0.04349266
+-3.062958 flesh -0.05597997
+-3.062958 following: -0.06095213
+-3.062958 follows: -0.06095213
+-2.284806 for -0.06171204
+-3.062958 forests -0.0497812
+-2.585837 from -0.05713245
+-3.062958 fully -0.06586801
+-2.585837 further -0.06511533
+-3.062958 furthermore -0.04349266
+-3.062958 future -0.0497812
+-3.062958 generating -0.06586801
+-2.761928 get -0.191855
+-3.062958 ghastly -0.06586801
+-3.062958 ghostwritten -0.06360606
+-3.062958 gil -0.06586801
+-3.062958 given -0.04071253
+-3.062958 going -0.05789908
+-3.062958 got -0.06436136
+-2.761928 great -0.2166343
+-3.062958 growing -0.0497812
+-3.062958 grows -0.06511534
+-2.363988 had -0.1033177
+-2.585837 hantemirov -0.09654189
+-2.761928 happening -0.06436136
+-3.062958 happens -0.06549184
+-3.062958 hard -0.05789908
+-3.062958 hardly -0.06473851
+-2.460898 has -0.03063563
+-3.062958 hate -0.05789908
+-2.284806 have -0.08108715
+-3.062958 haven't -0.06586801
+-2.363988 he -0.112982
+-3.062958 here -0.06586801
+-3.062958 highly -0.06586801
+-2.761928 him -0.05751594
+-2.585837 his -0.06511533
+-3.062958 how -0.06586801
+-2.761928 however -0.1946352
+-3.062958 hs -0.06586801
+-3.062958 humanity -0.06511534
+-2.108715 i -0.05980975
+-3.062958 i'd -0.06586801
+-3.062958 i've -0.06586801
+-2.761928 idea -0.02612664
+-2.761928 if -0.03670979
+-3.062958 illusion -0.05597997
+-3.062958 immense -0.06586801
+-3.062958 impact -0.06322792
+-3.062958 important -0.06586801
+-1.807685 in -0.04419087
+-3.062958 included -0.06209152
+-2.761928 including -0.0165447
+-3.062958 indeed -0.06511534
+-3.062958 individual -0.06511534
+-3.062958 information -0.06511534
+-3.062958 inhomogeneities -0.04349266
+-3.062958 initial -0.06549184
+-2.761928 instead -0.2109523
+-3.062958 interannual -0.06549184
+-2.761928 into -0.03991493
+-3.062958 introduced -0.06360606
+-1.91683 is -0.001109093
+-2.062958 it -0.06621437
+-2.460898 it's -0.06019088
+-3.062958 its -0.06586801
+-2.761928 journal -0.06209152
+-3.062958 jurisdiction -0.0497812
+-2.460898 just -0.05520994
+-3.062958 kaufman -0.06549184
+-3.062958 keeps -0.06586801
+-2.761928 khadyta -0.2166343
+-2.460898 know -0.1105378
+-3.062958 larch -0.06586801
+-2.761928 larches -0.04743365
+-3.062958 large-scale -0.06095213
+-2.761928 like -0.06511534
+-3.062958 limited -0.06586801
+-3.062958 living -0.06549184
+-3.062958 longest -0.05597997
+-3.062958 looking -0.06549184
+-3.062958 looks -0.06586801
+-3.062958 love -0.05789908
+-3.062958 made -0.06095213
+-2.761928 mag -0.2143704
+-3.062958 magnitude -0.05980976
+-3.062958 magnus -0.0497812
+-3.062958 makes -0.04071253
+-3.062958 many -0.06586801
+-3.062958 may -0.06586801
+-3.062958 mean -0.06322792
+-3.062958 measured -0.06360606
+-2.761928 measurement -0.213992
+-2.460898 method -0.03711172
+-3.062958 methodology -0.06586801
+-3.062958 mind -0.06511534
+-3.062958 mix -0.06586801
+-2.585837 more -0.05636447
+-3.062958 morning -0.06284945
+-2.585837 most -0.0647385
+-2.761928 much -0.06473851
+-3.062958 multi-parters -0.04349266
+-3.062958 multiproxy -0.06586801
+-3.062958 mundane -0.06511534
+-2.585837 my -0.1598284
+-3.062958 national -0.06586801
+-3.062958 naughtiness -0.0497812
+-3.062958 nettle -0.04349266
+-3.062958 never -0.06586801
+-3.062958 next -0.04349266
+-3.062958 no -0.06586801
+-3.062958 non-robustness -0.06586801
+-3.062958 northern -0.06586801
+-2.062958 not -0.0712041
+-3.062958 noted -0.06586801
+-3.062958 noticed -0.06095213
+-3.062958 notwithstanding -0.06473851
+-3.062958 now -0.04349266
+-2.761928 obama -0.03791448
+-3.062958 observed -0.06586801
+-1.832509 of -0.04850956
+-2.761928 old -0.06436136
+-2.585837 older -0.1053004
+-3.062958 oldie -0.04349266
+-2.159868 on -0.09226183
+-2.585837 one -0.04900008
+-3.062958 online -0.0497812
+-3.062958 only -0.06586801
+-3.062958 or -0.06586801
+-3.062958 originated -0.06209152
+-3.062958 osborn -0.05597997
+-3.062958 out -0.06322792
+-3.062958 outright -0.06586801
+-3.062958 own -0.06586801
+-3.062958 paleoclimatologists -0.05597997
+-3.062958 passage -0.06284945
+-3.062958 passing -0.05597997
+-3.062958 path -0.06095213
+-3.062958 patterns -0.05942829
+-3.062958 paul -0.06436136
+-3.062958 people -0.06095213
+-2.363988 perhaps -0.06259563
+-2.761928 phil -0.2166343
+-3.062958 picked -0.06511534
+-3.062958 piece -0.06360606
+-3.062958 place -0.0497812
+-3.062958 placed -0.06586801
+-3.062958 play -0.06322792
+-3.062958 point -0.06095213
+-3.062958 policy -0.06322792
+-2.585837 politics -0.02571439
+-2.363988 population -0.1001791
+-3.062958 position -0.06095213
+-3.062958 possible -0.05597997
+-2.761928 potential -0.06436136
+-3.062958 power -0.05789908
+-3.062958 powers -0.05597997
+-3.062958 precipitous -0.06586801
+-3.062958 precisely -0.04071253
+-3.062958 predictable -0.06586801
+-3.062958 presented -0.06019088
+-3.062958 preserve -0.06586801
+-3.062958 previous -0.06549184
+-3.062958 principalities -0.05980976
+-3.062958 principles -0.05942829
+-3.062958 prior -0.06511534
+-3.062958 probable -0.06095213
+-2.761928 problem -0.2120946
+-3.062958 projected -0.06549184
+-3.062958 properly -0.06586801
+-3.062958 prove -0.06586801
+-3.062958 provide -0.04071253
+-3.062958 provided -0.05789908
+-3.062958 provocative -0.06586801
+-3.062958 published -0.05942829
+-3.062958 push -0.06511534
+-2.585837 rcs -0.06133225
+-3.062958 react -0.05789908
+-3.062958 read -0.06247065
+-2.761928 readers -0.06398387
+-3.062958 reading -0.04349266
+-3.062958 real -0.06322792
+-3.062958 really -0.06586801
+-3.062958 realm -0.05980976
+-2.761928 reason -0.06360606
+-3.062958 recent -0.06511534
+-2.761928 recently -0.1946352
+-3.062958 reconstruction -0.0497812
+-3.062958 refusal -0.05942829
+-3.062958 refused -0.05789908
+-3.062958 related -0.05789908
+-3.062958 relevant -0.04349266
+-3.062958 relied -0.06322792
+-3.062958 religion -0.05597997
+-3.062958 remained -0.06586801
+-3.062958 remarked -0.06095213
+-3.062958 reposting -0.06473851
+-3.062958 requiring -0.06322792
+-3.062958 response -0.05789908
+-3.062958 resulting -0.06322792
+-3.062958 rev -0.0497812
+-2.460898 right -0.04821757
+-3.062958 ring -0.06586801
+-3.062958 ring-width -0.06511534
+-2.761928 river -0.1946352
+-3.062958 said -0.06436136
+-3.062958 same -0.06473851
+-3.062958 sample -0.06586801
+-3.062958 sat -0.05942829
+-2.460898 schweingruber -0.09101291
+-3.062958 schweingruber's -0.06549184
+-2.585837 science -0.1568045
+-3.062958 script -0.06322792
+-2.585837 see -0.1112577
+-3.062958 seized -0.04071253
+-2.761928 selected -0.04664831
+-2.585837 selection -0.1491516
+-3.062958 sensitive -0.06511534
+-3.062958 sensitivity -0.06095213
+-2.585837 series -0.1314228
+-3.062958 set -0.05942829
+-3.062958 several -0.06549184
+-3.062958 shadow -0.06586801
+-2.761928 shadows -0.04309659
+-2.585837 shiyatov -0.06360605
+-3.062958 should -0.06247065
+-3.062958 similar -0.06473851
+-3.062958 similarly -0.06586801
+-3.062958 since -0.06019088
+-3.062958 size -0.05597997
+-3.062958 skimmed -0.06019088
+-2.761928 slowly -0.04270015
+-3.062958 small -0.06586801
+-3.062958 so -0.06549184
+-3.062958 some -0.06549184
+-3.062958 someone -0.06586801
+-3.062958 start -0.06549184
+-3.062958 staunchly -0.06586801
+-3.062958 struggling -0.06549184
+-3.062958 studies -0.06095213
+-2.761928 study -0.02612664
+-3.062958 stumbled -0.06586801
+-2.585837 subfossil -0.06171205
+-3.062958 subsequent -0.06549184
+-3.062958 subset -0.05942829
+-3.062958 success -0.0497812
+-3.062958 supplement -0.0497812
+-3.062958 supplemented -0.06360606
+-3.062958 surface -0.04349266
+-3.062958 take -0.06436136
+-3.062958 taken -0.05789908
+-2.761928 taymir -0.06247065
+-3.062958 temperature -0.04349266
+-3.062958 tendency -0.05789908
+-3.062958 terms -0.05980976
+-3.062958 than -0.04071253
+-1.91683 that -0.06692892
+-1.243414 the -0.08813193
+-3.062958 their -0.06511534
+-2.761928 themselves -0.04111078
+-3.062958 there's -0.06586801
+-2.460898 these -0.05942829
+-2.460898 they -0.06398387
+-2.761928 things -0.06057167
+-3.062958 think -0.06549184
+-3.062958 thinking -0.06586801
+-1.858838 this -0.08175352
+-2.761928 those -0.06057167
+-3.062958 thought -0.0497812
+-3.062958 thousand -0.04349266
+-3.062958 through -0.04071253
+-2.761928 time -0.0326698
+-1.720535 to -0.07930601
+-2.761928 today -0.04821758
+-3.062958 took -0.04071253
+-3.062958 towards -0.06511534
+-2.761928 trans -0.06549184
+-2.460898 trees -0.04704115
+-2.761928 trouble -0.213234
+-3.062958 true -0.04349266
+-3.062958 trying -0.05789908
+-2.761928 two -0.2166343
+-3.062958 unarchived -0.0497812
+-3.062958 under -0.06549184
+-3.062958 unintentional -0.06473851
+-3.062958 unrepresentativeness -0.05980976
+-3.062958 until -0.06549184
+-3.062958 unveiled: -0.06586801
+-2.761928 up -0.03185729
+-3.062958 upon -0.06019088
+-2.761928 use -0.2109523
+-2.363988 used -0.0545155
+-2.761928 using -0.02323271
+-3.062958 usual -0.06586801
+-3.062958 valid -0.06549184
+-2.761928 variability -0.03911585
+-2.761928 versions -0.04428373
+-2.761928 very -0.06549184
+-3.062958 violence -0.06586801
+-3.062958 virtually -0.06586801
+-3.062958 virtue -0.05980976
+-3.062958 voted -0.06398387
+-3.062958 warn -0.06549184
+-3.062958 warnings -0.04349266
+-2.363988 was -0.06171205
+-3.062958 way -0.06549184
+-3.062958 we -0.06549184
+-3.062958 well -0.06398387
+-2.284806 were -0.07866543
+-2.21786 what -0.02364731
+-3.062958 what's -0.06549184
+-2.585837 when -0.06057167
+-2.585837 where -0.05597997
+-2.460898 which -0.0403139
+-2.585837 while -0.03951557
+-3.062958 whose -0.06586801
+-3.062958 why -0.06586801
+-3.062958 widths -0.05597997
+-2.761928 will -0.06322792
+-3.062958 wise -0.06549184
+-2.021565 with -0.08912028
+-3.062958 within -0.06549184
+-3.062958 without -0.06586801
+-3.062958 worth -0.06586801
+-2.460898 would -0.1303614
+-3.062958 wright's -0.06586801
+-3.062958 wrote -0.04071253
+-2.159868 yamal -0.0719028
+-2.761928 year -0.04270015
+-3.062958 years -0.06549184
+-3.062958 yes -0.04349266
+-3.062958 yesterday -0.06473851
+-3.062958 yet -0.04349266
+-3.062958 you -0.06511534
+-2.761928 your -0.06511534
+
+\2-grams:
+-1.15037 ! as -0.004049858
+-1.15037 ! instead 0.2044696
+-1.995468 " ( -0.005168174
+-1.995468 " - 0.05332709
+-1.995468 " </s>
+-1.995468 " as -0.004049858
+-1.995468 " concrete 0.05332709
+-1.995468 " corridor 0.05332709
+-1.249819 " divergence 0.1451325
+-1.995468 " further 0.008061528
+-1.995468 " i'd 0.05332709
+-1.995468 " success 0.05332709
+-1.995468 " that -0.008505944
+-1.995468 " the -0.007702977
+-1.995468 " used -0.0004517734
+-1.15037 ' </s>
+-1.15037 ' yes 0.05332709
+-1.75243 ( and -0.01063527
+-1.75243 ( in 0.006514465
+-1.006781 ( mag 0.1451325
+-1.75243 ( or 0.05332709
+-1.75243 ( phil 0.2044696
+-1.75243 ( which 0.00272119
+-1.75243 ( while 0.008061528
+-1.006781 ) , -0.002172916
+-1.75243 ) </s>
+-1.75243 ) acquiesced 0.05332709
+-1.75243 ) and -0.002266581
+-1.75243 ) had -0.0004517734
+-1.75243 ) things 0.01894335
+-1.75243 ) took 0.05332709
+-2.620192 , 2008 0.05332709
+-2.620192 , 224 0.05332709
+-2.620192 , a -0.01011507
+-2.620192 , all 0.01894335
+-1.955229 , and -0.006035992
+-2.620192 , as 0.0389223
+-2.620192 , bob 0.05332709
+-2.620192 , briffa -0.005168174
+-0.8166095 , but 0.05114232
+-2.620192 , cf 0.05332709
+-2.620192 , cru 0.00272119
+-2.620192 , delete 0.05332709
+-2.620192 , for -0.002554279
+-2.620192 , from 0.008061528
+-2.620192 , he -0.0004517734
+-2.620192 , his 0.008061528
+-1.955229 , i 0.008061524
+-2.620192 , if 0.01894335
+-2.620192 , including 0.01894335
+-2.620192 , is -0.008505944
+-1.874543 , it -0.0004517762
+-1.874543 , it's 0.01894334
+-2.620192 , kaufman 0.05332709
+-2.620192 , most 0.008061528
+-2.620192 , notwithstanding 0.05332709
+-2.620192 , of 0.007685009
+-2.620192 , on -0.005168174
+-2.620192 , perhaps 0.04797027
+-2.620192 , requiring 0.05332709
+-2.620192 , since 0.05332709
+-1.955229 , the 0.02331641
+-1.955229 , this 0.01715922
+-2.620192 , until 0.05332709
+-2.620192 , using 0.01894335
+-1.874543 , when 0.03010483
+-2.620192 , where 0.008061528
+-1.874543 , which 0.01894334
+-2.620192 , while 0.008061528
+-2.620192 , yamal -0.005168174
+-0.8493397 - not -0.006728992
+-2.482808 . " -0.008505944
+-2.482808 . ' 0.01894335
+-2.482808 . ( -0.005168174
+-2.482808 . ) -0.005168174
+-0.6792259 . </s>
+-1.737159 . a 0.003078613
+-2.482808 . actually 0.05332709
+-2.482808 . and -0.01063527
+-2.482808 . as -0.004049858
+-1.737159 . briffa 0.03257156
+-2.482808 . but -0.007295175
+-2.482808 . changing 0.05332709
+-2.482808 . first 0.05332709
+-2.482808 . furthermore 0.05332709
+-1.737159 . however 0.1451325
+-2.482808 . i -0.006035987
+-2.482808 . in -0.009490006
+-2.482808 . it 0.0164606
+-2.482808 . perhaps 0.04797027
+-2.482808 . science 0.1193421
+-2.482808 . several 0.05332709
+-2.482808 . the -0.008591395
+-1.737159 . these 0.01894334
+-1.737159 . this 0.0130633
+-2.482808 . violence 0.05332709
+-2.482808 . what -0.004049858
+-2.482808 . what's 0.05332709
+-2.482808 . while 0.008061528
+-2.482808 . with 0.05785327
+-2.482808 . wright's 0.05332709
+-1.15037 12 cores 0.008061528
+-1.15037 12 picked 0.05332709
+-0.8493397 17 ring-width 0.05332709
+-1.326461 2000 and -0.01063527
+-1.326461 2000 may 0.05332709
+-1.326461 2000 presented 0.05332709
+-0.8493397 2002 as -0.004049858
+-0.8493397 2006 . -0.0114856
+-0.8493397 2008 ) -0.005168174
+-0.8493397 2009 . 0.08907277
+-0.8493397 200–400 year 0.01894335
+-0.8493397 224 individual 0.05332709
+-1.995468 <s> ' 0.01894335
+-1.995468 <s> as 0.0389223
+-1.995468 <s> briffa's 0.01894335
+-1.995468 <s> but -0.007295175
+-1.995468 <s> i -0.006035987
+-1.995468 <s> if 0.01894335
+-1.995468 <s> in -0.009490006
+-1.995468 <s> next 0.05332709
+-1.249819 <s> perhaps 0.06234263
+-1.249819 <s> the 0.0223057
+-1.995468 <s> this -0.009059753
+-1.995468 <s> what -0.004049858
+-1.15037 ? " -0.008505944
+-1.15037 ? i -0.006035987
+-2.191762 a " 0.01222976
+-2.191762 a case 0.05332709
+-2.191762 a comment 0.05332709
+-2.191762 a commenter 0.05332709
+-2.191762 a different 0.01894335
+-1.5268 a few 0.109396
+-2.191762 a generating 0.05332709
+-2.191762 a great 0.2044696
+-2.191762 a mean 0.05332709
+-2.191762 a prior 0.05332709
+-2.191762 a provocative 0.05332709
+-2.191762 a rcs 0.008061528
+-2.191762 a science 0.008061528
+-2.191762 a shadow 0.05332709
+-2.191762 a similar 0.05332709
+-2.191762 a small 0.05332709
+-2.191762 a surface 0.05332709
+-2.191762 a thousand 0.05332709
+-2.191762 a time 0.01894335
+-2.191762 a valid 0.05332709
+-1.4514 about a -0.01011507
+-1.4514 about my 0.008061528
+-1.4514 about not -0.006728992
+-1.4514 about potential 0.01894335
+-0.8493397 acquiesced in -0.009490006
+-0.8493397 actually , -0.01187418
+-0.8493397 addition of -0.009287588
+-0.8493397 admit that 0.04168737
+-0.8493397 affected the -0.01198488
+-1.15037 against flesh 0.05332709
+-1.15037 against inhomogeneities 0.05332709
+-0.8493397 aging patterns 0.05332709
+-0.8493397 ago , -0.008075343
+-0.8493397 ahead you 0.05332709
+-1.15037 al ( -0.005168174
+-1.15037 al 2009 0.05332709
+-1.15037 all of -0.009287588
+-1.15037 all those 0.01894335
+-0.8493397 all-around naughtiness 0.05332709
+-0.8493397 along the -0.01198488
+-1.15037 also has 0.00272119
+-1.15037 also know 0.08231446
+-1.15037 always been 0.00272119
+-1.15037 always worth 0.05332709
+-1.54831 an exception 0.05332709
+-1.54831 an extension 0.05332709
+-1.54831 an immense 0.05332709
+-1.54831 an important 0.05332709
+-1.54831 an unintentional 0.05332709
+-0.8493397 analysis has 0.00272119
+-2.280704 and , -0.007080218
+-2.280704 and all-around 0.05332709
+-2.280704 and blood 0.05332709
+-2.280704 and briffa -0.005168174
+-2.280704 and even 0.05332709
+-2.280704 and got 0.05332709
+-2.280704 and hantemirov 0.09388901
+-2.280704 and he 0.06152429
+-2.280704 and i've 0.05332709
+-2.280704 and it -0.006728992
+-2.280704 and most 0.008061528
+-2.280704 and outright 0.05332709
+-2.280704 and perhaps -0.0004517734
+-2.280704 and politics 0.008061528
+-2.280704 and potential 0.01894335
+-2.280704 and principalities 0.05332709
+-2.280704 and sat 0.05332709
+-2.280704 and science 0.1193421
+-1.615741 and shiyatov 0.05332708
+-2.280704 and temperature 0.05332709
+-2.280704 and that -0.008505944
+-1.615741 and the -0.005814605
+-2.280704 and they 0.00272119
+-0.8493397 anti-divine powers 0.05332709
+-0.8493397 any journal 0.01894335
+-0.8493397 approach to -0.01011507
+-0.8493397 archive the -0.01198488
+-0.8493397 are to -0.01011507
+-1.15037 arkive down 0.05332709
+-1.15037 arkive under 0.05332709
+-1.326461 article , -0.007080218
+-1.326461 article . -0.004888296
+-1.326461 article on -0.005168174
+-1.694438 as a -0.01011507
+-0.9487888 as ca 0.1451325
+-1.694438 as compared 0.05332709
+-1.694438 as follows: 0.05332709
+-1.694438 as it 0.0164606
+-1.694438 as noted 0.05332709
+-0.8493397 asked for -0.002554279
+-1.326461 at a -0.01011507
+-1.326461 at precisely 0.05332709
+-1.326461 at the -0.01198488
+-1.15037 attention , 0.05896524
+-1.15037 attention . -0.0114856
+-0.8493397 available , -0.008075343
+-0.8493397 average , -0.01187418
+-0.8493397 away ) 0.03209379
+-0.8493397 ayers and -0.01063527
+-0.8493397 b , -0.01187418
+-0.8493397 back-and-forth yesterday 0.05332709
+-0.8493397 bailie . -0.0114856
+-1.15037 be happening 0.01894335
+-1.15037 be included 0.05332709
+-0.8493397 because so 0.05332709
+-1.4514 been an -0.0004517734
+-1.4514 been concerned 0.05332709
+-1.4514 been done 0.05332709
+-1.4514 been projected 0.05332709
+-0.8493397 before , -0.01187418
+-1.15037 begin in -0.009490006
+-1.15037 begin with -0.007295175
+-0.8493397 being true 0.05332709
+-1.326461 between ring 0.05332709
+-0.580812 between the -0.06704012
+-1.4514 bias , -0.007080218
+-1.4514 bias introduced 0.05332709
+-1.4514 bias towards 0.05332709
+-1.4514 bias would 0.08231446
+-0.8493397 biased selection 0.1193421
+-0.8493397 biblical passage 0.05332709
+-0.8493397 bill ayers 0.05332709
+-0.8493397 blade was -0.0004517734
+-0.8493397 blood , 0.05896524
+-0.8493397 bob ? 0.01894335
+-0.8493397 book was -0.0004517734
+-1.087467 briffa 2000 0.05332708
+-1.75243 briffa 2006 0.05332709
+-1.75243 briffa asked 0.05332709
+-1.75243 briffa et 0.2044696
+-1.75243 briffa to -0.01011507
+-1.75243 briffa used -0.0004517734
+-1.15037 briffa's own 0.05332709
+-1.15037 briffa's yamal -0.005168174
+-1.890732 but , -0.01187418
+-1.890732 but anti-divine 0.05332709
+-1.890732 but because 0.05332709
+-1.890732 but between 0.1193421
+-1.890732 but given 0.05332709
+-1.145083 but it -0.0004517762
+-1.890732 but it's 0.00272119
+-1.890732 but the -0.01198488
+-1.890732 but this 0.009005655
+-1.890732 but to 0.002916232
+-1.694438 by bill 0.05332709
+-1.694438 by gil 0.05332709
+-1.694438 by hantemirov 0.09388901
+-1.694438 by how 0.05332709
+-1.694438 by magnus 0.05332709
+-0.9487888 by the -0.01105098
+-0.4047208 ca readers 0.05332709
+-1.15037 can combine 0.05332709
+-1.15037 can see 0.1193421
+-0.8493397 case where 0.008061528
+-0.8493397 cast these 0.00272119
+-0.8493397 catch my 0.1193421
+-0.8493397 caught my 0.1193421
+-0.8493397 caveats on -0.005168174
+-0.8493397 centennial-scale variability 0.01894335
+-0.8493397 cf . -0.0114856
+-0.8493397 change with -0.007295175
+-0.8493397 changing what -0.004049858
+-0.8493397 characterizes northern 0.05332709
+-0.8493397 checked earlier 0.05332709
+-1.75243 chronology , -0.01187418
+-1.75243 chronology also 0.01894335
+-1.75243 chronology briffa -0.005168174
+-1.75243 chronology has 0.00272119
+-1.75243 chronology in -0.009490006
+-1.75243 chronology method 0.00272119
+-1.75243 chronology was -0.0004517734
+-1.75243 chronology with -0.007295175
+-0.8493397 church for -0.002554279
+-0.8493397 cocaine for -0.002554279
+-0.8493397 collection does 0.05332709
+-0.8493397 combination with 0.05785327
+-0.8493397 combine the -0.01198488
+-0.8493397 combined with 0.05785327
+-0.8493397 comment by -0.004049858
+-0.8493397 commentary on 0.03209379
+-0.8493397 commenter remarked 0.05332709
+-0.8493397 comments catch 0.05332709
+-0.8493397 compared to 0.02102831
+-0.8493397 concerned about 0.00272119
+-0.8493397 concrete " -0.008505944
+-0.8493397 connection with -0.007295175
+-1.15037 conservatives said 0.05332709
+-1.15037 conservatives were -0.002554279
+-0.8493397 considered " -0.008505944
+-0.8493397 consists , -0.01187418
+-0.8493397 constructing a -0.01011507
+-1.15037 control ! 0.01894335
+-1.15037 control the -0.01198488
+-1.326461 cores , -0.008075343
+-1.326461 cores . -0.004888296
+-1.326461 cores were 0.04819728
+-0.8493397 corridor method 0.00272119
+-1.15037 crack about 0.00272119
+-1.15037 crack cocaine 0.05332709
+-0.8493397 crossroads . -0.0114856
+-0.7057508 cru population 0.07636014
+-1.4514 cru selection 0.008061528
+-1.4514 cru staunchly 0.05332709
+-0.8493397 darkness and -0.01063527
+-1.803582 data ( -0.005168174
+-1.057933 data . -0.0100497
+-1.803582 data policy 0.05332709
+-1.803582 data remained 0.05332709
+-1.803582 data set 0.05332709
+-1.803582 data used 0.04797027
+-1.803582 data was -0.0004517734
+-1.803582 data were -0.002554279
+-1.15037 day politics 0.008061528
+-1.15037 day to -0.01011507
+-1.15037 days . 0.08907277
+-1.15037 days ago 0.05332709
+-0.8493397 debt , -0.007080218
+-0.8493397 decline is -0.008505944
+-0.8493397 deep into 0.01894335
+-0.8493397 deeper principles 0.05332709
+-0.8493397 delete a 0.0001907796
+-0.8493397 derived from 0.008061528
+-0.8493397 described in -0.009490006
+-1.15037 did not -0.006728992
+-1.15037 did they 0.00272119
+-1.15037 difference . 0.08907277
+-1.15037 difference between 0.1193421
+-1.15037 different aging 0.05332709
+-1.15037 different data -0.006035987
+-0.4047208 divergence problem 0.1451325
+-1.15037 do and -0.002266581
+-1.15037 do indeed 0.05332709
+-0.8493397 does not 0.0164606
+-0.8493397 doing exactly 0.05332709
+-0.8493397 don't really 0.05332709
+-0.8493397 done without 0.05332709
+-0.8493397 doubt what -0.004049858
+-0.8493397 down to -0.01011507
+-0.8493397 due just 0.00272119
+-0.8493397 earlier this -0.009059753
+-0.8493397 editors finally 0.008061528
+-0.8493397 energy , 0.05896524
+-0.8493397 enormous hs 0.05332709
+-0.4047208 et al 0.05332709
+-0.8493397 even probable 0.05332709
+-0.8493397 every subsequent 0.05332709
+-0.8493397 exactly what -0.004049858
+-0.8493397 exception to 0.02102831
+-0.8493397 excluding khadyta 0.2044696
+-0.8493397 expect from 0.008061528
+-0.8493397 extension and -0.01063527
+-0.8493397 factors , 0.05896524
+-0.8493397 fantasy had -0.0004517734
+-0.8493397 far more 0.008061528
+-1.326461 few at 0.008061528
+-0.580812 few days 0.05332709
+-1.326461 finally available 0.05332709
+-1.326461 finally placed 0.05332709
+-1.326461 finally seized 0.05332709
+-0.8493397 first , -0.01187418
+-0.8493397 flesh and -0.01063527
+-0.8493397 following: </s>
+-0.8493397 follows: </s>
+-1.627491 for all 0.01894335
+-1.627491 for an -0.0004517734
+-1.627491 for excluding 0.05332709
+-1.627491 for him 0.01894335
+-1.627491 for paleoclimatologists 0.05332709
+-1.627491 for we 0.05332709
+-0.8493397 forests . -0.004888296
+-1.326461 from 200–400 0.05332709
+-1.326461 from a -0.01011507
+-1.326461 from someone 0.05332709
+-0.8493397 fully thinking 0.05332709
+-1.326461 further ahead 0.05332709
+-1.326461 further along 0.05332709
+-1.326461 further away 0.05332709
+-0.8493397 furthermore , -0.007080218
+-0.8493397 future . -0.0114856
+-0.8493397 generating script 0.05332709
+-0.4047208 get the -0.06704012
+-0.8493397 ghastly tendency 0.05332709
+-0.8493397 ghostwritten by -0.004049858
+-0.8493397 gil bailie 0.05332709
+-0.8493397 given the -0.01198488
+-0.8493397 going to -0.01011507
+-0.8493397 got used 0.04797027
+-0.4047208 great idea 0.05332709
+-0.8493397 growing . 0.08907277
+-0.8493397 grows more 0.008061528
+-0.8026608 had a -0.007295178
+-1.54831 had been 0.00272119
+-1.54831 had in -0.009490006
+-1.54831 had jurisdiction 0.05332709
+-0.6614985 hantemirov and -0.5914098
+-1.15037 happening deep 0.05332709
+-1.15037 happening right 0.00272119
+-0.8493397 happens today 0.01894335
+-0.8493397 hard to -0.01011507
+-0.8493397 hardly know 0.00272119
+-1.4514 has a -0.01011507
+-1.4514 has always 0.01894335
+-1.4514 has only 0.05332709
+-1.4514 has the -0.01198488
+-0.8493397 hate to -0.01011507
+-1.627491 have an -0.0004517734
+-0.881842 have been 0.01894334
+-1.627491 have relied 0.05332709
+-1.627491 have similarly 0.05332709
+-1.627491 have the -0.01198488
+-0.8493397 haven't read 0.05332709
+-0.8026608 he is -0.004049861
+-1.54831 he made 0.05332709
+-1.54831 he would 0.00272119
+-1.54831 he wrote 0.05332709
+-0.8493397 here prove 0.05332709
+-0.8493397 highly possible 0.05332709
+-1.15037 him hate 0.05332709
+-1.15037 him to 0.002916232
+-1.326461 his comments 0.05332709
+-1.326461 his initial 0.05332709
+-1.326461 his precipitous 0.05332709
+-0.8493397 how their 0.05332709
+-0.4047208 however , -0.01082908
+-0.8493397 hs blade 0.05332709
+-0.8493397 humanity at 0.008061528
+-1.803582 i can 0.01894335
+-1.803582 i checked 0.05332709
+-1.803582 i had 0.06152429
+-1.803582 i hardly 0.05332709
+-1.803582 i haven't 0.05332709
+-1.803582 i know 0.00272119
+-1.803582 i noticed 0.05332709
+-1.803582 i skimmed 0.05332709
+-1.803582 i stumbled 0.05332709
+-0.8493397 i'd love 0.05332709
+-0.8493397 i've provided 0.05332709
+-1.15037 idea , -0.01187418
+-1.15037 idea . -0.0114856
+-1.15037 if it -0.006728992
+-1.15037 if the -0.01198488
+-0.8493397 illusion and -0.01063527
+-0.8493397 immense energy 0.05332709
+-0.8493397 impact on -0.005168174
+-0.8493397 important impact 0.05332709
+-1.358963 in a -0.007295178
+-2.104612 in any 0.05332709
+-2.104612 in briffa 0.02412629
+-2.104612 in briffa's 0.01894335
+-2.104612 in combination 0.05332709
+-2.104612 in connection 0.05332709
+-2.104612 in hantemirov 0.09388901
+-2.104612 in mind 0.05332709
+-2.104612 in one 0.008061528
+-2.104612 in passing 0.05332709
+-2.104612 in response 0.05332709
+-2.104612 in rev 0.05332709
+-2.104612 in terms 0.05332709
+-1.358963 in the -0.007650165
+-2.104612 in this -0.009059753
+-2.104612 in virtually 0.05332709
+-0.8493397 included with 0.05785327
+-1.15037 including , -0.01187418
+-1.15037 including the -0.007702977
+-0.8493397 indeed see 0.1193421
+-0.8493397 individual series 0.008061528
+-0.8493397 information finally 0.008061528
+-0.8493397 inhomogeneities , 0.05896524
+-0.8493397 initial use 0.2044696
+-0.4047208 instead of 0.01149127
+-0.8493397 interannual variability 0.01894335
+-1.15037 into him 0.01894335
+-1.15037 into the -0.01198488
+-0.8493397 introduced by -0.004049858
+-1.995468 is , -0.007080218
+-1.995468 is always 0.01894335
+-1.995468 is considered 0.05332709
+-1.995468 is derived 0.05332709
+-1.995468 is doing 0.05332709
+-1.995468 is happening 0.01894335
+-1.995468 is highly 0.05332709
+-1.995468 is measured 0.05332709
+-1.995468 is no 0.05332709
+-1.995468 is not -0.006728992
+-1.995468 is related 0.05332709
+-1.995468 is that -0.008505944
+-1.995468 is the -0.01198488
+-1.995468 is within 0.05332709
+-1.84934 it grows 0.05332709
+-1.84934 it has 0.00272119
+-1.184377 it is 0.0004524188
+-1.84934 it just 0.00272119
+-1.84934 it looks 0.05332709
+-1.84934 it originated 0.05332709
+-1.84934 it was -0.0004517734
+-1.84934 it yet 0.05332709
+-1.4514 it's like 0.01894335
+-1.4514 it's much 0.01894335
+-1.4514 it's not -0.006728992
+-1.4514 it's very 0.01894335
+-0.8493397 its enormous 0.05332709
+-1.15037 journal ( -0.005168174
+-1.15037 journal article 0.008061528
+-0.8493397 jurisdiction . -0.004888296
+-1.4514 just between 0.008061528
+-1.4514 just keeps 0.05332709
+-1.4514 just one 0.008061528
+-1.4514 just to 0.02102831
+-0.8493397 kaufman et 0.2044696
+-0.8493397 keeps growing 0.05332709
+-0.4047208 khadyta river 0.1451325
+-1.4514 know ! 0.01894335
+-0.7057508 know , -0.007021053
+-1.4514 know where 0.008061528
+-0.8493397 larch sample 0.05332709
+-1.15037 larches . 0.08907277
+-1.15037 larches were 0.04819728
+-0.8493397 large-scale " 0.01222976
+-1.15037 like crack 0.01894335
+-1.15037 like trying 0.05332709
+-0.8493397 limited size 0.05332709
+-0.8493397 living larches 0.01894335
+-0.8493397 longest and -0.01063527
+-0.8493397 looking up 0.01894335
+-0.8493397 looks relevant 0.05332709
+-0.8493397 love to -0.01011507
+-0.8493397 made that -0.008505944
+-0.4047208 mag ) 0.002721187
+-0.8493397 magnitude of -0.009287588
+-0.8493397 magnus . -0.0114856
+-0.8493397 makes the -0.01198488
+-0.8493397 many multiproxy 0.05332709
+-0.8493397 may well 0.05332709
+-0.8493397 mean chronology -0.005168174
+-0.8493397 measured by 0.0389223
+-0.4047208 measurement data 0.0009555696
+-1.4514 method " -0.008505944
+-1.4514 method . -0.004888296
+-1.4514 method that -0.008505944
+-1.4514 method which 0.00272119
+-0.8493397 methodology warn 0.05332709
+-0.8493397 mind when 0.008061528
+-0.8493397 mix religion 0.05332709
+-1.326461 more " -0.008505944
+-1.326461 more it 0.0164606
+-1.326461 more slowly 0.01894335
+-0.8493397 morning i -0.006035987
+-1.326461 most recent 0.05332709
+-1.326461 most recently 0.2044696
+-1.326461 most sensitive 0.05332709
+-1.15037 much further 0.008061528
+-1.15037 much illusion 0.05332709
+-0.8493397 multi-parters , -0.01187418
+-0.8493397 multiproxy studies 0.05332709
+-0.8493397 mundane politics 0.008061528
+-0.580812 my attention 0.05332709
+-1.326461 my ghastly 0.05332709
+-0.8493397 national debt 0.05332709
+-0.8493397 naughtiness . -0.0114856
+-0.8493397 nettle , -0.01187418
+-0.8493397 never properly 0.05332709
+-0.8493397 next , -0.008075343
+-0.8493397 no doubt 0.05332709
+-0.8493397 non-robustness observed 0.05332709
+-0.8493397 northern forests 0.05332709
+-1.84934 not be 0.01894335
+-1.84934 not due 0.05332709
+-1.84934 not going 0.05332709
+-1.184377 not have 0.07243546
+-1.84934 not just 0.00272119
+-1.84934 not preserve 0.05332709
+-1.84934 not struggling 0.05332709
+-1.84934 not using 0.01894335
+-0.8493397 noted before 0.05332709
+-0.8493397 noticed that 0.04168737
+-0.8493397 notwithstanding these 0.00272119
+-0.8493397 now , 0.05896524
+-1.15037 obama , -0.007080218
+-1.15037 obama is -0.008505944
+-0.8493397 observed here 0.05332709
+-2.079789 of 17 0.05332709
+-2.079789 of a -0.01011507
+-2.079789 of being 0.05332709
+-2.079789 of commentary 0.05332709
+-2.079789 of darkness 0.05332709
+-2.079789 of deeper 0.05332709
+-2.079789 of his 0.008061528
+-2.079789 of interannual 0.05332709
+-2.079789 of mundane 0.05332709
+-2.079789 of old 0.01894335
+-1.33414 of older 0.03455187
+-2.079789 of reposting 0.05332709
+-2.079789 of subfossil 0.008061528
+-1.33414 of the -0.06704012
+-2.079789 of this -0.009059753
+-1.15037 old living 0.05332709
+-1.15037 old trees 0.00272119
+-0.6614985 older trees 0.03579502
+-0.8493397 oldie , -0.008075343
+-1.006781 on a -0.007295178
+-1.75243 on average 0.05332709
+-1.75243 on many 0.05332709
+-1.75243 on rcs 0.008061528
+-1.75243 on the -0.007702977
+-1.006781 on this -0.005168174
+-1.326461 one . -0.0114856
+-1.326461 one approach 0.05332709
+-1.326461 one oldie 0.05332709
+-0.8493397 online . -0.0114856
+-0.8493397 only taken 0.05332709
+-0.8493397 or real 0.05332709
+-0.8493397 originated with -0.007295175
+-0.8493397 osborn and -0.01063527
+-0.8493397 out ( -0.005168174
+-0.8493397 outright fantasy 0.05332709
+-0.8493397 own caveats 0.05332709
+-0.8493397 paleoclimatologists and -0.01063527
+-0.8493397 passage i -0.006035987
+-0.8493397 passing and -0.01063527
+-0.8493397 path " -0.008505944
+-0.8493397 patterns in 0.006514465
+-0.8493397 paul had -0.0004517734
+-0.8493397 people that -0.008505944
+-0.8833473 perhaps the -0.01011507
+-1.54831 perhaps there's 0.05332709
+-1.54831 perhaps they 0.00272119
+-0.4047208 phil trans 0.05332709
+-0.8493397 picked cores 0.008061528
+-0.8493397 piece by -0.004049858
+-0.8493397 place . -0.0114856
+-0.8493397 placed online 0.05332709
+-0.8493397 play on 0.03209379
+-0.8493397 point that -0.008505944
+-0.8493397 policy ) -0.005168174
+-1.326461 politics , -0.01187418
+-1.326461 politics . -0.004888296
+-1.326461 politics are 0.05332709
+-0.8026608 population . -0.0100497
+-1.54831 population as -0.004049858
+-1.54831 population consists 0.05332709
+-1.54831 population instead 0.2044696
+-0.8493397 position that 0.04168737
+-0.8493397 possible and -0.01063527
+-1.15037 potential bias 0.00272119
+-1.15037 potential unrepresentativeness 0.05332709
+-0.8493397 power to -0.01011507
+-0.8493397 powers and -0.01063527
+-0.8493397 precipitous decline 0.05332709
+-0.8493397 precisely the -0.007702977
+-0.8493397 predictable factors 0.05332709
+-0.8493397 presented this 0.009005655
+-0.8493397 preserve centennial-scale 0.05332709
+-0.8493397 previous journal 0.01894335
+-0.8493397 principalities of -0.009287588
+-0.8493397 principles in 0.006514465
+-0.8493397 prior selection 0.1193421
+-0.8493397 probable that 0.04168737
+-0.4047208 problem " -0.004049861
+-0.8493397 projected into 0.01894335
+-0.8493397 properly published 0.05332709
+-0.8493397 prove out 0.05332709
+-0.8493397 provide the -0.01198488
+-0.8493397 provided a -0.01011507
+-0.8493397 provocative thought 0.05332709
+-0.8493397 published in -0.009490006
+-0.8493397 push at 0.008061528
+-1.326461 rcs chronology -0.005168174
+-1.326461 rcs method 0.00272119
+-1.326461 rcs methodology 0.05332709
+-0.8493397 react to 0.002916232
+-0.8493397 read it -0.006728992
+-1.15037 readers also 0.01894335
+-1.15037 readers know 0.08231446
+-0.8493397 reading , -0.01187418
+-0.8493397 real ) -0.005168174
+-0.8493397 really react 0.05332709
+-0.8493397 realm of -0.009287588
+-1.15037 reason for -0.002554279
+-1.15037 reason why 0.05332709
+-0.8493397 recent one 0.008061528
+-0.4047208 recently , -0.01082908
+-0.8493397 reconstruction . -0.0114856
+-0.8493397 refusal in -0.009490006
+-0.8493397 refused to -0.01011507
+-0.8493397 related to -0.01011507
+-0.8493397 relevant , -0.008075343
+-0.8493397 relied on 0.03209379
+-0.8493397 religion and -0.01063527
+-0.8493397 remained unarchived 0.05332709
+-0.8493397 remarked that -0.008505944
+-0.8493397 reposting just 0.00272119
+-0.8493397 requiring briffa -0.005168174
+-0.8493397 response to 0.02102831
+-0.8493397 resulting yamal 0.02412629
+-0.8493397 rev . -0.0114856
+-1.4514 right . -0.0114856
+-1.4514 right now 0.05332709
+-1.4514 right place 0.05332709
+-1.4514 right time 0.01894335
+-0.8493397 ring widths 0.05332709
+-0.8493397 ring-width series 0.1193421
+-0.4047208 river , -0.01082908
+-0.8493397 said he -0.0004517734
+-0.8493397 same bias 0.00272119
+-0.8493397 sample should 0.05332709
+-0.8493397 sat in -0.009490006
+-1.4514 schweingruber data -0.006035987
+-0.7864373 schweingruber population 0.09172077
+-0.8493397 schweingruber's khadyta 0.2044696
+-0.580812 science ( -0.02724335
+-1.326461 science article 0.008061528
+-0.8493397 script ) 0.03209379
+-1.326461 see , -0.008075343
+-0.580812 see the -0.01105098
+-0.8493397 seized the -0.01198488
+-1.15037 selected . -0.004888296
+-1.15037 selected on 0.03209379
+-1.326461 selection is -0.008505944
+-0.580812 selection of 0.01149127
+-0.8493397 sensitive series 0.1193421
+-0.8493397 sensitivity is -0.008505944
+-0.580812 series , -0.01082908
+-1.326461 series of -0.009287588
+-0.8493397 set in -0.009490006
+-0.8493397 several things 0.01894335
+-0.8493397 shadow play 0.05332709
+-1.15037 shadows . -0.0114856
+-1.15037 shadows of -0.009287588
+-1.326461 shiyatov 2002 0.05332709
+-1.326461 shiyatov themselves 0.01894335
+-1.326461 shiyatov would 0.08231446
+-0.8493397 should not -0.006728992
+-0.8493397 similar schweingruber 0.00272119
+-0.8493397 similarly affected 0.05332709
+-0.8493397 since this -0.009059753
+-0.8493397 size and -0.01063527
+-0.8493397 skimmed this -0.009059753
+-1.15037 slowly , -0.01187418
+-1.15037 slowly get 0.2044696
+-0.8493397 small push 0.05332709
+-0.8493397 so much 0.01894335
+-0.8493397 some reason 0.01894335
+-0.8493397 someone whose 0.05332709
+-0.8493397 start today 0.01894335
+-0.8493397 staunchly refused 0.05332709
+-0.8493397 struggling against 0.01894335
+-0.8493397 studies that -0.008505944
+-1.15037 study , -0.01187418
+-1.15037 study . 0.08907277
+-0.8493397 stumbled upon 0.05332709
+-1.326461 subfossil collection 0.05332709
+-1.326461 subfossil data 0.02685598
+-1.326461 subfossil larches 0.01894335
+-0.8493397 subsequent study 0.01894335
+-0.8493397 subset in -0.009490006
+-0.8493397 success . -0.0114856
+-0.8493397 supplement . 0.08907277
+-0.8493397 supplemented by 0.0389223
+-0.8493397 surface , -0.008075343
+-0.8493397 take an -0.0004517734
+-0.8493397 taken a 0.0001907796
+-1.15037 taymir data -0.006035987
+-1.15037 taymir supplement 0.05332709
+-0.8493397 temperature , 0.05896524
+-0.8493397 tendency to -0.01011507
+-0.8493397 terms of -0.009287588
+-0.8493397 than the -0.008591395
+-1.995468 that " -0.008505944
+-1.995468 that cast 0.05332709
+-1.995468 that characterizes 0.05332709
+-1.995468 that have -0.002554279
+-1.995468 that he 0.06152429
+-1.995468 that his 0.008061528
+-0.9275748 that the 0.03271748
+-1.995468 that they 0.00272119
+-1.995468 that voted 0.05332709
+-1.995468 that way 0.05332709
+-1.995468 that wise 0.05332709
+-2.668884 the " -0.008505944
+-1.923235 the 12 0.05332709
+-2.668884 the addition 0.05332709
+-1.923235 the arkive 0.05332709
+-2.668884 the back-and-forth 0.05332709
+-2.668884 the biased 0.05332709
+-2.668884 the biblical 0.05332709
+-2.668884 the chronology -0.005168174
+-1.923235 the conservatives 0.05332709
+-2.668884 the crossroads 0.05332709
+-2.003921 the cru 0.0632299
+-2.668884 the data 0.02685598
+-2.668884 the day 0.01894335
+-2.668884 the difference 0.01894335
+-2.668884 the far 0.05332709
+-2.668884 the following: 0.05332709
+-2.668884 the further 0.008061528
+-2.668884 the future 0.05332709
+-2.668884 the information 0.05332709
+-2.668884 the large-scale 0.05332709
+-2.668884 the longest 0.05332709
+-2.668884 the magnitude 0.05332709
+-2.668884 the measurement 0.2044696
+-2.668884 the more 0.008061528
+-2.668884 the most 0.008061528
+-2.668884 the multi-parters 0.05332709
+-2.668884 the national 0.05332709
+-2.668884 the nettle 0.05332709
+-2.668884 the non-robustness 0.05332709
+-2.668884 the path 0.05332709
+-2.668884 the people 0.05332709
+-2.668884 the phil 0.2044696
+-2.668884 the point 0.05332709
+-2.668884 the position 0.05332709
+-2.668884 the previous 0.05332709
+-2.668884 the rcs 0.008061528
+-2.668884 the realm 0.05332709
+-2.668884 the resulting 0.05332709
+-1.923235 the right 0.01894334
+-2.668884 the same 0.05332709
+-2.003921 the schweingruber -0.5245172
+-2.668884 the shadows 0.01894335
+-2.668884 the subfossil 0.008061528
+-1.923235 the taymir 0.05332709
+-1.923235 the trouble 0.1451325
+-1.923235 the two 0.1451325
+-2.668884 the use 0.2044696
+-2.668884 the usual 0.05332709
+-2.668884 the very 0.01894335
+-2.668884 the virtue 0.05332709
+-1.120574 the yamal 0.02719982
+-0.8493397 their cores 0.008061528
+-1.15037 themselves , -0.01187418
+-1.15037 themselves were -0.002554279
+-0.8493397 there's some 0.05332709
+-1.4514 these data -0.006035987
+-1.4514 these shadows 0.01894335
+-1.4514 these warnings 0.05332709
+-1.4514 these were -0.002554279
+-1.4514 they can 0.01894335
+-1.4514 they don't 0.05332709
+-1.4514 they expect 0.05332709
+-1.4514 they themselves 0.01894335
+-1.15037 things caught 0.05332709
+-1.15037 things that -0.008505944
+-0.8493397 think up 0.01894335
+-0.8493397 thinking through 0.05332709
+-2.05346 this analysis 0.05332709
+-2.05346 this article 0.008061528
+-2.05346 this bias 0.00272119
+-1.307811 this chronology 0.002721187
+-2.05346 this difference 0.01894335
+-1.307811 this is -0.004049861
+-2.05346 this method 0.00272119
+-2.05346 this morning 0.05332709
+-2.05346 this piece 0.05332709
+-2.05346 this refusal 0.05332709
+-2.05346 this study 0.01894335
+-2.05346 this subset 0.05332709
+-2.05346 this will 0.01894335
+-2.05346 this year 0.01894335
+-1.15037 those " -0.008505944
+-1.15037 those years 0.05332709
+-0.8493397 thought . -0.0114856
+-0.8493397 thousand , 0.05896524
+-0.8493397 through the -0.01198488
+-1.15037 time , -0.008075343
+-1.15037 time and -0.002266581
+-2.191762 to about 0.00272119
+-2.191762 to admit 0.05332709
+-2.191762 to archive 0.05332709
+-1.446113 to begin 0.05332709
+-2.191762 to change 0.05332709
+-2.191762 to constructing 0.05332709
+-2.191762 to control 0.01894335
+-2.191762 to day 0.01894335
+-2.191762 to different 0.01894335
+-2.191762 to get 0.2044696
+-2.191762 to mix 0.05332709
+-2.191762 to provide 0.05332709
+-2.191762 to start 0.05332709
+-1.123869 to the -0.005761562
+-2.191762 to think 0.05332709
+-2.191762 to those 0.01894335
+-1.446113 to what 0.005001867
+-1.15037 today . -0.0114856
+-1.15037 today would 0.00272119
+-0.8493397 took the -0.01198488
+-0.8493397 towards older 0.09388901
+-1.15037 trans b 0.05332709
+-1.15037 trans editors 0.05332709
+-1.4514 trees . -0.0114856
+-1.4514 trees an -0.0004517734
+-1.4514 trees described 0.05332709
+-1.4514 trees than 0.05332709
+-0.4047208 trouble with -0.03998877
+-0.8493397 true , -0.01187418
+-0.8493397 trying to -0.01011507
+-0.4047208 two versions 0.05332709
+-0.8493397 unarchived . -0.004888296
+-0.8493397 under control 0.01894335
+-0.8493397 unintentional bias 0.00272119
+-0.8493397 unrepresentativeness of 0.007685009
+-0.8493397 until recently 0.2044696
+-0.8493397 unveiled: humanity 0.05332709
+-1.15037 up a -0.01011507
+-1.15037 up the -0.01198488
+-0.8493397 upon this -0.009059753
+-0.4047208 use of -0.005627823
+-1.54831 used by -0.004049858
+-0.8833473 used in 0.01371272
+-1.54831 used the -0.01198488
+-1.15037 using . 0.08907277
+-1.15037 using the -0.008591395
+-0.8493397 usual predictable 0.05332709
+-0.8493397 valid reason 0.01894335
+-1.15037 variability . -0.004888296
+-1.15037 variability and -0.01063527
+-1.15037 versions . 0.08907277
+-1.15037 versions is -0.008505944
+-1.15037 very hard 0.05332709
+-1.15037 very limited 0.05332709
+-0.8493397 violence unveiled: 0.05332709
+-0.8493397 virtually every 0.05332709
+-0.8493397 virtue of -0.009287588
+-0.8493397 voted for -0.002554279
+-0.8493397 warn against 0.01894335
+-0.8493397 warnings , -0.01187418
+-1.54831 was finally 0.008061528
+-1.54831 was ghostwritten 0.05332709
+-1.54831 was like 0.01894335
+-1.54831 was never 0.05332709
+-1.54831 was used 0.04797027
+-0.8493397 way slowly 0.01894335
+-0.8493397 we do 0.01894335
+-0.8493397 well have 0.04819728
+-1.627491 were not -0.006728992
+-1.627491 were right 0.00272119
+-0.881842 were selected 0.05332709
+-1.627491 were supplemented 0.05332709
+-1.627491 were the -0.01198488
+-1.694438 what a -0.01011507
+-1.694438 what did 0.01894335
+-1.694438 what happens 0.05332709
+-1.694438 what is -0.008505944
+-1.694438 what paul 0.05332709
+-1.694438 what the -0.007702977
+-1.694438 what will 0.01894335
+-0.8493397 what's your 0.01894335
+-1.326461 when combined 0.05332709
+-1.326461 when he -0.0004517734
+-1.326461 when i -0.006035987
+-1.326461 where it's 0.00272119
+-1.326461 where sensitivity 0.05332709
+-1.326461 where to 0.002916232
+-1.4514 which , -0.01187418
+-1.4514 which did 0.01894335
+-1.4514 which had 0.06152429
+-1.4514 which makes 0.05332709
+-1.326461 while including 0.01894335
+-1.326461 while looking 0.05332709
+-1.326461 while the 0.02129733
+-0.8493397 whose book 0.05332709
+-0.8493397 why schweingruber's 0.05332709
+-0.8493397 widths and -0.01063527
+-1.15037 will be 0.01894335
+-1.15037 will have -0.002554279
+-0.8493397 wise crack 0.01894335
+-1.890732 with . -0.004888296
+-1.890732 with a -0.01011507
+-1.890732 with briffa 0.02412629
+-1.890732 with its 0.05332709
+-1.145083 with obama 0.05332709
+-1.890732 with osborn 0.05332709
+-0.8228394 with the 0.02898683
+-0.8493397 within your 0.01894335
+-0.8493397 without fully 0.05332709
+-0.8493397 worth reading 0.05332709
+-1.4514 would do 0.01894335
+-0.7057508 would not -0.04287655
+-1.4514 would take 0.05332709
+-0.8493397 wright's church 0.05332709
+-0.8493397 wrote the -0.01198488
+-1.087467 yamal chronology 0.01075652
+-1.75243 yamal data -0.006035987
+-1.75243 yamal larch 0.05332709
+-1.75243 yamal measurement 0.2044696
+-1.75243 yamal reconstruction 0.05332709
+-1.75243 yamal subfossil 0.008061528
+-1.15037 year , -0.008075343
+-1.15037 year old 0.01894335
+-0.8493397 years ? 0.01894335
+-0.8493397 yes , -0.01187418
+-0.8493397 yesterday about 0.00272119
+-0.8493397 yet , 0.05896524
+-0.8493397 you see 0.008061528
+-1.15037 your great 0.2044696
+-1.15037 your power 0.05332709
+
+\3-grams:
+-1.533073 control ! as
+-1.533073 know ! instead
+-1.533073 . " i'd
+-1.533073 ? " </s>
+-1.533073 a " divergence
+-1.533073 concrete " (
+-1.533073 considered " success
+-1.533073 large-scale " divergence
+-1.533073 method " used
+-1.533073 more " concrete
+-1.533073 path " as
+-1.834103 problem " -
+-1.834103 problem " that
+-1.533073 that " the
+-1.533073 the " corridor
+-1.533073 those " further
+-1.533073 . ' </s>
+-1.533073 <s> ' yes
+-1.533073 " ( or
+-1.533073 . ( while
+-1.533073 al ( phil
+-1.533073 data ( in
+-1.533073 journal ( which
+-1.533073 out ( and
+-0.8145491 science ( mag
+-1.533073 . ) </s>
+-1.533073 2008 ) and
+-1.533073 away ) ,
+-1.834103 mag ) acquiesced
+-1.834103 mag ) took
+-1.533073 policy ) had
+-1.533073 real ) things
+-1.533073 script ) ,
+-1.834103 ) , it's
+-1.834103 ) , this
+-1.533073 actually , all
+-1.533073 ago , i
+-1.533073 and , when
+-1.533073 article , it
+-1.533073 attention , but
+-1.533073 available , this
+-1.533073 average , of
+-1.533073 b , 2008
+-1.533073 before , briffa
+-1.533073 bias , when
+-1.533073 blood , but
+-1.533073 but , notwithstanding
+-1.533073 chronology , 224
+-1.533073 consists , on
+-1.533073 cores , this
+-1.533073 debt , which
+-1.533073 energy , but
+-1.533073 factors , but
+-1.533073 first , a
+-1.533073 furthermore , it
+-1.834103 however , as
+-1.834103 however , using
+-1.533073 idea , bob
+-1.533073 including , most
+-1.533073 inhomogeneities , but
+-1.533073 is , it's
+-1.834103 know , the
+-1.834103 know , until
+-1.533073 multi-parters , delete
+-1.533073 nettle , requiring
+-1.533073 next , i
+-1.533073 now , but
+-1.533073 obama , which
+-1.533073 oldie , i
+-1.533073 politics , he
+-1.533073 reading , cf
+-1.834103 recently , cru
+-1.834103 recently , kaufman
+-1.533073 relevant , and
+-1.834103 river , while
+-1.834103 river , yamal
+-1.533073 see , the
+-1.834103 series , from
+-1.834103 series , where
+-1.533073 slowly , is
+-1.533073 study , including
+-1.533073 surface , and
+-1.533073 temperature , but
+-1.533073 themselves , since
+-1.533073 thousand , but
+-1.533073 time , and
+-1.533073 true , for
+-1.533073 warnings , his
+-1.533073 which , if
+-1.533073 year , the
+-1.533073 yes , perhaps
+-1.533073 yet , but
+-1.533073 " - not
+-1.533073 2006 . while
+-1.533073 2009 . </s>
+-1.533073 article . however
+-1.533073 attention . first
+-1.533073 bailie . i
+-1.533073 cf . violence
+-1.533073 cores . briffa
+-1.533073 crossroads . )
+-1.834103 data . as
+-1.834103 data . but
+-1.533073 days . </s>
+-1.533073 difference . </s>
+-1.533073 forests . however
+-1.533073 future . changing
+-1.533073 growing . </s>
+-1.533073 idea . what's
+-1.533073 jurisdiction . briffa
+-1.533073 larches . </s>
+-1.533073 magnus . actually
+-1.533073 method . this
+-1.533073 naughtiness . (
+-1.533073 one . in
+-1.533073 online . with
+-1.533073 place . '
+-1.533073 politics . this
+-1.834103 population . it
+-1.834103 population . the
+-1.533073 reconstruction . science
+-1.533073 rev . wright's
+-1.533073 right . what
+-1.533073 selected . these
+-1.533073 shadows . and
+-1.533073 study . </s>
+-1.533073 success . "
+-1.533073 supplement . </s>
+-1.533073 thought . furthermore
+-1.533073 today . several
+-1.533073 trees . perhaps
+-1.533073 unarchived . a
+-1.533073 using . </s>
+-1.533073 variability . these
+-1.533073 versions . </s>
+-1.533073 with . a
+-1.834103 the 12 cores
+-1.834103 the 12 picked
+-1.533073 of 17 ring-width
+-2.010194 briffa 2000 and
+-2.010194 briffa 2000 may
+-2.010194 briffa 2000 presented
+-1.533073 shiyatov 2002 as
+-1.533073 briffa 2006 .
+-1.533073 , 2008 )
+-1.533073 al 2009 .
+-1.533073 from 200–400 year
+-1.533073 , 224 individual
+-1.533073 bob ? i
+-1.533073 years ? "
+-1.533073 , a comment
+-1.834103 . a commenter
+-1.834103 . a few
+-1.533073 about a thousand
+-1.533073 as a shadow
+-1.533073 at a time
+-1.533073 constructing a mean
+-1.533073 delete a few
+-1.533073 from a prior
+-1.834103 had a different
+-1.834103 had a great
+-1.533073 has a "
+-1.834103 in a case
+-1.834103 in a science
+-1.533073 of a similar
+-1.834103 on a rcs
+-1.834103 on a surface
+-1.533073 provided a generating
+-1.533073 taken a few
+-1.533073 up a valid
+-1.533073 what a provocative
+-1.533073 with a small
+-1.533073 concerned about potential
+-1.533073 crack about not
+-1.533073 to about a
+-1.533073 yesterday about my
+-1.533073 ) acquiesced in
+-1.533073 . actually ,
+-1.533073 the addition of
+-1.533073 to admit that
+-1.533073 similarly affected the
+-1.533073 struggling against flesh
+-1.533073 warn against inhomogeneities
+-1.533073 different aging patterns
+-1.533073 days ago ,
+-1.533073 further ahead you
+-1.834103 et al (
+-1.834103 et al 2009
+-1.533073 , all of
+-1.533073 for all those
+-1.533073 and all-around naughtiness
+-1.533073 further along the
+-1.533073 chronology also has
+-1.533073 readers also know
+-1.533073 has always been
+-1.533073 is always worth
+-1.533073 been an exception
+-1.533073 for an extension
+-1.533073 have an important
+-1.533073 take an immense
+-1.533073 trees an unintentional
+-1.533073 this analysis has
+-1.533073 ( and i've
+-1.533073 ) and the
+-2.010194 , and he
+-2.010194 , and that
+-2.010194 , and they
+-1.533073 . and perhaps
+-1.533073 2000 and science
+-1.533073 ayers and sat
+-1.533073 darkness and all-around
+-1.533073 do and the
+-1.533073 extension and ,
+-1.533073 flesh and blood
+-0.1249387 hantemirov and shiyatov
+-1.533073 illusion and outright
+-1.533073 longest and most
+-1.533073 osborn and briffa
+-1.533073 paleoclimatologists and got
+-1.533073 passing and it
+-1.533073 possible and even
+-1.533073 powers and principalities
+-1.533073 religion and politics
+-1.533073 size and potential
+-1.533073 time and the
+-1.533073 variability and hantemirov
+-1.533073 widths and temperature
+-1.533073 but anti-divine powers
+-1.533073 in any journal
+-1.533073 one approach to
+-1.533073 to archive the
+-1.533073 politics are to
+-1.834103 the arkive down
+-1.834103 the arkive under
+-1.533073 journal article .
+-1.533073 science article ,
+-1.533073 this article on
+-1.533073 ! as it
+-1.533073 " as a
+-1.533073 , as ca
+-1.533073 . as noted
+-1.533073 2002 as follows:
+-1.533073 <s> as ca
+-1.533073 population as compared
+-1.533073 briffa asked for
+-1.533073 few at a
+-1.533073 humanity at the
+-1.533073 push at precisely
+-1.834103 my attention ,
+-1.834103 my attention .
+-1.533073 finally available ,
+-1.533073 on average ,
+-1.533073 further away )
+-1.533073 bill ayers and
+-1.533073 trans b ,
+-1.533073 the back-and-forth yesterday
+-1.533073 gil bailie .
+-1.533073 not be included
+-1.533073 will be happening
+-1.533073 but because so
+-1.533073 always been an
+-1.533073 had been projected
+-1.834103 have been concerned
+-1.834103 have been done
+-1.533073 noted before ,
+-1.834103 to begin in
+-1.834103 to begin with
+-1.533073 of being true
+-1.533073 but between the
+-1.533073 difference between the
+-1.533073 just between ring
+-1.533073 potential bias introduced
+-1.533073 same bias towards
+-1.533073 this bias would
+-1.533073 unintentional bias ,
+-1.533073 the biased selection
+-1.533073 the biblical passage
+-1.533073 by bill ayers
+-1.533073 hs blade was
+-1.533073 and blood ,
+-1.533073 , bob ?
+-1.533073 whose book was
+-1.533073 , briffa asked
+-1.834103 . briffa 2000
+-1.834103 . briffa used
+-1.533073 and briffa 2006
+-1.533073 chronology briffa et
+-1.533073 in briffa 2000
+-1.533073 requiring briffa to
+-1.533073 with briffa 2000
+-1.533073 <s> briffa's own
+-1.533073 in briffa's yamal
+-2.487315 , but ,
+-2.487315 , but anti-divine
+-2.487315 , but because
+-2.487315 , but between
+-1.467762 , but it
+-2.487315 , but the
+-2.487315 , but this
+-2.487315 , but to
+-1.533073 . but given
+-1.533073 <s> but it's
+-1.533073 comment by magnus
+-1.533073 ghostwritten by bill
+-1.533073 introduced by how
+-1.533073 measured by the
+-1.533073 piece by gil
+-1.533073 supplemented by the
+-1.533073 used by hantemirov
+-0.8145491 as ca readers
+-1.533073 i can combine
+-1.533073 they can see
+-1.533073 a case where
+-1.533073 that cast these
+-1.533073 comments catch my
+-1.533073 things caught my
+-1.533073 own caveats on
+-1.533073 preserve centennial-scale variability
+-1.533073 , cf .
+-1.533073 to change with
+-1.533073 . changing what
+-1.533073 that characterizes northern
+-1.533073 i checked earlier
+-1.533073 mean chronology ,
+-1.533073 rcs chronology method
+-1.533073 the chronology briffa
+-1.834103 this chronology also
+-1.834103 this chronology in
+-2.010194 yamal chronology has
+-2.010194 yamal chronology was
+-2.010194 yamal chronology with
+-1.533073 wright's church for
+-1.533073 crack cocaine for
+-1.533073 subfossil collection does
+-1.533073 in combination with
+-1.533073 can combine the
+-1.533073 when combined with
+-1.533073 a comment by
+-1.533073 of commentary on
+-1.533073 a commenter remarked
+-1.533073 his comments catch
+-1.533073 as compared to
+-1.533073 been concerned about
+-1.533073 " concrete "
+-1.533073 in connection with
+-1.834103 the conservatives said
+-1.834103 the conservatives were
+-1.533073 is considered "
+-1.533073 population consists ,
+-1.533073 to constructing a
+-1.533073 to control the
+-1.533073 under control !
+-1.533073 12 cores .
+-1.533073 picked cores ,
+-1.533073 their cores were
+-1.533073 " corridor method
+-1.533073 like crack cocaine
+-1.533073 wise crack about
+-1.533073 the crossroads .
+-1.533073 , cru staunchly
+-0.9906404 the cru population
+-2.010194 the cru selection
+-1.533073 of darkness and
+-1.533073 different data policy
+-1.834103 measurement data remained
+-1.834103 measurement data used
+-1.533073 schweingruber data set
+-1.533073 subfossil data .
+-1.533073 taymir data (
+-1.533073 the data .
+-1.533073 these data were
+-1.533073 yamal data was
+-1.533073 the day to
+-1.533073 to day politics
+-1.834103 few days .
+-1.834103 few days ago
+-1.533073 national debt ,
+-1.533073 precipitous decline is
+-1.533073 happening deep into
+-1.533073 of deeper principles
+-1.533073 , delete a
+-1.533073 is derived from
+-1.533073 trees described in
+-1.533073 what did they
+-1.533073 which did not
+-1.533073 the difference between
+-1.533073 this difference .
+-1.533073 a different data
+-1.533073 to different aging
+-0.8145491 " divergence problem
+-1.533073 we do indeed
+-1.533073 would do and
+-1.533073 collection does not
+-1.533073 is doing exactly
+-1.533073 they don't really
+-1.533073 been done without
+-1.533073 no doubt what
+-1.533073 arkive down to
+-1.533073 not due just
+-1.533073 checked earlier this
+-1.533073 trans editors finally
+-1.533073 immense energy ,
+-1.533073 its enormous hs
+-1.533073 briffa et al
+-1.533073 kaufman et al
+-1.533073 and even probable
+-1.533073 virtually every subsequent
+-1.533073 doing exactly what
+-1.533073 an exception to
+-1.533073 for excluding khadyta
+-1.533073 they expect from
+-1.533073 an extension and
+-1.533073 predictable factors ,
+-1.533073 outright fantasy had
+-1.533073 the far more
+-2.010194 a few at
+-0.9906404 a few days
+-1.533073 editors finally seized
+-1.533073 information finally available
+-1.533073 was finally placed
+-1.533073 . first ,
+-1.533073 against flesh and
+-1.533073 the following: </s>
+-1.533073 as follows: </s>
+-1.533073 , for we
+-1.533073 asked for an
+-1.533073 church for all
+-1.533073 cocaine for paleoclimatologists
+-1.533073 reason for excluding
+-1.533073 voted for him
+-1.533073 northern forests .
+-1.533073 , from 200–400
+-1.533073 derived from a
+-1.533073 expect from someone
+-1.533073 without fully thinking
+-1.533073 " further along
+-1.533073 much further away
+-1.533073 the further ahead
+-1.533073 . furthermore ,
+-1.533073 the future .
+-1.533073 a generating script
+-1.533073 slowly get the
+-1.533073 to get the
+-1.533073 my ghastly tendency
+-1.533073 was ghostwritten by
+-1.533073 by gil bailie
+-1.533073 but given the
+-1.533073 not going to
+-1.533073 and got used
+-1.533073 a great idea
+-1.533073 your great idea
+-1.533073 keeps growing .
+-1.533073 it grows more
+-1.533073 ) had jurisdiction
+-1.533073 fantasy had been
+-1.533073 i had a
+-1.533073 paul had in
+-1.533073 which had a
+-1.533073 and hantemirov and
+-1.533073 by hantemirov and
+-1.533073 in hantemirov and
+-1.533073 be happening deep
+-1.533073 is happening right
+-1.533073 what happens today
+-1.533073 very hard to
+-1.533073 i hardly know
+-1.533073 also has a
+-1.533073 analysis has only
+-1.533073 chronology has always
+-1.533073 it has the
+-1.533073 him hate to
+-2.010194 not have been
+-2.010194 not have similarly
+-2.010194 not have the
+-1.533073 that have relied
+-1.533073 well have been
+-1.533073 will have an
+-1.533073 i haven't read
+-1.533073 , he wrote
+-1.533073 and he is
+-1.533073 said he would
+-1.533073 that he is
+-1.533073 when he made
+-1.533073 observed here prove
+-1.533073 is highly possible
+-1.533073 for him hate
+-1.533073 into him to
+-1.533073 , his initial
+-1.533073 of his comments
+-1.533073 that his precipitous
+-1.533073 by how their
+-0.8145491 . however ,
+-1.533073 enormous hs blade
+-1.533073 unveiled: humanity at
+-2.010194 , i can
+-2.010194 , i noticed
+-2.010194 , i skimmed
+-1.533073 . i haven't
+-1.533073 <s> i hardly
+-1.533073 ? i know
+-1.533073 morning i had
+-1.533073 passage i stumbled
+-1.533073 when i checked
+-1.533073 " i'd love
+-1.533073 and i've provided
+-1.834103 great idea ,
+-1.834103 great idea .
+-1.533073 , if it
+-1.533073 <s> if the
+-1.533073 much illusion and
+-1.533073 an immense energy
+-1.533073 important impact on
+-1.533073 an important impact
+-1.533073 ( in a
+-1.533073 . in response
+-1.533073 <s> in one
+-1.533073 acquiesced in this
+-1.533073 begin in terms
+-1.533073 chronology in passing
+-1.533073 described in hantemirov
+-1.533073 had in mind
+-1.533073 patterns in the
+-1.533073 principles in the
+-1.533073 published in any
+-1.533073 refusal in connection
+-1.533073 sat in rev
+-1.533073 set in combination
+-1.533073 subset in briffa
+-2.010194 used in a
+-2.010194 used in briffa's
+-2.010194 used in virtually
+-1.533073 be included with
+-1.533073 , including ,
+-1.533073 while including the
+-1.533073 do indeed see
+-1.533073 224 individual series
+-1.533073 the information finally
+-1.533073 against inhomogeneities ,
+-1.533073 his initial use
+-1.533073 ! instead of
+-1.533073 population instead of
+-1.533073 of interannual variability
+-1.533073 deep into the
+-1.533073 projected into him
+-1.533073 bias introduced by
+-1.533073 , is considered
+-1.533073 decline is not
+-1.834103 he is always
+-1.834103 he is doing
+-2.010194 it is ,
+-2.010194 it is highly
+-2.010194 it is within
+-1.533073 obama is that
+-1.533073 selection is derived
+-1.533073 sensitivity is measured
+-1.834103 this is no
+-1.834103 this is the
+-1.533073 versions is related
+-1.533073 what is happening
+-1.834103 , it has
+-1.834103 , it originated
+-1.533073 . it is
+-1.533073 and it was
+-1.533073 as it is
+-1.834103 but it just
+-1.834103 but it looks
+-1.533073 if it grows
+-1.533073 more it is
+-1.533073 read it yet
+-1.834103 , it's like
+-1.834103 , it's very
+-1.533073 but it's not
+-1.533073 where it's much
+-1.533073 with its enormous
+-1.533073 any journal article
+-1.533073 previous journal (
+-1.533073 had jurisdiction .
+-1.533073 due just to
+-1.533073 it just keeps
+-1.533073 not just between
+-1.533073 reposting just one
+-1.533073 , kaufman et
+-1.533073 just keeps growing
+-1.533073 excluding khadyta river
+-1.533073 schweingruber's khadyta river
+-1.533073 also know ,
+-1.533073 hardly know where
+-1.533073 i know !
+-1.533073 readers know ,
+-1.533073 yamal larch sample
+-1.533073 living larches .
+-1.533073 subfossil larches were
+-1.533073 the large-scale "
+-1.533073 it's like trying
+-1.533073 was like crack
+-1.533073 very limited size
+-1.533073 old living larches
+-1.533073 the longest and
+-1.533073 while looking up
+-1.533073 it looks relevant
+-1.533073 i'd love to
+-1.533073 he made that
+-0.8145491 ( mag )
+-1.533073 the magnitude of
+-1.533073 by magnus .
+-1.533073 which makes the
+-1.533073 on many multiproxy
+-1.533073 2000 may well
+-1.533073 a mean chronology
+-1.533073 is measured by
+-1.533073 the measurement data
+-1.533073 yamal measurement data
+-1.533073 chronology method that
+-1.533073 corridor method "
+-1.533073 rcs method .
+-1.533073 this method which
+-1.533073 rcs methodology warn
+-1.533073 in mind when
+-1.533073 to mix religion
+-1.533073 far more "
+-1.533073 grows more slowly
+-1.533073 the more it
+-1.533073 this morning i
+-1.533073 , most recently
+-1.533073 and most sensitive
+-1.533073 the most recent
+-1.533073 it's much further
+-1.533073 so much illusion
+-1.533073 the multi-parters ,
+-1.533073 many multiproxy studies
+-1.533073 of mundane politics
+-1.533073 about my ghastly
+-1.533073 catch my attention
+-1.533073 caught my attention
+-1.533073 the national debt
+-1.533073 all-around naughtiness .
+-1.533073 the nettle ,
+-1.533073 was never properly
+-1.533073 <s> next ,
+-1.533073 is no doubt
+-1.533073 the non-robustness observed
+-1.533073 characterizes northern forests
+-1.533073 - not just
+-1.533073 about not struggling
+-1.533073 did not preserve
+-1.533073 does not have
+-1.533073 is not due
+-1.533073 it's not going
+-1.533073 should not be
+-1.533073 were not using
+-0.8145491 would not have
+-1.533073 as noted before
+-1.533073 i noticed that
+-1.533073 , notwithstanding these
+-1.533073 right now ,
+-1.834103 with obama ,
+-1.834103 with obama is
+-1.533073 non-robustness observed here
+-1.533073 , of older
+-1.533073 addition of 17
+-1.533073 all of his
+-1.834103 instead of reposting
+-1.834103 instead of the
+-1.533073 magnitude of interannual
+-1.533073 principalities of darkness
+-1.533073 realm of mundane
+-1.834103 selection of old
+-1.834103 selection of older
+-1.533073 series of subfossil
+-1.533073 shadows of deeper
+-1.533073 terms of commentary
+-1.533073 unrepresentativeness of the
+-1.834103 use of a
+-1.834103 use of this
+-1.533073 virtue of being
+-1.533073 of old trees
+-1.533073 year old living
+-0.8145491 of older trees
+-1.533073 towards older trees
+-1.533073 one oldie ,
+-1.533073 , on average
+-1.533073 article on the
+-1.533073 caveats on rcs
+-1.533073 commentary on this
+-1.533073 impact on many
+-1.533073 play on a
+-1.533073 relied on this
+-1.533073 selected on a
+-1.533073 in one approach
+-1.533073 just one oldie
+-1.533073 recent one .
+-1.533073 placed online .
+-1.533073 has only taken
+-1.533073 ( or real
+-1.533073 it originated with
+-1.533073 with osborn and
+-1.533073 prove out (
+-1.533073 and outright fantasy
+-1.533073 briffa's own caveats
+-1.533073 for paleoclimatologists and
+-1.533073 biblical passage i
+-1.533073 in passing and
+-1.533073 the path "
+-1.533073 aging patterns in
+-1.533073 what paul had
+-1.533073 the people that
+-1.533073 , perhaps the
+-1.533073 . perhaps the
+-1.834103 <s> perhaps the
+-1.834103 <s> perhaps there's
+-1.533073 and perhaps they
+-1.533073 ( phil trans
+-1.533073 the phil trans
+-1.533073 12 picked cores
+-1.533073 this piece by
+-1.533073 right place .
+-1.533073 finally placed online
+-1.533073 shadow play on
+-1.533073 the point that
+-1.533073 data policy )
+-1.533073 and politics ,
+-1.533073 day politics are
+-1.533073 mundane politics .
+-1.834103 cru population .
+-1.834103 cru population consists
+-2.010194 schweingruber population .
+-2.010194 schweingruber population as
+-2.010194 schweingruber population instead
+-1.533073 the position that
+-1.533073 highly possible and
+-1.533073 about potential bias
+-1.533073 and potential unrepresentativeness
+-1.533073 your power to
+-1.533073 anti-divine powers and
+-1.533073 his precipitous decline
+-1.533073 at precisely the
+-1.533073 usual predictable factors
+-1.533073 2000 presented this
+-1.533073 not preserve centennial-scale
+-1.533073 the previous journal
+-1.533073 and principalities of
+-1.533073 deeper principles in
+-1.533073 a prior selection
+-1.533073 even probable that
+-0.8145491 divergence problem "
+-1.533073 been projected into
+-1.533073 never properly published
+-1.533073 here prove out
+-1.533073 to provide the
+-1.533073 i've provided a
+-1.533073 a provocative thought
+-1.533073 properly published in
+-1.533073 small push at
+-1.533073 a rcs chronology
+-1.533073 on rcs methodology
+-1.533073 the rcs method
+-1.533073 really react to
+-1.533073 haven't read it
+-1.834103 ca readers also
+-1.834103 ca readers know
+-1.533073 worth reading ,
+-1.533073 or real )
+-1.533073 don't really react
+-1.533073 the realm of
+-1.533073 some reason why
+-1.533073 valid reason for
+-1.533073 most recent one
+-1.533073 most recently ,
+-1.533073 until recently ,
+-1.533073 yamal reconstruction .
+-1.533073 this refusal in
+-1.533073 staunchly refused to
+-1.533073 is related to
+-1.533073 looks relevant ,
+-1.533073 have relied on
+-1.533073 mix religion and
+-1.533073 data remained unarchived
+-1.533073 commenter remarked that
+-1.533073 of reposting just
+-1.533073 , requiring briffa
+-1.533073 in response to
+-1.533073 the resulting yamal
+-1.533073 in rev .
+-1.533073 happening right now
+-1.834103 the right place
+-1.834103 the right time
+-1.533073 were right .
+-1.533073 between ring widths
+-1.533073 17 ring-width series
+-0.8145491 khadyta river ,
+-1.533073 conservatives said he
+-1.533073 the same bias
+-1.533073 larch sample should
+-1.533073 and sat in
+-1.533073 similar schweingruber data
+-0.1249387 the schweingruber population
+-1.533073 why schweingruber's khadyta
+-1.533073 . science (
+-1.533073 a science article
+-1.533073 and science (
+-1.533073 generating script )
+-1.533073 can see the
+-1.533073 indeed see the
+-1.533073 you see ,
+-1.533073 finally seized the
+-1.834103 were selected .
+-1.834103 were selected on
+-1.533073 biased selection of
+-1.533073 cru selection is
+-1.533073 prior selection of
+-1.533073 most sensitive series
+-1.533073 where sensitivity is
+-1.533073 individual series of
+-1.533073 ring-width series ,
+-1.533073 sensitive series ,
+-1.533073 data set in
+-1.533073 . several things
+-1.533073 a shadow play
+-1.533073 the shadows of
+-1.533073 these shadows .
+-2.010194 and shiyatov 2002
+-2.010194 and shiyatov themselves
+-2.010194 and shiyatov would
+-1.533073 sample should not
+-1.533073 a similar schweingruber
+-1.533073 have similarly affected
+-1.533073 , since this
+-1.533073 limited size and
+-1.533073 i skimmed this
+-1.533073 more slowly ,
+-1.533073 way slowly get
+-1.533073 a small push
+-1.533073 because so much
+-1.533073 there's some reason
+-1.533073 from someone whose
+-1.533073 to start today
+-1.533073 cru staunchly refused
+-1.533073 not struggling against
+-1.533073 multiproxy studies that
+-1.533073 subsequent study ,
+-1.533073 this study .
+-1.533073 i stumbled upon
+-1.533073 of subfossil larches
+-1.533073 the subfossil collection
+-1.533073 yamal subfossil data
+-1.533073 every subsequent study
+-1.533073 this subset in
+-1.533073 " success .
+-1.533073 taymir supplement .
+-1.533073 were supplemented by
+-1.533073 a surface ,
+-1.533073 would take an
+-1.533073 only taken a
+-1.834103 the taymir data
+-1.834103 the taymir supplement
+-1.533073 and temperature ,
+-1.533073 ghastly tendency to
+-1.533073 in terms of
+-1.533073 trees than the
+-1.533073 " that characterizes
+-1.533073 admit that the
+-1.533073 and that way
+-1.533073 is that he
+-1.533073 made that wise
+-1.533073 method that they
+-1.533073 noticed that the
+-1.533073 people that voted
+-1.533073 point that his
+-1.533073 position that the
+-1.533073 probable that the
+-1.533073 remarked that "
+-1.533073 studies that have
+-1.533073 things that cast
+-1.533073 " the trouble
+-2.010194 , the more
+-2.010194 , the resulting
+-2.010194 , the yamal
+-1.533073 . the cru
+-1.834103 <s> the subfossil
+-1.834103 <s> the yamal
+-1.533073 affected the "
+-1.533073 along the path
+-2.010194 and the people
+-2.010194 and the phil
+-2.010194 and the right
+-1.533073 archive the data
+-1.533073 at the crossroads
+-0.8145491 between the two
+-1.533073 but the further
+-1.834103 by the addition
+-1.834103 by the magnitude
+-1.533073 combine the multi-parters
+-1.533073 control the national
+-0.8145491 get the arkive
+-1.533073 given the use
+-1.533073 has the virtue
+-1.533073 have the same
+-1.533073 if the non-robustness
+-1.834103 in the realm
+-1.834103 in the schweingruber
+-1.533073 including the taymir
+-1.533073 into the future
+-1.533073 is the most
+-1.533073 makes the point
+-0.8145491 of the 12
+-1.533073 on the trouble
+-2.010194 perhaps the biased
+-2.010194 perhaps the day
+-2.010194 perhaps the difference
+-1.533073 precisely the right
+-1.533073 provide the measurement
+-1.834103 see the far
+-1.834103 see the shadows
+-1.533073 seized the nettle
+-1.533073 than the schweingruber
+-2.135133 that the conservatives
+-2.135133 that the cru
+-2.135133 that the previous
+-2.135133 that the yamal
+-1.533073 through the very
+-2.135133 to the back-and-forth
+-2.135133 to the cru
+-2.135133 to the large-scale
+-2.135133 to the usual
+-1.533073 took the position
+-1.533073 up the biblical
+-1.533073 used the chronology
+-1.533073 using the schweingruber
+-1.533073 were the longest
+-1.533073 what the conservatives
+-1.533073 while the yamal
+-2.135133 with the information
+-2.135133 with the rcs
+-2.135133 with the taymir
+-2.135133 with the yamal
+-1.533073 wrote the following:
+-1.533073 how their cores
+-1.533073 shiyatov themselves ,
+-1.533073 they themselves were
+-1.533073 perhaps there's some
+-1.834103 . these data
+-1.834103 . these were
+-1.533073 cast these shadows
+-1.533073 notwithstanding these warnings
+-1.533073 and they can
+-1.533073 did they expect
+-1.533073 perhaps they don't
+-1.533073 that they themselves
+-1.533073 ) things that
+-1.533073 several things caught
+-1.533073 to think up
+-1.533073 fully thinking through
+-2.010194 , this analysis
+-2.010194 , this chronology
+-2.010194 , this will
+-1.834103 . this bias
+-1.834103 . this is
+-1.533073 <s> this morning
+-1.533073 but this is
+-1.533073 earlier this year
+-1.533073 in this refusal
+-1.533073 of this subset
+-1.834103 on this difference
+-1.834103 on this study
+-1.533073 presented this chronology
+-1.533073 since this method
+-1.533073 skimmed this article
+-1.533073 upon this piece
+-1.533073 all those years
+-1.533073 to those "
+-1.533073 provocative thought .
+-1.533073 a thousand ,
+-1.533073 thinking through the
+-1.533073 a time ,
+-1.533073 right time and
+-1.533073 approach to constructing
+-1.533073 are to those
+-1.533073 briffa to archive
+-1.533073 but to what
+-1.533073 compared to the
+-1.533073 day to day
+-1.533073 down to about
+-1.533073 exception to the
+-1.533073 going to start
+-1.533073 hard to think
+-1.533073 hate to admit
+-1.533073 him to begin
+-1.533073 just to the
+-1.533073 love to get
+-1.533073 power to change
+-1.533073 react to what
+-1.533073 refused to provide
+-1.533073 related to different
+-1.533073 response to the
+-1.533073 tendency to mix
+-1.533073 trying to control
+-1.533073 where to begin
+-1.533073 happens today would
+-1.533073 start today .
+-1.533073 ) took the
+-1.533073 bias towards older
+-1.834103 phil trans b
+-1.834103 phil trans editors
+-1.533073 old trees described
+-2.010194 older trees .
+-2.010194 older trees an
+-2.010194 older trees than
+-0.8145491 the trouble with
+-1.533073 being true ,
+-1.533073 like trying to
+-0.8145491 the two versions
+-1.533073 remained unarchived .
+-1.533073 arkive under control
+-1.533073 an unintentional bias
+-1.533073 potential unrepresentativeness of
+-1.533073 , until recently
+-1.533073 violence unveiled: humanity
+-1.533073 looking up the
+-1.533073 think up a
+-1.533073 stumbled upon this
+-1.533073 initial use of
+-1.533073 the use of
+-1.533073 " used by
+-1.533073 briffa used the
+-1.533073 data used in
+-1.533073 got used in
+-1.533073 was used in
+-1.533073 , using the
+-1.533073 not using .
+-1.533073 the usual predictable
+-1.533073 a valid reason
+-1.533073 centennial-scale variability and
+-1.533073 interannual variability .
+-1.834103 two versions .
+-1.834103 two versions is
+-1.533073 it's very hard
+-1.533073 the very limited
+-1.533073 . violence unveiled:
+-1.533073 in virtually every
+-1.533073 the virtue of
+-1.533073 that voted for
+-1.533073 methodology warn against
+-1.533073 these warnings ,
+-1.533073 blade was like
+-1.533073 book was ghostwritten
+-1.533073 chronology was used
+-1.533073 data was finally
+-1.533073 it was never
+-1.533073 that way slowly
+-1.533073 for we do
+-1.533073 may well have
+-1.533073 conservatives were right
+-1.533073 cores were selected
+-1.533073 data were supplemented
+-1.533073 larches were selected
+-1.533073 themselves were not
+-1.533073 these were the
+-1.533073 . what did
+-1.533073 <s> what a
+-1.533073 changing what happens
+-1.533073 doubt what paul
+-1.533073 exactly what the
+-1.834103 to what is
+-1.834103 to what will
+-1.533073 . what's your
+-1.834103 , when combined
+-1.834103 , when i
+-1.533073 mind when he
+-1.533073 , where sensitivity
+-1.533073 case where it's
+-1.533073 know where to
+-1.533073 ( which had
+-1.834103 , which ,
+-1.834103 , which makes
+-1.533073 method which did
+-1.533073 ( while looking
+-1.533073 , while including
+-1.533073 . while the
+-1.533073 someone whose book
+-1.533073 reason why schweingruber's
+-1.533073 ring widths and
+-1.533073 this will have
+-1.533073 what will be
+-1.533073 that wise crack
+-1.533073 . with the
+-1.533073 begin with .
+-1.533073 change with a
+-1.533073 chronology with its
+-1.533073 combination with the
+-1.533073 combined with the
+-1.533073 connection with osborn
+-1.533073 included with the
+-1.533073 originated with briffa
+-0.8145491 trouble with obama
+-1.533073 is within your
+-1.533073 done without fully
+-1.533073 always worth reading
+-1.533073 bias would not
+-1.533073 he would do
+-1.533073 shiyatov would not
+-1.533073 today would take
+-1.533073 . wright's church
+-1.533073 he wrote the
+-1.533073 , yamal larch
+-1.533073 briffa's yamal reconstruction
+-1.533073 resulting yamal chronology
+-1.212489 the yamal chronology
+-2.232043 the yamal data
+-2.232043 the yamal measurement
+-2.232043 the yamal subfossil
+-1.533073 200–400 year old
+-1.533073 this year ,
+-1.533073 those years ?
+-1.533073 ' yes ,
+-1.533073 back-and-forth yesterday about
+-1.533073 it yet ,
+-1.533073 ahead you see
+-1.533073 what's your great
+-1.533073 within your power
+
+\end\
diff --git a/decoder/test_data/grammar.prune b/decoder/test_data/grammar.prune
new file mode 100644
index 00000000..4ebcb509
--- /dev/null
+++ b/decoder/test_data/grammar.prune
@@ -0,0 +1,196 @@
+[PHRASE] ||| [PHRASE,1] haus ||| [PHRASE,1] house ||| 1.86183 0 0 0 0.0211892
+[PHRASE] ||| [PHRASE,1] haus ist ||| is [PHRASE,1] house ||| 2.58883 0.311249 0 0.348455 0.0211893
+[PHRASE] ||| [PHRASE,1] haus gibt ||| is [PHRASE,1] house ||| 2.56863 0.291046 0 0.258278 0.0211893
+[PHRASE] ||| [PHRASE,1] ein haus ist ||| [PHRASE,1] is a house ||| 3.16286 0 0 0.576934 0.0211893
+[PHRASE] ||| [PHRASE,1] ist ||| [PHRASE,1] is ||| 2.94101 0 0.676694 0.348455 0
+[PHRASE] ||| [PHRASE,1] ist ||| is [PHRASE,1] ||| 2.36698 0.649056 0.102662 0.348455 0
+[PHRASE] ||| [PHRASE,1] klein ist ||| [PHRASE,1] is small ||| 2.58883 0.124939 0 0.78211 0
+[PHRASE] ||| [PHRASE,1] maus ||| [PHRASE,1] mouse ||| 2.09592 0 0 0 0
+[PHRASE] ||| [PHRASE,1] maus gibt ||| is [PHRASE,1] mouse ||| 2.44865 0 0 0.258278 0
+[PHRASE] ||| [PHRASE,1] kleines ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.579784 0
+[PHRASE] ||| [PHRASE,1] kleines haus ||| [PHRASE,1] small house ||| 3.24204 0 0 0.579784 0.0211893
+[PHRASE] ||| [PHRASE,1] kleines haus gibt ||| is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893
+[PHRASE] ||| [PHRASE,1] kleine ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.500602 0
+[PHRASE] ||| [PHRASE,1] kleine maus ||| [PHRASE,1] small mouse ||| 3.24204 0 0 0.500602 0
+[PHRASE] ||| [PHRASE,1] kleine maus gibt ||| is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0
+[PHRASE] ||| [PHRASE,1] gelb ||| [PHRASE,1] yellow ||| 2.63998 0 0 0 0
+[PHRASE] ||| [PHRASE,1] gelb haus ||| [PHRASE,1] yellow house ||| 3.24204 0 0 0 0.0211893
+[PHRASE] ||| [PHRASE,1] gelb haus gibt ||| is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893
+[PHRASE] ||| [PHRASE,1] gelb maus ||| [PHRASE,1] yellow mouse ||| 3.24204 0 0 0 0
+[PHRASE] ||| [PHRASE,1] gelb maus gibt ||| is [PHRASE,1] yellow mouse ||| 3.30899 0 0 0.258278 0
+[PHRASE] ||| [PHRASE,1] gibt ||| is [PHRASE,1] ||| 1.82827 0.110339 0 0.258278 0
+[PHRASE] ||| haus ||| small yellow mouse house ||| 2.46389 0.845098 1.30103 0.278754 1.34341
+[PHRASE] ||| haus ||| house ||| Phrase_0=1.18514 Phrase_2=0.0222764 Phrase_4=0.0211893
+[PHRASE] ||| haus [PHRASE,1] ||| house [PHRASE,1] ||| 2.2878 0 0 0 0.0211893
+[PHRASE] ||| haus ist ||| house is ||| 2.46389 0 0 0.348455 0.0211893
+[PHRASE] ||| haus klein ist ||| house is small ||| 2.2878 0 0 0.78211 0.0211893
+[PHRASE] ||| ein ||| a ||| Phrase_0=1.34995 Phrase_1=0.228479 Phrase_3=0.228479
+[PHRASE] ||| ein [PHRASE,1] ||| a [PHRASE,1] ||| 2.03792 0.290035 0 0.228479 0
+[PHRASE] ||| ein [PHRASE,1] haus ||| a [PHRASE,1] house ||| 2.94101 0 0 0.228479 0.0211893
+[PHRASE] ||| ein [PHRASE,1] haus gibt ||| is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893
+[PHRASE] ||| ein [PHRASE,1] ist ||| is a [PHRASE,1] ||| 2.58883 0.535113 0 0.576934 0
+[PHRASE] ||| ein [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.56863 0.51491 0 0.486757 0
+[PHRASE] ||| ein haus ||| a house ||| 1.76492 0 0.0791813 0.228479 0.0211893
+[PHRASE] ||| ein haus ||| a small house ||| 2.46389 0.30103 0.778151 0.507233 1.34341
+[PHRASE] ||| ein haus ist ||| is a house ||| 2.76492 0.477121 0 0.576934 0.0211893
+[PHRASE] ||| ein haus gibt ||| is a house ||| 2.46389 0.176091 0.176091 0.486757 0.0211893
+[PHRASE] ||| ein haus gibt ||| is a small house ||| 2.76492 0.39794 0.477121 0.765511 1.34341
+[PHRASE] ||| ein kleines ||| a small ||| 1.86183 0.243038 0 0.808263 0
+[PHRASE] ||| ein kleines [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.808263 0
+[PHRASE] ||| ein kleines [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0
+[PHRASE] ||| ein kleines haus ||| a small house ||| 2.46389 0.30103 0 0.808263 0.0211893
+[PHRASE] ||| ein kleines haus ist ||| is a small house ||| 2.76492 0.39794 0 1.15672 0.0211893
+[PHRASE] ||| ein kleines haus gibt ||| is a small house ||| 3.06595 0.69897 0 1.06654 0.0211893
+[PHRASE] ||| ein kleines gelb ||| a small yellow ||| 2.94101 0.30103 0 0.808263 0
+[PHRASE] ||| ein kleines gelb haus ||| a small yellow house ||| 3.24204 0 0 0.808263 0.0211893
+[PHRASE] ||| ein kleines gelb haus gibt ||| is a small yellow house ||| 3.30899 0 0 1.06654 0.0211893
+[PHRASE] ||| ein gelb ||| a yellow ||| 1.98677 0.221849 0 0.228479 0
+[PHRASE] ||| ein gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.228479 0
+[PHRASE] ||| ein gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0
+[PHRASE] ||| ein gelb haus ||| a yellow house ||| 2.63998 0 0 0.228479 0.0211893
+[PHRASE] ||| ein gelb haus ist ||| is a yellow house ||| 3.06595 0.30103 0 0.576934 0.0211893
+[PHRASE] ||| ein gelb haus gibt ||| is a yellow house ||| 3.06595 0.30103 0 0.486757 0.0211893
+[PHRASE] ||| ein gelb kleines ||| a yellow small ||| 2.94101 0.30103 0 0.808263 0
+[PHRASE] ||| ein gelb kleines haus ||| a yellow small house ||| 3.24204 0 0 0.808263 0.0211893
+[PHRASE] ||| ein gelb kleines haus gibt ||| is a yellow small house ||| 3.30899 0 0 1.06654 0.0211893
+[PHRASE] ||| ist ||| is ||| 1.34995 0.348455 0 0.348455 0
+[PHRASE] ||| klein ||| small ||| 1.61879 0.410174 0 0.433656 0
+[PHRASE] ||| klein [PHRASE,1] ||| [PHRASE,1] small ||| 3.06595 0.564271 0 0.433656 0
+[PHRASE] ||| klein [PHRASE,1] ist ||| [PHRASE,1] is small ||| 3.06595 0.60206 0 0.78211 0
+[PHRASE] ||| klein ist ||| is small ||| 1.68574 0 0 0.78211 0
+[PHRASE] ||| klein das [PHRASE,1] ||| the [PHRASE,1] small ||| 3.06595 0 0 0.433656 0.30103
+[PHRASE] ||| klein das haus ist ||| the house is small ||| 3.06595 0.477121 0 0.78211 0.322219
+[PHRASE] ||| maus ||| mouse ||| 1.50965 0 0 0 0
+[PHRASE] ||| maus [PHRASE,1] ||| mouse [PHRASE,1] ||| 2.94101 0 0 0 0
+[PHRASE] ||| maus [PHRASE,1] ist ||| mouse is [PHRASE,1] ||| 2.94101 0 0 0.348455 0
+[PHRASE] ||| maus ein haus ist ||| mouse is a house ||| 2.94101 0 0 0.576934 0.0211893
+[PHRASE] ||| kleines ||| small ||| 1.76492 0.556302 0 0.579784 0
+[PHRASE] ||| kleines [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.579784 0
+[PHRASE] ||| kleines haus ||| small house ||| 1.86183 0.243038 0 0.579784 0.0211893
+[PHRASE] ||| kleines gelb ||| small yellow ||| 2.46389 0.30103 0 0.579784 0
+[PHRASE] ||| kleines gelb haus ||| small yellow house ||| 2.94101 0 0 0.579784 0.0211893
+[PHRASE] ||| kleine ||| small ||| 1.68574 0.477121 0 0.500602 0
+[PHRASE] ||| kleine [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.500602 0
+[PHRASE] ||| kleine haus ||| small house ||| 2.16286 0.544068 0 0.500602 0.0211893
+[PHRASE] ||| kleine maus ||| small mouse ||| 1.98677 0 0 0.500602 0
+[PHRASE] ||| kleine gelb ||| small yellow ||| 2.46389 0.30103 0 0.500602 0
+[PHRASE] ||| kleine gelb maus ||| small yellow mouse ||| 2.94101 0 0 0.500602 0
+[PHRASE] ||| gelb ||| yellow ||| 1.61879 0 0 0 0
+[PHRASE] ||| gelb [PHRASE,1] ||| yellow [PHRASE,1] ||| 2.63998 0 0 0 0
+[PHRASE] ||| gelb haus ||| yellow house ||| 1.98677 0 0 0 0.0211893
+[PHRASE] ||| gelb maus ||| yellow mouse ||| 2.16286 0 0 0 0
+[PHRASE] ||| gelb kleines ||| yellow small ||| 2.46389 0.30103 0 0.579784 0
+[PHRASE] ||| gelb kleines haus ||| yellow small house ||| 2.94101 0 0 0.579784 0.0211893
+[PHRASE] ||| gelb kleine ||| yellow small ||| 2.46389 0.30103 0 0.500602 0
+[PHRASE] ||| gelb kleine maus ||| yellow small mouse ||| 2.94101 0 0 0.500602 0
+[PHRASE] ||| eine ||| a ||| 1.50965 0.38818 0 0.38818 0
+[PHRASE] ||| eine [PHRASE,1] ||| a [PHRASE,1] ||| 2.0602 0.312311 0 0.38818 0
+[PHRASE] ||| eine [PHRASE,1] maus ||| a [PHRASE,1] mouse ||| 2.94101 0 0 0.38818 0
+[PHRASE] ||| eine [PHRASE,1] maus gibt ||| is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0
+[PHRASE] ||| eine [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.44865 0.394934 0 0.646458 0
+[PHRASE] ||| eine maus ||| a mouse ||| 1.98677 0 0 0.38818 0
+[PHRASE] ||| eine maus [PHRASE,1] ||| a mouse [PHRASE,1] ||| 3.16286 0 0 0.38818 0
+[PHRASE] ||| eine maus [PHRASE,1] ist ||| a mouse is [PHRASE,1] ||| 3.16286 0 0 0.736635 0
+[PHRASE] ||| eine maus ein haus ist ||| a mouse is a house ||| 3.16286 0 0 0.965114 0.0211893
+[PHRASE] ||| eine maus gibt ||| is a mouse ||| 2.46389 0 0 0.646458 0
+[PHRASE] ||| eine kleine ||| a small ||| 1.98677 0.367977 0 0.888783 0
+[PHRASE] ||| eine kleine [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.888783 0
+[PHRASE] ||| eine kleine [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0
+[PHRASE] ||| eine kleine maus ||| a small mouse ||| 2.63998 0 0 0.888783 0
+[PHRASE] ||| eine kleine maus gibt ||| is a small mouse ||| 2.76492 0 0 1.14706 0
+[PHRASE] ||| eine kleine gelb ||| a small yellow ||| 2.94101 0.30103 0 0.888783 0
+[PHRASE] ||| eine kleine gelb maus ||| a small yellow mouse ||| 3.24204 0 0 0.888783 0
+[PHRASE] ||| eine kleine gelb maus gibt ||| is a small yellow mouse ||| 3.30899 0 0 1.14706 0
+[PHRASE] ||| eine gelb ||| a yellow ||| 2.16286 0.39794 0 0.38818 0
+[PHRASE] ||| eine gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.38818 0
+[PHRASE] ||| eine gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0
+[PHRASE] ||| eine gelb maus ||| a yellow mouse ||| 2.94101 0 0 0.38818 0
+[PHRASE] ||| eine gelb maus gibt ||| is a yellow mouse ||| 3.06595 0 0 0.646458 0
+[PHRASE] ||| eine gelb kleine ||| a yellow small ||| 2.94101 0.30103 0 0.888783 0
+[PHRASE] ||| eine gelb kleine maus ||| a yellow small mouse ||| 3.24204 0 0 0.888783 0
+[PHRASE] ||| eine gelb kleine maus gibt ||| is a yellow small mouse ||| 3.30899 0 0 1.14706 0
+[PHRASE] ||| eine gruen ||| a green ||| 2.46389 0 0 0.38818 0
+[PHRASE] ||| eine gruen maus ||| a green mouse ||| 2.94101 0 0 0.38818 0
+[PHRASE] ||| gruen ||| green ||| 2.16286 0 0 0 0
+[PHRASE] ||| gruen maus ||| green mouse ||| 2.46389 0 0 0 0
+[PHRASE] ||| tages ||| day ||| 2.46389 0 0 0 0
+[PHRASE] ||| gibt ||| is ||| 1.25977 0.258278 0 0.258278 0
+[PHRASE] ||| meins ||| mine ||| 2.16286 0 0 0 0
+[PHRASE] ||| meins [PHRASE,1] ||| mine [PHRASE,1] ||| 2.76492 0 0 0 0
+[PHRASE] ||| meins ist ||| is mine ||| 2.46389 0 0 0.348455 0
+[PHRASE] ||| meins klein ist ||| mine is small ||| 2.76492 0 0 0.78211 0
+[PHRASE] ||| geld ||| money ||| 1.98677 0 0 0 0
+[PHRASE] ||| geld ist ||| is money ||| 2.46389 0.30103 0 0.348455 0
+[PHRASE] ||| geld gibt ||| is money ||| 2.46389 0.30103 0 0.258278 0
+[PHRASE] ||| keins ||| none ||| 1.98677 0 0 0 0
+[PHRASE] ||| keins [PHRASE,1] ||| none [PHRASE,1] ||| 2.76492 0 0 0 0
+[PHRASE] ||| keins klein ist ||| none is small ||| 2.76492 0 0 0.78211 0
+[PHRASE] ||| keins gibt ||| is none ||| 2.46389 0 0 0.258278 0
+[PHRASE] ||| dem haeuschen ||| of control ||| 2.46389 0 0 0.681241 0.425969
+[PHRASE] ||| eines ||| one ||| 2.46389 0.30103 0 0.30103 0
+[PHRASE] ||| eines tages ||| one day ||| 2.46389 0 0 0.30103 0
+[PHRASE] ||| eins ||| one ||| 2.46389 0.30103 0 0.30103 0
+[PHRASE] ||| aus ||| out ||| 2.46389 0 0.477121 0 0.221849
+[PHRASE] ||| aus ||| out of ||| 2.16286 0 0.176091 0.0791812 0.619789
+[PHRASE] ||| aus [PHRASE,1] ||| out [PHRASE,1] ||| 2.76492 0 0.367977 0 0.221849
+[PHRASE] ||| aus [PHRASE,1] ||| out of [PHRASE,1] ||| 2.63998 0 0.243038 0.0791812 0.619789
+[PHRASE] ||| aus ein ||| out of a ||| 2.46389 0 0 0.307661 0.619789
+[PHRASE] ||| aus ein haus ||| out of a house ||| 2.94101 0 0 0.307661 0.640978
+[PHRASE] ||| aus dem haeuschen ||| out of control ||| 2.76492 0 0 0.681241 0.647817
+[PHRASE] ||| aus das ||| out of the ||| 2.46389 0 0 0.0791812 0.920819
+[PHRASE] ||| aus das haus ||| out of the house ||| 2.94101 0 0 0.0791812 0.942008
+[PHRASE] ||| das ||| the ||| 1.76492 0 0.30103 0 0.30103
+[PHRASE] ||| das ||| that ||| 1.76492 0 0.30103 0 0.30103
+[PHRASE] ||| das [PHRASE,1] ||| the [PHRASE,1] ||| 2.39695 0 0.41972 0 0.30103
+[PHRASE] ||| das [PHRASE,1] ||| that [PHRASE,1] ||| 2.18514 0 0.207913 0 0.30103
+[PHRASE] ||| das [PHRASE,1] haus ist ||| that is [PHRASE,1] house ||| 2.86183 0 0 0.348455 0.322219
+[PHRASE] ||| das [PHRASE,1] ist ||| that is [PHRASE,1] ||| 2.86183 0 0 0.348455 0.30103
+[PHRASE] ||| das haus ||| the house ||| 1.86183 0 0 0 0.322219
+[PHRASE] ||| das haus [PHRASE,1] ||| the house [PHRASE,1] ||| 2.76492 0 0 0 0.322219
+[PHRASE] ||| das haus ist ||| the house is ||| 2.94101 0 0 0.348455 0.322219
+[PHRASE] ||| das haus klein ist ||| the house is small ||| 2.76492 0.176091 0 0.78211 0.322219
+[PHRASE] ||| das ein [PHRASE,1] ist ||| that is a [PHRASE,1] ||| 2.86183 0 0 0.576934 0.30103
+[PHRASE] ||| das ein kleines haus ist ||| that is a small house ||| 3.16286 0 0 1.15672 0.322219
+[PHRASE] ||| das ein gelb haus ist ||| that is a yellow house ||| 3.16286 0 0 0.576934 0.322219
+[PHRASE] ||| das klein ist ||| that is small ||| 2.76492 0 0 0.78211 0.30103
+[PHRASE] ||| das kleine ||| the small ||| 2.46389 0 0 0.500602 0.30103
+[PHRASE] ||| das kleine haus ||| the small house ||| 2.94101 0 0 0.500602 0.322219
+[PHRASE] ||| das meins ist ||| that is mine ||| 2.76492 0 0 0.348455 0.30103
+[PHRASE] ||| das geld ist ||| that is money ||| 2.76492 0 0 0.348455 0.30103
+[PHRASE] ||| es ||| there ||| 1.25977 0 0 0 0
+[PHRASE] ||| es [PHRASE,1] ||| there [PHRASE,1] ||| 1.83672 0 0 0 0
+[PHRASE] ||| es [PHRASE,1] haus gibt ||| there is [PHRASE,1] house ||| 2.62775 0 0 0.258278 0.0211893
+[PHRASE] ||| es [PHRASE,1] maus gibt ||| there is [PHRASE,1] mouse ||| 2.5166 0 0 0.258278 0
+[PHRASE] ||| es [PHRASE,1] kleines haus gibt ||| there is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893
+[PHRASE] ||| es [PHRASE,1] kleine maus gibt ||| there is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0
+[PHRASE] ||| es [PHRASE,1] gelb haus gibt ||| there is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893
+[PHRASE] ||| es [PHRASE,1] gelb maus gibt ||| there is [PHRASE,1] yellow mouse ||| 3.30899 0 0 0.258278 0
+[PHRASE] ||| es [PHRASE,1] gibt ||| there is [PHRASE,1] ||| 1.9536 0 0 0.258278 0
+[PHRASE] ||| es ein [PHRASE,1] haus gibt ||| there is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893
+[PHRASE] ||| es ein [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.62775 0.360151 0 0.486757 0
+[PHRASE] ||| es ein haus gibt ||| there is a house ||| 2.63998 0 0.176091 0.486757 0.0211893
+[PHRASE] ||| es ein haus gibt ||| there is a small house ||| 2.94101 0.20412 0.477121 0.765511 1.34341
+[PHRASE] ||| es ein kleines [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0
+[PHRASE] ||| es ein kleines haus gibt ||| there is a small house ||| 3.16286 0.425969 0 1.06654 0.0211893
+[PHRASE] ||| es ein gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0
+[PHRASE] ||| es ein gelb haus gibt ||| there is a yellow house ||| 3.16286 0 0 0.486757 0.0211893
+[PHRASE] ||| es eine [PHRASE,1] maus gibt ||| there is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0
+[PHRASE] ||| es eine [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.5166 0.249001 0 0.646458 0
+[PHRASE] ||| es eine maus gibt ||| there is a mouse ||| 2.63998 0 0 0.646458 0
+[PHRASE] ||| es eine kleine [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0
+[PHRASE] ||| es eine kleine maus gibt ||| there is a small mouse ||| 2.86183 0 0 1.14706 0
+[PHRASE] ||| es eine gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0
+[PHRASE] ||| es eine gelb maus gibt ||| there is a yellow mouse ||| 3.16286 0 0 0.646458 0
+[PHRASE] ||| es geld gibt ||| there is money ||| 2.76492 0 0 0.258278 0
+[PHRASE] ||| es keins gibt ||| there is none ||| 2.76492 0 0 0.258278 0
+[PHRASE] ||| dieses ||| this ||| 1.98677 0 0 0 0
+[PHRASE] ||| dieses [PHRASE,1] ||| this [PHRASE,1] ||| 2.56995 0 0 0 0
+[PHRASE] ||| dieses [PHRASE,1] haus ist ||| this is [PHRASE,1] house ||| 3.16286 0 0 0.348455 0.0211893
+[PHRASE] ||| dieses [PHRASE,1] ist ||| this is [PHRASE,1] ||| 3.16286 0 0 0.348455 0
+[PHRASE] ||| dieses haus ||| this house ||| 2.46389 0 0 0 0.0211893
+[PHRASE] ||| dieses haus [PHRASE,1] ||| this house [PHRASE,1] ||| 3.06595 0 0 0 0.0211893
+[PHRASE] ||| dieses haus klein ist ||| this house is small ||| 3.06595 0 0 0.78211 0.0211893
+[PHRASE] ||| dieses ein [PHRASE,1] ist ||| this is a [PHRASE,1] ||| 3.16286 0 0 0.576934 0
+[PHRASE] ||| dieses ein kleines haus ist ||| this is a small house ||| 3.16286 0 0 1.15672 0.0211893
+[PHRASE] ||| dieses kleine ||| this small ||| 2.46389 0 0 0.500602 0
+[PHRASE] ||| dieses kleine haus ||| this small house ||| 2.94101 0 0 0.500602 0.0211893
diff --git a/decoder/test_data/small.json.gz b/decoder/test_data/small.json.gz
new file mode 100644
index 00000000..892ba360
--- /dev/null
+++ b/decoder/test_data/small.json.gz
Binary files differ
diff --git a/decoder/test_data/test_2gram.lm.gz b/decoder/test_data/test_2gram.lm.gz
new file mode 100644
index 00000000..aafa7274
--- /dev/null
+++ b/decoder/test_data/test_2gram.lm.gz
Binary files differ
diff --git a/decoder/test_data/weights b/decoder/test_data/weights
new file mode 100644
index 00000000..ea70229c
--- /dev/null
+++ b/decoder/test_data/weights
@@ -0,0 +1,8 @@
+# hiero
+WordPenalty -0.387029
+LanguageModel 0.253195
+PhraseModel_0 0.142926
+PhraseModel_1 0.465119
+PhraseModel_2 0.079503
+CNPosteriorProbability 0.09259
+Inf -inf
diff --git a/decoder/test_data/weights.gt b/decoder/test_data/weights.gt
new file mode 100644
index 00000000..08931049
--- /dev/null
+++ b/decoder/test_data/weights.gt
@@ -0,0 +1,4 @@
+Phrase_0 1.0
+Phrase_1 0.5
+Phrase_2 0.3
+Phrase_3 0.2
diff --git a/decoder/timing_stats.cc b/decoder/timing_stats.cc
new file mode 100644
index 00000000..85b95de5
--- /dev/null
+++ b/decoder/timing_stats.cc
@@ -0,0 +1,24 @@
+#include "timing_stats.h"
+
+#include <iostream>
+
+using namespace std;
+
+map<string, TimerInfo> Timer::stats;  // Definition of the static table of per-name statistics shared by every Timer.
+
+Timer::Timer(const string& timername) : start_t(clock()), cur(stats[timername]) {}  // Samples clock() and binds cur to the stats entry for timername (operator[] creates it on first use).
+
+Timer::~Timer() {  // Adds one call and the clock() time elapsed since construction to this timer's stats entry.
+ ++cur.calls;
+ const clock_t end_t = clock();
+ const double elapsed = static_cast<double>(end_t - start_t) / CLOCKS_PER_SEC;  // ticks -> seconds; CLOCKS_PER_SEC is not 1000000 on every platform
+ cur.total_time += elapsed;
+}
+
+void Timer::Summarize() {  // Writes each timer's name, total seconds and call count to stderr, then discards all statistics.
+ for (map<string, TimerInfo>::iterator it = stats.begin(); it != stats.end(); ++it) {
+ cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n";
+ }
+ stats.clear();  // NOTE(review): a Timer still alive at this point keeps a reference into the cleared map and writes through it in ~Timer — confirm callers never do this
+}
+
diff --git a/decoder/timing_stats.h b/decoder/timing_stats.h
new file mode 100644
index 00000000..0a9f7656
--- /dev/null
+++ b/decoder/timing_stats.h
@@ -0,0 +1,25 @@
+#ifndef _TIMING_STATS_H_
+#define _TIMING_STATS_H_
+
+#include <string>
+#include <map>
+
+struct TimerInfo {  // Accumulated statistics for one named timer.
+ int calls;  // incremented once per Timer destroyed under this name
+ double total_time;  // summed elapsed time; Timer::Summarize() reports it as seconds
+ TimerInfo() : calls(), total_time() {}  // value-initializes both members to zero
+};
+
+struct Timer {  // Measures the clock() time between its construction and destruction and adds it to a named entry in a shared table.
+ Timer(const std::string& info);  // info is the name under which this timer's statistics accumulate
+ ~Timer();  // records one call plus the elapsed time
+ static void Summarize();  // prints all accumulated statistics to stderr and clears them
+ private:
+ static std::map<std::string, TimerInfo> stats;  // statistics for every timer name, shared by all instances
+ clock_t start_t;  // clock() value taken in the constructor; NOTE(review): clock_t is declared in <ctime>, which this header does not include itself
+ TimerInfo& cur;  // the stats entry for this timer's name
+ Timer(const Timer& other);  // private and not defined in the accompanying .cc — presumably to forbid copying
+ const Timer& operator=(const Timer& other);  // likewise for assignment
+};
+
+#endif
diff --git a/decoder/translator.cc b/decoder/translator.cc
new file mode 100644
index 00000000..e6c282e1
--- /dev/null
+++ b/decoder/translator.cc
@@ -0,0 +1,57 @@
+#include "translator.h"
+
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+Translator::~Translator() {}  // Out-of-line definition of the (empty) virtual destructor.
+
+void Translator::ProcessMarkupHints(const map<string, string>& kv) {  // Passes the sentence-level markup kv to ProcessMarkupHintsImpl and marks the translator ready to translate.
+ if (state_ != kUninitialized) {  // only legal in the start state; anything else is treated as a fatal usage error
+ cerr << "Translator::ProcessMarkupHints in wrong state: " << state_ << endl;
+ abort();
+ }
+ ProcessMarkupHintsImpl(kv);
+ state_ = kReadyToTranslate;
+}
+
+bool Translator::Translate(const std::string& src,  // Forwards all arguments to TranslateImpl and returns its result.
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest) {
+ if (state_ == kUninitialized) {  // ProcessMarkupHints has not been called for this sentence: fatal
+ cerr << "Translator::Translate(...) must not be in uninitialized state!\n";
+ abort();
+ }
+ const bool result = TranslateImpl(src, smeta, weights, minus_lm_forest);
+ state_ = kTranslated;  // set whether or not TranslateImpl reported success
+ return result;
+}
+
+void Translator::SentenceComplete() {  // Ends the current sentence: runs the SentenceCompleteImpl hook and resets the state machine.
+ if (state_ != kTranslated) {  // called without a preceding Translate(): warn under this method's real name, but carry on
+ cerr << "Translator::SentenceComplete in unexpected state: " << state_ << endl;
+ // not fatal
+ }
+ SentenceCompleteImpl();
+ state_ = kUninitialized; // return to start state
+}
+
+// Default markup handler; this may be overridden by translators that want to accept
+// metadata. It tolerates only the "id" key: any other key is printed and the process aborts.
+void Translator::ProcessMarkupHintsImpl(const map<string, string>& kv) {
+ int unprocessed = kv.size() - kv.count("id");  // number of keys other than "id"
+ cerr << "Inside translator process hints\n";
+ if (unprocessed > 0) {
+ cerr << "Sentence markup contains unprocessed data:\n";
+ for (map<string, string>::const_iterator it = kv.begin(); it != kv.end(); ++it) {
+ if (it->first == "id") continue;  // "id" is the one key that is silently accepted
+ cerr << " KEY[" << it->first << "] --> " << it->second << endl;
+ }
+ abort();
+ }
+}
+
+void Translator::SentenceCompleteImpl() {}  // Default hook: does nothing.
+
diff --git a/decoder/translator.h b/decoder/translator.h
new file mode 100644
index 00000000..6b0a02e4
--- /dev/null
+++ b/decoder/translator.h
@@ -0,0 +1,82 @@
+#ifndef _TRANSLATOR_H_
+#define _TRANSLATOR_H_
+
+#include <string>
+#include <vector>
+#include <map>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+class Hypergraph;
+class SentenceMetadata;
+
+// Workflow: for each sentence to be translated
+// 1) call ProcessMarkupHints(markup)
+// 2) call Translate(...)
+// 3) call SentenceComplete()
+class Translator {  // Base class: the public methods track the workflow state and delegate the real work to the virtual *Impl hooks.
+ public:
+ Translator() : state_(kUninitialized) {}
+ virtual ~Translator();
+ // returns true if goal reached, false otherwise
+ // minus_lm_forest will contain the unpruned forest. the
+ // feature values from the phrase table / grammar / etc
+ // should be in the forest already - the "late" features
+ // should not just copy values that are available without
+ // any context or computation.
+ // SentenceMetadata contains information about the sentence,
+ // but it is an input/output parameter since the Translator
+ // is also responsible for setting the value of src_len.
+ bool Translate(const std::string& src,
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest);
+
+ // This is called before Translate(...) with the sentence-
+ // level markup passed in. This can be used to set sentence-
+ // specific behavior of the translator.
+ void ProcessMarkupHints(const std::map<std::string, std::string>& kv);
+
+ // Free any sentence-specific resources
+ void SentenceComplete();
+ protected:
+ virtual bool TranslateImpl(const std::string& src,  // required hook: receives the arguments of Translate() unchanged
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest) = 0;
+ virtual void ProcessMarkupHintsImpl(const std::map<std::string, std::string>& kv);  // optional hook, called by ProcessMarkupHints()
+ virtual void SentenceCompleteImpl();  // optional hook, called by SentenceComplete()
+ private:
+ enum State { kUninitialized, kReadyToTranslate, kTranslated };  // start state; after ProcessMarkupHints(); after Translate()
+ State state_;  // current position in the per-sentence workflow
+};
+
+class SCFGTranslatorImpl;  // only forward-declared in this header
+class SCFGTranslator : public Translator {  // Translator subclass that overrides all three *Impl hooks.
+ public:
+ SCFGTranslator(const boost::program_options::variables_map& conf);  // conf: parsed program options
+ protected:
+ bool TranslateImpl(const std::string& src,
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest);
+ void ProcessMarkupHintsImpl(const std::map<std::string, std::string>& kv);
+ void SentenceCompleteImpl();
+ private:
+ boost::shared_ptr<SCFGTranslatorImpl> pimpl_;  // presumably the object the hooks delegate to — their definitions are not in this header
+};
+
+class FSTTranslatorImpl;  // only forward-declared in this header
+class FSTTranslator : public Translator {  // Translator subclass that overrides only TranslateImpl; the other two hooks keep the base-class defaults.
+ public:
+ FSTTranslator(const boost::program_options::variables_map& conf);  // conf: parsed program options
+ private:
+ bool TranslateImpl(const std::string& src,  // declared private here although the base declares it protected
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* minus_lm_forest);
+ private:
+ boost::shared_ptr<FSTTranslatorImpl> pimpl_;  // presumably the object TranslateImpl delegates to — its definition is not in this header
+};
+
+#endif
diff --git a/decoder/tromble_loss.cc b/decoder/tromble_loss.cc
new file mode 100644
index 00000000..9ebd8ab1
--- /dev/null
+++ b/decoder/tromble_loss.cc
@@ -0,0 +1,309 @@
+#include "tromble_loss.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/circular_buffer.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/range/iterator_range.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/unordered_map.hpp>
+
+#include <cmath>
+#include <fstream>
+#include <vector>
+
+#include "sentence_metadata.h"
+#include "trule.h"
+#include "tdict.h"
+
+using namespace std;
+
+namespace {
+
+// Counter type for n-gram occurrences: one byte per counter.
+typedef unsigned char GramCount;
+
+// Reference-side statistics for a single n-gram (filled in by
+// MakeNGramMapFromReferences).
+struct RefCounts {
+ // Largest per-reference count of this n-gram.
+ GramCount max;
+ // refs[r] is the number of occurrences of the n-gram in reference r.
+ std::vector<GramCount> refs;
+ // Number of words in the n-gram.
+ size_t length;
+};
+
+typedef boost::unordered_map<std::vector<WordID>, size_t, boost::hash<std::vector<WordID> > > NGramMap;
+
+// Take all the n-grams (lengths 1..n) in the references and stuff them into
+// ngrams, giving each distinct n-gram a dense id 0, 1, 2, ...
+//   references: one word sequence per reference translation.
+//   n:          maximum n-gram length; n <= 0 collects nothing.
+//   counts:     output, cleared first; (*counts)[id] describes n-gram `id`.
+//   ngrams:     output, cleared first; maps each n-gram to its id.
+// Per-reference counts saturate at the largest GramCount value instead of
+// wrapping around to zero.
+void MakeNGramMapFromReferences(const vector<vector<WordID> > &references,
+                                int n,
+                                vector<RefCounts> *counts,
+                                NGramMap *ngrams) {
+  ngrams->clear();
+  // Ids restart at zero, so stale entries from a previous call must go too.
+  counts->clear();
+  const GramCount kMaxCount = static_cast<GramCount>(-1);
+  std::pair<vector<WordID>, size_t> insert_me;
+  vector<WordID> &ngram = insert_me.first;
+  if (n > 0) ngram.reserve(n);
+  size_t &id = insert_me.second;
+  id = 0;
+  for (size_t refi = 0; refi < references.size(); ++refi) {
+    const vector<WordID>& ref = references[refi];
+    const int s = static_cast<int>(ref.size());
+    for (int j = 0; j < s; ++j) {
+      const int remaining = s - j;
+      const int k = (n < remaining ? n : remaining);
+      ngram.clear();
+      // Grow the n-gram one word at a time so every prefix length is counted.
+      for (int i = 0; i < k; ++i) {
+        ngram.push_back(ref[j + i]);
+        std::pair<NGramMap::iterator, bool> ret(ngrams->insert(insert_me));
+        if (ret.second) {
+          // First sighting: allocate the next id and its statistics record.
+          counts->resize(id + 1);
+          RefCounts &ref_counts = counts->back();
+          ref_counts.max = 1;
+          ref_counts.refs.resize(references.size());
+          ref_counts.refs[refi] = 1;
+          ref_counts.length = ngram.size();
+          ++id;
+        } else {
+          RefCounts &ref_counts = (*counts)[ret.first->second];
+          GramCount &seen = ref_counts.refs[refi];
+          if (seen < kMaxCount) ++seen;
+          ref_counts.max = std::max(ref_counts.max, seen);
+        }
+      }
+    }
+  }
+}
+
+// Writable view over a raw feature-state buffer. The constructor lays the
+// buffer out as:
+//   [size_t length][n-1 WordIDs: left][n-1 WordIDs: right][GramCount counts...]
+// where n is the value passed to the constructor. The view does not own the
+// buffer.
+struct MutableState {
+ // from: start of the buffer; n: determines how many boundary words (n - 1)
+ // are stored on each side.
+ MutableState(void *from, size_t n) : length(reinterpret_cast<size_t*>(from)), left(reinterpret_cast<WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<GramCount *>(right + n - 1)) {}
+ size_t *length;
+ WordID *left, *right;
+ GramCount *counts;
+ // Bytes needed for a buffer with n - 1 boundary words per side and
+ // bound_ngram_id counters.
+ static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
+};
+
+// Read-only view over a raw feature-state buffer. The constructor assumes the
+// layout:
+//   [size_t length][n-1 WordIDs: left][n-1 WordIDs: right][GramCount counts...]
+// where n is the value passed to the constructor. The view does not own the
+// buffer.
+struct ConstState {
+ // from: start of the buffer; n: determines how many boundary words (n - 1)
+ // are stored on each side.
+ ConstState(const void *from, size_t n) : length(reinterpret_cast<const size_t*>(from)), left(reinterpret_cast<const WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<const GramCount *>(right + n - 1)) {}
+ const size_t *length;
+ const WordID *left, *right;
+ const GramCount *counts;
+ // Bytes needed for a buffer with n - 1 boundary words per side and
+ // bound_ngram_id counters.
+ static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
+};
+
+// Hash functor for an iterator range: hashes the elements of the range with
+// boost::hash_range.
+// NOTE(review): presumably meant to agree with boost::hash<std::vector<WordID> >
+// so a range can be looked up in NGramMap; its only use is in disabled code.
+template <class T> struct CompatibleHashRange : public std::unary_function<const boost::iterator_range<T> &, size_t> {
+ size_t operator()(const boost::iterator_range<T> &range) const {
+ return boost::hash_range(range.begin(), range.end());
+ }
+};
+
+// Equality functor comparing an iterator range with a std::vector<WordID>,
+// element by element, accepting the two arguments in either order.
+// The result of boost::algorithm::equals is returned as size_t, not bool.
+template <class T> struct CompatibleEqualsRange : public std::binary_function<const boost::iterator_range<T> &, const std::vector<WordID> &, size_t> {
+ size_t operator()(const boost::iterator_range<T> &range, const std::vector<WordID> &vec) const {
+ return boost::algorithm::equals(range, vec);
+ }
+ size_t operator()(const std::vector<WordID> &vec, const boost::iterator_range<T> &range) const {
+ return boost::algorithm::equals(range, vec);
+ }
+};
+
+// Intended to bump counters[id] for every reference n-gram that ends at the
+// last word of `segment` and is at least min_length words long.
+// WARNING: the counting loop below is compiled out with "#if 0", so as written
+// this function never modifies `counters`; it only returns early when the
+// segment is shorter than min_length.
+void AddWord(const boost::circular_buffer<WordID> &segment, size_t min_length, const NGramMap &ref_grams, GramCount *counters) {
+ typedef boost::circular_buffer<WordID>::const_iterator BufferIt;
+ typedef boost::iterator_range<BufferIt> SegmentRange;
+ if (segment.size() < min_length) return;
+#if 0
+ // Disabled: walks the suffix start backwards from the shortest allowed
+ // suffix, stopping at the first suffix that is not a reference n-gram.
+ // hasher and equals are declared but not passed to find().
+ CompatibleHashRange<BufferIt> hasher;
+ CompatibleEqualsRange<BufferIt> equals;
+ for (BufferIt seg_start(segment.end() - min_length); ; --seg_start) {
+ NGramMap::const_iterator found = ref_grams.find(SegmentRange(seg_start, segment.end()));
+ if (found == ref_grams.end()) break;
+ ++counters[found->second];
+ if (seg_start == segment.begin()) break;
+ }
+#endif
+}
+
+} // namespace
+
+// Implementation behind TrombleLossComputer. Loads the reference translations,
+// indexes their n-grams per sentence, and scores rule applications by clipped
+// n-gram matches against the references.
+class TrombleLossComputerImpl {
+ public:
+  // params: "ref_file num_refs theta_1 ... theta_n", separated by spaces.
+  //   ref_file holds num_refs consecutive lines per sentence;
+  //   theta_k is the weight on k-grams, so n is the maximum n-gram order.
+  // Prints a message and exits with status 1 on a missing argument, a zero
+  // reference count, an unreadable file, or a sentence with too few references.
+  explicit TrombleLossComputerImpl(const std::string &params) : star_(TD::Convert("<{STAR}>")) {
+    typedef boost::tokenizer<boost::char_separator<char> > Tokenizer;
+    // Argument parsing
+    std::string ref_file_name;
+    Tokenizer tok(params, boost::char_separator<char>(" "));
+    Tokenizer::iterator i = tok.begin();
+    if (i == tok.end()) {
+      std::cerr << "TrombleLossComputer needs a reference file name." << std::endl;
+      exit(1);
+    }
+    ref_file_name = *i++;
+    if (i == tok.end()) {
+      std::cerr << "TrombleLossComputer needs to know how many references." << std::endl;
+      exit(1);
+    }
+    num_refs_ = boost::lexical_cast<unsigned int>(*i++);
+    if (num_refs_ == 0) {
+      // With zero references per sentence the reader below would never
+      // consume any input and would therefore never terminate.
+      std::cerr << "TrombleLossComputer needs at least one reference per sentence." << std::endl;
+      exit(1);
+    }
+    for (; i != tok.end(); ++i) {
+      thetas_.push_back(boost::lexical_cast<double>(*i));
+    }
+    if (thetas_.empty()) {
+      std::cerr << "TrombleLossComputer is pointless with no weight on n-grams." << std::endl;
+      exit(1);
+    }
+
+    // Read references file.
+    std::ifstream ref_file(ref_file_name.c_str());
+    if (!ref_file) {
+      std::cerr << "Could not open TrombleLossComputer file " << ref_file_name << std::endl;
+      exit(1);
+    }
+    std::string ref;
+    vector<vector<WordID> > references(num_refs_);
+    bound_ngram_id_ = 0;
+    for (unsigned int sentence = 0; ; ++sentence) {
+      bool end_of_file = false;
+      for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
+        if (!getline(ref_file, ref)) {
+          if (refidx == 0) {
+            // Clean end of input: no partial sentence is pending.
+            end_of_file = true;
+            break;
+          }
+          std::cerr << "Short read of " << refidx << " references for sentence " << sentence << std::endl;
+          exit(1);
+        }
+        TD::ConvertSentence(ref, &references[refidx]);
+      }
+      // Stop before registering a sentence for which nothing was read;
+      // otherwise the previous sentence's references would be indexed twice.
+      if (end_of_file) break;
+      ref_ids_.resize(sentence + 1);
+      ref_counts_.resize(sentence + 1);
+      MakeNGramMapFromReferences(references, thetas_.size(), &ref_counts_.back(), &ref_ids_.back());
+      bound_ngram_id_ = std::max(bound_ngram_id_, ref_ids_.back().size());
+    }
+  }
+
+  // Bytes of per-node state required by Traversal().
+  size_t StateSize() const {
+    // n-1 boundary words plus counts for n-grams currently rendered as bytes even though most would fit in bits.
+    // Also, this is cached by higher up classes so no need to cache here.
+    return MutableState::Size(thetas_.size(), bound_ngram_id_);
+  }
+
+  // Scores one rule application.
+  //   smeta:        supplies the sentence id used to select the references.
+  //   rule:         its target side rule.e() is walked left to right; symbols
+  //                 < 1 consume the next entry of ant_contexts.
+  //   ant_contexts: state buffers of the child constituents, in the order
+  //                 they are consumed.
+  //   out_context:  state buffer (StateSize() bytes) written for this node.
+  // Returns the best score over references, where a reference's score is the
+  // sum over k of thetas_[k-1] * (clipped k-gram matches) / (length - k + 1).
+  // Exits with status 1 if the sentence id has no references.
+  double Traversal(
+      const SentenceMetadata &smeta,
+      const TRule &rule,
+      const vector<const void*> &ant_contexts,
+      void *out_context) const {
+    // TODO: get refs from sentence metadata.
+    // This will require resizable features.
+    if (smeta.GetSentenceID() >= ref_ids_.size()) {
+      std::cerr << "Sentence ID " << smeta.GetSentenceID() << " doesn't have references; there are only " << ref_ids_.size() << " references." << std::endl;
+      exit(1);
+    }
+    const NGramMap &ngrams = ref_ids_[smeta.GetSentenceID()];
+    MutableState out_state(out_context, thetas_.size());
+    memset(out_state.counts, 0, bound_ngram_id_ * sizeof(GramCount));
+    boost::circular_buffer<WordID> history(thetas_.size());
+    std::vector<const void*>::const_iterator ant_context = ant_contexts.begin();
+    *out_state.length = 0;
+    // Number of words pushed into history through push_back so far.
+    size_t pushed = 0;
+    // Number of boundary words kept on each side of a constituent.
+    const size_t keep = thetas_.size() - 1;
+    for (vector<WordID>::const_iterator rhs = rule.e().begin(); rhs != rule.e().end(); ++rhs) {
+      if (*rhs < 1) {
+        assert(ant_context != ant_contexts.end());
+        // Constituent
+        ConstState rhs_state(*ant_context, thetas_.size());
+        *out_state.length += *rhs_state.length;
+        {
+          GramCount *accum = out_state.counts;
+          for (const GramCount *c = rhs_state.counts; c != rhs_state.counts + ngrams.size(); ++c, ++accum) {
+            *accum += *c;
+          }
+        }
+        const WordID *w = rhs_state.left;
+        bool long_constit = true;
+        for (size_t i = 1; i <= keep; ++i, ++w) {
+          if (*w == star_) {
+            long_constit = false;
+            break;
+          }
+          history.push_back(*w);
+          if (++pushed == keep) {
+            std::copy(history.begin(), history.end(), out_state.left);
+          }
+          // Now i is the length of the history coming from this constituent. So it needs at least i+1 words to have a cross-child add.
+          AddWord(history, i + 1, ngrams, out_state.counts);
+        }
+        // If the consituent is shorter than thetas_.size(), then the
+        // constituent's left is the entire constituent, so history is already
+        // correct. Otherwise, the entire right hand side is the entire
+        // history.
+        if (long_constit) {
+          history.assign(thetas_.size(), rhs_state.right, rhs_state.right + keep);
+        }
+        ++ant_context;
+      } else {
+        // Word
+        ++*out_state.length;
+        history.push_back(*rhs);
+        if (++pushed == keep) {
+          std::copy(history.begin(), history.end(), out_state.left);
+        }
+        AddWord(history, 1, ngrams, out_state.counts);
+      }
+    }
+    // Fill in left and right constituents.
+    if (pushed < keep) {
+      std::copy(history.begin(), history.end(), out_state.left);
+      for (WordID *i = out_state.left + pushed; i != out_state.left + keep; ++i) {
+        *i = star_;
+      }
+      std::copy(out_state.left, out_state.left + keep, out_state.right);
+    } else {
+      // The right context is the last `keep` words of history. History holds
+      // keep + 1 words after plain pushes but only `keep` words right after a
+      // long constituent was assigned, so count from the end, not from
+      // begin() + 1.
+      assert(history.size() >= keep);
+      std::copy(history.begin() + (history.size() - keep), history.end(), out_state.right);
+    }
+    std::vector<RefCounts>::const_iterator ref_info = ref_counts_[smeta.GetSentenceID()].begin();
+    // Clip the counts and count matches.
+    // Indexed by reference then by length.
+    std::vector<std::vector<unsigned int> > matches(num_refs_, std::vector<unsigned int>(thetas_.size()));
+    for (GramCount *c = out_state.counts; c != out_state.counts + ngrams.size(); ++c, ++ref_info) {
+      *c = std::min(*c, ref_info->max);
+      if (*c) {
+        for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
+          assert(ref_info->length >= 1);
+          assert(ref_info->length - 1 < thetas_.size());
+          matches[refidx][ref_info->length - 1] += std::min(*c, ref_info->refs[refidx]);
+        }
+      }
+    }
+    double best_score = 0.0;
+    for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
+      double score = 0.0;
+      for (unsigned int j = 0; j < std::min(*out_state.length, thetas_.size()); ++j) {
+        score += thetas_[j] * static_cast<double>(matches[refidx][j]) / static_cast<double>(*out_state.length - j);
+      }
+      best_score = std::max(best_score, score);
+    }
+    return best_score;
+  }
+
+ private:
+  // Number of reference translations per sentence.
+  unsigned int num_refs_;
+  // Indexed by sentence id.
+  std::vector<NGramMap> ref_ids_;
+  // Then by id from ref_ids_.
+  std::vector<std::vector<RefCounts> > ref_counts_;
+
+  // thetas_[0] is the weight for 1-grams
+  std::vector<double> thetas_;
+
+  // All ngram ids in ref_ids_ are < this value.
+  size_t bound_ngram_id_;
+
+  // Padding word stored in unused boundary-word slots of short constituents.
+  const WordID star_;
+};
+
+// Builds the implementation object from `params` first (via the
+// base_from_member base) so that its StateSize() is available as the
+// constructor argument of the FeatureFunction base. The value is reported
+// under the feature named "TrombleLossComputer".
+TrombleLossComputer::TrombleLossComputer(const std::string &params) :
+ boost::base_from_member<PImpl>(new TrombleLossComputerImpl(params)),
+ FeatureFunction(boost::base_from_member<PImpl>::member->StateSize()),
+ fid_(FD::Convert("TrombleLossComputer")) {}
+
+// Defined in this file, where TrombleLossComputerImpl is a complete type.
+TrombleLossComputer::~TrombleLossComputer() {}
+
+// Delegates the edge to the implementation object and stores the returned
+// value under fid_ in `features`. estimated_features is not used.
+void TrombleLossComputer::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                const Hypergraph::Edge& edge,
+                                                const vector<const void*>& ant_contexts,
+                                                SparseVector<double>* features,
+                                                SparseVector<double>* estimated_features,
+                                                void* out_context) const {
+  (void) estimated_features;
+  const TrombleLossComputerImpl& impl = *boost::base_from_member<PImpl>::member;
+  features->set_value(fid_, impl.Traversal(smeta, *edge.rule_, ant_contexts, out_context));
+}
diff --git a/decoder/tromble_loss.h b/decoder/tromble_loss.h
new file mode 100644
index 00000000..599a2d54
--- /dev/null
+++ b/decoder/tromble_loss.h
@@ -0,0 +1,40 @@
+#ifndef _TROMBLE_LOSS_H_
+#define _TROMBLE_LOSS_H_
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+#include <boost/utility/base_from_member.hpp>
+
+#include "ff.h"
+#include "wordid.h"
+
+// This may not be the most elegant way to implement this computation, but since we
+// may need cube pruning and state splitting, we reuse the feature detector framework.
+// NOTE(review): an earlier version of this comment said the loss is stored in
+// feature #0 (guaranteed to have weight 0 and never be a "real" feature); the
+// fid_ member below suggests a named feature id is used instead -- confirm
+// against the constructor in the .cc file.
+class TrombleLossComputerImpl;
+// base_from_member is listed before FeatureFunction so that the implementation
+// pointer is constructed before the FeatureFunction base.
+class TrombleLossComputer : private boost::base_from_member<boost::scoped_ptr<TrombleLossComputerImpl> >, public FeatureFunction {
+ private:
+ typedef boost::scoped_ptr<TrombleLossComputerImpl> PImpl;
+ typedef FeatureFunction Base;
+
+ public:
+ // String parameters are: ref.txt num_ref weight1 weight2 ... weightn
+ // where ref.txt contains references one per line, with num_ref references per sentence.
+ // The weights are the weight on each length n-gram (weight1 for 1-grams, etc.).
+ explicit TrombleLossComputer(const std::string &params);
+
+ ~TrombleLossComputer();
+
+ protected:
+ // FeatureFunction hook, called with one hypergraph edge, the state buffers
+ // of its antecedents, and the state buffer to fill for the edge's head.
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* out_context) const;
+ private:
+ // Feature id under which the computed value is stored.
+ const int fid_;
+};
+
+#endif
diff --git a/decoder/trule.cc b/decoder/trule.cc
new file mode 100644
index 00000000..505839c7
--- /dev/null
+++ b/decoder/trule.cc
@@ -0,0 +1,242 @@
+#include "trule.h"
+
+#include <sstream>
+
+#include "stringlib.h"
+#include "tdict.h"
+
+using namespace std;
+
+// Maps one target-side token to a WordID.
+// A bracketed token whose last character before ']' is a digit 1-9, written
+// either as "[k]" or as "[CAT,k]", is a variable reference: the category is
+// ignored and index k is encoded as 1 - k (so [1] -> 0, [2] -> -1, ...).
+// Every other token is looked up as a terminal with TD::Convert.
+static WordID ConvertTrgString(const string& w) {
+  const int len = w.size();
+  const bool bracketed = len > 2 && w[0] == '[' && w[len-1] == ']';
+  const bool ends_in_index = bracketed && w[len-2] > '0' && w[len-2] <= '9';
+  if (ends_in_index && (len == 3 || (len > 4 && w[len-3] == ','))) {
+    const WordID index = w[len-2] - '0';
+    return 1 - index;
+  }
+  return TD::Convert(w);
+}
+
+// Maps one source-side token to a WordID.
+// For source rules the category is kept and the index is ignored (source
+// variables are always numbered 1, 2, 3, ...), so a non-terminal is encoded
+// as the negated id of its category name.
+//   mono == true:  "[CAT]" is a non-terminal; "[CAT,k]" is rejected with an
+//                  error message and exit(1).
+//   mono == false: only "[CAT,k]" with k in 1-9 is a non-terminal.
+// Anything else is a terminal looked up with TD::Convert.
+static WordID ConvertSrcString(const string& w, bool mono = false) {
+  const int len = w.size();
+  if (mono) {
+    const bool bracketed = len > 2 && w[0]=='[' && w[len-1]==']';
+    if (!bracketed) return TD::Convert(w);
+    if (len > 4 && w[len-3] == ',') {
+      cerr << "[ERROR] Monolingual rules must not have non-terminal indices:\n "
+           << w << endl;
+      exit(1);
+    }
+    // TODO check that source indices go 1,2,3,etc.
+    return TD::Convert(w.substr(1, len-2)) * -1;
+  }
+  const bool indexed_nt = len > 4 && w[0]=='[' && w[len-1]==']' &&
+                          w[len-3] == ',' && w[len-2] > '0' && w[len-2] <= '9';
+  if (indexed_nt) {
+    // Strip "[" and ",k]" to leave the category name.
+    return TD::Convert(w.substr(1, len-4)) * -1;
+  }
+  return TD::Convert(w);
+}
+
+// Converts a left-hand-side token to the negated id of its category name.
+// A token starting with '[' has its first and last characters stripped
+// ("[X]" -> "X"); such a token shorter than 3 characters is a format error
+// (message on stderr, exit(1)). A token without '[' is used as is.
+static WordID ConvertLHS(const string& w) {
+  if (w[0] != '[')
+    return TD::Convert(w) * -1;
+  const int len = w.size();
+  if (len < 3) { cerr << "Format error: " << w << endl; exit(1); }
+  const string category = w.substr(1, len-2);
+  return TD::Convert(category) * -1;
+}
+
+// Parses `rule` in strict, non-monolingual mode. Returns a newly allocated
+// rule owned by the caller, or NULL (after logging) if parsing fails.
+TRule* TRule::CreateRuleSynchronous(const std::string& rule) {
+  TRule* parsed = new TRule;
+  const bool ok = parsed->ReadFromString(rule, true, false);
+  if (!ok) {
+    cerr << "[ERROR] Failed to creating rule from: " << rule << endl;
+    delete parsed;
+    parsed = NULL;
+  }
+  return parsed;
+}
+
+// Builds a rule from a phrasetable entry by prepending the LHS "[X] ||| " and
+// parsing in strict mode. Returns a newly allocated rule owned by the caller,
+// or NULL (after logging) if the entry starts with '[' or the parsed rule has
+// non-zero arity.
+TRule* TRule::CreateRulePhrasetable(const string& rule) {
+  // TODO make this faster
+  // TODO add configuration for default NT type
+  const bool starts_with_bracket = (rule[0] == '[');
+  if (starts_with_bracket) {
+    cerr << "Phrasetable rules shouldn't have a LHS / non-terminals:\n " << rule << endl;
+    return NULL;
+  }
+  const string with_lhs = "[X] ||| " + rule;
+  TRule* res = new TRule(with_lhs, true, false);
+  if (res->Arity() == 0)
+    return res;
+  cerr << "Phrasetable rules should have arity 0:\n " << rule << endl;
+  delete res;
+  return NULL;
+}
+
+// Builds a rule from a monolingual (non-synchronous) CFG line, parsed in
+// non-strict mode. The returned rule is newly allocated and owned by the
+// caller; the parse result is not checked here.
+TRule* TRule::CreateRuleMonolingual(const string& rule) {
+  const bool strict = false;
+  const bool mono = true;
+  return new TRule(rule, strict, mono);
+}
+
+// Parses a rule from text, replacing e_, f_ and scores_.
+// The number of "|||" separators selects the format:
+//   >= 2 (or 1 when mono): LHS ||| source [||| target] [||| features]
+//   == 1 (not mono):       LHS ||| target   (source is copied from the target
+//                          with every variable replaced by category X)
+// Features are separated by spaces or ';'. "name=value" sets a named feature;
+// a bare value is assigned to PhraseModel_0 .. PhraseModel_9 in order (more
+// than ten bare values aborts).
+//   strict: reject lines with fewer than two separators.
+//   mono:   the target side is derived from the source side.
+// Returns false if the line is rejected in strict mode, has an unrecognised
+// format, or fails SanityCheck(); true otherwise.
+bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
+  e_.clear();
+  f_.clear();
+  scores_.clear();
+
+  string w;
+  istringstream is(line);
+  int format = CountSubstrings(line, "|||");
+  if (strict && format < 2) {
+    cerr << "Bad rule format in strict mode:\n" << line << endl;
+    return false;
+  }
+  // Cleared when the line matches none of the known formats, so the failure
+  // is reported to the caller and not only on stderr.
+  bool recognized = true;
+  if (format >= 2 || (mono && format == 1)) {
+    while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); }
+    while(is>>w && w!="|||") { f_.push_back(ConvertSrcString(w, mono)); }
+    if (!mono) {
+      while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); }
+    }
+    int fv = 0;
+    if (is) {
+      string ss;
+      getline(is, ss);
+      //cerr << "L: " << ss << endl;
+      int start = 0;
+      const int len = ss.size();
+      while (start < len) {
+        // Skip separators in front of the next feature.
+        while(start < len && (ss[start] == ' ' || ss[start] == ';'))
+          ++start;
+        if (start == len) break;
+        int end = start + 1;
+        while(end < len && (ss[end] != '=' && ss[end] != ' ' && ss[end] != ';'))
+          ++end;
+        if (end == len || ss[end] == ' ' || ss[end] == ';') {
+          //cerr << "PROC: '" << ss.substr(start, end - start) << "'\n";
+          // non-named features
+          // Terminate the token in place so atof stops at its end.
+          if (end != len) { ss[end] = 0; }
+          string fname = "PhraseModel_X";
+          if (fv > 9) { cerr << "Too many phrasetable scores - used named format\n"; abort(); }
+          fname[12]='0' + fv;
+          ++fv;
+          // if the feature set is frozen, this may return zero, indicating an
+          // undefined feature
+          const int fid = FD::Convert(fname);
+          if (fid)
+            scores_.set_value(fid, atof(&ss[start]));
+          //cerr << "F: " << fname << " VAL=" << scores_.value(FD::Convert(fname)) << endl;
+        } else {
+          // named feature: ss[end] is '='
+          const int fid = FD::Convert(ss.substr(start, end - start));
+          start = end + 1;
+          end = start + 1;
+          while(end < len && (ss[end] != ' ' && ss[end] != ';'))
+            ++end;
+          if (end < len) { ss[end] = 0; }
+          assert(start < len);
+          if (fid)
+            scores_.set_value(fid, atof(&ss[start]));
+          //cerr << "F: " << FD::Convert(fid) << " VAL=" << scores_.value(fid) << endl;
+        }
+        start = end + 1;
+      }
+    }
+  } else if (format == 1) {
+    while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); }
+    while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); }
+    f_ = e_;
+    int x = ConvertLHS("[X]");
+    for (int i = 0; i < f_.size(); ++i)
+      if (f_[i] <= 0) { f_[i] = x; }
+  } else {
+    cerr << "F: " << format << endl;
+    cerr << "[ERROR] Don't know how to read:\n" << line << endl;
+    recognized = false;
+  }
+  if (mono) {
+    // Derive the target side: variables are numbered 0, -1, -2, ... in order.
+    e_ = f_;
+    int ci = 0;
+    for (int i = 0; i < e_.size(); ++i)
+      if (e_[i] < 0)
+        e_[i] = ci--;
+  }
+  // Always leave arity_ consistent with e_, even on failure.
+  ComputeArity();
+  const bool sane = SanityCheck();
+  return recognized && sane;
+}
+
+// Checks the target side for consistency: every variable index must fit
+// within the source side's length, no variable may be used twice, and the
+// number of variables must equal Arity(). Logs the reason and returns false
+// on the first violation.
+bool TRule::SanityCheck() const {
+  vector<int> used(f_.size(), 0);
+  int ac = 0;
+  for (int i = 0; i < e_.size(); ++i) {
+    int ind = e_[i];
+    if (ind > 0) continue;
+    ind = -ind;
+    // `used` has one slot per source symbol; a larger index would write
+    // past its end.
+    if (static_cast<size_t>(ind) >= used.size()) {
+      cerr << "[ERROR] e-side variable index " << (ind+1) << " is out of range for the f-side\n";
+      return false;
+    }
+    if ((++used[ind]) != 1) {
+      cerr << "[ERROR] e-side variable index " << (ind+1) << " used more than once!\n";
+      return false;
+    }
+    ac++;
+  }
+  if (ac != Arity()) {
+    cerr << "[ERROR] e-side arity mismatches f-side\n";
+    return false;
+  }
+  return true;
+}
+
+// Sets arity_ from the target side: 1 minus the smallest symbol in e_, with
+// the minimum capped at 1 (so a rule without variables has arity 0).
+void TRule::ComputeArity() {
+  int lowest = 1;
+  for (size_t pos = 0; pos < e_.size(); ++pos) {
+    const WordID sym = e_[pos];
+    if (sym < lowest) lowest = sym;
+  }
+  arity_ = 1 - lowest;
+}
+
+// Renders an encoded target-side variable (0, -1, ..., -8) as "[1]" .. "[9]".
+// Any other value logs a message and aborts.
+static string AnonymousStrVar(int i) {
+  const bool representable = (i <= 0) && (i >= -8);
+  if (!representable) {
+    cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl;
+    abort();
+  }
+  string res;
+  res += '[';
+  res += static_cast<char>('1' - i);
+  res += ']';
+  return res;
+}
+
+// Formats the rule as text.
+//   verbose == true and lhs_ set: "[LHS] ||| source ||| target", followed by
+//     " ||| scores" when scores_ is non-empty. Source non-terminals are
+//     printed as [CAT,k] with k counting from 1.
+//   otherwise: the target side only (plus scores when verbose).
+// Target variables are printed as [1] .. [9]. Exits with status 1 if more
+// than nine source non-terminals were printed.
+string TRule::AsString(bool verbose) const {
+  ostringstream os;
+  int idx = 0;
+  const bool show_source = lhs_ && verbose;
+  if (show_source) {
+    os << '[' << TD::Convert(lhs_ * -1) << "] |||";
+    for (vector<WordID>::const_iterator fi = f_.begin(); fi != f_.end(); ++fi) {
+      if (*fi < 0) {
+        const int category = *fi * -1;
+        ++idx;
+        os << " [" << TD::Convert(category) << ',' << idx << ']';
+      } else {
+        os << ' ' << TD::Convert(*fi);
+      }
+    }
+    os << " ||| ";
+  }
+  if (idx > 9) {
+    cerr << "Too many non-terminals!\n partial: " << os.str() << endl;
+    exit(1);
+  }
+  bool first = true;
+  for (vector<WordID>::const_iterator ei = e_.begin(); ei != e_.end(); ++ei) {
+    if (!first) os << ' ';
+    first = false;
+    if (*ei < 1)
+      os << AnonymousStrVar(*ei);
+    else
+      os << TD::Convert(*ei);
+  }
+  if (!scores_.empty() && verbose) {
+    os << " ||| " << scores_;
+  }
+  return os.str();
+}
diff --git a/decoder/trule.h b/decoder/trule.h
new file mode 100644
index 00000000..7fb92924
--- /dev/null
+++ b/decoder/trule.h
@@ -0,0 +1,145 @@
+#ifndef _RULE_H_
+#define _RULE_H_
+
+#include <algorithm>
+#include <vector>
+#include <cassert>
+#include <boost/shared_ptr.hpp>
+
+#include "sparse_vector.h"
+#include "wordid.h"
+
+class TRule;
+typedef boost::shared_ptr<TRule> TRulePtr;
+
+// Two float vectors with one entry per non-terminal of a rule.
+// NOTE(review): judging by the names these hold the mean and variance of
+// non-terminal sizes; how they are filled in is not visible here.
+struct NTSizeSummaryStatistics {
+ // Sizes both vectors to `arity` entries.
+ NTSizeSummaryStatistics(int arity) : means(arity), vars(arity) {}
+ std::vector<float> means;
+ std::vector<float> vars;
+};
+
+// Translation rule: a left-hand-side category, a source side f_, a target
+// side e_, and a sparse vector of feature scores.
+// Every constructor leaves all scalar members initialised; constructors that
+// are not given an arity start with arity_ == 0, and ComputeArity() derives
+// the real value from e_.
+class TRule {
+ public:
+  TRule() : lhs_(0), arity_(0), prev_i(-1), prev_j(-1) { }
+  // Builds a rule from raw arrays; feat_ids[i] / feat_vals[i] are copied into
+  // scores_ for i in [0, feat_size).
+  TRule(WordID lhs, const WordID* src, int src_size, const WordID* trg, int trg_size, const int* feat_ids, const double* feat_vals, int feat_size, int arity) :
+      e_(trg, trg + trg_size), f_(src, src + src_size), lhs_(lhs), arity_(arity), prev_i(-1), prev_j(-1) {
+    for (int i = 0; i < feat_size; ++i)
+      scores_.set_value(feat_ids[i], feat_vals[i]);
+  }
+
+  // Target side only; arity_ starts at 0 (call ComputeArity() to derive it).
+  explicit TRule(const std::vector<WordID>& e) : e_(e), lhs_(0), arity_(0), prev_i(-1), prev_j(-1) {}
+  // Target side, source side and LHS; arity_ starts at 0 (call
+  // ComputeArity() to derive it).
+  TRule(const std::vector<WordID>& e, const std::vector<WordID>& f, const WordID& lhs) :
+      e_(e), f_(f), lhs_(lhs), arity_(0), prev_i(-1), prev_j(-1) {}
+
+  // deprecated - this will be private soon
+  // The result of ReadFromString is not checked here.
+  explicit TRule(const std::string& text, bool strict = false, bool mono = false) : lhs_(0), arity_(0), prev_i(-1), prev_j(-1) {
+    ReadFromString(text, strict, mono);
+  }
+
+  // deprecated, use lexer
+  // make a rule from a hiero-like rule table, e.g.
+  //    [X] ||| [X,1] DE [X,2] ||| [X,2] of the [X,1]
+  // if misformatted, returns NULL
+  static TRule* CreateRuleSynchronous(const std::string& rule);
+
+  // deprecated, use lexer
+  // make a rule from a phrasetable entry (i.e., one that has no LHS type), e.g:
+  //    el gato ||| the cat ||| Feature_2=0.34
+  static TRule* CreateRulePhrasetable(const std::string& rule);
+
+  // deprecated, use lexer
+  // make a rule from a non-synchronous CFG representation, e.g.:
+  //    [LHS] ||| term1 [NT] term2 [OTHER_NT] [YET_ANOTHER_NT]
+  static TRule* CreateRuleMonolingual(const std::string& rule);
+
+  // Single-word rule src -> trg; the caller owns the returned object.
+  static TRule* CreateLexicalRule(const WordID& src, const WordID& trg) {
+    return new TRule(src, trg);
+  }
+
+  // Writes the target side into *result with each variable replaced by the
+  // corresponding entry of var_values (variable c selects var_values[-c]).
+  void ESubstitute(const std::vector<const std::vector<WordID>* >& var_values,
+                   std::vector<WordID>* result) const {
+    int vc = 0;
+    result->clear();
+    for (std::vector<WordID>::const_iterator i = e_.begin(); i != e_.end(); ++i) {
+      const WordID& c = *i;
+      if (c < 1) {
+        ++vc;
+        const std::vector<WordID>& var_value = *var_values[-c];
+        std::copy(var_value.begin(),
+                  var_value.end(),
+                  std::back_inserter(*result));
+      } else {
+        result->push_back(c);
+      }
+    }
+    assert(vc == var_values.size());
+  }
+
+  // Writes the source side into *result with the k-th non-terminal (in
+  // left-to-right order) replaced by var_values[k].
+  void FSubstitute(const std::vector<const std::vector<WordID>* >& var_values,
+                   std::vector<WordID>* result) const {
+    int vc = 0;
+    result->clear();
+    for (std::vector<WordID>::const_iterator i = f_.begin(); i != f_.end(); ++i) {
+      const WordID& c = *i;
+      if (c < 1) {
+        const std::vector<WordID>& var_value = *var_values[vc++];
+        std::copy(var_value.begin(),
+                  var_value.end(),
+                  std::back_inserter(*result));
+      } else {
+        result->push_back(c);
+      }
+    }
+    assert(vc == var_values.size());
+  }
+
+  bool ReadFromString(const std::string& line, bool strict = false, bool monolingual = false);
+
+  // True once the target side is non-empty.
+  bool Initialized() const { return e_.size(); }
+
+  std::string AsString(bool verbose = true) const;
+
+  // A default rule whose target side is the single symbol 0.
+  static TRule DummyRule() {
+    TRule res;
+    res.e_.resize(1, 0);
+    return res;
+  }
+
+  const std::vector<WordID>& f() const { return f_; }
+  const std::vector<WordID>& e() const { return e_; }
+
+  int EWords() const { return ELength() - Arity(); }
+  int FWords() const { return FLength() - Arity(); }
+  int FLength() const { return f_.size(); }
+  int ELength() const { return e_.size(); }
+  int Arity() const { return arity_; }
+  bool IsUnary() const { return (Arity() == 1) && (f_.size() == 1); }
+  const SparseVector<double>& GetFeatureValues() const { return scores_; }
+  double Score(int i) const { return scores_[i]; }
+  WordID GetLHS() const { return lhs_; }
+  void ComputeArity();
+
+  // 0 = first variable, -1 = second variable, -2 = third ...
+  std::vector<WordID> e_;
+  // < 0: *-1 = encoding of category of variable
+  std::vector<WordID> f_;
+  WordID lhs_;
+  SparseVector<double> scores_;
+
+  char arity_;
+  TRulePtr parent_rule_;  // usually NULL, except when doing constrained decoding
+
+  // this is only used when doing synchronous parsing
+  short int prev_i;
+  short int prev_j;
+
+  // may be null
+  boost::shared_ptr<NTSizeSummaryStatistics> nt_size_summary_;
+
+ private:
+  // NOTE(review): this constructor value-initialises prev_i / prev_j to 0,
+  // whereas every other constructor uses -1 -- confirm this is intended.
+  TRule(const WordID& src, const WordID& trg) : e_(1, trg), f_(1, src), lhs_(), arity_(), prev_i(), prev_j() {}
+  bool SanityCheck() const;
+};
+
+#endif
diff --git a/decoder/trule_test.cc b/decoder/trule_test.cc
new file mode 100644
index 00000000..02a70764
--- /dev/null
+++ b/decoder/trule_test.cc
@@ -0,0 +1,65 @@
+#include "trule.h"
+
+#include <gtest/gtest.h>
+#include <cassert>
+#include <iostream>
+#include "tdict.h"
+
+using namespace std;
+
+// Test fixture for the TRule tests; it needs no per-test setup or teardown.
+class TRuleTest : public testing::Test {
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+};
+
+// FSubstitute must splice the antecedent source strings into the source side
+// in left-to-right order. Uses gtest checks rather than assert() so failures
+// are reported by the test runner and are not compiled out under NDEBUG.
+TEST_F(TRuleTest,TestFSubstitute) {
+  TRule r1("[X] ||| ob [X,1] [X,2] sah . ||| whether [X,1] saw [X,2] . ||| 0.99");
+  TRule r2("[X] ||| ich ||| i ||| 1.0");
+  TRule r3("[X] ||| ihn ||| him ||| 1.0");
+  vector<const vector<WordID>*> ants;
+  vector<WordID> res2;
+  r2.FSubstitute(ants, &res2);
+  EXPECT_TRUE(TD::GetString(res2) == "ich");
+  vector<WordID> res3;
+  r3.FSubstitute(ants, &res3);
+  EXPECT_TRUE(TD::GetString(res3) == "ihn");
+  ants.push_back(&res2);
+  ants.push_back(&res3);
+  vector<WordID> res;
+  r1.FSubstitute(ants, &res);
+  cerr << TD::GetString(res) << endl;
+  EXPECT_TRUE(TD::GetString(res) == "ob ich ihn sah .");
+}
+
+// A phrasetable entry with three named features must parse and keep all
+// three scores.
+TEST_F(TRuleTest,TestPhrasetableRule) {
+  TRulePtr t(TRule::CreateRulePhrasetable("gato ||| cat ||| PhraseModel_0=-23.2;Foo=1;Bar=12"));
+  // CreateRulePhrasetable returns NULL on failure; stop before dereferencing.
+  ASSERT_TRUE(t.get() != NULL);
+  cerr << t->AsString() << endl;
+  EXPECT_TRUE(t->scores_.num_active() == 3);
+}
+
+
+// Monolingual rules: three non-terminals give arity 3, with or without a
+// trailing feature field, and the named feature value is retained.
+TEST_F(TRuleTest,TestMonoRule) {
+  TRulePtr m(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3]"));
+  ASSERT_TRUE(m.get() != NULL);
+  EXPECT_EQ(m->Arity(), 3);
+  cerr << m->AsString() << endl;
+  TRulePtr m2(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3] ||| Feature1=0.23"));
+  ASSERT_TRUE(m2.get() != NULL);
+  EXPECT_EQ(m2->Arity(), 3);
+  cerr << m2->AsString() << endl;
+  EXPECT_FLOAT_EQ(m2->scores_.value(FD::Convert("Feature1")), 0.23);
+}
+
+// A synchronous rule with reordered variables: [X,2] is encoded as -1 and
+// [X,1] as 0 on the target side.
+TEST_F(TRuleTest,TestRuleR) {
+  TRule t6;
+  // The remaining checks are meaningless if the rule did not parse.
+  ASSERT_TRUE(t6.ReadFromString("[X] ||| den [X,1] sah [X,2] . ||| [X,2] saw the [X,1] . ||| 0.12321 0.23232 0.121"));
+  cerr << "TEXT: " << t6.AsString() << endl;
+  EXPECT_EQ(t6.Arity(), 2);
+  EXPECT_EQ(t6.e_[0], -1);
+  EXPECT_EQ(t6.e_[3], 0);
+}
+
+// Test entry point: hands the command line to Google Test and returns the
+// result of running every registered test.
+int main(int argc, char** argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
diff --git a/decoder/ttables.cc b/decoder/ttables.cc
new file mode 100644
index 00000000..2ea960f0
--- /dev/null
+++ b/decoder/ttables.cc
@@ -0,0 +1,31 @@
+#include "ttables.h"
+
+#include <cassert>
+
+#include "dict.h"
+
+using namespace std;
+using namespace std::tr1;
+
+// Reads whitespace-separated "e f p" triples from *in and stores p in
+// ttable[e][f]. Reading stops at end of input or at the first record that
+// cannot be read completely; such a partial record is not stored. The number
+// of stored records is reported on stderr.
+void TTable::DeserializeProbsFromText(std::istream* in) {
+  int c = 0;
+  string e;
+  string f;
+  double p;
+  // Extraction is the loop condition, so p is only used after it was read.
+  while ((*in) >> e >> f >> p) {
+    ++c;
+    ttable[TD::Convert(e)][TD::Convert(f)] = prob_t(p);
+  }
+  cerr << "Loaded " << c << " translation parameters.\n";
+}
+
+// Not implemented: the assert always fails when assertions are enabled.
+// When assertions are compiled out (NDEBUG) the call does nothing and *out is
+// left untouched.
+void TTable::SerializeHelper(string* out, const Word2Word2Double& o) {
+ assert(!"not implemented");
+}
+
+// Not implemented: the assert always fails when assertions are enabled.
+// When assertions are compiled out (NDEBUG) the call does nothing and *o is
+// left untouched.
+void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) {
+ assert(!"not implemented");
+}
+
diff --git a/decoder/ttables.h b/decoder/ttables.h
new file mode 100644
index 00000000..3ffc238a
--- /dev/null
+++ b/decoder/ttables.h
@@ -0,0 +1,87 @@
+#ifndef _TTABLES_H_
+#define _TTABLES_H_
+
+#include <iostream>
+#include <map>
+
+#include "wordid.h"
+#include "prob.h"
+#include "tdict.h"
+
+// Word-to-word translation table. Holds two nested maps keyed by (e, f):
+// `ttable` with probabilities and `counts` with accumulated counts.
+class TTable {
+ public:
+  TTable() {}
+  typedef std::map<WordID, double> Word2Double;
+  typedef std::map<WordID, Word2Double> Word2Word2Double;
+  // Returns ttable[e][f], or the floor value 0.00001 when the pair is absent.
+  inline const prob_t prob(const int& e, const int& f) const {
+    const Word2Word2Double::const_iterator cit = ttable.find(e);
+    if (cit != ttable.end()) {
+      const Word2Double& cpd = cit->second;
+      const Word2Double::const_iterator it = cpd.find(f);
+      if (it == cpd.end()) return prob_t(0.00001);
+      return prob_t(it->second);
+    } else {
+      return prob_t(0.00001);
+    }
+  }
+  // Adds 1 to counts[e][f].
+  inline void Increment(const int& e, const int& f) {
+    counts[e][f] += 1.0;
+  }
+  // Adds x to counts[e][f].
+  inline void Increment(const int& e, const int& f, double x) {
+    counts[e][f] += x;
+  }
+  // Replaces ttable with the counts, normalised so that each row (fixed e)
+  // sums to 1, and clears the counts. A row whose total is exactly zero is
+  // left as is, since dividing would fill it with NaN.
+  void Normalize() {
+    ttable.swap(counts);
+    for (Word2Word2Double::iterator cit = ttable.begin();
+         cit != ttable.end(); ++cit) {
+      double tot = 0;
+      Word2Double& cpd = cit->second;
+      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
+        tot += it->second;
+      if (tot == 0) continue;
+      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
+        it->second /= tot;
+    }
+    counts.clear();
+  }
+  // adds counts from another TTable - probabilities remain unchanged
+  TTable& operator+=(const TTable& rhs) {
+    for (Word2Word2Double::const_iterator it = rhs.counts.begin();
+         it != rhs.counts.end(); ++it) {
+      const Word2Double& cpd = it->second;
+      Word2Double& tgt = counts[it->first];
+      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
+        tgt[j->first] += j->second;
+      }
+    }
+    return *this;
+  }
+  // Prints every probability to stderr as "P(f|e) = value".
+  void ShowTTable() const {
+    for (Word2Word2Double::const_iterator it = ttable.begin(); it != ttable.end(); ++it) {
+      const Word2Double& cpd = it->second;
+      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
+        std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
+      }
+    }
+  }
+  // Prints every count to stderr as "c(f|e) = value".
+  void ShowCounts() const {
+    for (Word2Word2Double::const_iterator it = counts.begin(); it != counts.end(); ++it) {
+      const Word2Double& cpd = it->second;
+      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
+        std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
+      }
+    }
+  }
+  void DeserializeProbsFromText(std::istream* in);
+  void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); }
+  void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); }
+  void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); }
+  void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); }
+ private:
+  static void SerializeHelper(std::string*, const Word2Word2Double& o);
+  static void DeserializeHelper(const std::string&, Word2Word2Double* o);
+ public:
+  Word2Word2Double ttable;
+  Word2Word2Double counts;
+};
+
+#endif
diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc
new file mode 100644
index 00000000..82b2ce6d
--- /dev/null
+++ b/decoder/viterbi.cc
@@ -0,0 +1,39 @@
+#include "viterbi.h"
+
+#include <vector>
+#include "hg.h"
+
+using namespace std;
+
+// Runs Viterbi with ETreeTraversal and returns the result as a string.
+// The weight of the best derivation is not needed here and is discarded.
+string ViterbiETree(const Hypergraph& hg) {
+  vector<WordID> tmp;
+  Viterbi<vector<WordID>, ETreeTraversal, prob_t, EdgeProb>(hg, &tmp);
+  return TD::GetString(tmp);
+}
+
+// Runs Viterbi with FTreeTraversal and returns the result as a string.
+// The weight of the best derivation is not needed here and is discarded.
+string ViterbiFTree(const Hypergraph& hg) {
+  vector<WordID> tmp;
+  Viterbi<vector<WordID>, FTreeTraversal, prob_t, EdgeProb>(hg, &tmp);
+  return TD::GetString(tmp);
+}
+
+// Runs Viterbi with ESentenceTraversal, writing the result into *result, and
+// returns the weight of the best derivation.
+prob_t ViterbiESentence(const Hypergraph& hg, vector<WordID>* result) {
+  typedef vector<WordID> Sentence;
+  return Viterbi<Sentence, ESentenceTraversal, prob_t, EdgeProb>(hg, result);
+}
+
+// Runs Viterbi with FSentenceTraversal, writing the result into *result, and
+// returns the weight of the best derivation.
+prob_t ViterbiFSentence(const Hypergraph& hg, vector<WordID>* result) {
+  typedef vector<WordID> Sentence;
+  return Viterbi<Sentence, FSentenceTraversal, prob_t, EdgeProb>(hg, result);
+}
+
+// Runs Viterbi with ELengthTraversal and returns the resulting length.
+// The value starts at -1 and is returned as is if Viterbi does not set it.
+int ViterbiELength(const Hypergraph& hg) {
+  int best_len = -1;
+  Viterbi<int, ELengthTraversal, prob_t, EdgeProb>(hg, &best_len);
+  return best_len;
+}
+
+// Runs Viterbi with PathLengthTraversal and returns the resulting length.
+// The value starts at -1 and is returned as is if Viterbi does not set it.
+int ViterbiPathLength(const Hypergraph& hg) {
+  int best_len = -1;
+  Viterbi<int, PathLengthTraversal, prob_t, EdgeProb>(hg, &best_len);
+  return best_len;
+}
+
diff --git a/decoder/viterbi.h b/decoder/viterbi.h
new file mode 100644
index 00000000..8f7534a9
--- /dev/null
+++ b/decoder/viterbi.h
@@ -0,0 +1,142 @@
+#ifndef _VITERBI_H_
+#define _VITERBI_H_
+
+#include <vector>
+#include "prob.h"
+#include "hg.h"
+#include "tdict.h"
+
+ // Finds the highest-weight derivation of the last node of hg and builds a
+ // value of type T for it.
+ //
+ // Traversal must implement:
+ //   void operator()(const Hypergraph::Edge& edge,
+ //                   const std::vector<const T*>& ants,
+ //                   T* result) const;
+ // where ants[k] points at the value already built for the edge's k-th tail
+ // node. WeightFunction must be callable as weight(edge) and yield a value
+ // usable to initialize a WeightType.
+ //
+ // Nodes are visited in index order and each edge reads the stored weight of
+ // its tail nodes, so tail nodes are assumed to have smaller indices than the
+ // node they feed (NOTE(review): i.e. hg is topologically sorted -- confirm).
+ //
+ // On return *result holds the value built for the last node (the previous
+ // contents of *result are swapped away) and the return value is that node's
+ // best weight. For a hypergraph without nodes *result is left untouched and
+ // WeightType::Zero() is returned.
+ template<typename T, typename Traversal, typename WeightType, typename WeightFunction>
+ WeightType Viterbi(const Hypergraph& hg,
+                    T* result,
+                    const Traversal& traverse = Traversal(),
+                    const WeightFunction& weight = WeightFunction()) {
+   const int num_nodes = hg.nodes_.size();
+   // Calling back() on an empty vector below would be undefined behaviour.
+   if (num_nodes == 0) return WeightType::Zero();
+   std::vector<T> vit_result(num_nodes);
+   std::vector<WeightType> vit_weight(num_nodes, WeightType::Zero());
+
+   for (int i = 0; i < num_nodes; ++i) {
+     const Hypergraph::Node& cur_node = hg.nodes_[i];
+     WeightType* const cur_node_best_weight = &vit_weight[i];
+     T* const cur_node_best_result = &vit_result[i];
+
+     const int num_in_edges = cur_node.in_edges_.size();
+     if (num_in_edges == 0) {
+       // A node without incoming edges gets weight 1 and a default-constructed T.
+       *cur_node_best_weight = WeightType(1);
+       continue;
+     }
+     for (int j = 0; j < num_in_edges; ++j) {
+       const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]];
+       WeightType score = weight(edge);
+       const int num_tails = edge.tail_nodes_.size();
+       std::vector<const T*> ants(num_tails);
+       for (int k = 0; k < num_tails; ++k) {
+         const int tail_node_index = edge.tail_nodes_[k];
+         score *= vit_weight[tail_node_index];
+         ants[k] = &vit_result[tail_node_index];
+       }
+       // Strict comparison: on ties the earlier edge is kept, and an edge
+       // scoring no better than Zero() is never selected.
+       if (*cur_node_best_weight < score) {
+         *cur_node_best_weight = score;
+         traverse(edge, ants, cur_node_best_result);
+       }
+     }
+   }
+   std::swap(*result, vit_result.back());
+   return vit_weight.back();
+ }
+
+ // Traversal that counts edges: the result for an edge is 1 plus the sum of
+ // the counts of its antecedents.
+ struct PathLengthTraversal {
+   void operator()(const Hypergraph::Edge& edge,
+                   const std::vector<const int*>& ants,
+                   int* result) const {
+     (void) edge;  // the edge itself carries no information needed here
+     int& count = *result;
+     count = 1;
+     for (std::vector<const int*>::const_iterator it = ants.begin();
+          it != ants.end(); ++it) {
+       count += **it;
+     }
+   }
+ };
+
+ // Traversal that builds a WordID sequence for an edge by delegating to the
+ // edge rule's ESubstitute, passing the antecedents' sequences and the output.
+ struct ESentenceTraversal {
+ void operator()(const Hypergraph::Edge& edge,
+ const std::vector<const std::vector<WordID>*>& ants,
+ std::vector<WordID>* result) const {
+ edge.rule_->ESubstitute(ants, result);
+ }
+ };
+
+ // Traversal that accumulates a length: the rule's ELength() minus its
+ // Arity(), plus the lengths already computed for the antecedents.
+ struct ELengthTraversal {
+   void operator()(const Hypergraph::Edge& edge,
+                   const std::vector<const int*>& ants,
+                   int* result) const {
+     int& length = *result;
+     length = edge.rule_->ELength() - edge.rule_->Arity();
+     for (std::vector<const int*>::const_iterator it = ants.begin();
+          it != ants.end(); ++it) {
+       length += **it;
+     }
+   }
+ };
+
+ // Traversal that builds a WordID sequence for an edge by delegating to the
+ // edge rule's FSubstitute, passing the antecedents' sequences and the output.
+ struct FSentenceTraversal {
+ void operator()(const Hypergraph::Edge& edge,
+ const std::vector<const std::vector<WordID>*>& ants,
+ std::vector<WordID>* result) const {
+ edge.rule_->FSubstitute(ants, result);
+ }
+ };
+
+ // creates a string of the form (S (X the man) (X said (X he (X would (X go)))))
+ // Traversal producing a bracketed tree over the E side of each rule.
+ // For every edge the rule's ESubstitute output is wrapped as "(CAT ...)",
+ // where CAT is TD::Convert of the negated LHS; an edge whose category is
+ // "Goal" passes the substituted sequence through unwrapped.
+ struct ETreeTraversal {
+   ETreeTraversal() : left("("), space(" "), right(")") {}
+   const std::string left;
+   const std::string space;
+   const std::string right;
+   void operator()(const Hypergraph::Edge& edge,
+                   const std::vector<const std::vector<WordID>*>& ants,
+                   std::vector<WordID>* result) const {
+     std::vector<WordID> yield;
+     edge.rule_->ESubstitute(ants, &yield);
+     const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1);
+     if (cat == "Goal") {
+       result->swap(yield);
+       return;
+     }
+     // Assemble "(" CAT " " tokens ")" and convert it back into word ids.
+     std::string bracketed(left);
+     bracketed += cat;
+     bracketed += space;
+     bracketed += TD::GetString(yield);
+     bracketed += right;
+     TD::ConvertSentence(bracketed, result);
+   }
+ };
+
+ // Traversal producing a bracketed tree over the F side of each rule.
+ // For every edge the rule's FSubstitute output is wrapped as "(CAT ...)",
+ // where CAT is TD::Convert of the negated LHS; an edge whose category is
+ // "Goal" passes the substituted sequence through unwrapped.
+ struct FTreeTraversal {
+   FTreeTraversal() : left("("), space(" "), right(")") {}
+   const std::string left;
+   const std::string space;
+   const std::string right;
+   void operator()(const Hypergraph::Edge& edge,
+                   const std::vector<const std::vector<WordID>*>& ants,
+                   std::vector<WordID>* result) const {
+     std::vector<WordID> yield;
+     edge.rule_->FSubstitute(ants, &yield);
+     const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1);
+     if (cat == "Goal") {
+       result->swap(yield);
+       return;
+     }
+     // Assemble "(" CAT " " tokens ")" and convert it back into word ids.
+     std::string bracketed(left);
+     bracketed += cat;
+     bracketed += space;
+     bracketed += TD::GetString(yield);
+     bracketed += right;
+     TD::ConvertSentence(bracketed, result);
+   }
+ };
+
+ // Traversal that records the edges of a derivation: the antecedents' edge
+ // lists are concatenated in order, followed by a pointer to this edge.
+ struct ViterbiPathTraversal {
+   void operator()(const Hypergraph::Edge& edge,
+                   const std::vector<const std::vector<const Hypergraph::Edge*>* >& ants,
+                   std::vector<const Hypergraph::Edge*>* result) const {
+     typedef std::vector<const Hypergraph::Edge*> EdgeList;
+     result->clear();
+     for (std::vector<const EdgeList*>::const_iterator ant = ants.begin();
+          ant != ants.end(); ++ant) {
+       const EdgeList& sub_path = **ant;
+       result->insert(result->end(), sub_path.begin(), sub_path.end());
+     }
+     result->push_back(&edge);
+   }
+ };
+
+ // Convenience wrappers around Viterbi<> using prob_t weights and EdgeProb.
+ // Fills *result using ESentenceTraversal and returns the best weight.
+ prob_t ViterbiESentence(const Hypergraph& hg, std::vector<WordID>* result);
+ // Returns the ETreeTraversal result converted to a string with TD::GetString.
+ std::string ViterbiETree(const Hypergraph& hg);
+ // Fills *result using FSentenceTraversal and returns the best weight.
+ prob_t ViterbiFSentence(const Hypergraph& hg, std::vector<WordID>* result);
+ // Returns the FTreeTraversal result converted to a string with TD::GetString.
+ std::string ViterbiFTree(const Hypergraph& hg);
+ // Returns the ELengthTraversal result for the best derivation.
+ int ViterbiELength(const Hypergraph& hg);
+ // Returns the PathLengthTraversal result (edge count) for the best derivation.
+ int ViterbiPathLength(const Hypergraph& hg);
+
+#endif
diff --git a/decoder/weights.cc b/decoder/weights.cc
new file mode 100644
index 00000000..84647585
--- /dev/null
+++ b/decoder/weights.cc
@@ -0,0 +1,77 @@
+#include "weights.h"
+
+#include <sstream>
+
+#include "fdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+ // Reads feature weights from filename into wv_.
+ //
+ // Each line has the form "<feature> <value>" or "<feature>=<value>"; every
+ // '=' on the line is treated as a space. Empty lines, whitespace-only lines
+ // and lines whose first character is '#' are skipped. The value is parsed
+ // with strtod, so a missing or unparsable value yields 0. A NaN weight
+ // aborts the program.
+ //
+ // If feature_list is non-NULL, the name of every feature read is appended to
+ // it in file order. Progress dots and the final count are written to cerr.
+ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+   cerr << "Reading weights from " << filename << endl;
+   ReadFile in_file(filename);
+   istream& in = *in_file.stream();
+   assert(in);
+   int weight_count = 0;
+   bool fl = false;  // true while a line of progress dots is still unterminated
+   while (in) {
+     string buf;
+     getline(in, buf);
+     if (buf.size() == 0) continue;
+     if (buf[0] == '#') continue;
+     for (string::size_type i = 0; i < buf.size(); ++i)
+       if (buf[i] == '=') buf[i] = ' ';
+     // Skip leading blanks to find the start of the feature name.
+     string::size_type start = 0;
+     while (start < buf.size() && buf[start] == ' ') ++start;
+     // Nothing but blanks: do not register an empty-named feature.
+     if (start == buf.size()) continue;
+     // The name ends at the next blank. The scan must begin at start, not at
+     // 0: otherwise a line with leading blanks gives end < start and substr
+     // would swallow the rest of the line as the feature name.
+     string::size_type end = start;
+     while (end < buf.size() && buf[end] != ' ') ++end;
+     const int fid = FD::Convert(buf.substr(start, end - start));
+     while (end < buf.size() && buf[end] == ' ') ++end;
+     const double val = strtod(buf.c_str() + end, NULL);
+     if (isnan(val)) {
+       cerr << FD::Convert(fid) << " has weight NaN!\n";
+       abort();
+     }
+     if (wv_.size() <= fid)
+       wv_.resize(fid + 1);
+     wv_[fid] = val;
+     if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
+     ++weight_count;
+     if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; }
+     if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; }
+   }
+   if (fl) { cerr << endl; }
+   cerr << "Loaded " << weight_count << " feature weights\n";
+ }
+
+ // Writes one "<feature name> <weight>" line per feature id in
+ // [1, FD::NumFeats()) to fname, using 17 significant digits. Ids beyond the
+ // end of wv_ are treated as weight 0. When hide_zero_value_features is set,
+ // features whose weight equals 0 are omitted.
+ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const {
+   WriteFile out(fname);
+   ostream& o = *out.stream();
+   assert(o);
+   o.precision(17);
+   const int num_feats = FD::NumFeats();
+   for (int fid = 1; fid < num_feats; ++fid) {
+     double val = 0.0;
+     if (fid < wv_.size()) val = wv_[fid];
+     if (val == 0.0 && hide_zero_value_features) continue;
+     o << FD::Convert(fid) << ' ' << val << endl;
+   }
+ }
+
+ // Replaces the contents of *w with a copy of the dense weight vector.
+ void Weights::InitVector(std::vector<double>* w) const {
+   w->assign(wv_.begin(), wv_.end());
+ }
+
+ // Copies every non-zero weight with feature id >= 1 into *w via set_value.
+ // *w is not cleared first, so entries it already holds for other ids remain.
+ void Weights::InitSparseVector(SparseVector<double>* w) const {
+   const int num_weights = wv_.size();
+   for (int fid = 1; fid < num_weights; ++fid) {
+     const double weight = wv_[fid];
+     if (weight != 0.0) w->set_value(fid, weight);
+   }
+ }
+
+ // Replaces the stored weights with w, then resizes the vector to exactly
+ // FD::NumFeats() entries: missing entries become 0, surplus entries are
+ // dropped after a warning on cerr.
+ void Weights::InitFromVector(const std::vector<double>& w) {
+   wv_ = w;
+   const bool has_unknown_features = wv_.size() > FD::NumFeats();
+   if (has_unknown_features) {
+     cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
+   }
+   wv_.resize(FD::NumFeats(), 0);
+ }
diff --git a/decoder/weights.h b/decoder/weights.h
new file mode 100644
index 00000000..f19aa3ce
--- /dev/null
+++ b/decoder/weights.h
@@ -0,0 +1,21 @@
+#ifndef _WEIGHTS_H_
+#define _WEIGHTS_H_
+
+#include <string>
+#include <map>
+#include <vector>
+#include "sparse_vector.h"
+
+ // Dense vector of feature weights indexed by feature id (ids come from FD).
+ class Weights {
+ public:
+ Weights() {}
+ // Reads "<feature> <value>" lines from fname; if feature_list is non-NULL the
+ // names of the features read are appended to it.
+ void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
+ // Writes "<feature> <value>" lines to fname; zero-valued features are
+ // omitted unless hide_zero_value_features is false.
+ void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const;
+ // Copies the dense weight vector into *w.
+ void InitVector(std::vector<double>* w) const;
+ // Sets the non-zero weights (feature ids >= 1) in *w.
+ void InitSparseVector(SparseVector<double>* w) const;
+ // Replaces the weights with w, resized to FD::NumFeats() entries.
+ void InitFromVector(const std::vector<double>& w);
+ private:
+ // wv_[fid] is the weight of feature id fid.
+ std::vector<double> wv_;
+ };
+
+#endif
diff --git a/decoder/weights_test.cc b/decoder/weights_test.cc
new file mode 100644
index 00000000..aa6b3db2
--- /dev/null
+++ b/decoder/weights_test.cc
@@ -0,0 +1,28 @@
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <gtest/gtest.h>
+#include "weights.h"
+#include "tdict.h"
+#include "hg.h"
+
+using namespace std;
+
+ // Test fixture for Weights; it needs no per-test setup or teardown.
+ class WeightsTest : public testing::Test {
+ protected:
+ virtual void SetUp() { }
+ virtual void TearDown() { }
+ };
+
+
+ // Smoke test: loads test_data/weights and writes the weights back out.
+ // It makes no assertions, so it only checks that both calls complete.
+ // NOTE(review): "-" presumably selects standard output in WriteFile -- confirm.
+ TEST_F(WeightsTest,Load) {
+ Weights w;
+ w.InitFromFile("test_data/weights");
+ w.WriteToFile("-");
+ }
+
+ // Test driver: hands the command line to Google Test and runs every test.
+ int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ }
diff --git a/decoder/wordid.h b/decoder/wordid.h
new file mode 100644
index 00000000..fb50bcc1
--- /dev/null
+++ b/decoder/wordid.h
@@ -0,0 +1,6 @@
+#ifndef _WORD_ID_H_
+#define _WORD_ID_H_
+
+ // Integer type used for word identifiers.
+ typedef int WordID;
+
+#endif