From 851e389dffdd6996ea32d70defb8906de80b9edc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 14 Dec 2009 20:35:11 -0500 Subject: few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec --- .gitignore | 28 +- Makefile.am | 2 +- compound-split/compound-split.pl | 2 +- configure.ac | 2 +- decoder/JSON_parser.c | 1012 ++++++++++++++ decoder/JSON_parser.h | 152 +++ decoder/Makefile.am | 69 + decoder/aligner.cc | 204 +++ decoder/aligner.h | 23 + decoder/apply_models.cc | 344 +++++ decoder/apply_models.h | 20 + decoder/array2d.h | 172 +++ decoder/bottom_up_parser.cc | 279 ++++ decoder/bottom_up_parser.h | 27 + decoder/cdec.cc | 507 +++++++ decoder/cdec_ff.cc | 22 + decoder/csplit.cc | 173 +++ decoder/csplit.h | 30 + decoder/dict.h | 40 + decoder/dict_test.cc | 30 + decoder/earley_composer.cc | 726 ++++++++++ decoder/earley_composer.h | 29 + decoder/exp_semiring.h | 71 + decoder/fdict.cc | 4 + decoder/fdict.h | 21 + decoder/ff.cc | 114 ++ decoder/ff.h | 136 ++ decoder/ff_csplit.cc | 212 +++ decoder/ff_csplit.h | 39 + decoder/ff_factory.cc | 35 + decoder/ff_factory.h | 39 + decoder/ff_lm.cc | 328 +++++ decoder/ff_lm.h | 32 + decoder/ff_test.cc | 134 ++ decoder/ff_wordalign.cc | 240 ++++ decoder/ff_wordalign.h | 136 ++ decoder/filelib.cc | 22 + decoder/filelib.h | 66 + decoder/forest_writer.cc | 23 + decoder/forest_writer.h | 16 + decoder/freqdict.cc | 29 + decoder/freqdict.h | 20 + decoder/fst_translator.cc | 91 ++ decoder/grammar.cc | 164 +++ decoder/grammar.h | 83 ++ decoder/grammar_test.cc | 59 + decoder/gzstream.cc | 165 +++ decoder/gzstream.h | 121 ++ decoder/hg.cc | 486 +++++++ decoder/hg.h | 225 +++ decoder/hg_intersect.cc | 121 ++ decoder/hg_intersect.h | 13 + decoder/hg_io.cc | 599 ++++++++ decoder/hg_io.h | 37 + decoder/hg_test.cc | 441 ++++++ decoder/inside_outside.h | 111 ++ decoder/json_parse.cc | 50 + decoder/json_parse.h | 58 + decoder/kbest.h | 207 +++ 
decoder/lattice.cc | 61 + decoder/lattice.h | 41 + decoder/lexcrf.cc | 112 ++ decoder/lexcrf.h | 18 + decoder/logval.h | 136 ++ decoder/maxtrans_blunsom.cc | 287 ++++ decoder/parser_test.cc | 35 + decoder/phrasebased_translator.cc | 206 +++ decoder/phrasebased_translator.h | 18 + decoder/phrasetable_fst.cc | 141 ++ decoder/phrasetable_fst.h | 34 + decoder/prob.h | 8 + decoder/sampler.h | 136 ++ decoder/scfg_translator.cc | 66 + decoder/sentence_metadata.h | 47 + decoder/small_vector.h | 187 +++ decoder/small_vector_test.cc | 129 ++ decoder/sparse_vector.cc | 98 ++ decoder/sparse_vector.h | 264 ++++ decoder/stringlib.cc | 97 ++ decoder/stringlib.h | 101 ++ decoder/tdict.cc | 49 + decoder/tdict.h | 19 + decoder/test_data/dummy.3gram.lm | 2645 ++++++++++++++++++++++++++++++++++++ decoder/test_data/grammar.prune | 196 +++ decoder/test_data/small.json.gz | Bin 0 -> 1561 bytes decoder/test_data/test_2gram.lm.gz | Bin 0 -> 587 bytes decoder/test_data/weights | 8 + decoder/test_data/weights.gt | 4 + decoder/timing_stats.cc | 24 + decoder/timing_stats.h | 25 + decoder/translator.h | 54 + decoder/trule.cc | 237 ++++ decoder/trule.h | 122 ++ decoder/trule_test.cc | 65 + decoder/ttables.cc | 31 + decoder/ttables.h | 87 ++ decoder/viterbi.cc | 39 + decoder/viterbi.h | 130 ++ decoder/weights.cc | 73 + decoder/weights.h | 21 + decoder/weights_test.cc | 28 + decoder/wordid.h | 6 + src/JSON_parser.c | 1012 -------------- src/JSON_parser.h | 152 --- src/Makefile.am | 69 - src/aligner.cc | 204 --- src/aligner.h | 23 - src/apply_models.cc | 344 ----- src/apply_models.h | 20 - src/array2d.h | 172 --- src/bottom_up_parser.cc | 279 ---- src/bottom_up_parser.h | 27 - src/cdec.cc | 507 ------- src/cdec_ff.cc | 22 - src/csplit.cc | 173 --- src/csplit.h | 30 - src/dict.h | 40 - src/dict_test.cc | 30 - src/earley_composer.cc | 726 ---------- src/earley_composer.h | 29 - src/exp_semiring.h | 71 - src/fdict.cc | 4 - src/fdict.h | 21 - src/ff.cc | 114 -- src/ff.h | 136 -- src/ff_csplit.cc | 212 
--- src/ff_csplit.h | 39 - src/ff_factory.cc | 35 - src/ff_factory.h | 39 - src/ff_lm.cc | 328 ----- src/ff_lm.h | 32 - src/ff_test.cc | 134 -- src/ff_wordalign.cc | 221 --- src/ff_wordalign.h | 133 -- src/filelib.cc | 22 - src/filelib.h | 66 - src/forest_writer.cc | 23 - src/forest_writer.h | 16 - src/freqdict.cc | 29 - src/freqdict.h | 20 - src/fst_translator.cc | 91 -- src/grammar.cc | 164 --- src/grammar.h | 83 -- src/grammar_test.cc | 59 - src/gzstream.cc | 165 --- src/gzstream.h | 121 -- src/hg.cc | 486 ------- src/hg.h | 225 --- src/hg_intersect.cc | 121 -- src/hg_intersect.h | 13 - src/hg_io.cc | 598 -------- src/hg_io.h | 37 - src/hg_test.cc | 441 ------ src/inside_outside.h | 111 -- src/json_parse.cc | 50 - src/json_parse.h | 58 - src/kbest.h | 207 --- src/lattice.cc | 61 - src/lattice.h | 41 - src/lexcrf.cc | 112 -- src/lexcrf.h | 18 - src/logval.h | 136 -- src/maxtrans_blunsom.cc | 287 ---- src/parser_test.cc | 35 - src/phrasebased_translator.cc | 206 --- src/phrasebased_translator.h | 18 - src/phrasetable_fst.cc | 141 -- src/phrasetable_fst.h | 34 - src/prob.h | 8 - src/sampler.h | 136 -- src/scfg_translator.cc | 66 - src/sentence_metadata.h | 47 - src/small_vector.h | 187 --- src/small_vector_test.cc | 129 -- src/sparse_vector.cc | 98 -- src/sparse_vector.h | 264 ---- src/stringlib.cc | 97 -- src/stringlib.h | 101 -- src/tdict.cc | 49 - src/tdict.h | 19 - src/test_data/dummy.3gram.lm | 2645 ------------------------------------ src/test_data/grammar.prune | 196 --- src/test_data/small.json.gz | Bin 1561 -> 0 bytes src/test_data/test_2gram.lm.gz | Bin 587 -> 0 bytes src/test_data/weights | 8 - src/test_data/weights.gt | 4 - src/timing_stats.cc | 24 - src/timing_stats.h | 25 - src/translator.h | 54 - src/trule.cc | 237 ---- src/trule.h | 122 -- src/trule_test.cc | 65 - src/ttables.cc | 31 - src/ttables.h | 87 -- src/viterbi.cc | 39 - src/viterbi.h | 130 -- src/weights.cc | 73 - src/weights.h | 21 - src/weights_test.cc | 28 - src/wordid.h | 6 - 
tests/run-system-tests.pl | 2 +- training/Makefile.am | 20 +- training/atools.cc | 96 ++ training/cluster-ptrain.pl | 52 +- training/make-lexcrf-grammar.pl | 73 +- vest/Makefile.am | 16 +- vest/dist-vest.pl | 65 +- 207 files changed, 14946 insertions(+), 14773 deletions(-) create mode 100644 decoder/JSON_parser.c create mode 100644 decoder/JSON_parser.h create mode 100644 decoder/Makefile.am create mode 100644 decoder/aligner.cc create mode 100644 decoder/aligner.h create mode 100644 decoder/apply_models.cc create mode 100644 decoder/apply_models.h create mode 100644 decoder/array2d.h create mode 100644 decoder/bottom_up_parser.cc create mode 100644 decoder/bottom_up_parser.h create mode 100644 decoder/cdec.cc create mode 100644 decoder/cdec_ff.cc create mode 100644 decoder/csplit.cc create mode 100644 decoder/csplit.h create mode 100644 decoder/dict.h create mode 100644 decoder/dict_test.cc create mode 100644 decoder/earley_composer.cc create mode 100644 decoder/earley_composer.h create mode 100644 decoder/exp_semiring.h create mode 100644 decoder/fdict.cc create mode 100644 decoder/fdict.h create mode 100644 decoder/ff.cc create mode 100644 decoder/ff.h create mode 100644 decoder/ff_csplit.cc create mode 100644 decoder/ff_csplit.h create mode 100644 decoder/ff_factory.cc create mode 100644 decoder/ff_factory.h create mode 100644 decoder/ff_lm.cc create mode 100644 decoder/ff_lm.h create mode 100644 decoder/ff_test.cc create mode 100644 decoder/ff_wordalign.cc create mode 100644 decoder/ff_wordalign.h create mode 100644 decoder/filelib.cc create mode 100644 decoder/filelib.h create mode 100644 decoder/forest_writer.cc create mode 100644 decoder/forest_writer.h create mode 100644 decoder/freqdict.cc create mode 100644 decoder/freqdict.h create mode 100644 decoder/fst_translator.cc create mode 100644 decoder/grammar.cc create mode 100644 decoder/grammar.h create mode 100644 decoder/grammar_test.cc create mode 100644 decoder/gzstream.cc create mode 100644 
decoder/gzstream.h create mode 100644 decoder/hg.cc create mode 100644 decoder/hg.h create mode 100644 decoder/hg_intersect.cc create mode 100644 decoder/hg_intersect.h create mode 100644 decoder/hg_io.cc create mode 100644 decoder/hg_io.h create mode 100644 decoder/hg_test.cc create mode 100644 decoder/inside_outside.h create mode 100644 decoder/json_parse.cc create mode 100644 decoder/json_parse.h create mode 100644 decoder/kbest.h create mode 100644 decoder/lattice.cc create mode 100644 decoder/lattice.h create mode 100644 decoder/lexcrf.cc create mode 100644 decoder/lexcrf.h create mode 100644 decoder/logval.h create mode 100644 decoder/maxtrans_blunsom.cc create mode 100644 decoder/parser_test.cc create mode 100644 decoder/phrasebased_translator.cc create mode 100644 decoder/phrasebased_translator.h create mode 100644 decoder/phrasetable_fst.cc create mode 100644 decoder/phrasetable_fst.h create mode 100644 decoder/prob.h create mode 100644 decoder/sampler.h create mode 100644 decoder/scfg_translator.cc create mode 100644 decoder/sentence_metadata.h create mode 100644 decoder/small_vector.h create mode 100644 decoder/small_vector_test.cc create mode 100644 decoder/sparse_vector.cc create mode 100644 decoder/sparse_vector.h create mode 100644 decoder/stringlib.cc create mode 100644 decoder/stringlib.h create mode 100644 decoder/tdict.cc create mode 100644 decoder/tdict.h create mode 100644 decoder/test_data/dummy.3gram.lm create mode 100644 decoder/test_data/grammar.prune create mode 100644 decoder/test_data/small.json.gz create mode 100644 decoder/test_data/test_2gram.lm.gz create mode 100644 decoder/test_data/weights create mode 100644 decoder/test_data/weights.gt create mode 100644 decoder/timing_stats.cc create mode 100644 decoder/timing_stats.h create mode 100644 decoder/translator.h create mode 100644 decoder/trule.cc create mode 100644 decoder/trule.h create mode 100644 decoder/trule_test.cc create mode 100644 decoder/ttables.cc create mode 100644 
decoder/ttables.h create mode 100644 decoder/viterbi.cc create mode 100644 decoder/viterbi.h create mode 100644 decoder/weights.cc create mode 100644 decoder/weights.h create mode 100644 decoder/weights_test.cc create mode 100644 decoder/wordid.h delete mode 100644 src/JSON_parser.c delete mode 100644 src/JSON_parser.h delete mode 100644 src/Makefile.am delete mode 100644 src/aligner.cc delete mode 100644 src/aligner.h delete mode 100644 src/apply_models.cc delete mode 100644 src/apply_models.h delete mode 100644 src/array2d.h delete mode 100644 src/bottom_up_parser.cc delete mode 100644 src/bottom_up_parser.h delete mode 100644 src/cdec.cc delete mode 100644 src/cdec_ff.cc delete mode 100644 src/csplit.cc delete mode 100644 src/csplit.h delete mode 100644 src/dict.h delete mode 100644 src/dict_test.cc delete mode 100644 src/earley_composer.cc delete mode 100644 src/earley_composer.h delete mode 100644 src/exp_semiring.h delete mode 100644 src/fdict.cc delete mode 100644 src/fdict.h delete mode 100644 src/ff.cc delete mode 100644 src/ff.h delete mode 100644 src/ff_csplit.cc delete mode 100644 src/ff_csplit.h delete mode 100644 src/ff_factory.cc delete mode 100644 src/ff_factory.h delete mode 100644 src/ff_lm.cc delete mode 100644 src/ff_lm.h delete mode 100644 src/ff_test.cc delete mode 100644 src/ff_wordalign.cc delete mode 100644 src/ff_wordalign.h delete mode 100644 src/filelib.cc delete mode 100644 src/filelib.h delete mode 100644 src/forest_writer.cc delete mode 100644 src/forest_writer.h delete mode 100644 src/freqdict.cc delete mode 100644 src/freqdict.h delete mode 100644 src/fst_translator.cc delete mode 100644 src/grammar.cc delete mode 100644 src/grammar.h delete mode 100644 src/grammar_test.cc delete mode 100644 src/gzstream.cc delete mode 100644 src/gzstream.h delete mode 100644 src/hg.cc delete mode 100644 src/hg.h delete mode 100644 src/hg_intersect.cc delete mode 100644 src/hg_intersect.h delete mode 100644 src/hg_io.cc delete mode 100644 
src/hg_io.h delete mode 100644 src/hg_test.cc delete mode 100644 src/inside_outside.h delete mode 100644 src/json_parse.cc delete mode 100644 src/json_parse.h delete mode 100644 src/kbest.h delete mode 100644 src/lattice.cc delete mode 100644 src/lattice.h delete mode 100644 src/lexcrf.cc delete mode 100644 src/lexcrf.h delete mode 100644 src/logval.h delete mode 100644 src/maxtrans_blunsom.cc delete mode 100644 src/parser_test.cc delete mode 100644 src/phrasebased_translator.cc delete mode 100644 src/phrasebased_translator.h delete mode 100644 src/phrasetable_fst.cc delete mode 100644 src/phrasetable_fst.h delete mode 100644 src/prob.h delete mode 100644 src/sampler.h delete mode 100644 src/scfg_translator.cc delete mode 100644 src/sentence_metadata.h delete mode 100644 src/small_vector.h delete mode 100644 src/small_vector_test.cc delete mode 100644 src/sparse_vector.cc delete mode 100644 src/sparse_vector.h delete mode 100644 src/stringlib.cc delete mode 100644 src/stringlib.h delete mode 100644 src/tdict.cc delete mode 100644 src/tdict.h delete mode 100644 src/test_data/dummy.3gram.lm delete mode 100644 src/test_data/grammar.prune delete mode 100644 src/test_data/small.json.gz delete mode 100644 src/test_data/test_2gram.lm.gz delete mode 100644 src/test_data/weights delete mode 100644 src/test_data/weights.gt delete mode 100644 src/timing_stats.cc delete mode 100644 src/timing_stats.h delete mode 100644 src/translator.h delete mode 100644 src/trule.cc delete mode 100644 src/trule.h delete mode 100644 src/trule_test.cc delete mode 100644 src/ttables.cc delete mode 100644 src/ttables.h delete mode 100644 src/viterbi.cc delete mode 100644 src/viterbi.h delete mode 100644 src/weights.cc delete mode 100644 src/weights.h delete mode 100644 src/weights_test.cc delete mode 100644 src/wordid.h diff --git a/.gitignore b/.gitignore index 76e8610f..d2fb0f82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,9 @@ config.h.in~ -src/ff_test -src/grammar_test -src/hg_test 
-src/parser_test -src/small_vector_test +decoder/ff_test +decoder/grammar_test +decoder/hg_test +decoder/parser_test +decoder/small_vector_test training/atools training/collapse_weights training/lbfgs_test @@ -29,15 +29,15 @@ configure depcomp install-sh missing -src/.deps/ -src/*.o -src/Makefile -src/Makefile.in -src/cdec -src/dict_test -src/libhg.a -src/trule_test -src/weights_test +decoder/.deps/ +decoder/*.o +decoder/Makefile +decoder/Makefile.in +decoder/cdec +decoder/dict_test +decoder/libcdec.a +decoder/trule_test +decoder/weights_test stamp-h1 training/.deps/ training/Makefile diff --git a/Makefile.am b/Makefile.am index c3780d88..b0e750f6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = src training vest +SUBDIRS = decoder training vest AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl index beca4dc0..490a5bc5 100755 --- a/compound-split/compound-split.pl +++ b/compound-split/compound-split.pl @@ -5,7 +5,7 @@ my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir use Getopt::Long; use IPC::Open2; -my $CDEC = "$script_dir/../src/cdec"; +my $CDEC = "$script_dir/../decoder/cdec"; my $LANG = 'de'; my $BEAM = 2.1; diff --git a/configure.ac b/configure.ac index c18342b3..0fd43e08 100644 --- a/configure.ac +++ b/configure.ac @@ -42,5 +42,5 @@ then AM_CONDITIONAL([SRI_LM], true) fi -AC_OUTPUT(Makefile src/Makefile training/Makefile vest/Makefile) +AC_OUTPUT(Makefile decoder/Makefile training/Makefile vest/Makefile) diff --git a/decoder/JSON_parser.c b/decoder/JSON_parser.c new file mode 100644 index 00000000..175b7cc9 --- /dev/null +++ b/decoder/JSON_parser.c @@ -0,0 +1,1012 @@ +/* JSON_parser.c */ + +/* 2007-08-24 */ + +/* +Copyright (c) 2005 JSON.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without 
restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* + Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009. + + For the added features the license above applies also. + + Changelog: + 2009-05-17 + Incorporated benrudiak@googlemail.com fix for UTF16 decoding. + + 2009-05-14 + Fixed float parsing bug related to a locale being set that didn't + use '.' as decimal point character (charles@transmissionbt.com). + + 2008-10-14 + Renamed states.IN to states.IT to avoid name clash which IN macro + defined in windef.h (alexey.pelykh@gmail.com) + + 2008-07-19 + Removed some duplicate code & debugging variable (charles@transmissionbt.com) + + 2008-05-28 + Made JSON_value structure ansi C compliant. This bug was report by + trisk@acm.jhu.edu + + 2008-05-20 + Fixed bug reported by charles@transmissionbt.com where the switching + from static to dynamic parse buffer did not copy the static parse + buffer's content. 
+*/ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "JSON_parser.h" + +#ifdef _MSC_VER +# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */ +# pragma warning(disable:4996) // unsecure sscanf +# endif +#endif + + +#define true 1 +#define false 0 +#define __ -1 /* the universal error code */ + +/* values chosen so that the object size is approx equal to one page (4K) */ +#ifndef JSON_PARSER_STACK_SIZE +# define JSON_PARSER_STACK_SIZE 128 +#endif + +#ifndef JSON_PARSER_PARSE_BUFFER_SIZE +# define JSON_PARSER_PARSE_BUFFER_SIZE 3500 +#endif + +typedef unsigned short UTF16; + +struct JSON_parser_struct { + JSON_parser_callback callback; + void* ctx; + signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; + UTF16 utf16_high_surrogate; + long depth; + long top; + signed char* stack; + long stack_capacity; + char decimal_point; + char* parse_buffer; + size_t parse_buffer_capacity; + size_t parse_buffer_count; + size_t comment_begin_offset; + signed char static_stack[JSON_PARSER_STACK_SIZE]; + char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; +}; + +#define COUNTOF(x) (sizeof(x)/sizeof(x[0])) + +/* + Characters are mapped into these character classes. This allows for + a significant reduction in the size of the state transition table. +*/ + + + +enum classes { + C_SPACE, /* space */ + C_WHITE, /* other whitespace */ + C_LCURB, /* { */ + C_RCURB, /* } */ + C_LSQRB, /* [ */ + C_RSQRB, /* ] */ + C_COLON, /* : */ + C_COMMA, /* , */ + C_QUOTE, /* " */ + C_BACKS, /* \ */ + C_SLASH, /* / */ + C_PLUS, /* + */ + C_MINUS, /* - */ + C_POINT, /* . 
*/ + C_ZERO , /* 0 */ + C_DIGIT, /* 123456789 */ + C_LOW_A, /* a */ + C_LOW_B, /* b */ + C_LOW_C, /* c */ + C_LOW_D, /* d */ + C_LOW_E, /* e */ + C_LOW_F, /* f */ + C_LOW_L, /* l */ + C_LOW_N, /* n */ + C_LOW_R, /* r */ + C_LOW_S, /* s */ + C_LOW_T, /* t */ + C_LOW_U, /* u */ + C_ABCDF, /* ABCDF */ + C_E, /* E */ + C_ETC, /* everything else */ + C_STAR, /* * */ + NR_CLASSES +}; + +static int ascii_class[128] = { +/* + This array maps the 128 ASCII characters into character classes. + The remaining Unicode characters should be mapped to C_ETC. + Non-whitespace control characters are errors. +*/ + __, __, __, __, __, __, __, __, + __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __, + __, __, __, __, __, __, __, __, + __, __, __, __, __, __, __, __, + + C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, + C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, + C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + + C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, + + C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, + C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC +}; + + +/* + The state codes. 
+*/ +enum states { + GO, /* start */ + OK, /* ok */ + OB, /* object */ + KE, /* key */ + CO, /* colon */ + VA, /* value */ + AR, /* array */ + ST, /* string */ + ES, /* escape */ + U1, /* u1 */ + U2, /* u2 */ + U3, /* u3 */ + U4, /* u4 */ + MI, /* minus */ + ZE, /* zero */ + IT, /* integer */ + FR, /* fraction */ + E1, /* e */ + E2, /* ex */ + E3, /* exp */ + T1, /* tr */ + T2, /* tru */ + T3, /* true */ + F1, /* fa */ + F2, /* fal */ + F3, /* fals */ + F4, /* false */ + N1, /* nu */ + N2, /* nul */ + N3, /* null */ + C1, /* / */ + C2, /* / * */ + C3, /* * */ + FX, /* *.* *eE* */ + D1, /* second UTF-16 character decoding started by \ */ + D2, /* second UTF-16 character proceeded by u */ + NR_STATES +}; + +enum actions +{ + CB = -10, /* comment begin */ + CE = -11, /* comment end */ + FA = -12, /* false */ + TR = -13, /* false */ + NU = -14, /* null */ + DE = -15, /* double detected by exponent e E */ + DF = -16, /* double detected by fraction . */ + SB = -17, /* string begin */ + MX = -18, /* integer detected by minus */ + ZX = -19, /* integer detected by zero */ + IX = -20, /* integer detected by 1-9 */ + EX = -21, /* next char is escaped */ + UC = -22 /* Unicode character read */ +}; + + +static int state_transition_table[NR_STATES][NR_CLASSES] = { +/* + The state transition table takes the current state and the current symbol, + and returns either a new state or an action. An action is represented as a + negative number. A JSON text is accepted if at the end of the text the + state is OK and if the mode is MODE_DONE. + + white 1-9 ABCDF etc + space | { } [ ] : , " \ / + - . 
0 | a b c d e f l n r s t u | E | * */ +/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST}, +/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__}, +/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__}, +/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__}, +/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__}, +/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__}, +/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__}, +/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*e E1*/ 
{__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__}, +/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__}, +/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__}, +/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__}, +/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__}, +/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__}, +/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__}, +/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2}, +/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/*_. 
FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__}, +}; + + +/* + These modes can be pushed on the stack. +*/ +enum modes { + MODE_ARRAY = 1, + MODE_DONE = 2, + MODE_KEY = 3, + MODE_OBJECT = 4 +}; + +static int +push(JSON_parser jc, int mode) +{ +/* + Push a mode onto the stack. Return false if there is overflow. +*/ + jc->top += 1; + if (jc->depth < 0) { + if (jc->top >= jc->stack_capacity) { + size_t bytes_to_allocate; + jc->stack_capacity *= 2; + bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]); + if (jc->stack == &jc->static_stack[0]) { + jc->stack = (signed char*)malloc(bytes_to_allocate); + memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack)); + } else { + jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate); + } + } + } else { + if (jc->top >= jc->depth) { + return false; + } + } + + jc->stack[jc->top] = mode; + return true; +} + + +static int +pop(JSON_parser jc, int mode) +{ +/* + Pop the stack, assuring that the current mode matches the expectation. + Return false if there is underflow or if the modes mismatch. 
+*/ + if (jc->top < 0 || jc->stack[jc->top] != mode) { + return false; + } + jc->top -= 1; + return true; +} + + +#define parse_buffer_clear(jc) \ + do {\ + jc->parse_buffer_count = 0;\ + jc->parse_buffer[0] = 0;\ + } while (0) + +#define parse_buffer_pop_back_char(jc)\ + do {\ + assert(jc->parse_buffer_count >= 1);\ + --jc->parse_buffer_count;\ + jc->parse_buffer[jc->parse_buffer_count] = 0;\ + } while (0) + +void delete_JSON_parser(JSON_parser jc) +{ + if (jc) { + if (jc->stack != &jc->static_stack[0]) { + free((void*)jc->stack); + } + if (jc->parse_buffer != &jc->static_parse_buffer[0]) { + free((void*)jc->parse_buffer); + } + free((void*)jc); + } +} + + +JSON_parser +new_JSON_parser(JSON_config* config) +{ +/* + new_JSON_parser starts the checking process by constructing a JSON_parser + object. It takes a depth parameter that restricts the level of maximum + nesting. + + To continue the process, call JSON_parser_char for each character in the + JSON text, and then call JSON_parser_done to obtain the final result. + These functions are fully reentrant. +*/ + + int depth = 0; + JSON_config default_config; + + JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct)); + + memset(jc, 0, sizeof(*jc)); + + + /* initialize configuration */ + init_JSON_config(&default_config); + + /* set to default configuration if none was provided */ + if (config == NULL) { + config = &default_config; + } + + depth = config->depth; + + /* We need to be able to push at least one object */ + if (depth == 0) { + depth = 1; + } + + jc->state = GO; + jc->top = -1; + + /* Do we want non-bound stack? 
*/ + if (depth > 0) { + jc->stack_capacity = depth; + jc->depth = depth; + if (depth <= (int)COUNTOF(jc->static_stack)) { + jc->stack = &jc->static_stack[0]; + } else { + jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0])); + } + } else { + jc->stack_capacity = COUNTOF(jc->static_stack); + jc->depth = -1; + jc->stack = &jc->static_stack[0]; + } + + /* set parser to start */ + push(jc, MODE_DONE); + + /* set up the parse buffer */ + jc->parse_buffer = &jc->static_parse_buffer[0]; + jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer); + parse_buffer_clear(jc); + + /* set up callback, comment & float handling */ + jc->callback = config->callback; + jc->ctx = config->callback_ctx; + jc->allow_comments = config->allow_comments != 0; + jc->handle_floats_manually = config->handle_floats_manually != 0; + + /* set up decimal point */ + jc->decimal_point = *localeconv()->decimal_point; + + return jc; +} + +static void grow_parse_buffer(JSON_parser jc) +{ + size_t bytes_to_allocate; + jc->parse_buffer_capacity *= 2; + bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]); + if (jc->parse_buffer == &jc->static_parse_buffer[0]) { + jc->parse_buffer = (char*)malloc(bytes_to_allocate); + memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count); + } else { + jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate); + } +} + +#define parse_buffer_push_back_char(jc, c)\ + do {\ + if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\ + jc->parse_buffer[jc->parse_buffer_count++] = c;\ + jc->parse_buffer[jc->parse_buffer_count] = 0;\ + } while (0) + +#define assert_is_non_container_type(jc) \ + assert( \ + jc->type == JSON_T_NULL || \ + jc->type == JSON_T_FALSE || \ + jc->type == JSON_T_TRUE || \ + jc->type == JSON_T_FLOAT || \ + jc->type == JSON_T_INTEGER || \ + jc->type == JSON_T_STRING) + + +static int parse_parse_buffer(JSON_parser jc) +{ + if (jc->callback) { 
+ JSON_value value, *arg = NULL; + + if (jc->type != JSON_T_NONE) { + assert_is_non_container_type(jc); + + switch(jc->type) { + case JSON_T_FLOAT: + arg = &value; + if (jc->handle_floats_manually) { + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + } else { + /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/ + + /* not checking with end pointer b/c there may be trailing ws */ + value.vu.float_value = strtold(jc->parse_buffer, NULL); + } + break; + case JSON_T_INTEGER: + arg = &value; + sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); + break; + case JSON_T_STRING: + arg = &value; + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + break; + } + + if (!(*jc->callback)(jc->ctx, jc->type, arg)) { + return false; + } + } + } + + parse_buffer_clear(jc); + + return true; +} + +#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) +#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00) +#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) +static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; + +static int decode_unicode_char(JSON_parser jc) +{ + int i; + unsigned uc = 0; + char* p; + int trail_bytes; + + assert(jc->parse_buffer_count >= 6); + + p = &jc->parse_buffer[jc->parse_buffer_count - 4]; + + for (i = 12; i >= 0; i -= 4, ++p) { + unsigned x = *p; + + if (x >= 'a') { + x -= ('a' - 10); + } else if (x >= 'A') { + x -= ('A' - 10); + } else { + x &= ~0x30u; + } + + assert(x < 16); + + uc |= x << i; + } + + /* clear UTF-16 char from buffer */ + jc->parse_buffer_count -= 6; + jc->parse_buffer[jc->parse_buffer_count] = 0; + + /* attempt decoding ... 
*/ + if (jc->utf16_high_surrogate) { + if (IS_LOW_SURROGATE(uc)) { + uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc); + trail_bytes = 3; + jc->utf16_high_surrogate = 0; + } else { + /* high surrogate without a following low surrogate */ + return false; + } + } else { + if (uc < 0x80) { + trail_bytes = 0; + } else if (uc < 0x800) { + trail_bytes = 1; + } else if (IS_HIGH_SURROGATE(uc)) { + /* save the high surrogate and wait for the low surrogate */ + jc->utf16_high_surrogate = uc; + return true; + } else if (IS_LOW_SURROGATE(uc)) { + /* low surrogate without a preceding high surrogate */ + return false; + } else { + trail_bytes = 2; + } + } + + jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); + + for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { + jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80); + } + + jc->parse_buffer[jc->parse_buffer_count] = 0; + + return true; +} + +static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char) +{ + jc->escaped = 0; + /* remove the backslash */ + parse_buffer_pop_back_char(jc); + switch(next_char) { + case 'b': + parse_buffer_push_back_char(jc, '\b'); + break; + case 'f': + parse_buffer_push_back_char(jc, '\f'); + break; + case 'n': + parse_buffer_push_back_char(jc, '\n'); + break; + case 'r': + parse_buffer_push_back_char(jc, '\r'); + break; + case 't': + parse_buffer_push_back_char(jc, '\t'); + break; + case '"': + parse_buffer_push_back_char(jc, '"'); + break; + case '\\': + parse_buffer_push_back_char(jc, '\\'); + break; + case '/': + parse_buffer_push_back_char(jc, '/'); + break; + case 'u': + parse_buffer_push_back_char(jc, '\\'); + parse_buffer_push_back_char(jc, 'u'); + break; + default: + return false; + } + + return true; +} + +#define add_char_to_parse_buffer(jc, next_char, next_class) \ + do { \ + if (jc->escaped) { \ + if (!add_escaped_char_to_parse_buffer(jc, next_char)) \ + return false; \ + } else if 
(!jc->comment) { \ + if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \ + parse_buffer_push_back_char(jc, (char)next_char); \ + } \ + } \ + } while (0) + + +#define assert_type_isnt_string_null_or_bool(jc) \ + assert(jc->type != JSON_T_FALSE); \ + assert(jc->type != JSON_T_TRUE); \ + assert(jc->type != JSON_T_NULL); \ + assert(jc->type != JSON_T_STRING) + + +int +JSON_parser_char(JSON_parser jc, int next_char) +{ +/* + After calling new_JSON_parser, call this function for each character (or + partial character) in your JSON text. It can accept UTF-8, UTF-16, or + UTF-32. It returns true if things are looking ok so far. If it rejects the + text, it returns false. +*/ + int next_class, next_state; + +/* + Determine the character's class. +*/ + if (next_char < 0) { + return false; + } + if (next_char >= 128) { + next_class = C_ETC; + } else { + next_class = ascii_class[next_char]; + if (next_class <= __) { + return false; + } + } + + add_char_to_parse_buffer(jc, next_char, next_class); + +/* + Get the next state from the state transition table. +*/ + next_state = state_transition_table[jc->state][next_class]; + if (next_state >= 0) { +/* + Change the state. +*/ + jc->state = next_state; + } else { +/* + Or perform one of the actions. 
+*/ + switch (next_state) { +/* Unicode character */ + case UC: + if(!decode_unicode_char(jc)) { + return false; + } + /* check if we need to read a second UTF-16 char */ + if (jc->utf16_high_surrogate) { + jc->state = D1; + } else { + jc->state = ST; + } + break; +/* escaped char */ + case EX: + jc->escaped = 1; + jc->state = ES; + break; +/* integer detected by minus */ + case MX: + jc->type = JSON_T_INTEGER; + jc->state = MI; + break; +/* integer detected by zero */ + case ZX: + jc->type = JSON_T_INTEGER; + jc->state = ZE; + break; +/* integer detected by 1-9 */ + case IX: + jc->type = JSON_T_INTEGER; + jc->state = IT; + break; + +/* floating point number detected by exponent*/ + case DE: + assert_type_isnt_string_null_or_bool(jc); + jc->type = JSON_T_FLOAT; + jc->state = E1; + break; + +/* floating point number detected by fraction */ + case DF: + assert_type_isnt_string_null_or_bool(jc); + if (!jc->handle_floats_manually) { +/* + Some versions of strtod (which underlies sscanf) don't support converting + C-locale formated floating point values. 
+*/ + assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.'); + jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point; + } + jc->type = JSON_T_FLOAT; + jc->state = FX; + break; +/* string begin " */ + case SB: + parse_buffer_clear(jc); + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_STRING; + jc->state = ST; + break; + +/* n */ + case NU: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_NULL; + jc->state = N1; + break; +/* f */ + case FA: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_FALSE; + jc->state = F1; + break; +/* t */ + case TR: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_TRUE; + jc->state = T1; + break; + +/* closing comment */ + case CE: + jc->comment = 0; + assert(jc->parse_buffer_count == 0); + assert(jc->type == JSON_T_NONE); + jc->state = jc->before_comment_state; + break; + +/* opening comment */ + case CB: + if (!jc->allow_comments) { + return false; + } + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + assert(jc->parse_buffer_count == 0); + assert(jc->type != JSON_T_STRING); + switch (jc->stack[jc->top]) { + case MODE_ARRAY: + case MODE_OBJECT: + switch(jc->state) { + case VA: + case AR: + jc->before_comment_state = jc->state; + break; + default: + jc->before_comment_state = OK; + break; + } + break; + default: + jc->before_comment_state = jc->state; + break; + } + jc->type = JSON_T_NONE; + jc->state = C1; + jc->comment = 1; + break; +/* empty } */ + case -9: + parse_buffer_clear(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { + return false; + } + if (!pop(jc, MODE_KEY)) { + return false; + } + jc->state = OK; + break; + +/* } */ case -8: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { + return false; + } + if (!pop(jc, MODE_OBJECT)) { + return false; + } + jc->type = JSON_T_NONE; + jc->state = OK; + break; + +/* ] */ 
case -7: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) { + return false; + } + if (!pop(jc, MODE_ARRAY)) { + return false; + } + + jc->type = JSON_T_NONE; + jc->state = OK; + break; + +/* { */ case -6: + parse_buffer_pop_back_char(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) { + return false; + } + if (!push(jc, MODE_KEY)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = OB; + break; + +/* [ */ case -5: + parse_buffer_pop_back_char(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) { + return false; + } + if (!push(jc, MODE_ARRAY)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = AR; + break; + +/* string end " */ case -4: + parse_buffer_pop_back_char(jc); + switch (jc->stack[jc->top]) { + case MODE_KEY: + assert(jc->type == JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = CO; + + if (jc->callback) { + JSON_value value; + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) { + return false; + } + } + parse_buffer_clear(jc); + break; + case MODE_ARRAY: + case MODE_OBJECT: + assert(jc->type == JSON_T_STRING); + if (!parse_parse_buffer(jc)) { + return false; + } + jc->type = JSON_T_NONE; + jc->state = OK; + break; + default: + return false; + } + break; + +/* , */ case -3: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + switch (jc->stack[jc->top]) { + case MODE_OBJECT: +/* + A comma causes a flip from object mode to key mode. 
+*/ + if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { + return false; + } + assert(jc->type != JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = KE; + break; + case MODE_ARRAY: + assert(jc->type != JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = VA; + break; + default: + return false; + } + break; + +/* : */ case -2: +/* + A colon causes a flip from key mode to object mode. +*/ + parse_buffer_pop_back_char(jc); + if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = VA; + break; +/* + Bad action. +*/ + default: + return false; + } + } + return true; +} + + +int +JSON_parser_done(JSON_parser jc) +{ + const int result = jc->state == OK && pop(jc, MODE_DONE); + + return result; +} + + +int JSON_parser_is_legal_white_space_string(const char* s) +{ + int c, char_class; + + if (s == NULL) { + return false; + } + + for (; *s; ++s) { + c = *s; + + if (c < 0 || c >= 128) { + return false; + } + + char_class = ascii_class[c]; + + if (char_class != C_SPACE && char_class != C_WHITE) { + return false; + } + } + + return true; +} + + + +void init_JSON_config(JSON_config* config) +{ + if (config) { + memset(config, 0, sizeof(*config)); + + config->depth = JSON_PARSER_STACK_SIZE - 1; + } +} diff --git a/decoder/JSON_parser.h b/decoder/JSON_parser.h new file mode 100644 index 00000000..ceb5b24b --- /dev/null +++ b/decoder/JSON_parser.h @@ -0,0 +1,152 @@ +#ifndef JSON_PARSER_H +#define JSON_PARSER_H + +/* JSON_parser.h */ + + +#include + +/* Windows DLL stuff */ +#ifdef _WIN32 +# ifdef JSON_PARSER_DLL_EXPORTS +# define JSON_PARSER_DLL_API __declspec(dllexport) +# else +# define JSON_PARSER_DLL_API __declspec(dllimport) +# endif +#else +# define JSON_PARSER_DLL_API +#endif + +/* Determine the integer type use to parse non-floating point numbers */ +#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1 +typedef long long JSON_int_t; +#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld" +#define 
JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld" +#else +typedef long JSON_int_t; +#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld" +#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld" +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum +{ + JSON_T_NONE = 0, + JSON_T_ARRAY_BEGIN, // 1 + JSON_T_ARRAY_END, // 2 + JSON_T_OBJECT_BEGIN, // 3 + JSON_T_OBJECT_END, // 4 + JSON_T_INTEGER, // 5 + JSON_T_FLOAT, // 6 + JSON_T_NULL, // 7 + JSON_T_TRUE, // 8 + JSON_T_FALSE, // 9 + JSON_T_STRING, // 10 + JSON_T_KEY, // 11 + JSON_T_MAX // 12 +} JSON_type; + +typedef struct JSON_value_struct { + union { + JSON_int_t integer_value; + + long double float_value; + + struct { + const char* value; + size_t length; + } str; + } vu; +} JSON_value; + +typedef struct JSON_parser_struct* JSON_parser; + +/*! \brief JSON parser callback + + \param ctx The pointer passed to new_JSON_parser. + \param type An element of JSON_type but not JSON_T_NONE. + \param value A representation of the parsed value. This parameter is NULL for + JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END, + JSON_T_NULL, JSON_T_TRUE, and SON_T_FALSE. String values are always returned + as zero-terminated C strings. + + \return Non-zero if parsing should continue, else zero. +*/ +typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value); + + +/*! \brief The structure used to configure a JSON parser object + + \param depth If negative, the parser can parse arbitrary levels of JSON, otherwise + the depth is the limit + \param Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity. + \param Callback context. This parameter may be NULL. + \param depth. Specifies the levels of nested JSON to allow. Negative numbers yield unlimited nesting. + \param allowComments. To allow C style comments in JSON, set to non-zero. + \param handleFloatsManually. 
To decode floating point numbers manually set this parameter to non-zero. + + \return The parser object. +*/ +typedef struct { + JSON_parser_callback callback; + void* callback_ctx; + int depth; + int allow_comments; + int handle_floats_manually; +} JSON_config; + + +/*! \brief Initializes the JSON parser configuration structure to default values. + + The default configuration is + - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see json_parser.c) + - no parsing, just checking for JSON syntax + - no comments + + \param config. Used to configure the parser. +*/ +JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config); + +/*! \brief Create a JSON parser object + + \param config. Used to configure the parser. Set to NULL to use the default configuration. + See init_JSON_config + + \return The parser object. +*/ +JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config); + +/*! \brief Destroy a previously created JSON parser object. */ +JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc); + +/*! \brief Parse a character. + + \return Non-zero, if all characters passed to this function are part of are valid JSON. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char); + +/*! \brief Finalize parsing. + + Call this method once after all input characters have been consumed. + + \return Non-zero, if all parsed characters are valid JSON, zero otherwise. +*/ +JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc); + +/*! \brief Determine if a given string is valid JSON white space + + \return Non-zero if the string is valid, zero otherwise. 
+*/ +JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s); + + +#ifdef __cplusplus +} +#endif + + +#endif /* JSON_PARSER_H */ diff --git a/decoder/Makefile.am b/decoder/Makefile.am new file mode 100644 index 00000000..f3843102 --- /dev/null +++ b/decoder/Makefile.am @@ -0,0 +1,69 @@ +bin_PROGRAMS = \ + dict_test \ + weights_test \ + trule_test \ + hg_test \ + ff_test \ + parser_test \ + grammar_test \ + cdec \ + small_vector_test + +cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc +small_vector_test_SOURCES = small_vector_test.cc +small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +parser_test_SOURCES = parser_test.cc +parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +dict_test_SOURCES = dict_test.cc +dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +ff_test_SOURCES = ff_test.cc +ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +grammar_test_SOURCES = grammar_test.cc +grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +hg_test_SOURCES = hg_test.cc +hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +trule_test_SOURCES = trule_test.cc +trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +weights_test_SOURCES = weights_test.cc +weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a + +LDADD = libcdec.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) +AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) -lz + +noinst_LIBRARIES = libcdec.a + +libcdec_a_SOURCES = \ + fst_translator.cc \ + csplit.cc \ + scfg_translator.cc \ + hg.cc \ + hg_io.cc \ + hg_intersect.cc \ + viterbi.cc \ + lattice.cc \ + aligner.cc \ + gzstream.cc \ + apply_models.cc \ + earley_composer.cc \ + phrasetable_fst.cc \ + sparse_vector.cc \ + trule.cc \ + filelib.cc \ + stringlib.cc \ + fdict.cc \ + tdict.cc \ + weights.cc \ + ttables.cc \ + ff.cc \ + ff_lm.cc \ + ff_wordalign.cc \ + ff_csplit.cc \ + freqdict.cc \ + 
lexcrf.cc \ + bottom_up_parser.cc \ + phrasebased_translator.cc \ + JSON_parser.c \ + json_parse.cc \ + grammar.cc diff --git a/decoder/aligner.cc b/decoder/aligner.cc new file mode 100644 index 00000000..d9d067e5 --- /dev/null +++ b/decoder/aligner.cc @@ -0,0 +1,204 @@ +#include "aligner.h" + +#include "array2d.h" +#include "hg.h" +#include "inside_outside.h" +#include + +using namespace std; + +struct EdgeCoverageInfo { + set src_indices; + set trg_indices; +}; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + while (i < al.size()) { + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); + i = 0; + while (i < al.size()) { + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignerTools::SerializePharaohFormat(const Array2D& alignment, ostream* out) { + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) (*out) << ' '; else need_space = true; + (*out) << i << '-' << j; + } + (*out) << endl; +} + +// compute the coverage vectors of each edge +// prereq: all derivations yield the same string pair +void ComputeCoverages(const 
Hypergraph& g, + vector* pcovs) { + for (int i = 0; i < g.edges_.size(); ++i) { + const Hypergraph::Edge& edge = g.edges_[i]; + EdgeCoverageInfo& cov = (*pcovs)[i]; + // no words + if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) + continue; + // aligned to NULL (crf ibm variant only) + if (edge.prev_i_ == -1 || edge.i_ == -1) + continue; + assert(edge.j_ >= 0); + assert(edge.prev_j_ >= 0); + if (edge.Arity() == 0) { + for (int k = edge.i_; k < edge.j_; ++k) + cov.trg_indices.insert(k); + for (int k = edge.prev_i_; k < edge.prev_j_; ++k) + cov.src_indices.insert(k); + } else { + // note: this code, which handles mixed NT and terminal + // rules assumes that nodes uniquely define a src and trg + // span. + int k = edge.prev_i_; + int j = 0; + const vector& f = edge.rule_->e(); // rules are inverted + while (k < edge.prev_j_) { + if (f[j] > 0) { + cov.src_indices.insert(k); + // cerr << "src: " << k << endl; + ++k; + ++j; + } else { + const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]]; + assert(tailnode.in_edges_.size() > 0); + // any edge will do: + const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; + //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span + k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span + ++j; + } + } + int tc = 0; + const vector& e = edge.rule_->f(); // rules are inverted + k = edge.i_; + j = 0; + // cerr << edge.rule_->AsString() << endl; + // cerr << "i=" << k << " j=" << edge.j_ << endl; + while (k < edge.j_) { + //cerr << " k=" << k << endl; + if (e[j] > 0) { + cov.trg_indices.insert(k); + // cerr << "trg: " << k << endl; + ++k; + ++j; + } else { + assert(tc < edge.tail_nodes_.size()); + const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[tc]]; + assert(tailnode.in_edges_.size() > 0); + // any edge will do: + const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; + // cerr << "t skip " << (rep_edge.j_ - rep_edge.i_) << endl; // src 
span + k += (rep_edge.j_ - rep_edge.i_); // src span + ++j; + ++tc; + } + } + //abort(); + } + } +} + +void AlignerTools::WriteAlignment(const string& input, + const Lattice& ref, + const Hypergraph& g, + bool map_instead_of_viterbi) { + if (!map_instead_of_viterbi) { + assert(!"not implemented!"); + } + vector edge_posteriors(g.edges_.size()); + { + SparseVector posts; + InsideOutside, TransitionEventWeightFunction>(g, &posts); + for (int i = 0; i < edge_posteriors.size(); ++i) + edge_posteriors[i] = posts[i]; + } + vector edge2cov(g.edges_.size()); + ComputeCoverages(g, &edge2cov); + + Lattice src; + // currently only dealing with src text, even if the + // model supports lattice translation (which it probably does) + LatticeTools::ConvertTextToLattice(input, &src); + // TODO assert that src is a "real lattice" + + Array2D align(src.size(), ref.size(), prob_t::Zero()); + for (int c = 0; c < g.edges_.size(); ++c) { + const prob_t& p = edge_posteriors[c]; + const EdgeCoverageInfo& eci = edge2cov[c]; + for (set::const_iterator si = eci.src_indices.begin(); + si != eci.src_indices.end(); ++si) { + for (set::const_iterator ti = eci.trg_indices.begin(); + ti != eci.trg_indices.end(); ++ti) { + align(*si, *ti) += p; + } + } + } + prob_t threshold(0.9); + const bool use_soft_threshold = true; // TODO configure + + Array2D grid(src.size(), ref.size(), false); + for (int j = 0; j < ref.size(); ++j) { + if (use_soft_threshold) { + threshold = prob_t::Zero(); + for (int i = 0; i < src.size(); ++i) + if (align(i, j) > threshold) threshold = align(i, j); + //threshold *= prob_t(0.99); + } + for (int i = 0; i < src.size(); ++i) + grid(i, j) = align(i, j) >= threshold; + } + cerr << align << endl; + cerr << grid << endl; + SerializePharaohFormat(grid, &cout); +}; + diff --git a/decoder/aligner.h b/decoder/aligner.h new file mode 100644 index 00000000..970c72f2 --- /dev/null +++ b/decoder/aligner.h @@ -0,0 +1,23 @@ +#ifndef _ALIGNER_H_ + +#include +#include +#include +#include 
"array2d.h" +#include "lattice.h" + +class Hypergraph; + +struct AlignerTools { + static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); + + // assumption: g contains derivations of input/ref and + // ONLY input/ref. + static void WriteAlignment(const std::string& input, + const Lattice& ref, + const Hypergraph& g, + bool map_instead_of_viterbi = true); +}; + +#endif diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc new file mode 100644 index 00000000..b1d002f4 --- /dev/null +++ b/decoder/apply_models.cc @@ -0,0 +1,344 @@ +#include "apply_models.h" + +#include +#include +#include +#include + +#include + +#include "hg.h" +#include "ff.h" + +using namespace std; +using namespace std::tr1; + +struct Candidate; +typedef SmallVector JVector; +typedef vector CandidateHeap; +typedef vector CandidateList; + +// life cycle: candidates are created, placed on the heap +// and retrieved by their estimated cost, when they're +// retrieved, they're incorporated into the +LM hypergraph +// where they also know the head node index they are +// attached to. After they are added to the +LM hypergraph +// vit_prob_ and est_prob_ fields may be updated as better +// derivations are found (this happens since the successor's +// of derivation d may have a better score- they are +// explored lazily). However, the updates don't happen +// when a candidate is in the heap so maintaining the heap +// property is not an issue. 
+struct Candidate { + int node_index_; // -1 until incorporated + // into the +LM forest + const Hypergraph::Edge* in_edge_; // in -LM forest + Hypergraph::Edge out_edge_; + string state_; + const JVector j_; + prob_t vit_prob_; // these are fixed until the cand + // is popped, then they may be updated + prob_t est_prob_; + + Candidate(const Hypergraph::Edge& e, + const JVector& j, + const Hypergraph& out_hg, + const vector& D, + const SentenceMetadata& smeta, + const ModelSet& models, + bool is_goal) : + node_index_(-1), + in_edge_(&e), + j_(j) { + InitializeCandidate(out_hg, smeta, D, models, is_goal); + } + + // used to query uniqueness + Candidate(const Hypergraph::Edge& e, + const JVector& j) : in_edge_(&e), j_(j) {} + + bool IsIncorporatedIntoHypergraph() const { + return node_index_ >= 0; + } + + void InitializeCandidate(const Hypergraph& out_hg, + const SentenceMetadata& smeta, + const vector >& D, + const ModelSet& models, + const bool is_goal) { + const Hypergraph::Edge& in_edge = *in_edge_; + out_edge_.rule_ = in_edge.rule_; + out_edge_.feature_values_ = in_edge.feature_values_; + out_edge_.i_ = in_edge.i_; + out_edge_.j_ = in_edge.j_; + out_edge_.prev_i_ = in_edge.prev_i_; + out_edge_.prev_j_ = in_edge.prev_j_; + Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; + tail.resize(j_.size()); + prob_t p = prob_t::One(); + // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; + for (int i = 0; i < tail.size(); ++i) { + const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]]; + assert(ant.IsIncorporatedIntoHypergraph()); + tail[i] = ant.node_index_; + p *= ant.vit_prob_; + } + prob_t edge_estimate = prob_t::One(); + if (is_goal) { + assert(tail.size() == 1); + const string& ant_state = out_hg.nodes_[tail.front()].state_; + models.AddFinalFeatures(ant_state, &out_edge_); + } else { + models.AddFeaturesToEdge(smeta, out_hg, &out_edge_, &state_, &edge_estimate); + } + vit_prob_ = out_edge_.edge_prob_ * p; + est_prob_ = vit_prob_ 
* edge_estimate; + } +}; + +ostream& operator<<(ostream& os, const Candidate& cand) { + os << "CAND["; + if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } + else { os << "+LM_node=" << cand.node_index_; } + os << " edge=" << cand.in_edge_->id_; + os << " j=<"; + for (int i = 0; i < cand.j_.size(); ++i) + os << (i==0 ? "" : " ") << cand.j_[i]; + os << "> vit=" << log(cand.vit_prob_); + os << " est=" << log(cand.est_prob_); + return os << ']'; +} + +struct HeapCandCompare { + bool operator()(const Candidate* l, const Candidate* r) const { + return l->est_prob_ < r->est_prob_; + } +}; + +struct EstProbSorter { + bool operator()(const Candidate* l, const Candidate* r) const { + return l->est_prob_ > r->est_prob_; + } +}; + +// the same candidate can be added multiple times if +// j is multidimensional (if you're going NW in Manhattan, you +// can first go north, then west, or you can go west then north) +// this is a hash function on the relevant variables from +// Candidate to enforce this. 
+struct CandidateUniquenessHash { + size_t operator()(const Candidate* c) const { + size_t x = 5381; + x = ((x << 5) + x) ^ c->in_edge_->id_; + for (int i = 0; i < c->j_.size(); ++i) + x = ((x << 5) + x) ^ c->j_[i]; + return x; + } +}; + +struct CandidateUniquenessEquals { + bool operator()(const Candidate* a, const Candidate* b) const { + return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); + } +}; + +typedef unordered_set UniqueCandidateSet; +typedef unordered_map > State2Node; + +class CubePruningRescorer { + +public: + CubePruningRescorer(const ModelSet& m, + const SentenceMetadata& sm, + const Hypergraph& i, + int pop_limit, + Hypergraph* o) : + models(m), + smeta(sm), + in(i), + out(*o), + D(in.nodes_.size()), + pop_limit_(pop_limit) { + cerr << " Applying feature functions (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; + } + + void Apply() { + int num_nodes = in.nodes_.size(); + int goal_id = num_nodes - 1; + int pregoal = goal_id - 1; + int every = 1; + if (num_nodes > 100) every = 10; + assert(in.nodes_[pregoal].out_edges_.size() == 1); + cerr << " "; + for (int i = 0; i < in.nodes_.size(); ++i) { + if (i % every == 0) cerr << '.'; + KBest(i, i == goal_id); + } + cerr << endl; + cerr << " Best path: " << log(D[goal_id].front()->vit_prob_) + << "\t" << log(D[goal_id].front()->est_prob_) << endl; + out.PruneUnreachable(D[goal_id].front()->node_index_); + FreeAll(); + } + + private: + void FreeAll() { + for (int i = 0; i < D.size(); ++i) { + CandidateList& D_i = D[i]; + for (int j = 0; j < D_i.size(); ++j) + delete D_i[j]; + } + D.clear(); + } + + void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { + Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_); + new_edge->feature_values_ = item->out_edge_.feature_values_; + new_edge->edge_prob_ = item->out_edge_.edge_prob_; + new_edge->i_ = item->out_edge_.i_; + new_edge->j_ = item->out_edge_.j_; + new_edge->prev_i_ = 
item->out_edge_.prev_i_; + new_edge->prev_j_ = item->out_edge_.prev_j_; + Candidate*& o_item = (*s2n)[item->state_]; + if (!o_item) o_item = item; + + int& node_id = o_item->node_index_; + if (node_id < 0) { + Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_, item->state_); + node_id = new_node->id_; + } + Hypergraph::Node* node = &out.nodes_[node_id]; + out.ConnectEdgeToHeadNode(new_edge, node); + + // update candidate if we have a better derivation + // note: the difference between the vit score and the estimated + // score is the same for all items with a common residual DP + // state + if (item->vit_prob_ > o_item->vit_prob_) { + assert(o_item->state_ == item->state_); // sanity check! + o_item->est_prob_ = item->est_prob_; + o_item->vit_prob_ = item->vit_prob_; + } + if (item != o_item) freelist->push_back(item); + } + + void KBest(const int vert_index, const bool is_goal) { + // cerr << "KBest(" << vert_index << ")\n"; + CandidateList& D_v = D[vert_index]; + assert(D_v.empty()); + const Hypergraph::Node& v = in.nodes_[vert_index]; + // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; + const vector& in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + UniqueCandidateSet unique_cands; + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, out, D, smeta, models, is_goal)); + assert(unique_cands.insert(cand.back()).second); // these should all be unique! 
+ } +// cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); + // cerr << "POPPED: " << *item << endl; + PushSucc(*item, is_goal, &cand, &unique_cands); + IncorporateIntoPlusLMForest(item, &state2node, &freelist); + ++pops; + } + D_v.resize(state2node.size()); + int c = 0; + for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i) + D_v[c++] = i->second; + sort(D_v.begin(), D_v.end(), EstProbSorter()); + // cerr << " expanded to " << D_v.size() << " nodes\n"; + + for (int i = 0; i < cand.size(); ++i) + delete cand[i]; + // freelist is necessary since even after an item merged, it still stays in + // the unique set so it can't be deleted til now + for (int i = 0; i < freelist.size(); ++i) + delete freelist[i]; + } + + void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) { + CandidateHeap& cand = *pcand; + for (int i = 0; i < item.j_.size(); ++i) { + JVector j = item.j_; + ++j[i]; + if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { + Candidate query_unique(*item.in_edge_, j); + if (cs->count(&query_unique) == 0) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, smeta, models, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check + } + } + } + } + + const ModelSet& models; + const SentenceMetadata& smeta; + const Hypergraph& in; + Hypergraph& out; + + vector D; // maps nodes in in-HG to the + // equivalent nodes (many due to state + // splits) in the out-HG. 
+ const int pop_limit_; +}; + +struct NoPruningRescorer { + NoPruningRescorer(const ModelSet& m, const Hypergraph& i, Hypergraph* o) : + models(m), + in(i), + out(*o) { + cerr << " Rescoring forest (full intersection)\n"; + } + + void RescoreNode(const int node_num, const bool is_goal) { + } + + void Apply() { + int num_nodes = in.nodes_.size(); + int goal_id = num_nodes - 1; + int pregoal = goal_id - 1; + int every = 1; + if (num_nodes > 100) every = 10; + assert(in.nodes_[pregoal].out_edges_.size() == 1); + cerr << " "; + for (int i = 0; i < in.nodes_.size(); ++i) { + if (i % every == 0) cerr << '.'; + RescoreNode(i, i == goal_id); + } + cerr << endl; + } + + private: + const ModelSet& models; + const Hypergraph& in; + Hypergraph& out; +}; + +// each node in the graph has one of these, it keeps track of +void ApplyModelSet(const Hypergraph& in, + const SentenceMetadata& smeta, + const ModelSet& models, + const PruningConfiguration& config, + Hypergraph* out) { + int pl = config.pop_limit; + if (pl > 100 && in.nodes_.size() > 80000) { + cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n"; + pl = 30; + } + CubePruningRescorer ma(models, smeta, in, pl, out); + ma.Apply(); +} + diff --git a/decoder/apply_models.h b/decoder/apply_models.h new file mode 100644 index 00000000..08fce037 --- /dev/null +++ b/decoder/apply_models.h @@ -0,0 +1,20 @@ +#ifndef _APPLY_MODELS_H_ +#define _APPLY_MODELS_H_ + +struct ModelSet; +struct Hypergraph; +struct SentenceMetadata; + +struct PruningConfiguration { + const int algorithm; // 0 = full intersection, 1 = cube pruning + const int pop_limit; // max number of pops off the heap at each node + explicit PruningConfiguration(int k) : algorithm(1), pop_limit(k) {} +}; + +void ApplyModelSet(const Hypergraph& in, + const SentenceMetadata& smeta, + const ModelSet& models, + const PruningConfiguration& config, + Hypergraph* out); + +#endif diff --git a/decoder/array2d.h b/decoder/array2d.h new file mode 100644 index 
00000000..e63eda0d --- /dev/null +++ b/decoder/array2d.h @@ -0,0 +1,172 @@ +#ifndef ARRAY2D_H_ +#define ARRAY2D_H_ + +#include +#include +#include +#include +#include + +template +class Array2D { + public: + typedef typename std::vector::reference reference; + typedef typename std::vector::const_reference const_reference; + typedef typename std::vector::iterator iterator; + typedef typename std::vector::const_iterator const_iterator; + Array2D() : width_(0), height_(0) {} + Array2D(int w, int h, const T& d = T()) : + width_(w), height_(h), data_(w*h, d) {} + Array2D(const Array2D& rhs) : + width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {} + bool empty() const { return data_.empty(); } + void resize(int w, int h, const T& d = T()) { + data_.resize(w * h, d); + width_ = w; + height_ = h; + } + const Array2D& operator=(const Array2D& rhs) { + data_ = rhs.data_; + width_ = rhs.width_; + height_ = rhs.height_; + return *this; + } + void fill(const T& v) { data_.assign(data_.size(), v); } + int width() const { return width_; } + int height() const { return height_; } + reference operator()(int i, int j) { + return data_[offset(i, j)]; + } + void clear() { data_.clear(); width_=0; height_=0; } + const_reference operator()(int i, int j) const { + return data_[offset(i, j)]; + } + iterator begin_col(int j) { + return data_.begin() + offset(0,j); + } + const_iterator begin_col(int j) const { + return data_.begin() + offset(0,j); + } + iterator end_col(int j) { + return data_.begin() + offset(0,j) + width_; + } + const_iterator end_col(int j) const { + return data_.begin() + offset(0,j) + width_; + } + iterator end() { return data_.end(); } + const_iterator end() const { return data_.end(); } + const Array2D& operator*=(const T& x) { + std::transform(data_.begin(), data_.end(), data_.begin(), + std::bind2nd(std::multiplies(), x)); + } + const Array2D& operator/=(const T& x) { + std::transform(data_.begin(), data_.end(), data_.begin(), + 
std::bind2nd(std::divides(), x)); + } + const Array2D& operator+=(const Array2D& m) { + std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::plus()); + } + const Array2D& operator-=(const Array2D& m) { + std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::minus()); + } + + private: + inline int offset(int i, int j) const { + assert(i data_; +}; + +template +Array2D operator*(const Array2D& l, const T& scalar) { + Array2D res(l); + res *= scalar; + return res; +} + +template +Array2D operator*(const T& scalar, const Array2D& l) { + Array2D res(l); + res *= scalar; + return res; +} + +template +Array2D operator/(const Array2D& l, const T& scalar) { + Array2D res(l); + res /= scalar; + return res; +} + +template +Array2D operator+(const Array2D& l, const Array2D& r) { + Array2D res(l); + res += r; + return res; +} + +template +Array2D operator-(const Array2D& l, const Array2D& r) { + Array2D res(l); + res -= r; + return res; +} + +template +inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { + for (int i=0; i& m) { + os << ' '; + for (int j=0; j >& m) { + os << ' '; + for (int j=0; j& ar = m(i,j); + for (int k=0; k + +#include "hg.h" +#include "array2d.h" +#include "tdict.h" + +using namespace std; + +class ActiveChart; +class PassiveChart { + public: + PassiveChart(const string& goal, + const vector& grammars, + const Lattice& input, + Hypergraph* forest); + ~PassiveChart(); + + inline const vector& operator()(int i, int j) const { return chart_(i,j); } + bool Parse(); + inline int size() const { return chart_.width(); } + inline bool GoalFound() const { return goal_idx_ >= 0; } + inline int GetGoalIndex() const { return goal_idx_; } + + private: + void ApplyRules(const int i, + const int j, + const RuleBin* rules, + const Hypergraph::TailNodeVector& tail, + const float lattice_cost); + + void ApplyRule(const int i, + const int j, + const TRulePtr& r, + const Hypergraph::TailNodeVector& 
ant_nodes, + const float lattice_cost); + + void ApplyUnaryRules(const int i, const int j); + + const vector& grammars_; + const Lattice& input_; + Hypergraph* forest_; + Array2D > chart_; // chart_(i,j) is the list of nodes derived spanning i,j + typedef map Cat2NodeMap; + Array2D nodemap_; + vector act_chart_; + const WordID goal_cat_; // category that is being searched for at [0,n] + TRulePtr goal_rule_; + int goal_idx_; // index of goal node, if found + const int lc_fid_; + + static WordID kGOAL; // [Goal] +}; + +WordID PassiveChart::kGOAL = 0; + +class ActiveChart { + public: + ActiveChart(const Hypergraph* hg, const PassiveChart& psv_chart) : + hg_(hg), + act_chart_(psv_chart.size(), psv_chart.size()), psv_chart_(psv_chart) {} + + struct ActiveItem { + ActiveItem(const GrammarIter* g, const Hypergraph::TailNodeVector& a, float lcost) : + gptr_(g), ant_nodes_(a), lattice_cost(lcost) {} + explicit ActiveItem(const GrammarIter* g) : + gptr_(g), ant_nodes_(), lattice_cost(0.0) {} + + void ExtendTerminal(int symbol, float src_cost, vector* out_cell) const { + const GrammarIter* ni = gptr_->Extend(symbol); + if (ni) out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost)); + } + void ExtendNonTerminal(const Hypergraph* hg, int node_index, vector* out_cell) const { + int symbol = hg->nodes_[node_index].cat_; + const GrammarIter* ni = gptr_->Extend(symbol); + if (!ni) return; + Hypergraph::TailNodeVector na(ant_nodes_.size() + 1); + for (int i = 0; i < ant_nodes_.size(); ++i) + na[i] = ant_nodes_[i]; + na[ant_nodes_.size()] = node_index; + out_cell->push_back(ActiveItem(ni, na, lattice_cost)); + } + + const GrammarIter* gptr_; + Hypergraph::TailNodeVector ant_nodes_; + float lattice_cost; // TODO? 
use SparseVector + }; + + inline const vector& operator()(int i, int j) const { return act_chart_(i,j); } + void SeedActiveChart(const Grammar& g) { + int size = act_chart_.width(); + for (int i = 0; i < size; ++i) + if (g.HasRuleForSpan(i,i,0)) + act_chart_(i,i).push_back(ActiveItem(g.GetRoot())); + } + + void ExtendActiveItems(int i, int k, int j) { + //cerr << " LOOK(" << i << "," << k << ") for completed items in (" << k << "," << j << ")\n"; + vector& cell = act_chart_(i,j); + const vector& icell = act_chart_(i,k); + const vector& idxs = psv_chart_(k, j); + //if (!idxs.empty()) { cerr << "FOUND IN (" << k << "," << j << ")\n"; } + for (vector::const_iterator di = icell.begin(); di != icell.end(); ++di) { + for (vector::const_iterator ni = idxs.begin(); ni != idxs.end(); ++ni) { + di->ExtendNonTerminal(hg_, *ni, &cell); + } + } + } + + void AdvanceDotsForAllItemsInCell(int i, int j, const vector >& input) { + //cerr << "ADVANCE(" << i << "," << j << ")\n"; + for (int k=i+1; k < j; ++k) + ExtendActiveItems(i, k, j); + + const vector& out_arcs = input[j-1]; + for (vector::const_iterator ai = out_arcs.begin(); + ai != out_arcs.end(); ++ai) { + const WordID& f = ai->label; + const double& c = ai->cost; + const int& len = ai->dist2next; + //VLOG(1) << "F: " << TD::Convert(f) << endl; + const vector& ec = act_chart_(i, j-1); + for (vector::const_iterator di = ec.begin(); di != ec.end(); ++di) + di->ExtendTerminal(f, c, &act_chart_(i, j + len - 1)); + } + } + + private: + const Hypergraph* hg_; + Array2D > act_chart_; + const PassiveChart& psv_chart_; +}; + +PassiveChart::PassiveChart(const string& goal, + const vector& grammars, + const Lattice& input, + Hypergraph* forest) : + grammars_(grammars), + input_(input), + forest_(forest), + chart_(input.size()+1, input.size()+1), + nodemap_(input.size()+1, input.size()+1), + goal_cat_(TD::Convert(goal) * -1), + goal_rule_(new TRule("[Goal] ||| [" + goal + ",1] ||| [" + goal + ",1]")), + goal_idx_(-1), + 
lc_fid_(FD::Convert("LatticeCost")) { + act_chart_.resize(grammars_.size()); + for (int i = 0; i < grammars_.size(); ++i) + act_chart_[i] = new ActiveChart(forest, *this); + if (!kGOAL) kGOAL = TD::Convert("Goal") * -1; + cerr << " Goal category: [" << goal << ']' << endl; +} + +void PassiveChart::ApplyRule(const int i, + const int j, + const TRulePtr& r, + const Hypergraph::TailNodeVector& ant_nodes, + const float lattice_cost) { + Hypergraph::Edge* new_edge = forest_->AddEdge(r, ant_nodes); + new_edge->prev_i_ = r->prev_i; + new_edge->prev_j_ = r->prev_j; + new_edge->i_ = i; + new_edge->j_ = j; + new_edge->feature_values_ = r->GetFeatureValues(); + if (lattice_cost) + new_edge->feature_values_.set_value(lc_fid_, lattice_cost); + Cat2NodeMap& c2n = nodemap_(i,j); + const bool is_goal = (r->GetLHS() == kGOAL); + const Cat2NodeMap::iterator ni = c2n.find(r->GetLHS()); + Hypergraph::Node* node = NULL; + if (ni == c2n.end()) { + node = forest_->AddNode(r->GetLHS(), ""); + c2n[r->GetLHS()] = node->id_; + if (is_goal) { + assert(goal_idx_ == -1); + goal_idx_ = node->id_; + } else { + chart_(i,j).push_back(node->id_); + } + } else { + node = &forest_->nodes_[ni->second]; + } + forest_->ConnectEdgeToHeadNode(new_edge, node); +} + +void PassiveChart::ApplyRules(const int i, + const int j, + const RuleBin* rules, + const Hypergraph::TailNodeVector& tail, + const float lattice_cost) { + const int n = rules->GetNumRules(); + for (int k = 0; k < n; ++k) + ApplyRule(i, j, rules->GetIthRule(k), tail, lattice_cost); +} + +void PassiveChart::ApplyUnaryRules(const int i, const int j) { + const vector& nodes = chart_(i,j); // reference is important! 
+ for (int gi = 0; gi < grammars_.size(); ++gi) { + if (!grammars_[gi]->HasRuleForSpan(i,j,input_.Distance(i,j))) continue; + for (int di = 0; di < nodes.size(); ++di) { + const WordID& cat = forest_->nodes_[nodes[di]].cat_; + const vector& unaries = grammars_[gi]->GetUnaryRulesForRHS(cat); + for (int ri = 0; ri < unaries.size(); ++ri) { + // cerr << "At (" << i << "," << j << "): applying " << unaries[ri]->AsString() << endl; + const Hypergraph::TailNodeVector ant(1, nodes[di]); + ApplyRule(i, j, unaries[ri], ant, 0); // may update nodes + } + } + } +} + +bool PassiveChart::Parse() { + forest_->nodes_.reserve(input_.size() * input_.size() * 2); + forest_->edges_.reserve(input_.size() * input_.size() * 1000); // TODO: reservation?? + goal_idx_ = -1; + for (int gi = 0; gi < grammars_.size(); ++gi) + act_chart_[gi]->SeedActiveChart(*grammars_[gi]); + + cerr << " "; + for (int l=1; lAdvanceDotsForAllItemsInCell(i, j, input_); + + const vector& cell = (*act_chart_[gi])(i,j); + for (vector::const_iterator ai = cell.begin(); + ai != cell.end(); ++ai) { + const RuleBin* rules = (ai->gptr_->GetRules()); + if (!rules) continue; + ApplyRules(i, j, rules, ai->ant_nodes_, ai->lattice_cost); + } + } + } + ApplyUnaryRules(i,j); + + for (int gi = 0; gi < grammars_.size(); ++gi) { + const Grammar& g = *grammars_[gi]; + // deal with non-terminals that were just proved + if (g.HasRuleForSpan(i, j, input_.Distance(i,j))) + act_chart_[gi]->ExtendActiveItems(i, i, j); + } + } + const vector& dh = chart_(0, input_.size()); + for (int di = 0; di < dh.size(); ++di) { + const Hypergraph::Node& node = forest_->nodes_[dh[di]]; + if (node.cat_ == goal_cat_) { + Hypergraph::TailNodeVector ant(1, node.id_); + ApplyRule(0, input_.size(), goal_rule_, ant, 0); + } + } + } + cerr << endl; + + if (GoalFound()) + forest_->PruneUnreachable(forest_->nodes_.size() - 1); + return GoalFound(); +} + +PassiveChart::~PassiveChart() { + for (int i = 0; i < act_chart_.size(); ++i) + delete act_chart_[i]; +} + 
+ExhaustiveBottomUpParser::ExhaustiveBottomUpParser( + const string& goal_sym, + const vector& grammars) : + goal_sym_(goal_sym), + grammars_(grammars) {} + +bool ExhaustiveBottomUpParser::Parse(const Lattice& input, + Hypergraph* forest) const { + PassiveChart chart(goal_sym_, grammars_, input, forest); + return chart.Parse(); +} diff --git a/decoder/bottom_up_parser.h b/decoder/bottom_up_parser.h new file mode 100644 index 00000000..546bfb54 --- /dev/null +++ b/decoder/bottom_up_parser.h @@ -0,0 +1,27 @@ +#ifndef _BOTTOM_UP_PARSER_H_ +#define _BOTTOM_UP_PARSER_H_ + +#include +#include + +#include "lattice.h" +#include "grammar.h" + +class Hypergraph; + +class ExhaustiveBottomUpParser { + public: + ExhaustiveBottomUpParser(const std::string& goal_sym, + const std::vector& grammars); + + // returns true if goal reached spanning the full input + // forest contains the full (i.e., unpruned) parse forest + bool Parse(const Lattice& input, + Hypergraph* forest) const; + + private: + const std::string goal_sym_; + const std::vector grammars_; +}; + +#endif diff --git a/decoder/cdec.cc b/decoder/cdec.cc new file mode 100644 index 00000000..6185c79b --- /dev/null +++ b/decoder/cdec.cc @@ -0,0 +1,507 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include "timing_stats.h" +#include "translator.h" +#include "phrasebased_translator.h" +#include "aligner.h" +#include "stringlib.h" +#include "forest_writer.h" +#include "hg_io.h" +#include "filelib.h" +#include "sampler.h" +#include "sparse_vector.h" +#include "lexcrf.h" +#include "csplit.h" +#include "weights.h" +#include "tdict.h" +#include "ff.h" +#include "ff_factory.h" +#include "hg_intersect.h" +#include "apply_models.h" +#include "viterbi.h" +#include "kbest.h" +#include "inside_outside.h" +#include "exp_semiring.h" +#include "sentence_metadata.h" + +using namespace std; +using namespace std::tr1; +using boost::shared_ptr; +namespace po = boost::program_options; + +// some globals ... 
+boost::shared_ptr > rng; + +namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); } + +void ShowBanner() { + cerr << "cdec v1.0 (c) 2009 by Chris Dyer\n"; +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("formalism,f",po::value(),"Translation formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting)") + ("input,i",po::value()->default_value("-"),"Source file") + ("grammar,g",po::value >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") + ("weights,w",po::value(),"Feature weights file") + ("feature_function,F",po::value >()->composing(), "Additional feature function(s) (-L for list)") + ("list_feature_functions,L","List available feature functions") + ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") + ("k_best,k",po::value(),"Extract the k best derivations") + ("unique_k_best,r", "Unique k-best translation list") + ("aligner,a", "Run as a word/phrase aligner (src & ref required)") + ("cubepruning_pop_limit,K",po::value()->default_value(200), "Max number of pops from the candidate heap at each node") + ("goal",po::value()->default_value("S"),"Goal symbol (SCFG & FST)") + ("scfg_extra_glue_grammar", po::value(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)") + ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. 
by default the SCFG decoder adds Hiero glue rules)") + ("scfg_default_nt,d",po::value()->default_value("X"),"Default non-terminal symbol in SCFG") + ("scfg_max_span_limit,S",po::value()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)") + ("show_tree_structure,T", "Show the Viterbi derivation structure") + ("show_expected_length", "Show the expected translation length under the model") + ("show_partition,z", "Compute and show the partition (inside score)") + ("beam_prune", po::value(), "Prune paths from +LM forest") + ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") + ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") + ("extract_rules", po::value(), "Extract the rules used in translation (de-duped) to this file") + ("graphviz","Show (constrained) translation forest in GraphViz format") + ("max_translation_beam,x", po::value(), "Beam approximation to get max translation from the chart") + ("max_translation_sample,X", po::value(), "Sample the max translation from the chart") + ("pb_max_distortion,D", po::value()->default_value(4), "Phrase-based decoder: maximum distortion") + ("gradient,G","Compute d log p(e|f) / d lambda_i and write to STDOUT (src & ref required)") + ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") + ("vector_format",po::value()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") + ("combine_size,C",po::value()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") + ("forest_output,O",po::value(),"Directory to write forests to") + ("minimal_forests,m","Write minimal forests (excludes Rule information). 
Such forests can be used for ML/MAP training, but not rescoring, etc."); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + const string cfg = (*conf)["config"].as(); + cerr << "Configuration file: " << cfg << endl; + ifstream config(cfg.c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("list_feature_functions")) { + cerr << "Available feature functions (specify with -F):\n"; + global_ff_registry->DisplayList(); + cerr << endl; + exit(1); + } + + if (conf->count("help") || conf->count("formalism") == 0) { + cerr << dcmdline_options << endl; + exit(1); + } + + const string formalism = LowercaseString((*conf)["formalism"].as()); + if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit") { + cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit' or 'lexcrf'\n"; + cerr << dcmdline_options << endl; + exit(1); + } +} + +// TODO move out of cdec into some sampling decoder file +void SampleRecurse(const Hypergraph& hg, const vector& ss, int n, vector* out) { + const SampleSet& s = ss[n]; + int i = rng->SelectSample(s); + const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[i]]; + vector > ants(edge.tail_nodes_.size()); + for (int j = 0; j < ants.size(); ++j) + SampleRecurse(hg, ss, edge.tail_nodes_[j], &ants[j]); + + vector*> pants(ants.size()); + for (int j = 0; j < ants.size(); ++j) pants[j] = &ants[j]; + edge.rule_->ESubstitute(pants, out); +} + +struct SampleSort { + bool operator()(const pair& a, const pair& b) const { + return a.first > b.first; + } +}; + +// 
TODO move out of cdec into some sampling decoder file +void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) { + unordered_map > m; + hg->PushWeightsToGoal(); + const int num_nodes = hg->nodes_.size(); + vector ss(num_nodes); + for (int i = 0; i < num_nodes; ++i) { + SampleSet& s = ss[i]; + const vector& in_edges = hg->nodes_[i].in_edges_; + for (int j = 0; j < in_edges.size(); ++j) { + s.add(hg->edges_[in_edges[j]].edge_prob_); + } + } + for (int i = 0; i < samples; ++i) { + vector yield; + SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield); + const string trans = TD::GetString(yield); + ++m[trans]; + } + vector > dist; + for (unordered_map >::iterator i = m.begin(); + i != m.end(); ++i) { + dist.push_back(make_pair(i->second, i->first)); + } + sort(dist.begin(), dist.end(), SampleSort()); + if (k) { + for (int i = 0; i < k; ++i) + cout << dist[i].first << " ||| " << dist[i].second << endl; + } else { + cout << dist[0].second << endl; + } +} + +// TODO decoder output should probably be moved to another file +void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) { + if (unique) { + KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k); + for (int i = 0; i < k; ++i) { + const KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " + << d->feature_values << " ||| " << log(d->score) << endl; + } + } else { + KBest::KBestDerivations, ESentenceTraversal> kbest(forest, k); + for (int i = 0; i < k; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " + << d->feature_values << " ||| " << log(d->score) << endl; + } + } +} + +struct ELengthWeightFunction { + double 
operator()(const Hypergraph::Edge& e) const { + return e.rule_->ELength() - e.rule_->Arity(); + } +}; + + +struct TRPHash { + size_t operator()(const TRulePtr& o) const { return reinterpret_cast(o.get()); } +}; +static void ExtractRulesDedupe(const Hypergraph& hg, ostream* os) { + static unordered_set written; + for (int i = 0; i < hg.edges_.size(); ++i) { + const TRulePtr& rule = hg.edges_[i].rule_; + if (written.insert(rule).second) { + (*os) << rule->AsString() << endl; + } + } +} + +void register_feature_functions(); + +int main(int argc, char** argv) { + global_ff_registry.reset(new FFRegistry); + register_feature_functions(); + ShowBanner(); + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const bool write_gradient = conf.count("gradient"); + const bool feature_expectations = conf.count("feature_expectations"); + if (write_gradient && feature_expectations) { + cerr << "You can only specify --gradient or --feature_expectations, not both!\n"; + exit(1); + } + const bool output_training_vector = (write_gradient || feature_expectations); + + boost::shared_ptr translator; + const string formalism = LowercaseString(conf["formalism"].as()); + const bool csplit_preserve_full_word = conf.count("csplit_preserve_full_word"); + if (csplit_preserve_full_word && + (formalism != "csplit" || !conf.count("beam_prune"))) { + cerr << "--csplit_preserve_full_word should only be " + << "used with csplit AND --beam_prune!\n"; + exit(1); + } + const bool csplit_output_plf = conf.count("csplit_output_plf"); + if (csplit_output_plf && formalism != "csplit") { + cerr << "--csplit_output_plf should only be used with csplit!\n"; + exit(1); + } + + if (formalism == "scfg") + translator.reset(new SCFGTranslator(conf)); + else if (formalism == "fst") + translator.reset(new FSTTranslator(conf)); + else if (formalism == "pb") + translator.reset(new PhraseBasedTranslator(conf)); + else if (formalism == "csplit") + translator.reset(new CompoundSplit(conf)); + else if 
(formalism == "lexcrf") + translator.reset(new LexicalCRF(conf)); + else + assert(!"error"); + + vector feature_weights; + Weights w; + if (conf.count("weights")) { + w.InitFromFile(conf["weights"].as()); + feature_weights.resize(FD::NumFeats()); + w.InitVector(&feature_weights); + } + + // set up additional scoring features + vector > pffs; + vector late_ffs; + if (conf.count("feature_function") > 0) { + const vector& add_ffs = conf["feature_function"].as >(); + for (int i = 0; i < add_ffs.size(); ++i) { + string ff, param; + SplitCommandAndParam(add_ffs[i], &ff, ¶m); + cerr << "Feature: " << ff; + if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; + else cerr << " (no config parameters)\n"; + shared_ptr pff = global_ff_registry->Create(ff, param); + if (!pff) { exit(1); } + // TODO check that multiple features aren't trying to set the same fid + pffs.push_back(pff); + late_ffs.push_back(pff.get()); + } + } + ModelSet late_models(feature_weights, late_ffs); + + const int sample_max_trans = conf.count("max_translation_sample") ? + conf["max_translation_sample"].as() : 0; + if (sample_max_trans) + rng.reset(new RandomNumberGenerator); + const bool aligner_mode = conf.count("aligner"); + const bool minimal_forests = conf.count("minimal_forests"); + const bool graphviz = conf.count("graphviz"); + const bool encode_b64 = conf["vector_format"].as() == "b64"; + const bool kbest = conf.count("k_best"); + const bool unique_kbest = conf.count("unique_k_best"); + shared_ptr extract_file; + if (conf.count("extract_rules")) + extract_file.reset(new WriteFile(conf["extract_rules"].as())); + + int combine_size = conf["combine_size"].as(); + if (combine_size < 1) combine_size = 1; + const string input = conf["input"].as(); + cerr << "Reading input from " << ((input == "-") ? 
"STDIN" : input.c_str()) << endl; + ReadFile in_read(input); + istream *in = in_read.stream(); + assert(*in); + + SparseVector acc_vec; // accumulate gradient + double acc_obj = 0; // accumulate objective + int g_count = 0; // number of gradient pieces computed + int sent_id = -1; // line counter + + while(*in) { + Timer::Summarize(); + ++sent_id; + string buf; + getline(*in, buf); + if (buf.empty()) continue; + map sgml; + ProcessAndStripSGML(&buf, &sgml); + if (sgml.find("id") != sgml.end()) + sent_id = atoi(sgml["id"].c_str()); + + cerr << "\nINPUT: "; + if (buf.size() < 100) + cerr << buf << endl; + else { + size_t x = buf.rfind(" ", 100); + if (x == string::npos) x = 100; + cerr << buf.substr(0, x) << " ..." << endl; + } + cerr << " id = " << sent_id << endl; + string to_translate; + Lattice ref; + ParseTranslatorInputLattice(buf, &to_translate, &ref); + const bool has_ref = ref.size() > 0; + SentenceMetadata smeta(sent_id, ref); + const bool hadoop_counters = (write_gradient); + Hypergraph forest; // -LM forest + Timer t("Translation"); + if (!translator->Translate(to_translate, &smeta, feature_weights, &forest)) { + cerr << " NO PARSE FOUND.\n"; + if (hadoop_counters) + cerr << "reporter:counter:UserCounters,FParseFailed,1" << endl; + cout << endl << flush; + continue; + } + cerr << " -LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; + cerr << " -LM forest (paths): " << forest.NumberOfPaths() << endl; + if (conf.count("show_expected_length")) { + const PRPair res = + Inside, + PRWeightFunction >(forest); + cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; + } + if (conf.count("show_partition")) { + const prob_t z = Inside(forest); + cerr << " -LM partition log(Z): " << log(z) << endl; + } + if (extract_file) + ExtractRulesDedupe(forest, extract_file->stream()); + vector trans; + const prob_t vs = ViterbiESentence(forest, &trans); + cerr << " -LM Viterbi: " << TD::GetString(trans) << endl; 
+ if (conf.count("show_tree_structure")) + cerr << " -LM tree: " << ViterbiETree(forest) << endl;; + cerr << " -LM Viterbi: " << log(vs) << endl; + + bool has_late_models = !late_models.empty(); + if (has_late_models) { + forest.Reweight(feature_weights); + forest.SortInEdgesByEdgeWeights(); + Hypergraph lm_forest; + int cubepruning_pop_limit = conf["cubepruning_pop_limit"].as(); + ApplyModelSet(forest, + smeta, + late_models, + PruningConfiguration(cubepruning_pop_limit), + &lm_forest); + forest.swap(lm_forest); + forest.Reweight(feature_weights); + trans.clear(); + ViterbiESentence(forest, &trans); + cerr << " +LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; + cerr << " +LM forest (paths): " << forest.NumberOfPaths() << endl; + cerr << " +LM Viterbi: " << TD::GetString(trans) << endl; + } + if (conf.count("beam_prune")) { + vector preserve_mask(forest.edges_.size(), false); + if (csplit_preserve_full_word) + preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true; + forest.BeamPruneInsideOutside(1.0, false, conf["beam_prune"].as(), &preserve_mask); + cerr << " Pruned forest (paths): " << forest.NumberOfPaths() << endl; + } + + if (conf.count("forest_output") && !has_ref) { + ForestWriter writer(conf["forest_output"].as(), sent_id); + assert(writer.Write(forest, minimal_forests)); + } + + if (sample_max_trans) { + MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as() : 0); + } else { + if (kbest) { + DumpKBest(sent_id, forest, conf["k_best"].as(), unique_kbest); + } else if (csplit_output_plf) { + cout << HypergraphIO::AsPLF(forest, false) << endl; + } else { + if (!graphviz && !has_ref) { + cout << TD::GetString(trans) << endl << flush; + } + } + } + + const int max_trans_beam_size = conf.count("max_translation_beam") ? 
+ conf["max_translation_beam"].as() : 0; + if (max_trans_beam_size) { + Hack::MaxTrans(forest, max_trans_beam_size); + continue; + } + + if (graphviz && !has_ref) forest.PrintGraphviz(); + + // the following are only used if write_gradient is true! + SparseVector full_exp, ref_exp, gradient; + double log_z = 0, log_ref_z = 0; + if (write_gradient) + log_z = log( + InsideOutside, EdgeFeaturesWeightFunction>(forest, &full_exp)); + + if (has_ref) { + if (HG::Intersect(ref, &forest)) { + cerr << " Constr. forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; + cerr << " Constr. forest (paths): " << forest.NumberOfPaths() << endl; + forest.Reweight(feature_weights); + cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl; + if (hadoop_counters) + cerr << "reporter:counter:UserCounters,SentencePairsParsed,1" << endl; + if (conf.count("show_partition")) { + const prob_t z = Inside(forest); + cerr << " Contst. partition log(Z): " << log(z) << endl; + } + //DumpKBest(sent_id, forest, 1000); + if (conf.count("forest_output")) { + ForestWriter writer(conf["forest_output"].as(), sent_id); + assert(writer.Write(forest, minimal_forests)); + } + if (aligner_mode && !output_training_vector) + AlignerTools::WriteAlignment(to_translate, ref, forest); + if (write_gradient) { + log_ref_z = log( + InsideOutside, EdgeFeaturesWeightFunction>(forest, &ref_exp)); + if (log_z < log_ref_z) { + cerr << "DIFF. ERR! 
log_z < log_ref_z: " << log_z << " " << log_ref_z << endl; + exit(1); + } + //cerr << "FULL: " << full_exp << endl; + //cerr << " REF: " << ref_exp << endl; + ref_exp -= full_exp; + acc_vec += ref_exp; + acc_obj += (log_z - log_ref_z); + } + if (feature_expectations) { + acc_obj += log( + InsideOutside, EdgeFeaturesWeightFunction>(forest, &ref_exp)); + acc_vec += ref_exp; + } + + if (output_training_vector) { + ++g_count; + if (g_count % combine_size == 0) { + if (encode_b64) { + cout << "0\t"; + B64::Encode(acc_obj, acc_vec, &cout); + cout << endl << flush; + } else { + cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; + } + acc_vec.clear(); + acc_obj = 0; + } + } + if (conf.count("graphviz")) forest.PrintGraphviz(); + } else { + cerr << " REFERENCE UNREACHABLE.\n"; + if (write_gradient) { + if (hadoop_counters) + cerr << "reporter:counter:UserCounters,EFParseFailed,1" << endl; + cout << endl << flush; + } + } + } + } + if (output_training_vector && !acc_vec.empty()) { + if (encode_b64) { + cout << "0\t"; + B64::Encode(acc_obj, acc_vec, &cout); + cout << endl << flush; + } else { + cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; + } + } +} + diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc new file mode 100644 index 00000000..0a4f3d5e --- /dev/null +++ b/decoder/cdec_ff.cc @@ -0,0 +1,22 @@ +#include + +#include "ff.h" +#include "ff_lm.h" +#include "ff_csplit.h" +#include "ff_wordalign.h" +#include "ff_factory.h" + +boost::shared_ptr global_ff_registry; + +void register_feature_functions() { + global_ff_registry->Register("LanguageModel", new FFFactory); + global_ff_registry->Register("WordPenalty", new FFFactory); + global_ff_registry->Register("SourceWordPenalty", new FFFactory); + global_ff_registry->Register("RelativeSentencePosition", new FFFactory); + global_ff_registry->Register("MarkovJump", new FFFactory); + global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory); + 
global_ff_registry->Register("AlignerResults", new FFFactory); + global_ff_registry->Register("CSplit_BasicFeatures", new FFFactory); + global_ff_registry->Register("CSplit_ReverseCharLM", new FFFactory); +}; + diff --git a/decoder/csplit.cc b/decoder/csplit.cc new file mode 100644 index 00000000..47197782 --- /dev/null +++ b/decoder/csplit.cc @@ -0,0 +1,173 @@ +#include "csplit.h" + +#include + +#include "filelib.h" +#include "stringlib.h" +#include "hg.h" +#include "tdict.h" +#include "grammar.h" +#include "sentence_metadata.h" + +using namespace std; + +struct CompoundSplitImpl { + CompoundSplitImpl(const boost::program_options::variables_map& conf) : + fugen_elements_(true), // TODO configure + min_size_(3), + kXCAT(TD::Convert("X")*-1), + kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")), + kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")), + kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")), + kFUGEN_S(FD::Convert("FugS")), + kFUGEN_N(FD::Convert("FugN")) {} + + void PasteTogetherStrings(const vector& chars, + const int i, + const int j, + string* yield) { + int size = 0; + for (int k=i; kresize(size); + int cur = 0; + for (int k=i; k& chars, + Hypergraph* forest) { + vector nodes(chars.size()+1, -1); + nodes[0] = forest->AddNode(kXCAT)->id_; // source + const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; + forest->ConnectEdgeToHeadNode(left_rule, nodes[0]); + + const int max_split_ = max(static_cast(chars.size()) - min_size_ + 1, 1); + cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl; + for (int i = min_size_; i < max_split_; ++i) + nodes[i] = forest->AddNode(kXCAT)->id_; + assert(nodes.back() == -1); + nodes.back() = forest->AddNode(kXCAT)->id_; // sink + + for (int i = 0; i < max_split_; ++i) { + if (nodes[i] < 0) continue; + const int start = min(i + min_size_, static_cast(chars.size())); + for (int j = start; j <= chars.size(); ++j) { + if (nodes[j] < 0) continue; + string yield; + 
PasteTogetherStrings(chars, i, j, &yield); + // cerr << "[" << i << "," << j << "] " << yield << endl; + TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE)); + rule->e_[1] = rule->f_[1] = TD::Convert(yield); + // cerr << rule->AsString() << endl; + int edge = forest->AddEdge( + rule, + Hypergraph::TailNodeVector(1, nodes[i]))->id_; + forest->ConnectEdgeToHeadNode(edge, nodes[j]); + forest->edges_[edge].i_ = i; + forest->edges_[edge].j_ = j; + + // handle "fugenelemente" here + // don't delete "fugenelemente" at the end of words + if (fugen_elements_ && j != chars.size()) { + const int len = yield.size(); + string alt; + int fid = 0; + if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { + alt = yield.substr(0, len - 2); + fid = kFUGEN_S; + } else if (len > (min_size_ + 1) && yield[len-1] == 's') { + alt = yield.substr(0, len - 1); + fid = kFUGEN_S; + } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { + alt = yield.substr(0, len - 1); + fid = kFUGEN_N; + } + if (alt.size()) { + TRulePtr altrule = TRulePtr(new TRule(*rule)); + altrule->e_[1] = TD::Convert(alt); + // cerr << altrule->AsString() << endl; + int edge = forest->AddEdge( + altrule, + Hypergraph::TailNodeVector(1, nodes[i]))->id_; + forest->ConnectEdgeToHeadNode(edge, nodes[j]); + forest->edges_[edge].feature_values_.set_value(fid, 1.0); + forest->edges_[edge].i_ = i; + forest->edges_[edge].j_ = j; + } + } + } + } + + // add goal rule + Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); + Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); + Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); + forest->ConnectEdgeToHeadNode(hg_edge, goal); + } + private: + const bool fugen_elements_; + const int min_size_; + const WordID kXCAT; + const TRulePtr kWORDBREAK_RULE; + const TRulePtr kTEMPLATE_RULE; + const TRulePtr kGOAL_RULE; + const int kFUGEN_S; + const int kFUGEN_N; +}; + +CompoundSplit::CompoundSplit(const 
boost::program_options::variables_map& conf) : + pimpl_(new CompoundSplitImpl(conf)) {} + +static void SplitUTF8String(const string& in, vector* out) { + out->resize(in.size()); + int i = 0; + int c = 0; + while (i < in.size()) { + const int len = UTF8Len(in[i]); + assert(len); + (*out)[c] = in.substr(i, len); + ++c; + i += len; + } + out->resize(c); +} + +bool CompoundSplit::Translate(const string& input, + SentenceMetadata* smeta, + const vector& weights, + Hypergraph* forest) { + if (input.find(" ") != string::npos) { + cerr << " BAD INPUT: " << input << "\n CompoundSplit expects single words\n"; + abort(); + } + vector in; + SplitUTF8String(input, &in); + smeta->SetSourceLength(in.size()); // TODO do utf8 or somethign + for (int i = 0; i < in.size(); ++i) + smeta->src_lattice_.push_back(vector(1, LatticeArc(TD::Convert(in[i]), 0.0, 1))); + pimpl_->BuildTrellis(in, forest); + forest->Reweight(weights); + return true; +} + +int CompoundSplit::GetFullWordEdgeIndex(const Hypergraph& forest) { + assert(forest.nodes_.size() > 0); + const vector out_edges = forest.nodes_[0].out_edges_; + int max_edge = -1; + int max_j = -1; + for (int i = 0; i < out_edges.size(); ++i) { + const int j = forest.edges_[out_edges[i]].j_; + if (j > max_j) { + max_j = j; + max_edge = out_edges[i]; + } + } + assert(max_edge >= 0); + assert(max_edge < forest.edges_.size()); + return max_edge; +} + diff --git a/decoder/csplit.h b/decoder/csplit.h new file mode 100644 index 00000000..ce6295c1 --- /dev/null +++ b/decoder/csplit.h @@ -0,0 +1,30 @@ +#ifndef _CSPLIT_H_ +#define _CSPLIT_H_ + +#include "translator.h" +#include "lattice.h" + +// this "translator" takes single words (with NO SPACES) and segments +// them using the approach described in: +// +// C. Dyer. (2009) Using a maximum entropy model to build segmentation +// lattices for MT. In Proceedings of NAACL HLT 2009. +// note, an extra word space marker # is inserted at the left edge of +// the forest! 
+struct CompoundSplitImpl; +struct CompoundSplit : public Translator { + CompoundSplit(const boost::program_options::variables_map& conf); + bool Translate(const std::string& input, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* forest); + + // given a forest generated by CompoundSplit::Translate, + // find the edge representing the unsegmented form + static int GetFullWordEdgeIndex(const Hypergraph& forest); + + private: + boost::shared_ptr pimpl_; +}; + +#endif diff --git a/decoder/dict.h b/decoder/dict.h new file mode 100644 index 00000000..bae9debe --- /dev/null +++ b/decoder/dict.h @@ -0,0 +1,40 @@ +#ifndef DICT_H_ +#define DICT_H_ + +#include +#include +#include +#include +#include + +#include + +#include "wordid.h" + +class Dict { + typedef std::tr1::unordered_map > Map; + public: + Dict() : b0_("") { words_.reserve(1000); } + inline int max() const { return words_.size(); } + inline WordID Convert(const std::string& word) { + Map::iterator i = d_.find(word); + if (i == d_.end()) { + words_.push_back(word); + d_[word] = words_.size(); + return words_.size(); + } else { + return i->second; + } + } + inline const std::string& Convert(const WordID& id) const { + if (id == 0) return b0_; + assert(id <= words_.size()); + return words_[id-1]; + } + private: + const std::string b0_; + std::vector words_; + Map d_; +}; + +#endif diff --git a/decoder/dict_test.cc b/decoder/dict_test.cc new file mode 100644 index 00000000..5c5d84f0 --- /dev/null +++ b/decoder/dict_test.cc @@ -0,0 +1,30 @@ +#include "dict.h" + +#include +#include + +class DTest : public testing::Test { + public: + DTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(DTest, Convert) { + Dict d; + WordID a = d.Convert("foo"); + WordID b = d.Convert("bar"); + std::string x = "foo"; + WordID c = d.Convert(x); + EXPECT_NE(a, b); + EXPECT_EQ(a, c); + EXPECT_EQ(d.Convert(a), "foo"); + EXPECT_EQ(d.Convert(b), "bar"); +} + +int main(int argc, 
char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc new file mode 100644 index 00000000..a59686e0 --- /dev/null +++ b/decoder/earley_composer.cc @@ -0,0 +1,726 @@ +#include "earley_composer.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "phrasetable_fst.h" +#include "sparse_vector.h" +#include "tdict.h" +#include "hg.h" + +using boost::shared_ptr; +namespace po = boost::program_options; +using namespace std; +using namespace std::tr1; + +// Define the following macro if you want to see lots of debugging output +// when you run the chart parser +#undef DEBUG_CHART_PARSER + +// A few constants used by the chart parser /////////////// +static const int kMAX_NODES = 2000000; +static const string kPHRASE_STRING = "X"; +static bool constants_need_init = true; +static WordID kUNIQUE_START; +static WordID kPHRASE; +static TRulePtr kX1X2; +static TRulePtr kX1; +static WordID kEPS; +static TRulePtr kEPSRule; + +static void InitializeConstants() { + if (constants_need_init) { + kPHRASE = TD::Convert(kPHRASE_STRING) * -1; + kUNIQUE_START = TD::Convert("S") * -1; + kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); + kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); + kEPSRule.reset(new TRule("[X] ||| ||| ")); + kEPS = TD::Convert(""); + constants_need_init = false; + } +} +//////////////////////////////////////////////////////////// + +class EGrammarNode { + friend bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); + friend void AddGrammarRule(const string& r, map* g); + public: +#ifdef DEBUG_CHART_PARSER + string hint; +#endif + EGrammarNode() : is_some_rule_complete(false), is_root(false) {} + const map& GetTerminals() const { return tptr; } + const map& GetNonTerminals() const { return ntptr; } + bool HasNonTerminals() const { return (!ntptr.empty()); } + bool 
HasTerminals() const { return (!tptr.empty()); } + bool RuleCompletes() const { + return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); + } + bool GrammarContinues() const { + return !(ntptr.empty() && tptr.empty()); + } + bool IsRoot() const { + return is_root; + } + // these are the features associated with the rule from the start + // node up to this point. If you use these features, you must + // not Extend() this rule. + const SparseVector& GetCFGProductionFeatures() const { + return input_features; + } + + const EGrammarNode* Extend(const WordID& t) const { + if (t < 0) { + map::const_iterator it = ntptr.find(t); + if (it == ntptr.end()) return NULL; + return &it->second; + } else { + map::const_iterator it = tptr.find(t); + if (it == tptr.end()) return NULL; + return &it->second; + } + } + + private: + map tptr; + map ntptr; + SparseVector input_features; + bool is_some_rule_complete; + bool is_root; +}; +typedef map EGrammar; // indexed by the rule LHS + +// edges are immutable once created +struct Edge { +#ifdef DEBUG_CHART_PARSER + static int id_count; + const int id; +#endif + const WordID cat; // lhs side of rule proved/being proved + const EGrammarNode* const dot; // dot position + const FSTNode* const q; // start of span + const FSTNode* const r; // end of span + const Edge* const active_parent; // back pointer, NULL for PREDICT items + const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items + const TargetPhraseSet* const tps; // translations + shared_ptr > features; // features from CFG rule + + bool IsPassive() const { + // when a rule is completed, this value will be set + return static_cast(features); + } + bool IsActive() const { return !IsPassive(); } + bool IsInitial() const { + return !(active_parent || passive_parent); + } + bool IsCreatedByScan() const { + return active_parent && !passive_parent && !dot->IsRoot(); + } + bool IsCreatedByPredict() const { + return dot->IsRoot(); + } + bool 
IsCreatedByComplete() const { + return active_parent && passive_parent; + } + + // constructor for PREDICT + Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} + Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r, const Edge* act_parent) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps(NULL) {} + + // constructors for SCAN + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const TargetPhraseSet* translations) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} + + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const TargetPhraseSet* translations, + const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), + features(new SparseVector(feats)) {} + + // constructors for COMPLETE + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const Edge *pas_par) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL) { + assert(pas_par->IsPassive()); + assert(act_par->IsActive()); + } + + Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, + const Edge* act_par, const Edge *pas_par, const SparseVector& feats) : +#ifdef DEBUG_CHART_PARSER + id(++id_count), +#endif + cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL), + features(new SparseVector(feats)) { + assert(pas_par->IsPassive()); + 
assert(act_par->IsActive()); + } + + // constructor for COMPLETE query + Edge(const FSTNode* _r) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(NULL), + r(_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} + // constructor for MERGE quere + Edge(const FSTNode* _q, int) : +#ifdef DEBUG_CHART_PARSER + id(0), +#endif + cat(0), dot(NULL), q(_q), + r(NULL), active_parent(NULL), passive_parent(NULL), tps(NULL) {} +}; +#ifdef DEBUG_CHART_PARSER +int Edge::id_count = 0; +#endif + +ostream& operator<<(ostream& os, const Edge& e) { + string type = "PREDICT"; + if (e.IsCreatedByScan()) + type = "SCAN"; + else if (e.IsCreatedByComplete()) + type = "COMPLETE"; + os << "[" +#ifdef DEBUG_CHART_PARSER + << '(' << e.id << ") " +#else + << '(' << &e << ") " +#endif + << "q=" << e.q << ", r=" << e.r + << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" + << e.dot +#ifdef DEBUG_CHART_PARSER + << e.dot->hint +#endif + << (e.IsActive() ? ", Active" : ", Passive") + << ", " << type; +#ifdef DEBUG_CHART_PARSER + if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } + if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } +#endif + if (e.tps) { os << ", tps=" << e.tps; } + return os << ']'; +} + +struct Traversal { + const Edge* const edge; // result from the active / passive combination + const Edge* const active; + const Edge* const passive; + Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} +}; + +struct UniqueTraversalHash { + size_t operator()(const Traversal* t) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(t->active); + x = ((x << 5) + x) ^ reinterpret_cast(t->passive); + x = ((x << 5) + x) ^ t->edge->IsActive(); + return x; + } +}; + +struct UniqueTraversalEquals { + size_t operator()(const Traversal* a, const Traversal* b) const { + return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); + 
} +}; + +struct UniqueEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + if (e->IsActive()) { + x = ((x << 5) + x) ^ reinterpret_cast(e->dot); + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + x += 13; + } else { // with passive edges, we don't care about the dot + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + x = ((x << 5) + x) ^ static_cast(e->cat); + } + return x; + } +}; + +struct UniqueEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + if (a->IsActive() != b->IsActive()) return false; + if (a->IsActive()) { + return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); + } else { + return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); + } + } +}; + +struct REdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->r); + return x; + } +}; + +struct REdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->r == b->r); + } +}; + +struct QEdgeHash { + size_t operator()(const Edge* e) const { + size_t x = 5381; + x = ((x << 5) + x) ^ reinterpret_cast(e->q); + return x; + } +}; + +struct QEdgeEquals { + bool operator()(const Edge* a, const Edge* b) const { + return (a->q == b->q); + } +}; + +struct EdgeQueue { + queue q; + EdgeQueue() {} + void clear() { while(!q.empty()) q.pop(); } + bool HasWork() const { return !q.empty(); } + const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } + void AddEdge(const Edge* s) { q.push(s); } +}; + +class EarleyComposerImpl { + public: + EarleyComposerImpl(WordID start_cat, const FSTNode& q_0) : start_cat_(start_cat), q_0_(&q_0) {} + + // returns false if the intersection is empty + bool Compose(const EGrammar& g, Hypergraph* forest) { + goal_node = NULL; + EGrammar::const_iterator sit = g.find(start_cat_); + 
forest->ReserveNodes(kMAX_NODES); + assert(sit != g.end()); + Edge* init = new Edge(start_cat_, &sit->second, q_0_); + assert(IncorporateNewEdge(init)); + while (exp_agenda.HasWork() || agenda.HasWork()) { + while(exp_agenda.HasWork()) { + const Edge* edge = exp_agenda.Next(); + FinishEdge(edge, forest); + } + if (agenda.HasWork()) { + const Edge* edge = agenda.Next(); +#ifdef DEBUG_CHART_PARSER + cerr << "processing (" << edge->id << ')' << endl; +#endif + if (edge->IsActive()) { + if (edge->dot->HasTerminals()) + DoScan(edge); + if (edge->dot->HasNonTerminals()) { + DoMergeWithPassives(edge); + DoPredict(edge, g); + } + } else { + DoComplete(edge); + } + } + } + if (goal_node) { + forest->PruneUnreachable(goal_node->id_); + forest->EpsilonRemove(kEPS); + } + FreeAll(); + return goal_node; + } + + void FreeAll() { + for (int i = 0; i < free_list_.size(); ++i) + delete free_list_[i]; + free_list_.clear(); + for (int i = 0; i < traversal_free_list_.size(); ++i) + delete traversal_free_list_[i]; + traversal_free_list_.clear(); + all_traversals.clear(); + exp_agenda.clear(); + agenda.clear(); + tps2node.clear(); + edge2node.clear(); + all_edges.clear(); + passive_edges.clear(); + active_edges.clear(); + } + + ~EarleyComposerImpl() { + FreeAll(); + } + + // returns the total number of edges created during composition + int EdgesCreated() const { + return free_list_.size(); + } + + private: + void DoScan(const Edge* edge) { + // here, we assume that the FST will potentially have many more outgoing + // edges than the grammar, which will be just a couple. If you want to + // efficiently handle the case where both are relatively large, this code + // will need to change how the intersection is done. The best general + // solution would probably be the Baeza-Yates double binary search. 
+ + const EGrammarNode* dot = edge->dot; + const FSTNode* r = edge->r; + const map& terms = dot->GetTerminals(); + for (map::const_iterator git = terms.begin(); + git != terms.end(); ++git) { + const FSTNode* next_r = r->Extend(git->first); + if (!next_r) continue; + const EGrammarNode* next_dot = &git->second; + const bool grammar_continues = next_dot->GrammarContinues(); + const bool rule_completes = next_dot->RuleCompletes(); + assert(grammar_continues || rule_completes); + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + // create up to 4 new edges! + if (next_r->HasOutgoingNonEpsilonEdges()) { // are there further symbols in the FST? + const TargetPhraseSet* translations = NULL; + if (rule_completes) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations, input_features)); + if (grammar_continues) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations)); + } + if (next_r->HasData()) { // indicates a loop back to q_0 in the FST + const TargetPhraseSet* translations = next_r->GetTranslations(); + if (rule_completes) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations, input_features)); + if (grammar_continues) + IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations)); + } + } + } + + void DoPredict(const Edge* edge, const EGrammar& g) { + const EGrammarNode* dot = edge->dot; + const map& non_terms = dot->GetNonTerminals(); + for (map::const_iterator git = non_terms.begin(); + git != non_terms.end(); ++git) { + const WordID nt_to_predict = git->first; + //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; + EGrammar::const_iterator egi = g.find(nt_to_predict); + if (egi == g.end()) { + cerr << "[ERROR] Can't find any grammar rules with a LHS of type " + << TD::Convert(-1*nt_to_predict) << '!' 
<< endl; + continue; + } + assert(edge->IsActive()); + const EGrammarNode* new_dot = &egi->second; + Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); + IncorporateNewEdge(new_edge); + } + } + + void DoComplete(const Edge* passive) { +#ifdef DEBUG_CHART_PARSER + cerr << " complete: " << *passive << endl; +#endif + const WordID completed_nt = passive->cat; + const FSTNode* q = passive->q; + const FSTNode* next_r = passive->r; + const Edge query(q); + const pair::iterator, + unordered_multiset::iterator > p = + active_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* active = *it; +#ifdef DEBUG_CHART_PARSER + cerr << " pos: " << *active << endl; +#endif + const EGrammarNode* next_dot = active->dot->Extend(completed_nt); + if (!next_dot) continue; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + // add up to 2 rules + if (next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + void DoMergeWithPassives(const Edge* active) { + // edge is active, has non-terminals, we need to find the passives that can extend it + assert(active->IsActive()); + assert(active->dot->HasNonTerminals()); +#ifdef DEBUG_CHART_PARSER + cerr << " merge active with passives: ACT=" << *active << endl; +#endif + const Edge query(active->r, 1); + const pair::iterator, + unordered_multiset::iterator > p = + passive_edges.equal_range(&query); + for (unordered_multiset::iterator it = p.first; + it != p.second; ++it) { + const Edge* passive = *it; + const EGrammarNode* next_dot = active->dot->Extend(passive->cat); + if (!next_dot) continue; + const FSTNode* next_r = passive->r; + const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); + if 
(next_dot->RuleCompletes()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); + if (next_dot->GrammarContinues()) + IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); + } + } + + // take ownership of edge memory, add to various indexes, etc + // returns true if this edge is new + bool IncorporateNewEdge(Edge* edge) { + free_list_.push_back(edge); + if (edge->passive_parent && edge->active_parent) { + Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); + traversal_free_list_.push_back(t); + if (all_traversals.find(t) != all_traversals.end()) { + return false; + } else { + all_traversals.insert(t); + } + } + exp_agenda.AddEdge(edge); + return true; + } + + bool FinishEdge(const Edge* edge, Hypergraph* hg) { + bool is_new = false; + if (all_edges.find(edge) == all_edges.end()) { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NEW\n"; +#endif + all_edges.insert(edge); + is_new = true; + if (edge->IsPassive()) passive_edges.insert(edge); + if (edge->IsActive()) active_edges.insert(edge); + agenda.AddEdge(edge); + } else { +#ifdef DEBUG_CHART_PARSER + cerr << *edge << " is NOT NEW.\n"; +#endif + } + AddEdgeToTranslationForest(edge, hg); + return is_new; + } + + // build the translation forest + void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { + assert(hg->nodes_.size() < kMAX_NODES); + Hypergraph::Node* tps = NULL; + // first add any target language rules + if (edge->tps) { + Hypergraph::Node*& node = tps2node[(size_t)edge->tps]; + if (!node) { + // cerr << "Creating phrases for " << edge->tps << endl; + const vector& rules = edge->tps->GetRules(); + node = hg->AddNode(kPHRASE, ""); + for (int i = 0; i < rules.size(); ++i) { + Hypergraph::Edge* hg_edge = hg->AddEdge(rules[i], Hypergraph::TailNodeVector()); + hg_edge->feature_values_ += rules[i]->GetFeatureValues(); + hg->ConnectEdgeToHeadNode(hg_edge, node); + } + } + tps = node; 
+ } + Hypergraph::Node*& head_node = edge2node[edge]; + if (!head_node) + head_node = hg->AddNode(kPHRASE, ""); + if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_0_ && edge->IsPassive()) { + assert(goal_node == NULL || goal_node == head_node); + goal_node = head_node; + } + Hypergraph::TailNodeVector tail; + SparseVector extra; + if (edge->IsCreatedByPredict()) { + // extra.set_value(FD::Convert("predict"), 1); + } else if (edge->IsCreatedByScan()) { + tail.push_back(edge2node[edge->active_parent]->id_); + if (tps) { + tail.push_back(tps->id_); + } + //extra.set_value(FD::Convert("scan"), 1); + } else if (edge->IsCreatedByComplete()) { + tail.push_back(edge2node[edge->active_parent]->id_); + tail.push_back(edge2node[edge->passive_parent]->id_); + //extra.set_value(FD::Convert("complete"), 1); + } else { + assert(!"unexpected edge type!"); + } + //cerr << head_node->id_ << "<--" << *edge << endl; + +#ifdef DEBUG_CHART_PARSER + for (int i = 0; i < tail.size(); ++i) + if (tail[i] == head_node->id_) { + cerr << "ERROR: " << *edge << "\n i=" << i << endl; + if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } + if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } + assert(!"self-loop found!"); + } +#endif + Hypergraph::Edge* hg_edge = NULL; + if (tail.size() == 0) { + hg_edge = hg->AddEdge(kEPSRule, tail); + } else if (tail.size() == 1) { + hg_edge = hg->AddEdge(kX1, tail); + } else if (tail.size() == 2) { + hg_edge = hg->AddEdge(kX1X2, tail); + } + if (edge->features) + hg_edge->feature_values_ += *edge->features; + hg_edge->feature_values_ += extra; + hg->ConnectEdgeToHeadNode(hg_edge, head_node); + } + + Hypergraph::Node* goal_node; + EdgeQueue exp_agenda; + EdgeQueue agenda; + unordered_map tps2node; + unordered_map edge2node; + unordered_set all_traversals; + unordered_set all_edges; + unordered_multiset passive_edges; + unordered_multiset active_edges; + vector free_list_; + vector traversal_free_list_; + const WordID 
start_cat_; + const FSTNode* const q_0_; +}; + +#ifdef DEBUG_CHART_PARSER +static string TrimRule(const string& r) { + size_t start = r.find(" |||") + 5; + size_t end = r.rfind(" |||"); + return r.substr(start, end - start); +} +#endif + +void AddGrammarRule(const string& r, EGrammar* g) { + const size_t pos = r.find(" ||| "); + if (pos == string::npos || r[0] != '[') { + cerr << "Bad rule: " << r << endl; + return; + } + const size_t rpos = r.rfind(" ||| "); + string feats; + string rs = r; + if (rpos != pos) { + feats = r.substr(rpos + 5); + rs = r.substr(0, rpos); + } + string rhs = rs.substr(pos + 5); + string trule = rs + " ||| " + rhs + " ||| " + feats; + TRule tr(trule); +#ifdef DEBUG_CHART_PARSER + string hint_last_rule; +#endif + EGrammarNode* cur = &(*g)[tr.GetLHS()]; + cur->is_root = true; + for (int i = 0; i < tr.FLength(); ++i) { + WordID sym = tr.f()[i]; +#ifdef DEBUG_CHART_PARSER + hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); + cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); +#endif + if (sym < 0) + cur = &cur->ntptr[sym]; + else + cur = &cur->tptr[sym]; + } +#ifdef DEBUG_CHART_PARSER + cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); +#endif + cur->is_some_rule_complete = true; + cur->input_features = tr.GetFeatureValues(); +} + +EarleyComposer::~EarleyComposer() { + delete pimpl_; +} + +EarleyComposer::EarleyComposer(const FSTNode* fst) { + InitializeConstants(); + pimpl_ = new EarleyComposerImpl(kUNIQUE_START, *fst); +} + +bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { + // first, convert the src forest into an EGrammar + EGrammar g; + const int nedges = src_forest.edges_.size(); + const int nnodes = src_forest.nodes_.size(); + vector cats(nnodes); + bool assign_cats = false; + for (int i = 0; i < nnodes; ++i) + if (assign_cats) { + cats[i] = TD::Convert("CAT_" + boost::lexical_cast(i)) * -1; + } else { + cats[i] = src_forest.nodes_[i].cat_; + } + // 
construct the grammar + for (int i = 0; i < nedges; ++i) { + const Hypergraph::Edge& edge = src_forest.edges_[i]; + const vector& src = edge.rule_->f(); + EGrammarNode* cur = &g[cats[edge.head_node_]]; + cur->is_root = true; + int ntc = 0; + for (int j = 0; j < src.size(); ++j) { + WordID sym = src[j]; + if (sym <= 0) { + sym = cats[edge.tail_nodes_[ntc]]; + ++ntc; + cur = &cur->ntptr[sym]; + } else { + cur = &cur->tptr[sym]; + } + } + cur->is_some_rule_complete = true; + cur->input_features = edge.feature_values_; + } + EGrammarNode& goal_rule = g[kUNIQUE_START]; + assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || + (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); + + return pimpl_->Compose(g, trg_forest); +} + +bool EarleyComposer::Compose(istream* in, Hypergraph* trg_forest) { + EGrammar g; + while(*in) { + string line; + getline(*in, line); + if (line.empty()) continue; + AddGrammarRule(line, &g); + } + + return pimpl_->Compose(g, trg_forest); +} diff --git a/decoder/earley_composer.h b/decoder/earley_composer.h new file mode 100644 index 00000000..9f786bf6 --- /dev/null +++ b/decoder/earley_composer.h @@ -0,0 +1,29 @@ +#ifndef _EARLEY_COMPOSER_H_ +#define _EARLEY_COMPOSER_H_ + +#include + +class EarleyComposerImpl; +class FSTNode; +class Hypergraph; + +class EarleyComposer { + public: + ~EarleyComposer(); + EarleyComposer(const FSTNode* phrasetable_root); + bool Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); + + // reads the grammar from a file. There must be a single top-level + // S -> X rule. Anything else is possible. Format is: + // [S] ||| [SS,1] + // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 + // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 + // [NP] ||| [DET,1] [N,2] ||| Feature3=2 + // ... 
+ bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); + + private: + EarleyComposerImpl* pimpl_; +}; + +#endif diff --git a/decoder/exp_semiring.h b/decoder/exp_semiring.h new file mode 100644 index 00000000..f91beee4 --- /dev/null +++ b/decoder/exp_semiring.h @@ -0,0 +1,71 @@ +#ifndef _EXP_SEMIRING_H_ +#define _EXP_SEMIRING_H_ + +#include + +// this file implements the first-order expectation semiring described +// in Li & Eisner (EMNLP 2009) + +// requirements: +// RType * RType ==> RType +// PType * PType ==> PType +// RType * PType ==> RType +// good examples: +// PType scalar, RType vector +// BAD examples: +// PType vector, RType scalar +template +struct PRPair { + PRPair() : p(), r() {} + // Inside algorithm requires that T(0) and T(1) + // return the 0 and 1 values of the semiring + explicit PRPair(double x) : p(x), r() {} + PRPair(const PType& p, const RType& r) : p(p), r(r) {} + PRPair& operator+=(const PRPair& o) { + p += o.p; + r += o.r; + return *this; + } + PRPair& operator*=(const PRPair& o) { + r = (o.r * p) + (o.p * r); + p *= o.p; + return *this; + } + PType p; + RType r; +}; + +template +std::ostream& operator<<(std::ostream& o, const PRPair& x) { + return o << '<' << x.p << ", " << x.r << '>'; +} + +template +const PRPair operator+(const PRPair& a, const PRPair& b) { + PRPair result = a; + result += b; + return result; +} + +template +const PRPair operator*(const PRPair& a, const PRPair& b) { + PRPair result = a; + result *= b; + return result; +} + +template +struct PRWeightFunction { + explicit PRWeightFunction(const PWeightFunction& pwf = PWeightFunction(), + const RWeightFunction& rwf = RWeightFunction()) : + pweight(pwf), rweight(rwf) {} + PRPair operator()(const Hypergraph::Edge& e) const { + const P p = pweight(e); + const R r = rweight(e); + return PRPair(p, r * p); + } + const PWeightFunction pweight; + const RWeightFunction rweight; +}; + +#endif diff --git a/decoder/fdict.cc b/decoder/fdict.cc new file mode 100644 index 
00000000..83aa7cea --- /dev/null +++ b/decoder/fdict.cc @@ -0,0 +1,4 @@ +#include "fdict.h" + +Dict FD::dict_; + diff --git a/decoder/fdict.h b/decoder/fdict.h new file mode 100644 index 00000000..ff491cfb --- /dev/null +++ b/decoder/fdict.h @@ -0,0 +1,21 @@ +#ifndef _FDICT_H_ +#define _FDICT_H_ + +#include +#include +#include "dict.h" + +struct FD { + static Dict dict_; + static inline int NumFeats() { + return dict_.max() + 1; + } + static inline WordID Convert(const std::string& s) { + return dict_.Convert(s); + } + static inline const std::string& Convert(const WordID& w) { + return dict_.Convert(w); + } +}; + +#endif diff --git a/decoder/ff.cc b/decoder/ff.cc new file mode 100644 index 00000000..2ae5b9eb --- /dev/null +++ b/decoder/ff.cc @@ -0,0 +1,114 @@ +#include "ff.h" + +#include "tdict.h" +#include "hg.h" + +using namespace std; + +FeatureFunction::~FeatureFunction() {} + + +void FeatureFunction::FinalTraversalFeatures(const void* ant_state, + SparseVector* features) const { + (void) ant_state; + (void) features; +} + +// Hiero and Joshua use log_10(e) as the value, so I do to +WordPenalty::WordPenalty(const string& param) : + fid_(FD::Convert("WordPenalty")), + value_(-1.0 / log(10)) { + if (!param.empty()) { + cerr << "Warning WordPenalty ignoring parameter: " << param << endl; + } +} + +void WordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + (void) smeta; + (void) ant_states; + (void) state; + (void) estimated_features; + features->set_value(fid_, edge.rule_->EWords() * value_); +} + +SourceWordPenalty::SourceWordPenalty(const string& param) : + fid_(FD::Convert("SourceWordPenalty")), + value_(-1.0 / log(10)) { + if (!param.empty()) { + cerr << "Warning SourceWordPenalty ignoring parameter: " << param << endl; + } +} + +void SourceWordPenalty::TraversalFeaturesImpl(const SentenceMetadata& 
smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + (void) smeta; + (void) ant_states; + (void) state; + (void) estimated_features; + features->set_value(fid_, edge.rule_->FWords() * value_); +} + +ModelSet::ModelSet(const vector& w, const vector& models) : + models_(models), + weights_(w), + state_size_(0), + model_state_pos_(models.size()) { + for (int i = 0; i < models_.size(); ++i) { + model_state_pos_[i] = state_size_; + state_size_ += models_[i]->NumBytesContext(); + } +} + +void ModelSet::AddFeaturesToEdge(const SentenceMetadata& smeta, + const Hypergraph& hg, + Hypergraph::Edge* edge, + string* context, + prob_t* combination_cost_estimate) const { + context->resize(state_size_); + memset(&(*context)[0], 0, state_size_); + SparseVector est_vals; // only computed if combination_cost_estimate is non-NULL + if (combination_cost_estimate) *combination_cost_estimate = prob_t::One(); + for (int i = 0; i < models_.size(); ++i) { + const FeatureFunction& ff = *models_[i]; + void* cur_ff_context = NULL; + vector ants(edge->tail_nodes_.size()); + bool has_context = ff.NumBytesContext() > 0; + if (has_context) { + int spos = model_state_pos_[i]; + cur_ff_context = &(*context)[spos]; + for (int i = 0; i < ants.size(); ++i) { + ants[i] = &hg.nodes_[edge->tail_nodes_[i]].state_[spos]; + } + } + ff.TraversalFeatures(smeta, *edge, ants, &edge->feature_values_, &est_vals, cur_ff_context); + } + if (combination_cost_estimate) + combination_cost_estimate->logeq(est_vals.dot(weights_)); + edge->edge_prob_.logeq(edge->feature_values_.dot(weights_)); +} + +void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge) const { + assert(1 == edge->rule_->Arity()); + + for (int i = 0; i < models_.size(); ++i) { + const FeatureFunction& ff = *models_[i]; + const void* ant_state = NULL; + bool has_context = ff.NumBytesContext() > 0; + if (has_context) { + int 
spos = model_state_pos_[i]; + ant_state = &state[spos]; + } + ff.FinalTraversalFeatures(ant_state, &edge->feature_values_); + } + edge->edge_prob_.logeq(edge->feature_values_.dot(weights_)); +} + diff --git a/decoder/ff.h b/decoder/ff.h new file mode 100644 index 00000000..e962b4ba --- /dev/null +++ b/decoder/ff.h @@ -0,0 +1,136 @@ +#ifndef _FF_H_ +#define _FF_H_ + +#include + +#include "fdict.h" +#include "hg.h" + +class SentenceMetadata; +class FeatureFunction; // see definition below + +// if you want to develop a new feature, inherit from this class and +// override TraversalFeaturesImpl(...). If it's a feature that returns / +// depends on context, you may also need to implement +// FinalTraversalFeatures(...) +class FeatureFunction { + public: + FeatureFunction() : state_size_() {} + explicit FeatureFunction(int state_size) : state_size_(state_size) {} + virtual ~FeatureFunction(); + + // returns the number of bytes of context that this feature function will + // (maximally) use. By default, 0 ("stateless" models in Hiero/Joshua). + // NOTE: this value is fixed for the instance of your class, you cannot + // use different amounts of memory for different nodes in the forest. + inline int NumBytesContext() const { return state_size_; } + + // Compute the feature values and (if this applies) the estimates of the + // feature values when this edge is used incorporated into a larger context + inline void TraversalFeatures(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_state) const { + TraversalFeaturesImpl(smeta, edge, ant_contexts, + features, estimated_features, out_state); + // TODO it's easy for careless feature function developers to overwrite + // the end of their state and clobber someone else's memory. These bugs + // will be horrendously painful to track down. 
There should be some + // optional strict mode that's enforced here that adds some kind of + // barrier between the blocks reserved for the residual contexts + } + + // if there's some state left when you transition to the goal state, score + // it here. For example, the language model computes the cost of adding + // and . + virtual void FinalTraversalFeatures(const void* residual_state, + SparseVector* final_features) const; + + protected: + // context is a pointer to a buffer of size NumBytesContext() that the + // feature function can write its state to. It's up to the feature function + // to determine how much space it needs and to determine how to encode its + // residual contextual information since it is OPAQUE to all clients outside + // of the particular FeatureFunction class. There is one exception: + // equality of the contents (i.e., memcmp) is required to determine whether + // two states can be combined. + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const = 0; + + // !!! ONLY call this from subclass *CONSTRUCTORS* !!! 
+ void SetStateSize(size_t state_size) { + state_size_ = state_size; + } + + private: + int state_size_; +}; + +// word penalty feature, for each word on the E side of a rule, +// add value_ +class WordPenalty : public FeatureFunction { + public: + WordPenalty(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + const int fid_; + const double value_; +}; + +class SourceWordPenalty : public FeatureFunction { + public: + SourceWordPenalty(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + const int fid_; + const double value_; +}; + +// this class is a set of FeatureFunctions that can be used to score, rescore, +// etc. a (translation?) forest +class ModelSet { + public: + ModelSet() : state_size_(0) {} + + ModelSet(const std::vector& weights, + const std::vector& models); + + // sets edge->feature_values_ and edge->edge_prob_ + // NOTE: edge must not necessarily be in hg.edges_ but its TAIL nodes + // must be. 
+ void AddFeaturesToEdge(const SentenceMetadata& smeta, + const Hypergraph& hg, + Hypergraph::Edge* edge, + std::string* residual_context, + prob_t* combination_cost_estimate = NULL) const; + + void AddFinalFeatures(const std::string& residual_context, + Hypergraph::Edge* edge) const; + + bool empty() const { return models_.empty(); } + private: + std::vector models_; + std::vector weights_; + int state_size_; + std::vector model_state_pos_; +}; + +#endif diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc new file mode 100644 index 00000000..cac4bb8e --- /dev/null +++ b/decoder/ff_csplit.cc @@ -0,0 +1,212 @@ +#include "ff_csplit.h" + +#include +#include + +#include "Vocab.h" +#include "Ngram.h" + +#include "sentence_metadata.h" +#include "lattice.h" +#include "tdict.h" +#include "freqdict.h" +#include "filelib.h" +#include "stringlib.h" +#include "tdict.h" + +using namespace std; + +struct BasicCSplitFeaturesImpl { + BasicCSplitFeaturesImpl(const string& param) : + word_count_(FD::Convert("WordCount")), + letters_sq_(FD::Convert("LettersSq")), + letters_sqrt_(FD::Convert("LettersSqrt")), + in_dict_(FD::Convert("InDict")), + short_(FD::Convert("Short")), + long_(FD::Convert("Long")), + oov_(FD::Convert("OOV")), + short_range_(FD::Convert("ShortRange")), + high_freq_(FD::Convert("HighFreq")), + med_freq_(FD::Convert("MedFreq")), + freq_(FD::Convert("Freq")), + fl1_(FD::Convert("FreqLen1")), + fl2_(FD::Convert("FreqLen2")), + bad_(FD::Convert("Bad")) { + vector argv; + int argc = SplitOnWhitespace(param, &argv); + if (argc != 1 && argc != 2) { + cerr << "Expected: freqdict.txt [badwords.txt]\n"; + abort(); + } + freq_dict_.Load(argv[0]); + if (argc == 2) { + ReadFile rf(argv[1]); + istream& in = *rf.stream(); + while(in) { + string badword; + in >> badword; + if (badword.empty()) continue; + bad_words_.insert(TD::Convert(badword)); + } + } + } + + void TraversalFeaturesImpl(const Hypergraph::Edge& edge, + SparseVector* features) const; + + const int word_count_; 
+ const int letters_sq_; + const int letters_sqrt_; + const int in_dict_; + const int short_; + const int long_; + const int oov_; + const int short_range_; + const int high_freq_; + const int med_freq_; + const int freq_; + const int fl1_; + const int fl2_; + const int bad_; + FreqDict freq_dict_; + set bad_words_; +}; + +BasicCSplitFeatures::BasicCSplitFeatures(const string& param) : + pimpl_(new BasicCSplitFeaturesImpl(param)) {} + +void BasicCSplitFeaturesImpl::TraversalFeaturesImpl( + const Hypergraph::Edge& edge, + SparseVector* features) const { + features->set_value(word_count_, 1.0); + features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_)); + features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_)); + const WordID word = edge.rule_->e_[1]; + const char* sword = TD::Convert(word); + const int len = strlen(sword); + int cur = 0; + int chars = 0; + while(cur < len) { + cur += UTF8Len(sword[cur]); + ++chars; + } + + // these are corrections that attempt to make chars + // more like a phoneme count than a letter count, they + // are only really meaningful for german and should + // probably be gotten rid of + bool has_sch = strstr(sword, "sch"); + bool has_ch = (!has_sch && strstr(sword, "ch")); + bool has_ie = strstr(sword, "ie"); + bool has_zw = strstr(sword, "zw"); + if (has_sch) chars -= 2; + if (has_ch) --chars; + if (has_ie) --chars; + if (has_zw) --chars; + + float freq = freq_dict_.LookUp(word); + if (freq) { + features->set_value(freq_, freq); + features->set_value(in_dict_, 1.0); + } else { + features->set_value(oov_, 1.0); + freq = 99.0f; + } + if (bad_words_.count(word) != 0) + features->set_value(bad_, 1.0); + if (chars < 5) + features->set_value(short_, 1.0); + if (chars > 10) + features->set_value(long_, 1.0); + if (freq < 7.0f) + features->set_value(high_freq_, 1.0); + if (freq > 8.0f && freq < 10.f) + features->set_value(med_freq_, 1.0); + if (freq < 10.0f && chars < 5) + features->set_value(short_range_, 1.0); + + // i 
don't understand these features, but they really help! + features->set_value(fl1_, sqrt(chars * freq)); + features->set_value(fl2_, freq / chars); +} + +void BasicCSplitFeatures::TraversalFeaturesImpl( + const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const { + (void) smeta; + (void) ant_contexts; + (void) out_context; + (void) estimated_features; + if (edge.Arity() == 0) return; + if (edge.rule_->EWords() != 1) return; + pimpl_->TraversalFeaturesImpl(edge, features); +} + +struct ReverseCharLMCSplitFeatureImpl { + ReverseCharLMCSplitFeatureImpl(const string& param) : + order_(5), + vocab_(*TD::dict_), + ngram_(vocab_, order_) { + kBOS = vocab_.getIndex(""); + kEOS = vocab_.getIndex(""); + File file(param.c_str(), "r", 0); + assert(file); + cerr << "Reading " << order_ << "-gram LM from " << param << endl; + ngram_.read(file); + } + + double LeftPhonotacticProb(const Lattice& inword, const int start) { + const int end = inword.size(); + for (int i = 0; i < order_; ++i) + sc[i] = kBOS; + int sp = min(end - start, order_ - 1); + // cerr << "[" << start << "," << sp << "]\n"; + int ci = (order_ - sp - 1); + int wi = start; + while (sp > 0) { + sc[ci] = inword[wi][0].label; + // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl; + ++wi; + ++ci; + --sp; + } + // cerr << " END ci=" << ci << endl; + sc[ci] = Vocab_None; + const double startprob = ngram_.wordProb(kEOS, sc); + // cerr << " PROB=" << startprob << endl; + return startprob; + } + private: + const int order_; + Vocab& vocab_; + VocabIndex kBOS; + VocabIndex kEOS; + Ngram ngram_; + VocabIndex sc[80]; +}; + +ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) : + pimpl_(new ReverseCharLMCSplitFeatureImpl(param)), + fid_(FD::Convert("RevCharLM")) {} + +void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( + const SentenceMetadata& smeta, + 
const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const { + (void) ant_contexts; + (void) estimated_features; + (void) out_context; + + if (edge.Arity() != 1) return; + if (edge.rule_->EWords() != 1) return; + const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_); + features->set_value(fid_, lpp); +} + diff --git a/decoder/ff_csplit.h b/decoder/ff_csplit.h new file mode 100644 index 00000000..c1cfb64b --- /dev/null +++ b/decoder/ff_csplit.h @@ -0,0 +1,39 @@ +#ifndef _FF_CSPLIT_H_ +#define _FF_CSPLIT_H_ + +#include + +#include "ff.h" + +class BasicCSplitFeaturesImpl; +class BasicCSplitFeatures : public FeatureFunction { + public: + BasicCSplitFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + boost::shared_ptr pimpl_; +}; + +class ReverseCharLMCSplitFeatureImpl; +class ReverseCharLMCSplitFeature : public FeatureFunction { + public: + ReverseCharLMCSplitFeature(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + boost::shared_ptr pimpl_; + const int fid_; +}; + +#endif diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc new file mode 100644 index 00000000..1854e0bb --- /dev/null +++ b/decoder/ff_factory.cc @@ -0,0 +1,35 @@ +#include "ff_factory.h" + +#include "ff.h" + +using boost::shared_ptr; +using namespace std; + +FFFactoryBase::~FFFactoryBase() {} + +void FFRegistry::DisplayList() const { + for (map >::const_iterator it = reg_.begin(); + it != reg_.end(); ++it) { + cerr << " " 
<< it->first << endl; + } +} + +shared_ptr FFRegistry::Create(const string& ffname, const string& param) const { + map >::const_iterator it = reg_.find(ffname); + shared_ptr res; + if (it == reg_.end()) { + cerr << "I don't know how to create feature " << ffname << endl; + } else { + res = it->second->Create(param); + } + return res; +} + +void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { + if (reg_.find(ffname) != reg_.end()) { + cerr << "Duplicate registration of FeatureFunction with name " << ffname << "!\n"; + abort(); + } + reg_[ffname].reset(factory); +} + diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h new file mode 100644 index 00000000..bc586567 --- /dev/null +++ b/decoder/ff_factory.h @@ -0,0 +1,39 @@ +#ifndef _FF_FACTORY_H_ +#define _FF_FACTORY_H_ + +#include +#include +#include + +#include + +class FeatureFunction; +class FFRegistry; +class FFFactoryBase; +extern boost::shared_ptr global_ff_registry; + +class FFRegistry { + friend int main(int argc, char** argv); + friend class FFFactoryBase; + public: + boost::shared_ptr Create(const std::string& ffname, const std::string& param) const; + void DisplayList() const; + void Register(const std::string& ffname, FFFactoryBase* factory); + private: + FFRegistry() {} + std::map > reg_; +}; + +struct FFFactoryBase { + virtual ~FFFactoryBase(); + virtual boost::shared_ptr Create(const std::string& param) const = 0; +}; + +template +class FFFactory : public FFFactoryBase { + boost::shared_ptr Create(const std::string& param) const { + return boost::shared_ptr(new FF(param)); + } +}; + +#endif diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc new file mode 100644 index 00000000..354787ec --- /dev/null +++ b/decoder/ff_lm.cc @@ -0,0 +1,328 @@ +#include "ff_lm.h" + +#include +#include +#include +#include +#include +#include + +#include "tdict.h" +#include "Vocab.h" +#include "Ngram.h" +#include "hg.h" +#include "stringlib.h" + +using namespace std; + +struct LMClient { + struct Cache 
{ + map tree; + float prob; + Cache() : prob() {} + }; + + LMClient(const char* host) : port(6666) { + s = strchr(host, ':'); + if (s != NULL) { + *s = '\0'; + ++s; + port = atoi(s); + } + sock = socket(AF_INET, SOCK_STREAM, 0); + hp = gethostbyname(host); + if (hp == NULL) { + cerr << "unknown host " << host << endl; + abort(); + } + bzero((char *)&server, sizeof(server)); + bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + server.sin_family = hp->h_addrtype; + server.sin_port = htons(port); + + int errors = 0; + while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { + cerr << "Error: connect()\n"; + sleep(1); + errors++; + if (errors > 3) exit(1); + } + cerr << "Connected to LM on " << host << " on port " << port << endl; + } + + float wordProb(int word, int* context) { + Cache* cur = &cache; + int i = 0; + while (context[i] > 0) { + cur = &cur->tree[context[i++]]; + } + cur = &cur->tree[word]; + if (cur->prob) { return cur->prob; } + + i = 0; + ostringstream os; + os << "prob " << TD::Convert(word); + while (context[i] > 0) { + os << ' ' << TD::Convert(context[i++]); + } + os << endl; + string out = os.str(); + write(sock, out.c_str(), out.size()); + int r = read(sock, res, 6); + int errors = 0; + int cnt = 0; + while (1) { + if (r < 0) { + errors++; sleep(1); + cerr << "Error: read()\n"; + if (errors > 5) exit(1); + } else if (r==0 || res[cnt] == '\n') { break; } + else { + cnt += r; + if (cnt==6) break; + read(sock, &res[cnt], 6-cnt); + } + } + cur->prob = *reinterpret_cast(res); + return cur->prob; + } + + void clear() { + cache.tree.clear(); + } + + private: + Cache cache; + int sock, port; + char *s; + struct hostent *hp; + struct sockaddr_in server; + char res[8]; +}; + +class LanguageModelImpl { + public: + LanguageModelImpl(int order, const string& f) : + ngram_(*TD::dict_), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), + floor_(-100.0), + client_(NULL), + kSTART(TD::Convert("")), + 
kSTOP(TD::Convert("")), + kUNKNOWN(TD::Convert("")), + kNONE(-1), + kSTAR(TD::Convert("<{STAR}>")) { + if (f.find("lm://") == 0) { + client_ = new LMClient(f.substr(5).c_str()); + } else { + File file(f.c_str(), "r", 0); + assert(file); + cerr << "Reading " << order_ << "-gram LM from " << f << endl; + ngram_.read(file, false); + } + } + + ~LanguageModelImpl() { + delete client_; + } + + inline int StateSize(const void* state) const { + return *(static_cast(state) + state_size_); + } + + inline void SetStateSize(int size, void* state) const { + *(static_cast(state) + state_size_) = size; + } + + inline double LookupProbForBufferContents(int i) { + double p = client_ ? + client_->wordProb(buffer_[i], &buffer_[i+1]) + : ngram_.wordProb(buffer_[i], (VocabIndex*)&buffer_[i+1]); + if (p < floor_) p = floor_; + return p; + } + + string DebugStateToString(const void* state) const { + int len = StateSize(state); + const int* astate = reinterpret_cast(state); + string res = "["; + for (int i = 0; i < len; ++i) { + res += " "; + res += TD::Convert(astate[i]); + } + res += " ]"; + return res; + } + + inline double ProbNoRemnant(int i, int len) { + int edge = len; + bool flag = true; + double sum = 0.0; + while (i >= 0) { + if (buffer_[i] == kSTAR) { + edge = i; + flag = false; + } else if (buffer_[i] <= 0) { + edge = i; + flag = true; + } else { + if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART))) + sum += LookupProbForBufferContents(i); + } + --i; + } + return sum; + } + + double EstimateProb(const vector& phrase) { + int len = phrase.size(); + buffer_.resize(len + 1); + buffer_[len] = kNONE; + int i = len - 1; + for (int j = 0; j < len; ++j,--i) + buffer_[i] = phrase[j]; + return ProbNoRemnant(len - 1, len); + } + + double EstimateProb(const void* state) { + int len = StateSize(state); + // cerr << "residual len: " << len << endl; + buffer_.resize(len + 1); + buffer_[len] = kNONE; + const int* astate = reinterpret_cast(state); + int i = len - 1; + 
for (int j = 0; j < len; ++j,--i) + buffer_[i] = astate[j]; + return ProbNoRemnant(len - 1, len); + } + + double FinalTraversalCost(const void* state) { + int slen = StateSize(state); + int len = slen + 2; + // cerr << "residual len: " << len << endl; + buffer_.resize(len + 1); + buffer_[len] = kNONE; + buffer_[len-1] = kSTART; + const int* astate = reinterpret_cast(state); + int i = len - 2; + for (int j = 0; j < slen; ++j,--i) + buffer_[i] = astate[j]; + buffer_[i] = kSTOP; + assert(i == 0); + return ProbNoRemnant(len - 1, len); + } + + double LookupWords(const TRule& rule, const vector& ant_states, void* vstate) { + int len = rule.ELength() - rule.Arity(); + for (int i = 0; i < ant_states.size(); ++i) + len += StateSize(ant_states[i]); + buffer_.resize(len + 1); + buffer_[len] = kNONE; + int i = len - 1; + const vector& e = rule.e(); + for (int j = 0; j < e.size(); ++j) { + if (e[j] < 1) { + const int* astate = reinterpret_cast(ant_states[-e[j]]); + int slen = StateSize(astate); + for (int k = 0; k < slen; ++k) + buffer_[i--] = astate[k]; + } else { + buffer_[i--] = e[j]; + } + } + + double sum = 0.0; + int* remnant = reinterpret_cast(vstate); + int j = 0; + i = len - 1; + int edge = len; + + while (i >= 0) { + if (buffer_[i] == kSTAR) { + edge = i; + } else if (edge-i >= order_) { + sum += LookupProbForBufferContents(i); + } else if (edge == len && remnant) { + remnant[j++] = buffer_[i]; + } + --i; + } + if (!remnant) return sum; + + if (edge != len || len >= order_) { + remnant[j++] = kSTAR; + if (order_-1 < edge) edge = order_-1; + for (int i = edge-1; i >= 0; --i) + remnant[j++] = buffer_[i]; + } + + SetStateSize(j, vstate); + return sum; + } + + static int OrderToStateSize(int order) { + return ((order-1) * 2 + 1) * sizeof(WordID) + 1; + } + + private: + Ngram ngram_; + vector buffer_; + const int order_; + const int state_size_; + const double floor_; + LMClient* client_; + + public: + const WordID kSTART; + const WordID kSTOP; + const WordID kUNKNOWN; + 
const WordID kNONE; + const WordID kSTAR; +}; + +LanguageModel::LanguageModel(const string& param) : + fid_(FD::Convert("LanguageModel")) { + vector argv; + int argc = SplitOnWhitespace(param, &argv); + int order = 3; + // TODO add support for -n FeatureName + string filename; + if (argc < 1) { cerr << "LanguageModel requires a filename, minimally!\n"; abort(); } + else if (argc == 1) { filename = argv[0]; } + else if (argc == 2 || argc > 3) { cerr << "Don't understand 'LanguageModel " << param << "'\n"; } + else if (argc == 3) { + if (argv[0] == "-o") { + order = atoi(argv[1].c_str()); + filename = argv[2]; + } else if (argv[1] == "-o") { + order = atoi(argv[2].c_str()); + filename = argv[0]; + } + } + SetStateSize(LanguageModelImpl::OrderToStateSize(order)); + pimpl_ = new LanguageModelImpl(order, filename); +} + +LanguageModel::~LanguageModel() { + delete pimpl_; +} + +string LanguageModel::DebugStateToString(const void* state) const{ + return pimpl_->DebugStateToString(state); +} + +void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + (void) smeta; + features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state)); + estimated_features->set_value(fid_, pimpl_->EstimateProb(state)); +} + +void LanguageModel::FinalTraversalFeatures(const void* ant_state, + SparseVector* features) const { + features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); +} + diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h new file mode 100644 index 00000000..cd717360 --- /dev/null +++ b/decoder/ff_lm.h @@ -0,0 +1,32 @@ +#ifndef _LM_FF_H_ +#define _LM_FF_H_ + +#include +#include + +#include "hg.h" +#include "ff.h" + +class LanguageModelImpl; + +class LanguageModel : public FeatureFunction { + public: + // param = "filename.lm [-o n]" + LanguageModel(const std::string& param); + ~LanguageModel(); + 
virtual void FinalTraversalFeatures(const void* context, + SparseVector* features) const; + std::string DebugStateToString(const void* state) const; + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + const int fid_; + mutable LanguageModelImpl* pimpl_; +}; + +#endif diff --git a/decoder/ff_test.cc b/decoder/ff_test.cc new file mode 100644 index 00000000..babaf985 --- /dev/null +++ b/decoder/ff_test.cc @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include +#include "hg.h" +#include "ff_lm.h" +#include "ff.h" +#include "trule.h" +#include "sentence_metadata.h" + +using namespace std; + +LanguageModel* lm_ = NULL; +LanguageModel* lm3_ = NULL; + +class FFTest : public testing::Test { + public: + FFTest() : smeta(0,Lattice()) { + if (!lm_) { + static LanguageModel slm("-o 2 ./test_data/test_2gram.lm.gz"); + lm_ = &slm; + static LanguageModel slm3("./test_data/dummy.3gram.lm -o 3"); + lm3_ = &slm3; + } + } + protected: + virtual void SetUp() { } + virtual void TearDown() { } + SentenceMetadata smeta; +}; + +TEST_F(FFTest,LanguageModel) { + vector ms(1, lm_); + TRulePtr tr1(new TRule("[X] ||| [X,1] said")); + TRulePtr tr2(new TRule("[X] ||| the man said")); + TRulePtr tr3(new TRule("[X] ||| the fat man")); + Hypergraph hg; + const int lm_fid = FD::Convert("LanguageModel"); + vector w(lm_fid + 1,1); + ModelSet models(w, ms); + string state; + Hypergraph::Edge edge; + edge.rule_ = tr2; + models.AddFeaturesToEdge(smeta, hg, &edge, &state); + double lm1 = edge.feature_values_.dot(w); + cerr << "lm=" << edge.feature_values_[lm_fid] << endl; + + hg.nodes_.resize(1); + hg.edges_.resize(2); + hg.edges_[0].rule_ = tr3; + models.AddFeaturesToEdge(smeta, hg, &hg.edges_[0], &hg.nodes_[0].state_); + hg.edges_[1].tail_nodes_.push_back(0); + hg.edges_[1].rule_ = tr1; + string 
state2; + models.AddFeaturesToEdge(smeta, hg, &hg.edges_[1], &state2); + double tot = hg.edges_[1].feature_values_[lm_fid] + hg.edges_[0].feature_values_[lm_fid]; + cerr << "lm=" << tot << endl; + EXPECT_TRUE(state2 == state); + EXPECT_FALSE(state == hg.nodes_[0].state_); +} + +TEST_F(FFTest, Small) { + WordPenalty wp(""); + vector ms(2, lm_); + ms[1] = ℘ + TRulePtr tr1(new TRule("[X] ||| [X,1] said")); + TRulePtr tr2(new TRule("[X] ||| john said")); + TRulePtr tr3(new TRule("[X] ||| john")); + cerr << "RULE: " << tr1->AsString() << endl; + Hypergraph hg; + vector w(2); w[0]=1.0; w[1]=-2.0; + ModelSet models(w, ms); + string state; + Hypergraph::Edge edge; + edge.rule_ = tr2; + cerr << tr2->AsString() << endl; + models.AddFeaturesToEdge(smeta, hg, &edge, &state); + double s1 = edge.feature_values_.dot(w); + cerr << "lm=" << edge.feature_values_[0] << endl; + cerr << "wp=" << edge.feature_values_[1] << endl; + + hg.nodes_.resize(1); + hg.edges_.resize(2); + hg.edges_[0].rule_ = tr3; + models.AddFeaturesToEdge(smeta, hg, &hg.edges_[0], &hg.nodes_[0].state_); + double acc = hg.edges_[0].feature_values_.dot(w); + cerr << hg.edges_[0].feature_values_[0] << endl; + hg.edges_[1].tail_nodes_.push_back(0); + hg.edges_[1].rule_ = tr1; + string state2; + models.AddFeaturesToEdge(smeta, hg, &hg.edges_[1], &state2); + acc += hg.edges_[1].feature_values_.dot(w); + double tot = hg.edges_[1].feature_values_[0] + hg.edges_[0].feature_values_[0]; + cerr << "lm=" << tot << endl; + cerr << "acc=" << acc << endl; + cerr << " s1=" << s1 << endl; + EXPECT_TRUE(state2 == state); + EXPECT_FALSE(state == hg.nodes_[0].state_); + EXPECT_FLOAT_EQ(acc, s1); +} + +TEST_F(FFTest, LM3) { + int x = lm3_->NumBytesContext(); + Hypergraph::Edge edge1; + edge1.rule_.reset(new TRule("[X] ||| x y ||| one ||| 1.0 -2.4 3.0")); + Hypergraph::Edge edge2; + edge2.rule_.reset(new TRule("[X] ||| [X,1] a ||| [X,1] two ||| 1.0 -2.4 3.0")); + Hypergraph::Edge edge3; + edge3.rule_.reset(new TRule("[X] ||| [X,1] a 
||| zero [X,1] two ||| 1.0 -2.4 3.0")); + vector ants1; + string state(x, '\0'); + SparseVector feats; + SparseVector est; + lm3_->TraversalFeatures(smeta, edge1, ants1, &feats, &est, (void *)&state[0]); + cerr << "returned " << feats << endl; + cerr << edge1.feature_values_ << endl; + cerr << lm3_->DebugStateToString((const void*)&state[0]) << endl; + EXPECT_EQ("[ one ]", lm3_->DebugStateToString((const void*)&state[0])); + ants1.push_back((const void*)&state[0]); + string state2(x, '\0'); + lm3_->TraversalFeatures(smeta, edge2, ants1, &feats, &est, (void *)&state2[0]); + cerr << lm3_->DebugStateToString((const void*)&state2[0]) << endl; + EXPECT_EQ("[ one two ]", lm3_->DebugStateToString((const void*)&state2[0])); + string state3(x, '\0'); + lm3_->TraversalFeatures(smeta, edge3, ants1, &feats, &est, (void *)&state3[0]); + cerr << lm3_->DebugStateToString((const void*)&state3[0]) << endl; + EXPECT_EQ("[ zero one <{STAR}> one two ]", lm3_->DebugStateToString((const void*)&state3[0])); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc new file mode 100644 index 00000000..a00b2c76 --- /dev/null +++ b/decoder/ff_wordalign.cc @@ -0,0 +1,240 @@ +#include "ff_wordalign.h" + +#include +#include + +#include "stringlib.h" +#include "sentence_metadata.h" +#include "hg.h" +#include "fdict.h" +#include "aligner.h" +#include "tdict.h" // Blunsom hack +#include "filelib.h" // Blunsom hack + +using namespace std; + +RelativeSentencePosition::RelativeSentencePosition(const string& param) : + fid_(FD::Convert("RelativeSentencePosition")) { + if (!param.empty()) { + cerr << " Loading word classes from " << param << endl; + condition_on_fclass_ = true; + template_ = "RSP:FC000"; + assert(!"not implemented"); + } else { + condition_on_fclass_ = false; + } +} + +void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const 
Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + // if the source word is either null or the generated word + // has no position in the reference + if (edge.i_ == -1 || edge.prev_i_ == -1) + return; + + assert(smeta.GetTargetLength() > 0); + const double val = fabs(static_cast(edge.i_) / smeta.GetSourceLength() - + static_cast(edge.prev_i_) / smeta.GetTargetLength()); + features->set_value(fid_, val); + if (condition_on_fclass_) { + assert(!"not implemented"); + } +// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; +} + +MarkovJump::MarkovJump(const string& param) : + FeatureFunction(1), + fid_(FD::Convert("MarkovJump")), + individual_params_per_jumpsize_(false), + condition_on_flen_(false) { + cerr << " MarkovJump"; + vector argv; + int argc = SplitOnWhitespace(param, &argv); + if (argc > 0) { + if (argv[0] == "--fclasses") { + argc--; + assert(argc > 0); + const string f_class_file = argv[1]; + } + if (argc != 1 || !(argv[0] == "-f" || argv[0] == "-i" || argv[0] == "-if")) { + cerr << "MarkovJump: expected parameters to be -f, -i, or -if\n"; + exit(1); + } + individual_params_per_jumpsize_ = (argv[0][1] == 'i'); + condition_on_flen_ = (argv[0][argv[0].size() - 1] == 'f'); + if (individual_params_per_jumpsize_) { + template_ = "Jump:000"; + cerr << ", individual jump parameters"; + if (condition_on_flen_) { + template_ += ":F00"; + cerr << " (split by f-length)"; + } + } + } else { + cerr << " (Blunsom & Cohn definition)"; + } + cerr << endl; +} + +void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + unsigned char& dpstate = *((unsigned char*)state); + if (edge.Arity() == 0) { + dpstate 
= static_cast(edge.i_); + } else if (edge.Arity() == 1) { + dpstate = *((unsigned char*)ant_states[0]); + } else if (edge.Arity() == 2) { + int left_index = *((unsigned char*)ant_states[0]); + int right_index = *((unsigned char*)ant_states[1]); + if (right_index == -1) + dpstate = static_cast(left_index); + else + dpstate = static_cast(right_index); + const int jumpsize = right_index - left_index; + features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def + + if (individual_params_per_jumpsize_) { + string fname = template_; + int param = jumpsize; + if (jumpsize < 0) { + param *= -1; + fname[5]='L'; + } else if (jumpsize > 0) { + fname[5]='R'; + } + if (param) { + fname[6] = '0' + (param / 10); + fname[7] = '0' + (param % 10); + } + if (condition_on_flen_) { + const int flen = smeta.GetSourceLength(); + fname[10] = '0' + (flen / 10); + fname[11] = '0' + (flen % 10); + } + features->set_value(FD::Convert(fname), 1.0); + } + } else { + assert(!"something really unexpected is happening"); + } +} + +AlignerResults::AlignerResults(const std::string& param) : + cur_sent_(-1), + cur_grid_(NULL) { + vector argv; + int argc = SplitOnWhitespace(param, &argv); + if (argc != 2) { + cerr << "Required format: AlignerResults [FeatureName] [file.pharaoh]\n"; + exit(1); + } + cerr << " feature: " << argv[0] << "\talignments: " << argv[1] << endl; + fid_ = FD::Convert(argv[0]); + ReadFile rf(argv[1]); + istream& in = *rf.stream(); int lc = 0; + while(in) { + string line; + getline(in, line); + if (!in) break; + ++lc; + is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line)); + } + cerr << " Loaded " << lc << " refs\n"; +} + +void AlignerResults::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + if (edge.i_ == -1 || edge.prev_i_ == -1) + return; + + if (cur_sent_ != smeta.GetSentenceID()) { + 
assert(smeta.HasReference()); + cur_sent_ = smeta.GetSentenceID(); + assert(cur_sent_ < is_aligned_.size()); + cur_grid_ = is_aligned_[cur_sent_].get(); + } + + //cerr << edge.rule_->AsString() << endl; + + int j = edge.i_; // source side (f) + int i = edge.prev_i_; // target side (e) + if (j < cur_grid_->height() && i < cur_grid_->width() && (*cur_grid_)(i, j)) { +// if (edge.rule_->e_[0] == smeta.GetReference()[i][0].label) { + features->set_value(fid_, 1.0); +// cerr << edge.rule_->AsString() << " (" << i << "," << j << ")\n"; +// } + } +} + +BlunsomSynchronousParseHack::BlunsomSynchronousParseHack(const string& param) : + FeatureFunction((100 / 8) + 1), fid_(FD::Convert("NotRef")), cur_sent_(-1) { + ReadFile rf(param); + istream& in = *rf.stream(); int lc = 0; + while(in) { + string line; + getline(in, line); + if (!in) break; + ++lc; + refs_.push_back(vector()); + TD::ConvertSentence(line, &refs_.back()); + } + cerr << " Loaded " << lc << " refs\n"; +} + +void BlunsomSynchronousParseHack::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + if (cur_sent_ != smeta.GetSentenceID()) { + // assert(smeta.HasReference()); + cur_sent_ = smeta.GetSentenceID(); + assert(cur_sent_ < refs_.size()); + cur_ref_ = &refs_[cur_sent_]; + cur_map_.clear(); + for (int i = 0; i < cur_ref_->size(); ++i) { + vector phrase; + for (int j = i; j < cur_ref_->size(); ++j) { + phrase.push_back((*cur_ref_)[j]); + cur_map_[phrase] = i; + } + } + } + //cerr << edge.rule_->AsString() << endl; + for (int i = 0; i < ant_states.size(); ++i) { + if (DoesNotBelong(ant_states[i])) { + //cerr << " ant " << i << " does not belong\n"; + return; + } + } + vector > ants(ant_states.size()); + vector* > pants(ant_states.size()); + for (int i = 0; i < ant_states.size(); ++i) { + AppendAntecedentString(ant_states[i], &ants[i]); + //cerr << " ant[" << i << "]: " 
<< ((int)*(static_cast(ant_states[i]))) << " " << TD::GetString(ants[i]) << endl; + pants[i] = &ants[i]; + } + vector yield; + edge.rule_->ESubstitute(pants, &yield); + //cerr << "YIELD: " << TD::GetString(yield) << endl; + Vec2Int::iterator it = cur_map_.find(yield); + if (it == cur_map_.end()) { + features->set_value(fid_, 1); + //cerr << " BAD!\n"; + return; + } + SetStateMask(it->second, it->second + yield.size(), state); +} + diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h new file mode 100644 index 00000000..4a8b59c7 --- /dev/null +++ b/decoder/ff_wordalign.h @@ -0,0 +1,136 @@ +#ifndef _FF_WORD_ALIGN_H_ +#define _FF_WORD_ALIGN_H_ + +#include "ff.h" +#include "array2d.h" + +class RelativeSentencePosition : public FeatureFunction { + public: + RelativeSentencePosition(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + const int fid_; + bool condition_on_fclass_; + std::string template_; +}; + +class MarkovJump : public FeatureFunction { + public: + MarkovJump(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + const int fid_; + bool individual_params_per_jumpsize_; + bool condition_on_flen_; + bool condition_on_fclass_; + std::string template_; +}; + +class AlignerResults : public FeatureFunction { + public: + AlignerResults(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + int 
fid_; + std::vector > > is_aligned_; + mutable int cur_sent_; + const Array2D mutable* cur_grid_; +}; + +#include +#include +#include +class BlunsomSynchronousParseHack : public FeatureFunction { + public: + BlunsomSynchronousParseHack(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + inline bool DoesNotBelong(const void* state) const { + for (int i = 0; i < NumBytesContext(); ++i) { + if (*(static_cast(state) + i)) return false; + } + return true; + } + + inline void AppendAntecedentString(const void* state, std::vector* yield) const { + int i = 0; + int ind = 0; + while (i < NumBytesContext() && !(*(static_cast(state) + i))) { ++i; ind += 8; } + // std::cerr << i << " " << NumBytesContext() << std::endl; + assert(i != NumBytesContext()); + assert(ind < cur_ref_->size()); + int cur = *(static_cast(state) + i); + int comp = 1; + while (comp < 256 && (comp & cur) == 0) { comp <<= 1; ++ind; } + assert(ind < cur_ref_->size()); + assert(comp < 256); + do { + assert(ind < cur_ref_->size()); + yield->push_back((*cur_ref_)[ind]); + ++ind; + comp <<= 1; + if (comp == 256) { + comp = 1; + ++i; + cur = *(static_cast(state) + i); + } + } while (comp & cur); + } + + inline void SetStateMask(int start, int end, void* state) const { + assert((end / 8) < NumBytesContext()); + int i = 0; + int comp = 1; + for (int j = 0; j < start; ++j) { + comp <<= 1; + if (comp == 256) { + ++i; + comp = 1; + } + } + //std::cerr << "SM: " << i << "\n"; + for (int j = start; j < end; ++j) { + *(static_cast(state) + i) |= comp; + //std::cerr << " " << comp << "\n"; + comp <<= 1; + if (comp == 256) { + ++i; + comp = 1; + } + } + //std::cerr << " MASK: " << ((int)*(static_cast(state))) << "\n"; + } + + const int fid_; + mutable int cur_sent_; + typedef std::tr1::unordered_map, 
int, boost::hash > > Vec2Int; + mutable Vec2Int cur_map_; + const std::vector mutable * cur_ref_; + mutable std::vector > refs_; +}; + +#endif diff --git a/decoder/filelib.cc b/decoder/filelib.cc new file mode 100644 index 00000000..79ad2847 --- /dev/null +++ b/decoder/filelib.cc @@ -0,0 +1,22 @@ +#include "filelib.h" + +#include +#include + +using namespace std; + +bool FileExists(const std::string& fn) { + struct stat info; + int s = stat(fn.c_str(), &info); + return (s==0); +} + +bool DirectoryExists(const string& dir) { + if (access(dir.c_str(),0) == 0) { + struct stat status; + stat(dir.c_str(), &status); + if (status.st_mode & S_IFDIR) return true; + } + return false; +} + diff --git a/decoder/filelib.h b/decoder/filelib.h new file mode 100644 index 00000000..62cb9427 --- /dev/null +++ b/decoder/filelib.h @@ -0,0 +1,66 @@ +#ifndef _FILELIB_H_ +#define _FILELIB_H_ + +#include +#include +#include +#include +#include "gzstream.h" + +// reads from standard in if filename is - +// uncompresses if file ends with .gz +// otherwise, reads from a normal file +class ReadFile { + public: + ReadFile(const std::string& filename) : + no_delete_on_exit_(filename == "-"), + in_(no_delete_on_exit_ ? static_cast(&std::cin) : + (EndsWith(filename, ".gz") ? + static_cast(new igzstream(filename.c_str())) : + static_cast(new std::ifstream(filename.c_str())))) { + if (!*in_) { + std::cerr << "Failed to open " << filename << std::endl; + abort(); + } + } + ~ReadFile() { + if (!no_delete_on_exit_) delete in_; + } + + inline std::istream* stream() { return in_; } + + private: + static bool EndsWith(const std::string& f, const std::string& suf) { + return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); + } + const bool no_delete_on_exit_; + std::istream* const in_; +}; + +class WriteFile { + public: + WriteFile(const std::string& filename) : + no_delete_on_exit_(filename == "-"), + out_(no_delete_on_exit_ ? static_cast(&std::cout) : + (EndsWith(filename, ".gz") ? 
+ static_cast(new ogzstream(filename.c_str())) : + static_cast(new std::ofstream(filename.c_str())))) {} + ~WriteFile() { + (*out_) << std::flush; + if (!no_delete_on_exit_) delete out_; + } + + inline std::ostream* stream() { return out_; } + + private: + static bool EndsWith(const std::string& f, const std::string& suf) { + return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); + } + const bool no_delete_on_exit_; + std::ostream* const out_; +}; + +bool FileExists(const std::string& file_name); +bool DirectoryExists(const std::string& dir_name); + +#endif diff --git a/decoder/forest_writer.cc b/decoder/forest_writer.cc new file mode 100644 index 00000000..a9117d18 --- /dev/null +++ b/decoder/forest_writer.cc @@ -0,0 +1,23 @@ +#include "forest_writer.h" + +#include + +#include + +#include "filelib.h" +#include "hg_io.h" +#include "hg.h" + +using namespace std; + +ForestWriter::ForestWriter(const std::string& path, int num) : + fname_(path + '/' + boost::lexical_cast(num) + ".json.gz"), used_(false) {} + +bool ForestWriter::Write(const Hypergraph& forest, bool minimal_rules) { + assert(!used_); + used_ = true; + cerr << " Writing forest to " << fname_ << endl; + WriteFile wf(fname_); + return HypergraphIO::WriteToJSON(forest, minimal_rules, wf.stream()); +} + diff --git a/decoder/forest_writer.h b/decoder/forest_writer.h new file mode 100644 index 00000000..819a8940 --- /dev/null +++ b/decoder/forest_writer.h @@ -0,0 +1,16 @@ +#ifndef _FOREST_WRITER_H_ +#define _FOREST_WRITER_H_ + +#include + +class Hypergraph; + +struct ForestWriter { + ForestWriter(const std::string& path, int num); + bool Write(const Hypergraph& forest, bool minimal_rules); + + const std::string fname_; + bool used_; +}; + +#endif diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc new file mode 100644 index 00000000..9e25d346 --- /dev/null +++ b/decoder/freqdict.cc @@ -0,0 +1,29 @@ +#include +#include +#include +#include "freqdict.h" +#include "tdict.h" +#include 
"filelib.h" + +using namespace std; + +void FreqDict::Load(const std::string& fname) { + cerr << "Reading word frequencies: " << fname << endl; + ReadFile rf(fname); + istream& ifs = *rf.stream(); + int cc=0; + while (ifs) { + std::string word; + ifs >> word; + if (word.size() == 0) continue; + if (word[0] == '#') continue; + double count = 0; + ifs >> count; + assert(count > 0.0); // use -log(f) + counts_[TD::Convert(word)]=count; + ++cc; + if (cc % 10000 == 0) { std::cerr << "."; } + } + std::cerr << "\n"; + std::cerr << "Loaded " << cc << " words\n"; +} diff --git a/decoder/freqdict.h b/decoder/freqdict.h new file mode 100644 index 00000000..9acf0c33 --- /dev/null +++ b/decoder/freqdict.h @@ -0,0 +1,20 @@ +#ifndef _FREQDICT_H_ +#define _FREQDICT_H_ + +#include +#include +#include "wordid.h" + +class FreqDict { + public: + void Load(const std::string& fname); + float LookUp(const WordID& word) const { + std::map::const_iterator i = counts_.find(word); + if (i == counts_.end()) return 0; + return i->second; + } + private: + std::map counts_; +}; + +#endif diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc new file mode 100644 index 00000000..57feb227 --- /dev/null +++ b/decoder/fst_translator.cc @@ -0,0 +1,91 @@ +#include "translator.h" + +#include +#include + +#include "sentence_metadata.h" +#include "filelib.h" +#include "hg.h" +#include "hg_io.h" +#include "earley_composer.h" +#include "phrasetable_fst.h" +#include "tdict.h" + +using namespace std; + +struct FSTTranslatorImpl { + FSTTranslatorImpl(const boost::program_options::variables_map& conf) : + goal_sym(conf["goal"].as()), + kGOAL_RULE(new TRule("[Goal] ||| [" + goal_sym + ",1] ||| [1]")), + kGOAL(TD::Convert("Goal") * -1), + add_pass_through_rules(conf.count("add_pass_through_rules")) { + fst.reset(LoadTextPhrasetable(conf["grammar"].as >())); + ec.reset(new EarleyComposer(fst.get())); + } + + bool Translate(const string& input, + const vector& weights, + Hypergraph* forest) { + bool 
composed = false; + if (input.find("{\"rules\"") == 0) { + istringstream is(input); + Hypergraph src_cfg_hg; + assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); + if (add_pass_through_rules) { + SparseVector feats; + feats.set_value(FD::Convert("PassThrough"), 1); + for (int i = 0; i < src_cfg_hg.edges_.size(); ++i) { + const vector& f = src_cfg_hg.edges_[i].rule_->f_; + for (int j = 0; j < f.size(); ++j) { + if (f[j] > 0) { + fst->AddPassThroughTranslation(f[j], feats); + } + } + } + } + composed = ec->Compose(src_cfg_hg, forest); + } else { + const string dummy_grammar("[" + goal_sym + "] ||| " + input + " ||| TOP=1"); + cerr << " Dummy grammar: " << dummy_grammar << endl; + istringstream is(dummy_grammar); + if (add_pass_through_rules) { + vector words; + TD::ConvertSentence(input, &words); + SparseVector feats; + feats.set_value(FD::Convert("PassThrough"), 1); + for (int i = 0; i < words.size(); ++i) + fst->AddPassThroughTranslation(words[i], feats); + } + composed = ec->Compose(&is, forest); + } + if (composed) { + Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); + Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1, ""); + Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); + forest->ConnectEdgeToHeadNode(hg_edge, goal); + forest->Reweight(weights); + } + if (add_pass_through_rules) + fst->ClearPassThroughTranslations(); + return composed; + } + + const string goal_sym; + const TRulePtr kGOAL_RULE; + const WordID kGOAL; + const bool add_pass_through_rules; + boost::shared_ptr ec; + boost::shared_ptr fst; +}; + +FSTTranslator::FSTTranslator(const boost::program_options::variables_map& conf) : + pimpl_(new FSTTranslatorImpl(conf)) {} + +bool FSTTranslator::Translate(const string& input, + SentenceMetadata* smeta, + const vector& weights, + Hypergraph* minus_lm_forest) { + smeta->SetSourceLength(0); // don't know how to compute this + return pimpl_->Translate(input, weights, minus_lm_forest); +} + diff --git 
a/decoder/grammar.cc b/decoder/grammar.cc new file mode 100644 index 00000000..e19bd344 --- /dev/null +++ b/decoder/grammar.cc @@ -0,0 +1,164 @@ +#include "grammar.h" + +#include +#include +#include + +#include "filelib.h" +#include "tdict.h" + +using namespace std; + +const vector Grammar::NO_RULES; + +RuleBin::~RuleBin() {} +GrammarIter::~GrammarIter() {} +Grammar::~Grammar() {} + +bool Grammar::HasRuleForSpan(int i, int j, int distance) const { + (void) i; + (void) j; + (void) distance; + return true; // always true by default +} + +struct TextRuleBin : public RuleBin { + int GetNumRules() const { + return rules_.size(); + } + TRulePtr GetIthRule(int i) const { + return rules_[i]; + } + void AddRule(TRulePtr t) { + rules_.push_back(t); + } + int Arity() const { + return rules_.front()->Arity(); + } + void Dump() const { + for (int i = 0; i < rules_.size(); ++i) + cerr << rules_[i]->AsString() << endl; + } + private: + vector rules_; +}; + +struct TextGrammarNode : public GrammarIter { + TextGrammarNode() : rb_(NULL) {} + ~TextGrammarNode() { + delete rb_; + } + const GrammarIter* Extend(int symbol) const { + map::const_iterator i = tree_.find(symbol); + if (i == tree_.end()) return NULL; + return &i->second; + } + + const RuleBin* GetRules() const { + if (rb_) { + //rb_->Dump(); + } + return rb_; + } + + map tree_; + TextRuleBin* rb_; +}; + +struct TGImpl { + TextGrammarNode root_; +}; + +TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {} +TextGrammar::TextGrammar(const string& file) : + max_span_(10), + pimpl_(new TGImpl) { + ReadFromFile(file); +} + +const GrammarIter* TextGrammar::GetRoot() const { + return &pimpl_->root_; +} + +void TextGrammar::AddRule(const TRulePtr& rule) { + if (rule->IsUnary()) { + rhs2unaries_[rule->f().front()].push_back(rule); + unaries_.push_back(rule); + } else { + TextGrammarNode* cur = &pimpl_->root_; + for (int i = 0; i < rule->f_.size(); ++i) + cur = &cur->tree_[rule->f_[i]]; + if (cur->rb_ == NULL) + cur->rb_ = 
new TextRuleBin; + cur->rb_->AddRule(rule); + } +} + +void TextGrammar::ReadFromFile(const string& filename) { + ReadFile in(filename); + istream& in_file = *in.stream(); + assert(in_file); + long long int rule_count = 0; + bool fl = false; + while(in_file) { + string line; + getline(in_file, line); + if (line.empty()) continue; + ++rule_count; + if (rule_count % 50000 == 0) { cerr << '.' << flush; fl = true; } + if (rule_count % 2000000 == 0) { cerr << " [" << rule_count << "]\n"; fl = false; } + TRulePtr rule(TRule::CreateRuleSynchronous(line)); + if (rule) { + AddRule(rule); + } else { + if (fl) { cerr << endl; } + cerr << "Skipping badly formatted rule in line " << rule_count << " of " << filename << endl; + fl = false; + } + } + if (fl) cerr << endl; + cerr << " " << rule_count << " rules read.\n"; +} + +bool TextGrammar::HasRuleForSpan(int i, int j, int distance) const { + return (max_span_ >= distance); +} + +GlueGrammar::GlueGrammar(const string& file) : TextGrammar(file) {} + +GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) { + TRulePtr stop_glue(new TRule("[" + goal_nt + "] ||| [" + default_nt + ",1] ||| [" + default_nt + ",1]")); + TRulePtr glue(new TRule("[" + goal_nt + "] ||| [" + goal_nt + ",1] [" + + default_nt + ",2] ||| [" + goal_nt + ",1] [" + default_nt + ",2] ||| Glue=1")); + + AddRule(stop_glue); + AddRule(glue); + //cerr << "GLUE: " << stop_glue->AsString() << endl; + //cerr << "GLUE: " << glue->AsString() << endl; +} + +bool GlueGrammar::HasRuleForSpan(int i, int j, int distance) const { + (void) j; + return (i == 0); +} + +PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat) : + has_rule_(input.size() + 1) { + for (int i = 0; i < input.size(); ++i) { + const vector& alts = input[i]; + for (int k = 0; k < alts.size(); ++k) { + const int j = alts[k].dist2next + i; + has_rule_[i].insert(j); + const string& src = TD::Convert(alts[k].label); + TRulePtr pt(new TRule("[" + cat + "] ||| " + src + 
" ||| " + src + " ||| PassThrough=1")); + AddRule(pt); +// cerr << "PT: " << pt->AsString() << endl; + } + } +} + +bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const { + const set& hr = has_rule_[i]; + if (i == j) { return !hr.empty(); } + return (hr.find(j) != hr.end()); +} diff --git a/decoder/grammar.h b/decoder/grammar.h new file mode 100644 index 00000000..3471e3f1 --- /dev/null +++ b/decoder/grammar.h @@ -0,0 +1,83 @@ +#ifndef GRAMMAR_H_ +#define GRAMMAR_H_ + +#include +#include +#include +#include + +#include "lattice.h" +#include "trule.h" + +struct RuleBin { + virtual ~RuleBin(); + virtual int GetNumRules() const = 0; + virtual TRulePtr GetIthRule(int i) const = 0; + virtual int Arity() const = 0; +}; + +struct GrammarIter { + virtual ~GrammarIter(); + virtual const RuleBin* GetRules() const = 0; + virtual const GrammarIter* Extend(int symbol) const = 0; +}; + +struct Grammar { + typedef std::map > Cat2Rules; + static const std::vector NO_RULES; + + virtual ~Grammar(); + virtual const GrammarIter* GetRoot() const = 0; + virtual bool HasRuleForSpan(int i, int j, int distance) const; + + // cat is the category to be rewritten + inline const std::vector& GetAllUnaryRules() const { + return unaries_; + } + + // get all the unary rules that rewrite category cat + inline const std::vector& GetUnaryRulesForRHS(const WordID& cat) const { + Cat2Rules::const_iterator found = rhs2unaries_.find(cat); + if (found == rhs2unaries_.end()) + return NO_RULES; + else + return found->second; + } + + protected: + Cat2Rules rhs2unaries_; // these must be filled in by subclasses! 
+ std::vector unaries_; +}; + +typedef boost::shared_ptr GrammarPtr; + +class TGImpl; +struct TextGrammar : public Grammar { + TextGrammar(); + TextGrammar(const std::string& file); + void SetMaxSpan(int m) { max_span_ = m; } + virtual const GrammarIter* GetRoot() const; + void AddRule(const TRulePtr& rule); + void ReadFromFile(const std::string& filename); + virtual bool HasRuleForSpan(int i, int j, int distance) const; + const std::vector& GetUnaryRules(const WordID& cat) const; + private: + int max_span_; + boost::shared_ptr pimpl_; +}; + +struct GlueGrammar : public TextGrammar { + // read glue grammar from file + explicit GlueGrammar(const std::string& file); + GlueGrammar(const std::string& goal_nt, const std::string& default_nt); // "S", "X" + virtual bool HasRuleForSpan(int i, int j, int distance) const; +}; + +struct PassThroughGrammar : public TextGrammar { + PassThroughGrammar(const Lattice& input, const std::string& cat); + virtual bool HasRuleForSpan(int i, int j, int distance) const; + private: + std::vector > has_rule_; // index by [i][j] +}; + +#endif diff --git a/decoder/grammar_test.cc b/decoder/grammar_test.cc new file mode 100644 index 00000000..62b8f958 --- /dev/null +++ b/decoder/grammar_test.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include "trule.h" +#include "tdict.h" +#include "grammar.h" +#include "bottom_up_parser.h" +#include "ff.h" +#include "weights.h" + +using namespace std; + +class GrammarTest : public testing::Test { + public: + GrammarTest() { + wts.InitFromFile("test_data/weights.gt"); + } + protected: + virtual void SetUp() { } + virtual void TearDown() { } + Weights wts; +}; + +TEST_F(GrammarTest,TestTextGrammar) { + vector w; + vector ms; + ModelSet models(w, ms); + + TextGrammar g; + TRulePtr r1(new TRule("[X] ||| a b c ||| A B C ||| 0.1 0.2 0.3", true)); + TRulePtr r2(new TRule("[X] ||| a b c ||| 1 2 3 ||| 0.2 0.3 0.4", true)); + TRulePtr r3(new TRule("[X] ||| a b c d ||| A B C D ||| 0.1 0.2 
0.3", true)); + cerr << r1->AsString() << endl; + g.AddRule(r1); + g.AddRule(r2); + g.AddRule(r3); +} + +TEST_F(GrammarTest,TestTextGrammarFile) { + GrammarPtr g(new TextGrammar("./test_data/grammar.prune")); + vector grammars(1, g); + + LatticeArc a(TD::Convert("ein"), 0.0, 1); + LatticeArc b(TD::Convert("haus"), 0.0, 1); + Lattice lattice(2); + lattice[0].push_back(a); + lattice[1].push_back(b); + Hypergraph forest; + ExhaustiveBottomUpParser parser("PHRASE", grammars); + parser.Parse(lattice, &forest); + forest.PrintGraphviz(); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/decoder/gzstream.cc b/decoder/gzstream.cc new file mode 100644 index 00000000..9703e6ad --- /dev/null +++ b/decoder/gzstream.cc @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.1 $ +// Revision_date : $Date: 2006/03/30 04:05:52 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include "gzstream.h" +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + char fmode[10]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + *fmodeptr = '\0'; + file = gzopen( name, fmode); + if (file == 0) + return (gzstreambuf*)0; + opened = 1; + return this; +} + +gzstreambuf * gzstreambuf::close() { + if ( is_open()) { + sync(); + opened = 0; + if ( gzclose( file) == Z_OK) + return this; + } + return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only + if ( gptr() && ( gptr() < egptr())) + return * 
reinterpret_cast( gptr()); + + if ( ! (mode & std::ios::in) || ! opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + return EOF; + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + return EOF; + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. + if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! 
buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/decoder/gzstream.h b/decoder/gzstream.h new file mode 100644 index 00000000..ad9785fd --- /dev/null +++ b/decoder/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.1 $ +// Revision_date : $Date: 2006/03/30 04:05:52 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". 
+// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. 
+// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/decoder/hg.cc b/decoder/hg.cc new file mode 100644 index 00000000..7bd79394 --- /dev/null +++ b/decoder/hg.cc @@ -0,0 +1,486 @@ +#include "hg.h" + +#include +#include +#include +#include +#include + +#include "viterbi.h" +#include "inside_outside.h" +#include "tdict.h" + +using namespace std; + +double Hypergraph::NumberOfPaths() const { + return Inside(*this); +} + +prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector* posts) const { + const ScaledEdgeProb weight(scale); + SparseVector pv; + const double inside = InsideOutside, + EdgeFeaturesWeightFunction>(*this, &pv, weight); + posts->resize(edges_.size()); + for (int i = 0; i < edges_.size(); ++i) + (*posts)[i] = prob_t(pv.value(i)); + return prob_t(inside); +} + +prob_t Hypergraph::ComputeBestPathThroughEdges(vector* post) const { + vector in(edges_.size()); + vector out(edges_.size()); + post->resize(edges_.size()); + + vector 
ins_node_best(nodes_.size()); + for (int i = 0; i < nodes_.size(); ++i) { + const Node& node = nodes_[i]; + prob_t& node_ins_best = ins_node_best[i]; + if (node.in_edges_.empty()) node_ins_best = prob_t::One(); + for (int j = 0; j < node.in_edges_.size(); ++j) { + const Edge& edge = edges_[node.in_edges_[j]]; + prob_t& in_edge_sco = in[node.in_edges_[j]]; + in_edge_sco = edge.edge_prob_; + for (int k = 0; k < edge.tail_nodes_.size(); ++k) + in_edge_sco *= ins_node_best[edge.tail_nodes_[k]]; + if (in_edge_sco > node_ins_best) node_ins_best = in_edge_sco; + } + } + const prob_t ins_sco = ins_node_best[nodes_.size() - 1]; + + // sanity check + int tots = 0; + for (int i = 0; i < nodes_.size(); ++i) { if (nodes_[i].out_edges_.empty()) tots++; } + assert(tots == 1); + + // compute outside scores, potentially using inside scores + vector out_node_best(nodes_.size()); + for (int i = nodes_.size() - 1; i >= 0; --i) { + const Node& node = nodes_[i]; + prob_t& node_out_best = out_node_best[node.id_]; + if (node.out_edges_.empty()) node_out_best = prob_t::One(); + for (int j = 0; j < node.out_edges_.size(); ++j) { + const Edge& edge = edges_[node.out_edges_[j]]; + prob_t sco = edge.edge_prob_ * out_node_best[edge.head_node_]; + for (int k = 0; k < edge.tail_nodes_.size(); ++k) { + if (edge.tail_nodes_[k] != i) + sco *= ins_node_best[edge.tail_nodes_[k]]; + } + if (sco > node_out_best) node_out_best = sco; + } + for (int j = 0; j < node.in_edges_.size(); ++j) { + out[node.in_edges_[j]] = node_out_best; + } + } + + for (int i = 0; i < in.size(); ++i) + (*post)[i] = in[i] * out[i]; + // for (int i = 0; i < in.size(); ++i) + // cerr << "edge " << i << ": " << log((*post)[i]) << endl; + + return ins_sco; +} + +void Hypergraph::PushWeightsToSource(double scale) { + vector posts; + ComputeEdgePosteriors(scale, &posts); + for (int i = 0; i < nodes_.size(); ++i) { + const Hypergraph::Node& node = nodes_[i]; + prob_t z = prob_t::Zero(); + for (int j = 0; j < node.out_edges_.size(); 
++j) + z += posts[node.out_edges_[j]]; + for (int j = 0; j < node.out_edges_.size(); ++j) { + edges_[node.out_edges_[j]].edge_prob_ = posts[node.out_edges_[j]] / z; + } + } +} + +void Hypergraph::PushWeightsToGoal(double scale) { + vector posts; + ComputeEdgePosteriors(scale, &posts); + for (int i = 0; i < nodes_.size(); ++i) { + const Hypergraph::Node& node = nodes_[i]; + prob_t z = prob_t::Zero(); + for (int j = 0; j < node.in_edges_.size(); ++j) + z += posts[node.in_edges_[j]]; + for (int j = 0; j < node.in_edges_.size(); ++j) { + edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z; + } + } +} + +void Hypergraph::PruneEdges(const std::vector& prune_edge) { + assert(prune_edge.size() == edges_.size()); + TopologicallySortNodesAndEdges(nodes_.size() - 1, &prune_edge); +} + +void Hypergraph::DensityPruneInsideOutside(const double scale, + const bool use_sum_prod_semiring, + const double density, + const vector* preserve_mask) { + assert(density >= 1.0); + const int plen = ViterbiPathLength(*this); + vector bp; + int rnum = min(static_cast(edges_.size()), static_cast(density * static_cast(plen))); + if (rnum == edges_.size()) { + cerr << "No pruning required: denisty already sufficient"; + return; + } + vector io(edges_.size()); + if (use_sum_prod_semiring) + ComputeEdgePosteriors(scale, &io); + else + ComputeBestPathThroughEdges(&io); + assert(edges_.size() == io.size()); + vector sorted = io; + nth_element(sorted.begin(), sorted.begin() + rnum, sorted.end(), greater()); + const double cutoff = sorted[rnum]; + vector prune(edges_.size()); + for (int i = 0; i < edges_.size(); ++i) { + prune[i] = (io[i] < cutoff); + if (preserve_mask && (*preserve_mask)[i]) prune[i] = false; + } + PruneEdges(prune); +} + +void Hypergraph::BeamPruneInsideOutside( + const double scale, + const bool use_sum_prod_semiring, + const double alpha, + const vector* preserve_mask) { + assert(alpha > 0.0); + assert(scale > 0.0); + vector io(edges_.size()); + if 
(use_sum_prod_semiring) + ComputeEdgePosteriors(scale, &io); + else + ComputeBestPathThroughEdges(&io); + assert(edges_.size() == io.size()); + prob_t best; // initializes to zero + for (int i = 0; i < io.size(); ++i) + if (io[i] > best) best = io[i]; + const prob_t aprob(exp(-alpha)); + const prob_t cutoff = best * aprob; + // cerr << "aprob = " << aprob << "\t CUTOFF=" << cutoff << endl; + vector prune(edges_.size()); + //cerr << preserve_mask.size() << " " << edges_.size() << endl; + int pc = 0; + for (int i = 0; i < io.size(); ++i) { + const bool prune_edge = (io[i] < cutoff); + if (prune_edge) ++pc; + prune[i] = (io[i] < cutoff); + if (preserve_mask && (*preserve_mask)[i]) prune[i] = false; + } + // cerr << "Beam pruning " << pc << "/" << io.size() << " edges\n"; + PruneEdges(prune); +} + +void Hypergraph::PrintGraphviz() const { + int ei = 0; + cerr << "digraph G {\n rankdir=LR;\n nodesep=.05;\n"; + for (vector::const_iterator i = edges_.begin(); + i != edges_.end(); ++i) { + const Edge& edge=*i; + ++ei; + static const string none = ""; + string rule = (edge.rule_ ? edge.rule_->AsString(false) : none); + + cerr << " A_" << ei << " [label=\"" << rule << " p=" << edge.edge_prob_ + << " F:" << edge.feature_values_ + << "\" shape=\"rect\"];\n"; + for (int i = 0; i < edge.tail_nodes_.size(); ++i) { + cerr << " " << edge.tail_nodes_[i] << " -> A_" << ei << ";\n"; + } + cerr << " A_" << ei << " -> " << edge.head_node_ << ";\n"; + } + for (vector::const_iterator ni = nodes_.begin(); + ni != nodes_.end(); ++ni) { + cerr << " " << ni->id_ << "[label=\"" << (ni->cat_ < 0 ? 
TD::Convert(ni->cat_ * -1) : "") + //cerr << " " << ni->id_ << "[label=\"" << ni->cat_ + << " n=" << ni->id_ +// << ",x=" << &*ni +// << ",in=" << ni->in_edges_.size() +// << ",out=" << ni->out_edges_.size() + << "\"];\n"; + } + cerr << "}\n"; +} + +void Hypergraph::Union(const Hypergraph& other) { + if (&other == this) return; + if (nodes_.empty()) { nodes_ = other.nodes_; edges_ = other.edges_; return; } + int noff = nodes_.size(); + int eoff = edges_.size(); + int ogoal = other.nodes_.size() - 1; + int cgoal = noff - 1; + // keep a single goal node, so add nodes.size - 1 + nodes_.resize(nodes_.size() + ogoal); + // add all edges + edges_.resize(edges_.size() + other.edges_.size()); + + for (int i = 0; i < ogoal; ++i) { + const Node& on = other.nodes_[i]; + Node& cn = nodes_[i + noff]; + cn.id_ = i + noff; + cn.in_edges_.resize(on.in_edges_.size()); + for (int j = 0; j < on.in_edges_.size(); ++j) + cn.in_edges_[j] = on.in_edges_[j] + eoff; + + cn.out_edges_.resize(on.out_edges_.size()); + for (int j = 0; j < on.out_edges_.size(); ++j) + cn.out_edges_[j] = on.out_edges_[j] + eoff; + } + + for (int i = 0; i < other.edges_.size(); ++i) { + const Edge& oe = other.edges_[i]; + Edge& ce = edges_[i + eoff]; + ce.id_ = i + eoff; + ce.rule_ = oe.rule_; + ce.feature_values_ = oe.feature_values_; + if (oe.head_node_ == ogoal) { + ce.head_node_ = cgoal; + nodes_[cgoal].in_edges_.push_back(ce.id_); + } else { + ce.head_node_ = oe.head_node_ + noff; + } + ce.tail_nodes_.resize(oe.tail_nodes_.size()); + for (int j = 0; j < oe.tail_nodes_.size(); ++j) + ce.tail_nodes_[j] = oe.tail_nodes_[j] + noff; + } + + TopologicallySortNodesAndEdges(cgoal); +} + +int Hypergraph::MarkReachable(const Node& node, + vector* rmap, + const vector* prune_edges) const { + int total = 0; + if (!(*rmap)[node.id_]) { + total = 1; + (*rmap)[node.id_] = true; + for (int i = 0; i < node.in_edges_.size(); ++i) { + if (!(prune_edges && (*prune_edges)[node.in_edges_[i]])) { + for (int j = 0; j < 
edges_[node.in_edges_[i]].tail_nodes_.size(); ++j) + total += MarkReachable(nodes_[edges_[node.in_edges_[i]].tail_nodes_[j]], rmap, prune_edges); + } + } + } + return total; +} + +void Hypergraph::PruneUnreachable(int goal_node_id) { + TopologicallySortNodesAndEdges(goal_node_id, NULL); +} + +void Hypergraph::RemoveNoncoaccessibleStates(int goal_node_id) { + if (goal_node_id < 0) goal_node_id += nodes_.size(); + assert(goal_node_id >= 0); + assert(goal_node_id < nodes_.size()); + + // TODO finish implementation + abort(); +} + +void Hypergraph::TopologicallySortNodesAndEdges(int goal_index, + const vector* prune_edges) { + vector sedges(edges_.size()); + // figure out which nodes are reachable from the goal + vector reachable(nodes_.size(), false); + int num_reachable = MarkReachable(nodes_[goal_index], &reachable, prune_edges); + vector snodes(num_reachable); snodes.clear(); + + // enumerate all reachable nodes in topologically sorted order + vector old_node_to_new_id(nodes_.size(), -1); + vector node_to_incount(nodes_.size(), -1); + vector node_processed(nodes_.size(), false); + typedef map > PQueue; + PQueue pri_q; + for (int i = 0; i < nodes_.size(); ++i) { + if (!reachable[i]) + continue; + const int inedges = nodes_[i].in_edges_.size(); + int incount = inedges; + for (int j = 0; j < inedges; ++j) + if (edges_[nodes_[i].in_edges_[j]].tail_nodes_.size() == 0 || + (prune_edges && (*prune_edges)[nodes_[i].in_edges_[j]])) + --incount; + // cerr << &nodes_[i] <<" : incount=" << incount << "\tout=" << nodes_[i].out_edges_.size() << "\t(in-edges=" << inedges << ")\n"; + assert(node_to_incount[i] == -1); + node_to_incount[i] = incount; + pri_q[incount].insert(i); + } + + int edge_count = 0; + while (!pri_q.empty()) { + PQueue::iterator iter = pri_q.find(0); + assert(iter != pri_q.end()); + assert(!iter->second.empty()); + + // get first node with incount = 0 + const int cur_index = *iter->second.begin(); + const Node& node = nodes_[cur_index]; + 
assert(reachable[cur_index]); + //cerr << "node: " << node << endl; + const int new_node_index = snodes.size(); + old_node_to_new_id[cur_index] = new_node_index; + snodes.push_back(node); + Node& new_node = snodes.back(); + new_node.id_ = new_node_index; + new_node.out_edges_.clear(); + + // fix up edges - we can now process the in edges and + // the out edges of their tails + int oi = 0; + for (int i = 0; i < node.in_edges_.size(); ++i, ++oi) { + if (prune_edges && (*prune_edges)[node.in_edges_[i]]) { + --oi; + continue; + } + new_node.in_edges_[oi] = edge_count; + Edge& edge = sedges[edge_count]; + edge.id_ = edge_count; + ++edge_count; + const Edge& old_edge = edges_[node.in_edges_[i]]; + edge.rule_ = old_edge.rule_; + edge.feature_values_ = old_edge.feature_values_; + edge.head_node_ = new_node_index; + edge.tail_nodes_.resize(old_edge.tail_nodes_.size()); + edge.edge_prob_ = old_edge.edge_prob_; + edge.i_ = old_edge.i_; + edge.j_ = old_edge.j_; + edge.prev_i_ = old_edge.prev_i_; + edge.prev_j_ = old_edge.prev_j_; + for (int j = 0; j < old_edge.tail_nodes_.size(); ++j) { + const Node& old_tail_node = nodes_[old_edge.tail_nodes_[j]]; + edge.tail_nodes_[j] = old_node_to_new_id[old_tail_node.id_]; + snodes[edge.tail_nodes_[j]].out_edges_.push_back(edge_count-1); + assert(edge.tail_nodes_[j] != new_node_index); + } + } + assert(oi <= new_node.in_edges_.size()); + new_node.in_edges_.resize(oi); + + for (int i = 0; i < node.out_edges_.size(); ++i) { + const Edge& edge = edges_[node.out_edges_[i]]; + const int next_index = edge.head_node_; + assert(cur_index != next_index); + if (!reachable[next_index]) continue; + if (prune_edges && (*prune_edges)[edge.id_]) continue; + + bool dontReduce = false; + for (int j = 0; j < edge.tail_nodes_.size() && !dontReduce; ++j) { + int tail_index = edge.tail_nodes_[j]; + dontReduce = (tail_index != cur_index) && !node_processed[tail_index]; + } + if (dontReduce) + continue; + + const int incount = node_to_incount[next_index]; + if 
(incount <= 0) { + cerr << "incount = " << incount << ", should be > 0!\n"; + cerr << "do you have a cycle in your hypergraph?\n"; + abort(); + } + PQueue::iterator it = pri_q.find(incount); + assert(it != pri_q.end()); + it->second.erase(next_index); + if (it->second.empty()) pri_q.erase(it); + + // reinsert node with reduced incount + pri_q[incount-1].insert(next_index); + --node_to_incount[next_index]; + } + + // remove node from set + iter->second.erase(cur_index); + if (iter->second.empty()) + pri_q.erase(iter); + node_processed[cur_index] = true; + } + + sedges.resize(edge_count); + nodes_.swap(snodes); + edges_.swap(sedges); + assert(nodes_.back().out_edges_.size() == 0); +} + +TRulePtr Hypergraph::kEPSRule; +TRulePtr Hypergraph::kUnaryRule; + +void Hypergraph::EpsilonRemove(WordID eps) { + if (!kEPSRule) { + kEPSRule.reset(new TRule("[X] ||| ||| ")); + kUnaryRule.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); + } + vector kill(edges_.size(), false); + for (int i = 0; i < edges_.size(); ++i) { + const Edge& edge = edges_[i]; + if (edge.tail_nodes_.empty() && + edge.rule_->f_.size() == 1 && + edge.rule_->f_[0] == eps) { + kill[i] = true; + if (!edge.feature_values_.empty()) { + Node& node = nodes_[edge.head_node_]; + if (node.in_edges_.size() != 1) { + cerr << "[WARNING] edge with features going into non-empty node - can't promote\n"; + // this *probably* means that there are multiple derivations of the + // same sequence via different paths through the input forest + // this needs to be investigated and fixed + } else { + for (int j = 0; j < node.out_edges_.size(); ++j) + edges_[node.out_edges_[j]].feature_values_ += edge.feature_values_; + // cerr << "PROMOTED " << edge.feature_values_ << endl; + } + } + } + } + bool created_eps = false; + PruneEdges(kill); + for (int i = 0; i < nodes_.size(); ++i) { + const Node& node = nodes_[i]; + if (node.in_edges_.empty()) { + for (int j = 0; j < node.out_edges_.size(); ++j) { + Edge& edge = edges_[node.out_edges_[j]]; + 
if (edge.rule_->Arity() == 2) { + assert(edge.rule_->f_.size() == 2); + assert(edge.rule_->e_.size() == 2); + edge.rule_ = kUnaryRule; + int cur = node.id_; + int t = -1; + assert(edge.tail_nodes_.size() == 2); + for (int i = 0; i < 2; ++i) if (edge.tail_nodes_[i] != cur) { t = edge.tail_nodes_[i]; } + assert(t != -1); + edge.tail_nodes_.resize(1); + edge.tail_nodes_[0] = t; + } else { + edge.rule_ = kEPSRule; + edge.rule_->f_[0] = eps; + edge.rule_->e_[0] = eps; + edge.tail_nodes_.clear(); + created_eps = true; + } + } + } + } + vector k2(edges_.size(), false); + PruneEdges(k2); + if (created_eps) EpsilonRemove(eps); +} + +struct EdgeWeightSorter { + const Hypergraph& hg; + EdgeWeightSorter(const Hypergraph& h) : hg(h) {} + bool operator()(int a, int b) const { + return hg.edges_[a].edge_prob_ > hg.edges_[b].edge_prob_; + } +}; + +void Hypergraph::SortInEdgesByEdgeWeights() { + for (int i = 0; i < nodes_.size(); ++i) { + Node& node = nodes_[i]; + sort(node.in_edges_.begin(), node.in_edges_.end(), EdgeWeightSorter(*this)); + } +} + diff --git a/decoder/hg.h b/decoder/hg.h new file mode 100644 index 00000000..7a2658b8 --- /dev/null +++ b/decoder/hg.h @@ -0,0 +1,225 @@ +#ifndef _HG_H_ +#define _HG_H_ + +#include +#include + +#include "small_vector.h" +#include "sparse_vector.h" +#include "wordid.h" +#include "trule.h" +#include "prob.h" + +// class representing an acyclic hypergraph +// - edges have 1 head, 0..n tails +class Hypergraph { + public: + Hypergraph() {} + + // SmallVector is a fast, small vector implementation for sizes <= 2 + typedef SmallVector TailNodeVector; + + // TODO get rid of state_ and cat_? 
+ struct Node { + Node() : id_(), cat_() {} + int id_; // equal to this object's position in the nodes_ vector + WordID cat_; // non-terminal category if <0, 0 if not set + std::vector in_edges_; // contents refer to positions in edges_ + std::vector out_edges_; // contents refer to positions in edges_ + std::string state_; // opaque state + }; + + // TODO get rid of edge_prob_? (can be computed on the fly as the dot + // product of the weight vector and the feature values) + struct Edge { + Edge() : i_(-1), j_(-1), prev_i_(-1), prev_j_(-1) {} + inline int Arity() const { return tail_nodes_.size(); } + int head_node_; // refers to a position in nodes_ + TailNodeVector tail_nodes_; // contents refer to positions in nodes_ + TRulePtr rule_; + SparseVector feature_values_; + prob_t edge_prob_; // dot product of weights and feat_values + int id_; // equal to this object's position in the edges_ vector + + // span info. typically, i_ and j_ refer to indices in the source sentence + // if a synchronous parse has been executed i_ and j_ will refer to indices + // in the target sentence / lattice and prev_i_ prev_j_ will refer to + // positions in the source. Note: it is up to the translator implementation + // to properly set these values. For some models (like the Forest-input + // phrase based model) it may not be straightforward to do. if these values + // are not properly set, most things will work but alignment and any features + // that depend on them will be broken. 
+ short int i_; + short int j_; + short int prev_i_; + short int prev_j_; + }; + + void swap(Hypergraph& other) { + other.nodes_.swap(nodes_); + other.edges_.swap(edges_); + } + + void ResizeNodes(int size) { + nodes_.resize(size); + for (int i = 0; i < size; ++i) nodes_[i].id_ = i; + } + + // reserves space in the nodes vector to prevent memory locations + // from changing + void ReserveNodes(size_t n, size_t e = 0) { + nodes_.reserve(n); + if (e) edges_.reserve(e); + } + + Edge* AddEdge(const TRulePtr& rule, const TailNodeVector& tail) { + edges_.push_back(Edge()); + Edge* edge = &edges_.back(); + edge->rule_ = rule; + edge->tail_nodes_ = tail; + edge->id_ = edges_.size() - 1; + for (int i = 0; i < edge->tail_nodes_.size(); ++i) + nodes_[edge->tail_nodes_[i]].out_edges_.push_back(edge->id_); + return edge; + } + + Node* AddNode(const WordID& cat, const std::string& state = "") { + nodes_.push_back(Node()); + nodes_.back().cat_ = cat; + nodes_.back().state_ = state; + nodes_.back().id_ = nodes_.size() - 1; + return &nodes_.back(); + } + + void ConnectEdgeToHeadNode(const int edge_id, const int head_id) { + edges_[edge_id].head_node_ = head_id; + nodes_[head_id].in_edges_.push_back(edge_id); + } + + // TODO remove this - use the version that takes indices + void ConnectEdgeToHeadNode(Edge* edge, Node* head) { + edge->head_node_ = head->id_; + head->in_edges_.push_back(edge->id_); + } + + // merge the goal node from other with this goal node + void Union(const Hypergraph& other); + + void PrintGraphviz() const; + + // compute the total number of paths in the forest + double NumberOfPaths() const; + + // BEWARE. this assumes that the source and target language + // strings are identical and that there are no loops. + // It assumes a bunch of other things about where the + // epsilons will be. It tries to assert failure if you + // break these assumptions, but it may not. 
+ // TODO - make this work + void EpsilonRemove(WordID eps); + + // multiple the weights vector by the edge feature vector + // (inner product) to set the edge probabilities + template + void Reweight(const V& weights) { + for (int i = 0; i < edges_.size(); ++i) { + Edge& e = edges_[i]; + e.edge_prob_.logeq(e.feature_values_.dot(weights)); + } + } + + // computes inside and outside scores for each + // edge in the hypergraph + // alpha->size = edges_.size = beta->size + // returns inside prob of goal node + prob_t ComputeEdgePosteriors(double scale, + std::vector* posts) const; + + // find the score of the very best path passing through each edge + prob_t ComputeBestPathThroughEdges(std::vector* posts) const; + + // move weights as near to the source as possible, resulting in a + // stochastic automaton. ONLY FUNCTIONAL FOR *LATTICES*. + // See M. Mohri and M. Riley. A Weight Pushing Algorithm for Large + // Vocabulary Speech Recognition. 2001. + // the log semiring (NOT tropical) is used + void PushWeightsToSource(double scale = 1.0); + // same, except weights are pushed to the goal, works for HGs, + // not just lattices + void PushWeightsToGoal(double scale = 1.0); + + void SortInEdgesByEdgeWeights(); + + void PruneUnreachable(int goal_node_id); // DEPRECATED + + void RemoveNoncoaccessibleStates(int goal_node_id = -1); + + // remove edges from the hypergraph if prune_edge[edge_id] is true + void PruneEdges(const std::vector& prune_edge); + + // if you don't know, use_sum_prod_semiring should be false + void DensityPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double density, + const std::vector* preserve_mask = NULL); + + // prunes any edge whose score on the best path taking that edge is more than alpha away + // from the score of the global best past (or the highest edge posterior) + void BeamPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double alpha, + const std::vector* preserve_mask = NULL); + 
+ void clear() { + nodes_.clear(); + edges_.clear(); + } + + inline size_t NumberOfEdges() const { return edges_.size(); } + inline size_t NumberOfNodes() const { return nodes_.size(); } + inline bool empty() const { return nodes_.empty(); } + + // nodes_ is sorted in topological order + std::vector nodes_; + // edges_ is not guaranteed to be in any particular order + std::vector edges_; + + // reorder nodes_ so they are in topological order + // source nodes at 0 sink nodes at size-1 + void TopologicallySortNodesAndEdges(int goal_idx, + const std::vector* prune_edges = NULL); + private: + // returns total nodes reachable + int MarkReachable(const Node& node, + std::vector* rmap, + const std::vector* prune_edges) const; + + static TRulePtr kEPSRule; + static TRulePtr kUnaryRule; +}; + +// common WeightFunctions, map an edge -> WeightType +// for generic Viterbi/Inside algorithms +struct EdgeProb { + inline const prob_t& operator()(const Hypergraph::Edge& e) const { return e.edge_prob_; } +}; + +struct ScaledEdgeProb { + ScaledEdgeProb(const double& alpha) : alpha_(alpha) {} + inline prob_t operator()(const Hypergraph::Edge& e) const { return e.edge_prob_.pow(alpha_); } + const double alpha_; +}; + +struct EdgeFeaturesWeightFunction { + inline const SparseVector& operator()(const Hypergraph::Edge& e) const { return e.feature_values_; } +}; + +struct TransitionEventWeightFunction { + inline SparseVector operator()(const Hypergraph::Edge& e) const { + SparseVector result; + result.set_value(e.id_, prob_t::One()); + return result; + } +}; + +struct TransitionCountWeightFunction { + inline double operator()(const Hypergraph::Edge& e) const { (void)e; return 1.0; } +}; + +#endif diff --git a/decoder/hg_intersect.cc b/decoder/hg_intersect.cc new file mode 100644 index 00000000..a5e8913a --- /dev/null +++ b/decoder/hg_intersect.cc @@ -0,0 +1,121 @@ +#include "hg_intersect.h" + +#include +#include +#include +#include + +#include "tdict.h" +#include "hg.h" +#include 
"trule.h" +#include "wordid.h" +#include "bottom_up_parser.h" + +using boost::lexical_cast; +using namespace std::tr1; +using namespace std; + +struct RuleFilter { + unordered_map, bool, boost::hash > > exists_; + bool true_lattice; + RuleFilter(const Lattice& target, int max_phrase_size) { + true_lattice = false; + for (int i = 0; i < target.size(); ++i) { + vector phrase; + int lim = min(static_cast(target.size()), i + max_phrase_size); + for (int j = i; j < lim; ++j) { + if (target[j].size() > 1) { true_lattice = true; break; } + phrase.push_back(target[j][0].label); + exists_[phrase] = true; + } + } + vector sos(1, TD::Convert("")); + exists_[sos] = true; + } + bool operator()(const TRule& r) const { + // TODO do some smarter filtering for lattices + if (true_lattice) return false; // don't filter "true lattice" input + const vector& e = r.e(); + for (int i = 0; i < e.size(); ++i) { + if (e[i] <= 0) continue; + vector phrase; + for (int j = i; j < e.size(); ++j) { + if (e[j] <= 0) break; + phrase.push_back(e[j]); + if (exists_.count(phrase) == 0) return true; + } + } + return false; + } +}; + +bool HG::Intersect(const Lattice& target, Hypergraph* hg) { + vector rem(hg->edges_.size(), false); + const RuleFilter filter(target, 15); // TODO make configurable + for (int i = 0; i < rem.size(); ++i) + rem[i] = filter(*hg->edges_[i].rule_); + hg->PruneEdges(rem); + + const int nedges = hg->edges_.size(); + const int nnodes = hg->nodes_.size(); + + TextGrammar* g = new TextGrammar; + GrammarPtr gp(g); + vector cats(nnodes); + // each node in the translation forest becomes a "non-terminal" in the new + // grammar, create the labels here + for (int i = 0; i < nnodes; ++i) + cats[i] = TD::Convert("CAT_" + lexical_cast(i)) * -1; + + // construct the grammar + for (int i = 0; i < nedges; ++i) { + const Hypergraph::Edge& edge = hg->edges_[i]; + const vector& tgt = edge.rule_->e(); + const vector& src = edge.rule_->f(); + TRulePtr rule(new TRule); + rule->prev_i = edge.i_; + 
rule->prev_j = edge.j_; + rule->lhs_ = cats[edge.head_node_]; + vector& f = rule->f_; + vector& e = rule->e_; + f.resize(tgt.size()); // swap source and target, since the parser + e.resize(src.size()); // parses using the source side! + Hypergraph::TailNodeVector tn(edge.tail_nodes_.size()); + int ntc = 0; + for (int j = 0; j < tgt.size(); ++j) { + const WordID& cur = tgt[j]; + if (cur > 0) { + f[j] = cur; + } else { + tn[ntc++] = cur; + f[j] = cats[edge.tail_nodes_[-cur]]; + } + } + ntc = 0; + for (int j = 0; j < src.size(); ++j) { + const WordID& cur = src[j]; + if (cur > 0) { + e[j] = cur; + } else { + e[j] = tn[ntc++]; + } + } + rule->scores_ = edge.feature_values_; + rule->parent_rule_ = edge.rule_; + rule->ComputeArity(); + //cerr << "ADD: " << rule->AsString() << endl; + + g->AddRule(rule); + } + g->SetMaxSpan(target.size() + 1); + const string& new_goal = TD::Convert(cats.back() * -1); + vector grammars(1, gp); + Hypergraph tforest; + ExhaustiveBottomUpParser parser(new_goal, grammars); + if (!parser.Parse(target, &tforest)) + return false; + else + hg->swap(tforest); + return true; +} + diff --git a/decoder/hg_intersect.h b/decoder/hg_intersect.h new file mode 100644 index 00000000..826bdaae --- /dev/null +++ b/decoder/hg_intersect.h @@ -0,0 +1,13 @@ +#ifndef _HG_INTERSECT_H_ +#define _HG_INTERSECT_H_ + +#include + +#include "lattice.h" + +class Hypergraph; +struct HG { + static bool Intersect(const Lattice& target, Hypergraph* hg); +}; + +#endif diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc new file mode 100644 index 00000000..243106b8 --- /dev/null +++ b/decoder/hg_io.cc @@ -0,0 +1,599 @@ +#include "hg_io.h" + +#include +#include + +#include + +#include "tdict.h" +#include "json_parse.h" +#include "hg.h" + +using namespace std; + +struct HGReader : public JSONParser { + HGReader(Hypergraph* g) : rp("[X] ||| "), state(-1), hg(*g), nodes_needed(true), edges_needed(true) { nodes = 0; edges = 0; } + + void CreateNode(const string& cat, const vector& 
in_edges) { + WordID c = TD::Convert("X") * -1; + if (!cat.empty()) c = TD::Convert(cat) * -1; + Hypergraph::Node* node = hg.AddNode(c, ""); + for (int i = 0; i < in_edges.size(); ++i) { + if (in_edges[i] >= hg.edges_.size()) { + cerr << "JSONParser: in_edges[" << i << "]=" << in_edges[i] + << ", but hg only has " << hg.edges_.size() << " edges!\n"; + abort(); + } + hg.ConnectEdgeToHeadNode(&hg.edges_[in_edges[i]], node); + } + } + void CreateEdge(const TRulePtr& rule, SparseVector* feats, const SmallVector& tail) { + Hypergraph::Edge* edge = hg.AddEdge(rule, tail); + feats->swap(edge->feature_values_); + } + + bool HandleJSONEvent(int type, const JSON_value* value) { + switch(state) { + case -1: + assert(type == JSON_T_OBJECT_BEGIN); + state = 0; + break; + case 0: + if (type == JSON_T_OBJECT_END) { + //cerr << "HG created\n"; // TODO, signal some kind of callback + } else if (type == JSON_T_KEY) { + string val = value->vu.str.value; + if (val == "features") { assert(fdict.empty()); state = 1; } + else if (val == "is_sorted") { state = 3; } + else if (val == "rules") { assert(rules.empty()); state = 4; } + else if (val == "node") { state = 8; } + else if (val == "edges") { state = 13; } + else { cerr << "Unexpected key: " << val << endl; return false; } + } + break; + + // features + case 1: + if(type == JSON_T_NULL) { state = 0; break; } + assert(type == JSON_T_ARRAY_BEGIN); + state = 2; + break; + case 2: + if(type == JSON_T_ARRAY_END) { state = 0; break; } + assert(type == JSON_T_STRING); + fdict.push_back(FD::Convert(value->vu.str.value)); + break; + + // is_sorted + case 3: + assert(type == JSON_T_TRUE || type == JSON_T_FALSE); + is_sorted = (type == JSON_T_TRUE); + if (!is_sorted) { cerr << "[WARNING] is_sorted flag is ignored\n"; } + state = 0; + break; + + // rules + case 4: + if(type == JSON_T_NULL) { state = 0; break; } + assert(type == JSON_T_ARRAY_BEGIN); + state = 5; + break; + case 5: + if(type == JSON_T_ARRAY_END) { state = 0; break; } + assert(type 
== JSON_T_INTEGER); + state = 6; + rule_id = value->vu.integer_value; + break; + case 6: + assert(type == JSON_T_STRING); + rules[rule_id] = TRulePtr(new TRule(value->vu.str.value)); + state = 5; + break; + + // Nodes + case 8: + assert(type == JSON_T_OBJECT_BEGIN); + ++nodes; + in_edges.clear(); + cat.clear(); + state = 9; break; + case 9: + if (type == JSON_T_OBJECT_END) { + //cerr << "Creating NODE\n"; + CreateNode(cat, in_edges); + state = 0; break; + } + assert(type == JSON_T_KEY); + cur_key = value->vu.str.value; + if (cur_key == "cat") { assert(cat.empty()); state = 10; break; } + if (cur_key == "in_edges") { assert(in_edges.empty()); state = 11; break; } + cerr << "Syntax error: unexpected key " << cur_key << " in node specification.\n"; + return false; + case 10: + assert(type == JSON_T_STRING || type == JSON_T_NULL); + cat = value->vu.str.value; + state = 9; break; + case 11: + if (type == JSON_T_NULL) { state = 9; break; } + assert(type == JSON_T_ARRAY_BEGIN); + state = 12; break; + case 12: + if (type == JSON_T_ARRAY_END) { state = 9; break; } + assert(type == JSON_T_INTEGER); + //cerr << "in_edges: " << value->vu.integer_value << endl; + in_edges.push_back(value->vu.integer_value); + break; + + // "edges": [ { "tail": null, "feats" : [0,1.63,1,-0.54], "rule": 12}, + // { "tail": null, "feats" : [0,0.87,1,0.02], "rule": 17}, + // { "tail": [0], "feats" : [1,2.3,2,15.3,"ExtraFeature",1.2], "rule": 13}] + case 13: + assert(type == JSON_T_ARRAY_BEGIN); + state = 14; + break; + case 14: + if (type == JSON_T_ARRAY_END) { state = 0; break; } + assert(type == JSON_T_OBJECT_BEGIN); + //cerr << "New edge\n"; + ++edges; + cur_rule.reset(); feats.clear(); tail.clear(); + state = 15; break; + case 15: + if (type == JSON_T_OBJECT_END) { + CreateEdge(cur_rule, &feats, tail); + state = 14; break; + } + assert(type == JSON_T_KEY); + cur_key = value->vu.str.value; + //cerr << "edge key " << cur_key << endl; + if (cur_key == "rule") { assert(!cur_rule); state = 16; 
break; } + if (cur_key == "feats") { assert(feats.empty()); state = 17; break; } + if (cur_key == "tail") { assert(tail.empty()); state = 20; break; } + cerr << "Unexpected key " << cur_key << " in edge specification\n"; + return false; + case 16: // edge.rule + if (type == JSON_T_INTEGER) { + int rule_id = value->vu.integer_value; + if (rules.find(rule_id) == rules.end()) { + // rules list must come before the edge definitions! + cerr << "Rule_id " << rule_id << " given but only loaded " << rules.size() << " rules\n"; + return false; + } + cur_rule = rules[rule_id]; + } else if (type == JSON_T_STRING) { + cur_rule.reset(new TRule(value->vu.str.value)); + } else { + cerr << "Rule must be either a rule id or a rule string" << endl; + return false; + } + // cerr << "Edge: rule=" << cur_rule->AsString() << endl; + state = 15; + break; + case 17: // edge.feats + if (type == JSON_T_NULL) { state = 15; break; } + assert(type == JSON_T_ARRAY_BEGIN); + state = 18; break; + case 18: + if (type == JSON_T_ARRAY_END) { state = 15; break; } + if (type != JSON_T_INTEGER && type != JSON_T_STRING) { + cerr << "Unexpected feature id type\n"; return false; + } + if (type == JSON_T_INTEGER) { + fid = value->vu.integer_value; + assert(fid < fdict.size()); + fid = fdict[fid]; + } else if (JSON_T_STRING) { + fid = FD::Convert(value->vu.str.value); + } else { abort(); } + state = 19; + break; + case 19: + { + assert(type == JSON_T_INTEGER || type == JSON_T_FLOAT); + double val = (type == JSON_T_INTEGER ? 
static_cast(value->vu.integer_value) : + strtod(value->vu.str.value, NULL)); + feats.set_value(fid, val); + state = 18; + break; + } + case 20: // edge.tail + if (type == JSON_T_NULL) { state = 15; break; } + assert(type == JSON_T_ARRAY_BEGIN); + state = 21; break; + case 21: + if (type == JSON_T_ARRAY_END) { state = 15; break; } + assert(type == JSON_T_INTEGER); + tail.push_back(value->vu.integer_value); + break; + } + return true; + } + string rp; + string cat; + SmallVector tail; + vector in_edges; + TRulePtr cur_rule; + map rules; + vector fdict; + SparseVector feats; + int state; + int fid; + int nodes; + int edges; + string cur_key; + Hypergraph& hg; + int rule_id; + bool nodes_needed; + bool edges_needed; + bool is_sorted; +}; + +bool HypergraphIO::ReadFromJSON(istream* in, Hypergraph* hg) { + hg->clear(); + HGReader reader(hg); + return reader.Parse(in); +} + +static void WriteRule(const TRule& r, ostream* out) { + if (!r.lhs_) { (*out) << "[X] ||| "; } + JSONParser::WriteEscapedString(r.AsString(), out); +} + +bool HypergraphIO::WriteToJSON(const Hypergraph& hg, bool remove_rules, ostream* out) { + map rid; + ostream& o = *out; + rid[NULL] = 0; + o << '{'; + if (!remove_rules) { + o << "\"rules\":["; + for (int i = 0; i < hg.edges_.size(); ++i) { + const TRule* r = hg.edges_[i].rule_.get(); + int &id = rid[r]; + if (!id) { + id=rid.size() - 1; + if (id > 1) o << ','; + o << id << ','; + WriteRule(*r, &o); + }; + } + o << "],"; + } + const bool use_fdict = FD::NumFeats() < 1000; + if (use_fdict) { + o << "\"features\":["; + for (int i = 1; i < FD::NumFeats(); ++i) { + o << (i==1 ? 
"":",") << '"' << FD::Convert(i) << '"'; + } + o << "],"; + } + vector edgemap(hg.edges_.size(), -1); // edges may be in non-topo order + int edge_count = 0; + for (int i = 0; i < hg.nodes_.size(); ++i) { + const Hypergraph::Node& node = hg.nodes_[i]; + if (i > 0) { o << ","; } + o << "\"edges\":["; + for (int j = 0; j < node.in_edges_.size(); ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + edgemap[edge.id_] = edge_count; + ++edge_count; + o << (j == 0 ? "" : ",") << "{"; + + o << "\"tail\":["; + for (int k = 0; k < edge.tail_nodes_.size(); ++k) { + o << (k > 0 ? "," : "") << edge.tail_nodes_[k]; + } + o << "],"; + + o << "\"feats\":["; + bool first = true; + for (SparseVector::const_iterator it = edge.feature_values_.begin(); it != edge.feature_values_.end(); ++it) { + if (!it->second) continue; + if (!first) o << ','; + if (use_fdict) + o << (it->first - 1); + else + o << '"' << FD::Convert(it->first) << '"'; + o << ',' << it->second; + first = false; + } + o << "]"; + if (!remove_rules) { o << ",\"rule\":" << rid[edge.rule_.get()]; } + o << "}"; + } + o << "],"; + + o << "\"node\":{\"in_edges\":["; + for (int j = 0; j < node.in_edges_.size(); ++j) { + int mapped_edge = edgemap[node.in_edges_[j]]; + assert(mapped_edge >= 0); + o << (j == 0 ? 
"" : ",") << mapped_edge; + } + o << "]"; + if (node.cat_ < 0) { o << ",\"cat\":\"" << TD::Convert(node.cat_ * -1) << '"'; } + o << "}"; + } + o << "}\n"; + return true; +} + +bool needs_escape[128]; +void InitEscapes() { + memset(needs_escape, false, 128); + needs_escape[static_cast('\'')] = true; + needs_escape[static_cast('\\')] = true; +} + +string HypergraphIO::Escape(const string& s) { + size_t len = s.size(); + for (int i = 0; i < s.size(); ++i) { + unsigned char c = s[i]; + if (c < 128 && needs_escape[c]) ++len; + } + if (len == s.size()) return s; + string res(len, ' '); + size_t o = 0; + for (int i = 0; i < s.size(); ++i) { + unsigned char c = s[i]; + if (c < 128 && needs_escape[c]) + res[o++] = '\\'; + res[o++] = c; + } + assert(o == len); + return res; +} + +string HypergraphIO::AsPLF(const Hypergraph& hg, bool include_global_parentheses) { + static bool first = true; + if (first) { InitEscapes(); first = false; } + if (hg.nodes_.empty()) return "()"; + ostringstream os; + if (include_global_parentheses) os << '('; + static const string EPS="*EPS*"; + for (int i = 0; i < hg.nodes_.size()-1; ++i) { + if (hg.nodes_[i].out_edges_.empty()) abort(); + const bool last_node = (i == hg.nodes_.size() - 2); + const int out_edges_size = hg.nodes_[i].out_edges_.size(); + // compound splitter adds an extra goal transition which we suppress with + // the following conditional + if (!last_node || out_edges_size != 1 || + hg.edges_[hg.nodes_[i].out_edges_[0]].rule_->EWords() == 1) { + os << '('; + for (int j = 0; j < out_edges_size; ++j) { + const Hypergraph::Edge& e = hg.edges_[hg.nodes_[i].out_edges_[j]]; + const string output = e.rule_->e_.size() ==2 ? 
Escape(TD::Convert(e.rule_->e_[1])) : EPS; + double prob = log(e.edge_prob_); + if (isinf(prob)) { prob = -9e20; } + if (isnan(prob)) { prob = 0; } + os << "('" << output << "'," << prob << "," << e.head_node_ - i << "),"; + } + os << "),"; + } + } + if (include_global_parentheses) os << ')'; + return os.str(); +} + +namespace PLF { + +const string chars = "'\\"; +const char& quote = chars[0]; +const char& slash = chars[1]; + +// safe get +inline char get(const std::string& in, int c) { + if (c < 0 || c >= (int)in.size()) return 0; + else return in[(size_t)c]; +} + +// consume whitespace +inline void eatws(const std::string& in, int& c) { + while (get(in,c) == ' ') { c++; } +} + +// from 'foo' return foo +std::string getEscapedString(const std::string& in, int &c) +{ + eatws(in,c); + if (get(in,c++) != quote) return "ERROR"; + std::string res; + char cur = 0; + do { + cur = get(in,c++); + if (cur == slash) { res += get(in,c++); } + else if (cur != quote) { res += cur; } + } while (get(in,c) != quote && (c < (int)in.size())); + c++; + eatws(in,c); + return res; +} + +// basically atof +float getFloat(const std::string& in, int &c) +{ + std::string tmp; + eatws(in,c); + while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') { + tmp += get(in,c++); + } + eatws(in,c); + if (tmp.empty()) { + cerr << "Syntax error while reading number! 
col=" << c << endl; + abort(); + } + return atof(tmp.c_str()); +} + +// basically atoi +int getInt(const std::string& in, int &c) +{ + std::string tmp; + eatws(in,c); + while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') { + tmp += get(in,c++); + } + eatws(in,c); + return atoi(tmp.c_str()); +} + +// maximum number of nodes permitted +#define MAX_NODES 100000000 +// parse ('foo', 0.23) +void ReadPLFEdge(const std::string& in, int &c, int cur_node, Hypergraph* hg) { + if (get(in,c++) != '(') { assert(!"PCN/PLF parse error: expected ( at start of cn alt block\n"); } + vector ewords(2, 0); + ewords[1] = TD::Convert(getEscapedString(in,c)); + TRulePtr r(new TRule(ewords)); + r->ComputeArity(); + // cerr << "RULE: " << r->AsString() << endl; + if (get(in,c++) != ',') { assert(!"PCN/PLF parse error: expected , after string\n"); } + size_t cnNext = 1; + std::vector probs; + probs.push_back(getFloat(in,c)); + while (get(in,c) == ',') { + c++; + float val = getFloat(in,c); + probs.push_back(val); + // cerr << val << endl; //REMO + } + //if we read more than one prob, this was a lattice, last item was column increment + if (probs.size()>1) { + cnNext = static_cast(probs.back()); + probs.pop_back(); + if (cnNext < 1) { cerr << cnNext << endl; + assert(!"PCN/PLF parse error: bad link length at last element of cn alt block\n"); } + } + if (get(in,c++) != ')') { assert(!"PCN/PLF parse error: expected ) at end of cn alt block\n"); } + eatws(in,c); + Hypergraph::TailNodeVector tail(1, cur_node); + Hypergraph::Edge* edge = hg->AddEdge(r, tail); + //cerr << " <--" << cur_node << endl; + int head_node = cur_node + cnNext; + assert(head_node < MAX_NODES); // prevent malicious PLFs from using all the memory + if (hg->nodes_.size() < (head_node + 1)) { hg->ResizeNodes(head_node + 1); } + hg->ConnectEdgeToHeadNode(edge, &hg->nodes_[head_node]); + for (int i = 0; i < probs.size(); ++i) + edge->feature_values_.set_value(FD::Convert("Feature_" + 
boost::lexical_cast(i)), probs[i]); +} + +// parse (('foo', 0.23), ('bar', 0.77)) +void ReadPLFNode(const std::string& in, int &c, int cur_node, int line, Hypergraph* hg) { + //cerr << "PLF READING NODE " << cur_node << endl; + if (hg->nodes_.size() < (cur_node + 1)) { hg->ResizeNodes(cur_node + 1); } + if (get(in,c++) != '(') { cerr << line << ": Syntax error 1\n"; abort(); } + eatws(in,c); + while (1) { + if (c > (int)in.size()) { break; } + if (get(in,c) == ')') { + c++; + eatws(in,c); + break; + } + if (get(in,c) == ',' && get(in,c+1) == ')') { + c+=2; + eatws(in,c); + break; + } + if (get(in,c) == ',') { c++; eatws(in,c); } + ReadPLFEdge(in, c, cur_node, hg); + } +} + +} // namespace PLF + +void HypergraphIO::ReadFromPLF(const std::string& in, Hypergraph* hg, int line) { + hg->clear(); + int c = 0; + int cur_node = 0; + if (in[c++] != '(') { cerr << line << ": Syntax error!\n"; abort(); } + while (1) { + if (c > (int)in.size()) { break; } + if (PLF::get(in,c) == ')') { + c++; + PLF::eatws(in,c); + break; + } + if (PLF::get(in,c) == ',' && PLF::get(in,c+1) == ')') { + c+=2; + PLF::eatws(in,c); + break; + } + if (PLF::get(in,c) == ',') { c++; PLF::eatws(in,c); } + PLF::ReadPLFNode(in, c, cur_node, line, hg); + ++cur_node; + } + assert(cur_node == hg->nodes_.size() - 1); +} + +void HypergraphIO::PLFtoLattice(const string& plf, Lattice* pl) { + Lattice& l = *pl; + Hypergraph g; + ReadFromPLF(plf, &g, 0); + const int num_nodes = g.nodes_.size() - 1; + l.resize(num_nodes); + for (int i = 0; i < num_nodes; ++i) { + vector& alts = l[i]; + const Hypergraph::Node& node = g.nodes_[i]; + const int num_alts = node.out_edges_.size(); + alts.resize(num_alts); + for (int j = 0; j < num_alts; ++j) { + const Hypergraph::Edge& edge = g.edges_[node.out_edges_[j]]; + alts[j].label = edge.rule_->e_[1]; + alts[j].cost = edge.feature_values_.value(FD::Convert("Feature_0")); + alts[j].dist2next = edge.head_node_ - node.id_; + } + } +} + +namespace B64 { + +static const char 
cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; + +static void encodeblock(const unsigned char* in, ostream* os, int len) { + char out[4]; + out[0] = cb64[ in[0] >> 2 ]; + out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; + out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); + out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); + os->write(out, 4); +} + +void b64encode(const char* data, const size_t size, ostream* out) { + size_t cur = 0; + while(cur < size) { + int len = min(static_cast(3), size - cur); + encodeblock(reinterpret_cast(&data[cur]), out, len); + cur += len; + } +} + +static void decodeblock(const unsigned char* in, unsigned char* out) { + out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); + out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); + out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); +} + +bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { + size_t cur = 0; + size_t ocur = 0; + unsigned char in[4]; + while(cur < insize) { + assert(ocur < outsize); + for (int i = 0; i < 4; ++i) { + unsigned char v = data[cur]; + v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]); + if (!v) { + cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; + return false; + } + v = (unsigned char) ((v == '$') ? 
'\0' : v - 61); + if (v) in[i] = v - 1; else in[i] = 0; + ++cur; + } + decodeblock(in, reinterpret_cast(&out[ocur])); + ocur += 3; + } + return true; +} +} + diff --git a/decoder/hg_io.h b/decoder/hg_io.h new file mode 100644 index 00000000..69a516c1 --- /dev/null +++ b/decoder/hg_io.h @@ -0,0 +1,37 @@ +#ifndef _HG_IO_H_ +#define _HG_IO_H_ + +#include + +#include "lattice.h" +class Hypergraph; + +struct HypergraphIO { + + // the format is basically a list of nodes and edges in topological order + // any edge you read, you must have already read its tail nodes + // any node you read, you must have already read its incoming edges + // this may make writing a bit more challenging if your forest is not + // topologically sorted (but that probably doesn't happen very often), + // but it makes reading much more memory efficient. + // see test_data/small.json.gz for an email encoding + static bool ReadFromJSON(std::istream* in, Hypergraph* out); + + // if remove_rules is used, the hypergraph is serialized without rule information + // (so it only contains structure and feature information) + static bool WriteToJSON(const Hypergraph& hg, bool remove_rules, std::ostream* out); + + // serialization utils + static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0); + // return PLF string representation (undefined behavior on non-lattices) + static std::string AsPLF(const Hypergraph& hg, bool include_global_parentheses = true); + static void PLFtoLattice(const std::string& plf, Lattice* pl); + static std::string Escape(const std::string& s); // PLF helper +}; + +namespace B64 { + bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); + void b64encode(const char* data, const size_t size, std::ostream* out); +} + +#endif diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc new file mode 100644 index 00000000..ecd97508 --- /dev/null +++ b/decoder/hg_test.cc @@ -0,0 +1,441 @@ +#include +#include +#include +#include 
+#include +#include "tdict.h" + +#include "json_parse.h" +#include "filelib.h" +#include "hg.h" +#include "hg_io.h" +#include "hg_intersect.h" +#include "viterbi.h" +#include "kbest.h" +#include "inside_outside.h" + +using namespace std; + +class HGTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } + void CreateHG(Hypergraph* hg) const; + void CreateHG_int(Hypergraph* hg) const; + void CreateHG_tiny(Hypergraph* hg) const; + void CreateHGBalanced(Hypergraph* hg) const; + void CreateLatticeHG(Hypergraph* hg) const; + void CreateTinyLatticeHG(Hypergraph* hg) const; +}; + +void HGTest::CreateTinyLatticeHG(Hypergraph* hg) const { + const string json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] b\",4,\"[X] ||| [1] B'\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.2],\"rule\":1},{\"tail\":[0],\"feats\":[0,-0.6],\"rule\":2}],\"node\":{\"in_edges\":[0,1]},\"edges\":[{\"tail\":[1],\"feats\":[0,-0.1],\"rule\":3},{\"tail\":[1],\"feats\":[0,-0.9],\"rule\":4}],\"node\":{\"in_edges\":[2,3]}}"; + istringstream instr(json); + EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateLatticeHG(Hypergraph* hg) const { + const string json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] A A\",4,\"[X] ||| [1] b\",5,\"[X] ||| [1] c\",6,\"[X] ||| [1] B C\",7,\"[X] ||| [1] A B C\",8,\"[X] ||| [1] 
CC\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[2,-0.3],\"rule\":1},{\"tail\":[0],\"feats\":[2,-0.6],\"rule\":2},{\"tail\":[0],\"feats\":[2,-1.7],\"rule\":3}],\"node\":{\"in_edges\":[0,1,2]},\"edges\":[{\"tail\":[1],\"feats\":[2,-0.5],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[2],\"feats\":[2,-0.6],\"rule\":5},{\"tail\":[1],\"feats\":[2,-0.8],\"rule\":6},{\"tail\":[0],\"feats\":[2,-0.01],\"rule\":7},{\"tail\":[2],\"feats\":[2,-0.8],\"rule\":8}],\"node\":{\"in_edges\":[4,5,6,7]}}"; + istringstream instr(json); + EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHG_tiny(Hypergraph* hg) const { + const string json = "{\"rules\":[1,\"[X] ||| \",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}"; + istringstream instr(json); + EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHG_int(Hypergraph* hg) const { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] 
b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}"; + istringstream instr(json); + EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHG(Hypergraph* hg) const { + string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + istringstream instr(json); + EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +void HGTest::CreateHGBalanced(Hypergraph* hg) const { + const string json = "{\"rules\":[1,\"[X] ||| i\",2,\"[X] ||| a\",3,\"[X] ||| b\",4,\"[X] ||| [1] [2]\",5,\"[X] ||| [1] [2]\",6,\"[X] ||| c\",7,\"[X] ||| d\",8,\"[X] ||| [1] [2]\",9,\"[X] ||| [1] [2]\",10,\"[X] ||| [1] [2]\",11,\"[X] ||| [1] [2]\",12,\"[X] ||| [1] [2]\",13,\"[X] 
||| [1] [2]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[1,2],\"feats\":[],\"rule\":4},{\"tail\":[2,1],\"feats\":[],\"rule\":5}],\"node\":{\"in_edges\":[3,4]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":6}],\"node\":{\"in_edges\":[5]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":7}],\"node\":{\"in_edges\":[6]},\"edges\":[{\"tail\":[4,5],\"feats\":[],\"rule\":8},{\"tail\":[5,4],\"feats\":[],\"rule\":9}],\"node\":{\"in_edges\":[7,8]},\"edges\":[{\"tail\":[3,6],\"feats\":[],\"rule\":10},{\"tail\":[6,3],\"feats\":[],\"rule\":11}],\"node\":{\"in_edges\":[9,10]},\"edges\":[{\"tail\":[7,0],\"feats\":[],\"rule\":12},{\"tail\":[0,7],\"feats\":[],\"rule\":13}],\"node\":{\"in_edges\":[11,12]}}"; + istringstream instr(json); + EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); +} + +TEST_F(HGTest,Controlled) { + Hypergraph hg; + CreateHG_tiny(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 0.8); + hg.Reweight(wts); + vector trans; + prob_t prob = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "prob: " << prob << "\n"; + EXPECT_FLOAT_EQ(-80.839996, log(prob)); + EXPECT_EQ("X ", TD::GetString(trans)); + vector post; + hg.PrintGraphviz(); + prob_t c2 = Inside(hg, NULL, ScaledEdgeProb(0.6)); + EXPECT_FLOAT_EQ(-47.8577, log(c2)); +} + +TEST_F(HGTest,Union) { + Hypergraph hg1; + Hypergraph hg2; + CreateHG_tiny(&hg1); + CreateHG(&hg2); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg1.Reweight(wts); + hg2.Reweight(wts); + prob_t c1,c2,c3,c4; + vector t1,t2,t3,t4; + c1 = 
ViterbiESentence(hg1, &t1); + c2 = ViterbiESentence(hg2, &t2); + int l2 = ViterbiPathLength(hg2); + cerr << c1 << "\t" << TD::GetString(t1) << endl; + cerr << c2 << "\t" << TD::GetString(t2) << endl; + hg1.Union(hg2); + hg1.Reweight(wts); + c3 = ViterbiESentence(hg1, &t3); + int l3 = ViterbiPathLength(hg1); + cerr << c3 << "\t" << TD::GetString(t3) << endl; + EXPECT_FLOAT_EQ(c2, c3); + EXPECT_EQ(TD::GetString(t2), TD::GetString(t3)); + EXPECT_EQ(l2, l3); + + wts.set_value(FD::Convert("f2"), -1); + hg1.Reweight(wts); + c4 = ViterbiESentence(hg1, &t4); + cerr << c4 << "\t" << TD::GetString(t4) << endl; + EXPECT_EQ("Z ", TD::GetString(t4)); + EXPECT_FLOAT_EQ(98.82, log(c4)); + + vector, prob_t> > list; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg1, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg1.nodes_.size() - 1, i); + if (!d) break; + list.push_back(make_pair(d->yield, d->score)); + } + EXPECT_TRUE(list[0].first == t4); + EXPECT_FLOAT_EQ(log(list[0].second), log(c4)); + EXPECT_EQ(list.size(), 6); + EXPECT_FLOAT_EQ(log(list.back().second / list.front().second), -97.7); +} + +TEST_F(HGTest,ControlledKBest) { + Hypergraph hg; + CreateHG(&hg); + vector w(2); w[0]=0.4; w[1]=0.8; + hg.Reweight(w); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + + int best = 0; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << TD::GetString(d->yield) << endl; + ++best; + } + EXPECT_EQ(4, best); +} + + +TEST_F(HGTest,InsideScore) { + SparseVector wts; + wts.set_value(FD::Convert("f1"), 1.0); + Hypergraph hg; + CreateTinyLatticeHG(&hg); + hg.Reweight(wts); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr 
<< TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + hg.PrintGraphviz(); + prob_t inside = Inside(hg); + EXPECT_FLOAT_EQ(1.7934048, inside); // computed by hand + vector post; + inside = hg.ComputeBestPathThroughEdges(&post); + EXPECT_FLOAT_EQ(-0.3, log(inside)); // computed by hand + EXPECT_EQ(post.size(), 4); + for (int i = 0; i < 4; ++i) { + cerr << "edge post: " << log(post[i]) << '\t' << hg.edges_[i].rule_->AsString() << endl; + } +} + + +TEST_F(HGTest,PruneInsideOutside) { + SparseVector wts; + wts.set_value(FD::Convert("Feature_1"), 1.0); + Hypergraph hg; + CreateLatticeHG(&hg); + hg.Reweight(wts); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + hg.PrintGraphviz(); + //hg.DensityPruneInsideOutside(0.5, false, 2.0); + hg.BeamPruneInsideOutside(0.5, false, 0.5); + cost = ViterbiESentence(hg, &trans); + cerr << "Ncst: " << cost << endl; + cerr << TD::GetString(trans) << "\n"; + hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestPruneEdges) { + Hypergraph hg; + CreateLatticeHG(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 1.0); + hg.Reweight(wts); + hg.PrintGraphviz(); + vector prune(hg.edges_.size(), true); + prune[6] = false; + hg.PruneEdges(prune); + cerr << "Pruned:\n"; + hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestIntersect) { + Hypergraph hg; + CreateHG_int(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 1.0); + hg.Reweight(wts); + hg.PrintGraphviz(); + + int best = 0; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << TD::GetString(d->yield) << endl; + ++best; + } + EXPECT_EQ(4, best); + + Lattice target(2); + target[0].push_back(LatticeArc(TD::Convert("a"), 0.0, 1)); + target[1].push_back(LatticeArc(TD::Convert("b"), 0.0, 1)); + 
HG::Intersect(target, &hg); + hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestPrune2) { + Hypergraph hg; + CreateHG_int(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 1.0); + hg.Reweight(wts); + hg.PrintGraphviz(); + vector rem(hg.edges_.size(), false); + rem[0] = true; + rem[1] = true; + hg.PruneEdges(rem); + hg.PrintGraphviz(); + cerr << "TODO: fix this pruning behavior-- the resulting HG should be empty!\n"; +} + +TEST_F(HGTest,Sample) { + Hypergraph hg; + CreateLatticeHG(&hg); + SparseVector wts; + wts.set_value(FD::Convert("Feature_1"), 0.0); + hg.Reweight(wts); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + hg.PrintGraphviz(); +} + +TEST_F(HGTest,PLF) { + Hypergraph hg; + string inplf = "((('haupt',-2.06655,1),('hauptgrund',-5.71033,2),),(('grund',-1.78709,1),),(('für\\'',0.1,1),),)"; + HypergraphIO::ReadFromPLF(inplf, &hg); + SparseVector wts; + wts.set_value(FD::Convert("Feature_0"), 1.0); + hg.Reweight(wts); + hg.PrintGraphviz(); + string outplf = HypergraphIO::AsPLF(hg); + cerr << " IN: " << inplf << endl; + cerr << "OUT: " << outplf << endl; + assert(inplf == outplf); +} + +TEST_F(HGTest,PushWeightsToGoal) { + Hypergraph hg; + CreateHG(&hg); + vector w(2); w[0]=0.4; w[1]=0.8; + hg.Reweight(w); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + hg.PrintGraphviz(); + hg.PushWeightsToGoal(); + hg.PrintGraphviz(); +} + +TEST_F(HGTest,TestSpecialKBest) { + Hypergraph hg; + CreateHGBalanced(&hg); + vector w(1); w[0]=0; + hg.Reweight(w); + vector, prob_t> > list; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 100000); + for (int i = 0; i < 100000; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << TD::GetString(d->yield) << endl; + } + hg.PrintGraphviz(); +} + 
+TEST_F(HGTest, TestGenericViterbi) { + Hypergraph hg; + CreateHG_tiny(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 0.8); + hg.Reweight(wts); + vector trans; + const prob_t prob = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "prob: " << prob << "\n"; + EXPECT_FLOAT_EQ(-80.839996, log(prob)); + EXPECT_EQ("X ", TD::GetString(trans)); +} + +TEST_F(HGTest, TestGenericInside) { + Hypergraph hg; + CreateTinyLatticeHG(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 1.0); + hg.Reweight(wts); + vector inside; + prob_t ins = Inside(hg, &inside); + EXPECT_FLOAT_EQ(1.7934048, ins); // computed by hand + vector outside; + Outside(hg, inside, &outside); + EXPECT_EQ(3, outside.size()); + EXPECT_FLOAT_EQ(1.7934048, outside[0]); + EXPECT_FLOAT_EQ(1.3114071, outside[1]); + EXPECT_FLOAT_EQ(1.0, outside[2]); +} + +TEST_F(HGTest,TestGenericInside2) { + Hypergraph hg; + CreateHG(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 0.8); + hg.Reweight(wts); + vector inside, outside; + prob_t ins = Inside(hg, &inside); + Outside(hg, inside, &outside); + for (int i = 0; i < hg.nodes_.size(); ++i) + cerr << i << "\t" << log(inside[i]) << "\t" << log(outside[i]) << endl; + EXPECT_FLOAT_EQ(0, log(inside[0])); + EXPECT_FLOAT_EQ(-1.7861683, log(outside[0])); + EXPECT_FLOAT_EQ(-0.4, log(inside[1])); + EXPECT_FLOAT_EQ(-1.3861683, log(outside[1])); + EXPECT_FLOAT_EQ(-0.8, log(inside[2])); + EXPECT_FLOAT_EQ(-0.986168, log(outside[2])); + EXPECT_FLOAT_EQ(-0.96, log(inside[3])); + EXPECT_FLOAT_EQ(-0.8261683, log(outside[3])); + EXPECT_FLOAT_EQ(-1.562512, log(inside[4])); + EXPECT_FLOAT_EQ(-0.22365622, log(outside[4])); + EXPECT_FLOAT_EQ(-1.7861683, log(inside[5])); + EXPECT_FLOAT_EQ(0, log(outside[5])); +} + +TEST_F(HGTest,TestAddExpectations) { + Hypergraph hg; + CreateHG(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + 
wts.set_value(FD::Convert("f2"), 0.8); + hg.Reweight(wts); + SparseVector feat_exps; + InsideOutside, EdgeFeaturesWeightFunction>(hg, &feat_exps); + EXPECT_FLOAT_EQ(-2.5439765, feat_exps[FD::Convert("f1")]); + EXPECT_FLOAT_EQ(-2.6357865, feat_exps[FD::Convert("f2")]); + cerr << feat_exps << endl; + SparseVector posts; + InsideOutside, TransitionEventWeightFunction>(hg, &posts); +} + +TEST_F(HGTest, Small) { + ReadFile rf("test_data/small.json.gz"); + Hypergraph hg; + assert(HypergraphIO::ReadFromJSON(rf.stream(), &hg)); + SparseVector wts; + wts.set_value(FD::Convert("Model_0"), -2.0); + wts.set_value(FD::Convert("Model_1"), -0.5); + wts.set_value(FD::Convert("Model_2"), -1.1); + wts.set_value(FD::Convert("Model_3"), -1.0); + wts.set_value(FD::Convert("Model_4"), -1.0); + wts.set_value(FD::Convert("Model_5"), 0.5); + wts.set_value(FD::Convert("Model_6"), 0.2); + wts.set_value(FD::Convert("Model_7"), -3.0); + hg.Reweight(wts); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + vector post; + prob_t c2 = Inside(hg, NULL, ScaledEdgeProb(0.6)); + EXPECT_FLOAT_EQ(2.1431036, log(c2)); +} + +TEST_F(HGTest, JSONTest) { + ostringstream os; + JSONParser::WriteEscapedString("\"I don't know\", she said.", &os); + EXPECT_EQ("\"\\\"I don't know\\\", she said.\"", os.str()); + ostringstream os2; + JSONParser::WriteEscapedString("yes", &os2); + EXPECT_EQ("\"yes\"", os2.str()); +} + +TEST_F(HGTest, TestGenericKBest) { + Hypergraph hg; + CreateHG(&hg); + //CreateHGBalanced(&hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector trans; + prob_t cost = ViterbiESentence(hg, &trans); + cerr << TD::GetString(trans) << "\n"; + cerr << "cost: " << cost << "\n"; + + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 1000); + for (int i = 0; i < 1000; ++i) { + const KBest::KBestDerivations, 
ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << TD::GetString(d->yield) << " F:" << d->feature_values << endl; + } +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/decoder/inside_outside.h b/decoder/inside_outside.h new file mode 100644 index 00000000..9114c9d7 --- /dev/null +++ b/decoder/inside_outside.h @@ -0,0 +1,111 @@ +#ifndef _INSIDE_H_ +#define _INSIDE_H_ + +#include +#include +#include "hg.h" + +// run the inside algorithm and return the inside score +// if result is non-NULL, result will contain the inside +// score for each node +// NOTE: WeightType(0) must construct the semiring's additive identity +// WeightType(1) must construct the semiring's multiplicative identity +template +WeightType Inside(const Hypergraph& hg, + std::vector* result = NULL, + const WeightFunction& weight = WeightFunction()) { + const int num_nodes = hg.nodes_.size(); + std::vector dummy; + std::vector& inside_score = result ? 
*result : dummy; + inside_score.resize(num_nodes); + std::fill(inside_score.begin(), inside_score.end(), WeightType()); + for (int i = 0; i < num_nodes; ++i) { + const Hypergraph::Node& cur_node = hg.nodes_[i]; + WeightType* const cur_node_inside_score = &inside_score[i]; + const int num_in_edges = cur_node.in_edges_.size(); + if (num_in_edges == 0) { + *cur_node_inside_score = WeightType(1); + continue; + } + for (int j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; + WeightType score = weight(edge); + for (int k = 0; k < edge.tail_nodes_.size(); ++k) { + const int tail_node_index = edge.tail_nodes_[k]; + score *= inside_score[tail_node_index]; + } + *cur_node_inside_score += score; + } + } + return inside_score.back(); +} + +template +void Outside(const Hypergraph& hg, + std::vector& inside_score, + std::vector* result, + const WeightFunction& weight = WeightFunction()) { + assert(result); + const int num_nodes = hg.nodes_.size(); + assert(inside_score.size() == num_nodes); + std::vector& outside_score = *result; + outside_score.resize(num_nodes); + std::fill(outside_score.begin(), outside_score.end(), WeightType(0)); + outside_score.back() = WeightType(1); + for (int i = num_nodes - 1; i >= 0; --i) { + const Hypergraph::Node& cur_node = hg.nodes_[i]; + const WeightType& head_node_outside_score = outside_score[i]; + const int num_in_edges = cur_node.in_edges_.size(); + for (int j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; + const WeightType head_and_edge_weight = weight(edge) * head_node_outside_score; + const int num_tail_nodes = edge.tail_nodes_.size(); + for (int k = 0; k < num_tail_nodes; ++k) { + const int update_tail_node_index = edge.tail_nodes_[k]; + WeightType* const tail_outside_score = &outside_score[update_tail_node_index]; + WeightType inside_contribution = WeightType(1); + for (int l = 0; l < num_tail_nodes; ++l) { + const int 
other_tail_node_index = edge.tail_nodes_[l]; + if (update_tail_node_index != other_tail_node_index) + inside_contribution *= inside_score[other_tail_node_index]; + } + *tail_outside_score += head_and_edge_weight * inside_contribution; + } + } + } +} + +// this is the Inside-Outside optimization described in Li et al. (EMNLP 2009) +// for computing the inside algorithm over expensive semirings +// (such as expectations over features). See Figure 4. It is slightly different +// in that x/k is returned not (k,x) +// NOTE: RType * PType must be valid (and yield RType) +template +PType InsideOutside(const Hypergraph& hg, + RType* result_x, + const WeightFunction& weight1 = WeightFunction(), + const WeightFunction2& weight2 = WeightFunction2()) { + const int num_nodes = hg.nodes_.size(); + std::vector inside, outside; + const PType z = Inside(hg, &inside, weight1); + Outside(hg, inside, &outside, weight1); + RType& x = *result_x; + x = RType(); + for (int i = 0; i < num_nodes; ++i) { + const Hypergraph::Node& cur_node = hg.nodes_[i]; + const int num_in_edges = cur_node.in_edges_.size(); + for (int j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; + PType prob = outside[i]; + prob *= weight1(edge); + const int num_tail_nodes = edge.tail_nodes_.size(); + for (int k = 0; k < num_tail_nodes; ++k) + prob *= inside[edge.tail_nodes_[k]]; + prob /= z; + x += weight2(edge) * prob; + } + } + return z; +} + +#endif diff --git a/decoder/json_parse.cc b/decoder/json_parse.cc new file mode 100644 index 00000000..f6fdfea8 --- /dev/null +++ b/decoder/json_parse.cc @@ -0,0 +1,50 @@ +#include "json_parse.h" + +#include +#include + +using namespace std; + +static const char *json_hex_chars = "0123456789abcdef"; + +void JSONParser::WriteEscapedString(const string& in, ostream* out) { + int pos = 0; + int start_offset = 0; + unsigned char c = 0; + (*out) << '"'; + while(pos < in.size()) { + c = in[pos]; + switch(c) { + case '\b': + case '\n': 
+ case '\r': + case '\t': + case '"': + case '\\': + case '/': + if(pos - start_offset > 0) + (*out) << in.substr(start_offset, pos - start_offset); + if(c == '\b') (*out) << "\\b"; + else if(c == '\n') (*out) << "\\n"; + else if(c == '\r') (*out) << "\\r"; + else if(c == '\t') (*out) << "\\t"; + else if(c == '"') (*out) << "\\\""; + else if(c == '\\') (*out) << "\\\\"; + else if(c == '/') (*out) << "\\/"; + start_offset = ++pos; + break; + default: + if(c < ' ') { + cerr << "Warning, bad character (" << static_cast(c) << ") in string\n"; + if(pos - start_offset > 0) + (*out) << in.substr(start_offset, pos - start_offset); + (*out) << "\\u00" << json_hex_chars[c >> 4] << json_hex_chars[c & 0xf]; + start_offset = ++pos; + } else pos++; + } + } + if(pos - start_offset > 0) + (*out) << in.substr(start_offset, pos - start_offset); + (*out) << '"'; +} + diff --git a/decoder/json_parse.h b/decoder/json_parse.h new file mode 100644 index 00000000..c3cba954 --- /dev/null +++ b/decoder/json_parse.h @@ -0,0 +1,58 @@ +#ifndef _JSON_WRAPPER_H_ +#define _JSON_WRAPPER_H_ + +#include +#include +#include "JSON_parser.h" + +class JSONParser { + public: + JSONParser() { + init_JSON_config(&config); + hack.mf = &JSONParser::Callback; + config.depth = 10; + config.callback_ctx = reinterpret_cast(this); + config.callback = hack.cb; + config.allow_comments = 1; + config.handle_floats_manually = 1; + jc = new_JSON_parser(&config); + } + virtual ~JSONParser() { + delete_JSON_parser(jc); + } + bool Parse(std::istream* in) { + int count = 0; + int lc = 1; + for (; in ; ++count) { + int next_char = in->get(); + if (!in->good()) break; + if (lc == '\n') { ++lc; } + if (!JSON_parser_char(jc, next_char)) { + std::cerr << "JSON_parser_char: syntax error, line " << lc << " (byte " << count << ")" << std::endl; + return false; + } + } + if (!JSON_parser_done(jc)) { + std::cerr << "JSON_parser_done: syntax error\n"; + return false; + } + return true; + } + static void WriteEscapedString(const 
std::string& in, std::ostream* out); + protected: + virtual bool HandleJSONEvent(int type, const JSON_value* value) = 0; + private: + int Callback(int type, const JSON_value* value) { + if (HandleJSONEvent(type, value)) return 1; + return 0; + } + JSON_parser_struct* jc; + JSON_config config; + typedef int (JSONParser::* MF)(int type, const struct JSON_value_struct* value); + union CBHack { + JSON_parser_callback cb; + MF mf; + } hack; +}; + +#endif diff --git a/decoder/kbest.h b/decoder/kbest.h new file mode 100644 index 00000000..cd9b6c2b --- /dev/null +++ b/decoder/kbest.h @@ -0,0 +1,207 @@ +#ifndef _HG_KBEST_H_ +#define _HG_KBEST_H_ + +#include +#include +#include + +#include + +#include "wordid.h" +#include "hg.h" + +namespace KBest { + // default, don't filter any derivations from the k-best list + struct NoFilter { + bool operator()(const std::vector& yield) { + (void) yield; + return false; + } + }; + + // optional, filter unique yield strings + struct FilterUnique { + std::tr1::unordered_set, boost::hash > > unique; + + bool operator()(const std::vector& yield) { + return !unique.insert(yield).second; + } + }; + + // utility class to lazily create the k-best derivations from a forest, uses + // the lazy k-best algorithm (Algorithm 3) from Huang and Chiang (IWPT 2005) + template + struct KBestDerivations { + KBestDerivations(const Hypergraph& hg, + const size_t k, + const Traversal& tf = Traversal(), + const WeightFunction& wf = WeightFunction()) : + traverse(tf), w(wf), g(hg), nds(g.nodes_.size()), k_prime(k) {} + + ~KBestDerivations() { + for (int i = 0; i < freelist.size(); ++i) + delete freelist[i]; + } + + struct Derivation { + Derivation(const Hypergraph::Edge& e, + const SmallVector& jv, + const WeightType& w, + const SparseVector& f) : + edge(&e), + j(jv), + score(w), + feature_values(f) {} + + // dummy constructor, just for query + Derivation(const Hypergraph::Edge& e, + const SmallVector& jv) : edge(&e), j(jv) {} + + T yield; + const 
Hypergraph::Edge* const edge; + const SmallVector j; + const WeightType score; + const SparseVector feature_values; + }; + struct HeapCompare { + bool operator()(const Derivation* a, const Derivation* b) const { + return a->score < b->score; + } + }; + struct DerivationCompare { + bool operator()(const Derivation* a, const Derivation* b) const { + return a->score > b->score; + } + }; + struct DerivationUniquenessHash { + size_t operator()(const Derivation* d) const { + size_t x = 5381; + x = ((x << 5) + x) ^ d->edge->id_; + for (int i = 0; i < d->j.size(); ++i) + x = ((x << 5) + x) ^ d->j[i]; + return x; + } + }; + struct DerivationUniquenessEquals { + bool operator()(const Derivation* a, const Derivation* b) const { + return (a->edge == b->edge) && (a->j == b->j); + } + }; + typedef std::vector CandidateHeap; + typedef std::vector DerivationList; + typedef std::tr1::unordered_set< + const Derivation*, DerivationUniquenessHash, DerivationUniquenessEquals> UniqueDerivationSet; + + struct NodeDerivationState { + CandidateHeap cand; + DerivationList D; + DerivationFilter filter; + UniqueDerivationSet ds; + explicit NodeDerivationState(const DerivationFilter& f = DerivationFilter()) : filter(f) {} + }; + + Derivation* LazyKthBest(int v, int k) { + NodeDerivationState& s = GetCandidates(v); + CandidateHeap& cand = s.cand; + DerivationList& D = s.D; + DerivationFilter& filter = s.filter; + bool add_next = true; + while (D.size() <= k) { + if (add_next && D.size() > 0) { + const Derivation* d = D.back(); + LazyNext(d, &cand, &s.ds); + } + add_next = false; + + if (cand.size() > 0) { + std::pop_heap(cand.begin(), cand.end(), HeapCompare()); + Derivation* d = cand.back(); + cand.pop_back(); + std::vector ants(d->edge->Arity()); + for (int j = 0; j < ants.size(); ++j) + ants[j] = &LazyKthBest(d->edge->tail_nodes_[j], d->j[j])->yield; + traverse(*d->edge, ants, &d->yield); + if (!filter(d->yield)) { + D.push_back(d); + add_next = true; + } + } else { + break; + } + } + if (k 
< D.size()) return D[k]; else return NULL; + } + + private: + // creates a derivation object with all fields set but the yield + // the yield is computed in LazyKthBest before the derivation is added to D + // returns NULL if j refers to derivation numbers larger than the + // antecedent structure define + Derivation* CreateDerivation(const Hypergraph::Edge& e, const SmallVector& j) { + WeightType score = w(e); + SparseVector feats = e.feature_values_; + for (int i = 0; i < e.Arity(); ++i) { + const Derivation* ant = LazyKthBest(e.tail_nodes_[i], j[i]); + if (!ant) { return NULL; } + score *= ant->score; + feats += ant->feature_values; + } + freelist.push_back(new Derivation(e, j, score, feats)); + return freelist.back(); + } + + NodeDerivationState& GetCandidates(int v) { + NodeDerivationState& s = nds[v]; + if (!s.D.empty() || !s.cand.empty()) return s; + + const Hypergraph::Node& node = g.nodes_[v]; + for (int i = 0; i < node.in_edges_.size(); ++i) { + const Hypergraph::Edge& edge = g.edges_[node.in_edges_[i]]; + SmallVector jv(edge.Arity(), 0); + Derivation* d = CreateDerivation(edge, jv); + assert(d); + s.cand.push_back(d); + } + + const int effective_k = std::min(k_prime, s.cand.size()); + const typename CandidateHeap::iterator kth = s.cand.begin() + effective_k; + std::nth_element(s.cand.begin(), kth, s.cand.end(), DerivationCompare()); + s.cand.resize(effective_k); + std::make_heap(s.cand.begin(), s.cand.end(), HeapCompare()); + + return s; + } + + void LazyNext(const Derivation* d, CandidateHeap* cand, UniqueDerivationSet* ds) { + for (int i = 0; i < d->j.size(); ++i) { + SmallVector j = d->j; + ++j[i]; + const Derivation* ant = LazyKthBest(d->edge->tail_nodes_[i], j[i]); + if (ant) { + Derivation query_unique(*d->edge, j); + if (ds->count(&query_unique) == 0) { + Derivation* new_d = CreateDerivation(*d->edge, j); + if (new_d) { + cand->push_back(new_d); + std::push_heap(cand->begin(), cand->end(), HeapCompare()); + assert(ds->insert(new_d).second); // 
insert into uniqueness set, sanity check + } + } + } + } + } + + const Traversal traverse; + const WeightFunction w; + const Hypergraph& g; + std::vector nds; + std::vector freelist; + const size_t k_prime; + }; +} + +#endif diff --git a/decoder/lattice.cc b/decoder/lattice.cc new file mode 100644 index 00000000..56bc9551 --- /dev/null +++ b/decoder/lattice.cc @@ -0,0 +1,61 @@ +#include "lattice.h" + +#include "tdict.h" +#include "hg_io.h" + +using namespace std; + +static const int kUNREACHABLE = 99999999; + +void Lattice::ComputeDistances() { + const int n = this->size() + 1; + dist_.resize(n, n, kUNREACHABLE); + for (int i = 0; i < this->size(); ++i) { + const vector& alts = (*this)[i]; + for (int j = 0; j < alts.size(); ++j) + dist_(i, i + alts[j].dist2next) = 1; + } + for (int k = 0; k < n; ++k) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + const int dp = dist_(i,k) + dist_(k,j); + if (dist_(i,j) > dp) + dist_(i,j) = dp; + } + } + } + + for (int i = 0; i < n; ++i) { + int latest = kUNREACHABLE; + for (int j = n-1; j >= 0; --j) { + const int c = dist_(i,j); + if (c < kUNREACHABLE) + latest = c; + else + dist_(i,j) = latest; + } + } + // cerr << dist_ << endl; +} + +bool LatticeTools::LooksLikePLF(const string &line) { + return (line.size() > 5) && (line.substr(0,4) == "((('"); +} + +void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) { + Lattice& l = *pl; + vector ids; + TD::ConvertSentence(text, &ids); + l.resize(ids.size()); + for (int i = 0; i < l.size(); ++i) + l[i].push_back(LatticeArc(ids[i], 0.0, 1)); +} + +void LatticeTools::ConvertTextOrPLF(const string& text_or_plf, Lattice* pl) { + if (LooksLikePLF(text_or_plf)) + HypergraphIO::PLFtoLattice(text_or_plf, pl); + else + ConvertTextToLattice(text_or_plf, pl); + pl->ComputeDistances(); +} + diff --git a/decoder/lattice.h b/decoder/lattice.h new file mode 100644 index 00000000..71589b92 --- /dev/null +++ b/decoder/lattice.h @@ -0,0 +1,41 @@ +#ifndef __LATTICE_H_ 
+#define __LATTICE_H_ + +#include +#include +#include "wordid.h" +#include "array2d.h" + +class Lattice; +struct LatticeTools { + static bool LooksLikePLF(const std::string &line); + static void ConvertTextToLattice(const std::string& text, Lattice* pl); + static void ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl); +}; + +struct LatticeArc { + WordID label; + double cost; + int dist2next; + LatticeArc() : label(), cost(), dist2next() {} + LatticeArc(WordID w, double c, int i) : label(w), cost(c), dist2next(i) {} +}; + +class Lattice : public std::vector > { + friend void LatticeTools::ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl); + public: + Lattice() {} + explicit Lattice(size_t t, const std::vector& v = std::vector()) : + std::vector >(t, v) {} + int Distance(int from, int to) const { + if (dist_.empty()) + return (to - from); + return dist_(from, to); + } + + private: + void ComputeDistances(); + Array2D dist_; +}; + +#endif diff --git a/decoder/lexcrf.cc b/decoder/lexcrf.cc new file mode 100644 index 00000000..33455a3d --- /dev/null +++ b/decoder/lexcrf.cc @@ -0,0 +1,112 @@ +#include "lexcrf.h" + +#include + +#include "filelib.h" +#include "hg.h" +#include "tdict.h" +#include "grammar.h" +#include "sentence_metadata.h" + +using namespace std; + +struct LexicalCRFImpl { + LexicalCRFImpl(const boost::program_options::variables_map& conf) : + use_null(false), + kXCAT(TD::Convert("X")*-1), + kNULL(TD::Convert("")), + kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")), + kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) { + vector gfiles = conf["grammar"].as >(); + assert(gfiles.size() == 1); + ReadFile rf(gfiles.front()); + TextGrammar *tg = new TextGrammar; + grammar.reset(tg); + istream* in = rf.stream(); + int lc = 0; + bool flag = false; + while(*in) { + string line; + getline(*in, line); + if (line.empty()) continue; + ++lc; + TRulePtr r(TRule::CreateRulePhrasetable(line)); + tg->AddRule(r); + if (lc % 50000 == 0) { cerr << 
'.'; flag = true; } + if (lc % 2000000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + } + if (flag) cerr << endl; + cerr << "Loaded " << lc << " rules\n"; + } + + void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) { + const int e_len = smeta.GetTargetLength(); + assert(e_len > 0); + const int f_len = lattice.size(); + // hack to tell the feature function system how big the sentence pair is + const int f_start = (use_null ? -1 : 0); + int prev_node_id = -1; + for (int i = 0; i < e_len; ++i) { // for each word in the *ref* + Hypergraph::Node* node = forest->AddNode(kXCAT); + const int new_node_id = node->id_; + for (int j = f_start; j < f_len; ++j) { // for each word in the source + const WordID src_sym = (j < 0 ? kNULL : lattice[j][0].label); + const GrammarIter* gi = grammar->GetRoot()->Extend(src_sym); + if (!gi) { + cerr << "No translations found for: " << TD::Convert(src_sym) << "\n"; + abort(); + } + const RuleBin* rb = gi->GetRules(); + assert(rb); + for (int k = 0; k < rb->GetNumRules(); ++k) { + TRulePtr rule = rb->GetIthRule(k); + Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); + edge->i_ = j; + edge->j_ = j+1; + edge->prev_i_ = i; + edge->prev_j_ = i+1; + edge->feature_values_ += edge->rule_->GetFeatureValues(); + forest->ConnectEdgeToHeadNode(edge->id_, new_node_id); + } + } + if (prev_node_id >= 0) { + const int comb_node_id = forest->AddNode(kXCAT)->id_; + Hypergraph::TailNodeVector tail(2, prev_node_id); + tail[1] = new_node_id; + const int edge_id = forest->AddEdge(kBINARY, tail)->id_; + forest->ConnectEdgeToHeadNode(edge_id, comb_node_id); + prev_node_id = comb_node_id; + } else { + prev_node_id = new_node_id; + } + } + Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); + Hypergraph::Node* goal = forest->AddNode(TD::Convert("[Goal]")*-1); + Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); + forest->ConnectEdgeToHeadNode(hg_edge, goal); + } + + 
private: + const bool use_null; + const WordID kXCAT; + const WordID kNULL; + const TRulePtr kBINARY; + const TRulePtr kGOAL_RULE; + GrammarPtr grammar; +}; + +LexicalCRF::LexicalCRF(const boost::program_options::variables_map& conf) : + pimpl_(new LexicalCRFImpl(conf)) {} + +bool LexicalCRF::Translate(const string& input, + SentenceMetadata* smeta, + const vector& weights, + Hypergraph* forest) { + Lattice lattice; + LatticeTools::ConvertTextToLattice(input, &lattice); + smeta->SetSourceLength(lattice.size()); + pimpl_->BuildTrellis(lattice, *smeta, forest); + forest->Reweight(weights); + return true; +} + diff --git a/decoder/lexcrf.h b/decoder/lexcrf.h new file mode 100644 index 00000000..99362c81 --- /dev/null +++ b/decoder/lexcrf.h @@ -0,0 +1,18 @@ +#ifndef _LEXCRF_H_ +#define _LEXCRF_H_ + +#include "translator.h" +#include "lattice.h" + +struct LexicalCRFImpl; +struct LexicalCRF : public Translator { + LexicalCRF(const boost::program_options::variables_map& conf); + bool Translate(const std::string& input, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* forest); + private: + boost::shared_ptr pimpl_; +}; + +#endif diff --git a/decoder/logval.h b/decoder/logval.h new file mode 100644 index 00000000..a8ca620c --- /dev/null +++ b/decoder/logval.h @@ -0,0 +1,136 @@ +#ifndef LOGVAL_H_ +#define LOGVAL_H_ + +#include +#include + +template +class LogVal { + public: + LogVal() : v_(-std::numeric_limits::infinity()) {} + explicit LogVal(double x) : v_(std::log(x)) {} + LogVal(const LogVal& o) : v_(o.v_) {} + static LogVal One() { return LogVal(1); } + static LogVal Zero() { return LogVal(); } + + void logeq(const T& v) { v_ = v; } + + LogVal& operator+=(const LogVal& a) { + if (a.v_ == -std::numeric_limits::infinity()) return *this; + if (a.v_ < v_) { + v_ = v_ + log1p(std::exp(a.v_ - v_)); + } else { + v_ = a.v_ + log1p(std::exp(v_ - a.v_)); + } + return *this; + } + + LogVal& operator*=(const LogVal& a) { + v_ += a.v_; + return *this; + } + + 
LogVal& operator*=(const T& a) { + v_ += log(a); + return *this; + } + + LogVal& operator/=(const LogVal& a) { + v_ -= a.v_; + return *this; + } + + LogVal& poweq(const T& power) { + if (power == 0) v_ = 0; else v_ *= power; + return *this; + } + + LogVal pow(const T& power) const { + LogVal res = *this; + res.poweq(power); + return res; + } + + operator T() const { + return std::exp(v_); + } + + T v_; +}; + +template +LogVal operator+(const LogVal& o1, const LogVal& o2) { + LogVal res(o1); + res += o2; + return res; +} + +template +LogVal operator*(const LogVal& o1, const LogVal& o2) { + LogVal res(o1); + res *= o2; + return res; +} + +template +LogVal operator*(const LogVal& o1, const T& o2) { + LogVal res(o1); + res *= o2; + return res; +} + +template +LogVal operator*(const T& o1, const LogVal& o2) { + LogVal res(o2); + res *= o1; + return res; +} + +template +LogVal operator/(const LogVal& o1, const LogVal& o2) { + LogVal res(o1); + res /= o2; + return res; +} + +template +T log(const LogVal& o) { + return o.v_; +} + +template +LogVal pow(const LogVal& b, const T& e) { + return b.pow(e); +} + +template +bool operator<(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ < rhs.v_); +} + +template +bool operator<=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ <= rhs.v_); +} + +template +bool operator>(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ > rhs.v_); +} + +template +bool operator>=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ >= rhs.v_); +} + +template +bool operator==(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ == rhs.v_); +} + +template +bool operator!=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ != rhs.v_); +} + +#endif diff --git a/decoder/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc new file mode 100644 index 00000000..4a6680e0 --- /dev/null +++ b/decoder/maxtrans_blunsom.cc @@ -0,0 +1,287 @@ +#include "apply_models.h" + +#include +#include +#include +#include + +#include +#include 
+ +#include "tdict.h" +#include "hg.h" +#include "ff.h" + +using boost::tuple; +using namespace std; +using namespace std::tr1; + +namespace Hack { + +struct Candidate; +typedef SmallVector JVector; +typedef vector CandidateHeap; +typedef vector CandidateList; + +// life cycle: candidates are created, placed on the heap +// and retrieved by their estimated cost, when they're +// retrieved, they're incorporated into the +LM hypergraph +// where they also know the head node index they are +// attached to. After they are added to the +LM hypergraph +// inside_prob_ and est_prob_ fields may be updated as better +// derivations are found (this happens since the successor's +// of derivation d may have a better score- they are +// explored lazily). However, the updates don't happen +// when a candidate is in the heap so maintaining the heap +// property is not an issue. +struct Candidate { + int node_index_; // -1 until incorporated + // into the +LM forest + const Hypergraph::Edge* in_edge_; // in -LM forest + Hypergraph::Edge out_edge_; + vector state_; + const JVector j_; + prob_t inside_prob_; // these are fixed until the cand + // is popped, then they may be updated + prob_t est_prob_; + + Candidate(const Hypergraph::Edge& e, + const JVector& j, + const vector& D, + bool is_goal) : + node_index_(-1), + in_edge_(&e), + j_(j) { + InitializeCandidate(D, is_goal); + } + + // used to query uniqueness + Candidate(const Hypergraph::Edge& e, + const JVector& j) : in_edge_(&e), j_(j) {} + + bool IsIncorporatedIntoHypergraph() const { + return node_index_ >= 0; + } + + void InitializeCandidate(const vector >& D, + const bool is_goal) { + const Hypergraph::Edge& in_edge = *in_edge_; + out_edge_.rule_ = in_edge.rule_; + out_edge_.feature_values_ = in_edge.feature_values_; + Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; + tail.resize(j_.size()); + prob_t p = prob_t::One(); + // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; + vector* > 
ants(tail.size()); + for (int i = 0; i < tail.size(); ++i) { + const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]]; + ants[i] = &ant.state_; + assert(ant.IsIncorporatedIntoHypergraph()); + tail[i] = ant.node_index_; + p *= ant.inside_prob_; + } + prob_t edge_estimate = prob_t::One(); + if (is_goal) { + assert(tail.size() == 1); + out_edge_.edge_prob_ = in_edge.edge_prob_; + } else { + in_edge.rule_->ESubstitute(ants, &state_); + out_edge_.edge_prob_ = in_edge.edge_prob_; + } + inside_prob_ = out_edge_.edge_prob_ * p; + est_prob_ = inside_prob_ * edge_estimate; + } +}; + +ostream& operator<<(ostream& os, const Candidate& cand) { + os << "CAND["; + if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } + else { os << "+LM_node=" << cand.node_index_; } + os << " edge=" << cand.in_edge_->id_; + os << " j=<"; + for (int i = 0; i < cand.j_.size(); ++i) + os << (i==0 ? "" : " ") << cand.j_[i]; + os << "> vit=" << log(cand.inside_prob_); + os << " est=" << log(cand.est_prob_); + return os << ']'; +} + +struct HeapCandCompare { + bool operator()(const Candidate* l, const Candidate* r) const { + return l->est_prob_ < r->est_prob_; + } +}; + +struct EstProbSorter { + bool operator()(const Candidate* l, const Candidate* r) const { + return l->est_prob_ > r->est_prob_; + } +}; + +// the same candidate can be added multiple times if +// j is multidimensional (if you're going NW in Manhattan, you +// can first go north, then west, or you can go west then north) +// this is a hash function on the relevant variables from +// Candidate to enforce this. 
+struct CandidateUniquenessHash { + size_t operator()(const Candidate* c) const { + size_t x = 5381; + x = ((x << 5) + x) ^ c->in_edge_->id_; + for (int i = 0; i < c->j_.size(); ++i) + x = ((x << 5) + x) ^ c->j_[i]; + return x; + } +}; + +struct CandidateUniquenessEquals { + bool operator()(const Candidate* a, const Candidate* b) const { + return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); + } +}; + +typedef unordered_set UniqueCandidateSet; +typedef unordered_map, Candidate*, boost::hash > > State2Node; + +class MaxTransBeamSearch { + +public: + MaxTransBeamSearch(const Hypergraph& i, int pop_limit, Hypergraph* o) : + in(i), + out(*o), + D(in.nodes_.size()), + pop_limit_(pop_limit) { + cerr << " Finding max translation (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; + } + + void Apply() { + int num_nodes = in.nodes_.size(); + int goal_id = num_nodes - 1; + int pregoal = goal_id - 1; + assert(in.nodes_[pregoal].out_edges_.size() == 1); + cerr << " "; + for (int i = 0; i < in.nodes_.size(); ++i) { + cerr << '.'; + KBest(i, i == goal_id); + } + cerr << endl; + int best_node = D[goal_id].front()->in_edge_->tail_nodes_.front(); + Candidate& best = *D[best_node].front(); + cerr << " Best path: " << log(best.inside_prob_) + << "\t" << log(best.est_prob_) << endl; + cout << TD::GetString(D[best_node].front()->state_) << endl; + FreeAll(); + } + + private: + void FreeAll() { + for (int i = 0; i < D.size(); ++i) { + CandidateList& D_i = D[i]; + for (int j = 0; j < D_i.size(); ++j) + delete D_i[j]; + } + D.clear(); + } + + void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { + Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_); + new_edge->feature_values_ = item->out_edge_.feature_values_; + new_edge->edge_prob_ = item->out_edge_.edge_prob_; + Candidate*& o_item = (*s2n)[item->state_]; + if (!o_item) o_item = item; + + int& node_id = o_item->node_index_; + if (node_id < 0) { + 
Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_, ""); + node_id = new_node->id_; + } + Hypergraph::Node* node = &out.nodes_[node_id]; + out.ConnectEdgeToHeadNode(new_edge, node); + + if (item != o_item) { + assert(o_item->state_ == item->state_); // sanity check! + o_item->est_prob_ += item->est_prob_; + o_item->inside_prob_ += item->inside_prob_; + freelist->push_back(item); + } + } + + void KBest(const int vert_index, const bool is_goal) { + // cerr << "KBest(" << vert_index << ")\n"; + CandidateList& D_v = D[vert_index]; + assert(D_v.empty()); + const Hypergraph::Node& v = in.nodes_[vert_index]; + // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; + const vector& in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + UniqueCandidateSet unique_cands; + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, D, is_goal)); + assert(unique_cands.insert(cand.back()).second); // these should all be unique! 
+ } +// cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); + // cerr << "POPPED: " << *item << endl; + PushSucc(*item, is_goal, &cand, &unique_cands); + IncorporateIntoPlusLMForest(item, &state2node, &freelist); + ++pops; + } + D_v.resize(state2node.size()); + int c = 0; + for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i) + D_v[c++] = i->second; + sort(D_v.begin(), D_v.end(), EstProbSorter()); + // cerr << " expanded to " << D_v.size() << " nodes\n"; + + for (int i = 0; i < cand.size(); ++i) + delete cand[i]; + // freelist is necessary since even after an item merged, it still stays in + // the unique set so it can't be deleted til now + for (int i = 0; i < freelist.size(); ++i) + delete freelist[i]; + } + + void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) { + CandidateHeap& cand = *pcand; + for (int i = 0; i < item.j_.size(); ++i) { + JVector j = item.j_; + ++j[i]; + if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { + Candidate query_unique(*item.in_edge_, j); + if (cs->count(&query_unique) == 0) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, D, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check + } + } + } + } + + const Hypergraph& in; + Hypergraph& out; + + vector D; // maps nodes in in-HG to the + // equivalent nodes (many due to state + // splits) in the out-HG. 
+ const int pop_limit_; +}; + +// each node in the graph has one of these, it keeps track of +void MaxTrans(const Hypergraph& in, + int beam_size) { + Hypergraph out; + MaxTransBeamSearch ma(in, beam_size, &out); + ma.Apply(); +} + +} diff --git a/decoder/parser_test.cc b/decoder/parser_test.cc new file mode 100644 index 00000000..da1fbd89 --- /dev/null +++ b/decoder/parser_test.cc @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include "hg.h" +#include "trule.h" +#include "bottom_up_parser.h" +#include "tdict.h" + +using namespace std; + +class ChartTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(ChartTest,LanguageModel) { + LatticeArc a(TD::Convert("ein"), 0.0, 1); + LatticeArc b(TD::Convert("haus"), 0.0, 1); + Lattice lattice(2); + lattice[0].push_back(a); + lattice[1].push_back(b); + Hypergraph forest; + GrammarPtr g(new TextGrammar); + vector grammars(1, g); + ExhaustiveBottomUpParser parser("PHRASE", grammars); + parser.Parse(lattice, &forest); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc new file mode 100644 index 00000000..5eb70876 --- /dev/null +++ b/decoder/phrasebased_translator.cc @@ -0,0 +1,206 @@ +#include "phrasebased_translator.h" + +#include +#include +#include +#include + +#include +#include + +#include "sentence_metadata.h" +#include "tdict.h" +#include "hg.h" +#include "filelib.h" +#include "lattice.h" +#include "phrasetable_fst.h" +#include "array2d.h" + +using namespace std; +using namespace std::tr1; +using namespace boost::tuples; + +struct Coverage : public vector { + explicit Coverage(int n, bool v = false) : vector(n, v), first_gap() {} + void Cover(int i, int j) { + vector::iterator it = this->begin() + i; + vector::iterator end = this->begin() + j; + while (it != end) + *it++ = true; + if (first_gap == i) { 
+ first_gap = j;
+ it = end;
+ while (it != this->end() && *it) {  // bounds check must precede the dereference
+ ++it;
+ ++first_gap;
+ }
+ }
+ }
+ bool Collides(int i, int j) const {
+ vector::const_iterator it = this->begin() + i;
+ vector::const_iterator end = this->begin() + j;
+ while (it != end)
+ if (*it++) return true;
+ return false;
+ }
+ int GetFirstGap() const { return first_gap; }
+ private:
+ int first_gap;
+};
+struct CoverageHash {
+ size_t operator()(const Coverage& cov) const {
+ return hasher_(static_cast&>(cov));
+ }
+ private:
+ boost::hash > hasher_;
+};
+ostream& operator<<(ostream& os, const Coverage& cov) {
+ os << '[';
+ for (int i = 0; i < cov.size(); ++i)
+ os << (cov[i] ? '*' : '.');
+ return os << " gap=" << cov.GetFirstGap() << ']';
+}
+
+typedef unordered_map CoverageNodeMap;
+typedef unordered_set UniqueCoverageSet;
+
+struct PhraseBasedTranslatorImpl {
+ PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) :
+ add_pass_through_rules(conf.count("add_pass_through_rules")),
+ max_distortion(conf["pb_max_distortion"].as()),
+ kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)),
+ kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)),
+ kNT_TYPE(TD::Convert("X") * -1) {
+ assert(max_distortion >= 0);
+ vector gfiles = conf["grammar"].as >();
+ assert(gfiles.size() == 1);
+ cerr << "Reading phrasetable from " << gfiles.front() << endl;
+ ReadFile in(gfiles.front());
+ fst.reset(LoadTextPhrasetable(in.stream()));
+ }
+
+ struct State {
+ State(const Coverage& c, int _i, int _j, const FSTNode* q) :
+ coverage(c), i(_i), j(_j), fst(q) {}
+ Coverage coverage;
+ int i;
+ int j;
+ const FSTNode* fst;
+ };
+
+ // we keep track of unique coverages that have been extended since it's
+ // possible to "extend" the same coverage twice, e.g. translate "a b c"
+ // with phrases "a" "b" "a b" and "c".
There are two ways to cover "a b" + void EnqueuePossibleContinuations(const Coverage& coverage, queue* q, UniqueCoverageSet* ucs) { + if (ucs->insert(coverage).second) { + const int gap = coverage.GetFirstGap(); + const int end = min(static_cast(coverage.size()), gap + max_distortion + 1); + for (int i = gap; i < end; ++i) + if (!coverage[i]) q->push(State(coverage, i, i, fst.get())); + } + } + + bool Translate(const std::string& input, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* minus_lm_forest) { + Lattice lattice; + LatticeTools::ConvertTextOrPLF(input, &lattice); + smeta->SetSourceLength(lattice.size()); + size_t est_nodes = lattice.size() * lattice.size() * (1 << max_distortion); + minus_lm_forest->ReserveNodes(est_nodes, est_nodes * 100); + if (add_pass_through_rules) { + SparseVector feats; + feats.set_value(FD::Convert("PassThrough"), 1); + for (int i = 0; i < lattice.size(); ++i) { + const vector& arcs = lattice[i]; + for (int j = 0; j < arcs.size(); ++j) { + fst->AddPassThroughTranslation(arcs[j].label, feats); + // TODO handle lattice edge features + } + } + } + CoverageNodeMap c; + queue q; + UniqueCoverageSet ucs; + const Coverage empty_cov(lattice.size(), false); + const Coverage goal_cov(lattice.size(), true); + EnqueuePossibleContinuations(empty_cov, &q, &ucs); + c[empty_cov] = 0; // have to handle the left edge specially + while(!q.empty()) { + const State s = q.front(); + q.pop(); + // cerr << "(" << s.i << "," << s.j << " ptr=" << s.fst << ") cov=" << s.coverage << endl; + const vector& arcs = lattice[s.j]; + if (s.fst->HasData()) { + Coverage new_cov = s.coverage; + new_cov.Cover(s.i, s.j); + EnqueuePossibleContinuations(new_cov, &q, &ucs); + const vector& phrases = s.fst->GetTranslations()->GetRules(); + const int phrase_head_index = minus_lm_forest->AddNode(kNT_TYPE)->id_; + for (int i = 0; i < phrases.size(); ++i) { + Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector()); + 
edge->feature_values_ = edge->rule_->scores_; + minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index); + } + CoverageNodeMap::iterator cit = c.find(s.coverage); + assert(cit != c.end()); + const int tail_node_plus1 = cit->second; + if (tail_node_plus1 == 0) { // left edge + c[new_cov] = phrase_head_index + 1; + } else { // not left edge + int& head_node_plus1 = c[new_cov]; + if (!head_node_plus1) + head_node_plus1 = minus_lm_forest->AddNode(kNT_TYPE)->id_ + 1; + Hypergraph::TailNodeVector tail(2, tail_node_plus1 - 1); + tail[1] = phrase_head_index; + const int concat_edge = minus_lm_forest->AddEdge(kCONCAT_RULE, tail)->id_; + minus_lm_forest->ConnectEdgeToHeadNode(concat_edge, head_node_plus1 - 1); + } + } + if (s.j == lattice.size()) continue; + for (int l = 0; l < arcs.size(); ++l) { + const LatticeArc& arc = arcs[l]; + + const FSTNode* next_fst_state = s.fst->Extend(arc.label); + const int next_j = s.j + arc.dist2next; + if (next_fst_state && + !s.coverage.Collides(s.i, next_j)) { + q.push(State(s.coverage, s.i, next_j, next_fst_state)); + } + } + } + if (add_pass_through_rules) + fst->ClearPassThroughTranslations(); + int pregoal_plus1 = c[goal_cov]; + if (pregoal_plus1 > 0) { + TRulePtr kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [X,1]")); + int goal = minus_lm_forest->AddNode(TD::Convert("Goal") * -1)->id_; + int gedge = minus_lm_forest->AddEdge(kGOAL_RULE, Hypergraph::TailNodeVector(1, pregoal_plus1 - 1))->id_; + minus_lm_forest->ConnectEdgeToHeadNode(gedge, goal); + // they are almost topo, but not quite always + minus_lm_forest->TopologicallySortNodesAndEdges(goal); + minus_lm_forest->Reweight(weights); + return true; + } else { + return false; // composition failed + } + } + + const bool add_pass_through_rules; + const int max_distortion; + TRulePtr kSOURCE_RULE; + const TRulePtr kCONCAT_RULE; + const WordID kNT_TYPE; + boost::shared_ptr fst; +}; + +PhraseBasedTranslator::PhraseBasedTranslator(const boost::program_options::variables_map& 
conf) : + pimpl_(new PhraseBasedTranslatorImpl(conf)) {} + +bool PhraseBasedTranslator::Translate(const std::string& input, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* minus_lm_forest) { + return pimpl_->Translate(input, smeta, weights, minus_lm_forest); +} diff --git a/decoder/phrasebased_translator.h b/decoder/phrasebased_translator.h new file mode 100644 index 00000000..d42ce79c --- /dev/null +++ b/decoder/phrasebased_translator.h @@ -0,0 +1,18 @@ +#ifndef _PHRASEBASED_TRANSLATOR_H_ +#define _PHRASEBASED_TRANSLATOR_H_ + +#include "translator.h" + +class PhraseBasedTranslatorImpl; +class PhraseBasedTranslator : public Translator { + public: + PhraseBasedTranslator(const boost::program_options::variables_map& conf); + bool Translate(const std::string& input, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* minus_lm_forest); + private: + boost::shared_ptr pimpl_; +}; + +#endif diff --git a/decoder/phrasetable_fst.cc b/decoder/phrasetable_fst.cc new file mode 100644 index 00000000..f421e941 --- /dev/null +++ b/decoder/phrasetable_fst.cc @@ -0,0 +1,141 @@ +#include "phrasetable_fst.h" + +#include +#include +#include + +#include + +#include "filelib.h" +#include "tdict.h" + +using boost::shared_ptr; +using namespace std; + +TargetPhraseSet::~TargetPhraseSet() {} +FSTNode::~FSTNode() {} + +class TextTargetPhraseSet : public TargetPhraseSet { + public: + void AddRule(TRulePtr rule) { + rules_.push_back(rule); + } + const vector& GetRules() const { + return rules_; + } + + private: + // all rules must have arity 0 + vector rules_; +}; + +class TextFSTNode : public FSTNode { + public: + const TargetPhraseSet* GetTranslations() const { return data.get(); } + bool HasData() const { return (bool)data; } + bool HasOutgoingNonEpsilonEdges() const { return !ptr.empty(); } + const FSTNode* Extend(const WordID& t) const { + map::const_iterator it = ptr.find(t); + if (it == ptr.end()) return NULL; + return &it->second; + } + + void 
AddPhrase(const string& phrase); + + void AddPassThroughTranslation(const WordID& w, const SparseVector& feats); + void ClearPassThroughTranslations(); + private: + vector passthroughs; + shared_ptr data; + map ptr; +}; + +#ifdef DEBUG_CHART_PARSER +static string TrimRule(const string& r) { + size_t start = r.find(" |||") + 5; + size_t end = r.rfind(" |||"); + return r.substr(start, end - start); +} +#endif + +void TextFSTNode::AddPhrase(const string& phrase) { + vector words; + TRulePtr rule(TRule::CreateRulePhrasetable(phrase)); + if (!rule) { + static int err = 0; + ++err; + if (err > 2) { cerr << "TOO MANY PHRASETABLE ERRORS\n"; exit(1); } + return; + } + + TextFSTNode* fsa = this; + for (int i = 0; i < rule->FLength(); ++i) + fsa = &fsa->ptr[rule->f_[i]]; + + if (!fsa->data) + fsa->data.reset(new TextTargetPhraseSet); + static_cast(fsa->data.get())->AddRule(rule); +} + +void TextFSTNode::AddPassThroughTranslation(const WordID& w, const SparseVector& feats) { + TextFSTNode* next = &ptr[w]; + // current, rules are only added if the symbol is completely missing as a + // word starting the phrase. As a result, it is possible that some sentences + // won't parse. If this becomes a problem, fix it here. + if (!next->data) { + TextTargetPhraseSet* tps = new TextTargetPhraseSet; + next->data.reset(tps); + TRule* rule = new TRule; + rule->e_.resize(1, w); + rule->f_.resize(1, w); + rule->lhs_ = TD::Convert("___PHRASE") * -1; + rule->scores_ = feats; + rule->arity_ = 0; + tps->AddRule(TRulePtr(rule)); + passthroughs.push_back(w); + } +} + +void TextFSTNode::ClearPassThroughTranslations() { + for (int i = 0; i < passthroughs.size(); ++i) + ptr.erase(passthroughs[i]); + passthroughs.clear(); +} + +static void AddPhrasetableToFST(istream* in, TextFSTNode* fst) { + int lc = 0; + bool flag = false; + while(*in) { + string line; + getline(*in, line); + if (line.empty()) continue; + ++lc; + fst->AddPhrase(line); + if (lc % 10000 == 0) { flag = true; cerr << '.' 
<< flush; } + if (lc % 500000 == 0) { flag = false; cerr << " [" << lc << ']' << endl << flush; } + } + if (flag) cerr << endl; + cerr << "Loaded " << lc << " source phrases\n"; +} + +FSTNode* LoadTextPhrasetable(istream* in) { + TextFSTNode *fst = new TextFSTNode; + AddPhrasetableToFST(in, fst); + return fst; +} + +FSTNode* LoadTextPhrasetable(const vector& filenames) { + TextFSTNode* fst = new TextFSTNode; + for (int i = 0; i < filenames.size(); ++i) { + ReadFile rf(filenames[i]); + cerr << "Reading phrase from " << filenames[i] << endl; + AddPhrasetableToFST(rf.stream(), fst); + } + return fst; +} + +FSTNode* LoadBinaryPhrasetable(const string& fname_prefix) { + (void) fname_prefix; + assert(!"not implemented yet"); +} + diff --git a/decoder/phrasetable_fst.h b/decoder/phrasetable_fst.h new file mode 100644 index 00000000..477de1f7 --- /dev/null +++ b/decoder/phrasetable_fst.h @@ -0,0 +1,34 @@ +#ifndef _PHRASETABLE_FST_H_ +#define _PHRASETABLE_FST_H_ + +#include +#include + +#include "sparse_vector.h" +#include "trule.h" + +class TargetPhraseSet { + public: + virtual ~TargetPhraseSet(); + virtual const std::vector& GetRules() const = 0; +}; + +class FSTNode { + public: + virtual ~FSTNode(); + virtual const TargetPhraseSet* GetTranslations() const = 0; + virtual bool HasData() const = 0; + virtual bool HasOutgoingNonEpsilonEdges() const = 0; + virtual const FSTNode* Extend(const WordID& t) const = 0; + + // these should only be called on q_0: + virtual void AddPassThroughTranslation(const WordID& w, const SparseVector& feats) = 0; + virtual void ClearPassThroughTranslations() = 0; +}; + +// attn caller: you own the memory +FSTNode* LoadTextPhrasetable(const std::vector& filenames); +FSTNode* LoadTextPhrasetable(std::istream* in); +FSTNode* LoadBinaryPhrasetable(const std::string& fname_prefix); + +#endif diff --git a/decoder/prob.h b/decoder/prob.h new file mode 100644 index 00000000..bc297870 --- /dev/null +++ b/decoder/prob.h @@ -0,0 +1,8 @@ +#ifndef _PROB_H_ 
+#define _PROB_H_ + +#include "logval.h" + +typedef LogVal prob_t; + +#endif diff --git a/decoder/sampler.h b/decoder/sampler.h new file mode 100644 index 00000000..e5840f41 --- /dev/null +++ b/decoder/sampler.h @@ -0,0 +1,136 @@ +#ifndef SAMPLER_H_ +#define SAMPLER_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "prob.h" + +struct SampleSet; + +template +struct RandomNumberGenerator { + static uint32_t GetTrulyRandomSeed() { + uint32_t seed; + std::ifstream r("/dev/urandom"); + if (r) { + r.read((char*)&seed,sizeof(uint32_t)); + } + if (r.fail() || !r) { + std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl; + seed = time(NULL); + } + std::cerr << "Seeding random number sequence to " << seed << std::endl; + return seed; + } + + RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { + uint32_t seed = GetTrulyRandomSeed(); + m_generator.seed(seed); + } + explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { + if (!seed) seed = GetTrulyRandomSeed(); + m_generator.seed(seed); + } + + size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) { + if (T == 1.0) { + if (this->next() > (a / (a + b))) return 1; else return 0; + } else { + assert(!"not implemented"); + } + } + + // T is the annealing temperature, if desired + size_t SelectSample(const SampleSet& ss, double T = 1.0); + + // draw a value from U(0,1) + double next() {return m_random();} + + // draw a value from N(mean,var) + double NextNormal(double mean, double var) { + return boost::normal_distribution(mean, var)(m_random); + } + + // draw a value from a Poisson distribution + // lambda must be greater than 0 + int NextPoisson(int lambda) { + return boost::poisson_distribution(lambda)(m_random); + } + + bool AcceptMetropolisHastings(const prob_t& p_cur, + const prob_t& p_prev, + const prob_t& q_cur, 
+ const prob_t& q_prev) { + const prob_t a = (p_cur / p_prev) * (q_prev / q_cur); + if (log(a) >= 0.0) return true; + return (prob_t(this->next()) < a); + } + + private: + boost::uniform_real<> m_dist; + RNG m_generator; + boost::variate_generator > m_random; +}; + +typedef RandomNumberGenerator MT19937; + +class SampleSet { + public: + const prob_t& operator[](int i) const { return m_scores[i]; } + bool empty() const { return m_scores.empty(); } + void add(const prob_t& s) { m_scores.push_back(s); } + void clear() { m_scores.clear(); } + size_t size() const { return m_scores.size(); } + std::vector m_scores; +}; + +template +size_t RandomNumberGenerator::SelectSample(const SampleSet& ss, double T) { + assert(T > 0.0); + assert(ss.m_scores.size() > 0); + if (ss.m_scores.size() == 1) return 0; + const prob_t annealing_factor(1.0 / T); + const bool anneal = (annealing_factor != prob_t::One()); + prob_t sum = prob_t::Zero(); + if (anneal) { + for (int i = 0; i < ss.m_scores.size(); ++i) + sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T) + } else { + sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); + } + //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; + //std::cerr << std::endl; + + prob_t random(this->next()); // random number between 0 and 1 + random *= sum; // scale with normalization factor + //std::cerr << "Random number " << random << std::endl; + + //now figure out which sample + size_t position = 1; + sum = ss.m_scores[0]; + if (anneal) { + sum.poweq(annealing_factor); + for (; position < ss.m_scores.size() && sum < random; ++position) + sum += ss.m_scores[position].pow(annealing_factor); + } else { + for (; position < ss.m_scores.size() && sum < random; ++position) + sum += ss.m_scores[position]; + } + //std::cout << "random: " << random << " sample: " << position << std::endl; + //std::cerr << "Sample: " << position-1 << std::endl; + //exit(1); + return position-1; +} + +#endif diff 
--git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc new file mode 100644 index 00000000..03602c6b --- /dev/null +++ b/decoder/scfg_translator.cc @@ -0,0 +1,66 @@ +#include "translator.h" + +#include + +#include "hg.h" +#include "grammar.h" +#include "bottom_up_parser.h" +#include "sentence_metadata.h" + +using namespace std; + +Translator::~Translator() {} + +struct SCFGTranslatorImpl { + SCFGTranslatorImpl(const boost::program_options::variables_map& conf) : + max_span_limit(conf["scfg_max_span_limit"].as()), + add_pass_through_rules(conf.count("add_pass_through_rules")), + goal(conf["goal"].as()), + default_nt(conf["scfg_default_nt"].as()) { + vector gfiles = conf["grammar"].as >(); + for (int i = 0; i < gfiles.size(); ++i) { + cerr << "Reading SCFG grammar from " << gfiles[i] << endl; + TextGrammar* g = new TextGrammar(gfiles[i]); + g->SetMaxSpan(max_span_limit); + grammars.push_back(GrammarPtr(g)); + } + if (!conf.count("scfg_no_hiero_glue_grammar")) + grammars.push_back(GrammarPtr(new GlueGrammar(goal, default_nt))); + if (conf.count("scfg_extra_glue_grammar")) + grammars.push_back(GrammarPtr(new GlueGrammar(conf["scfg_extra_glue_grammar"].as()))); + } + + const int max_span_limit; + const bool add_pass_through_rules; + const string goal; + const string default_nt; + vector grammars; + + bool Translate(const string& input, + SentenceMetadata* smeta, + const vector& weights, + Hypergraph* forest) { + vector glist = grammars; + Lattice lattice; + LatticeTools::ConvertTextOrPLF(input, &lattice); + smeta->SetSourceLength(lattice.size()); + if (add_pass_through_rules) + glist.push_back(GrammarPtr(new PassThroughGrammar(lattice, default_nt))); + ExhaustiveBottomUpParser parser(goal, glist); + if (!parser.Parse(lattice, forest)) + return false; + forest->Reweight(weights); + return true; + } +}; + +SCFGTranslator::SCFGTranslator(const boost::program_options::variables_map& conf) : + pimpl_(new SCFGTranslatorImpl(conf)) {} + +bool 
SCFGTranslator::Translate(const string& input, + SentenceMetadata* smeta, + const vector& weights, + Hypergraph* minus_lm_forest) { + return pimpl_->Translate(input, smeta, weights, minus_lm_forest); +} + diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h new file mode 100644 index 00000000..ef9eb388 --- /dev/null +++ b/decoder/sentence_metadata.h @@ -0,0 +1,47 @@ +#ifndef _SENTENCE_METADATA_H_ +#define _SENTENCE_METADATA_H_ + +#include +#include "lattice.h" + +struct SentenceMetadata { + SentenceMetadata(int id, const Lattice& ref) : + sent_id_(id), + src_len_(-1), + has_reference_(ref.size() > 0), + trg_len_(ref.size()), + ref_(has_reference_ ? &ref : NULL) {} + + // this should be called by the Translator object after + // it has parsed the source + void SetSourceLength(int sl) { src_len_ = sl; } + + // this should be called if a separate model needs to + // specify how long the target sentence should be + void SetTargetLength(int tl) { + assert(!has_reference_); + trg_len_ = tl; + } + bool HasReference() const { return has_reference_; } + const Lattice& GetReference() const { return *ref_; } + int GetSourceLength() const { return src_len_; } + int GetTargetLength() const { return trg_len_; } + int GetSentenceID() const { return sent_id_; } + // this will be empty if the translator accepts non FS input! + const Lattice& GetSourceLattice() const { return src_lattice_; } + + private: + const int sent_id_; + // the following should be set, if possible, by the Translator + int src_len_; + public: + Lattice src_lattice_; // this will only be set if inputs are finite state! 
+ private: + // you need to be very careful when depending on these values + // they will only be set during training / alignment contexts + const bool has_reference_; + int trg_len_; + const Lattice* const ref_; +}; + +#endif diff --git a/decoder/small_vector.h b/decoder/small_vector.h new file mode 100644 index 00000000..800c1df1 --- /dev/null +++ b/decoder/small_vector.h @@ -0,0 +1,187 @@ +#ifndef _SMALL_VECTOR_H_ + +#include // std::max - where to get this? +#include +#include + +#define __SV_MAX_STATIC 2 + +class SmallVector { + + public: + SmallVector() : size_(0) {} + + explicit SmallVector(size_t s, int v = 0) : size_(s) { + assert(s < 0x80); + if (s <= __SV_MAX_STATIC) { + for (int i = 0; i < s; ++i) data_.vals[i] = v; + } else { + capacity_ = s; + size_ = s; + data_.ptr = new int[s]; + for (int i = 0; i < size_; ++i) data_.ptr[i] = v; + } + } + + SmallVector(const SmallVector& o) : size_(o.size_) { + if (size_ <= __SV_MAX_STATIC) { + for (int i = 0; i < __SV_MAX_STATIC; ++i) data_.vals[i] = o.data_.vals[i]; + } else { + capacity_ = size_ = o.size_; + data_.ptr = new int[capacity_]; + std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int)); + } + } + + const SmallVector& operator=(const SmallVector& o) { + if (size_ <= __SV_MAX_STATIC) { + if (o.size_ <= __SV_MAX_STATIC) { + size_ = o.size_; + for (int i = 0; i < __SV_MAX_STATIC; ++i) data_.vals[i] = o.data_.vals[i]; + } else { + capacity_ = size_ = o.size_; + data_.ptr = new int[capacity_]; + std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int)); + } + } else { + if (o.size_ <= __SV_MAX_STATIC) { + delete[] data_.ptr; + size_ = o.size_; + for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i]; + } else { + if (capacity_ < o.size_) { + delete[] data_.ptr; + capacity_ = o.size_; + data_.ptr = new int[capacity_]; + } + size_ = o.size_; + for (int i = 0; i < size_; ++i) + data_.ptr[i] = o.data_.ptr[i]; + } + } + return *this; + } + + ~SmallVector() { + if (size_ <= __SV_MAX_STATIC) return; + 
delete[] data_.ptr; + } + + void clear() { + if (size_ > __SV_MAX_STATIC) { + delete[] data_.ptr; + } + size_ = 0; + } + + bool empty() const { return size_ == 0; } + size_t size() const { return size_; } + + inline void ensure_capacity(unsigned char min_size) { + assert(min_size > __SV_MAX_STATIC); + if (min_size < capacity_) return; + unsigned char new_cap = std::max(static_cast(capacity_ << 1), min_size); + int* tmp = new int[new_cap]; + std::memcpy(tmp, data_.ptr, capacity_ * sizeof(int)); + delete[] data_.ptr; + data_.ptr = tmp; + capacity_ = new_cap; + } + + inline void copy_vals_to_ptr() { + capacity_ = __SV_MAX_STATIC * 2; + int* tmp = new int[capacity_]; + for (int i = 0; i < __SV_MAX_STATIC; ++i) tmp[i] = data_.vals[i]; + data_.ptr = tmp; + } + + inline void push_back(int v) { + if (size_ < __SV_MAX_STATIC) { + data_.vals[size_] = v; + ++size_; + return; + } else if (size_ == __SV_MAX_STATIC) { + copy_vals_to_ptr(); + } else if (size_ == capacity_) { + ensure_capacity(size_ + 1); + } + data_.ptr[size_] = v; + ++size_; + } + + int& back() { return this->operator[](size_ - 1); } + const int& back() const { return this->operator[](size_ - 1); } + int& front() { return this->operator[](0); } + const int& front() const { return this->operator[](0); } + + void resize(size_t s, int v = 0) { + if (s <= __SV_MAX_STATIC) { + if (size_ > __SV_MAX_STATIC) { + int tmp[__SV_MAX_STATIC]; + for (int i = 0; i < s; ++i) tmp[i] = data_.ptr[i]; + delete[] data_.ptr; + for (int i = 0; i < s; ++i) data_.vals[i] = tmp[i]; + size_ = s; + return; + } + if (s <= size_) { + size_ = s; + return; + } else { + for (int i = size_; i < s; ++i) + data_.vals[i] = v; + size_ = s; + return; + } + } else { + if (size_ <= __SV_MAX_STATIC) + copy_vals_to_ptr(); + if (s > capacity_) + ensure_capacity(s); + if (s > size_) { + for (int i = size_; i < s; ++i) + data_.ptr[i] = v; + } + size_ = s; + } + } + + int& operator[](size_t i) { + if (size_ <= __SV_MAX_STATIC) return data_.vals[i]; + return 
data_.ptr[i]; + } + + const int& operator[](size_t i) const { + if (size_ <= __SV_MAX_STATIC) return data_.vals[i]; + return data_.ptr[i]; + } + + bool operator==(const SmallVector& o) const { + if (size_ != o.size_) return false; + if (size_ <= __SV_MAX_STATIC) { + for (size_t i = 0; i < size_; ++i) + if (data_.vals[i] != o.data_.vals[i]) return false; + return true; + } else { + for (size_t i = 0; i < size_; ++i) + if (data_.ptr[i] != o.data_.ptr[i]) return false; + return true; + } + } + + private: + unsigned char capacity_; // only defined when size_ >= __SV_MAX_STATIC + unsigned char size_; + union StorageType { + int vals[__SV_MAX_STATIC]; + int* ptr; + }; + StorageType data_; + +}; + +inline bool operator!=(const SmallVector& a, const SmallVector& b) { + return !(a==b); +} + +#endif diff --git a/decoder/small_vector_test.cc b/decoder/small_vector_test.cc new file mode 100644 index 00000000..84237791 --- /dev/null +++ b/decoder/small_vector_test.cc @@ -0,0 +1,129 @@ +#include "small_vector.h" + +#include +#include +#include +#include + +using namespace std; + +class SVTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(SVTest, LargerThan2) { + SmallVector v; + SmallVector v2; + v.push_back(0); + v.push_back(1); + v.push_back(2); + assert(v.size() == 3); + assert(v[2] == 2); + assert(v[1] == 1); + assert(v[0] == 0); + v2 = v; + SmallVector copy(v); + assert(copy.size() == 3); + assert(copy[0] == 0); + assert(copy[1] == 1); + assert(copy[2] == 2); + assert(copy == v2); + copy[1] = 99; + assert(copy != v2); + assert(v2.size() == 3); + assert(v2[2] == 2); + assert(v2[1] == 1); + assert(v2[0] == 0); + v2[0] = -2; + v2[1] = -1; + v2[2] = 0; + assert(v2[2] == 0); + assert(v2[1] == -1); + assert(v2[0] == -2); + SmallVector v3(1,1); + assert(v3[0] == 1); + v2 = v3; + assert(v2.size() == 1); + assert(v2[0] == 1); + SmallVector v4(10, 1); + assert(v4.size() == 10); + assert(v4[5] == 1); + assert(v4[9] == 1); 
+ v4 = v; + assert(v4.size() == 3); + assert(v4[2] == 2); + assert(v4[1] == 1); + assert(v4[0] == 0); + SmallVector v5(10, 2); + assert(v5.size() == 10); + assert(v5[7] == 2); + assert(v5[0] == 2); + assert(v.size() == 3); + v = v5; + assert(v.size() == 10); + assert(v[2] == 2); + assert(v[9] == 2); + SmallVector cc; + for (int i = 0; i < 33; ++i) + cc.push_back(i); + for (int i = 0; i < 33; ++i) + assert(cc[i] == i); + cc.resize(20); + assert(cc.size() == 20); + for (int i = 0; i < 20; ++i) + assert(cc[i] == i); + cc[0]=-1; + cc.resize(1, 999); + assert(cc.size() == 1); + assert(cc[0] == -1); + cc.resize(99, 99); + for (int i = 1; i < 99; ++i) { + cerr << i << " " << cc[i] << endl; + assert(cc[i] == 99); + } + cc.clear(); + assert(cc.size() == 0); +} + +TEST_F(SVTest, Small) { + SmallVector v; + SmallVector v1(1,0); + SmallVector v2(2,10); + SmallVector v1a(2,0); + EXPECT_TRUE(v1 != v1a); + EXPECT_TRUE(v1 == v1); + EXPECT_EQ(v1[0], 0); + EXPECT_EQ(v2[1], 10); + EXPECT_EQ(v2[0], 10); + ++v2[1]; + --v2[0]; + EXPECT_EQ(v2[0], 9); + EXPECT_EQ(v2[1], 11); + SmallVector v3(v2); + assert(v3[0] == 9); + assert(v3[1] == 11); + assert(!v3.empty()); + assert(v3.size() == 2); + v3.clear(); + assert(v3.empty()); + assert(v3.size() == 0); + assert(v3 != v2); + assert(v2 != v3); + v3 = v2; + assert(v3 == v2); + assert(v2 == v3); + assert(v3[0] == 9); + assert(v3[1] == 11); + assert(!v3.empty()); + assert(v3.size() == 2); + cerr << sizeof(SmallVector) << endl; + cerr << sizeof(vector) << endl; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/decoder/sparse_vector.cc b/decoder/sparse_vector.cc new file mode 100644 index 00000000..4035b9ef --- /dev/null +++ b/decoder/sparse_vector.cc @@ -0,0 +1,98 @@ +#include "sparse_vector.h" + +#include +#include + +#include "hg_io.h" + +using namespace std; + +namespace B64 { + +void Encode(double objective, const SparseVector& v, ostream* out) { + const int num_feats = 
v.num_active(); + size_t tot_size = 0; + const size_t off_objective = tot_size; + tot_size += sizeof(double); // objective + const size_t off_num_feats = tot_size; + tot_size += sizeof(int); // num_feats + const size_t off_data = tot_size; + tot_size += sizeof(unsigned char) * num_feats; // lengths of feature names; + typedef SparseVector::const_iterator const_iterator; + for (const_iterator it = v.begin(); it != v.end(); ++it) + tot_size += FD::Convert(it->first).size(); // feature names; + tot_size += sizeof(double) * num_feats; // gradient + const size_t off_magic = tot_size; + tot_size += 4; // magic + + // size_t b64_size = tot_size * 4 / 3; + // cerr << "Sparse vector binary size: " << tot_size << " (b64 size=" << b64_size << ")\n"; + char* data = new char[tot_size]; + *reinterpret_cast(&data[off_objective]) = objective; + *reinterpret_cast(&data[off_num_feats]) = num_feats; + char* cur = &data[off_data]; + assert(cur - data == off_data); + for (const_iterator it = v.begin(); it != v.end(); ++it) { + const string& fname = FD::Convert(it->first); + *cur++ = static_cast(fname.size()); // name len + memcpy(cur, &fname[0], fname.size()); + cur += fname.size(); + *reinterpret_cast(cur) = it->second; + cur += sizeof(double); + } + assert(cur - data == off_magic); + *reinterpret_cast(cur) = 0xBAABABBAu; + cur += sizeof(unsigned int); + assert(cur - data == tot_size); + b64encode(data, tot_size, out); + delete[] data; +} + +bool Decode(double* objective, SparseVector* v, const char* in, size_t size) { + v->clear(); + if (size % 4 != 0) { + cerr << "B64 error - line % 4 != 0\n"; + return false; + } + const size_t decoded_size = size * 3 / 4 - sizeof(unsigned int); + const size_t buf_size = decoded_size + sizeof(unsigned int); + if (decoded_size < 6) { cerr << "SparseVector decoding error: too short!\n"; return false; } + char* data = new char[buf_size]; + if (!b64decode(reinterpret_cast(in), size, data, buf_size)) { + delete[] data; + return false; + } + size_t cur = 
0; + *objective = *reinterpret_cast(data); + cur += sizeof(double); + const int num_feats = *reinterpret_cast(&data[cur]); + cur += sizeof(int); + int fc = 0; + while(fc < num_feats && cur < decoded_size) { + ++fc; + const int fname_len = data[cur++]; + assert(fname_len > 0); + assert(fname_len < 256); + string fname(fname_len, '\0'); + memcpy(&fname[0], &data[cur], fname_len); + cur += fname_len; + const double val = *reinterpret_cast(&data[cur]); + cur += sizeof(double); + int fid = FD::Convert(fname); + v->set_value(fid, val); + } + if(num_feats != fc) { + cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n"; + delete[] data; + return false; + } + if (*reinterpret_cast(&data[cur]) != 0xBAABABBAu) { + cerr << "SparseVector decodeding error : magic does not match!\n"; + delete[] data; + return false; + } + delete[] data; + return true; +} + +} diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h new file mode 100644 index 00000000..6a8c9bf4 --- /dev/null +++ b/decoder/sparse_vector.h @@ -0,0 +1,264 @@ +#ifndef _SPARSE_VECTOR_H_ +#define _SPARSE_VECTOR_H_ + +// this is a modified version of code originally written +// by Phil Blunsom + +#include +#include +#include +#include + +#include "fdict.h" + +template +class SparseVector { +public: + SparseVector() {} + + const T operator[](int index) const { + typename std::map::const_iterator found = _values.find(index); + if (found == _values.end()) + return T(0); + else + return found->second; + } + + void set_value(int index, const T &value) { + _values[index] = value; + } + + void add_value(int index, const T &value) { + _values[index] += value; + } + + T value(int index) const { + typename std::map::const_iterator found = _values.find(index); + if (found != _values.end()) + return found->second; + else + return T(0); + } + + void store(std::valarray* target) const { + (*target) *= 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) { + if 
(it->first >= target->size()) break; + (*target)[it->first] = it->second; + } + } + + int max_index() const { + if (_values.empty()) return 0; + typename std::map::const_iterator found =_values.end(); + --found; + return found->first; + } + + // dot product with a unit vector of the same length + // as the sparse vector + T dot() const { + T sum = 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + sum += it->second; + return sum; + } + + template + S dot(const SparseVector &vec) const { + S sum = 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + { + typename std::map::const_iterator + found = vec._values.find(it->first); + if (found != vec._values.end()) + sum += it->second * found->second; + } + return sum; + } + + template + S dot(const std::vector &vec) const { + S sum = 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + { + if (it->first < static_cast(vec.size())) + sum += it->second * vec[it->first]; + } + return sum; + } + + template + S dot(const S *vec) const { + // this is not range checked! 
+ S sum = 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + sum += it->second * vec[it->first]; + std::cout << "dot(*vec) " << sum << std::endl; + return sum; + } + + T l1norm() const { + T sum = 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + sum += fabs(it->second); + return sum; + } + + T l2norm() const { + T sum = 0; + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + sum += it->second * it->second; + return sqrt(sum); + } + + SparseVector &operator+=(const SparseVector &other) { + for (typename std::map::const_iterator + it = other._values.begin(); it != other._values.end(); ++it) + { + T v = (_values[it->first] += it->second); + if (v == 0) + _values.erase(it->first); + } + return *this; + } + + SparseVector &operator-=(const SparseVector &other) { + for (typename std::map::const_iterator + it = other._values.begin(); it != other._values.end(); ++it) + { + T v = (_values[it->first] -= it->second); + if (v == 0) + _values.erase(it->first); + } + return *this; + } + + SparseVector &operator-=(const double &x) { + for (typename std::map::iterator + it = _values.begin(); it != _values.end(); ++it) + it->second -= x; + return *this; + } + + SparseVector &operator+=(const double &x) { + for (typename std::map::iterator + it = _values.begin(); it != _values.end(); ++it) + it->second += x; + return *this; + } + + SparseVector &operator/=(const double &x) { + for (typename std::map::iterator + it = _values.begin(); it != _values.end(); ++it) + it->second /= x; + return *this; + } + + SparseVector &operator*=(const T& x) { + for (typename std::map::iterator + it = _values.begin(); it != _values.end(); ++it) + it->second *= x; + return *this; + } + + SparseVector operator+(const double &x) const { + SparseVector result = *this; + return result += x; + } + + SparseVector operator-(const double &x) const { + SparseVector result = *this; 
+ return result -= x; + } + + SparseVector operator/(const double &x) const { + SparseVector result = *this; + return result /= x; + } + + std::ostream &operator<<(std::ostream &out) const { + for (typename std::map::const_iterator + it = _values.begin(); it != _values.end(); ++it) + out << (it == _values.begin() ? "" : ";") + << FD::Convert(it->first) << '=' << it->second; + return out; + } + + bool operator<(const SparseVector &other) const { + typename std::map::const_iterator it = _values.begin(); + typename std::map::const_iterator other_it = other._values.begin(); + + for (; it != _values.end() && other_it != other._values.end(); ++it, ++other_it) + { + if (it->first < other_it->first) return true; + if (it->first > other_it->first) return false; + if (it->second < other_it->second) return true; + if (it->second > other_it->second) return false; + } + return _values.size() < other._values.size(); + } + + int num_active() const { return _values.size(); } + bool empty() const { return _values.empty(); } + + typedef typename std::map::const_iterator const_iterator; + const_iterator begin() const { return _values.begin(); } + const_iterator end() const { return _values.end(); } + + void clear() { + _values.clear(); + } + + void swap(SparseVector& other) { + _values.swap(other._values); + } + +private: + std::map _values; +}; + +template +SparseVector operator+(const SparseVector& a, const SparseVector& b) { + SparseVector result = a; + return result += b; +} + +template +SparseVector operator*(const SparseVector& a, const double& b) { + SparseVector result = a; + return result *= b; +} + +template +SparseVector operator*(const SparseVector& a, const T& b) { + SparseVector result = a; + return result *= b; +} + +template +SparseVector operator*(const double& a, const SparseVector& b) { + SparseVector result = b; + return result *= a; +} + +template +std::ostream &operator<<(std::ostream &out, const SparseVector &vec) +{ + return vec.operator<<(out); +} + 
+namespace B64 { + void Encode(double objective, const SparseVector& v, std::ostream* out); + // returns false if failed to decode + bool Decode(double* objective, SparseVector* v, const char* data, size_t size); +} + +#endif diff --git a/decoder/stringlib.cc b/decoder/stringlib.cc new file mode 100644 index 00000000..3ed74bef --- /dev/null +++ b/decoder/stringlib.cc @@ -0,0 +1,97 @@ +#include "stringlib.h" + +#include +#include +#include +#include + +#include "lattice.h" + +using namespace std; + +void ParseTranslatorInput(const string& line, string* input, string* ref) { + size_t hint = 0; + if (line.find("{\"rules\":") == 0) { + hint = line.find("}}"); + if (hint == string::npos) { + cerr << "Syntax error: " << line << endl; + abort(); + } + hint += 2; + } + size_t pos = line.find("|||", hint); + if (pos == string::npos) { *input = line; return; } + ref->clear(); + *input = line.substr(0, pos - 1); + string rline = line.substr(pos + 4); + if (rline.size() > 0) { + assert(ref); + *ref = rline; + } +} + +void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { + string sref; + ParseTranslatorInput(line, input, &sref); + if (sref.size() > 0) { + assert(ref); + LatticeTools::ConvertTextOrPLF(sref, ref); + } +} + +void ProcessAndStripSGML(string* pline, map* out) { + map& meta = *out; + string& line = *pline; + string lline = LowercaseString(line); + if (lline.find(""); + if (close == string::npos) return; // error + size_t end = lline.find(""); + string seg = Trim(lline.substr(4, close-4)); + string text = line.substr(close+1, end - close - 1); + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=' && seg[i-1] == ' ') { + string less = seg.substr(0, i-1) + seg.substr(i); + seg = less; i = 0; continue; + } + if (seg[i] == '=' && seg[i+1] == ' ') { + string less = seg.substr(0, i+1); + if (i+2 < seg.size()) less += seg.substr(i+2); + seg = less; i = 0; continue; + } + } + line = Trim(text); + if (seg == "") return; + for (size_t i = 
1; i < seg.size(); i++) { + if (seg[i] == '=') { + string label = seg.substr(0, i); + string val = seg.substr(i+1); + if (val[0] == '"') { + val = val.substr(1); + size_t close = val.find('"'); + if (close == string::npos) { + cerr << "SGML parse error: missing \"\n"; + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + i = 0; + } + } else { + size_t close = val.find(' '); + if (close == string::npos) { + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + } + } + label = Trim(label); + seg = Trim(seg); + meta[label] = val; + } + } +} + diff --git a/decoder/stringlib.h b/decoder/stringlib.h new file mode 100644 index 00000000..76efee8f --- /dev/null +++ b/decoder/stringlib.h @@ -0,0 +1,101 @@ +#ifndef _STRINGLIB_H_ + +#include +#include +#include +#include + +// read line in the form of either: +// source +// source ||| target +// source will be returned as a string, target must be a sentence or +// a lattice (in PLF format) and will be returned as a Lattice object +void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); +struct Lattice; +void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); + +inline const std::string Trim(const std::string& str, const std::string& dropChars = " \t") { + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +inline void Tokenize(const std::string& str, char delimiter, std::vector* res) { + std::string s = str; + int last = 0; + res->clear(); + for (int i=0; i < s.size(); ++i) + if (s[i] == delimiter) { + s[i]=0; + if (last != i) { + res->push_back(&s[last]); + } + last = i + 1; + } + if (last != s.size()) + res->push_back(&s[last]); +} + +inline std::string LowercaseString(const std::string& in) { + std::string res(in.size(),' '); + for (int i = 0; i < in.size(); ++i) + res[i] = tolower(in[i]); + return 
res; +} + +inline int CountSubstrings(const std::string& str, const std::string& sub) { + size_t p = 0; + int res = 0; + while (p < str.size()) { + p = str.find(sub, p); + if (p == std::string::npos) break; + ++res; + p += sub.size(); + } + return res; +} + +inline int SplitOnWhitespace(const std::string& in, std::vector* out) { + out->clear(); + int i = 0; + int start = 0; + std::string cur; + while(i < in.size()) { + if (in[i] == ' ' || in[i] == '\t') { + if (i - start > 0) + out->push_back(in.substr(start, i - start)); + start = i + 1; + } + ++i; + } + if (i > start) + out->push_back(in.substr(start, i - start)); + return out->size(); +} + +inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) { + cmd->clear(); + param->clear(); + std::vector x; + SplitOnWhitespace(in, &x); + if (x.size() == 0) return; + *cmd = x[0]; + for (int i = 1; i < x.size(); ++i) { + if (i > 1) { *param += " "; } + *param += x[i]; + } +} + +void ProcessAndStripSGML(std::string* line, std::map* out); + +// given the first character of a UTF8 block, find out how wide it is +// see http://en.wikipedia.org/wiki/UTF-8 for more info +inline unsigned int UTF8Len(unsigned char x) { + if (x < 0x80) return 1; + else if ((x >> 5) == 0x06) return 2; + else if ((x >> 4) == 0x0e) return 3; + else if ((x >> 3) == 0x1e) return 4; + else return 0; +} + +#endif diff --git a/decoder/tdict.cc b/decoder/tdict.cc new file mode 100644 index 00000000..c00d20b8 --- /dev/null +++ b/decoder/tdict.cc @@ -0,0 +1,49 @@ +#include "Ngram.h" +#include "dict.h" +#include "tdict.h" +#include "Vocab.h" + +using namespace std; + +Vocab* TD::dict_ = new Vocab; + +static const string empty; +static const string space = " "; + +WordID TD::Convert(const std::string& s) { + return dict_->addWord((VocabString)s.c_str()); +} + +const char* TD::Convert(const WordID& w) { + return dict_->getWord((VocabIndex)w); +} + +void TD::GetWordIDs(const std::vector& strings, std::vector* ids) { + 
ids->clear(); + for (vector::const_iterator i = strings.begin(); i != strings.end(); ++i) + ids->push_back(TD::Convert(*i)); +} + +std::string TD::GetString(const std::vector& str) { + string res; + for (vector::const_iterator i = str.begin(); i != str.end(); ++i) + res += (i == str.begin() ? empty : space) + TD::Convert(*i); + return res; +} + +void TD::ConvertSentence(const std::string& sent, std::vector* ids) { + string s = sent; + int last = 0; + ids->clear(); + for (int i=0; i < s.size(); ++i) + if (s[i] == 32 || s[i] == '\t') { + s[i]=0; + if (last != i) { + ids->push_back(Convert(&s[last])); + } + last = i + 1; + } + if (last != s.size()) + ids->push_back(Convert(&s[last])); +} + diff --git a/decoder/tdict.h b/decoder/tdict.h new file mode 100644 index 00000000..9d4318fe --- /dev/null +++ b/decoder/tdict.h @@ -0,0 +1,19 @@ +#ifndef _TDICT_H_ +#define _TDICT_H_ + +#include +#include +#include "wordid.h" + +class Vocab; + +struct TD { + static Vocab* dict_; + static void ConvertSentence(const std::string& sent, std::vector* ids); + static void GetWordIDs(const std::vector& strings, std::vector* ids); + static std::string GetString(const std::vector& str); + static WordID Convert(const std::string& s); + static const char* Convert(const WordID& w); +}; + +#endif diff --git a/decoder/test_data/dummy.3gram.lm b/decoder/test_data/dummy.3gram.lm new file mode 100644 index 00000000..ae665284 --- /dev/null +++ b/decoder/test_data/dummy.3gram.lm @@ -0,0 +1,2645 @@ + +\data\ +ngram 1=490 +ngram 2=1023 +ngram 3=1119 + +\1-grams: +-2.761928 ! -0.06284945 +-1.91683 " -0.03559465 +-2.761928 ' -0.06057167 +-2.159868 ( -0.07742823 +-2.159868 ) -0.05637721 +-1.292106 , -0.04497077 +-3.062958 - -0.06247065 +-1.429489 . 
-0.08555528 +-2.761928 12 -0.06473851 +-3.062958 17 -0.06586801 +-2.585837 2000 -0.05520994 +-3.062958 2002 -0.06360606 +-3.062958 2006 -0.0497812 +-3.062958 2008 -0.06322792 +-3.062958 2009 -0.0497812 +-3.062958 200–400 -0.06549184 +-3.062958 224 -0.06586801 +-1.91683 +-99 -0.0457003 +-2.761928 ? -0.05751594 +-1.720535 a -0.05548429 +-2.460898 about -0.05211611 +-3.062958 acquiesced -0.05942829 +-3.062958 actually -0.04349266 +-3.062958 addition -0.05980976 +-3.062958 admit -0.06095213 +-3.062958 affected -0.04071253 +-2.761928 against -0.06549184 +-3.062958 aging -0.06586801 +-3.062958 ago -0.04349266 +-3.062958 ahead -0.06586801 +-2.761928 al -0.06284945 +-2.761928 all -0.0590465 +-3.062958 all-around -0.06586801 +-3.062958 along -0.04071253 +-2.761928 also -0.06322792 +-2.761928 always -0.06436136 +-2.363988 an -0.06436135 +-3.062958 analysis -0.06473851 +-1.631594 and 0.006203346 +-3.062958 anti-divine -0.06586801 +-3.062958 any -0.06549184 +-3.062958 approach -0.05789908 +-3.062958 archive -0.04071253 +-3.062958 are -0.05789908 +-2.761928 arkive -0.06549184 +-2.585837 article -0.0228177 +-2.21786 as -0.09020901 +-3.062958 asked -0.06398387 +-2.585837 at -0.03145044 +-2.761928 attention -0.02612664 +-3.062958 available -0.04349266 +-3.062958 average -0.04349266 +-3.062958 away -0.06322792 +-3.062958 ayers -0.05597997 +-3.062958 b -0.04349266 +-3.062958 back-and-forth -0.06586801 +-3.062958 bailie -0.0497812 +-2.761928 be -0.06511534 +-3.062958 because -0.06586801 +-2.460898 been -0.06322791 +-3.062958 before -0.04349266 +-2.761928 begin -0.05520995 +-3.062958 being -0.06586801 +-2.585837 between -0.1350269 +-2.460898 bias -0.04111077 +-3.062958 biased -0.06511534 +-3.062958 biblical -0.06586801 +-3.062958 bill -0.06586801 +-3.062958 blade -0.06436136 +-3.062958 blood -0.04349266 +-3.062958 bob -0.06549184 +-3.062958 book -0.06436136 +-2.159868 briffa -0.06804922 +-2.761928 briffa's -0.06284945 +-2.021565 but -0.01525023 +-2.21786 by -0.07600738 +-2.761928 ca 
-0.2166343 +-2.761928 can -0.06473851 +-3.062958 case -0.06511534 +-3.062958 cast -0.06473851 +-3.062958 catch -0.06511534 +-3.062958 caught -0.06511534 +-3.062958 caveats -0.06322792 +-3.062958 centennial-scale -0.06549184 +-3.062958 cf -0.0497812 +-3.062958 change -0.06209152 +-3.062958 changing -0.06360606 +-3.062958 characterizes -0.06586801 +-3.062958 checked -0.06586801 +-2.159868 chronology -0.02240231 +-3.062958 church -0.06398387 +-3.062958 cocaine -0.06398387 +-3.062958 collection -0.06586801 +-3.062958 combination -0.06209152 +-3.062958 combine -0.04071253 +-3.062958 combined -0.06209152 +-3.062958 comment -0.06360606 +-3.062958 commentary -0.06322792 +-3.062958 commenter -0.06586801 +-3.062958 comments -0.06586801 +-3.062958 compared -0.05789908 +-3.062958 concerned -0.06473851 +-3.062958 concrete -0.06095213 +-3.062958 connection -0.06209152 +-2.761928 conservatives -0.06360606 +-3.062958 considered -0.06095213 +-3.062958 consists -0.04349266 +-3.062958 constructing -0.05789908 +-2.761928 control -0.03991493 +-2.585837 cores -0.0236473 +-3.062958 corridor -0.06473851 +-2.761928 crack -0.06436136 +-3.062958 crossroads -0.0497812 +-2.460898 cru -0.1318786 +-3.062958 darkness -0.05597997 +-2.108715 data -0.06845023 +-2.761928 day -0.05674864 +-2.761928 days -0.04939082 +-3.062958 debt -0.04349266 +-3.062958 decline -0.06095213 +-3.062958 deep -0.06549184 +-3.062958 deeper -0.06586801 +-3.062958 delete -0.05789908 +-3.062958 derived -0.06511534 +-3.062958 described -0.05942829 +-2.761928 did -0.06095213 +-2.761928 difference -0.04860901 +-2.761928 different -0.06247065 +-2.761928 divergence -0.2166343 +-2.761928 do -0.05559513 +-3.062958 does -0.06247065 +-3.062958 doing -0.06586801 +-3.062958 don't -0.06586801 +-3.062958 done -0.06586801 +-3.062958 doubt -0.06360606 +-3.062958 down -0.05789908 +-3.062958 due -0.06473851 +-3.062958 earlier -0.06019088 +-3.062958 editors -0.06511534 +-3.062958 energy -0.04349266 +-3.062958 enormous -0.06586801 +-2.761928 et 
-0.2166343 +-3.062958 even -0.06586801 +-3.062958 every -0.06586801 +-3.062958 exactly -0.06360606 +-3.062958 exception -0.05789908 +-3.062958 excluding -0.06549184 +-3.062958 expect -0.06511534 +-3.062958 extension -0.05597997 +-3.062958 factors -0.04349266 +-3.062958 fantasy -0.06436136 +-3.062958 far -0.06511534 +-2.585837 few -0.1590744 +-2.585837 finally -0.06511533 +-3.062958 first -0.04349266 +-3.062958 flesh -0.05597997 +-3.062958 following: -0.06095213 +-3.062958 follows: -0.06095213 +-2.284806 for -0.06171204 +-3.062958 forests -0.0497812 +-2.585837 from -0.05713245 +-3.062958 fully -0.06586801 +-2.585837 further -0.06511533 +-3.062958 furthermore -0.04349266 +-3.062958 future -0.0497812 +-3.062958 generating -0.06586801 +-2.761928 get -0.191855 +-3.062958 ghastly -0.06586801 +-3.062958 ghostwritten -0.06360606 +-3.062958 gil -0.06586801 +-3.062958 given -0.04071253 +-3.062958 going -0.05789908 +-3.062958 got -0.06436136 +-2.761928 great -0.2166343 +-3.062958 growing -0.0497812 +-3.062958 grows -0.06511534 +-2.363988 had -0.1033177 +-2.585837 hantemirov -0.09654189 +-2.761928 happening -0.06436136 +-3.062958 happens -0.06549184 +-3.062958 hard -0.05789908 +-3.062958 hardly -0.06473851 +-2.460898 has -0.03063563 +-3.062958 hate -0.05789908 +-2.284806 have -0.08108715 +-3.062958 haven't -0.06586801 +-2.363988 he -0.112982 +-3.062958 here -0.06586801 +-3.062958 highly -0.06586801 +-2.761928 him -0.05751594 +-2.585837 his -0.06511533 +-3.062958 how -0.06586801 +-2.761928 however -0.1946352 +-3.062958 hs -0.06586801 +-3.062958 humanity -0.06511534 +-2.108715 i -0.05980975 +-3.062958 i'd -0.06586801 +-3.062958 i've -0.06586801 +-2.761928 idea -0.02612664 +-2.761928 if -0.03670979 +-3.062958 illusion -0.05597997 +-3.062958 immense -0.06586801 +-3.062958 impact -0.06322792 +-3.062958 important -0.06586801 +-1.807685 in -0.04419087 +-3.062958 included -0.06209152 +-2.761928 including -0.0165447 +-3.062958 indeed -0.06511534 +-3.062958 individual -0.06511534 
+-3.062958 information -0.06511534 +-3.062958 inhomogeneities -0.04349266 +-3.062958 initial -0.06549184 +-2.761928 instead -0.2109523 +-3.062958 interannual -0.06549184 +-2.761928 into -0.03991493 +-3.062958 introduced -0.06360606 +-1.91683 is -0.001109093 +-2.062958 it -0.06621437 +-2.460898 it's -0.06019088 +-3.062958 its -0.06586801 +-2.761928 journal -0.06209152 +-3.062958 jurisdiction -0.0497812 +-2.460898 just -0.05520994 +-3.062958 kaufman -0.06549184 +-3.062958 keeps -0.06586801 +-2.761928 khadyta -0.2166343 +-2.460898 know -0.1105378 +-3.062958 larch -0.06586801 +-2.761928 larches -0.04743365 +-3.062958 large-scale -0.06095213 +-2.761928 like -0.06511534 +-3.062958 limited -0.06586801 +-3.062958 living -0.06549184 +-3.062958 longest -0.05597997 +-3.062958 looking -0.06549184 +-3.062958 looks -0.06586801 +-3.062958 love -0.05789908 +-3.062958 made -0.06095213 +-2.761928 mag -0.2143704 +-3.062958 magnitude -0.05980976 +-3.062958 magnus -0.0497812 +-3.062958 makes -0.04071253 +-3.062958 many -0.06586801 +-3.062958 may -0.06586801 +-3.062958 mean -0.06322792 +-3.062958 measured -0.06360606 +-2.761928 measurement -0.213992 +-2.460898 method -0.03711172 +-3.062958 methodology -0.06586801 +-3.062958 mind -0.06511534 +-3.062958 mix -0.06586801 +-2.585837 more -0.05636447 +-3.062958 morning -0.06284945 +-2.585837 most -0.0647385 +-2.761928 much -0.06473851 +-3.062958 multi-parters -0.04349266 +-3.062958 multiproxy -0.06586801 +-3.062958 mundane -0.06511534 +-2.585837 my -0.1598284 +-3.062958 national -0.06586801 +-3.062958 naughtiness -0.0497812 +-3.062958 nettle -0.04349266 +-3.062958 never -0.06586801 +-3.062958 next -0.04349266 +-3.062958 no -0.06586801 +-3.062958 non-robustness -0.06586801 +-3.062958 northern -0.06586801 +-2.062958 not -0.0712041 +-3.062958 noted -0.06586801 +-3.062958 noticed -0.06095213 +-3.062958 notwithstanding -0.06473851 +-3.062958 now -0.04349266 +-2.761928 obama -0.03791448 +-3.062958 observed -0.06586801 +-1.832509 of -0.04850956 
+-2.761928 old -0.06436136 +-2.585837 older -0.1053004 +-3.062958 oldie -0.04349266 +-2.159868 on -0.09226183 +-2.585837 one -0.04900008 +-3.062958 online -0.0497812 +-3.062958 only -0.06586801 +-3.062958 or -0.06586801 +-3.062958 originated -0.06209152 +-3.062958 osborn -0.05597997 +-3.062958 out -0.06322792 +-3.062958 outright -0.06586801 +-3.062958 own -0.06586801 +-3.062958 paleoclimatologists -0.05597997 +-3.062958 passage -0.06284945 +-3.062958 passing -0.05597997 +-3.062958 path -0.06095213 +-3.062958 patterns -0.05942829 +-3.062958 paul -0.06436136 +-3.062958 people -0.06095213 +-2.363988 perhaps -0.06259563 +-2.761928 phil -0.2166343 +-3.062958 picked -0.06511534 +-3.062958 piece -0.06360606 +-3.062958 place -0.0497812 +-3.062958 placed -0.06586801 +-3.062958 play -0.06322792 +-3.062958 point -0.06095213 +-3.062958 policy -0.06322792 +-2.585837 politics -0.02571439 +-2.363988 population -0.1001791 +-3.062958 position -0.06095213 +-3.062958 possible -0.05597997 +-2.761928 potential -0.06436136 +-3.062958 power -0.05789908 +-3.062958 powers -0.05597997 +-3.062958 precipitous -0.06586801 +-3.062958 precisely -0.04071253 +-3.062958 predictable -0.06586801 +-3.062958 presented -0.06019088 +-3.062958 preserve -0.06586801 +-3.062958 previous -0.06549184 +-3.062958 principalities -0.05980976 +-3.062958 principles -0.05942829 +-3.062958 prior -0.06511534 +-3.062958 probable -0.06095213 +-2.761928 problem -0.2120946 +-3.062958 projected -0.06549184 +-3.062958 properly -0.06586801 +-3.062958 prove -0.06586801 +-3.062958 provide -0.04071253 +-3.062958 provided -0.05789908 +-3.062958 provocative -0.06586801 +-3.062958 published -0.05942829 +-3.062958 push -0.06511534 +-2.585837 rcs -0.06133225 +-3.062958 react -0.05789908 +-3.062958 read -0.06247065 +-2.761928 readers -0.06398387 +-3.062958 reading -0.04349266 +-3.062958 real -0.06322792 +-3.062958 really -0.06586801 +-3.062958 realm -0.05980976 +-2.761928 reason -0.06360606 +-3.062958 recent -0.06511534 +-2.761928 
recently -0.1946352 +-3.062958 reconstruction -0.0497812 +-3.062958 refusal -0.05942829 +-3.062958 refused -0.05789908 +-3.062958 related -0.05789908 +-3.062958 relevant -0.04349266 +-3.062958 relied -0.06322792 +-3.062958 religion -0.05597997 +-3.062958 remained -0.06586801 +-3.062958 remarked -0.06095213 +-3.062958 reposting -0.06473851 +-3.062958 requiring -0.06322792 +-3.062958 response -0.05789908 +-3.062958 resulting -0.06322792 +-3.062958 rev -0.0497812 +-2.460898 right -0.04821757 +-3.062958 ring -0.06586801 +-3.062958 ring-width -0.06511534 +-2.761928 river -0.1946352 +-3.062958 said -0.06436136 +-3.062958 same -0.06473851 +-3.062958 sample -0.06586801 +-3.062958 sat -0.05942829 +-2.460898 schweingruber -0.09101291 +-3.062958 schweingruber's -0.06549184 +-2.585837 science -0.1568045 +-3.062958 script -0.06322792 +-2.585837 see -0.1112577 +-3.062958 seized -0.04071253 +-2.761928 selected -0.04664831 +-2.585837 selection -0.1491516 +-3.062958 sensitive -0.06511534 +-3.062958 sensitivity -0.06095213 +-2.585837 series -0.1314228 +-3.062958 set -0.05942829 +-3.062958 several -0.06549184 +-3.062958 shadow -0.06586801 +-2.761928 shadows -0.04309659 +-2.585837 shiyatov -0.06360605 +-3.062958 should -0.06247065 +-3.062958 similar -0.06473851 +-3.062958 similarly -0.06586801 +-3.062958 since -0.06019088 +-3.062958 size -0.05597997 +-3.062958 skimmed -0.06019088 +-2.761928 slowly -0.04270015 +-3.062958 small -0.06586801 +-3.062958 so -0.06549184 +-3.062958 some -0.06549184 +-3.062958 someone -0.06586801 +-3.062958 start -0.06549184 +-3.062958 staunchly -0.06586801 +-3.062958 struggling -0.06549184 +-3.062958 studies -0.06095213 +-2.761928 study -0.02612664 +-3.062958 stumbled -0.06586801 +-2.585837 subfossil -0.06171205 +-3.062958 subsequent -0.06549184 +-3.062958 subset -0.05942829 +-3.062958 success -0.0497812 +-3.062958 supplement -0.0497812 +-3.062958 supplemented -0.06360606 +-3.062958 surface -0.04349266 +-3.062958 take -0.06436136 +-3.062958 taken -0.05789908 
+-2.761928 taymir -0.06247065 +-3.062958 temperature -0.04349266 +-3.062958 tendency -0.05789908 +-3.062958 terms -0.05980976 +-3.062958 than -0.04071253 +-1.91683 that -0.06692892 +-1.243414 the -0.08813193 +-3.062958 their -0.06511534 +-2.761928 themselves -0.04111078 +-3.062958 there's -0.06586801 +-2.460898 these -0.05942829 +-2.460898 they -0.06398387 +-2.761928 things -0.06057167 +-3.062958 think -0.06549184 +-3.062958 thinking -0.06586801 +-1.858838 this -0.08175352 +-2.761928 those -0.06057167 +-3.062958 thought -0.0497812 +-3.062958 thousand -0.04349266 +-3.062958 through -0.04071253 +-2.761928 time -0.0326698 +-1.720535 to -0.07930601 +-2.761928 today -0.04821758 +-3.062958 took -0.04071253 +-3.062958 towards -0.06511534 +-2.761928 trans -0.06549184 +-2.460898 trees -0.04704115 +-2.761928 trouble -0.213234 +-3.062958 true -0.04349266 +-3.062958 trying -0.05789908 +-2.761928 two -0.2166343 +-3.062958 unarchived -0.0497812 +-3.062958 under -0.06549184 +-3.062958 unintentional -0.06473851 +-3.062958 unrepresentativeness -0.05980976 +-3.062958 until -0.06549184 +-3.062958 unveiled: -0.06586801 +-2.761928 up -0.03185729 +-3.062958 upon -0.06019088 +-2.761928 use -0.2109523 +-2.363988 used -0.0545155 +-2.761928 using -0.02323271 +-3.062958 usual -0.06586801 +-3.062958 valid -0.06549184 +-2.761928 variability -0.03911585 +-2.761928 versions -0.04428373 +-2.761928 very -0.06549184 +-3.062958 violence -0.06586801 +-3.062958 virtually -0.06586801 +-3.062958 virtue -0.05980976 +-3.062958 voted -0.06398387 +-3.062958 warn -0.06549184 +-3.062958 warnings -0.04349266 +-2.363988 was -0.06171205 +-3.062958 way -0.06549184 +-3.062958 we -0.06549184 +-3.062958 well -0.06398387 +-2.284806 were -0.07866543 +-2.21786 what -0.02364731 +-3.062958 what's -0.06549184 +-2.585837 when -0.06057167 +-2.585837 where -0.05597997 +-2.460898 which -0.0403139 +-2.585837 while -0.03951557 +-3.062958 whose -0.06586801 +-3.062958 why -0.06586801 +-3.062958 widths -0.05597997 +-2.761928 will 
-0.06322792 +-3.062958 wise -0.06549184 +-2.021565 with -0.08912028 +-3.062958 within -0.06549184 +-3.062958 without -0.06586801 +-3.062958 worth -0.06586801 +-2.460898 would -0.1303614 +-3.062958 wright's -0.06586801 +-3.062958 wrote -0.04071253 +-2.159868 yamal -0.0719028 +-2.761928 year -0.04270015 +-3.062958 years -0.06549184 +-3.062958 yes -0.04349266 +-3.062958 yesterday -0.06473851 +-3.062958 yet -0.04349266 +-3.062958 you -0.06511534 +-2.761928 your -0.06511534 + +\2-grams: +-1.15037 ! as -0.004049858 +-1.15037 ! instead 0.2044696 +-1.995468 " ( -0.005168174 +-1.995468 " - 0.05332709 +-1.995468 " +-1.995468 " as -0.004049858 +-1.995468 " concrete 0.05332709 +-1.995468 " corridor 0.05332709 +-1.249819 " divergence 0.1451325 +-1.995468 " further 0.008061528 +-1.995468 " i'd 0.05332709 +-1.995468 " success 0.05332709 +-1.995468 " that -0.008505944 +-1.995468 " the -0.007702977 +-1.995468 " used -0.0004517734 +-1.15037 ' +-1.15037 ' yes 0.05332709 +-1.75243 ( and -0.01063527 +-1.75243 ( in 0.006514465 +-1.006781 ( mag 0.1451325 +-1.75243 ( or 0.05332709 +-1.75243 ( phil 0.2044696 +-1.75243 ( which 0.00272119 +-1.75243 ( while 0.008061528 +-1.006781 ) , -0.002172916 +-1.75243 ) +-1.75243 ) acquiesced 0.05332709 +-1.75243 ) and -0.002266581 +-1.75243 ) had -0.0004517734 +-1.75243 ) things 0.01894335 +-1.75243 ) took 0.05332709 +-2.620192 , 2008 0.05332709 +-2.620192 , 224 0.05332709 +-2.620192 , a -0.01011507 +-2.620192 , all 0.01894335 +-1.955229 , and -0.006035992 +-2.620192 , as 0.0389223 +-2.620192 , bob 0.05332709 +-2.620192 , briffa -0.005168174 +-0.8166095 , but 0.05114232 +-2.620192 , cf 0.05332709 +-2.620192 , cru 0.00272119 +-2.620192 , delete 0.05332709 +-2.620192 , for -0.002554279 +-2.620192 , from 0.008061528 +-2.620192 , he -0.0004517734 +-2.620192 , his 0.008061528 +-1.955229 , i 0.008061524 +-2.620192 , if 0.01894335 +-2.620192 , including 0.01894335 +-2.620192 , is -0.008505944 +-1.874543 , it -0.0004517762 +-1.874543 , it's 0.01894334 
+-2.620192 , kaufman 0.05332709 +-2.620192 , most 0.008061528 +-2.620192 , notwithstanding 0.05332709 +-2.620192 , of 0.007685009 +-2.620192 , on -0.005168174 +-2.620192 , perhaps 0.04797027 +-2.620192 , requiring 0.05332709 +-2.620192 , since 0.05332709 +-1.955229 , the 0.02331641 +-1.955229 , this 0.01715922 +-2.620192 , until 0.05332709 +-2.620192 , using 0.01894335 +-1.874543 , when 0.03010483 +-2.620192 , where 0.008061528 +-1.874543 , which 0.01894334 +-2.620192 , while 0.008061528 +-2.620192 , yamal -0.005168174 +-0.8493397 - not -0.006728992 +-2.482808 . " -0.008505944 +-2.482808 . ' 0.01894335 +-2.482808 . ( -0.005168174 +-2.482808 . ) -0.005168174 +-0.6792259 . +-1.737159 . a 0.003078613 +-2.482808 . actually 0.05332709 +-2.482808 . and -0.01063527 +-2.482808 . as -0.004049858 +-1.737159 . briffa 0.03257156 +-2.482808 . but -0.007295175 +-2.482808 . changing 0.05332709 +-2.482808 . first 0.05332709 +-2.482808 . furthermore 0.05332709 +-1.737159 . however 0.1451325 +-2.482808 . i -0.006035987 +-2.482808 . in -0.009490006 +-2.482808 . it 0.0164606 +-2.482808 . perhaps 0.04797027 +-2.482808 . science 0.1193421 +-2.482808 . several 0.05332709 +-2.482808 . the -0.008591395 +-1.737159 . these 0.01894334 +-1.737159 . this 0.0130633 +-2.482808 . violence 0.05332709 +-2.482808 . what -0.004049858 +-2.482808 . what's 0.05332709 +-2.482808 . while 0.008061528 +-2.482808 . with 0.05785327 +-2.482808 . wright's 0.05332709 +-1.15037 12 cores 0.008061528 +-1.15037 12 picked 0.05332709 +-0.8493397 17 ring-width 0.05332709 +-1.326461 2000 and -0.01063527 +-1.326461 2000 may 0.05332709 +-1.326461 2000 presented 0.05332709 +-0.8493397 2002 as -0.004049858 +-0.8493397 2006 . -0.0114856 +-0.8493397 2008 ) -0.005168174 +-0.8493397 2009 . 
0.08907277 +-0.8493397 200–400 year 0.01894335 +-0.8493397 224 individual 0.05332709 +-1.995468 ' 0.01894335 +-1.995468 as 0.0389223 +-1.995468 briffa's 0.01894335 +-1.995468 but -0.007295175 +-1.995468 i -0.006035987 +-1.995468 if 0.01894335 +-1.995468 in -0.009490006 +-1.995468 next 0.05332709 +-1.249819 perhaps 0.06234263 +-1.249819 the 0.0223057 +-1.995468 this -0.009059753 +-1.995468 what -0.004049858 +-1.15037 ? " -0.008505944 +-1.15037 ? i -0.006035987 +-2.191762 a " 0.01222976 +-2.191762 a case 0.05332709 +-2.191762 a comment 0.05332709 +-2.191762 a commenter 0.05332709 +-2.191762 a different 0.01894335 +-1.5268 a few 0.109396 +-2.191762 a generating 0.05332709 +-2.191762 a great 0.2044696 +-2.191762 a mean 0.05332709 +-2.191762 a prior 0.05332709 +-2.191762 a provocative 0.05332709 +-2.191762 a rcs 0.008061528 +-2.191762 a science 0.008061528 +-2.191762 a shadow 0.05332709 +-2.191762 a similar 0.05332709 +-2.191762 a small 0.05332709 +-2.191762 a surface 0.05332709 +-2.191762 a thousand 0.05332709 +-2.191762 a time 0.01894335 +-2.191762 a valid 0.05332709 +-1.4514 about a -0.01011507 +-1.4514 about my 0.008061528 +-1.4514 about not -0.006728992 +-1.4514 about potential 0.01894335 +-0.8493397 acquiesced in -0.009490006 +-0.8493397 actually , -0.01187418 +-0.8493397 addition of -0.009287588 +-0.8493397 admit that 0.04168737 +-0.8493397 affected the -0.01198488 +-1.15037 against flesh 0.05332709 +-1.15037 against inhomogeneities 0.05332709 +-0.8493397 aging patterns 0.05332709 +-0.8493397 ago , -0.008075343 +-0.8493397 ahead you 0.05332709 +-1.15037 al ( -0.005168174 +-1.15037 al 2009 0.05332709 +-1.15037 all of -0.009287588 +-1.15037 all those 0.01894335 +-0.8493397 all-around naughtiness 0.05332709 +-0.8493397 along the -0.01198488 +-1.15037 also has 0.00272119 +-1.15037 also know 0.08231446 +-1.15037 always been 0.00272119 +-1.15037 always worth 0.05332709 +-1.54831 an exception 0.05332709 +-1.54831 an extension 0.05332709 +-1.54831 an immense 0.05332709 
+-1.54831 an important 0.05332709 +-1.54831 an unintentional 0.05332709 +-0.8493397 analysis has 0.00272119 +-2.280704 and , -0.007080218 +-2.280704 and all-around 0.05332709 +-2.280704 and blood 0.05332709 +-2.280704 and briffa -0.005168174 +-2.280704 and even 0.05332709 +-2.280704 and got 0.05332709 +-2.280704 and hantemirov 0.09388901 +-2.280704 and he 0.06152429 +-2.280704 and i've 0.05332709 +-2.280704 and it -0.006728992 +-2.280704 and most 0.008061528 +-2.280704 and outright 0.05332709 +-2.280704 and perhaps -0.0004517734 +-2.280704 and politics 0.008061528 +-2.280704 and potential 0.01894335 +-2.280704 and principalities 0.05332709 +-2.280704 and sat 0.05332709 +-2.280704 and science 0.1193421 +-1.615741 and shiyatov 0.05332708 +-2.280704 and temperature 0.05332709 +-2.280704 and that -0.008505944 +-1.615741 and the -0.005814605 +-2.280704 and they 0.00272119 +-0.8493397 anti-divine powers 0.05332709 +-0.8493397 any journal 0.01894335 +-0.8493397 approach to -0.01011507 +-0.8493397 archive the -0.01198488 +-0.8493397 are to -0.01011507 +-1.15037 arkive down 0.05332709 +-1.15037 arkive under 0.05332709 +-1.326461 article , -0.007080218 +-1.326461 article . -0.004888296 +-1.326461 article on -0.005168174 +-1.694438 as a -0.01011507 +-0.9487888 as ca 0.1451325 +-1.694438 as compared 0.05332709 +-1.694438 as follows: 0.05332709 +-1.694438 as it 0.0164606 +-1.694438 as noted 0.05332709 +-0.8493397 asked for -0.002554279 +-1.326461 at a -0.01011507 +-1.326461 at precisely 0.05332709 +-1.326461 at the -0.01198488 +-1.15037 attention , 0.05896524 +-1.15037 attention . -0.0114856 +-0.8493397 available , -0.008075343 +-0.8493397 average , -0.01187418 +-0.8493397 away ) 0.03209379 +-0.8493397 ayers and -0.01063527 +-0.8493397 b , -0.01187418 +-0.8493397 back-and-forth yesterday 0.05332709 +-0.8493397 bailie . 
-0.0114856 +-1.15037 be happening 0.01894335 +-1.15037 be included 0.05332709 +-0.8493397 because so 0.05332709 +-1.4514 been an -0.0004517734 +-1.4514 been concerned 0.05332709 +-1.4514 been done 0.05332709 +-1.4514 been projected 0.05332709 +-0.8493397 before , -0.01187418 +-1.15037 begin in -0.009490006 +-1.15037 begin with -0.007295175 +-0.8493397 being true 0.05332709 +-1.326461 between ring 0.05332709 +-0.580812 between the -0.06704012 +-1.4514 bias , -0.007080218 +-1.4514 bias introduced 0.05332709 +-1.4514 bias towards 0.05332709 +-1.4514 bias would 0.08231446 +-0.8493397 biased selection 0.1193421 +-0.8493397 biblical passage 0.05332709 +-0.8493397 bill ayers 0.05332709 +-0.8493397 blade was -0.0004517734 +-0.8493397 blood , 0.05896524 +-0.8493397 bob ? 0.01894335 +-0.8493397 book was -0.0004517734 +-1.087467 briffa 2000 0.05332708 +-1.75243 briffa 2006 0.05332709 +-1.75243 briffa asked 0.05332709 +-1.75243 briffa et 0.2044696 +-1.75243 briffa to -0.01011507 +-1.75243 briffa used -0.0004517734 +-1.15037 briffa's own 0.05332709 +-1.15037 briffa's yamal -0.005168174 +-1.890732 but , -0.01187418 +-1.890732 but anti-divine 0.05332709 +-1.890732 but because 0.05332709 +-1.890732 but between 0.1193421 +-1.890732 but given 0.05332709 +-1.145083 but it -0.0004517762 +-1.890732 but it's 0.00272119 +-1.890732 but the -0.01198488 +-1.890732 but this 0.009005655 +-1.890732 but to 0.002916232 +-1.694438 by bill 0.05332709 +-1.694438 by gil 0.05332709 +-1.694438 by hantemirov 0.09388901 +-1.694438 by how 0.05332709 +-1.694438 by magnus 0.05332709 +-0.9487888 by the -0.01105098 +-0.4047208 ca readers 0.05332709 +-1.15037 can combine 0.05332709 +-1.15037 can see 0.1193421 +-0.8493397 case where 0.008061528 +-0.8493397 cast these 0.00272119 +-0.8493397 catch my 0.1193421 +-0.8493397 caught my 0.1193421 +-0.8493397 caveats on -0.005168174 +-0.8493397 centennial-scale variability 0.01894335 +-0.8493397 cf . 
-0.0114856 +-0.8493397 change with -0.007295175 +-0.8493397 changing what -0.004049858 +-0.8493397 characterizes northern 0.05332709 +-0.8493397 checked earlier 0.05332709 +-1.75243 chronology , -0.01187418 +-1.75243 chronology also 0.01894335 +-1.75243 chronology briffa -0.005168174 +-1.75243 chronology has 0.00272119 +-1.75243 chronology in -0.009490006 +-1.75243 chronology method 0.00272119 +-1.75243 chronology was -0.0004517734 +-1.75243 chronology with -0.007295175 +-0.8493397 church for -0.002554279 +-0.8493397 cocaine for -0.002554279 +-0.8493397 collection does 0.05332709 +-0.8493397 combination with 0.05785327 +-0.8493397 combine the -0.01198488 +-0.8493397 combined with 0.05785327 +-0.8493397 comment by -0.004049858 +-0.8493397 commentary on 0.03209379 +-0.8493397 commenter remarked 0.05332709 +-0.8493397 comments catch 0.05332709 +-0.8493397 compared to 0.02102831 +-0.8493397 concerned about 0.00272119 +-0.8493397 concrete " -0.008505944 +-0.8493397 connection with -0.007295175 +-1.15037 conservatives said 0.05332709 +-1.15037 conservatives were -0.002554279 +-0.8493397 considered " -0.008505944 +-0.8493397 consists , -0.01187418 +-0.8493397 constructing a -0.01011507 +-1.15037 control ! 0.01894335 +-1.15037 control the -0.01198488 +-1.326461 cores , -0.008075343 +-1.326461 cores . -0.004888296 +-1.326461 cores were 0.04819728 +-0.8493397 corridor method 0.00272119 +-1.15037 crack about 0.00272119 +-1.15037 crack cocaine 0.05332709 +-0.8493397 crossroads . -0.0114856 +-0.7057508 cru population 0.07636014 +-1.4514 cru selection 0.008061528 +-1.4514 cru staunchly 0.05332709 +-0.8493397 darkness and -0.01063527 +-1.803582 data ( -0.005168174 +-1.057933 data . -0.0100497 +-1.803582 data policy 0.05332709 +-1.803582 data remained 0.05332709 +-1.803582 data set 0.05332709 +-1.803582 data used 0.04797027 +-1.803582 data was -0.0004517734 +-1.803582 data were -0.002554279 +-1.15037 day politics 0.008061528 +-1.15037 day to -0.01011507 +-1.15037 days . 
0.08907277 +-1.15037 days ago 0.05332709 +-0.8493397 debt , -0.007080218 +-0.8493397 decline is -0.008505944 +-0.8493397 deep into 0.01894335 +-0.8493397 deeper principles 0.05332709 +-0.8493397 delete a 0.0001907796 +-0.8493397 derived from 0.008061528 +-0.8493397 described in -0.009490006 +-1.15037 did not -0.006728992 +-1.15037 did they 0.00272119 +-1.15037 difference . 0.08907277 +-1.15037 difference between 0.1193421 +-1.15037 different aging 0.05332709 +-1.15037 different data -0.006035987 +-0.4047208 divergence problem 0.1451325 +-1.15037 do and -0.002266581 +-1.15037 do indeed 0.05332709 +-0.8493397 does not 0.0164606 +-0.8493397 doing exactly 0.05332709 +-0.8493397 don't really 0.05332709 +-0.8493397 done without 0.05332709 +-0.8493397 doubt what -0.004049858 +-0.8493397 down to -0.01011507 +-0.8493397 due just 0.00272119 +-0.8493397 earlier this -0.009059753 +-0.8493397 editors finally 0.008061528 +-0.8493397 energy , 0.05896524 +-0.8493397 enormous hs 0.05332709 +-0.4047208 et al 0.05332709 +-0.8493397 even probable 0.05332709 +-0.8493397 every subsequent 0.05332709 +-0.8493397 exactly what -0.004049858 +-0.8493397 exception to 0.02102831 +-0.8493397 excluding khadyta 0.2044696 +-0.8493397 expect from 0.008061528 +-0.8493397 extension and -0.01063527 +-0.8493397 factors , 0.05896524 +-0.8493397 fantasy had -0.0004517734 +-0.8493397 far more 0.008061528 +-1.326461 few at 0.008061528 +-0.580812 few days 0.05332709 +-1.326461 finally available 0.05332709 +-1.326461 finally placed 0.05332709 +-1.326461 finally seized 0.05332709 +-0.8493397 first , -0.01187418 +-0.8493397 flesh and -0.01063527 +-0.8493397 following: +-0.8493397 follows: +-1.627491 for all 0.01894335 +-1.627491 for an -0.0004517734 +-1.627491 for excluding 0.05332709 +-1.627491 for him 0.01894335 +-1.627491 for paleoclimatologists 0.05332709 +-1.627491 for we 0.05332709 +-0.8493397 forests . 
-0.004888296 +-1.326461 from 200–400 0.05332709 +-1.326461 from a -0.01011507 +-1.326461 from someone 0.05332709 +-0.8493397 fully thinking 0.05332709 +-1.326461 further ahead 0.05332709 +-1.326461 further along 0.05332709 +-1.326461 further away 0.05332709 +-0.8493397 furthermore , -0.007080218 +-0.8493397 future . -0.0114856 +-0.8493397 generating script 0.05332709 +-0.4047208 get the -0.06704012 +-0.8493397 ghastly tendency 0.05332709 +-0.8493397 ghostwritten by -0.004049858 +-0.8493397 gil bailie 0.05332709 +-0.8493397 given the -0.01198488 +-0.8493397 going to -0.01011507 +-0.8493397 got used 0.04797027 +-0.4047208 great idea 0.05332709 +-0.8493397 growing . 0.08907277 +-0.8493397 grows more 0.008061528 +-0.8026608 had a -0.007295178 +-1.54831 had been 0.00272119 +-1.54831 had in -0.009490006 +-1.54831 had jurisdiction 0.05332709 +-0.6614985 hantemirov and -0.5914098 +-1.15037 happening deep 0.05332709 +-1.15037 happening right 0.00272119 +-0.8493397 happens today 0.01894335 +-0.8493397 hard to -0.01011507 +-0.8493397 hardly know 0.00272119 +-1.4514 has a -0.01011507 +-1.4514 has always 0.01894335 +-1.4514 has only 0.05332709 +-1.4514 has the -0.01198488 +-0.8493397 hate to -0.01011507 +-1.627491 have an -0.0004517734 +-0.881842 have been 0.01894334 +-1.627491 have relied 0.05332709 +-1.627491 have similarly 0.05332709 +-1.627491 have the -0.01198488 +-0.8493397 haven't read 0.05332709 +-0.8026608 he is -0.004049861 +-1.54831 he made 0.05332709 +-1.54831 he would 0.00272119 +-1.54831 he wrote 0.05332709 +-0.8493397 here prove 0.05332709 +-0.8493397 highly possible 0.05332709 +-1.15037 him hate 0.05332709 +-1.15037 him to 0.002916232 +-1.326461 his comments 0.05332709 +-1.326461 his initial 0.05332709 +-1.326461 his precipitous 0.05332709 +-0.8493397 how their 0.05332709 +-0.4047208 however , -0.01082908 +-0.8493397 hs blade 0.05332709 +-0.8493397 humanity at 0.008061528 +-1.803582 i can 0.01894335 +-1.803582 i checked 0.05332709 +-1.803582 i had 0.06152429 
+-1.803582 i hardly 0.05332709 +-1.803582 i haven't 0.05332709 +-1.803582 i know 0.00272119 +-1.803582 i noticed 0.05332709 +-1.803582 i skimmed 0.05332709 +-1.803582 i stumbled 0.05332709 +-0.8493397 i'd love 0.05332709 +-0.8493397 i've provided 0.05332709 +-1.15037 idea , -0.01187418 +-1.15037 idea . -0.0114856 +-1.15037 if it -0.006728992 +-1.15037 if the -0.01198488 +-0.8493397 illusion and -0.01063527 +-0.8493397 immense energy 0.05332709 +-0.8493397 impact on -0.005168174 +-0.8493397 important impact 0.05332709 +-1.358963 in a -0.007295178 +-2.104612 in any 0.05332709 +-2.104612 in briffa 0.02412629 +-2.104612 in briffa's 0.01894335 +-2.104612 in combination 0.05332709 +-2.104612 in connection 0.05332709 +-2.104612 in hantemirov 0.09388901 +-2.104612 in mind 0.05332709 +-2.104612 in one 0.008061528 +-2.104612 in passing 0.05332709 +-2.104612 in response 0.05332709 +-2.104612 in rev 0.05332709 +-2.104612 in terms 0.05332709 +-1.358963 in the -0.007650165 +-2.104612 in this -0.009059753 +-2.104612 in virtually 0.05332709 +-0.8493397 included with 0.05785327 +-1.15037 including , -0.01187418 +-1.15037 including the -0.007702977 +-0.8493397 indeed see 0.1193421 +-0.8493397 individual series 0.008061528 +-0.8493397 information finally 0.008061528 +-0.8493397 inhomogeneities , 0.05896524 +-0.8493397 initial use 0.2044696 +-0.4047208 instead of 0.01149127 +-0.8493397 interannual variability 0.01894335 +-1.15037 into him 0.01894335 +-1.15037 into the -0.01198488 +-0.8493397 introduced by -0.004049858 +-1.995468 is , -0.007080218 +-1.995468 is always 0.01894335 +-1.995468 is considered 0.05332709 +-1.995468 is derived 0.05332709 +-1.995468 is doing 0.05332709 +-1.995468 is happening 0.01894335 +-1.995468 is highly 0.05332709 +-1.995468 is measured 0.05332709 +-1.995468 is no 0.05332709 +-1.995468 is not -0.006728992 +-1.995468 is related 0.05332709 +-1.995468 is that -0.008505944 +-1.995468 is the -0.01198488 +-1.995468 is within 0.05332709 +-1.84934 it grows 
0.05332709 +-1.84934 it has 0.00272119 +-1.184377 it is 0.0004524188 +-1.84934 it just 0.00272119 +-1.84934 it looks 0.05332709 +-1.84934 it originated 0.05332709 +-1.84934 it was -0.0004517734 +-1.84934 it yet 0.05332709 +-1.4514 it's like 0.01894335 +-1.4514 it's much 0.01894335 +-1.4514 it's not -0.006728992 +-1.4514 it's very 0.01894335 +-0.8493397 its enormous 0.05332709 +-1.15037 journal ( -0.005168174 +-1.15037 journal article 0.008061528 +-0.8493397 jurisdiction . -0.004888296 +-1.4514 just between 0.008061528 +-1.4514 just keeps 0.05332709 +-1.4514 just one 0.008061528 +-1.4514 just to 0.02102831 +-0.8493397 kaufman et 0.2044696 +-0.8493397 keeps growing 0.05332709 +-0.4047208 khadyta river 0.1451325 +-1.4514 know ! 0.01894335 +-0.7057508 know , -0.007021053 +-1.4514 know where 0.008061528 +-0.8493397 larch sample 0.05332709 +-1.15037 larches . 0.08907277 +-1.15037 larches were 0.04819728 +-0.8493397 large-scale " 0.01222976 +-1.15037 like crack 0.01894335 +-1.15037 like trying 0.05332709 +-0.8493397 limited size 0.05332709 +-0.8493397 living larches 0.01894335 +-0.8493397 longest and -0.01063527 +-0.8493397 looking up 0.01894335 +-0.8493397 looks relevant 0.05332709 +-0.8493397 love to -0.01011507 +-0.8493397 made that -0.008505944 +-0.4047208 mag ) 0.002721187 +-0.8493397 magnitude of -0.009287588 +-0.8493397 magnus . -0.0114856 +-0.8493397 makes the -0.01198488 +-0.8493397 many multiproxy 0.05332709 +-0.8493397 may well 0.05332709 +-0.8493397 mean chronology -0.005168174 +-0.8493397 measured by 0.0389223 +-0.4047208 measurement data 0.0009555696 +-1.4514 method " -0.008505944 +-1.4514 method . 
-0.004888296 +-1.4514 method that -0.008505944 +-1.4514 method which 0.00272119 +-0.8493397 methodology warn 0.05332709 +-0.8493397 mind when 0.008061528 +-0.8493397 mix religion 0.05332709 +-1.326461 more " -0.008505944 +-1.326461 more it 0.0164606 +-1.326461 more slowly 0.01894335 +-0.8493397 morning i -0.006035987 +-1.326461 most recent 0.05332709 +-1.326461 most recently 0.2044696 +-1.326461 most sensitive 0.05332709 +-1.15037 much further 0.008061528 +-1.15037 much illusion 0.05332709 +-0.8493397 multi-parters , -0.01187418 +-0.8493397 multiproxy studies 0.05332709 +-0.8493397 mundane politics 0.008061528 +-0.580812 my attention 0.05332709 +-1.326461 my ghastly 0.05332709 +-0.8493397 national debt 0.05332709 +-0.8493397 naughtiness . -0.0114856 +-0.8493397 nettle , -0.01187418 +-0.8493397 never properly 0.05332709 +-0.8493397 next , -0.008075343 +-0.8493397 no doubt 0.05332709 +-0.8493397 non-robustness observed 0.05332709 +-0.8493397 northern forests 0.05332709 +-1.84934 not be 0.01894335 +-1.84934 not due 0.05332709 +-1.84934 not going 0.05332709 +-1.184377 not have 0.07243546 +-1.84934 not just 0.00272119 +-1.84934 not preserve 0.05332709 +-1.84934 not struggling 0.05332709 +-1.84934 not using 0.01894335 +-0.8493397 noted before 0.05332709 +-0.8493397 noticed that 0.04168737 +-0.8493397 notwithstanding these 0.00272119 +-0.8493397 now , 0.05896524 +-1.15037 obama , -0.007080218 +-1.15037 obama is -0.008505944 +-0.8493397 observed here 0.05332709 +-2.079789 of 17 0.05332709 +-2.079789 of a -0.01011507 +-2.079789 of being 0.05332709 +-2.079789 of commentary 0.05332709 +-2.079789 of darkness 0.05332709 +-2.079789 of deeper 0.05332709 +-2.079789 of his 0.008061528 +-2.079789 of interannual 0.05332709 +-2.079789 of mundane 0.05332709 +-2.079789 of old 0.01894335 +-1.33414 of older 0.03455187 +-2.079789 of reposting 0.05332709 +-2.079789 of subfossil 0.008061528 +-1.33414 of the -0.06704012 +-2.079789 of this -0.009059753 +-1.15037 old living 0.05332709 +-1.15037 
old trees 0.00272119 +-0.6614985 older trees 0.03579502 +-0.8493397 oldie , -0.008075343 +-1.006781 on a -0.007295178 +-1.75243 on average 0.05332709 +-1.75243 on many 0.05332709 +-1.75243 on rcs 0.008061528 +-1.75243 on the -0.007702977 +-1.006781 on this -0.005168174 +-1.326461 one . -0.0114856 +-1.326461 one approach 0.05332709 +-1.326461 one oldie 0.05332709 +-0.8493397 online . -0.0114856 +-0.8493397 only taken 0.05332709 +-0.8493397 or real 0.05332709 +-0.8493397 originated with -0.007295175 +-0.8493397 osborn and -0.01063527 +-0.8493397 out ( -0.005168174 +-0.8493397 outright fantasy 0.05332709 +-0.8493397 own caveats 0.05332709 +-0.8493397 paleoclimatologists and -0.01063527 +-0.8493397 passage i -0.006035987 +-0.8493397 passing and -0.01063527 +-0.8493397 path " -0.008505944 +-0.8493397 patterns in 0.006514465 +-0.8493397 paul had -0.0004517734 +-0.8493397 people that -0.008505944 +-0.8833473 perhaps the -0.01011507 +-1.54831 perhaps there's 0.05332709 +-1.54831 perhaps they 0.00272119 +-0.4047208 phil trans 0.05332709 +-0.8493397 picked cores 0.008061528 +-0.8493397 piece by -0.004049858 +-0.8493397 place . -0.0114856 +-0.8493397 placed online 0.05332709 +-0.8493397 play on 0.03209379 +-0.8493397 point that -0.008505944 +-0.8493397 policy ) -0.005168174 +-1.326461 politics , -0.01187418 +-1.326461 politics . -0.004888296 +-1.326461 politics are 0.05332709 +-0.8026608 population . 
-0.0100497 +-1.54831 population as -0.004049858 +-1.54831 population consists 0.05332709 +-1.54831 population instead 0.2044696 +-0.8493397 position that 0.04168737 +-0.8493397 possible and -0.01063527 +-1.15037 potential bias 0.00272119 +-1.15037 potential unrepresentativeness 0.05332709 +-0.8493397 power to -0.01011507 +-0.8493397 powers and -0.01063527 +-0.8493397 precipitous decline 0.05332709 +-0.8493397 precisely the -0.007702977 +-0.8493397 predictable factors 0.05332709 +-0.8493397 presented this 0.009005655 +-0.8493397 preserve centennial-scale 0.05332709 +-0.8493397 previous journal 0.01894335 +-0.8493397 principalities of -0.009287588 +-0.8493397 principles in 0.006514465 +-0.8493397 prior selection 0.1193421 +-0.8493397 probable that 0.04168737 +-0.4047208 problem " -0.004049861 +-0.8493397 projected into 0.01894335 +-0.8493397 properly published 0.05332709 +-0.8493397 prove out 0.05332709 +-0.8493397 provide the -0.01198488 +-0.8493397 provided a -0.01011507 +-0.8493397 provocative thought 0.05332709 +-0.8493397 published in -0.009490006 +-0.8493397 push at 0.008061528 +-1.326461 rcs chronology -0.005168174 +-1.326461 rcs method 0.00272119 +-1.326461 rcs methodology 0.05332709 +-0.8493397 react to 0.002916232 +-0.8493397 read it -0.006728992 +-1.15037 readers also 0.01894335 +-1.15037 readers know 0.08231446 +-0.8493397 reading , -0.01187418 +-0.8493397 real ) -0.005168174 +-0.8493397 really react 0.05332709 +-0.8493397 realm of -0.009287588 +-1.15037 reason for -0.002554279 +-1.15037 reason why 0.05332709 +-0.8493397 recent one 0.008061528 +-0.4047208 recently , -0.01082908 +-0.8493397 reconstruction . 
-0.0114856 +-0.8493397 refusal in -0.009490006 +-0.8493397 refused to -0.01011507 +-0.8493397 related to -0.01011507 +-0.8493397 relevant , -0.008075343 +-0.8493397 relied on 0.03209379 +-0.8493397 religion and -0.01063527 +-0.8493397 remained unarchived 0.05332709 +-0.8493397 remarked that -0.008505944 +-0.8493397 reposting just 0.00272119 +-0.8493397 requiring briffa -0.005168174 +-0.8493397 response to 0.02102831 +-0.8493397 resulting yamal 0.02412629 +-0.8493397 rev . -0.0114856 +-1.4514 right . -0.0114856 +-1.4514 right now 0.05332709 +-1.4514 right place 0.05332709 +-1.4514 right time 0.01894335 +-0.8493397 ring widths 0.05332709 +-0.8493397 ring-width series 0.1193421 +-0.4047208 river , -0.01082908 +-0.8493397 said he -0.0004517734 +-0.8493397 same bias 0.00272119 +-0.8493397 sample should 0.05332709 +-0.8493397 sat in -0.009490006 +-1.4514 schweingruber data -0.006035987 +-0.7864373 schweingruber population 0.09172077 +-0.8493397 schweingruber's khadyta 0.2044696 +-0.580812 science ( -0.02724335 +-1.326461 science article 0.008061528 +-0.8493397 script ) 0.03209379 +-1.326461 see , -0.008075343 +-0.580812 see the -0.01105098 +-0.8493397 seized the -0.01198488 +-1.15037 selected . -0.004888296 +-1.15037 selected on 0.03209379 +-1.326461 selection is -0.008505944 +-0.580812 selection of 0.01149127 +-0.8493397 sensitive series 0.1193421 +-0.8493397 sensitivity is -0.008505944 +-0.580812 series , -0.01082908 +-1.326461 series of -0.009287588 +-0.8493397 set in -0.009490006 +-0.8493397 several things 0.01894335 +-0.8493397 shadow play 0.05332709 +-1.15037 shadows . 
-0.0114856 +-1.15037 shadows of -0.009287588 +-1.326461 shiyatov 2002 0.05332709 +-1.326461 shiyatov themselves 0.01894335 +-1.326461 shiyatov would 0.08231446 +-0.8493397 should not -0.006728992 +-0.8493397 similar schweingruber 0.00272119 +-0.8493397 similarly affected 0.05332709 +-0.8493397 since this -0.009059753 +-0.8493397 size and -0.01063527 +-0.8493397 skimmed this -0.009059753 +-1.15037 slowly , -0.01187418 +-1.15037 slowly get 0.2044696 +-0.8493397 small push 0.05332709 +-0.8493397 so much 0.01894335 +-0.8493397 some reason 0.01894335 +-0.8493397 someone whose 0.05332709 +-0.8493397 start today 0.01894335 +-0.8493397 staunchly refused 0.05332709 +-0.8493397 struggling against 0.01894335 +-0.8493397 studies that -0.008505944 +-1.15037 study , -0.01187418 +-1.15037 study . 0.08907277 +-0.8493397 stumbled upon 0.05332709 +-1.326461 subfossil collection 0.05332709 +-1.326461 subfossil data 0.02685598 +-1.326461 subfossil larches 0.01894335 +-0.8493397 subsequent study 0.01894335 +-0.8493397 subset in -0.009490006 +-0.8493397 success . -0.0114856 +-0.8493397 supplement . 
0.08907277 +-0.8493397 supplemented by 0.0389223 +-0.8493397 surface , -0.008075343 +-0.8493397 take an -0.0004517734 +-0.8493397 taken a 0.0001907796 +-1.15037 taymir data -0.006035987 +-1.15037 taymir supplement 0.05332709 +-0.8493397 temperature , 0.05896524 +-0.8493397 tendency to -0.01011507 +-0.8493397 terms of -0.009287588 +-0.8493397 than the -0.008591395 +-1.995468 that " -0.008505944 +-1.995468 that cast 0.05332709 +-1.995468 that characterizes 0.05332709 +-1.995468 that have -0.002554279 +-1.995468 that he 0.06152429 +-1.995468 that his 0.008061528 +-0.9275748 that the 0.03271748 +-1.995468 that they 0.00272119 +-1.995468 that voted 0.05332709 +-1.995468 that way 0.05332709 +-1.995468 that wise 0.05332709 +-2.668884 the " -0.008505944 +-1.923235 the 12 0.05332709 +-2.668884 the addition 0.05332709 +-1.923235 the arkive 0.05332709 +-2.668884 the back-and-forth 0.05332709 +-2.668884 the biased 0.05332709 +-2.668884 the biblical 0.05332709 +-2.668884 the chronology -0.005168174 +-1.923235 the conservatives 0.05332709 +-2.668884 the crossroads 0.05332709 +-2.003921 the cru 0.0632299 +-2.668884 the data 0.02685598 +-2.668884 the day 0.01894335 +-2.668884 the difference 0.01894335 +-2.668884 the far 0.05332709 +-2.668884 the following: 0.05332709 +-2.668884 the further 0.008061528 +-2.668884 the future 0.05332709 +-2.668884 the information 0.05332709 +-2.668884 the large-scale 0.05332709 +-2.668884 the longest 0.05332709 +-2.668884 the magnitude 0.05332709 +-2.668884 the measurement 0.2044696 +-2.668884 the more 0.008061528 +-2.668884 the most 0.008061528 +-2.668884 the multi-parters 0.05332709 +-2.668884 the national 0.05332709 +-2.668884 the nettle 0.05332709 +-2.668884 the non-robustness 0.05332709 +-2.668884 the path 0.05332709 +-2.668884 the people 0.05332709 +-2.668884 the phil 0.2044696 +-2.668884 the point 0.05332709 +-2.668884 the position 0.05332709 +-2.668884 the previous 0.05332709 +-2.668884 the rcs 0.008061528 +-2.668884 the realm 0.05332709 
+-2.668884 the resulting 0.05332709 +-1.923235 the right 0.01894334 +-2.668884 the same 0.05332709 +-2.003921 the schweingruber -0.5245172 +-2.668884 the shadows 0.01894335 +-2.668884 the subfossil 0.008061528 +-1.923235 the taymir 0.05332709 +-1.923235 the trouble 0.1451325 +-1.923235 the two 0.1451325 +-2.668884 the use 0.2044696 +-2.668884 the usual 0.05332709 +-2.668884 the very 0.01894335 +-2.668884 the virtue 0.05332709 +-1.120574 the yamal 0.02719982 +-0.8493397 their cores 0.008061528 +-1.15037 themselves , -0.01187418 +-1.15037 themselves were -0.002554279 +-0.8493397 there's some 0.05332709 +-1.4514 these data -0.006035987 +-1.4514 these shadows 0.01894335 +-1.4514 these warnings 0.05332709 +-1.4514 these were -0.002554279 +-1.4514 they can 0.01894335 +-1.4514 they don't 0.05332709 +-1.4514 they expect 0.05332709 +-1.4514 they themselves 0.01894335 +-1.15037 things caught 0.05332709 +-1.15037 things that -0.008505944 +-0.8493397 think up 0.01894335 +-0.8493397 thinking through 0.05332709 +-2.05346 this analysis 0.05332709 +-2.05346 this article 0.008061528 +-2.05346 this bias 0.00272119 +-1.307811 this chronology 0.002721187 +-2.05346 this difference 0.01894335 +-1.307811 this is -0.004049861 +-2.05346 this method 0.00272119 +-2.05346 this morning 0.05332709 +-2.05346 this piece 0.05332709 +-2.05346 this refusal 0.05332709 +-2.05346 this study 0.01894335 +-2.05346 this subset 0.05332709 +-2.05346 this will 0.01894335 +-2.05346 this year 0.01894335 +-1.15037 those " -0.008505944 +-1.15037 those years 0.05332709 +-0.8493397 thought . 
-0.0114856 +-0.8493397 thousand , 0.05896524 +-0.8493397 through the -0.01198488 +-1.15037 time , -0.008075343 +-1.15037 time and -0.002266581 +-2.191762 to about 0.00272119 +-2.191762 to admit 0.05332709 +-2.191762 to archive 0.05332709 +-1.446113 to begin 0.05332709 +-2.191762 to change 0.05332709 +-2.191762 to constructing 0.05332709 +-2.191762 to control 0.01894335 +-2.191762 to day 0.01894335 +-2.191762 to different 0.01894335 +-2.191762 to get 0.2044696 +-2.191762 to mix 0.05332709 +-2.191762 to provide 0.05332709 +-2.191762 to start 0.05332709 +-1.123869 to the -0.005761562 +-2.191762 to think 0.05332709 +-2.191762 to those 0.01894335 +-1.446113 to what 0.005001867 +-1.15037 today . -0.0114856 +-1.15037 today would 0.00272119 +-0.8493397 took the -0.01198488 +-0.8493397 towards older 0.09388901 +-1.15037 trans b 0.05332709 +-1.15037 trans editors 0.05332709 +-1.4514 trees . -0.0114856 +-1.4514 trees an -0.0004517734 +-1.4514 trees described 0.05332709 +-1.4514 trees than 0.05332709 +-0.4047208 trouble with -0.03998877 +-0.8493397 true , -0.01187418 +-0.8493397 trying to -0.01011507 +-0.4047208 two versions 0.05332709 +-0.8493397 unarchived . -0.004888296 +-0.8493397 under control 0.01894335 +-0.8493397 unintentional bias 0.00272119 +-0.8493397 unrepresentativeness of 0.007685009 +-0.8493397 until recently 0.2044696 +-0.8493397 unveiled: humanity 0.05332709 +-1.15037 up a -0.01011507 +-1.15037 up the -0.01198488 +-0.8493397 upon this -0.009059753 +-0.4047208 use of -0.005627823 +-1.54831 used by -0.004049858 +-0.8833473 used in 0.01371272 +-1.54831 used the -0.01198488 +-1.15037 using . 0.08907277 +-1.15037 using the -0.008591395 +-0.8493397 usual predictable 0.05332709 +-0.8493397 valid reason 0.01894335 +-1.15037 variability . -0.004888296 +-1.15037 variability and -0.01063527 +-1.15037 versions . 
0.08907277 +-1.15037 versions is -0.008505944 +-1.15037 very hard 0.05332709 +-1.15037 very limited 0.05332709 +-0.8493397 violence unveiled: 0.05332709 +-0.8493397 virtually every 0.05332709 +-0.8493397 virtue of -0.009287588 +-0.8493397 voted for -0.002554279 +-0.8493397 warn against 0.01894335 +-0.8493397 warnings , -0.01187418 +-1.54831 was finally 0.008061528 +-1.54831 was ghostwritten 0.05332709 +-1.54831 was like 0.01894335 +-1.54831 was never 0.05332709 +-1.54831 was used 0.04797027 +-0.8493397 way slowly 0.01894335 +-0.8493397 we do 0.01894335 +-0.8493397 well have 0.04819728 +-1.627491 were not -0.006728992 +-1.627491 were right 0.00272119 +-0.881842 were selected 0.05332709 +-1.627491 were supplemented 0.05332709 +-1.627491 were the -0.01198488 +-1.694438 what a -0.01011507 +-1.694438 what did 0.01894335 +-1.694438 what happens 0.05332709 +-1.694438 what is -0.008505944 +-1.694438 what paul 0.05332709 +-1.694438 what the -0.007702977 +-1.694438 what will 0.01894335 +-0.8493397 what's your 0.01894335 +-1.326461 when combined 0.05332709 +-1.326461 when he -0.0004517734 +-1.326461 when i -0.006035987 +-1.326461 where it's 0.00272119 +-1.326461 where sensitivity 0.05332709 +-1.326461 where to 0.002916232 +-1.4514 which , -0.01187418 +-1.4514 which did 0.01894335 +-1.4514 which had 0.06152429 +-1.4514 which makes 0.05332709 +-1.326461 while including 0.01894335 +-1.326461 while looking 0.05332709 +-1.326461 while the 0.02129733 +-0.8493397 whose book 0.05332709 +-0.8493397 why schweingruber's 0.05332709 +-0.8493397 widths and -0.01063527 +-1.15037 will be 0.01894335 +-1.15037 will have -0.002554279 +-0.8493397 wise crack 0.01894335 +-1.890732 with . 
-0.004888296 +-1.890732 with a -0.01011507 +-1.890732 with briffa 0.02412629 +-1.890732 with its 0.05332709 +-1.145083 with obama 0.05332709 +-1.890732 with osborn 0.05332709 +-0.8228394 with the 0.02898683 +-0.8493397 within your 0.01894335 +-0.8493397 without fully 0.05332709 +-0.8493397 worth reading 0.05332709 +-1.4514 would do 0.01894335 +-0.7057508 would not -0.04287655 +-1.4514 would take 0.05332709 +-0.8493397 wright's church 0.05332709 +-0.8493397 wrote the -0.01198488 +-1.087467 yamal chronology 0.01075652 +-1.75243 yamal data -0.006035987 +-1.75243 yamal larch 0.05332709 +-1.75243 yamal measurement 0.2044696 +-1.75243 yamal reconstruction 0.05332709 +-1.75243 yamal subfossil 0.008061528 +-1.15037 year , -0.008075343 +-1.15037 year old 0.01894335 +-0.8493397 years ? 0.01894335 +-0.8493397 yes , -0.01187418 +-0.8493397 yesterday about 0.00272119 +-0.8493397 yet , 0.05896524 +-0.8493397 you see 0.008061528 +-1.15037 your great 0.2044696 +-1.15037 your power 0.05332709 + +\3-grams: +-1.533073 control ! as +-1.533073 know ! instead +-1.533073 . " i'd +-1.533073 ? " +-1.533073 a " divergence +-1.533073 concrete " ( +-1.533073 considered " success +-1.533073 large-scale " divergence +-1.533073 method " used +-1.533073 more " concrete +-1.533073 path " as +-1.834103 problem " - +-1.834103 problem " that +-1.533073 that " the +-1.533073 the " corridor +-1.533073 those " further +-1.533073 . ' +-1.533073 ' yes +-1.533073 " ( or +-1.533073 . ( while +-1.533073 al ( phil +-1.533073 data ( in +-1.533073 journal ( which +-1.533073 out ( and +-0.8145491 science ( mag +-1.533073 . 
) +-1.533073 2008 ) and +-1.533073 away ) , +-1.834103 mag ) acquiesced +-1.834103 mag ) took +-1.533073 policy ) had +-1.533073 real ) things +-1.533073 script ) , +-1.834103 ) , it's +-1.834103 ) , this +-1.533073 actually , all +-1.533073 ago , i +-1.533073 and , when +-1.533073 article , it +-1.533073 attention , but +-1.533073 available , this +-1.533073 average , of +-1.533073 b , 2008 +-1.533073 before , briffa +-1.533073 bias , when +-1.533073 blood , but +-1.533073 but , notwithstanding +-1.533073 chronology , 224 +-1.533073 consists , on +-1.533073 cores , this +-1.533073 debt , which +-1.533073 energy , but +-1.533073 factors , but +-1.533073 first , a +-1.533073 furthermore , it +-1.834103 however , as +-1.834103 however , using +-1.533073 idea , bob +-1.533073 including , most +-1.533073 inhomogeneities , but +-1.533073 is , it's +-1.834103 know , the +-1.834103 know , until +-1.533073 multi-parters , delete +-1.533073 nettle , requiring +-1.533073 next , i +-1.533073 now , but +-1.533073 obama , which +-1.533073 oldie , i +-1.533073 politics , he +-1.533073 reading , cf +-1.834103 recently , cru +-1.834103 recently , kaufman +-1.533073 relevant , and +-1.834103 river , while +-1.834103 river , yamal +-1.533073 see , the +-1.834103 series , from +-1.834103 series , where +-1.533073 slowly , is +-1.533073 study , including +-1.533073 surface , and +-1.533073 temperature , but +-1.533073 themselves , since +-1.533073 thousand , but +-1.533073 time , and +-1.533073 true , for +-1.533073 warnings , his +-1.533073 which , if +-1.533073 year , the +-1.533073 yes , perhaps +-1.533073 yet , but +-1.533073 " - not +-1.533073 2006 . while +-1.533073 2009 . +-1.533073 article . however +-1.533073 attention . first +-1.533073 bailie . i +-1.533073 cf . violence +-1.533073 cores . briffa +-1.533073 crossroads . ) +-1.834103 data . as +-1.834103 data . but +-1.533073 days . +-1.533073 difference . +-1.533073 forests . however +-1.533073 future . 
changing +-1.533073 growing . +-1.533073 idea . what's +-1.533073 jurisdiction . briffa +-1.533073 larches . +-1.533073 magnus . actually +-1.533073 method . this +-1.533073 naughtiness . ( +-1.533073 one . in +-1.533073 online . with +-1.533073 place . ' +-1.533073 politics . this +-1.834103 population . it +-1.834103 population . the +-1.533073 reconstruction . science +-1.533073 rev . wright's +-1.533073 right . what +-1.533073 selected . these +-1.533073 shadows . and +-1.533073 study . +-1.533073 success . " +-1.533073 supplement . +-1.533073 thought . furthermore +-1.533073 today . several +-1.533073 trees . perhaps +-1.533073 unarchived . a +-1.533073 using . +-1.533073 variability . these +-1.533073 versions . +-1.533073 with . a +-1.834103 the 12 cores +-1.834103 the 12 picked +-1.533073 of 17 ring-width +-2.010194 briffa 2000 and +-2.010194 briffa 2000 may +-2.010194 briffa 2000 presented +-1.533073 shiyatov 2002 as +-1.533073 briffa 2006 . +-1.533073 , 2008 ) +-1.533073 al 2009 . +-1.533073 from 200–400 year +-1.533073 , 224 individual +-1.533073 bob ? i +-1.533073 years ? " +-1.533073 , a comment +-1.834103 . a commenter +-1.834103 . a few +-1.533073 about a thousand +-1.533073 as a shadow +-1.533073 at a time +-1.533073 constructing a mean +-1.533073 delete a few +-1.533073 from a prior +-1.834103 had a different +-1.834103 had a great +-1.533073 has a " +-1.834103 in a case +-1.834103 in a science +-1.533073 of a similar +-1.834103 on a rcs +-1.834103 on a surface +-1.533073 provided a generating +-1.533073 taken a few +-1.533073 up a valid +-1.533073 what a provocative +-1.533073 with a small +-1.533073 concerned about potential +-1.533073 crack about not +-1.533073 to about a +-1.533073 yesterday about my +-1.533073 ) acquiesced in +-1.533073 . 
actually , +-1.533073 the addition of +-1.533073 to admit that +-1.533073 similarly affected the +-1.533073 struggling against flesh +-1.533073 warn against inhomogeneities +-1.533073 different aging patterns +-1.533073 days ago , +-1.533073 further ahead you +-1.834103 et al ( +-1.834103 et al 2009 +-1.533073 , all of +-1.533073 for all those +-1.533073 and all-around naughtiness +-1.533073 further along the +-1.533073 chronology also has +-1.533073 readers also know +-1.533073 has always been +-1.533073 is always worth +-1.533073 been an exception +-1.533073 for an extension +-1.533073 have an important +-1.533073 take an immense +-1.533073 trees an unintentional +-1.533073 this analysis has +-1.533073 ( and i've +-1.533073 ) and the +-2.010194 , and he +-2.010194 , and that +-2.010194 , and they +-1.533073 . and perhaps +-1.533073 2000 and science +-1.533073 ayers and sat +-1.533073 darkness and all-around +-1.533073 do and the +-1.533073 extension and , +-1.533073 flesh and blood +-0.1249387 hantemirov and shiyatov +-1.533073 illusion and outright +-1.533073 longest and most +-1.533073 osborn and briffa +-1.533073 paleoclimatologists and got +-1.533073 passing and it +-1.533073 possible and even +-1.533073 powers and principalities +-1.533073 religion and politics +-1.533073 size and potential +-1.533073 time and the +-1.533073 variability and hantemirov +-1.533073 widths and temperature +-1.533073 but anti-divine powers +-1.533073 in any journal +-1.533073 one approach to +-1.533073 to archive the +-1.533073 politics are to +-1.834103 the arkive down +-1.834103 the arkive under +-1.533073 journal article . +-1.533073 science article , +-1.533073 this article on +-1.533073 ! as it +-1.533073 " as a +-1.533073 , as ca +-1.533073 . 
as noted +-1.533073 2002 as follows: +-1.533073 as ca +-1.533073 population as compared +-1.533073 briffa asked for +-1.533073 few at a +-1.533073 humanity at the +-1.533073 push at precisely +-1.834103 my attention , +-1.834103 my attention . +-1.533073 finally available , +-1.533073 on average , +-1.533073 further away ) +-1.533073 bill ayers and +-1.533073 trans b , +-1.533073 the back-and-forth yesterday +-1.533073 gil bailie . +-1.533073 not be included +-1.533073 will be happening +-1.533073 but because so +-1.533073 always been an +-1.533073 had been projected +-1.834103 have been concerned +-1.834103 have been done +-1.533073 noted before , +-1.834103 to begin in +-1.834103 to begin with +-1.533073 of being true +-1.533073 but between the +-1.533073 difference between the +-1.533073 just between ring +-1.533073 potential bias introduced +-1.533073 same bias towards +-1.533073 this bias would +-1.533073 unintentional bias , +-1.533073 the biased selection +-1.533073 the biblical passage +-1.533073 by bill ayers +-1.533073 hs blade was +-1.533073 and blood , +-1.533073 , bob ? +-1.533073 whose book was +-1.533073 , briffa asked +-1.834103 . briffa 2000 +-1.834103 . briffa used +-1.533073 and briffa 2006 +-1.533073 chronology briffa et +-1.533073 in briffa 2000 +-1.533073 requiring briffa to +-1.533073 with briffa 2000 +-1.533073 briffa's own +-1.533073 in briffa's yamal +-2.487315 , but , +-2.487315 , but anti-divine +-2.487315 , but because +-2.487315 , but between +-1.467762 , but it +-2.487315 , but the +-2.487315 , but this +-2.487315 , but to +-1.533073 . 
but given +-1.533073 but it's +-1.533073 comment by magnus +-1.533073 ghostwritten by bill +-1.533073 introduced by how +-1.533073 measured by the +-1.533073 piece by gil +-1.533073 supplemented by the +-1.533073 used by hantemirov +-0.8145491 as ca readers +-1.533073 i can combine +-1.533073 they can see +-1.533073 a case where +-1.533073 that cast these +-1.533073 comments catch my +-1.533073 things caught my +-1.533073 own caveats on +-1.533073 preserve centennial-scale variability +-1.533073 , cf . +-1.533073 to change with +-1.533073 . changing what +-1.533073 that characterizes northern +-1.533073 i checked earlier +-1.533073 mean chronology , +-1.533073 rcs chronology method +-1.533073 the chronology briffa +-1.834103 this chronology also +-1.834103 this chronology in +-2.010194 yamal chronology has +-2.010194 yamal chronology was +-2.010194 yamal chronology with +-1.533073 wright's church for +-1.533073 crack cocaine for +-1.533073 subfossil collection does +-1.533073 in combination with +-1.533073 can combine the +-1.533073 when combined with +-1.533073 a comment by +-1.533073 of commentary on +-1.533073 a commenter remarked +-1.533073 his comments catch +-1.533073 as compared to +-1.533073 been concerned about +-1.533073 " concrete " +-1.533073 in connection with +-1.834103 the conservatives said +-1.834103 the conservatives were +-1.533073 is considered " +-1.533073 population consists , +-1.533073 to constructing a +-1.533073 to control the +-1.533073 under control ! +-1.533073 12 cores . +-1.533073 picked cores , +-1.533073 their cores were +-1.533073 " corridor method +-1.533073 like crack cocaine +-1.533073 wise crack about +-1.533073 the crossroads . +-1.533073 , cru staunchly +-0.9906404 the cru population +-2.010194 the cru selection +-1.533073 of darkness and +-1.533073 different data policy +-1.834103 measurement data remained +-1.834103 measurement data used +-1.533073 schweingruber data set +-1.533073 subfossil data . 
+-1.533073 taymir data ( +-1.533073 the data . +-1.533073 these data were +-1.533073 yamal data was +-1.533073 the day to +-1.533073 to day politics +-1.834103 few days . +-1.834103 few days ago +-1.533073 national debt , +-1.533073 precipitous decline is +-1.533073 happening deep into +-1.533073 of deeper principles +-1.533073 , delete a +-1.533073 is derived from +-1.533073 trees described in +-1.533073 what did they +-1.533073 which did not +-1.533073 the difference between +-1.533073 this difference . +-1.533073 a different data +-1.533073 to different aging +-0.8145491 " divergence problem +-1.533073 we do indeed +-1.533073 would do and +-1.533073 collection does not +-1.533073 is doing exactly +-1.533073 they don't really +-1.533073 been done without +-1.533073 no doubt what +-1.533073 arkive down to +-1.533073 not due just +-1.533073 checked earlier this +-1.533073 trans editors finally +-1.533073 immense energy , +-1.533073 its enormous hs +-1.533073 briffa et al +-1.533073 kaufman et al +-1.533073 and even probable +-1.533073 virtually every subsequent +-1.533073 doing exactly what +-1.533073 an exception to +-1.533073 for excluding khadyta +-1.533073 they expect from +-1.533073 an extension and +-1.533073 predictable factors , +-1.533073 outright fantasy had +-1.533073 the far more +-2.010194 a few at +-0.9906404 a few days +-1.533073 editors finally seized +-1.533073 information finally available +-1.533073 was finally placed +-1.533073 . first , +-1.533073 against flesh and +-1.533073 the following: +-1.533073 as follows: +-1.533073 , for we +-1.533073 asked for an +-1.533073 church for all +-1.533073 cocaine for paleoclimatologists +-1.533073 reason for excluding +-1.533073 voted for him +-1.533073 northern forests . +-1.533073 , from 200–400 +-1.533073 derived from a +-1.533073 expect from someone +-1.533073 without fully thinking +-1.533073 " further along +-1.533073 much further away +-1.533073 the further ahead +-1.533073 . 
furthermore , +-1.533073 the future . +-1.533073 a generating script +-1.533073 slowly get the +-1.533073 to get the +-1.533073 my ghastly tendency +-1.533073 was ghostwritten by +-1.533073 by gil bailie +-1.533073 but given the +-1.533073 not going to +-1.533073 and got used +-1.533073 a great idea +-1.533073 your great idea +-1.533073 keeps growing . +-1.533073 it grows more +-1.533073 ) had jurisdiction +-1.533073 fantasy had been +-1.533073 i had a +-1.533073 paul had in +-1.533073 which had a +-1.533073 and hantemirov and +-1.533073 by hantemirov and +-1.533073 in hantemirov and +-1.533073 be happening deep +-1.533073 is happening right +-1.533073 what happens today +-1.533073 very hard to +-1.533073 i hardly know +-1.533073 also has a +-1.533073 analysis has only +-1.533073 chronology has always +-1.533073 it has the +-1.533073 him hate to +-2.010194 not have been +-2.010194 not have similarly +-2.010194 not have the +-1.533073 that have relied +-1.533073 well have been +-1.533073 will have an +-1.533073 i haven't read +-1.533073 , he wrote +-1.533073 and he is +-1.533073 said he would +-1.533073 that he is +-1.533073 when he made +-1.533073 observed here prove +-1.533073 is highly possible +-1.533073 for him hate +-1.533073 into him to +-1.533073 , his initial +-1.533073 of his comments +-1.533073 that his precipitous +-1.533073 by how their +-0.8145491 . however , +-1.533073 enormous hs blade +-1.533073 unveiled: humanity at +-2.010194 , i can +-2.010194 , i noticed +-2.010194 , i skimmed +-1.533073 . i haven't +-1.533073 i hardly +-1.533073 ? i know +-1.533073 morning i had +-1.533073 passage i stumbled +-1.533073 when i checked +-1.533073 " i'd love +-1.533073 and i've provided +-1.834103 great idea , +-1.834103 great idea . +-1.533073 , if it +-1.533073 if the +-1.533073 much illusion and +-1.533073 an immense energy +-1.533073 important impact on +-1.533073 an important impact +-1.533073 ( in a +-1.533073 . 
in response +-1.533073 in one +-1.533073 acquiesced in this +-1.533073 begin in terms +-1.533073 chronology in passing +-1.533073 described in hantemirov +-1.533073 had in mind +-1.533073 patterns in the +-1.533073 principles in the +-1.533073 published in any +-1.533073 refusal in connection +-1.533073 sat in rev +-1.533073 set in combination +-1.533073 subset in briffa +-2.010194 used in a +-2.010194 used in briffa's +-2.010194 used in virtually +-1.533073 be included with +-1.533073 , including , +-1.533073 while including the +-1.533073 do indeed see +-1.533073 224 individual series +-1.533073 the information finally +-1.533073 against inhomogeneities , +-1.533073 his initial use +-1.533073 ! instead of +-1.533073 population instead of +-1.533073 of interannual variability +-1.533073 deep into the +-1.533073 projected into him +-1.533073 bias introduced by +-1.533073 , is considered +-1.533073 decline is not +-1.834103 he is always +-1.834103 he is doing +-2.010194 it is , +-2.010194 it is highly +-2.010194 it is within +-1.533073 obama is that +-1.533073 selection is derived +-1.533073 sensitivity is measured +-1.834103 this is no +-1.834103 this is the +-1.533073 versions is related +-1.533073 what is happening +-1.834103 , it has +-1.834103 , it originated +-1.533073 . it is +-1.533073 and it was +-1.533073 as it is +-1.834103 but it just +-1.834103 but it looks +-1.533073 if it grows +-1.533073 more it is +-1.533073 read it yet +-1.834103 , it's like +-1.834103 , it's very +-1.533073 but it's not +-1.533073 where it's much +-1.533073 with its enormous +-1.533073 any journal article +-1.533073 previous journal ( +-1.533073 had jurisdiction . +-1.533073 due just to +-1.533073 it just keeps +-1.533073 not just between +-1.533073 reposting just one +-1.533073 , kaufman et +-1.533073 just keeps growing +-1.533073 excluding khadyta river +-1.533073 schweingruber's khadyta river +-1.533073 also know , +-1.533073 hardly know where +-1.533073 i know ! 
+-1.533073 readers know , +-1.533073 yamal larch sample +-1.533073 living larches . +-1.533073 subfossil larches were +-1.533073 the large-scale " +-1.533073 it's like trying +-1.533073 was like crack +-1.533073 very limited size +-1.533073 old living larches +-1.533073 the longest and +-1.533073 while looking up +-1.533073 it looks relevant +-1.533073 i'd love to +-1.533073 he made that +-0.8145491 ( mag ) +-1.533073 the magnitude of +-1.533073 by magnus . +-1.533073 which makes the +-1.533073 on many multiproxy +-1.533073 2000 may well +-1.533073 a mean chronology +-1.533073 is measured by +-1.533073 the measurement data +-1.533073 yamal measurement data +-1.533073 chronology method that +-1.533073 corridor method " +-1.533073 rcs method . +-1.533073 this method which +-1.533073 rcs methodology warn +-1.533073 in mind when +-1.533073 to mix religion +-1.533073 far more " +-1.533073 grows more slowly +-1.533073 the more it +-1.533073 this morning i +-1.533073 , most recently +-1.533073 and most sensitive +-1.533073 the most recent +-1.533073 it's much further +-1.533073 so much illusion +-1.533073 the multi-parters , +-1.533073 many multiproxy studies +-1.533073 of mundane politics +-1.533073 about my ghastly +-1.533073 catch my attention +-1.533073 caught my attention +-1.533073 the national debt +-1.533073 all-around naughtiness . 
+-1.533073 the nettle , +-1.533073 was never properly +-1.533073 next , +-1.533073 is no doubt +-1.533073 the non-robustness observed +-1.533073 characterizes northern forests +-1.533073 - not just +-1.533073 about not struggling +-1.533073 did not preserve +-1.533073 does not have +-1.533073 is not due +-1.533073 it's not going +-1.533073 should not be +-1.533073 were not using +-0.8145491 would not have +-1.533073 as noted before +-1.533073 i noticed that +-1.533073 , notwithstanding these +-1.533073 right now , +-1.834103 with obama , +-1.834103 with obama is +-1.533073 non-robustness observed here +-1.533073 , of older +-1.533073 addition of 17 +-1.533073 all of his +-1.834103 instead of reposting +-1.834103 instead of the +-1.533073 magnitude of interannual +-1.533073 principalities of darkness +-1.533073 realm of mundane +-1.834103 selection of old +-1.834103 selection of older +-1.533073 series of subfossil +-1.533073 shadows of deeper +-1.533073 terms of commentary +-1.533073 unrepresentativeness of the +-1.834103 use of a +-1.834103 use of this +-1.533073 virtue of being +-1.533073 of old trees +-1.533073 year old living +-0.8145491 of older trees +-1.533073 towards older trees +-1.533073 one oldie , +-1.533073 , on average +-1.533073 article on the +-1.533073 caveats on rcs +-1.533073 commentary on this +-1.533073 impact on many +-1.533073 play on a +-1.533073 relied on this +-1.533073 selected on a +-1.533073 in one approach +-1.533073 just one oldie +-1.533073 recent one . +-1.533073 placed online . +-1.533073 has only taken +-1.533073 ( or real +-1.533073 it originated with +-1.533073 with osborn and +-1.533073 prove out ( +-1.533073 and outright fantasy +-1.533073 briffa's own caveats +-1.533073 for paleoclimatologists and +-1.533073 biblical passage i +-1.533073 in passing and +-1.533073 the path " +-1.533073 aging patterns in +-1.533073 what paul had +-1.533073 the people that +-1.533073 , perhaps the +-1.533073 . 
perhaps the +-1.834103 perhaps the +-1.834103 perhaps there's +-1.533073 and perhaps they +-1.533073 ( phil trans +-1.533073 the phil trans +-1.533073 12 picked cores +-1.533073 this piece by +-1.533073 right place . +-1.533073 finally placed online +-1.533073 shadow play on +-1.533073 the point that +-1.533073 data policy ) +-1.533073 and politics , +-1.533073 day politics are +-1.533073 mundane politics . +-1.834103 cru population . +-1.834103 cru population consists +-2.010194 schweingruber population . +-2.010194 schweingruber population as +-2.010194 schweingruber population instead +-1.533073 the position that +-1.533073 highly possible and +-1.533073 about potential bias +-1.533073 and potential unrepresentativeness +-1.533073 your power to +-1.533073 anti-divine powers and +-1.533073 his precipitous decline +-1.533073 at precisely the +-1.533073 usual predictable factors +-1.533073 2000 presented this +-1.533073 not preserve centennial-scale +-1.533073 the previous journal +-1.533073 and principalities of +-1.533073 deeper principles in +-1.533073 a prior selection +-1.533073 even probable that +-0.8145491 divergence problem " +-1.533073 been projected into +-1.533073 never properly published +-1.533073 here prove out +-1.533073 to provide the +-1.533073 i've provided a +-1.533073 a provocative thought +-1.533073 properly published in +-1.533073 small push at +-1.533073 a rcs chronology +-1.533073 on rcs methodology +-1.533073 the rcs method +-1.533073 really react to +-1.533073 haven't read it +-1.834103 ca readers also +-1.834103 ca readers know +-1.533073 worth reading , +-1.533073 or real ) +-1.533073 don't really react +-1.533073 the realm of +-1.533073 some reason why +-1.533073 valid reason for +-1.533073 most recent one +-1.533073 most recently , +-1.533073 until recently , +-1.533073 yamal reconstruction . 
+-1.533073 this refusal in +-1.533073 staunchly refused to +-1.533073 is related to +-1.533073 looks relevant , +-1.533073 have relied on +-1.533073 mix religion and +-1.533073 data remained unarchived +-1.533073 commenter remarked that +-1.533073 of reposting just +-1.533073 , requiring briffa +-1.533073 in response to +-1.533073 the resulting yamal +-1.533073 in rev . +-1.533073 happening right now +-1.834103 the right place +-1.834103 the right time +-1.533073 were right . +-1.533073 between ring widths +-1.533073 17 ring-width series +-0.8145491 khadyta river , +-1.533073 conservatives said he +-1.533073 the same bias +-1.533073 larch sample should +-1.533073 and sat in +-1.533073 similar schweingruber data +-0.1249387 the schweingruber population +-1.533073 why schweingruber's khadyta +-1.533073 . science ( +-1.533073 a science article +-1.533073 and science ( +-1.533073 generating script ) +-1.533073 can see the +-1.533073 indeed see the +-1.533073 you see , +-1.533073 finally seized the +-1.834103 were selected . +-1.834103 were selected on +-1.533073 biased selection of +-1.533073 cru selection is +-1.533073 prior selection of +-1.533073 most sensitive series +-1.533073 where sensitivity is +-1.533073 individual series of +-1.533073 ring-width series , +-1.533073 sensitive series , +-1.533073 data set in +-1.533073 . several things +-1.533073 a shadow play +-1.533073 the shadows of +-1.533073 these shadows . 
+-2.010194 and shiyatov 2002 +-2.010194 and shiyatov themselves +-2.010194 and shiyatov would +-1.533073 sample should not +-1.533073 a similar schweingruber +-1.533073 have similarly affected +-1.533073 , since this +-1.533073 limited size and +-1.533073 i skimmed this +-1.533073 more slowly , +-1.533073 way slowly get +-1.533073 a small push +-1.533073 because so much +-1.533073 there's some reason +-1.533073 from someone whose +-1.533073 to start today +-1.533073 cru staunchly refused +-1.533073 not struggling against +-1.533073 multiproxy studies that +-1.533073 subsequent study , +-1.533073 this study . +-1.533073 i stumbled upon +-1.533073 of subfossil larches +-1.533073 the subfossil collection +-1.533073 yamal subfossil data +-1.533073 every subsequent study +-1.533073 this subset in +-1.533073 " success . +-1.533073 taymir supplement . +-1.533073 were supplemented by +-1.533073 a surface , +-1.533073 would take an +-1.533073 only taken a +-1.834103 the taymir data +-1.834103 the taymir supplement +-1.533073 and temperature , +-1.533073 ghastly tendency to +-1.533073 in terms of +-1.533073 trees than the +-1.533073 " that characterizes +-1.533073 admit that the +-1.533073 and that way +-1.533073 is that he +-1.533073 made that wise +-1.533073 method that they +-1.533073 noticed that the +-1.533073 people that voted +-1.533073 point that his +-1.533073 position that the +-1.533073 probable that the +-1.533073 remarked that " +-1.533073 studies that have +-1.533073 things that cast +-1.533073 " the trouble +-2.010194 , the more +-2.010194 , the resulting +-2.010194 , the yamal +-1.533073 . 
the cru +-1.834103 the subfossil +-1.834103 the yamal +-1.533073 affected the " +-1.533073 along the path +-2.010194 and the people +-2.010194 and the phil +-2.010194 and the right +-1.533073 archive the data +-1.533073 at the crossroads +-0.8145491 between the two +-1.533073 but the further +-1.834103 by the addition +-1.834103 by the magnitude +-1.533073 combine the multi-parters +-1.533073 control the national +-0.8145491 get the arkive +-1.533073 given the use +-1.533073 has the virtue +-1.533073 have the same +-1.533073 if the non-robustness +-1.834103 in the realm +-1.834103 in the schweingruber +-1.533073 including the taymir +-1.533073 into the future +-1.533073 is the most +-1.533073 makes the point +-0.8145491 of the 12 +-1.533073 on the trouble +-2.010194 perhaps the biased +-2.010194 perhaps the day +-2.010194 perhaps the difference +-1.533073 precisely the right +-1.533073 provide the measurement +-1.834103 see the far +-1.834103 see the shadows +-1.533073 seized the nettle +-1.533073 than the schweingruber +-2.135133 that the conservatives +-2.135133 that the cru +-2.135133 that the previous +-2.135133 that the yamal +-1.533073 through the very +-2.135133 to the back-and-forth +-2.135133 to the cru +-2.135133 to the large-scale +-2.135133 to the usual +-1.533073 took the position +-1.533073 up the biblical +-1.533073 used the chronology +-1.533073 using the schweingruber +-1.533073 were the longest +-1.533073 what the conservatives +-1.533073 while the yamal +-2.135133 with the information +-2.135133 with the rcs +-2.135133 with the taymir +-2.135133 with the yamal +-1.533073 wrote the following: +-1.533073 how their cores +-1.533073 shiyatov themselves , +-1.533073 they themselves were +-1.533073 perhaps there's some +-1.834103 . these data +-1.834103 . 
these were +-1.533073 cast these shadows +-1.533073 notwithstanding these warnings +-1.533073 and they can +-1.533073 did they expect +-1.533073 perhaps they don't +-1.533073 that they themselves +-1.533073 ) things that +-1.533073 several things caught +-1.533073 to think up +-1.533073 fully thinking through +-2.010194 , this analysis +-2.010194 , this chronology +-2.010194 , this will +-1.834103 . this bias +-1.834103 . this is +-1.533073 this morning +-1.533073 but this is +-1.533073 earlier this year +-1.533073 in this refusal +-1.533073 of this subset +-1.834103 on this difference +-1.834103 on this study +-1.533073 presented this chronology +-1.533073 since this method +-1.533073 skimmed this article +-1.533073 upon this piece +-1.533073 all those years +-1.533073 to those " +-1.533073 provocative thought . +-1.533073 a thousand , +-1.533073 thinking through the +-1.533073 a time , +-1.533073 right time and +-1.533073 approach to constructing +-1.533073 are to those +-1.533073 briffa to archive +-1.533073 but to what +-1.533073 compared to the +-1.533073 day to day +-1.533073 down to about +-1.533073 exception to the +-1.533073 going to start +-1.533073 hard to think +-1.533073 hate to admit +-1.533073 him to begin +-1.533073 just to the +-1.533073 love to get +-1.533073 power to change +-1.533073 react to what +-1.533073 refused to provide +-1.533073 related to different +-1.533073 response to the +-1.533073 tendency to mix +-1.533073 trying to control +-1.533073 where to begin +-1.533073 happens today would +-1.533073 start today . +-1.533073 ) took the +-1.533073 bias towards older +-1.834103 phil trans b +-1.834103 phil trans editors +-1.533073 old trees described +-2.010194 older trees . +-2.010194 older trees an +-2.010194 older trees than +-0.8145491 the trouble with +-1.533073 being true , +-1.533073 like trying to +-0.8145491 the two versions +-1.533073 remained unarchived . 
+-1.533073 arkive under control +-1.533073 an unintentional bias +-1.533073 potential unrepresentativeness of +-1.533073 , until recently +-1.533073 violence unveiled: humanity +-1.533073 looking up the +-1.533073 think up a +-1.533073 stumbled upon this +-1.533073 initial use of +-1.533073 the use of +-1.533073 " used by +-1.533073 briffa used the +-1.533073 data used in +-1.533073 got used in +-1.533073 was used in +-1.533073 , using the +-1.533073 not using . +-1.533073 the usual predictable +-1.533073 a valid reason +-1.533073 centennial-scale variability and +-1.533073 interannual variability . +-1.834103 two versions . +-1.834103 two versions is +-1.533073 it's very hard +-1.533073 the very limited +-1.533073 . violence unveiled: +-1.533073 in virtually every +-1.533073 the virtue of +-1.533073 that voted for +-1.533073 methodology warn against +-1.533073 these warnings , +-1.533073 blade was like +-1.533073 book was ghostwritten +-1.533073 chronology was used +-1.533073 data was finally +-1.533073 it was never +-1.533073 that way slowly +-1.533073 for we do +-1.533073 may well have +-1.533073 conservatives were right +-1.533073 cores were selected +-1.533073 data were supplemented +-1.533073 larches were selected +-1.533073 themselves were not +-1.533073 these were the +-1.533073 . what did +-1.533073 what a +-1.533073 changing what happens +-1.533073 doubt what paul +-1.533073 exactly what the +-1.834103 to what is +-1.834103 to what will +-1.533073 . what's your +-1.834103 , when combined +-1.834103 , when i +-1.533073 mind when he +-1.533073 , where sensitivity +-1.533073 case where it's +-1.533073 know where to +-1.533073 ( which had +-1.834103 , which , +-1.834103 , which makes +-1.533073 method which did +-1.533073 ( while looking +-1.533073 , while including +-1.533073 . 
while the +-1.533073 someone whose book +-1.533073 reason why schweingruber's +-1.533073 ring widths and +-1.533073 this will have +-1.533073 what will be +-1.533073 that wise crack +-1.533073 . with the +-1.533073 begin with . +-1.533073 change with a +-1.533073 chronology with its +-1.533073 combination with the +-1.533073 combined with the +-1.533073 connection with osborn +-1.533073 included with the +-1.533073 originated with briffa +-0.8145491 trouble with obama +-1.533073 is within your +-1.533073 done without fully +-1.533073 always worth reading +-1.533073 bias would not +-1.533073 he would do +-1.533073 shiyatov would not +-1.533073 today would take +-1.533073 . wright's church +-1.533073 he wrote the +-1.533073 , yamal larch +-1.533073 briffa's yamal reconstruction +-1.533073 resulting yamal chronology +-1.212489 the yamal chronology +-2.232043 the yamal data +-2.232043 the yamal measurement +-2.232043 the yamal subfossil +-1.533073 200–400 year old +-1.533073 this year , +-1.533073 those years ? 
+-1.533073 ' yes , +-1.533073 back-and-forth yesterday about +-1.533073 it yet , +-1.533073 ahead you see +-1.533073 what's your great +-1.533073 within your power + +\end\ diff --git a/decoder/test_data/grammar.prune b/decoder/test_data/grammar.prune new file mode 100644 index 00000000..4ebcb509 --- /dev/null +++ b/decoder/test_data/grammar.prune @@ -0,0 +1,196 @@ +[PHRASE] ||| [PHRASE,1] haus ||| [PHRASE,1] house ||| 1.86183 0 0 0 0.0211892 +[PHRASE] ||| [PHRASE,1] haus ist ||| is [PHRASE,1] house ||| 2.58883 0.311249 0 0.348455 0.0211893 +[PHRASE] ||| [PHRASE,1] haus gibt ||| is [PHRASE,1] house ||| 2.56863 0.291046 0 0.258278 0.0211893 +[PHRASE] ||| [PHRASE,1] ein haus ist ||| [PHRASE,1] is a house ||| 3.16286 0 0 0.576934 0.0211893 +[PHRASE] ||| [PHRASE,1] ist ||| [PHRASE,1] is ||| 2.94101 0 0.676694 0.348455 0 +[PHRASE] ||| [PHRASE,1] ist ||| is [PHRASE,1] ||| 2.36698 0.649056 0.102662 0.348455 0 +[PHRASE] ||| [PHRASE,1] klein ist ||| [PHRASE,1] is small ||| 2.58883 0.124939 0 0.78211 0 +[PHRASE] ||| [PHRASE,1] maus ||| [PHRASE,1] mouse ||| 2.09592 0 0 0 0 +[PHRASE] ||| [PHRASE,1] maus gibt ||| is [PHRASE,1] mouse ||| 2.44865 0 0 0.258278 0 +[PHRASE] ||| [PHRASE,1] kleines ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.579784 0 +[PHRASE] ||| [PHRASE,1] kleines haus ||| [PHRASE,1] small house ||| 3.24204 0 0 0.579784 0.0211893 +[PHRASE] ||| [PHRASE,1] kleines haus gibt ||| is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893 +[PHRASE] ||| [PHRASE,1] kleine ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.500602 0 +[PHRASE] ||| [PHRASE,1] kleine maus ||| [PHRASE,1] small mouse ||| 3.24204 0 0 0.500602 0 +[PHRASE] ||| [PHRASE,1] kleine maus gibt ||| is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0 +[PHRASE] ||| [PHRASE,1] gelb ||| [PHRASE,1] yellow ||| 2.63998 0 0 0 0 +[PHRASE] ||| [PHRASE,1] gelb haus ||| [PHRASE,1] yellow house ||| 3.24204 0 0 0 0.0211893 +[PHRASE] ||| [PHRASE,1] gelb haus gibt ||| is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 
0.0211893 +[PHRASE] ||| [PHRASE,1] gelb maus ||| [PHRASE,1] yellow mouse ||| 3.24204 0 0 0 0 +[PHRASE] ||| [PHRASE,1] gelb maus gibt ||| is [PHRASE,1] yellow mouse ||| 3.30899 0 0 0.258278 0 +[PHRASE] ||| [PHRASE,1] gibt ||| is [PHRASE,1] ||| 1.82827 0.110339 0 0.258278 0 +[PHRASE] ||| haus ||| small yellow mouse house ||| 2.46389 0.845098 1.30103 0.278754 1.34341 +[PHRASE] ||| haus ||| house ||| Phrase_0=1.18514 Phrase_2=0.0222764 Phrase_4=0.0211893 +[PHRASE] ||| haus [PHRASE,1] ||| house [PHRASE,1] ||| 2.2878 0 0 0 0.0211893 +[PHRASE] ||| haus ist ||| house is ||| 2.46389 0 0 0.348455 0.0211893 +[PHRASE] ||| haus klein ist ||| house is small ||| 2.2878 0 0 0.78211 0.0211893 +[PHRASE] ||| ein ||| a ||| Phrase_0=1.34995 Phrase_1=0.228479 Phrase_3=0.228479 +[PHRASE] ||| ein [PHRASE,1] ||| a [PHRASE,1] ||| 2.03792 0.290035 0 0.228479 0 +[PHRASE] ||| ein [PHRASE,1] haus ||| a [PHRASE,1] house ||| 2.94101 0 0 0.228479 0.0211893 +[PHRASE] ||| ein [PHRASE,1] haus gibt ||| is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893 +[PHRASE] ||| ein [PHRASE,1] ist ||| is a [PHRASE,1] ||| 2.58883 0.535113 0 0.576934 0 +[PHRASE] ||| ein [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.56863 0.51491 0 0.486757 0 +[PHRASE] ||| ein haus ||| a house ||| 1.76492 0 0.0791813 0.228479 0.0211893 +[PHRASE] ||| ein haus ||| a small house ||| 2.46389 0.30103 0.778151 0.507233 1.34341 +[PHRASE] ||| ein haus ist ||| is a house ||| 2.76492 0.477121 0 0.576934 0.0211893 +[PHRASE] ||| ein haus gibt ||| is a house ||| 2.46389 0.176091 0.176091 0.486757 0.0211893 +[PHRASE] ||| ein haus gibt ||| is a small house ||| 2.76492 0.39794 0.477121 0.765511 1.34341 +[PHRASE] ||| ein kleines ||| a small ||| 1.86183 0.243038 0 0.808263 0 +[PHRASE] ||| ein kleines [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.808263 0 +[PHRASE] ||| ein kleines [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0 +[PHRASE] ||| ein kleines haus ||| a small house ||| 2.46389 0.30103 0 0.808263 
0.0211893 +[PHRASE] ||| ein kleines haus ist ||| is a small house ||| 2.76492 0.39794 0 1.15672 0.0211893 +[PHRASE] ||| ein kleines haus gibt ||| is a small house ||| 3.06595 0.69897 0 1.06654 0.0211893 +[PHRASE] ||| ein kleines gelb ||| a small yellow ||| 2.94101 0.30103 0 0.808263 0 +[PHRASE] ||| ein kleines gelb haus ||| a small yellow house ||| 3.24204 0 0 0.808263 0.0211893 +[PHRASE] ||| ein kleines gelb haus gibt ||| is a small yellow house ||| 3.30899 0 0 1.06654 0.0211893 +[PHRASE] ||| ein gelb ||| a yellow ||| 1.98677 0.221849 0 0.228479 0 +[PHRASE] ||| ein gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.228479 0 +[PHRASE] ||| ein gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0 +[PHRASE] ||| ein gelb haus ||| a yellow house ||| 2.63998 0 0 0.228479 0.0211893 +[PHRASE] ||| ein gelb haus ist ||| is a yellow house ||| 3.06595 0.30103 0 0.576934 0.0211893 +[PHRASE] ||| ein gelb haus gibt ||| is a yellow house ||| 3.06595 0.30103 0 0.486757 0.0211893 +[PHRASE] ||| ein gelb kleines ||| a yellow small ||| 2.94101 0.30103 0 0.808263 0 +[PHRASE] ||| ein gelb kleines haus ||| a yellow small house ||| 3.24204 0 0 0.808263 0.0211893 +[PHRASE] ||| ein gelb kleines haus gibt ||| is a yellow small house ||| 3.30899 0 0 1.06654 0.0211893 +[PHRASE] ||| ist ||| is ||| 1.34995 0.348455 0 0.348455 0 +[PHRASE] ||| klein ||| small ||| 1.61879 0.410174 0 0.433656 0 +[PHRASE] ||| klein [PHRASE,1] ||| [PHRASE,1] small ||| 3.06595 0.564271 0 0.433656 0 +[PHRASE] ||| klein [PHRASE,1] ist ||| [PHRASE,1] is small ||| 3.06595 0.60206 0 0.78211 0 +[PHRASE] ||| klein ist ||| is small ||| 1.68574 0 0 0.78211 0 +[PHRASE] ||| klein das [PHRASE,1] ||| the [PHRASE,1] small ||| 3.06595 0 0 0.433656 0.30103 +[PHRASE] ||| klein das haus ist ||| the house is small ||| 3.06595 0.477121 0 0.78211 0.322219 +[PHRASE] ||| maus ||| mouse ||| 1.50965 0 0 0 0 +[PHRASE] ||| maus [PHRASE,1] ||| mouse [PHRASE,1] ||| 2.94101 0 0 0 0 +[PHRASE] ||| maus 
[PHRASE,1] ist ||| mouse is [PHRASE,1] ||| 2.94101 0 0 0.348455 0 +[PHRASE] ||| maus ein haus ist ||| mouse is a house ||| 2.94101 0 0 0.576934 0.0211893 +[PHRASE] ||| kleines ||| small ||| 1.76492 0.556302 0 0.579784 0 +[PHRASE] ||| kleines [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.579784 0 +[PHRASE] ||| kleines haus ||| small house ||| 1.86183 0.243038 0 0.579784 0.0211893 +[PHRASE] ||| kleines gelb ||| small yellow ||| 2.46389 0.30103 0 0.579784 0 +[PHRASE] ||| kleines gelb haus ||| small yellow house ||| 2.94101 0 0 0.579784 0.0211893 +[PHRASE] ||| kleine ||| small ||| 1.68574 0.477121 0 0.500602 0 +[PHRASE] ||| kleine [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.500602 0 +[PHRASE] ||| kleine haus ||| small house ||| 2.16286 0.544068 0 0.500602 0.0211893 +[PHRASE] ||| kleine maus ||| small mouse ||| 1.98677 0 0 0.500602 0 +[PHRASE] ||| kleine gelb ||| small yellow ||| 2.46389 0.30103 0 0.500602 0 +[PHRASE] ||| kleine gelb maus ||| small yellow mouse ||| 2.94101 0 0 0.500602 0 +[PHRASE] ||| gelb ||| yellow ||| 1.61879 0 0 0 0 +[PHRASE] ||| gelb [PHRASE,1] ||| yellow [PHRASE,1] ||| 2.63998 0 0 0 0 +[PHRASE] ||| gelb haus ||| yellow house ||| 1.98677 0 0 0 0.0211893 +[PHRASE] ||| gelb maus ||| yellow mouse ||| 2.16286 0 0 0 0 +[PHRASE] ||| gelb kleines ||| yellow small ||| 2.46389 0.30103 0 0.579784 0 +[PHRASE] ||| gelb kleines haus ||| yellow small house ||| 2.94101 0 0 0.579784 0.0211893 +[PHRASE] ||| gelb kleine ||| yellow small ||| 2.46389 0.30103 0 0.500602 0 +[PHRASE] ||| gelb kleine maus ||| yellow small mouse ||| 2.94101 0 0 0.500602 0 +[PHRASE] ||| eine ||| a ||| 1.50965 0.38818 0 0.38818 0 +[PHRASE] ||| eine [PHRASE,1] ||| a [PHRASE,1] ||| 2.0602 0.312311 0 0.38818 0 +[PHRASE] ||| eine [PHRASE,1] maus ||| a [PHRASE,1] mouse ||| 2.94101 0 0 0.38818 0 +[PHRASE] ||| eine [PHRASE,1] maus gibt ||| is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0 +[PHRASE] ||| eine [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.44865 0.394934 0 0.646458 0 
+[PHRASE] ||| eine maus ||| a mouse ||| 1.98677 0 0 0.38818 0 +[PHRASE] ||| eine maus [PHRASE,1] ||| a mouse [PHRASE,1] ||| 3.16286 0 0 0.38818 0 +[PHRASE] ||| eine maus [PHRASE,1] ist ||| a mouse is [PHRASE,1] ||| 3.16286 0 0 0.736635 0 +[PHRASE] ||| eine maus ein haus ist ||| a mouse is a house ||| 3.16286 0 0 0.965114 0.0211893 +[PHRASE] ||| eine maus gibt ||| is a mouse ||| 2.46389 0 0 0.646458 0 +[PHRASE] ||| eine kleine ||| a small ||| 1.98677 0.367977 0 0.888783 0 +[PHRASE] ||| eine kleine [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.888783 0 +[PHRASE] ||| eine kleine [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0 +[PHRASE] ||| eine kleine maus ||| a small mouse ||| 2.63998 0 0 0.888783 0 +[PHRASE] ||| eine kleine maus gibt ||| is a small mouse ||| 2.76492 0 0 1.14706 0 +[PHRASE] ||| eine kleine gelb ||| a small yellow ||| 2.94101 0.30103 0 0.888783 0 +[PHRASE] ||| eine kleine gelb maus ||| a small yellow mouse ||| 3.24204 0 0 0.888783 0 +[PHRASE] ||| eine kleine gelb maus gibt ||| is a small yellow mouse ||| 3.30899 0 0 1.14706 0 +[PHRASE] ||| eine gelb ||| a yellow ||| 2.16286 0.39794 0 0.38818 0 +[PHRASE] ||| eine gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.38818 0 +[PHRASE] ||| eine gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0 +[PHRASE] ||| eine gelb maus ||| a yellow mouse ||| 2.94101 0 0 0.38818 0 +[PHRASE] ||| eine gelb maus gibt ||| is a yellow mouse ||| 3.06595 0 0 0.646458 0 +[PHRASE] ||| eine gelb kleine ||| a yellow small ||| 2.94101 0.30103 0 0.888783 0 +[PHRASE] ||| eine gelb kleine maus ||| a yellow small mouse ||| 3.24204 0 0 0.888783 0 +[PHRASE] ||| eine gelb kleine maus gibt ||| is a yellow small mouse ||| 3.30899 0 0 1.14706 0 +[PHRASE] ||| eine gruen ||| a green ||| 2.46389 0 0 0.38818 0 +[PHRASE] ||| eine gruen maus ||| a green mouse ||| 2.94101 0 0 0.38818 0 +[PHRASE] ||| gruen ||| green ||| 2.16286 0 0 0 0 +[PHRASE] ||| gruen maus ||| green 
mouse ||| 2.46389 0 0 0 0 +[PHRASE] ||| tages ||| day ||| 2.46389 0 0 0 0 +[PHRASE] ||| gibt ||| is ||| 1.25977 0.258278 0 0.258278 0 +[PHRASE] ||| meins ||| mine ||| 2.16286 0 0 0 0 +[PHRASE] ||| meins [PHRASE,1] ||| mine [PHRASE,1] ||| 2.76492 0 0 0 0 +[PHRASE] ||| meins ist ||| is mine ||| 2.46389 0 0 0.348455 0 +[PHRASE] ||| meins klein ist ||| mine is small ||| 2.76492 0 0 0.78211 0 +[PHRASE] ||| geld ||| money ||| 1.98677 0 0 0 0 +[PHRASE] ||| geld ist ||| is money ||| 2.46389 0.30103 0 0.348455 0 +[PHRASE] ||| geld gibt ||| is money ||| 2.46389 0.30103 0 0.258278 0 +[PHRASE] ||| keins ||| none ||| 1.98677 0 0 0 0 +[PHRASE] ||| keins [PHRASE,1] ||| none [PHRASE,1] ||| 2.76492 0 0 0 0 +[PHRASE] ||| keins klein ist ||| none is small ||| 2.76492 0 0 0.78211 0 +[PHRASE] ||| keins gibt ||| is none ||| 2.46389 0 0 0.258278 0 +[PHRASE] ||| dem haeuschen ||| of control ||| 2.46389 0 0 0.681241 0.425969 +[PHRASE] ||| eines ||| one ||| 2.46389 0.30103 0 0.30103 0 +[PHRASE] ||| eines tages ||| one day ||| 2.46389 0 0 0.30103 0 +[PHRASE] ||| eins ||| one ||| 2.46389 0.30103 0 0.30103 0 +[PHRASE] ||| aus ||| out ||| 2.46389 0 0.477121 0 0.221849 +[PHRASE] ||| aus ||| out of ||| 2.16286 0 0.176091 0.0791812 0.619789 +[PHRASE] ||| aus [PHRASE,1] ||| out [PHRASE,1] ||| 2.76492 0 0.367977 0 0.221849 +[PHRASE] ||| aus [PHRASE,1] ||| out of [PHRASE,1] ||| 2.63998 0 0.243038 0.0791812 0.619789 +[PHRASE] ||| aus ein ||| out of a ||| 2.46389 0 0 0.307661 0.619789 +[PHRASE] ||| aus ein haus ||| out of a house ||| 2.94101 0 0 0.307661 0.640978 +[PHRASE] ||| aus dem haeuschen ||| out of control ||| 2.76492 0 0 0.681241 0.647817 +[PHRASE] ||| aus das ||| out of the ||| 2.46389 0 0 0.0791812 0.920819 +[PHRASE] ||| aus das haus ||| out of the house ||| 2.94101 0 0 0.0791812 0.942008 +[PHRASE] ||| das ||| the ||| 1.76492 0 0.30103 0 0.30103 +[PHRASE] ||| das ||| that ||| 1.76492 0 0.30103 0 0.30103 +[PHRASE] ||| das [PHRASE,1] ||| the [PHRASE,1] ||| 2.39695 0 0.41972 0 0.30103 +[PHRASE] 
||| das [PHRASE,1] ||| that [PHRASE,1] ||| 2.18514 0 0.207913 0 0.30103 +[PHRASE] ||| das [PHRASE,1] haus ist ||| that is [PHRASE,1] house ||| 2.86183 0 0 0.348455 0.322219 +[PHRASE] ||| das [PHRASE,1] ist ||| that is [PHRASE,1] ||| 2.86183 0 0 0.348455 0.30103 +[PHRASE] ||| das haus ||| the house ||| 1.86183 0 0 0 0.322219 +[PHRASE] ||| das haus [PHRASE,1] ||| the house [PHRASE,1] ||| 2.76492 0 0 0 0.322219 +[PHRASE] ||| das haus ist ||| the house is ||| 2.94101 0 0 0.348455 0.322219 +[PHRASE] ||| das haus klein ist ||| the house is small ||| 2.76492 0.176091 0 0.78211 0.322219 +[PHRASE] ||| das ein [PHRASE,1] ist ||| that is a [PHRASE,1] ||| 2.86183 0 0 0.576934 0.30103 +[PHRASE] ||| das ein kleines haus ist ||| that is a small house ||| 3.16286 0 0 1.15672 0.322219 +[PHRASE] ||| das ein gelb haus ist ||| that is a yellow house ||| 3.16286 0 0 0.576934 0.322219 +[PHRASE] ||| das klein ist ||| that is small ||| 2.76492 0 0 0.78211 0.30103 +[PHRASE] ||| das kleine ||| the small ||| 2.46389 0 0 0.500602 0.30103 +[PHRASE] ||| das kleine haus ||| the small house ||| 2.94101 0 0 0.500602 0.322219 +[PHRASE] ||| das meins ist ||| that is mine ||| 2.76492 0 0 0.348455 0.30103 +[PHRASE] ||| das geld ist ||| that is money ||| 2.76492 0 0 0.348455 0.30103 +[PHRASE] ||| es ||| there ||| 1.25977 0 0 0 0 +[PHRASE] ||| es [PHRASE,1] ||| there [PHRASE,1] ||| 1.83672 0 0 0 0 +[PHRASE] ||| es [PHRASE,1] haus gibt ||| there is [PHRASE,1] house ||| 2.62775 0 0 0.258278 0.0211893 +[PHRASE] ||| es [PHRASE,1] maus gibt ||| there is [PHRASE,1] mouse ||| 2.5166 0 0 0.258278 0 +[PHRASE] ||| es [PHRASE,1] kleines haus gibt ||| there is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893 +[PHRASE] ||| es [PHRASE,1] kleine maus gibt ||| there is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0 +[PHRASE] ||| es [PHRASE,1] gelb haus gibt ||| there is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893 +[PHRASE] ||| es [PHRASE,1] gelb maus gibt ||| there is [PHRASE,1] yellow mouse 
||| 3.30899 0 0 0.258278 0 +[PHRASE] ||| es [PHRASE,1] gibt ||| there is [PHRASE,1] ||| 1.9536 0 0 0.258278 0 +[PHRASE] ||| es ein [PHRASE,1] haus gibt ||| there is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893 +[PHRASE] ||| es ein [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.62775 0.360151 0 0.486757 0 +[PHRASE] ||| es ein haus gibt ||| there is a house ||| 2.63998 0 0.176091 0.486757 0.0211893 +[PHRASE] ||| es ein haus gibt ||| there is a small house ||| 2.94101 0.20412 0.477121 0.765511 1.34341 +[PHRASE] ||| es ein kleines [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0 +[PHRASE] ||| es ein kleines haus gibt ||| there is a small house ||| 3.16286 0.425969 0 1.06654 0.0211893 +[PHRASE] ||| es ein gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0 +[PHRASE] ||| es ein gelb haus gibt ||| there is a yellow house ||| 3.16286 0 0 0.486757 0.0211893 +[PHRASE] ||| es eine [PHRASE,1] maus gibt ||| there is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0 +[PHRASE] ||| es eine [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.5166 0.249001 0 0.646458 0 +[PHRASE] ||| es eine maus gibt ||| there is a mouse ||| 2.63998 0 0 0.646458 0 +[PHRASE] ||| es eine kleine [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0 +[PHRASE] ||| es eine kleine maus gibt ||| there is a small mouse ||| 2.86183 0 0 1.14706 0 +[PHRASE] ||| es eine gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0 +[PHRASE] ||| es eine gelb maus gibt ||| there is a yellow mouse ||| 3.16286 0 0 0.646458 0 +[PHRASE] ||| es geld gibt ||| there is money ||| 2.76492 0 0 0.258278 0 +[PHRASE] ||| es keins gibt ||| there is none ||| 2.76492 0 0 0.258278 0 +[PHRASE] ||| dieses ||| this ||| 1.98677 0 0 0 0 +[PHRASE] ||| dieses [PHRASE,1] ||| this [PHRASE,1] ||| 2.56995 0 0 0 0 +[PHRASE] ||| dieses [PHRASE,1] haus ist ||| this is [PHRASE,1] house ||| 3.16286 0 0 0.348455 0.0211893 
+[PHRASE] ||| dieses [PHRASE,1] ist ||| this is [PHRASE,1] ||| 3.16286 0 0 0.348455 0 +[PHRASE] ||| dieses haus ||| this house ||| 2.46389 0 0 0 0.0211893 +[PHRASE] ||| dieses haus [PHRASE,1] ||| this house [PHRASE,1] ||| 3.06595 0 0 0 0.0211893 +[PHRASE] ||| dieses haus klein ist ||| this house is small ||| 3.06595 0 0 0.78211 0.0211893 +[PHRASE] ||| dieses ein [PHRASE,1] ist ||| this is a [PHRASE,1] ||| 3.16286 0 0 0.576934 0 +[PHRASE] ||| dieses ein kleines haus ist ||| this is a small house ||| 3.16286 0 0 1.15672 0.0211893 +[PHRASE] ||| dieses kleine ||| this small ||| 2.46389 0 0 0.500602 0 +[PHRASE] ||| dieses kleine haus ||| this small house ||| 2.94101 0 0 0.500602 0.0211893 diff --git a/decoder/test_data/small.json.gz b/decoder/test_data/small.json.gz new file mode 100644 index 00000000..892ba360 Binary files /dev/null and b/decoder/test_data/small.json.gz differ diff --git a/decoder/test_data/test_2gram.lm.gz b/decoder/test_data/test_2gram.lm.gz new file mode 100644 index 00000000..aafa7274 Binary files /dev/null and b/decoder/test_data/test_2gram.lm.gz differ diff --git a/decoder/test_data/weights b/decoder/test_data/weights new file mode 100644 index 00000000..ea70229c --- /dev/null +++ b/decoder/test_data/weights @@ -0,0 +1,8 @@ +# hiero +WordPenalty -0.387029 +LanguageModel 0.253195 +PhraseModel_0 0.142926 +PhraseModel_1 0.465119 +PhraseModel_2 0.079503 +CNPosteriorProbability 0.09259 +Inf -inf diff --git a/decoder/test_data/weights.gt b/decoder/test_data/weights.gt new file mode 100644 index 00000000..08931049 --- /dev/null +++ b/decoder/test_data/weights.gt @@ -0,0 +1,4 @@ +Phrase_0 1.0 +Phrase_1 0.5 +Phrase_2 0.3 +Phrase_3 0.2 diff --git a/decoder/timing_stats.cc b/decoder/timing_stats.cc new file mode 100644 index 00000000..85b95de5 --- /dev/null +++ b/decoder/timing_stats.cc @@ -0,0 +1,24 @@ +#include "timing_stats.h" + +#include + +using namespace std; + +map Timer::stats; + +Timer::Timer(const string& timername) : start_t(clock()), 
cur(stats[timername]) {} + +Timer::~Timer() { + ++cur.calls; + const clock_t end_t = clock(); + const double elapsed = (end_t - start_t) / 1000000.0; + cur.total_time += elapsed; +} + +void Timer::Summarize() { + for (map::iterator it = stats.begin(); it != stats.end(); ++it) { + cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n"; + } + stats.clear(); +} + diff --git a/decoder/timing_stats.h b/decoder/timing_stats.h new file mode 100644 index 00000000..0a9f7656 --- /dev/null +++ b/decoder/timing_stats.h @@ -0,0 +1,25 @@ +#ifndef _TIMING_STATS_H_ +#define _TIMING_STATS_H_ + +#include +#include + +struct TimerInfo { + int calls; + double total_time; + TimerInfo() : calls(), total_time() {} +}; + +struct Timer { + Timer(const std::string& info); + ~Timer(); + static void Summarize(); + private: + static std::map stats; + clock_t start_t; + TimerInfo& cur; + Timer(const Timer& other); + const Timer& operator=(const Timer& other); +}; + +#endif diff --git a/decoder/translator.h b/decoder/translator.h new file mode 100644 index 00000000..194efbaa --- /dev/null +++ b/decoder/translator.h @@ -0,0 +1,54 @@ +#ifndef _TRANSLATOR_H_ +#define _TRANSLATOR_H_ + +#include +#include +#include +#include + +class Hypergraph; +class SentenceMetadata; + +class Translator { + public: + virtual ~Translator(); + // returns true if goal reached, false otherwise + // minus_lm_forest will contain the unpruned forest. the + // feature values from the phrase table / grammar / etc + // should be in the forest already - the "late" features + // should not just copy values that are available without + // any context or computation. + // SentenceMetadata contains information about the sentence, + // but it is an input/output parameter since the Translator + // is also responsible for setting the value of src_len. 
+ virtual bool Translate(const std::string& src, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* minus_lm_forest) = 0; +}; + +class SCFGTranslatorImpl; +class SCFGTranslator : public Translator { + public: + SCFGTranslator(const boost::program_options::variables_map& conf); + bool Translate(const std::string& src, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* minus_lm_forest); + private: + boost::shared_ptr pimpl_; +}; + +class FSTTranslatorImpl; +class FSTTranslator : public Translator { + public: + FSTTranslator(const boost::program_options::variables_map& conf); + bool Translate(const std::string& src, + SentenceMetadata* smeta, + const std::vector& weights, + Hypergraph* minus_lm_forest); + private: + boost::shared_ptr pimpl_; +}; + +#endif diff --git a/decoder/trule.cc b/decoder/trule.cc new file mode 100644 index 00000000..b8f6995e --- /dev/null +++ b/decoder/trule.cc @@ -0,0 +1,237 @@ +#include "trule.h" + +#include + +#include "stringlib.h" +#include "tdict.h" + +using namespace std; + +static WordID ConvertTrgString(const string& w) { + int len = w.size(); + WordID id = 0; + // [X,0] or [0] + // for target rules, we ignore the category, just keep the index + if (len > 2 && w[0]=='[' && w[len-1]==']' && w[len-2] > '0' && w[len-2] <= '9' && + (len == 3 || (len > 4 && w[len-3] == ','))) { + id = w[len-2] - '0'; + id = 1 - id; + } else { + id = TD::Convert(w); + } + return id; +} + +static WordID ConvertSrcString(const string& w, bool mono = false) { + int len = w.size(); + // [X,0] + // for source rules, we keep the category and ignore the index (source rules are + // always numbered 1, 2, 3... + if (mono) { + if (len > 2 && w[0]=='[' && w[len-1]==']') { + if (len > 4 && w[len-3] == ',') { + cerr << "[ERROR] Monolingual rules mut not have non-terminal indices:\n " + << w << endl; + exit(1); + } + // TODO check that source indices go 1,2,3,etc. 
+ return TD::Convert(w.substr(1, len-2)) * -1; + } else { + return TD::Convert(w); + } + } else { + if (len > 4 && w[0]=='[' && w[len-1]==']' && w[len-3] == ',' && w[len-2] > '0' && w[len-2] <= '9') { + return TD::Convert(w.substr(1, len-4)) * -1; + } else { + return TD::Convert(w); + } + } +} + +static WordID ConvertLHS(const string& w) { + if (w[0] == '[') { + int len = w.size(); + if (len < 3) { cerr << "Format error: " << w << endl; exit(1); } + return TD::Convert(w.substr(1, len-2)) * -1; + } else { + return TD::Convert(w) * -1; + } +} + +TRule* TRule::CreateRuleSynchronous(const std::string& rule) { + TRule* res = new TRule; + if (res->ReadFromString(rule, true, false)) return res; + cerr << "[ERROR] Failed to creating rule from: " << rule << endl; + delete res; + return NULL; +} + +TRule* TRule::CreateRulePhrasetable(const string& rule) { + // TODO make this faster + // TODO add configuration for default NT type + if (rule[0] == '[') { + cerr << "Phrasetable rules shouldn't have a LHS / non-terminals:\n " << rule << endl; + return NULL; + } + TRule* res = new TRule("[X] ||| " + rule, true, false); + if (res->Arity() != 0) { + cerr << "Phrasetable rules should have arity 0:\n " << rule << endl; + delete res; + return NULL; + } + return res; +} + +TRule* TRule::CreateRuleMonolingual(const string& rule) { + return new TRule(rule, false, true); +} + +bool TRule::ReadFromString(const string& line, bool strict, bool mono) { + e_.clear(); + f_.clear(); + scores_.clear(); + + string w; + istringstream is(line); + int format = CountSubstrings(line, "|||"); + if (strict && format < 2) { + cerr << "Bad rule format in strict mode:\n" << line << endl; + return false; + } + if (format >= 2 || (mono && format == 1)) { + while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } + while(is>>w && w!="|||") { f_.push_back(ConvertSrcString(w, mono)); } + if (!mono) { + while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } + } + int fv = 0; + if (is) { + string ss; + 
getline(is, ss); + //cerr << "L: " << ss << endl; + int start = 0; + const int len = ss.size(); + while (start < len) { + while(start < len && (ss[start] == ' ' || ss[start] == ';')) + ++start; + if (start == len) break; + int end = start + 1; + while(end < len && (ss[end] != '=' && ss[end] != ' ' && ss[end] != ';')) + ++end; + if (end == len || ss[end] == ' ' || ss[end] == ';') { + //cerr << "PROC: '" << ss.substr(start, end - start) << "'\n"; + // non-named features + if (end != len) { ss[end] = 0; } + string fname = "PhraseModel_X"; + if (fv > 9) { cerr << "Too many phrasetable scores - used named format\n"; abort(); } + fname[12]='0' + fv; + ++fv; + scores_.set_value(FD::Convert(fname), atof(&ss[start])); + //cerr << "F: " << fname << " VAL=" << scores_.value(FD::Convert(fname)) << endl; + } else { + const int fid = FD::Convert(ss.substr(start, end - start)); + start = end + 1; + end = start + 1; + while(end < len && (ss[end] != ' ' && ss[end] != ';')) + ++end; + if (end < len) { ss[end] = 0; } + assert(start < len); + scores_.set_value(fid, atof(&ss[start])); + //cerr << "F: " << FD::Convert(fid) << " VAL=" << scores_.value(fid) << endl; + } + start = end + 1; + } + } + } else if (format == 1) { + while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } + while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } + f_ = e_; + int x = ConvertLHS("[X]"); + for (int i = 0; i < f_.size(); ++i) + if (f_[i] <= 0) { f_[i] = x; } + } else { + cerr << "F: " << format << endl; + cerr << "[ERROR] Don't know how to read:\n" << line << endl; + } + if (mono) { + e_ = f_; + int ci = 0; + for (int i = 0; i < e_.size(); ++i) + if (e_[i] < 0) + e_[i] = ci--; + } + ComputeArity(); + return SanityCheck(); +} + +bool TRule::SanityCheck() const { + vector used(f_.size(), 0); + int ac = 0; + for (int i = 0; i < e_.size(); ++i) { + int ind = e_[i]; + if (ind > 0) continue; + ind = -ind; + if ((++used[ind]) != 1) { + cerr << "[ERROR] e-side variable index " << (ind+1) << " used more 
than once!\n"; + return false; + } + ac++; + } + if (ac != Arity()) { + cerr << "[ERROR] e-side arity mismatches f-side\n"; + return false; + } + return true; +} + +void TRule::ComputeArity() { + int min = 1; + for (vector::const_iterator i = e_.begin(); i != e_.end(); ++i) + if (*i < min) min = *i; + arity_ = 1 - min; +} + +static string AnonymousStrVar(int i) { + string res("[v]"); + if(!(i <= 0 && i >= -8)) { + cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; + abort(); + } + res[1] = '1' - i; + return res; +} + +string TRule::AsString(bool verbose) const { + ostringstream os; + int idx = 0; + if (lhs_ && verbose) { + os << '[' << TD::Convert(lhs_ * -1) << "] |||"; + for (int i = 0; i < f_.size(); ++i) { + const WordID& w = f_[i]; + if (w < 0) { + int wi = w * -1; + ++idx; + os << " [" << TD::Convert(wi) << ',' << idx << ']'; + } else { + os << ' ' << TD::Convert(w); + } + } + os << " ||| "; + } + if (idx > 9) { + cerr << "Too many non-terminals!\n partial: " << os.str() << endl; + exit(1); + } + for (int i =0; i +#include +#include +#include + +#include "sparse_vector.h" +#include "wordid.h" + +class TRule; +typedef boost::shared_ptr TRulePtr; +struct SpanInfo; + +// Translation rule +class TRule { + public: + TRule() : lhs_(0), prev_i(-1), prev_j(-1) { } + explicit TRule(const std::vector& e) : e_(e), lhs_(0), prev_i(-1), prev_j(-1) {} + TRule(const std::vector& e, const std::vector& f, const WordID& lhs) : + e_(e), f_(f), lhs_(lhs), prev_i(-1), prev_j(-1) {} + + // deprecated - this will be private soon + explicit TRule(const std::string& text, bool strict = false, bool mono = false) { + ReadFromString(text, strict, mono); + } + + // make a rule from a hiero-like rule table, e.g. 
+ // [X] ||| [X,1] DE [X,2] ||| [X,2] of the [X,1] + // if misformatted, returns NULL + static TRule* CreateRuleSynchronous(const std::string& rule); + + // make a rule from a phrasetable entry (i.e., one that has no LHS type), e.g: + // el gato ||| the cat ||| Feature_2=0.34 + static TRule* CreateRulePhrasetable(const std::string& rule); + + // make a rule from a non-synchrnous CFG representation, e.g.: + // [LHS] ||| term1 [NT] term2 [OTHER_NT] [YET_ANOTHER_NT] + static TRule* CreateRuleMonolingual(const std::string& rule); + + void ESubstitute(const std::vector* >& var_values, + std::vector* result) const { + int vc = 0; + result->clear(); + for (std::vector::const_iterator i = e_.begin(); i != e_.end(); ++i) { + const WordID& c = *i; + if (c < 1) { + ++vc; + const std::vector& var_value = *var_values[-c]; + std::copy(var_value.begin(), + var_value.end(), + std::back_inserter(*result)); + } else { + result->push_back(c); + } + } + assert(vc == var_values.size()); + } + + void FSubstitute(const std::vector* >& var_values, + std::vector* result) const { + int vc = 0; + result->clear(); + for (std::vector::const_iterator i = f_.begin(); i != f_.end(); ++i) { + const WordID& c = *i; + if (c < 1) { + const std::vector& var_value = *var_values[vc++]; + std::copy(var_value.begin(), + var_value.end(), + std::back_inserter(*result)); + } else { + result->push_back(c); + } + } + assert(vc == var_values.size()); + } + + bool ReadFromString(const std::string& line, bool strict = false, bool monolingual = false); + + bool Initialized() const { return e_.size(); } + + std::string AsString(bool verbose = true) const; + + static TRule DummyRule() { + TRule res; + res.e_.resize(1, 0); + return res; + } + + const std::vector& f() const { return f_; } + const std::vector& e() const { return e_; } + + int EWords() const { return ELength() - Arity(); } + int FWords() const { return FLength() - Arity(); } + int FLength() const { return f_.size(); } + int ELength() const { return 
e_.size(); } + int Arity() const { return arity_; } + bool IsUnary() const { return (Arity() == 1) && (f_.size() == 1); } + const SparseVector& GetFeatureValues() const { return scores_; } + double Score(int i) const { return scores_[i]; } + WordID GetLHS() const { return lhs_; } + void ComputeArity(); + + // 0 = first variable, -1 = second variable, -2 = third ... + std::vector e_; + // < 0: *-1 = encoding of category of variable + std::vector f_; + WordID lhs_; + SparseVector scores_; + char arity_; + TRulePtr parent_rule_; // usually NULL, except when doing constrained decoding + + // this is only used when doing synchronous parsing + short int prev_i; + short int prev_j; + + private: + bool SanityCheck() const; +}; + +#endif diff --git a/decoder/trule_test.cc b/decoder/trule_test.cc new file mode 100644 index 00000000..02a70764 --- /dev/null +++ b/decoder/trule_test.cc @@ -0,0 +1,65 @@ +#include "trule.h" + +#include +#include +#include +#include "tdict.h" + +using namespace std; + +class TRuleTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(TRuleTest,TestFSubstitute) { + TRule r1("[X] ||| ob [X,1] [X,2] sah . ||| whether [X,1] saw [X,2] . 
||| 0.99"); + TRule r2("[X] ||| ich ||| i ||| 1.0"); + TRule r3("[X] ||| ihn ||| him ||| 1.0"); + vector*> ants; + vector res2; + r2.FSubstitute(ants, &res2); + assert(TD::GetString(res2) == "ich"); + vector res3; + r3.FSubstitute(ants, &res3); + assert(TD::GetString(res3) == "ihn"); + ants.push_back(&res2); + ants.push_back(&res3); + vector res; + r1.FSubstitute(ants, &res); + cerr << TD::GetString(res) << endl; + assert(TD::GetString(res) == "ob ich ihn sah ."); +} + +TEST_F(TRuleTest,TestPhrasetableRule) { + TRulePtr t(TRule::CreateRulePhrasetable("gato ||| cat ||| PhraseModel_0=-23.2;Foo=1;Bar=12")); + cerr << t->AsString() << endl; + assert(t->scores_.num_active() == 3); +}; + + +TEST_F(TRuleTest,TestMonoRule) { + TRulePtr m(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3]")); + assert(m->Arity() == 3); + cerr << m->AsString() << endl; + TRulePtr m2(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3] ||| Feature1=0.23")); + assert(m2->Arity() == 3); + cerr << m2->AsString() << endl; + EXPECT_FLOAT_EQ(m2->scores_.value(FD::Convert("Feature1")), 0.23); +} + +TEST_F(TRuleTest,TestRuleR) { + TRule t6; + t6.ReadFromString("[X] ||| den [X,1] sah [X,2] . ||| [X,2] saw the [X,1] . 
||| 0.12321 0.23232 0.121"); + cerr << "TEXT: " << t6.AsString() << endl; + EXPECT_EQ(t6.Arity(), 2); + EXPECT_EQ(t6.e_[0], -1); + EXPECT_EQ(t6.e_[3], 0); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/decoder/ttables.cc b/decoder/ttables.cc new file mode 100644 index 00000000..2ea960f0 --- /dev/null +++ b/decoder/ttables.cc @@ -0,0 +1,31 @@ +#include "ttables.h" + +#include + +#include "dict.h" + +using namespace std; +using namespace std::tr1; + +void TTable::DeserializeProbsFromText(std::istream* in) { + int c = 0; + while(*in) { + string e; + string f; + double p; + (*in) >> e >> f >> p; + if (e.empty()) break; + ++c; + ttable[TD::Convert(e)][TD::Convert(f)] = prob_t(p); + } + cerr << "Loaded " << c << " translation parameters.\n"; +} + +void TTable::SerializeHelper(string* out, const Word2Word2Double& o) { + assert(!"not implemented"); +} + +void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) { + assert(!"not implemented"); +} + diff --git a/decoder/ttables.h b/decoder/ttables.h new file mode 100644 index 00000000..3ffc238a --- /dev/null +++ b/decoder/ttables.h @@ -0,0 +1,87 @@ +#ifndef _TTABLES_H_ +#define _TTABLES_H_ + +#include +#include + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +class TTable { + public: + TTable() {} + typedef std::map Word2Double; + typedef std::map Word2Word2Double; + inline const prob_t prob(const int& e, const int& f) const { + const Word2Word2Double::const_iterator cit = ttable.find(e); + if (cit != ttable.end()) { + const Word2Double& cpd = cit->second; + const Word2Double::const_iterator it = cpd.find(f); + if (it == cpd.end()) return prob_t(0.00001); + return prob_t(it->second); + } else { + return prob_t(0.00001); + } + } + inline void Increment(const int& e, const int& f) { + counts[e][f] += 1.0; + } + inline void Increment(const int& e, const int& f, double x) { + counts[e][f] += x; + } + void Normalize() { + 
ttable.swap(counts); + for (Word2Word2Double::iterator cit = ttable.begin(); + cit != ttable.end(); ++cit) { + double tot = 0; + Word2Double& cpd = cit->second; + for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) + tot += it->second; + for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) + it->second /= tot; + } + counts.clear(); + } + // adds counts from another TTable - probabilities remain unchanged + TTable& operator+=(const TTable& rhs) { + for (Word2Word2Double::const_iterator it = rhs.counts.begin(); + it != rhs.counts.end(); ++it) { + const Word2Double& cpd = it->second; + Word2Double& tgt = counts[it->first]; + for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { + tgt[j->first] += j->second; + } + } + return *this; + } + void ShowTTable() { + for (Word2Word2Double::iterator it = ttable.begin(); it != ttable.end(); ++it) { + Word2Double& cpd = it->second; + for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) { + std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; + } + } + } + void ShowCounts() { + for (Word2Word2Double::iterator it = counts.begin(); it != counts.end(); ++it) { + Word2Double& cpd = it->second; + for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) { + std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; + } + } + } + void DeserializeProbsFromText(std::istream* in); + void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); } + void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); } + void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); } + void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); } + private: + static void SerializeHelper(std::string*, const Word2Word2Double& o); + static void DeserializeHelper(const std::string&, Word2Word2Double* o); + 
public: + Word2Word2Double ttable; + Word2Word2Double counts; +}; + +#endif diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc new file mode 100644 index 00000000..82b2ce6d --- /dev/null +++ b/decoder/viterbi.cc @@ -0,0 +1,39 @@ +#include "viterbi.h" + +#include +#include "hg.h" + +using namespace std; + +string ViterbiETree(const Hypergraph& hg) { + vector tmp; + const prob_t p = Viterbi, ETreeTraversal, prob_t, EdgeProb>(hg, &tmp); + return TD::GetString(tmp); +} + +string ViterbiFTree(const Hypergraph& hg) { + vector tmp; + const prob_t p = Viterbi, FTreeTraversal, prob_t, EdgeProb>(hg, &tmp); + return TD::GetString(tmp); +} + +prob_t ViterbiESentence(const Hypergraph& hg, vector* result) { + return Viterbi, ESentenceTraversal, prob_t, EdgeProb>(hg, result); +} + +prob_t ViterbiFSentence(const Hypergraph& hg, vector* result) { + return Viterbi, FSentenceTraversal, prob_t, EdgeProb>(hg, result); +} + +int ViterbiELength(const Hypergraph& hg) { + int len = -1; + Viterbi(hg, &len); + return len; +} + +int ViterbiPathLength(const Hypergraph& hg) { + int len = -1; + Viterbi(hg, &len); + return len; +} + diff --git a/decoder/viterbi.h b/decoder/viterbi.h new file mode 100644 index 00000000..46a4f528 --- /dev/null +++ b/decoder/viterbi.h @@ -0,0 +1,130 @@ +#ifndef _VITERBI_H_ +#define _VITERBI_H_ + +#include +#include "prob.h" +#include "hg.h" +#include "tdict.h" + +// V must implement: +// void operator()(const vector& ants, T* result); +template +WeightType Viterbi(const Hypergraph& hg, + T* result, + const Traversal& traverse = Traversal(), + const WeightFunction& weight = WeightFunction()) { + const int num_nodes = hg.nodes_.size(); + std::vector vit_result(num_nodes); + std::vector vit_weight(num_nodes, WeightType::Zero()); + + for (int i = 0; i < num_nodes; ++i) { + const Hypergraph::Node& cur_node = hg.nodes_[i]; + WeightType* const cur_node_best_weight = &vit_weight[i]; + T* const cur_node_best_result = &vit_result[i]; + + const int num_in_edges = 
cur_node.in_edges_.size(); + if (num_in_edges == 0) { + *cur_node_best_weight = WeightType(1); + continue; + } + for (int j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; + WeightType score = weight(edge); + std::vector ants(edge.tail_nodes_.size()); + for (int k = 0; k < edge.tail_nodes_.size(); ++k) { + const int tail_node_index = edge.tail_nodes_[k]; + score *= vit_weight[tail_node_index]; + ants[k] = &vit_result[tail_node_index]; + } + if (*cur_node_best_weight < score) { + *cur_node_best_weight = score; + traverse(edge, ants, cur_node_best_result); + } + } + } + std::swap(*result, vit_result.back()); + return vit_weight.back(); +} + +struct PathLengthTraversal { + void operator()(const Hypergraph::Edge& edge, + const std::vector& ants, + int* result) const { + (void) edge; + *result = 1; + for (int i = 0; i < ants.size(); ++i) *result += *ants[i]; + } +}; + +struct ESentenceTraversal { + void operator()(const Hypergraph::Edge& edge, + const std::vector*>& ants, + std::vector* result) const { + edge.rule_->ESubstitute(ants, result); + } +}; + +struct ELengthTraversal { + void operator()(const Hypergraph::Edge& edge, + const std::vector& ants, + int* result) const { + *result = edge.rule_->ELength() - edge.rule_->Arity(); + for (int i = 0; i < ants.size(); ++i) *result += *ants[i]; + } +}; + +struct FSentenceTraversal { + void operator()(const Hypergraph::Edge& edge, + const std::vector*>& ants, + std::vector* result) const { + edge.rule_->FSubstitute(ants, result); + } +}; + +// create a strings of the form (S (X the man) (X said (X he (X would (X go))))) +struct ETreeTraversal { + ETreeTraversal() : left("("), space(" "), right(")") {} + const std::string left; + const std::string space; + const std::string right; + void operator()(const Hypergraph::Edge& edge, + const std::vector*>& ants, + std::vector* result) const { + std::vector tmp; + edge.rule_->ESubstitute(ants, &tmp); + const std::string cat = 
TD::Convert(edge.rule_->GetLHS() * -1); + if (cat == "Goal") + result->swap(tmp); + else + TD::ConvertSentence(left + cat + space + TD::GetString(tmp) + right, + result); + } +}; + +struct FTreeTraversal { + FTreeTraversal() : left("("), space(" "), right(")") {} + const std::string left; + const std::string space; + const std::string right; + void operator()(const Hypergraph::Edge& edge, + const std::vector*>& ants, + std::vector* result) const { + std::vector tmp; + edge.rule_->FSubstitute(ants, &tmp); + const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1); + if (cat == "Goal") + result->swap(tmp); + else + TD::ConvertSentence(left + cat + space + TD::GetString(tmp) + right, + result); + } +}; + +prob_t ViterbiESentence(const Hypergraph& hg, std::vector* result); +std::string ViterbiETree(const Hypergraph& hg); +prob_t ViterbiFSentence(const Hypergraph& hg, std::vector* result); +std::string ViterbiFTree(const Hypergraph& hg); +int ViterbiELength(const Hypergraph& hg); +int ViterbiPathLength(const Hypergraph& hg); + +#endif diff --git a/decoder/weights.cc b/decoder/weights.cc new file mode 100644 index 00000000..bb0a878f --- /dev/null +++ b/decoder/weights.cc @@ -0,0 +1,73 @@ +#include "weights.h" + +#include + +#include "fdict.h" +#include "filelib.h" + +using namespace std; + +void Weights::InitFromFile(const std::string& filename, vector* feature_list) { + cerr << "Reading weights from " << filename << endl; + ReadFile in_file(filename); + istream& in = *in_file.stream(); + assert(in); + int weight_count = 0; + bool fl = false; + while (in) { + double val = 0; + string buf; + getline(in, buf); + if (buf.size() == 0) continue; + if (buf[0] == '#') continue; + for (int i = 0; i < buf.size(); ++i) + if (buf[i] == '=') buf[i] = ' '; + int start = 0; + while(start < buf.size() && buf[start] == ' ') ++start; + int end = 0; + while(end < buf.size() && buf[end] != ' ') ++end; + int fid = FD::Convert(buf.substr(start, end - start)); + while(end < buf.size() 
&& buf[end] == ' ') ++end; + val = strtod(&buf.c_str()[end], NULL); + if (wv_.size() <= fid) + wv_.resize(fid + 1); + wv_[fid] = val; + if (feature_list) { feature_list->push_back(FD::Convert(fid)); } + ++weight_count; + if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } + if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } + } + if (fl) { cerr << endl; } + cerr << "Loaded " << weight_count << " feature weights\n"; +} + +void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const { + WriteFile out(fname); + ostream& o = *out.stream(); + assert(o); + o.precision(17); + const int num_feats = FD::NumFeats(); + for (int i = 1; i < num_feats; ++i) { + const double val = (i < wv_.size() ? wv_[i] : 0.0); + if (hide_zero_value_features && val == 0.0) continue; + o << FD::Convert(i) << ' ' << val << endl; + } +} + +void Weights::InitVector(std::vector* w) const { + *w = wv_; +} + +void Weights::InitSparseVector(SparseVector* w) const { + for (int i = 1; i < wv_.size(); ++i) { + const double& weight = wv_[i]; + if (weight) w->set_value(i, weight); + } +} + +void Weights::InitFromVector(const std::vector& w) { + wv_ = w; + if (wv_.size() > FD::NumFeats()) + cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; + wv_.resize(FD::NumFeats(), 0); +} diff --git a/decoder/weights.h b/decoder/weights.h new file mode 100644 index 00000000..f19aa3ce --- /dev/null +++ b/decoder/weights.h @@ -0,0 +1,21 @@ +#ifndef _WEIGHTS_H_ +#define _WEIGHTS_H_ + +#include +#include +#include +#include "sparse_vector.h" + +class Weights { + public: + Weights() {} + void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); + void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const; + void InitVector(std::vector* w) const; + void InitSparseVector(SparseVector* w) const; + void InitFromVector(const std::vector& w); + private: 
+ std::vector wv_; +}; + +#endif diff --git a/decoder/weights_test.cc b/decoder/weights_test.cc new file mode 100644 index 00000000..aa6b3db2 --- /dev/null +++ b/decoder/weights_test.cc @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include +#include "weights.h" +#include "tdict.h" +#include "hg.h" + +using namespace std; + +class WeightsTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + + +TEST_F(WeightsTest,Load) { + Weights w; + w.InitFromFile("test_data/weights"); + w.WriteToFile("-"); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/decoder/wordid.h b/decoder/wordid.h new file mode 100644 index 00000000..fb50bcc1 --- /dev/null +++ b/decoder/wordid.h @@ -0,0 +1,6 @@ +#ifndef _WORD_ID_H_ +#define _WORD_ID_H_ + +typedef int WordID; + +#endif diff --git a/src/JSON_parser.c b/src/JSON_parser.c deleted file mode 100644 index 175b7cc9..00000000 --- a/src/JSON_parser.c +++ /dev/null @@ -1,1012 +0,0 @@ -/* JSON_parser.c */ - -/* 2007-08-24 */ - -/* -Copyright (c) 2005 JSON.org - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -The Software shall be used for Good, not Evil. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* - Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009. - - For the added features the license above applies also. - - Changelog: - 2009-05-17 - Incorporated benrudiak@googlemail.com fix for UTF16 decoding. - - 2009-05-14 - Fixed float parsing bug related to a locale being set that didn't - use '.' as decimal point character (charles@transmissionbt.com). - - 2008-10-14 - Renamed states.IN to states.IT to avoid name clash which IN macro - defined in windef.h (alexey.pelykh@gmail.com) - - 2008-07-19 - Removed some duplicate code & debugging variable (charles@transmissionbt.com) - - 2008-05-28 - Made JSON_value structure ansi C compliant. This bug was report by - trisk@acm.jhu.edu - - 2008-05-20 - Fixed bug reported by charles@transmissionbt.com where the switching - from static to dynamic parse buffer did not copy the static parse - buffer's content. 
-*/ - - - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "JSON_parser.h" - -#ifdef _MSC_VER -# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */ -# pragma warning(disable:4996) // unsecure sscanf -# endif -#endif - - -#define true 1 -#define false 0 -#define __ -1 /* the universal error code */ - -/* values chosen so that the object size is approx equal to one page (4K) */ -#ifndef JSON_PARSER_STACK_SIZE -# define JSON_PARSER_STACK_SIZE 128 -#endif - -#ifndef JSON_PARSER_PARSE_BUFFER_SIZE -# define JSON_PARSER_PARSE_BUFFER_SIZE 3500 -#endif - -typedef unsigned short UTF16; - -struct JSON_parser_struct { - JSON_parser_callback callback; - void* ctx; - signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; - UTF16 utf16_high_surrogate; - long depth; - long top; - signed char* stack; - long stack_capacity; - char decimal_point; - char* parse_buffer; - size_t parse_buffer_capacity; - size_t parse_buffer_count; - size_t comment_begin_offset; - signed char static_stack[JSON_PARSER_STACK_SIZE]; - char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; -}; - -#define COUNTOF(x) (sizeof(x)/sizeof(x[0])) - -/* - Characters are mapped into these character classes. This allows for - a significant reduction in the size of the state transition table. -*/ - - - -enum classes { - C_SPACE, /* space */ - C_WHITE, /* other whitespace */ - C_LCURB, /* { */ - C_RCURB, /* } */ - C_LSQRB, /* [ */ - C_RSQRB, /* ] */ - C_COLON, /* : */ - C_COMMA, /* , */ - C_QUOTE, /* " */ - C_BACKS, /* \ */ - C_SLASH, /* / */ - C_PLUS, /* + */ - C_MINUS, /* - */ - C_POINT, /* . 
*/ - C_ZERO , /* 0 */ - C_DIGIT, /* 123456789 */ - C_LOW_A, /* a */ - C_LOW_B, /* b */ - C_LOW_C, /* c */ - C_LOW_D, /* d */ - C_LOW_E, /* e */ - C_LOW_F, /* f */ - C_LOW_L, /* l */ - C_LOW_N, /* n */ - C_LOW_R, /* r */ - C_LOW_S, /* s */ - C_LOW_T, /* t */ - C_LOW_U, /* u */ - C_ABCDF, /* ABCDF */ - C_E, /* E */ - C_ETC, /* everything else */ - C_STAR, /* * */ - NR_CLASSES -}; - -static int ascii_class[128] = { -/* - This array maps the 128 ASCII characters into character classes. - The remaining Unicode characters should be mapped to C_ETC. - Non-whitespace control characters are errors. -*/ - __, __, __, __, __, __, __, __, - __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __, - __, __, __, __, __, __, __, __, - __, __, __, __, __, __, __, __, - - C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, - C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, - C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - - C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, - C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, - C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, - - C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, - C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, - C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, - C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC -}; - - -/* - The state codes. 
-*/ -enum states { - GO, /* start */ - OK, /* ok */ - OB, /* object */ - KE, /* key */ - CO, /* colon */ - VA, /* value */ - AR, /* array */ - ST, /* string */ - ES, /* escape */ - U1, /* u1 */ - U2, /* u2 */ - U3, /* u3 */ - U4, /* u4 */ - MI, /* minus */ - ZE, /* zero */ - IT, /* integer */ - FR, /* fraction */ - E1, /* e */ - E2, /* ex */ - E3, /* exp */ - T1, /* tr */ - T2, /* tru */ - T3, /* true */ - F1, /* fa */ - F2, /* fal */ - F3, /* fals */ - F4, /* false */ - N1, /* nu */ - N2, /* nul */ - N3, /* null */ - C1, /* / */ - C2, /* / * */ - C3, /* * */ - FX, /* *.* *eE* */ - D1, /* second UTF-16 character decoding started by \ */ - D2, /* second UTF-16 character proceeded by u */ - NR_STATES -}; - -enum actions -{ - CB = -10, /* comment begin */ - CE = -11, /* comment end */ - FA = -12, /* false */ - TR = -13, /* false */ - NU = -14, /* null */ - DE = -15, /* double detected by exponent e E */ - DF = -16, /* double detected by fraction . */ - SB = -17, /* string begin */ - MX = -18, /* integer detected by minus */ - ZX = -19, /* integer detected by zero */ - IX = -20, /* integer detected by 1-9 */ - EX = -21, /* next char is escaped */ - UC = -22 /* Unicode character read */ -}; - - -static int state_transition_table[NR_STATES][NR_CLASSES] = { -/* - The state transition table takes the current state and the current symbol, - and returns either a new state or an action. An action is represented as a - negative number. A JSON text is accepted if at the end of the text the - state is OK and if the mode is MODE_DONE. - - white 1-9 ABCDF etc - space | { } [ ] : , " \ / + - . 
0 | a b c d e f l n r s t u | E | * */ -/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, -/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, -/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST}, -/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__}, -/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__}, -/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__}, -/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__}, -/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__}, -/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__}, -/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, -/*e E1*/ 
{__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__}, -/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__}, -/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, -/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__}, -/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__}, -/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, -/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__}, -/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__}, -/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__}, -/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2}, -/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, -/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, -/*_. 
FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, -/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, -/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__}, -}; - - -/* - These modes can be pushed on the stack. -*/ -enum modes { - MODE_ARRAY = 1, - MODE_DONE = 2, - MODE_KEY = 3, - MODE_OBJECT = 4 -}; - -static int -push(JSON_parser jc, int mode) -{ -/* - Push a mode onto the stack. Return false if there is overflow. -*/ - jc->top += 1; - if (jc->depth < 0) { - if (jc->top >= jc->stack_capacity) { - size_t bytes_to_allocate; - jc->stack_capacity *= 2; - bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]); - if (jc->stack == &jc->static_stack[0]) { - jc->stack = (signed char*)malloc(bytes_to_allocate); - memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack)); - } else { - jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate); - } - } - } else { - if (jc->top >= jc->depth) { - return false; - } - } - - jc->stack[jc->top] = mode; - return true; -} - - -static int -pop(JSON_parser jc, int mode) -{ -/* - Pop the stack, assuring that the current mode matches the expectation. - Return false if there is underflow or if the modes mismatch. 
-*/ - if (jc->top < 0 || jc->stack[jc->top] != mode) { - return false; - } - jc->top -= 1; - return true; -} - - -#define parse_buffer_clear(jc) \ - do {\ - jc->parse_buffer_count = 0;\ - jc->parse_buffer[0] = 0;\ - } while (0) - -#define parse_buffer_pop_back_char(jc)\ - do {\ - assert(jc->parse_buffer_count >= 1);\ - --jc->parse_buffer_count;\ - jc->parse_buffer[jc->parse_buffer_count] = 0;\ - } while (0) - -void delete_JSON_parser(JSON_parser jc) -{ - if (jc) { - if (jc->stack != &jc->static_stack[0]) { - free((void*)jc->stack); - } - if (jc->parse_buffer != &jc->static_parse_buffer[0]) { - free((void*)jc->parse_buffer); - } - free((void*)jc); - } -} - - -JSON_parser -new_JSON_parser(JSON_config* config) -{ -/* - new_JSON_parser starts the checking process by constructing a JSON_parser - object. It takes a depth parameter that restricts the level of maximum - nesting. - - To continue the process, call JSON_parser_char for each character in the - JSON text, and then call JSON_parser_done to obtain the final result. - These functions are fully reentrant. -*/ - - int depth = 0; - JSON_config default_config; - - JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct)); - - memset(jc, 0, sizeof(*jc)); - - - /* initialize configuration */ - init_JSON_config(&default_config); - - /* set to default configuration if none was provided */ - if (config == NULL) { - config = &default_config; - } - - depth = config->depth; - - /* We need to be able to push at least one object */ - if (depth == 0) { - depth = 1; - } - - jc->state = GO; - jc->top = -1; - - /* Do we want non-bound stack? 
*/ - if (depth > 0) { - jc->stack_capacity = depth; - jc->depth = depth; - if (depth <= (int)COUNTOF(jc->static_stack)) { - jc->stack = &jc->static_stack[0]; - } else { - jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0])); - } - } else { - jc->stack_capacity = COUNTOF(jc->static_stack); - jc->depth = -1; - jc->stack = &jc->static_stack[0]; - } - - /* set parser to start */ - push(jc, MODE_DONE); - - /* set up the parse buffer */ - jc->parse_buffer = &jc->static_parse_buffer[0]; - jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer); - parse_buffer_clear(jc); - - /* set up callback, comment & float handling */ - jc->callback = config->callback; - jc->ctx = config->callback_ctx; - jc->allow_comments = config->allow_comments != 0; - jc->handle_floats_manually = config->handle_floats_manually != 0; - - /* set up decimal point */ - jc->decimal_point = *localeconv()->decimal_point; - - return jc; -} - -static void grow_parse_buffer(JSON_parser jc) -{ - size_t bytes_to_allocate; - jc->parse_buffer_capacity *= 2; - bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]); - if (jc->parse_buffer == &jc->static_parse_buffer[0]) { - jc->parse_buffer = (char*)malloc(bytes_to_allocate); - memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count); - } else { - jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate); - } -} - -#define parse_buffer_push_back_char(jc, c)\ - do {\ - if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\ - jc->parse_buffer[jc->parse_buffer_count++] = c;\ - jc->parse_buffer[jc->parse_buffer_count] = 0;\ - } while (0) - -#define assert_is_non_container_type(jc) \ - assert( \ - jc->type == JSON_T_NULL || \ - jc->type == JSON_T_FALSE || \ - jc->type == JSON_T_TRUE || \ - jc->type == JSON_T_FLOAT || \ - jc->type == JSON_T_INTEGER || \ - jc->type == JSON_T_STRING) - - -static int parse_parse_buffer(JSON_parser jc) -{ - if (jc->callback) { 
- JSON_value value, *arg = NULL; - - if (jc->type != JSON_T_NONE) { - assert_is_non_container_type(jc); - - switch(jc->type) { - case JSON_T_FLOAT: - arg = &value; - if (jc->handle_floats_manually) { - value.vu.str.value = jc->parse_buffer; - value.vu.str.length = jc->parse_buffer_count; - } else { - /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/ - - /* not checking with end pointer b/c there may be trailing ws */ - value.vu.float_value = strtold(jc->parse_buffer, NULL); - } - break; - case JSON_T_INTEGER: - arg = &value; - sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); - break; - case JSON_T_STRING: - arg = &value; - value.vu.str.value = jc->parse_buffer; - value.vu.str.length = jc->parse_buffer_count; - break; - } - - if (!(*jc->callback)(jc->ctx, jc->type, arg)) { - return false; - } - } - } - - parse_buffer_clear(jc); - - return true; -} - -#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) -#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00) -#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) -static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; - -static int decode_unicode_char(JSON_parser jc) -{ - int i; - unsigned uc = 0; - char* p; - int trail_bytes; - - assert(jc->parse_buffer_count >= 6); - - p = &jc->parse_buffer[jc->parse_buffer_count - 4]; - - for (i = 12; i >= 0; i -= 4, ++p) { - unsigned x = *p; - - if (x >= 'a') { - x -= ('a' - 10); - } else if (x >= 'A') { - x -= ('A' - 10); - } else { - x &= ~0x30u; - } - - assert(x < 16); - - uc |= x << i; - } - - /* clear UTF-16 char from buffer */ - jc->parse_buffer_count -= 6; - jc->parse_buffer[jc->parse_buffer_count] = 0; - - /* attempt decoding ... 
*/ - if (jc->utf16_high_surrogate) { - if (IS_LOW_SURROGATE(uc)) { - uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc); - trail_bytes = 3; - jc->utf16_high_surrogate = 0; - } else { - /* high surrogate without a following low surrogate */ - return false; - } - } else { - if (uc < 0x80) { - trail_bytes = 0; - } else if (uc < 0x800) { - trail_bytes = 1; - } else if (IS_HIGH_SURROGATE(uc)) { - /* save the high surrogate and wait for the low surrogate */ - jc->utf16_high_surrogate = uc; - return true; - } else if (IS_LOW_SURROGATE(uc)) { - /* low surrogate without a preceding high surrogate */ - return false; - } else { - trail_bytes = 2; - } - } - - jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); - - for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { - jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80); - } - - jc->parse_buffer[jc->parse_buffer_count] = 0; - - return true; -} - -static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char) -{ - jc->escaped = 0; - /* remove the backslash */ - parse_buffer_pop_back_char(jc); - switch(next_char) { - case 'b': - parse_buffer_push_back_char(jc, '\b'); - break; - case 'f': - parse_buffer_push_back_char(jc, '\f'); - break; - case 'n': - parse_buffer_push_back_char(jc, '\n'); - break; - case 'r': - parse_buffer_push_back_char(jc, '\r'); - break; - case 't': - parse_buffer_push_back_char(jc, '\t'); - break; - case '"': - parse_buffer_push_back_char(jc, '"'); - break; - case '\\': - parse_buffer_push_back_char(jc, '\\'); - break; - case '/': - parse_buffer_push_back_char(jc, '/'); - break; - case 'u': - parse_buffer_push_back_char(jc, '\\'); - parse_buffer_push_back_char(jc, 'u'); - break; - default: - return false; - } - - return true; -} - -#define add_char_to_parse_buffer(jc, next_char, next_class) \ - do { \ - if (jc->escaped) { \ - if (!add_escaped_char_to_parse_buffer(jc, next_char)) \ - return false; \ - } else if 
(!jc->comment) { \ - if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \ - parse_buffer_push_back_char(jc, (char)next_char); \ - } \ - } \ - } while (0) - - -#define assert_type_isnt_string_null_or_bool(jc) \ - assert(jc->type != JSON_T_FALSE); \ - assert(jc->type != JSON_T_TRUE); \ - assert(jc->type != JSON_T_NULL); \ - assert(jc->type != JSON_T_STRING) - - -int -JSON_parser_char(JSON_parser jc, int next_char) -{ -/* - After calling new_JSON_parser, call this function for each character (or - partial character) in your JSON text. It can accept UTF-8, UTF-16, or - UTF-32. It returns true if things are looking ok so far. If it rejects the - text, it returns false. -*/ - int next_class, next_state; - -/* - Determine the character's class. -*/ - if (next_char < 0) { - return false; - } - if (next_char >= 128) { - next_class = C_ETC; - } else { - next_class = ascii_class[next_char]; - if (next_class <= __) { - return false; - } - } - - add_char_to_parse_buffer(jc, next_char, next_class); - -/* - Get the next state from the state transition table. -*/ - next_state = state_transition_table[jc->state][next_class]; - if (next_state >= 0) { -/* - Change the state. -*/ - jc->state = next_state; - } else { -/* - Or perform one of the actions. 
-*/ - switch (next_state) { -/* Unicode character */ - case UC: - if(!decode_unicode_char(jc)) { - return false; - } - /* check if we need to read a second UTF-16 char */ - if (jc->utf16_high_surrogate) { - jc->state = D1; - } else { - jc->state = ST; - } - break; -/* escaped char */ - case EX: - jc->escaped = 1; - jc->state = ES; - break; -/* integer detected by minus */ - case MX: - jc->type = JSON_T_INTEGER; - jc->state = MI; - break; -/* integer detected by zero */ - case ZX: - jc->type = JSON_T_INTEGER; - jc->state = ZE; - break; -/* integer detected by 1-9 */ - case IX: - jc->type = JSON_T_INTEGER; - jc->state = IT; - break; - -/* floating point number detected by exponent*/ - case DE: - assert_type_isnt_string_null_or_bool(jc); - jc->type = JSON_T_FLOAT; - jc->state = E1; - break; - -/* floating point number detected by fraction */ - case DF: - assert_type_isnt_string_null_or_bool(jc); - if (!jc->handle_floats_manually) { -/* - Some versions of strtod (which underlies sscanf) don't support converting - C-locale formated floating point values. 
-*/ - assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.'); - jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point; - } - jc->type = JSON_T_FLOAT; - jc->state = FX; - break; -/* string begin " */ - case SB: - parse_buffer_clear(jc); - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_STRING; - jc->state = ST; - break; - -/* n */ - case NU: - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_NULL; - jc->state = N1; - break; -/* f */ - case FA: - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_FALSE; - jc->state = F1; - break; -/* t */ - case TR: - assert(jc->type == JSON_T_NONE); - jc->type = JSON_T_TRUE; - jc->state = T1; - break; - -/* closing comment */ - case CE: - jc->comment = 0; - assert(jc->parse_buffer_count == 0); - assert(jc->type == JSON_T_NONE); - jc->state = jc->before_comment_state; - break; - -/* opening comment */ - case CB: - if (!jc->allow_comments) { - return false; - } - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - assert(jc->parse_buffer_count == 0); - assert(jc->type != JSON_T_STRING); - switch (jc->stack[jc->top]) { - case MODE_ARRAY: - case MODE_OBJECT: - switch(jc->state) { - case VA: - case AR: - jc->before_comment_state = jc->state; - break; - default: - jc->before_comment_state = OK; - break; - } - break; - default: - jc->before_comment_state = jc->state; - break; - } - jc->type = JSON_T_NONE; - jc->state = C1; - jc->comment = 1; - break; -/* empty } */ - case -9: - parse_buffer_clear(jc); - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { - return false; - } - if (!pop(jc, MODE_KEY)) { - return false; - } - jc->state = OK; - break; - -/* } */ case -8: - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { - return false; - } - if (!pop(jc, MODE_OBJECT)) { - return false; - } - jc->type = JSON_T_NONE; - jc->state = OK; - break; - -/* ] */ 
case -7: - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) { - return false; - } - if (!pop(jc, MODE_ARRAY)) { - return false; - } - - jc->type = JSON_T_NONE; - jc->state = OK; - break; - -/* { */ case -6: - parse_buffer_pop_back_char(jc); - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) { - return false; - } - if (!push(jc, MODE_KEY)) { - return false; - } - assert(jc->type == JSON_T_NONE); - jc->state = OB; - break; - -/* [ */ case -5: - parse_buffer_pop_back_char(jc); - if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) { - return false; - } - if (!push(jc, MODE_ARRAY)) { - return false; - } - assert(jc->type == JSON_T_NONE); - jc->state = AR; - break; - -/* string end " */ case -4: - parse_buffer_pop_back_char(jc); - switch (jc->stack[jc->top]) { - case MODE_KEY: - assert(jc->type == JSON_T_STRING); - jc->type = JSON_T_NONE; - jc->state = CO; - - if (jc->callback) { - JSON_value value; - value.vu.str.value = jc->parse_buffer; - value.vu.str.length = jc->parse_buffer_count; - if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) { - return false; - } - } - parse_buffer_clear(jc); - break; - case MODE_ARRAY: - case MODE_OBJECT: - assert(jc->type == JSON_T_STRING); - if (!parse_parse_buffer(jc)) { - return false; - } - jc->type = JSON_T_NONE; - jc->state = OK; - break; - default: - return false; - } - break; - -/* , */ case -3: - parse_buffer_pop_back_char(jc); - if (!parse_parse_buffer(jc)) { - return false; - } - switch (jc->stack[jc->top]) { - case MODE_OBJECT: -/* - A comma causes a flip from object mode to key mode. 
-*/ - if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { - return false; - } - assert(jc->type != JSON_T_STRING); - jc->type = JSON_T_NONE; - jc->state = KE; - break; - case MODE_ARRAY: - assert(jc->type != JSON_T_STRING); - jc->type = JSON_T_NONE; - jc->state = VA; - break; - default: - return false; - } - break; - -/* : */ case -2: -/* - A colon causes a flip from key mode to object mode. -*/ - parse_buffer_pop_back_char(jc); - if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { - return false; - } - assert(jc->type == JSON_T_NONE); - jc->state = VA; - break; -/* - Bad action. -*/ - default: - return false; - } - } - return true; -} - - -int -JSON_parser_done(JSON_parser jc) -{ - const int result = jc->state == OK && pop(jc, MODE_DONE); - - return result; -} - - -int JSON_parser_is_legal_white_space_string(const char* s) -{ - int c, char_class; - - if (s == NULL) { - return false; - } - - for (; *s; ++s) { - c = *s; - - if (c < 0 || c >= 128) { - return false; - } - - char_class = ascii_class[c]; - - if (char_class != C_SPACE && char_class != C_WHITE) { - return false; - } - } - - return true; -} - - - -void init_JSON_config(JSON_config* config) -{ - if (config) { - memset(config, 0, sizeof(*config)); - - config->depth = JSON_PARSER_STACK_SIZE - 1; - } -} diff --git a/src/JSON_parser.h b/src/JSON_parser.h deleted file mode 100644 index ceb5b24b..00000000 --- a/src/JSON_parser.h +++ /dev/null @@ -1,152 +0,0 @@ -#ifndef JSON_PARSER_H -#define JSON_PARSER_H - -/* JSON_parser.h */ - - -#include - -/* Windows DLL stuff */ -#ifdef _WIN32 -# ifdef JSON_PARSER_DLL_EXPORTS -# define JSON_PARSER_DLL_API __declspec(dllexport) -# else -# define JSON_PARSER_DLL_API __declspec(dllimport) -# endif -#else -# define JSON_PARSER_DLL_API -#endif - -/* Determine the integer type use to parse non-floating point numbers */ -#if __STDC_VERSION__ >= 199901L || HAVE_LONG_LONG == 1 -typedef long long JSON_int_t; -#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%lld" -#define 
JSON_PARSER_INTEGER_SPRINTF_TOKEN "%lld" -#else -typedef long JSON_int_t; -#define JSON_PARSER_INTEGER_SSCANF_TOKEN "%ld" -#define JSON_PARSER_INTEGER_SPRINTF_TOKEN "%ld" -#endif - - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum -{ - JSON_T_NONE = 0, - JSON_T_ARRAY_BEGIN, // 1 - JSON_T_ARRAY_END, // 2 - JSON_T_OBJECT_BEGIN, // 3 - JSON_T_OBJECT_END, // 4 - JSON_T_INTEGER, // 5 - JSON_T_FLOAT, // 6 - JSON_T_NULL, // 7 - JSON_T_TRUE, // 8 - JSON_T_FALSE, // 9 - JSON_T_STRING, // 10 - JSON_T_KEY, // 11 - JSON_T_MAX // 12 -} JSON_type; - -typedef struct JSON_value_struct { - union { - JSON_int_t integer_value; - - long double float_value; - - struct { - const char* value; - size_t length; - } str; - } vu; -} JSON_value; - -typedef struct JSON_parser_struct* JSON_parser; - -/*! \brief JSON parser callback - - \param ctx The pointer passed to new_JSON_parser. - \param type An element of JSON_type but not JSON_T_NONE. - \param value A representation of the parsed value. This parameter is NULL for - JSON_T_ARRAY_BEGIN, JSON_T_ARRAY_END, JSON_T_OBJECT_BEGIN, JSON_T_OBJECT_END, - JSON_T_NULL, JSON_T_TRUE, and SON_T_FALSE. String values are always returned - as zero-terminated C strings. - - \return Non-zero if parsing should continue, else zero. -*/ -typedef int (*JSON_parser_callback)(void* ctx, int type, const struct JSON_value_struct* value); - - -/*! \brief The structure used to configure a JSON parser object - - \param depth If negative, the parser can parse arbitrary levels of JSON, otherwise - the depth is the limit - \param Pointer to a callback. This parameter may be NULL. In this case the input is merely checked for validity. - \param Callback context. This parameter may be NULL. - \param depth. Specifies the levels of nested JSON to allow. Negative numbers yield unlimited nesting. - \param allowComments. To allow C style comments in JSON, set to non-zero. - \param handleFloatsManually. 
To decode floating point numbers manually set this parameter to non-zero. - - \return The parser object. -*/ -typedef struct { - JSON_parser_callback callback; - void* callback_ctx; - int depth; - int allow_comments; - int handle_floats_manually; -} JSON_config; - - -/*! \brief Initializes the JSON parser configuration structure to default values. - - The default configuration is - - 127 levels of nested JSON (depends on JSON_PARSER_STACK_SIZE, see json_parser.c) - - no parsing, just checking for JSON syntax - - no comments - - \param config. Used to configure the parser. -*/ -JSON_PARSER_DLL_API void init_JSON_config(JSON_config* config); - -/*! \brief Create a JSON parser object - - \param config. Used to configure the parser. Set to NULL to use the default configuration. - See init_JSON_config - - \return The parser object. -*/ -JSON_PARSER_DLL_API extern JSON_parser new_JSON_parser(JSON_config* config); - -/*! \brief Destroy a previously created JSON parser object. */ -JSON_PARSER_DLL_API extern void delete_JSON_parser(JSON_parser jc); - -/*! \brief Parse a character. - - \return Non-zero, if all characters passed to this function are part of are valid JSON. -*/ -JSON_PARSER_DLL_API extern int JSON_parser_char(JSON_parser jc, int next_char); - -/*! \brief Finalize parsing. - - Call this method once after all input characters have been consumed. - - \return Non-zero, if all parsed characters are valid JSON, zero otherwise. -*/ -JSON_PARSER_DLL_API extern int JSON_parser_done(JSON_parser jc); - -/*! \brief Determine if a given string is valid JSON white space - - \return Non-zero if the string is valid, zero otherwise. 
-*/ -JSON_PARSER_DLL_API extern int JSON_parser_is_legal_white_space_string(const char* s); - - -#ifdef __cplusplus -} -#endif - - -#endif /* JSON_PARSER_H */ diff --git a/src/Makefile.am b/src/Makefile.am deleted file mode 100644 index 4d0459ef..00000000 --- a/src/Makefile.am +++ /dev/null @@ -1,69 +0,0 @@ -bin_PROGRAMS = \ - dict_test \ - weights_test \ - trule_test \ - hg_test \ - ff_test \ - parser_test \ - grammar_test \ - cdec \ - small_vector_test - -cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc -small_vector_test_SOURCES = small_vector_test.cc -small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -parser_test_SOURCES = parser_test.cc -parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -dict_test_SOURCES = dict_test.cc -dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -ff_test_SOURCES = ff_test.cc -ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -grammar_test_SOURCES = grammar_test.cc -grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -hg_test_SOURCES = hg_test.cc -hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -trule_test_SOURCES = trule_test.cc -trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a -weights_test_SOURCES = weights_test.cc -weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a - -LDADD = libhg.a - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) -lz - -noinst_LIBRARIES = libhg.a - -libhg_a_SOURCES = \ - fst_translator.cc \ - csplit.cc \ - scfg_translator.cc \ - hg.cc \ - hg_io.cc \ - hg_intersect.cc \ - viterbi.cc \ - lattice.cc \ - aligner.cc \ - gzstream.cc \ - apply_models.cc \ - earley_composer.cc \ - phrasetable_fst.cc \ - sparse_vector.cc \ - trule.cc \ - filelib.cc \ - stringlib.cc \ - fdict.cc \ - tdict.cc \ - weights.cc \ - ttables.cc \ - ff.cc \ - ff_lm.cc \ - ff_wordalign.cc \ - ff_csplit.cc \ - freqdict.cc \ - lexcrf.cc \ - bottom_up_parser.cc 
\ - phrasebased_translator.cc \ - JSON_parser.c \ - json_parse.cc \ - grammar.cc diff --git a/src/aligner.cc b/src/aligner.cc deleted file mode 100644 index d9d067e5..00000000 --- a/src/aligner.cc +++ /dev/null @@ -1,204 +0,0 @@ -#include "aligner.h" - -#include "array2d.h" -#include "hg.h" -#include "inside_outside.h" -#include - -using namespace std; - -struct EdgeCoverageInfo { - set src_indices; - set trg_indices; -}; - -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - while (i < al.size()) { - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); - i = 0; - while (i < al.size()) { - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignerTools::SerializePharaohFormat(const Array2D& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - -// compute the coverage vectors of each edge -// prereq: all derivations yield the same string pair -void ComputeCoverages(const Hypergraph& g, - vector* pcovs) { - for 
(int i = 0; i < g.edges_.size(); ++i) { - const Hypergraph::Edge& edge = g.edges_[i]; - EdgeCoverageInfo& cov = (*pcovs)[i]; - // no words - if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) - continue; - // aligned to NULL (crf ibm variant only) - if (edge.prev_i_ == -1 || edge.i_ == -1) - continue; - assert(edge.j_ >= 0); - assert(edge.prev_j_ >= 0); - if (edge.Arity() == 0) { - for (int k = edge.i_; k < edge.j_; ++k) - cov.trg_indices.insert(k); - for (int k = edge.prev_i_; k < edge.prev_j_; ++k) - cov.src_indices.insert(k); - } else { - // note: this code, which handles mixed NT and terminal - // rules assumes that nodes uniquely define a src and trg - // span. - int k = edge.prev_i_; - int j = 0; - const vector& f = edge.rule_->e(); // rules are inverted - while (k < edge.prev_j_) { - if (f[j] > 0) { - cov.src_indices.insert(k); - // cerr << "src: " << k << endl; - ++k; - ++j; - } else { - const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]]; - assert(tailnode.in_edges_.size() > 0); - // any edge will do: - const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; - //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span - k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span - ++j; - } - } - int tc = 0; - const vector& e = edge.rule_->f(); // rules are inverted - k = edge.i_; - j = 0; - // cerr << edge.rule_->AsString() << endl; - // cerr << "i=" << k << " j=" << edge.j_ << endl; - while (k < edge.j_) { - //cerr << " k=" << k << endl; - if (e[j] > 0) { - cov.trg_indices.insert(k); - // cerr << "trg: " << k << endl; - ++k; - ++j; - } else { - assert(tc < edge.tail_nodes_.size()); - const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[tc]]; - assert(tailnode.in_edges_.size() > 0); - // any edge will do: - const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; - // cerr << "t skip " << (rep_edge.j_ - rep_edge.i_) << endl; // src span - k += (rep_edge.j_ - rep_edge.i_); 
// src span - ++j; - ++tc; - } - } - //abort(); - } - } -} - -void AlignerTools::WriteAlignment(const string& input, - const Lattice& ref, - const Hypergraph& g, - bool map_instead_of_viterbi) { - if (!map_instead_of_viterbi) { - assert(!"not implemented!"); - } - vector edge_posteriors(g.edges_.size()); - { - SparseVector posts; - InsideOutside, TransitionEventWeightFunction>(g, &posts); - for (int i = 0; i < edge_posteriors.size(); ++i) - edge_posteriors[i] = posts[i]; - } - vector edge2cov(g.edges_.size()); - ComputeCoverages(g, &edge2cov); - - Lattice src; - // currently only dealing with src text, even if the - // model supports lattice translation (which it probably does) - LatticeTools::ConvertTextToLattice(input, &src); - // TODO assert that src is a "real lattice" - - Array2D align(src.size(), ref.size(), prob_t::Zero()); - for (int c = 0; c < g.edges_.size(); ++c) { - const prob_t& p = edge_posteriors[c]; - const EdgeCoverageInfo& eci = edge2cov[c]; - for (set::const_iterator si = eci.src_indices.begin(); - si != eci.src_indices.end(); ++si) { - for (set::const_iterator ti = eci.trg_indices.begin(); - ti != eci.trg_indices.end(); ++ti) { - align(*si, *ti) += p; - } - } - } - prob_t threshold(0.9); - const bool use_soft_threshold = true; // TODO configure - - Array2D grid(src.size(), ref.size(), false); - for (int j = 0; j < ref.size(); ++j) { - if (use_soft_threshold) { - threshold = prob_t::Zero(); - for (int i = 0; i < src.size(); ++i) - if (align(i, j) > threshold) threshold = align(i, j); - //threshold *= prob_t(0.99); - } - for (int i = 0; i < src.size(); ++i) - grid(i, j) = align(i, j) >= threshold; - } - cerr << align << endl; - cerr << grid << endl; - SerializePharaohFormat(grid, &cout); -}; - diff --git a/src/aligner.h b/src/aligner.h deleted file mode 100644 index 970c72f2..00000000 --- a/src/aligner.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _ALIGNER_H_ - -#include -#include -#include -#include "array2d.h" -#include "lattice.h" - -class 
Hypergraph; - -struct AlignerTools { - static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); - static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); - - // assumption: g contains derivations of input/ref and - // ONLY input/ref. - static void WriteAlignment(const std::string& input, - const Lattice& ref, - const Hypergraph& g, - bool map_instead_of_viterbi = true); -}; - -#endif diff --git a/src/apply_models.cc b/src/apply_models.cc deleted file mode 100644 index b1d002f4..00000000 --- a/src/apply_models.cc +++ /dev/null @@ -1,344 +0,0 @@ -#include "apply_models.h" - -#include -#include -#include -#include - -#include - -#include "hg.h" -#include "ff.h" - -using namespace std; -using namespace std::tr1; - -struct Candidate; -typedef SmallVector JVector; -typedef vector CandidateHeap; -typedef vector CandidateList; - -// life cycle: candidates are created, placed on the heap -// and retrieved by their estimated cost, when they're -// retrieved, they're incorporated into the +LM hypergraph -// where they also know the head node index they are -// attached to. After they are added to the +LM hypergraph -// vit_prob_ and est_prob_ fields may be updated as better -// derivations are found (this happens since the successor's -// of derivation d may have a better score- they are -// explored lazily). However, the updates don't happen -// when a candidate is in the heap so maintaining the heap -// property is not an issue. 
-struct Candidate { - int node_index_; // -1 until incorporated - // into the +LM forest - const Hypergraph::Edge* in_edge_; // in -LM forest - Hypergraph::Edge out_edge_; - string state_; - const JVector j_; - prob_t vit_prob_; // these are fixed until the cand - // is popped, then they may be updated - prob_t est_prob_; - - Candidate(const Hypergraph::Edge& e, - const JVector& j, - const Hypergraph& out_hg, - const vector& D, - const SentenceMetadata& smeta, - const ModelSet& models, - bool is_goal) : - node_index_(-1), - in_edge_(&e), - j_(j) { - InitializeCandidate(out_hg, smeta, D, models, is_goal); - } - - // used to query uniqueness - Candidate(const Hypergraph::Edge& e, - const JVector& j) : in_edge_(&e), j_(j) {} - - bool IsIncorporatedIntoHypergraph() const { - return node_index_ >= 0; - } - - void InitializeCandidate(const Hypergraph& out_hg, - const SentenceMetadata& smeta, - const vector >& D, - const ModelSet& models, - const bool is_goal) { - const Hypergraph::Edge& in_edge = *in_edge_; - out_edge_.rule_ = in_edge.rule_; - out_edge_.feature_values_ = in_edge.feature_values_; - out_edge_.i_ = in_edge.i_; - out_edge_.j_ = in_edge.j_; - out_edge_.prev_i_ = in_edge.prev_i_; - out_edge_.prev_j_ = in_edge.prev_j_; - Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; - tail.resize(j_.size()); - prob_t p = prob_t::One(); - // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; - for (int i = 0; i < tail.size(); ++i) { - const Candidate& ant = *D[in_edge.tail_nodes_[i]][j_[i]]; - assert(ant.IsIncorporatedIntoHypergraph()); - tail[i] = ant.node_index_; - p *= ant.vit_prob_; - } - prob_t edge_estimate = prob_t::One(); - if (is_goal) { - assert(tail.size() == 1); - const string& ant_state = out_hg.nodes_[tail.front()].state_; - models.AddFinalFeatures(ant_state, &out_edge_); - } else { - models.AddFeaturesToEdge(smeta, out_hg, &out_edge_, &state_, &edge_estimate); - } - vit_prob_ = out_edge_.edge_prob_ * p; - est_prob_ = vit_prob_ 
* edge_estimate; - } -}; - -ostream& operator<<(ostream& os, const Candidate& cand) { - os << "CAND["; - if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } - else { os << "+LM_node=" << cand.node_index_; } - os << " edge=" << cand.in_edge_->id_; - os << " j=<"; - for (int i = 0; i < cand.j_.size(); ++i) - os << (i==0 ? "" : " ") << cand.j_[i]; - os << "> vit=" << log(cand.vit_prob_); - os << " est=" << log(cand.est_prob_); - return os << ']'; -} - -struct HeapCandCompare { - bool operator()(const Candidate* l, const Candidate* r) const { - return l->est_prob_ < r->est_prob_; - } -}; - -struct EstProbSorter { - bool operator()(const Candidate* l, const Candidate* r) const { - return l->est_prob_ > r->est_prob_; - } -}; - -// the same candidate can be added multiple times if -// j is multidimensional (if you're going NW in Manhattan, you -// can first go north, then west, or you can go west then north) -// this is a hash function on the relevant variables from -// Candidate to enforce this. 
-struct CandidateUniquenessHash { - size_t operator()(const Candidate* c) const { - size_t x = 5381; - x = ((x << 5) + x) ^ c->in_edge_->id_; - for (int i = 0; i < c->j_.size(); ++i) - x = ((x << 5) + x) ^ c->j_[i]; - return x; - } -}; - -struct CandidateUniquenessEquals { - bool operator()(const Candidate* a, const Candidate* b) const { - return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); - } -}; - -typedef unordered_set UniqueCandidateSet; -typedef unordered_map > State2Node; - -class CubePruningRescorer { - -public: - CubePruningRescorer(const ModelSet& m, - const SentenceMetadata& sm, - const Hypergraph& i, - int pop_limit, - Hypergraph* o) : - models(m), - smeta(sm), - in(i), - out(*o), - D(in.nodes_.size()), - pop_limit_(pop_limit) { - cerr << " Applying feature functions (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; - } - - void Apply() { - int num_nodes = in.nodes_.size(); - int goal_id = num_nodes - 1; - int pregoal = goal_id - 1; - int every = 1; - if (num_nodes > 100) every = 10; - assert(in.nodes_[pregoal].out_edges_.size() == 1); - cerr << " "; - for (int i = 0; i < in.nodes_.size(); ++i) { - if (i % every == 0) cerr << '.'; - KBest(i, i == goal_id); - } - cerr << endl; - cerr << " Best path: " << log(D[goal_id].front()->vit_prob_) - << "\t" << log(D[goal_id].front()->est_prob_) << endl; - out.PruneUnreachable(D[goal_id].front()->node_index_); - FreeAll(); - } - - private: - void FreeAll() { - for (int i = 0; i < D.size(); ++i) { - CandidateList& D_i = D[i]; - for (int j = 0; j < D_i.size(); ++j) - delete D_i[j]; - } - D.clear(); - } - - void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { - Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_); - new_edge->feature_values_ = item->out_edge_.feature_values_; - new_edge->edge_prob_ = item->out_edge_.edge_prob_; - new_edge->i_ = item->out_edge_.i_; - new_edge->j_ = item->out_edge_.j_; - new_edge->prev_i_ = 
item->out_edge_.prev_i_; - new_edge->prev_j_ = item->out_edge_.prev_j_; - Candidate*& o_item = (*s2n)[item->state_]; - if (!o_item) o_item = item; - - int& node_id = o_item->node_index_; - if (node_id < 0) { - Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_, item->state_); - node_id = new_node->id_; - } - Hypergraph::Node* node = &out.nodes_[node_id]; - out.ConnectEdgeToHeadNode(new_edge, node); - - // update candidate if we have a better derivation - // note: the difference between the vit score and the estimated - // score is the same for all items with a common residual DP - // state - if (item->vit_prob_ > o_item->vit_prob_) { - assert(o_item->state_ == item->state_); // sanity check! - o_item->est_prob_ = item->est_prob_; - o_item->vit_prob_ = item->vit_prob_; - } - if (item != o_item) freelist->push_back(item); - } - - void KBest(const int vert_index, const bool is_goal) { - // cerr << "KBest(" << vert_index << ")\n"; - CandidateList& D_v = D[vert_index]; - assert(D_v.empty()); - const Hypergraph::Node& v = in.nodes_[vert_index]; - // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; - const vector& in_edges = v.in_edges_; - CandidateHeap cand; - CandidateList freelist; - cand.reserve(in_edges.size()); - UniqueCandidateSet unique_cands; - for (int i = 0; i < in_edges.size(); ++i) { - const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; - const JVector j(edge.tail_nodes_.size(), 0); - cand.push_back(new Candidate(edge, j, out, D, smeta, models, is_goal)); - assert(unique_cands.insert(cand.back()).second); // these should all be unique! 
- } -// cerr << " making heap of " << cand.size() << " candidates\n"; - make_heap(cand.begin(), cand.end(), HeapCandCompare()); - State2Node state2node; // "buf" in Figure 2 - int pops = 0; - while(!cand.empty() && pops < pop_limit_) { - pop_heap(cand.begin(), cand.end(), HeapCandCompare()); - Candidate* item = cand.back(); - cand.pop_back(); - // cerr << "POPPED: " << *item << endl; - PushSucc(*item, is_goal, &cand, &unique_cands); - IncorporateIntoPlusLMForest(item, &state2node, &freelist); - ++pops; - } - D_v.resize(state2node.size()); - int c = 0; - for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i) - D_v[c++] = i->second; - sort(D_v.begin(), D_v.end(), EstProbSorter()); - // cerr << " expanded to " << D_v.size() << " nodes\n"; - - for (int i = 0; i < cand.size(); ++i) - delete cand[i]; - // freelist is necessary since even after an item merged, it still stays in - // the unique set so it can't be deleted til now - for (int i = 0; i < freelist.size(); ++i) - delete freelist[i]; - } - - void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) { - CandidateHeap& cand = *pcand; - for (int i = 0; i < item.j_.size(); ++i) { - JVector j = item.j_; - ++j[i]; - if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { - Candidate query_unique(*item.in_edge_, j); - if (cs->count(&query_unique) == 0) { - Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, smeta, models, is_goal); - cand.push_back(new_cand); - push_heap(cand.begin(), cand.end(), HeapCandCompare()); - assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check - } - } - } - } - - const ModelSet& models; - const SentenceMetadata& smeta; - const Hypergraph& in; - Hypergraph& out; - - vector D; // maps nodes in in-HG to the - // equivalent nodes (many due to state - // splits) in the out-HG. 
- const int pop_limit_; -}; - -struct NoPruningRescorer { - NoPruningRescorer(const ModelSet& m, const Hypergraph& i, Hypergraph* o) : - models(m), - in(i), - out(*o) { - cerr << " Rescoring forest (full intersection)\n"; - } - - void RescoreNode(const int node_num, const bool is_goal) { - } - - void Apply() { - int num_nodes = in.nodes_.size(); - int goal_id = num_nodes - 1; - int pregoal = goal_id - 1; - int every = 1; - if (num_nodes > 100) every = 10; - assert(in.nodes_[pregoal].out_edges_.size() == 1); - cerr << " "; - for (int i = 0; i < in.nodes_.size(); ++i) { - if (i % every == 0) cerr << '.'; - RescoreNode(i, i == goal_id); - } - cerr << endl; - } - - private: - const ModelSet& models; - const Hypergraph& in; - Hypergraph& out; -}; - -// each node in the graph has one of these, it keeps track of -void ApplyModelSet(const Hypergraph& in, - const SentenceMetadata& smeta, - const ModelSet& models, - const PruningConfiguration& config, - Hypergraph* out) { - int pl = config.pop_limit; - if (pl > 100 && in.nodes_.size() > 80000) { - cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n"; - pl = 30; - } - CubePruningRescorer ma(models, smeta, in, pl, out); - ma.Apply(); -} - diff --git a/src/apply_models.h b/src/apply_models.h deleted file mode 100644 index 08fce037..00000000 --- a/src/apply_models.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _APPLY_MODELS_H_ -#define _APPLY_MODELS_H_ - -struct ModelSet; -struct Hypergraph; -struct SentenceMetadata; - -struct PruningConfiguration { - const int algorithm; // 0 = full intersection, 1 = cube pruning - const int pop_limit; // max number of pops off the heap at each node - explicit PruningConfiguration(int k) : algorithm(1), pop_limit(k) {} -}; - -void ApplyModelSet(const Hypergraph& in, - const SentenceMetadata& smeta, - const ModelSet& models, - const PruningConfiguration& config, - Hypergraph* out); - -#endif diff --git a/src/array2d.h b/src/array2d.h deleted file mode 100644 index 
e63eda0d..00000000 --- a/src/array2d.h +++ /dev/null @@ -1,172 +0,0 @@ -#ifndef ARRAY2D_H_ -#define ARRAY2D_H_ - -#include -#include -#include -#include -#include - -template -class Array2D { - public: - typedef typename std::vector::reference reference; - typedef typename std::vector::const_reference const_reference; - typedef typename std::vector::iterator iterator; - typedef typename std::vector::const_iterator const_iterator; - Array2D() : width_(0), height_(0) {} - Array2D(int w, int h, const T& d = T()) : - width_(w), height_(h), data_(w*h, d) {} - Array2D(const Array2D& rhs) : - width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {} - bool empty() const { return data_.empty(); } - void resize(int w, int h, const T& d = T()) { - data_.resize(w * h, d); - width_ = w; - height_ = h; - } - const Array2D& operator=(const Array2D& rhs) { - data_ = rhs.data_; - width_ = rhs.width_; - height_ = rhs.height_; - return *this; - } - void fill(const T& v) { data_.assign(data_.size(), v); } - int width() const { return width_; } - int height() const { return height_; } - reference operator()(int i, int j) { - return data_[offset(i, j)]; - } - void clear() { data_.clear(); width_=0; height_=0; } - const_reference operator()(int i, int j) const { - return data_[offset(i, j)]; - } - iterator begin_col(int j) { - return data_.begin() + offset(0,j); - } - const_iterator begin_col(int j) const { - return data_.begin() + offset(0,j); - } - iterator end_col(int j) { - return data_.begin() + offset(0,j) + width_; - } - const_iterator end_col(int j) const { - return data_.begin() + offset(0,j) + width_; - } - iterator end() { return data_.end(); } - const_iterator end() const { return data_.end(); } - const Array2D& operator*=(const T& x) { - std::transform(data_.begin(), data_.end(), data_.begin(), - std::bind2nd(std::multiplies(), x)); - } - const Array2D& operator/=(const T& x) { - std::transform(data_.begin(), data_.end(), data_.begin(), - std::bind2nd(std::divides(), 
x)); - } - const Array2D& operator+=(const Array2D& m) { - std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::plus()); - } - const Array2D& operator-=(const Array2D& m) { - std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::minus()); - } - - private: - inline int offset(int i, int j) const { - assert(i data_; -}; - -template -Array2D operator*(const Array2D& l, const T& scalar) { - Array2D res(l); - res *= scalar; - return res; -} - -template -Array2D operator*(const T& scalar, const Array2D& l) { - Array2D res(l); - res *= scalar; - return res; -} - -template -Array2D operator/(const Array2D& l, const T& scalar) { - Array2D res(l); - res /= scalar; - return res; -} - -template -Array2D operator+(const Array2D& l, const Array2D& r) { - Array2D res(l); - res += r; - return res; -} - -template -Array2D operator-(const Array2D& l, const Array2D& r) { - Array2D res(l); - res -= r; - return res; -} - -template -inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { - for (int i=0; i& m) { - os << ' '; - for (int j=0; j >& m) { - os << ' '; - for (int j=0; j& ar = m(i,j); - for (int k=0; k - -#include "hg.h" -#include "array2d.h" -#include "tdict.h" - -using namespace std; - -class ActiveChart; -class PassiveChart { - public: - PassiveChart(const string& goal, - const vector& grammars, - const Lattice& input, - Hypergraph* forest); - ~PassiveChart(); - - inline const vector& operator()(int i, int j) const { return chart_(i,j); } - bool Parse(); - inline int size() const { return chart_.width(); } - inline bool GoalFound() const { return goal_idx_ >= 0; } - inline int GetGoalIndex() const { return goal_idx_; } - - private: - void ApplyRules(const int i, - const int j, - const RuleBin* rules, - const Hypergraph::TailNodeVector& tail, - const float lattice_cost); - - void ApplyRule(const int i, - const int j, - const TRulePtr& r, - const Hypergraph::TailNodeVector& ant_nodes, - const float 
lattice_cost); - - void ApplyUnaryRules(const int i, const int j); - - const vector& grammars_; - const Lattice& input_; - Hypergraph* forest_; - Array2D > chart_; // chart_(i,j) is the list of nodes derived spanning i,j - typedef map Cat2NodeMap; - Array2D nodemap_; - vector act_chart_; - const WordID goal_cat_; // category that is being searched for at [0,n] - TRulePtr goal_rule_; - int goal_idx_; // index of goal node, if found - const int lc_fid_; - - static WordID kGOAL; // [Goal] -}; - -WordID PassiveChart::kGOAL = 0; - -class ActiveChart { - public: - ActiveChart(const Hypergraph* hg, const PassiveChart& psv_chart) : - hg_(hg), - act_chart_(psv_chart.size(), psv_chart.size()), psv_chart_(psv_chart) {} - - struct ActiveItem { - ActiveItem(const GrammarIter* g, const Hypergraph::TailNodeVector& a, float lcost) : - gptr_(g), ant_nodes_(a), lattice_cost(lcost) {} - explicit ActiveItem(const GrammarIter* g) : - gptr_(g), ant_nodes_(), lattice_cost(0.0) {} - - void ExtendTerminal(int symbol, float src_cost, vector* out_cell) const { - const GrammarIter* ni = gptr_->Extend(symbol); - if (ni) out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost)); - } - void ExtendNonTerminal(const Hypergraph* hg, int node_index, vector* out_cell) const { - int symbol = hg->nodes_[node_index].cat_; - const GrammarIter* ni = gptr_->Extend(symbol); - if (!ni) return; - Hypergraph::TailNodeVector na(ant_nodes_.size() + 1); - for (int i = 0; i < ant_nodes_.size(); ++i) - na[i] = ant_nodes_[i]; - na[ant_nodes_.size()] = node_index; - out_cell->push_back(ActiveItem(ni, na, lattice_cost)); - } - - const GrammarIter* gptr_; - Hypergraph::TailNodeVector ant_nodes_; - float lattice_cost; // TODO? 
use SparseVector - }; - - inline const vector& operator()(int i, int j) const { return act_chart_(i,j); } - void SeedActiveChart(const Grammar& g) { - int size = act_chart_.width(); - for (int i = 0; i < size; ++i) - if (g.HasRuleForSpan(i,i,0)) - act_chart_(i,i).push_back(ActiveItem(g.GetRoot())); - } - - void ExtendActiveItems(int i, int k, int j) { - //cerr << " LOOK(" << i << "," << k << ") for completed items in (" << k << "," << j << ")\n"; - vector& cell = act_chart_(i,j); - const vector& icell = act_chart_(i,k); - const vector& idxs = psv_chart_(k, j); - //if (!idxs.empty()) { cerr << "FOUND IN (" << k << "," << j << ")\n"; } - for (vector::const_iterator di = icell.begin(); di != icell.end(); ++di) { - for (vector::const_iterator ni = idxs.begin(); ni != idxs.end(); ++ni) { - di->ExtendNonTerminal(hg_, *ni, &cell); - } - } - } - - void AdvanceDotsForAllItemsInCell(int i, int j, const vector >& input) { - //cerr << "ADVANCE(" << i << "," << j << ")\n"; - for (int k=i+1; k < j; ++k) - ExtendActiveItems(i, k, j); - - const vector& out_arcs = input[j-1]; - for (vector::const_iterator ai = out_arcs.begin(); - ai != out_arcs.end(); ++ai) { - const WordID& f = ai->label; - const double& c = ai->cost; - const int& len = ai->dist2next; - //VLOG(1) << "F: " << TD::Convert(f) << endl; - const vector& ec = act_chart_(i, j-1); - for (vector::const_iterator di = ec.begin(); di != ec.end(); ++di) - di->ExtendTerminal(f, c, &act_chart_(i, j + len - 1)); - } - } - - private: - const Hypergraph* hg_; - Array2D > act_chart_; - const PassiveChart& psv_chart_; -}; - -PassiveChart::PassiveChart(const string& goal, - const vector& grammars, - const Lattice& input, - Hypergraph* forest) : - grammars_(grammars), - input_(input), - forest_(forest), - chart_(input.size()+1, input.size()+1), - nodemap_(input.size()+1, input.size()+1), - goal_cat_(TD::Convert(goal) * -1), - goal_rule_(new TRule("[Goal] ||| [" + goal + ",1] ||| [" + goal + ",1]")), - goal_idx_(-1), - 
lc_fid_(FD::Convert("LatticeCost")) { - act_chart_.resize(grammars_.size()); - for (int i = 0; i < grammars_.size(); ++i) - act_chart_[i] = new ActiveChart(forest, *this); - if (!kGOAL) kGOAL = TD::Convert("Goal") * -1; - cerr << " Goal category: [" << goal << ']' << endl; -} - -void PassiveChart::ApplyRule(const int i, - const int j, - const TRulePtr& r, - const Hypergraph::TailNodeVector& ant_nodes, - const float lattice_cost) { - Hypergraph::Edge* new_edge = forest_->AddEdge(r, ant_nodes); - new_edge->prev_i_ = r->prev_i; - new_edge->prev_j_ = r->prev_j; - new_edge->i_ = i; - new_edge->j_ = j; - new_edge->feature_values_ = r->GetFeatureValues(); - if (lattice_cost) - new_edge->feature_values_.set_value(lc_fid_, lattice_cost); - Cat2NodeMap& c2n = nodemap_(i,j); - const bool is_goal = (r->GetLHS() == kGOAL); - const Cat2NodeMap::iterator ni = c2n.find(r->GetLHS()); - Hypergraph::Node* node = NULL; - if (ni == c2n.end()) { - node = forest_->AddNode(r->GetLHS(), ""); - c2n[r->GetLHS()] = node->id_; - if (is_goal) { - assert(goal_idx_ == -1); - goal_idx_ = node->id_; - } else { - chart_(i,j).push_back(node->id_); - } - } else { - node = &forest_->nodes_[ni->second]; - } - forest_->ConnectEdgeToHeadNode(new_edge, node); -} - -void PassiveChart::ApplyRules(const int i, - const int j, - const RuleBin* rules, - const Hypergraph::TailNodeVector& tail, - const float lattice_cost) { - const int n = rules->GetNumRules(); - for (int k = 0; k < n; ++k) - ApplyRule(i, j, rules->GetIthRule(k), tail, lattice_cost); -} - -void PassiveChart::ApplyUnaryRules(const int i, const int j) { - const vector& nodes = chart_(i,j); // reference is important! 
- for (int gi = 0; gi < grammars_.size(); ++gi) { - if (!grammars_[gi]->HasRuleForSpan(i,j,input_.Distance(i,j))) continue; - for (int di = 0; di < nodes.size(); ++di) { - const WordID& cat = forest_->nodes_[nodes[di]].cat_; - const vector& unaries = grammars_[gi]->GetUnaryRulesForRHS(cat); - for (int ri = 0; ri < unaries.size(); ++ri) { - // cerr << "At (" << i << "," << j << "): applying " << unaries[ri]->AsString() << endl; - const Hypergraph::TailNodeVector ant(1, nodes[di]); - ApplyRule(i, j, unaries[ri], ant, 0); // may update nodes - } - } - } -} - -bool PassiveChart::Parse() { - forest_->nodes_.reserve(input_.size() * input_.size() * 2); - forest_->edges_.reserve(input_.size() * input_.size() * 1000); // TODO: reservation?? - goal_idx_ = -1; - for (int gi = 0; gi < grammars_.size(); ++gi) - act_chart_[gi]->SeedActiveChart(*grammars_[gi]); - - cerr << " "; - for (int l=1; lAdvanceDotsForAllItemsInCell(i, j, input_); - - const vector& cell = (*act_chart_[gi])(i,j); - for (vector::const_iterator ai = cell.begin(); - ai != cell.end(); ++ai) { - const RuleBin* rules = (ai->gptr_->GetRules()); - if (!rules) continue; - ApplyRules(i, j, rules, ai->ant_nodes_, ai->lattice_cost); - } - } - } - ApplyUnaryRules(i,j); - - for (int gi = 0; gi < grammars_.size(); ++gi) { - const Grammar& g = *grammars_[gi]; - // deal with non-terminals that were just proved - if (g.HasRuleForSpan(i, j, input_.Distance(i,j))) - act_chart_[gi]->ExtendActiveItems(i, i, j); - } - } - const vector& dh = chart_(0, input_.size()); - for (int di = 0; di < dh.size(); ++di) { - const Hypergraph::Node& node = forest_->nodes_[dh[di]]; - if (node.cat_ == goal_cat_) { - Hypergraph::TailNodeVector ant(1, node.id_); - ApplyRule(0, input_.size(), goal_rule_, ant, 0); - } - } - } - cerr << endl; - - if (GoalFound()) - forest_->PruneUnreachable(forest_->nodes_.size() - 1); - return GoalFound(); -} - -PassiveChart::~PassiveChart() { - for (int i = 0; i < act_chart_.size(); ++i) - delete act_chart_[i]; -} - 
-ExhaustiveBottomUpParser::ExhaustiveBottomUpParser( - const string& goal_sym, - const vector& grammars) : - goal_sym_(goal_sym), - grammars_(grammars) {} - -bool ExhaustiveBottomUpParser::Parse(const Lattice& input, - Hypergraph* forest) const { - PassiveChart chart(goal_sym_, grammars_, input, forest); - return chart.Parse(); -} diff --git a/src/bottom_up_parser.h b/src/bottom_up_parser.h deleted file mode 100644 index 546bfb54..00000000 --- a/src/bottom_up_parser.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _BOTTOM_UP_PARSER_H_ -#define _BOTTOM_UP_PARSER_H_ - -#include -#include - -#include "lattice.h" -#include "grammar.h" - -class Hypergraph; - -class ExhaustiveBottomUpParser { - public: - ExhaustiveBottomUpParser(const std::string& goal_sym, - const std::vector& grammars); - - // returns true if goal reached spanning the full input - // forest contains the full (i.e., unpruned) parse forest - bool Parse(const Lattice& input, - Hypergraph* forest) const; - - private: - const std::string goal_sym_; - const std::vector grammars_; -}; - -#endif diff --git a/src/cdec.cc b/src/cdec.cc deleted file mode 100644 index 6185c79b..00000000 --- a/src/cdec.cc +++ /dev/null @@ -1,507 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include - -#include "timing_stats.h" -#include "translator.h" -#include "phrasebased_translator.h" -#include "aligner.h" -#include "stringlib.h" -#include "forest_writer.h" -#include "hg_io.h" -#include "filelib.h" -#include "sampler.h" -#include "sparse_vector.h" -#include "lexcrf.h" -#include "csplit.h" -#include "weights.h" -#include "tdict.h" -#include "ff.h" -#include "ff_factory.h" -#include "hg_intersect.h" -#include "apply_models.h" -#include "viterbi.h" -#include "kbest.h" -#include "inside_outside.h" -#include "exp_semiring.h" -#include "sentence_metadata.h" - -using namespace std; -using namespace std::tr1; -using boost::shared_ptr; -namespace po = boost::program_options; - -// some globals ... 
-boost::shared_ptr > rng; - -namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); } - -void ShowBanner() { - cerr << "cdec v1.0 (c) 2009 by Chris Dyer\n"; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("formalism,f",po::value(),"Translation formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting)") - ("input,i",po::value()->default_value("-"),"Source file") - ("grammar,g",po::value >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") - ("weights,w",po::value(),"Feature weights file") - ("feature_function,F",po::value >()->composing(), "Additional feature function(s) (-L for list)") - ("list_feature_functions,L","List available feature functions") - ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") - ("k_best,k",po::value(),"Extract the k best derivations") - ("unique_k_best,r", "Unique k-best translation list") - ("aligner,a", "Run as a word/phrase aligner (src & ref required)") - ("cubepruning_pop_limit,K",po::value()->default_value(200), "Max number of pops from the candidate heap at each node") - ("goal",po::value()->default_value("S"),"Goal symbol (SCFG & FST)") - ("scfg_extra_glue_grammar", po::value(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)") - ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. 
by default the SCFG decoder adds Hiero glue rules)") - ("scfg_default_nt,d",po::value()->default_value("X"),"Default non-terminal symbol in SCFG") - ("scfg_max_span_limit,S",po::value()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)") - ("show_tree_structure,T", "Show the Viterbi derivation structure") - ("show_expected_length", "Show the expected translation length under the model") - ("show_partition,z", "Compute and show the partition (inside score)") - ("beam_prune", po::value(), "Prune paths from +LM forest") - ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") - ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") - ("extract_rules", po::value(), "Extract the rules used in translation (de-duped) to this file") - ("graphviz","Show (constrained) translation forest in GraphViz format") - ("max_translation_beam,x", po::value(), "Beam approximation to get max translation from the chart") - ("max_translation_sample,X", po::value(), "Sample the max translation from the chart") - ("pb_max_distortion,D", po::value()->default_value(4), "Phrase-based decoder: maximum distortion") - ("gradient,G","Compute d log p(e|f) / d lambda_i and write to STDOUT (src & ref required)") - ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") - ("vector_format",po::value()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") - ("combine_size,C",po::value()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") - ("forest_output,O",po::value(),"Directory to write forests to") - ("minimal_forests,m","Write minimal forests (excludes Rule information). 
Such forests can be used for ML/MAP training, but not rescoring, etc."); - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - const string cfg = (*conf)["config"].as(); - cerr << "Configuration file: " << cfg << endl; - ifstream config(cfg.c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("list_feature_functions")) { - cerr << "Available feature functions (specify with -F):\n"; - global_ff_registry->DisplayList(); - cerr << endl; - exit(1); - } - - if (conf->count("help") || conf->count("formalism") == 0) { - cerr << dcmdline_options << endl; - exit(1); - } - - const string formalism = LowercaseString((*conf)["formalism"].as()); - if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit") { - cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit' or 'lexcrf'\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -// TODO move out of cdec into some sampling decoder file -void SampleRecurse(const Hypergraph& hg, const vector& ss, int n, vector* out) { - const SampleSet& s = ss[n]; - int i = rng->SelectSample(s); - const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[i]]; - vector > ants(edge.tail_nodes_.size()); - for (int j = 0; j < ants.size(); ++j) - SampleRecurse(hg, ss, edge.tail_nodes_[j], &ants[j]); - - vector*> pants(ants.size()); - for (int j = 0; j < ants.size(); ++j) pants[j] = &ants[j]; - edge.rule_->ESubstitute(pants, out); -} - -struct SampleSort { - bool operator()(const pair& a, const pair& b) const { - return a.first > b.first; - } -}; - -// 
TODO move out of cdec into some sampling decoder file -void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) { - unordered_map > m; - hg->PushWeightsToGoal(); - const int num_nodes = hg->nodes_.size(); - vector ss(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - SampleSet& s = ss[i]; - const vector& in_edges = hg->nodes_[i].in_edges_; - for (int j = 0; j < in_edges.size(); ++j) { - s.add(hg->edges_[in_edges[j]].edge_prob_); - } - } - for (int i = 0; i < samples; ++i) { - vector yield; - SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield); - const string trans = TD::GetString(yield); - ++m[trans]; - } - vector > dist; - for (unordered_map >::iterator i = m.begin(); - i != m.end(); ++i) { - dist.push_back(make_pair(i->second, i->first)); - } - sort(dist.begin(), dist.end(), SampleSort()); - if (k) { - for (int i = 0; i < k; ++i) - cout << dist[i].first << " ||| " << dist[i].second << endl; - } else { - cout << dist[0].second << endl; - } -} - -// TODO decoder output should probably be moved to another file -void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) { - if (unique) { - KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k); - for (int i = 0; i < k; ++i) { - const KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique>::Derivation* d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " - << d->feature_values << " ||| " << log(d->score) << endl; - } - } else { - KBest::KBestDerivations, ESentenceTraversal> kbest(forest, k); - for (int i = 0; i < k; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " - << d->feature_values << " ||| " << log(d->score) << endl; - } - } -} - -struct ELengthWeightFunction { - double 
operator()(const Hypergraph::Edge& e) const { - return e.rule_->ELength() - e.rule_->Arity(); - } -}; - - -struct TRPHash { - size_t operator()(const TRulePtr& o) const { return reinterpret_cast(o.get()); } -}; -static void ExtractRulesDedupe(const Hypergraph& hg, ostream* os) { - static unordered_set written; - for (int i = 0; i < hg.edges_.size(); ++i) { - const TRulePtr& rule = hg.edges_[i].rule_; - if (written.insert(rule).second) { - (*os) << rule->AsString() << endl; - } - } -} - -void register_feature_functions(); - -int main(int argc, char** argv) { - global_ff_registry.reset(new FFRegistry); - register_feature_functions(); - ShowBanner(); - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const bool write_gradient = conf.count("gradient"); - const bool feature_expectations = conf.count("feature_expectations"); - if (write_gradient && feature_expectations) { - cerr << "You can only specify --gradient or --feature_expectations, not both!\n"; - exit(1); - } - const bool output_training_vector = (write_gradient || feature_expectations); - - boost::shared_ptr translator; - const string formalism = LowercaseString(conf["formalism"].as()); - const bool csplit_preserve_full_word = conf.count("csplit_preserve_full_word"); - if (csplit_preserve_full_word && - (formalism != "csplit" || !conf.count("beam_prune"))) { - cerr << "--csplit_preserve_full_word should only be " - << "used with csplit AND --beam_prune!\n"; - exit(1); - } - const bool csplit_output_plf = conf.count("csplit_output_plf"); - if (csplit_output_plf && formalism != "csplit") { - cerr << "--csplit_output_plf should only be used with csplit!\n"; - exit(1); - } - - if (formalism == "scfg") - translator.reset(new SCFGTranslator(conf)); - else if (formalism == "fst") - translator.reset(new FSTTranslator(conf)); - else if (formalism == "pb") - translator.reset(new PhraseBasedTranslator(conf)); - else if (formalism == "csplit") - translator.reset(new CompoundSplit(conf)); - else if 
(formalism == "lexcrf") - translator.reset(new LexicalCRF(conf)); - else - assert(!"error"); - - vector feature_weights; - Weights w; - if (conf.count("weights")) { - w.InitFromFile(conf["weights"].as()); - feature_weights.resize(FD::NumFeats()); - w.InitVector(&feature_weights); - } - - // set up additional scoring features - vector > pffs; - vector late_ffs; - if (conf.count("feature_function") > 0) { - const vector& add_ffs = conf["feature_function"].as >(); - for (int i = 0; i < add_ffs.size(); ++i) { - string ff, param; - SplitCommandAndParam(add_ffs[i], &ff, ¶m); - cerr << "Feature: " << ff; - if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; - else cerr << " (no config parameters)\n"; - shared_ptr pff = global_ff_registry->Create(ff, param); - if (!pff) { exit(1); } - // TODO check that multiple features aren't trying to set the same fid - pffs.push_back(pff); - late_ffs.push_back(pff.get()); - } - } - ModelSet late_models(feature_weights, late_ffs); - - const int sample_max_trans = conf.count("max_translation_sample") ? - conf["max_translation_sample"].as() : 0; - if (sample_max_trans) - rng.reset(new RandomNumberGenerator); - const bool aligner_mode = conf.count("aligner"); - const bool minimal_forests = conf.count("minimal_forests"); - const bool graphviz = conf.count("graphviz"); - const bool encode_b64 = conf["vector_format"].as() == "b64"; - const bool kbest = conf.count("k_best"); - const bool unique_kbest = conf.count("unique_k_best"); - shared_ptr extract_file; - if (conf.count("extract_rules")) - extract_file.reset(new WriteFile(conf["extract_rules"].as())); - - int combine_size = conf["combine_size"].as(); - if (combine_size < 1) combine_size = 1; - const string input = conf["input"].as(); - cerr << "Reading input from " << ((input == "-") ? 
"STDIN" : input.c_str()) << endl; - ReadFile in_read(input); - istream *in = in_read.stream(); - assert(*in); - - SparseVector acc_vec; // accumulate gradient - double acc_obj = 0; // accumulate objective - int g_count = 0; // number of gradient pieces computed - int sent_id = -1; // line counter - - while(*in) { - Timer::Summarize(); - ++sent_id; - string buf; - getline(*in, buf); - if (buf.empty()) continue; - map sgml; - ProcessAndStripSGML(&buf, &sgml); - if (sgml.find("id") != sgml.end()) - sent_id = atoi(sgml["id"].c_str()); - - cerr << "\nINPUT: "; - if (buf.size() < 100) - cerr << buf << endl; - else { - size_t x = buf.rfind(" ", 100); - if (x == string::npos) x = 100; - cerr << buf.substr(0, x) << " ..." << endl; - } - cerr << " id = " << sent_id << endl; - string to_translate; - Lattice ref; - ParseTranslatorInputLattice(buf, &to_translate, &ref); - const bool has_ref = ref.size() > 0; - SentenceMetadata smeta(sent_id, ref); - const bool hadoop_counters = (write_gradient); - Hypergraph forest; // -LM forest - Timer t("Translation"); - if (!translator->Translate(to_translate, &smeta, feature_weights, &forest)) { - cerr << " NO PARSE FOUND.\n"; - if (hadoop_counters) - cerr << "reporter:counter:UserCounters,FParseFailed,1" << endl; - cout << endl << flush; - continue; - } - cerr << " -LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; - cerr << " -LM forest (paths): " << forest.NumberOfPaths() << endl; - if (conf.count("show_expected_length")) { - const PRPair res = - Inside, - PRWeightFunction >(forest); - cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; - } - if (conf.count("show_partition")) { - const prob_t z = Inside(forest); - cerr << " -LM partition log(Z): " << log(z) << endl; - } - if (extract_file) - ExtractRulesDedupe(forest, extract_file->stream()); - vector trans; - const prob_t vs = ViterbiESentence(forest, &trans); - cerr << " -LM Viterbi: " << TD::GetString(trans) << endl; 
- if (conf.count("show_tree_structure")) - cerr << " -LM tree: " << ViterbiETree(forest) << endl;; - cerr << " -LM Viterbi: " << log(vs) << endl; - - bool has_late_models = !late_models.empty(); - if (has_late_models) { - forest.Reweight(feature_weights); - forest.SortInEdgesByEdgeWeights(); - Hypergraph lm_forest; - int cubepruning_pop_limit = conf["cubepruning_pop_limit"].as(); - ApplyModelSet(forest, - smeta, - late_models, - PruningConfiguration(cubepruning_pop_limit), - &lm_forest); - forest.swap(lm_forest); - forest.Reweight(feature_weights); - trans.clear(); - ViterbiESentence(forest, &trans); - cerr << " +LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; - cerr << " +LM forest (paths): " << forest.NumberOfPaths() << endl; - cerr << " +LM Viterbi: " << TD::GetString(trans) << endl; - } - if (conf.count("beam_prune")) { - vector preserve_mask(forest.edges_.size(), false); - if (csplit_preserve_full_word) - preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true; - forest.BeamPruneInsideOutside(1.0, false, conf["beam_prune"].as(), &preserve_mask); - cerr << " Pruned forest (paths): " << forest.NumberOfPaths() << endl; - } - - if (conf.count("forest_output") && !has_ref) { - ForestWriter writer(conf["forest_output"].as(), sent_id); - assert(writer.Write(forest, minimal_forests)); - } - - if (sample_max_trans) { - MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as() : 0); - } else { - if (kbest) { - DumpKBest(sent_id, forest, conf["k_best"].as(), unique_kbest); - } else if (csplit_output_plf) { - cout << HypergraphIO::AsPLF(forest, false) << endl; - } else { - if (!graphviz && !has_ref) { - cout << TD::GetString(trans) << endl << flush; - } - } - } - - const int max_trans_beam_size = conf.count("max_translation_beam") ? 
- conf["max_translation_beam"].as() : 0; - if (max_trans_beam_size) { - Hack::MaxTrans(forest, max_trans_beam_size); - continue; - } - - if (graphviz && !has_ref) forest.PrintGraphviz(); - - // the following are only used if write_gradient is true! - SparseVector full_exp, ref_exp, gradient; - double log_z = 0, log_ref_z = 0; - if (write_gradient) - log_z = log( - InsideOutside, EdgeFeaturesWeightFunction>(forest, &full_exp)); - - if (has_ref) { - if (HG::Intersect(ref, &forest)) { - cerr << " Constr. forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; - cerr << " Constr. forest (paths): " << forest.NumberOfPaths() << endl; - forest.Reweight(feature_weights); - cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl; - if (hadoop_counters) - cerr << "reporter:counter:UserCounters,SentencePairsParsed,1" << endl; - if (conf.count("show_partition")) { - const prob_t z = Inside(forest); - cerr << " Contst. partition log(Z): " << log(z) << endl; - } - //DumpKBest(sent_id, forest, 1000); - if (conf.count("forest_output")) { - ForestWriter writer(conf["forest_output"].as(), sent_id); - assert(writer.Write(forest, minimal_forests)); - } - if (aligner_mode && !output_training_vector) - AlignerTools::WriteAlignment(to_translate, ref, forest); - if (write_gradient) { - log_ref_z = log( - InsideOutside, EdgeFeaturesWeightFunction>(forest, &ref_exp)); - if (log_z < log_ref_z) { - cerr << "DIFF. ERR! 
log_z < log_ref_z: " << log_z << " " << log_ref_z << endl; - exit(1); - } - //cerr << "FULL: " << full_exp << endl; - //cerr << " REF: " << ref_exp << endl; - ref_exp -= full_exp; - acc_vec += ref_exp; - acc_obj += (log_z - log_ref_z); - } - if (feature_expectations) { - acc_obj += log( - InsideOutside, EdgeFeaturesWeightFunction>(forest, &ref_exp)); - acc_vec += ref_exp; - } - - if (output_training_vector) { - ++g_count; - if (g_count % combine_size == 0) { - if (encode_b64) { - cout << "0\t"; - B64::Encode(acc_obj, acc_vec, &cout); - cout << endl << flush; - } else { - cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; - } - acc_vec.clear(); - acc_obj = 0; - } - } - if (conf.count("graphviz")) forest.PrintGraphviz(); - } else { - cerr << " REFERENCE UNREACHABLE.\n"; - if (write_gradient) { - if (hadoop_counters) - cerr << "reporter:counter:UserCounters,EFParseFailed,1" << endl; - cout << endl << flush; - } - } - } - } - if (output_training_vector && !acc_vec.empty()) { - if (encode_b64) { - cout << "0\t"; - B64::Encode(acc_obj, acc_vec, &cout); - cout << endl << flush; - } else { - cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; - } - } -} - diff --git a/src/cdec_ff.cc b/src/cdec_ff.cc deleted file mode 100644 index 0a4f3d5e..00000000 --- a/src/cdec_ff.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include - -#include "ff.h" -#include "ff_lm.h" -#include "ff_csplit.h" -#include "ff_wordalign.h" -#include "ff_factory.h" - -boost::shared_ptr global_ff_registry; - -void register_feature_functions() { - global_ff_registry->Register("LanguageModel", new FFFactory); - global_ff_registry->Register("WordPenalty", new FFFactory); - global_ff_registry->Register("SourceWordPenalty", new FFFactory); - global_ff_registry->Register("RelativeSentencePosition", new FFFactory); - global_ff_registry->Register("MarkovJump", new FFFactory); - global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory); - 
global_ff_registry->Register("AlignerResults", new FFFactory); - global_ff_registry->Register("CSplit_BasicFeatures", new FFFactory); - global_ff_registry->Register("CSplit_ReverseCharLM", new FFFactory); -}; - diff --git a/src/csplit.cc b/src/csplit.cc deleted file mode 100644 index 47197782..00000000 --- a/src/csplit.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include "csplit.h" - -#include - -#include "filelib.h" -#include "stringlib.h" -#include "hg.h" -#include "tdict.h" -#include "grammar.h" -#include "sentence_metadata.h" - -using namespace std; - -struct CompoundSplitImpl { - CompoundSplitImpl(const boost::program_options::variables_map& conf) : - fugen_elements_(true), // TODO configure - min_size_(3), - kXCAT(TD::Convert("X")*-1), - kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")), - kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")), - kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")), - kFUGEN_S(FD::Convert("FugS")), - kFUGEN_N(FD::Convert("FugN")) {} - - void PasteTogetherStrings(const vector& chars, - const int i, - const int j, - string* yield) { - int size = 0; - for (int k=i; kresize(size); - int cur = 0; - for (int k=i; k& chars, - Hypergraph* forest) { - vector nodes(chars.size()+1, -1); - nodes[0] = forest->AddNode(kXCAT)->id_; // source - const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; - forest->ConnectEdgeToHeadNode(left_rule, nodes[0]); - - const int max_split_ = max(static_cast(chars.size()) - min_size_ + 1, 1); - cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl; - for (int i = min_size_; i < max_split_; ++i) - nodes[i] = forest->AddNode(kXCAT)->id_; - assert(nodes.back() == -1); - nodes.back() = forest->AddNode(kXCAT)->id_; // sink - - for (int i = 0; i < max_split_; ++i) { - if (nodes[i] < 0) continue; - const int start = min(i + min_size_, static_cast(chars.size())); - for (int j = start; j <= chars.size(); ++j) { - if (nodes[j] < 0) continue; - string yield; - 
PasteTogetherStrings(chars, i, j, &yield); - // cerr << "[" << i << "," << j << "] " << yield << endl; - TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE)); - rule->e_[1] = rule->f_[1] = TD::Convert(yield); - // cerr << rule->AsString() << endl; - int edge = forest->AddEdge( - rule, - Hypergraph::TailNodeVector(1, nodes[i]))->id_; - forest->ConnectEdgeToHeadNode(edge, nodes[j]); - forest->edges_[edge].i_ = i; - forest->edges_[edge].j_ = j; - - // handle "fugenelemente" here - // don't delete "fugenelemente" at the end of words - if (fugen_elements_ && j != chars.size()) { - const int len = yield.size(); - string alt; - int fid = 0; - if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { - alt = yield.substr(0, len - 2); - fid = kFUGEN_S; - } else if (len > (min_size_ + 1) && yield[len-1] == 's') { - alt = yield.substr(0, len - 1); - fid = kFUGEN_S; - } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { - alt = yield.substr(0, len - 1); - fid = kFUGEN_N; - } - if (alt.size()) { - TRulePtr altrule = TRulePtr(new TRule(*rule)); - altrule->e_[1] = TD::Convert(alt); - // cerr << altrule->AsString() << endl; - int edge = forest->AddEdge( - altrule, - Hypergraph::TailNodeVector(1, nodes[i]))->id_; - forest->ConnectEdgeToHeadNode(edge, nodes[j]); - forest->edges_[edge].feature_values_.set_value(fid, 1.0); - forest->edges_[edge].i_ = i; - forest->edges_[edge].j_ = j; - } - } - } - } - - // add goal rule - Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); - Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); - Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); - forest->ConnectEdgeToHeadNode(hg_edge, goal); - } - private: - const bool fugen_elements_; - const int min_size_; - const WordID kXCAT; - const TRulePtr kWORDBREAK_RULE; - const TRulePtr kTEMPLATE_RULE; - const TRulePtr kGOAL_RULE; - const int kFUGEN_S; - const int kFUGEN_N; -}; - -CompoundSplit::CompoundSplit(const 
boost::program_options::variables_map& conf) : - pimpl_(new CompoundSplitImpl(conf)) {} - -static void SplitUTF8String(const string& in, vector* out) { - out->resize(in.size()); - int i = 0; - int c = 0; - while (i < in.size()) { - const int len = UTF8Len(in[i]); - assert(len); - (*out)[c] = in.substr(i, len); - ++c; - i += len; - } - out->resize(c); -} - -bool CompoundSplit::Translate(const string& input, - SentenceMetadata* smeta, - const vector& weights, - Hypergraph* forest) { - if (input.find(" ") != string::npos) { - cerr << " BAD INPUT: " << input << "\n CompoundSplit expects single words\n"; - abort(); - } - vector in; - SplitUTF8String(input, &in); - smeta->SetSourceLength(in.size()); // TODO do utf8 or somethign - for (int i = 0; i < in.size(); ++i) - smeta->src_lattice_.push_back(vector(1, LatticeArc(TD::Convert(in[i]), 0.0, 1))); - pimpl_->BuildTrellis(in, forest); - forest->Reweight(weights); - return true; -} - -int CompoundSplit::GetFullWordEdgeIndex(const Hypergraph& forest) { - assert(forest.nodes_.size() > 0); - const vector out_edges = forest.nodes_[0].out_edges_; - int max_edge = -1; - int max_j = -1; - for (int i = 0; i < out_edges.size(); ++i) { - const int j = forest.edges_[out_edges[i]].j_; - if (j > max_j) { - max_j = j; - max_edge = out_edges[i]; - } - } - assert(max_edge >= 0); - assert(max_edge < forest.edges_.size()); - return max_edge; -} - diff --git a/src/csplit.h b/src/csplit.h deleted file mode 100644 index ce6295c1..00000000 --- a/src/csplit.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _CSPLIT_H_ -#define _CSPLIT_H_ - -#include "translator.h" -#include "lattice.h" - -// this "translator" takes single words (with NO SPACES) and segments -// them using the approach described in: -// -// C. Dyer. (2009) Using a maximum entropy model to build segmentation -// lattices for MT. In Proceedings of NAACL HLT 2009. -// note, an extra word space marker # is inserted at the left edge of -// the forest! 
-struct CompoundSplitImpl; -struct CompoundSplit : public Translator { - CompoundSplit(const boost::program_options::variables_map& conf); - bool Translate(const std::string& input, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* forest); - - // given a forest generated by CompoundSplit::Translate, - // find the edge representing the unsegmented form - static int GetFullWordEdgeIndex(const Hypergraph& forest); - - private: - boost::shared_ptr pimpl_; -}; - -#endif diff --git a/src/dict.h b/src/dict.h deleted file mode 100644 index bae9debe..00000000 --- a/src/dict.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef DICT_H_ -#define DICT_H_ - -#include -#include -#include -#include -#include - -#include - -#include "wordid.h" - -class Dict { - typedef std::tr1::unordered_map > Map; - public: - Dict() : b0_("") { words_.reserve(1000); } - inline int max() const { return words_.size(); } - inline WordID Convert(const std::string& word) { - Map::iterator i = d_.find(word); - if (i == d_.end()) { - words_.push_back(word); - d_[word] = words_.size(); - return words_.size(); - } else { - return i->second; - } - } - inline const std::string& Convert(const WordID& id) const { - if (id == 0) return b0_; - assert(id <= words_.size()); - return words_[id-1]; - } - private: - const std::string b0_; - std::vector words_; - Map d_; -}; - -#endif diff --git a/src/dict_test.cc b/src/dict_test.cc deleted file mode 100644 index 5c5d84f0..00000000 --- a/src/dict_test.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "dict.h" - -#include -#include - -class DTest : public testing::Test { - public: - DTest() {} - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -TEST_F(DTest, Convert) { - Dict d; - WordID a = d.Convert("foo"); - WordID b = d.Convert("bar"); - std::string x = "foo"; - WordID c = d.Convert(x); - EXPECT_NE(a, b); - EXPECT_EQ(a, c); - EXPECT_EQ(d.Convert(a), "foo"); - EXPECT_EQ(d.Convert(b), "bar"); -} - -int main(int argc, char** argv) { - 
testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/src/earley_composer.cc b/src/earley_composer.cc deleted file mode 100644 index a59686e0..00000000 --- a/src/earley_composer.cc +++ /dev/null @@ -1,726 +0,0 @@ -#include "earley_composer.h" - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "phrasetable_fst.h" -#include "sparse_vector.h" -#include "tdict.h" -#include "hg.h" - -using boost::shared_ptr; -namespace po = boost::program_options; -using namespace std; -using namespace std::tr1; - -// Define the following macro if you want to see lots of debugging output -// when you run the chart parser -#undef DEBUG_CHART_PARSER - -// A few constants used by the chart parser /////////////// -static const int kMAX_NODES = 2000000; -static const string kPHRASE_STRING = "X"; -static bool constants_need_init = true; -static WordID kUNIQUE_START; -static WordID kPHRASE; -static TRulePtr kX1X2; -static TRulePtr kX1; -static WordID kEPS; -static TRulePtr kEPSRule; - -static void InitializeConstants() { - if (constants_need_init) { - kPHRASE = TD::Convert(kPHRASE_STRING) * -1; - kUNIQUE_START = TD::Convert("S") * -1; - kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); - kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); - kEPSRule.reset(new TRule("[X] ||| ||| ")); - kEPS = TD::Convert(""); - constants_need_init = false; - } -} -//////////////////////////////////////////////////////////// - -class EGrammarNode { - friend bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); - friend void AddGrammarRule(const string& r, map* g); - public: -#ifdef DEBUG_CHART_PARSER - string hint; -#endif - EGrammarNode() : is_some_rule_complete(false), is_root(false) {} - const map& GetTerminals() const { return tptr; } - const map& GetNonTerminals() const { return ntptr; } - bool HasNonTerminals() const { return (!ntptr.empty()); } - bool HasTerminals() const { return 
(!tptr.empty()); } - bool RuleCompletes() const { - return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); - } - bool GrammarContinues() const { - return !(ntptr.empty() && tptr.empty()); - } - bool IsRoot() const { - return is_root; - } - // these are the features associated with the rule from the start - // node up to this point. If you use these features, you must - // not Extend() this rule. - const SparseVector& GetCFGProductionFeatures() const { - return input_features; - } - - const EGrammarNode* Extend(const WordID& t) const { - if (t < 0) { - map::const_iterator it = ntptr.find(t); - if (it == ntptr.end()) return NULL; - return &it->second; - } else { - map::const_iterator it = tptr.find(t); - if (it == tptr.end()) return NULL; - return &it->second; - } - } - - private: - map tptr; - map ntptr; - SparseVector input_features; - bool is_some_rule_complete; - bool is_root; -}; -typedef map EGrammar; // indexed by the rule LHS - -// edges are immutable once created -struct Edge { -#ifdef DEBUG_CHART_PARSER - static int id_count; - const int id; -#endif - const WordID cat; // lhs side of rule proved/being proved - const EGrammarNode* const dot; // dot position - const FSTNode* const q; // start of span - const FSTNode* const r; // end of span - const Edge* const active_parent; // back pointer, NULL for PREDICT items - const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items - const TargetPhraseSet* const tps; // translations - shared_ptr > features; // features from CFG rule - - bool IsPassive() const { - // when a rule is completed, this value will be set - return static_cast(features); - } - bool IsActive() const { return !IsPassive(); } - bool IsInitial() const { - return !(active_parent || passive_parent); - } - bool IsCreatedByScan() const { - return active_parent && !passive_parent && !dot->IsRoot(); - } - bool IsCreatedByPredict() const { - return dot->IsRoot(); - } - bool IsCreatedByComplete() const { - return 
active_parent && passive_parent; - } - - // constructor for PREDICT - Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r) : -#ifdef DEBUG_CHART_PARSER - id(++id_count), -#endif - cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} - Edge(WordID c, const EGrammarNode* d, const FSTNode* q_and_r, const Edge* act_parent) : -#ifdef DEBUG_CHART_PARSER - id(++id_count), -#endif - cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps(NULL) {} - - // constructors for SCAN - Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, - const Edge* act_par, const TargetPhraseSet* translations) : -#ifdef DEBUG_CHART_PARSER - id(++id_count), -#endif - cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} - - Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, - const Edge* act_par, const TargetPhraseSet* translations, - const SparseVector& feats) : -#ifdef DEBUG_CHART_PARSER - id(++id_count), -#endif - cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), - features(new SparseVector(feats)) {} - - // constructors for COMPLETE - Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, - const Edge* act_par, const Edge *pas_par) : -#ifdef DEBUG_CHART_PARSER - id(++id_count), -#endif - cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL) { - assert(pas_par->IsPassive()); - assert(act_par->IsActive()); - } - - Edge(WordID c, const EGrammarNode* d, const FSTNode* i, const FSTNode* j, - const Edge* act_par, const Edge *pas_par, const SparseVector& feats) : -#ifdef DEBUG_CHART_PARSER - id(++id_count), -#endif - cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(NULL), - features(new SparseVector(feats)) { - assert(pas_par->IsPassive()); - assert(act_par->IsActive()); - } - - // constructor for 
COMPLETE query - Edge(const FSTNode* _r) : -#ifdef DEBUG_CHART_PARSER - id(0), -#endif - cat(0), dot(NULL), q(NULL), - r(_r), active_parent(NULL), passive_parent(NULL), tps(NULL) {} - // constructor for MERGE quere - Edge(const FSTNode* _q, int) : -#ifdef DEBUG_CHART_PARSER - id(0), -#endif - cat(0), dot(NULL), q(_q), - r(NULL), active_parent(NULL), passive_parent(NULL), tps(NULL) {} -}; -#ifdef DEBUG_CHART_PARSER -int Edge::id_count = 0; -#endif - -ostream& operator<<(ostream& os, const Edge& e) { - string type = "PREDICT"; - if (e.IsCreatedByScan()) - type = "SCAN"; - else if (e.IsCreatedByComplete()) - type = "COMPLETE"; - os << "[" -#ifdef DEBUG_CHART_PARSER - << '(' << e.id << ") " -#else - << '(' << &e << ") " -#endif - << "q=" << e.q << ", r=" << e.r - << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" - << e.dot -#ifdef DEBUG_CHART_PARSER - << e.dot->hint -#endif - << (e.IsActive() ? ", Active" : ", Passive") - << ", " << type; -#ifdef DEBUG_CHART_PARSER - if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } - if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } -#endif - if (e.tps) { os << ", tps=" << e.tps; } - return os << ']'; -} - -struct Traversal { - const Edge* const edge; // result from the active / passive combination - const Edge* const active; - const Edge* const passive; - Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} -}; - -struct UniqueTraversalHash { - size_t operator()(const Traversal* t) const { - size_t x = 5381; - x = ((x << 5) + x) ^ reinterpret_cast(t->active); - x = ((x << 5) + x) ^ reinterpret_cast(t->passive); - x = ((x << 5) + x) ^ t->edge->IsActive(); - return x; - } -}; - -struct UniqueTraversalEquals { - size_t operator()(const Traversal* a, const Traversal* b) const { - return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); - } -}; - -struct UniqueEdgeHash { - size_t 
operator()(const Edge* e) const { - size_t x = 5381; - if (e->IsActive()) { - x = ((x << 5) + x) ^ reinterpret_cast(e->dot); - x = ((x << 5) + x) ^ reinterpret_cast(e->q); - x = ((x << 5) + x) ^ reinterpret_cast(e->r); - x = ((x << 5) + x) ^ static_cast(e->cat); - x += 13; - } else { // with passive edges, we don't care about the dot - x = ((x << 5) + x) ^ reinterpret_cast(e->q); - x = ((x << 5) + x) ^ reinterpret_cast(e->r); - x = ((x << 5) + x) ^ static_cast(e->cat); - } - return x; - } -}; - -struct UniqueEdgeEquals { - bool operator()(const Edge* a, const Edge* b) const { - if (a->IsActive() != b->IsActive()) return false; - if (a->IsActive()) { - return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); - } else { - return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); - } - } -}; - -struct REdgeHash { - size_t operator()(const Edge* e) const { - size_t x = 5381; - x = ((x << 5) + x) ^ reinterpret_cast(e->r); - return x; - } -}; - -struct REdgeEquals { - bool operator()(const Edge* a, const Edge* b) const { - return (a->r == b->r); - } -}; - -struct QEdgeHash { - size_t operator()(const Edge* e) const { - size_t x = 5381; - x = ((x << 5) + x) ^ reinterpret_cast(e->q); - return x; - } -}; - -struct QEdgeEquals { - bool operator()(const Edge* a, const Edge* b) const { - return (a->q == b->q); - } -}; - -struct EdgeQueue { - queue q; - EdgeQueue() {} - void clear() { while(!q.empty()) q.pop(); } - bool HasWork() const { return !q.empty(); } - const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } - void AddEdge(const Edge* s) { q.push(s); } -}; - -class EarleyComposerImpl { - public: - EarleyComposerImpl(WordID start_cat, const FSTNode& q_0) : start_cat_(start_cat), q_0_(&q_0) {} - - // returns false if the intersection is empty - bool Compose(const EGrammar& g, Hypergraph* forest) { - goal_node = NULL; - EGrammar::const_iterator sit = g.find(start_cat_); - forest->ReserveNodes(kMAX_NODES); - assert(sit != 
g.end()); - Edge* init = new Edge(start_cat_, &sit->second, q_0_); - assert(IncorporateNewEdge(init)); - while (exp_agenda.HasWork() || agenda.HasWork()) { - while(exp_agenda.HasWork()) { - const Edge* edge = exp_agenda.Next(); - FinishEdge(edge, forest); - } - if (agenda.HasWork()) { - const Edge* edge = agenda.Next(); -#ifdef DEBUG_CHART_PARSER - cerr << "processing (" << edge->id << ')' << endl; -#endif - if (edge->IsActive()) { - if (edge->dot->HasTerminals()) - DoScan(edge); - if (edge->dot->HasNonTerminals()) { - DoMergeWithPassives(edge); - DoPredict(edge, g); - } - } else { - DoComplete(edge); - } - } - } - if (goal_node) { - forest->PruneUnreachable(goal_node->id_); - forest->EpsilonRemove(kEPS); - } - FreeAll(); - return goal_node; - } - - void FreeAll() { - for (int i = 0; i < free_list_.size(); ++i) - delete free_list_[i]; - free_list_.clear(); - for (int i = 0; i < traversal_free_list_.size(); ++i) - delete traversal_free_list_[i]; - traversal_free_list_.clear(); - all_traversals.clear(); - exp_agenda.clear(); - agenda.clear(); - tps2node.clear(); - edge2node.clear(); - all_edges.clear(); - passive_edges.clear(); - active_edges.clear(); - } - - ~EarleyComposerImpl() { - FreeAll(); - } - - // returns the total number of edges created during composition - int EdgesCreated() const { - return free_list_.size(); - } - - private: - void DoScan(const Edge* edge) { - // here, we assume that the FST will potentially have many more outgoing - // edges than the grammar, which will be just a couple. If you want to - // efficiently handle the case where both are relatively large, this code - // will need to change how the intersection is done. The best general - // solution would probably be the Baeza-Yates double binary search. 
- - const EGrammarNode* dot = edge->dot; - const FSTNode* r = edge->r; - const map& terms = dot->GetTerminals(); - for (map::const_iterator git = terms.begin(); - git != terms.end(); ++git) { - const FSTNode* next_r = r->Extend(git->first); - if (!next_r) continue; - const EGrammarNode* next_dot = &git->second; - const bool grammar_continues = next_dot->GrammarContinues(); - const bool rule_completes = next_dot->RuleCompletes(); - assert(grammar_continues || rule_completes); - const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); - // create up to 4 new edges! - if (next_r->HasOutgoingNonEpsilonEdges()) { // are there further symbols in the FST? - const TargetPhraseSet* translations = NULL; - if (rule_completes) - IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations, input_features)); - if (grammar_continues) - IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, translations)); - } - if (next_r->HasData()) { // indicates a loop back to q_0 in the FST - const TargetPhraseSet* translations = next_r->GetTranslations(); - if (rule_completes) - IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations, input_features)); - if (grammar_continues) - IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, q_0_, edge, translations)); - } - } - } - - void DoPredict(const Edge* edge, const EGrammar& g) { - const EGrammarNode* dot = edge->dot; - const map& non_terms = dot->GetNonTerminals(); - for (map::const_iterator git = non_terms.begin(); - git != non_terms.end(); ++git) { - const WordID nt_to_predict = git->first; - //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; - EGrammar::const_iterator egi = g.find(nt_to_predict); - if (egi == g.end()) { - cerr << "[ERROR] Can't find any grammar rules with a LHS of type " - << TD::Convert(-1*nt_to_predict) << '!' 
<< endl; - continue; - } - assert(edge->IsActive()); - const EGrammarNode* new_dot = &egi->second; - Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); - IncorporateNewEdge(new_edge); - } - } - - void DoComplete(const Edge* passive) { -#ifdef DEBUG_CHART_PARSER - cerr << " complete: " << *passive << endl; -#endif - const WordID completed_nt = passive->cat; - const FSTNode* q = passive->q; - const FSTNode* next_r = passive->r; - const Edge query(q); - const pair::iterator, - unordered_multiset::iterator > p = - active_edges.equal_range(&query); - for (unordered_multiset::iterator it = p.first; - it != p.second; ++it) { - const Edge* active = *it; -#ifdef DEBUG_CHART_PARSER - cerr << " pos: " << *active << endl; -#endif - const EGrammarNode* next_dot = active->dot->Extend(completed_nt); - if (!next_dot) continue; - const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); - // add up to 2 rules - if (next_dot->RuleCompletes()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); - if (next_dot->GrammarContinues()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); - } - } - - void DoMergeWithPassives(const Edge* active) { - // edge is active, has non-terminals, we need to find the passives that can extend it - assert(active->IsActive()); - assert(active->dot->HasNonTerminals()); -#ifdef DEBUG_CHART_PARSER - cerr << " merge active with passives: ACT=" << *active << endl; -#endif - const Edge query(active->r, 1); - const pair::iterator, - unordered_multiset::iterator > p = - passive_edges.equal_range(&query); - for (unordered_multiset::iterator it = p.first; - it != p.second; ++it) { - const Edge* passive = *it; - const EGrammarNode* next_dot = active->dot->Extend(passive->cat); - if (!next_dot) continue; - const FSTNode* next_r = passive->r; - const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); - if 
(next_dot->RuleCompletes()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); - if (next_dot->GrammarContinues()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); - } - } - - // take ownership of edge memory, add to various indexes, etc - // returns true if this edge is new - bool IncorporateNewEdge(Edge* edge) { - free_list_.push_back(edge); - if (edge->passive_parent && edge->active_parent) { - Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); - traversal_free_list_.push_back(t); - if (all_traversals.find(t) != all_traversals.end()) { - return false; - } else { - all_traversals.insert(t); - } - } - exp_agenda.AddEdge(edge); - return true; - } - - bool FinishEdge(const Edge* edge, Hypergraph* hg) { - bool is_new = false; - if (all_edges.find(edge) == all_edges.end()) { -#ifdef DEBUG_CHART_PARSER - cerr << *edge << " is NEW\n"; -#endif - all_edges.insert(edge); - is_new = true; - if (edge->IsPassive()) passive_edges.insert(edge); - if (edge->IsActive()) active_edges.insert(edge); - agenda.AddEdge(edge); - } else { -#ifdef DEBUG_CHART_PARSER - cerr << *edge << " is NOT NEW.\n"; -#endif - } - AddEdgeToTranslationForest(edge, hg); - return is_new; - } - - // build the translation forest - void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { - assert(hg->nodes_.size() < kMAX_NODES); - Hypergraph::Node* tps = NULL; - // first add any target language rules - if (edge->tps) { - Hypergraph::Node*& node = tps2node[(size_t)edge->tps]; - if (!node) { - // cerr << "Creating phrases for " << edge->tps << endl; - const vector& rules = edge->tps->GetRules(); - node = hg->AddNode(kPHRASE, ""); - for (int i = 0; i < rules.size(); ++i) { - Hypergraph::Edge* hg_edge = hg->AddEdge(rules[i], Hypergraph::TailNodeVector()); - hg_edge->feature_values_ += rules[i]->GetFeatureValues(); - hg->ConnectEdgeToHeadNode(hg_edge, node); - } - } - tps = node; 
- } - Hypergraph::Node*& head_node = edge2node[edge]; - if (!head_node) - head_node = hg->AddNode(kPHRASE, ""); - if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_0_ && edge->IsPassive()) { - assert(goal_node == NULL || goal_node == head_node); - goal_node = head_node; - } - Hypergraph::TailNodeVector tail; - SparseVector extra; - if (edge->IsCreatedByPredict()) { - // extra.set_value(FD::Convert("predict"), 1); - } else if (edge->IsCreatedByScan()) { - tail.push_back(edge2node[edge->active_parent]->id_); - if (tps) { - tail.push_back(tps->id_); - } - //extra.set_value(FD::Convert("scan"), 1); - } else if (edge->IsCreatedByComplete()) { - tail.push_back(edge2node[edge->active_parent]->id_); - tail.push_back(edge2node[edge->passive_parent]->id_); - //extra.set_value(FD::Convert("complete"), 1); - } else { - assert(!"unexpected edge type!"); - } - //cerr << head_node->id_ << "<--" << *edge << endl; - -#ifdef DEBUG_CHART_PARSER - for (int i = 0; i < tail.size(); ++i) - if (tail[i] == head_node->id_) { - cerr << "ERROR: " << *edge << "\n i=" << i << endl; - if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } - if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } - assert(!"self-loop found!"); - } -#endif - Hypergraph::Edge* hg_edge = NULL; - if (tail.size() == 0) { - hg_edge = hg->AddEdge(kEPSRule, tail); - } else if (tail.size() == 1) { - hg_edge = hg->AddEdge(kX1, tail); - } else if (tail.size() == 2) { - hg_edge = hg->AddEdge(kX1X2, tail); - } - if (edge->features) - hg_edge->feature_values_ += *edge->features; - hg_edge->feature_values_ += extra; - hg->ConnectEdgeToHeadNode(hg_edge, head_node); - } - - Hypergraph::Node* goal_node; - EdgeQueue exp_agenda; - EdgeQueue agenda; - unordered_map tps2node; - unordered_map edge2node; - unordered_set all_traversals; - unordered_set all_edges; - unordered_multiset passive_edges; - unordered_multiset active_edges; - vector free_list_; - vector traversal_free_list_; - const WordID 
start_cat_; - const FSTNode* const q_0_; -}; - -#ifdef DEBUG_CHART_PARSER -static string TrimRule(const string& r) { - size_t start = r.find(" |||") + 5; - size_t end = r.rfind(" |||"); - return r.substr(start, end - start); -} -#endif - -void AddGrammarRule(const string& r, EGrammar* g) { - const size_t pos = r.find(" ||| "); - if (pos == string::npos || r[0] != '[') { - cerr << "Bad rule: " << r << endl; - return; - } - const size_t rpos = r.rfind(" ||| "); - string feats; - string rs = r; - if (rpos != pos) { - feats = r.substr(rpos + 5); - rs = r.substr(0, rpos); - } - string rhs = rs.substr(pos + 5); - string trule = rs + " ||| " + rhs + " ||| " + feats; - TRule tr(trule); -#ifdef DEBUG_CHART_PARSER - string hint_last_rule; -#endif - EGrammarNode* cur = &(*g)[tr.GetLHS()]; - cur->is_root = true; - for (int i = 0; i < tr.FLength(); ++i) { - WordID sym = tr.f()[i]; -#ifdef DEBUG_CHART_PARSER - hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); - cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); -#endif - if (sym < 0) - cur = &cur->ntptr[sym]; - else - cur = &cur->tptr[sym]; - } -#ifdef DEBUG_CHART_PARSER - cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); -#endif - cur->is_some_rule_complete = true; - cur->input_features = tr.GetFeatureValues(); -} - -EarleyComposer::~EarleyComposer() { - delete pimpl_; -} - -EarleyComposer::EarleyComposer(const FSTNode* fst) { - InitializeConstants(); - pimpl_ = new EarleyComposerImpl(kUNIQUE_START, *fst); -} - -bool EarleyComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { - // first, convert the src forest into an EGrammar - EGrammar g; - const int nedges = src_forest.edges_.size(); - const int nnodes = src_forest.nodes_.size(); - vector cats(nnodes); - bool assign_cats = false; - for (int i = 0; i < nnodes; ++i) - if (assign_cats) { - cats[i] = TD::Convert("CAT_" + boost::lexical_cast(i)) * -1; - } else { - cats[i] = src_forest.nodes_[i].cat_; - } - // 
construct the grammar - for (int i = 0; i < nedges; ++i) { - const Hypergraph::Edge& edge = src_forest.edges_[i]; - const vector& src = edge.rule_->f(); - EGrammarNode* cur = &g[cats[edge.head_node_]]; - cur->is_root = true; - int ntc = 0; - for (int j = 0; j < src.size(); ++j) { - WordID sym = src[j]; - if (sym <= 0) { - sym = cats[edge.tail_nodes_[ntc]]; - ++ntc; - cur = &cur->ntptr[sym]; - } else { - cur = &cur->tptr[sym]; - } - } - cur->is_some_rule_complete = true; - cur->input_features = edge.feature_values_; - } - EGrammarNode& goal_rule = g[kUNIQUE_START]; - assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || - (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); - - return pimpl_->Compose(g, trg_forest); -} - -bool EarleyComposer::Compose(istream* in, Hypergraph* trg_forest) { - EGrammar g; - while(*in) { - string line; - getline(*in, line); - if (line.empty()) continue; - AddGrammarRule(line, &g); - } - - return pimpl_->Compose(g, trg_forest); -} diff --git a/src/earley_composer.h b/src/earley_composer.h deleted file mode 100644 index 9f786bf6..00000000 --- a/src/earley_composer.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _EARLEY_COMPOSER_H_ -#define _EARLEY_COMPOSER_H_ - -#include - -class EarleyComposerImpl; -class FSTNode; -class Hypergraph; - -class EarleyComposer { - public: - ~EarleyComposer(); - EarleyComposer(const FSTNode* phrasetable_root); - bool Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); - - // reads the grammar from a file. There must be a single top-level - // S -> X rule. Anything else is possible. Format is: - // [S] ||| [SS,1] - // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 - // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 - // [NP] ||| [DET,1] [N,2] ||| Feature3=2 - // ... 
- bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); - - private: - EarleyComposerImpl* pimpl_; -}; - -#endif diff --git a/src/exp_semiring.h b/src/exp_semiring.h deleted file mode 100644 index f91beee4..00000000 --- a/src/exp_semiring.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef _EXP_SEMIRING_H_ -#define _EXP_SEMIRING_H_ - -#include - -// this file implements the first-order expectation semiring described -// in Li & Eisner (EMNLP 2009) - -// requirements: -// RType * RType ==> RType -// PType * PType ==> PType -// RType * PType ==> RType -// good examples: -// PType scalar, RType vector -// BAD examples: -// PType vector, RType scalar -template -struct PRPair { - PRPair() : p(), r() {} - // Inside algorithm requires that T(0) and T(1) - // return the 0 and 1 values of the semiring - explicit PRPair(double x) : p(x), r() {} - PRPair(const PType& p, const RType& r) : p(p), r(r) {} - PRPair& operator+=(const PRPair& o) { - p += o.p; - r += o.r; - return *this; - } - PRPair& operator*=(const PRPair& o) { - r = (o.r * p) + (o.p * r); - p *= o.p; - return *this; - } - PType p; - RType r; -}; - -template -std::ostream& operator<<(std::ostream& o, const PRPair& x) { - return o << '<' << x.p << ", " << x.r << '>'; -} - -template -const PRPair operator+(const PRPair& a, const PRPair& b) { - PRPair result = a; - result += b; - return result; -} - -template -const PRPair operator*(const PRPair& a, const PRPair& b) { - PRPair result = a; - result *= b; - return result; -} - -template -struct PRWeightFunction { - explicit PRWeightFunction(const PWeightFunction& pwf = PWeightFunction(), - const RWeightFunction& rwf = RWeightFunction()) : - pweight(pwf), rweight(rwf) {} - PRPair operator()(const Hypergraph::Edge& e) const { - const P p = pweight(e); - const R r = rweight(e); - return PRPair(p, r * p); - } - const PWeightFunction pweight; - const RWeightFunction rweight; -}; - -#endif diff --git a/src/fdict.cc b/src/fdict.cc deleted file mode 100644 index 
83aa7cea..00000000 --- a/src/fdict.cc +++ /dev/null @@ -1,4 +0,0 @@ -#include "fdict.h" - -Dict FD::dict_; - diff --git a/src/fdict.h b/src/fdict.h deleted file mode 100644 index ff491cfb..00000000 --- a/src/fdict.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _FDICT_H_ -#define _FDICT_H_ - -#include -#include -#include "dict.h" - -struct FD { - static Dict dict_; - static inline int NumFeats() { - return dict_.max() + 1; - } - static inline WordID Convert(const std::string& s) { - return dict_.Convert(s); - } - static inline const std::string& Convert(const WordID& w) { - return dict_.Convert(w); - } -}; - -#endif diff --git a/src/ff.cc b/src/ff.cc deleted file mode 100644 index 2ae5b9eb..00000000 --- a/src/ff.cc +++ /dev/null @@ -1,114 +0,0 @@ -#include "ff.h" - -#include "tdict.h" -#include "hg.h" - -using namespace std; - -FeatureFunction::~FeatureFunction() {} - - -void FeatureFunction::FinalTraversalFeatures(const void* ant_state, - SparseVector* features) const { - (void) ant_state; - (void) features; -} - -// Hiero and Joshua use log_10(e) as the value, so I do to -WordPenalty::WordPenalty(const string& param) : - fid_(FD::Convert("WordPenalty")), - value_(-1.0 / log(10)) { - if (!param.empty()) { - cerr << "Warning WordPenalty ignoring parameter: " << param << endl; - } -} - -void WordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - (void) smeta; - (void) ant_states; - (void) state; - (void) estimated_features; - features->set_value(fid_, edge.rule_->EWords() * value_); -} - -SourceWordPenalty::SourceWordPenalty(const string& param) : - fid_(FD::Convert("SourceWordPenalty")), - value_(-1.0 / log(10)) { - if (!param.empty()) { - cerr << "Warning SourceWordPenalty ignoring parameter: " << param << endl; - } -} - -void SourceWordPenalty::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const 
Hypergraph::Edge& edge, - const std::vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - (void) smeta; - (void) ant_states; - (void) state; - (void) estimated_features; - features->set_value(fid_, edge.rule_->FWords() * value_); -} - -ModelSet::ModelSet(const vector& w, const vector& models) : - models_(models), - weights_(w), - state_size_(0), - model_state_pos_(models.size()) { - for (int i = 0; i < models_.size(); ++i) { - model_state_pos_[i] = state_size_; - state_size_ += models_[i]->NumBytesContext(); - } -} - -void ModelSet::AddFeaturesToEdge(const SentenceMetadata& smeta, - const Hypergraph& hg, - Hypergraph::Edge* edge, - string* context, - prob_t* combination_cost_estimate) const { - context->resize(state_size_); - memset(&(*context)[0], 0, state_size_); - SparseVector est_vals; // only computed if combination_cost_estimate is non-NULL - if (combination_cost_estimate) *combination_cost_estimate = prob_t::One(); - for (int i = 0; i < models_.size(); ++i) { - const FeatureFunction& ff = *models_[i]; - void* cur_ff_context = NULL; - vector ants(edge->tail_nodes_.size()); - bool has_context = ff.NumBytesContext() > 0; - if (has_context) { - int spos = model_state_pos_[i]; - cur_ff_context = &(*context)[spos]; - for (int i = 0; i < ants.size(); ++i) { - ants[i] = &hg.nodes_[edge->tail_nodes_[i]].state_[spos]; - } - } - ff.TraversalFeatures(smeta, *edge, ants, &edge->feature_values_, &est_vals, cur_ff_context); - } - if (combination_cost_estimate) - combination_cost_estimate->logeq(est_vals.dot(weights_)); - edge->edge_prob_.logeq(edge->feature_values_.dot(weights_)); -} - -void ModelSet::AddFinalFeatures(const std::string& state, Hypergraph::Edge* edge) const { - assert(1 == edge->rule_->Arity()); - - for (int i = 0; i < models_.size(); ++i) { - const FeatureFunction& ff = *models_[i]; - const void* ant_state = NULL; - bool has_context = ff.NumBytesContext() > 0; - if (has_context) { - int spos = 
model_state_pos_[i]; - ant_state = &state[spos]; - } - ff.FinalTraversalFeatures(ant_state, &edge->feature_values_); - } - edge->edge_prob_.logeq(edge->feature_values_.dot(weights_)); -} - diff --git a/src/ff.h b/src/ff.h deleted file mode 100644 index e962b4ba..00000000 --- a/src/ff.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef _FF_H_ -#define _FF_H_ - -#include - -#include "fdict.h" -#include "hg.h" - -class SentenceMetadata; -class FeatureFunction; // see definition below - -// if you want to develop a new feature, inherit from this class and -// override TraversalFeaturesImpl(...). If it's a feature that returns / -// depends on context, you may also need to implement -// FinalTraversalFeatures(...) -class FeatureFunction { - public: - FeatureFunction() : state_size_() {} - explicit FeatureFunction(int state_size) : state_size_(state_size) {} - virtual ~FeatureFunction(); - - // returns the number of bytes of context that this feature function will - // (maximally) use. By default, 0 ("stateless" models in Hiero/Joshua). - // NOTE: this value is fixed for the instance of your class, you cannot - // use different amounts of memory for different nodes in the forest. - inline int NumBytesContext() const { return state_size_; } - - // Compute the feature values and (if this applies) the estimates of the - // feature values when this edge is used incorporated into a larger context - inline void TraversalFeatures(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_state) const { - TraversalFeaturesImpl(smeta, edge, ant_contexts, - features, estimated_features, out_state); - // TODO it's easy for careless feature function developers to overwrite - // the end of their state and clobber someone else's memory. These bugs - // will be horrendously painful to track down. 
There should be some - // optional strict mode that's enforced here that adds some kind of - // barrier between the blocks reserved for the residual contexts - } - - // if there's some state left when you transition to the goal state, score - // it here. For example, the language model computes the cost of adding - // and . - virtual void FinalTraversalFeatures(const void* residual_state, - SparseVector* final_features) const; - - protected: - // context is a pointer to a buffer of size NumBytesContext() that the - // feature function can write its state to. It's up to the feature function - // to determine how much space it needs and to determine how to encode its - // residual contextual information since it is OPAQUE to all clients outside - // of the particular FeatureFunction class. There is one exception: - // equality of the contents (i.e., memcmp) is required to determine whether - // two states can be combined. - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const = 0; - - // !!! ONLY call this from subclass *CONSTRUCTORS* !!! 
- void SetStateSize(size_t state_size) { - state_size_ = state_size; - } - - private: - int state_size_; -}; - -// word penalty feature, for each word on the E side of a rule, -// add value_ -class WordPenalty : public FeatureFunction { - public: - WordPenalty(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - private: - const int fid_; - const double value_; -}; - -class SourceWordPenalty : public FeatureFunction { - public: - SourceWordPenalty(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - private: - const int fid_; - const double value_; -}; - -// this class is a set of FeatureFunctions that can be used to score, rescore, -// etc. a (translation?) forest -class ModelSet { - public: - ModelSet() : state_size_(0) {} - - ModelSet(const std::vector& weights, - const std::vector& models); - - // sets edge->feature_values_ and edge->edge_prob_ - // NOTE: edge must not necessarily be in hg.edges_ but its TAIL nodes - // must be. 
- void AddFeaturesToEdge(const SentenceMetadata& smeta, - const Hypergraph& hg, - Hypergraph::Edge* edge, - std::string* residual_context, - prob_t* combination_cost_estimate = NULL) const; - - void AddFinalFeatures(const std::string& residual_context, - Hypergraph::Edge* edge) const; - - bool empty() const { return models_.empty(); } - private: - std::vector models_; - std::vector weights_; - int state_size_; - std::vector model_state_pos_; -}; - -#endif diff --git a/src/ff_csplit.cc b/src/ff_csplit.cc deleted file mode 100644 index cac4bb8e..00000000 --- a/src/ff_csplit.cc +++ /dev/null @@ -1,212 +0,0 @@ -#include "ff_csplit.h" - -#include -#include - -#include "Vocab.h" -#include "Ngram.h" - -#include "sentence_metadata.h" -#include "lattice.h" -#include "tdict.h" -#include "freqdict.h" -#include "filelib.h" -#include "stringlib.h" -#include "tdict.h" - -using namespace std; - -struct BasicCSplitFeaturesImpl { - BasicCSplitFeaturesImpl(const string& param) : - word_count_(FD::Convert("WordCount")), - letters_sq_(FD::Convert("LettersSq")), - letters_sqrt_(FD::Convert("LettersSqrt")), - in_dict_(FD::Convert("InDict")), - short_(FD::Convert("Short")), - long_(FD::Convert("Long")), - oov_(FD::Convert("OOV")), - short_range_(FD::Convert("ShortRange")), - high_freq_(FD::Convert("HighFreq")), - med_freq_(FD::Convert("MedFreq")), - freq_(FD::Convert("Freq")), - fl1_(FD::Convert("FreqLen1")), - fl2_(FD::Convert("FreqLen2")), - bad_(FD::Convert("Bad")) { - vector argv; - int argc = SplitOnWhitespace(param, &argv); - if (argc != 1 && argc != 2) { - cerr << "Expected: freqdict.txt [badwords.txt]\n"; - abort(); - } - freq_dict_.Load(argv[0]); - if (argc == 2) { - ReadFile rf(argv[1]); - istream& in = *rf.stream(); - while(in) { - string badword; - in >> badword; - if (badword.empty()) continue; - bad_words_.insert(TD::Convert(badword)); - } - } - } - - void TraversalFeaturesImpl(const Hypergraph::Edge& edge, - SparseVector* features) const; - - const int word_count_; - const 
int letters_sq_; - const int letters_sqrt_; - const int in_dict_; - const int short_; - const int long_; - const int oov_; - const int short_range_; - const int high_freq_; - const int med_freq_; - const int freq_; - const int fl1_; - const int fl2_; - const int bad_; - FreqDict freq_dict_; - set bad_words_; -}; - -BasicCSplitFeatures::BasicCSplitFeatures(const string& param) : - pimpl_(new BasicCSplitFeaturesImpl(param)) {} - -void BasicCSplitFeaturesImpl::TraversalFeaturesImpl( - const Hypergraph::Edge& edge, - SparseVector* features) const { - features->set_value(word_count_, 1.0); - features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_)); - features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_)); - const WordID word = edge.rule_->e_[1]; - const char* sword = TD::Convert(word); - const int len = strlen(sword); - int cur = 0; - int chars = 0; - while(cur < len) { - cur += UTF8Len(sword[cur]); - ++chars; - } - - // these are corrections that attempt to make chars - // more like a phoneme count than a letter count, they - // are only really meaningful for german and should - // probably be gotten rid of - bool has_sch = strstr(sword, "sch"); - bool has_ch = (!has_sch && strstr(sword, "ch")); - bool has_ie = strstr(sword, "ie"); - bool has_zw = strstr(sword, "zw"); - if (has_sch) chars -= 2; - if (has_ch) --chars; - if (has_ie) --chars; - if (has_zw) --chars; - - float freq = freq_dict_.LookUp(word); - if (freq) { - features->set_value(freq_, freq); - features->set_value(in_dict_, 1.0); - } else { - features->set_value(oov_, 1.0); - freq = 99.0f; - } - if (bad_words_.count(word) != 0) - features->set_value(bad_, 1.0); - if (chars < 5) - features->set_value(short_, 1.0); - if (chars > 10) - features->set_value(long_, 1.0); - if (freq < 7.0f) - features->set_value(high_freq_, 1.0); - if (freq > 8.0f && freq < 10.f) - features->set_value(med_freq_, 1.0); - if (freq < 10.0f && chars < 5) - features->set_value(short_range_, 1.0); - - // i don't 
understand these features, but they really help! - features->set_value(fl1_, sqrt(chars * freq)); - features->set_value(fl2_, freq / chars); -} - -void BasicCSplitFeatures::TraversalFeaturesImpl( - const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const { - (void) smeta; - (void) ant_contexts; - (void) out_context; - (void) estimated_features; - if (edge.Arity() == 0) return; - if (edge.rule_->EWords() != 1) return; - pimpl_->TraversalFeaturesImpl(edge, features); -} - -struct ReverseCharLMCSplitFeatureImpl { - ReverseCharLMCSplitFeatureImpl(const string& param) : - order_(5), - vocab_(*TD::dict_), - ngram_(vocab_, order_) { - kBOS = vocab_.getIndex(""); - kEOS = vocab_.getIndex(""); - File file(param.c_str(), "r", 0); - assert(file); - cerr << "Reading " << order_ << "-gram LM from " << param << endl; - ngram_.read(file); - } - - double LeftPhonotacticProb(const Lattice& inword, const int start) { - const int end = inword.size(); - for (int i = 0; i < order_; ++i) - sc[i] = kBOS; - int sp = min(end - start, order_ - 1); - // cerr << "[" << start << "," << sp << "]\n"; - int ci = (order_ - sp - 1); - int wi = start; - while (sp > 0) { - sc[ci] = inword[wi][0].label; - // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl; - ++wi; - ++ci; - --sp; - } - // cerr << " END ci=" << ci << endl; - sc[ci] = Vocab_None; - const double startprob = ngram_.wordProb(kEOS, sc); - // cerr << " PROB=" << startprob << endl; - return startprob; - } - private: - const int order_; - Vocab& vocab_; - VocabIndex kBOS; - VocabIndex kEOS; - Ngram ngram_; - VocabIndex sc[80]; -}; - -ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) : - pimpl_(new ReverseCharLMCSplitFeatureImpl(param)), - fid_(FD::Convert("RevCharLM")) {} - -void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( - const SentenceMetadata& smeta, - const 
Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const { - (void) ant_contexts; - (void) estimated_features; - (void) out_context; - - if (edge.Arity() != 1) return; - if (edge.rule_->EWords() != 1) return; - const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_); - features->set_value(fid_, lpp); -} - diff --git a/src/ff_csplit.h b/src/ff_csplit.h deleted file mode 100644 index c1cfb64b..00000000 --- a/src/ff_csplit.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _FF_CSPLIT_H_ -#define _FF_CSPLIT_H_ - -#include - -#include "ff.h" - -class BasicCSplitFeaturesImpl; -class BasicCSplitFeatures : public FeatureFunction { - public: - BasicCSplitFeatures(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - boost::shared_ptr pimpl_; -}; - -class ReverseCharLMCSplitFeatureImpl; -class ReverseCharLMCSplitFeature : public FeatureFunction { - public: - ReverseCharLMCSplitFeature(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - boost::shared_ptr pimpl_; - const int fid_; -}; - -#endif diff --git a/src/ff_factory.cc b/src/ff_factory.cc deleted file mode 100644 index 1854e0bb..00000000 --- a/src/ff_factory.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include "ff_factory.h" - -#include "ff.h" - -using boost::shared_ptr; -using namespace std; - -FFFactoryBase::~FFFactoryBase() {} - -void FFRegistry::DisplayList() const { - for (map >::const_iterator it = reg_.begin(); - it != reg_.end(); ++it) { - cerr << " " << it->first << endl; - 
} -} - -shared_ptr FFRegistry::Create(const string& ffname, const string& param) const { - map >::const_iterator it = reg_.find(ffname); - shared_ptr res; - if (it == reg_.end()) { - cerr << "I don't know how to create feature " << ffname << endl; - } else { - res = it->second->Create(param); - } - return res; -} - -void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { - if (reg_.find(ffname) != reg_.end()) { - cerr << "Duplicate registration of FeatureFunction with name " << ffname << "!\n"; - abort(); - } - reg_[ffname].reset(factory); -} - diff --git a/src/ff_factory.h b/src/ff_factory.h deleted file mode 100644 index bc586567..00000000 --- a/src/ff_factory.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _FF_FACTORY_H_ -#define _FF_FACTORY_H_ - -#include -#include -#include - -#include - -class FeatureFunction; -class FFRegistry; -class FFFactoryBase; -extern boost::shared_ptr global_ff_registry; - -class FFRegistry { - friend int main(int argc, char** argv); - friend class FFFactoryBase; - public: - boost::shared_ptr Create(const std::string& ffname, const std::string& param) const; - void DisplayList() const; - void Register(const std::string& ffname, FFFactoryBase* factory); - private: - FFRegistry() {} - std::map > reg_; -}; - -struct FFFactoryBase { - virtual ~FFFactoryBase(); - virtual boost::shared_ptr Create(const std::string& param) const = 0; -}; - -template -class FFFactory : public FFFactoryBase { - boost::shared_ptr Create(const std::string& param) const { - return boost::shared_ptr(new FF(param)); - } -}; - -#endif diff --git a/src/ff_lm.cc b/src/ff_lm.cc deleted file mode 100644 index 354787ec..00000000 --- a/src/ff_lm.cc +++ /dev/null @@ -1,328 +0,0 @@ -#include "ff_lm.h" - -#include -#include -#include -#include -#include -#include - -#include "tdict.h" -#include "Vocab.h" -#include "Ngram.h" -#include "hg.h" -#include "stringlib.h" - -using namespace std; - -struct LMClient { - struct Cache { - map tree; - float prob; - Cache() : 
prob() {} - }; - - LMClient(const char* host) : port(6666) { - s = strchr(host, ':'); - if (s != NULL) { - *s = '\0'; - ++s; - port = atoi(s); - } - sock = socket(AF_INET, SOCK_STREAM, 0); - hp = gethostbyname(host); - if (hp == NULL) { - cerr << "unknown host " << host << endl; - abort(); - } - bzero((char *)&server, sizeof(server)); - bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); - server.sin_family = hp->h_addrtype; - server.sin_port = htons(port); - - int errors = 0; - while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { - cerr << "Error: connect()\n"; - sleep(1); - errors++; - if (errors > 3) exit(1); - } - cerr << "Connected to LM on " << host << " on port " << port << endl; - } - - float wordProb(int word, int* context) { - Cache* cur = &cache; - int i = 0; - while (context[i] > 0) { - cur = &cur->tree[context[i++]]; - } - cur = &cur->tree[word]; - if (cur->prob) { return cur->prob; } - - i = 0; - ostringstream os; - os << "prob " << TD::Convert(word); - while (context[i] > 0) { - os << ' ' << TD::Convert(context[i++]); - } - os << endl; - string out = os.str(); - write(sock, out.c_str(), out.size()); - int r = read(sock, res, 6); - int errors = 0; - int cnt = 0; - while (1) { - if (r < 0) { - errors++; sleep(1); - cerr << "Error: read()\n"; - if (errors > 5) exit(1); - } else if (r==0 || res[cnt] == '\n') { break; } - else { - cnt += r; - if (cnt==6) break; - read(sock, &res[cnt], 6-cnt); - } - } - cur->prob = *reinterpret_cast(res); - return cur->prob; - } - - void clear() { - cache.tree.clear(); - } - - private: - Cache cache; - int sock, port; - char *s; - struct hostent *hp; - struct sockaddr_in server; - char res[8]; -}; - -class LanguageModelImpl { - public: - LanguageModelImpl(int order, const string& f) : - ngram_(*TD::dict_), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - client_(NULL), - kSTART(TD::Convert("")), - kSTOP(TD::Convert("")), - kUNKNOWN(TD::Convert("")), - 
kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) { - if (f.find("lm://") == 0) { - client_ = new LMClient(f.substr(5).c_str()); - } else { - File file(f.c_str(), "r", 0); - assert(file); - cerr << "Reading " << order_ << "-gram LM from " << f << endl; - ngram_.read(file, false); - } - } - - ~LanguageModelImpl() { - delete client_; - } - - inline int StateSize(const void* state) const { - return *(static_cast(state) + state_size_); - } - - inline void SetStateSize(int size, void* state) const { - *(static_cast(state) + state_size_) = size; - } - - inline double LookupProbForBufferContents(int i) { - double p = client_ ? - client_->wordProb(buffer_[i], &buffer_[i+1]) - : ngram_.wordProb(buffer_[i], (VocabIndex*)&buffer_[i+1]); - if (p < floor_) p = floor_; - return p; - } - - string DebugStateToString(const void* state) const { - int len = StateSize(state); - const int* astate = reinterpret_cast(state); - string res = "["; - for (int i = 0; i < len; ++i) { - res += " "; - res += TD::Convert(astate[i]); - } - res += " ]"; - return res; - } - - inline double ProbNoRemnant(int i, int len) { - int edge = len; - bool flag = true; - double sum = 0.0; - while (i >= 0) { - if (buffer_[i] == kSTAR) { - edge = i; - flag = false; - } else if (buffer_[i] <= 0) { - edge = i; - flag = true; - } else { - if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART))) - sum += LookupProbForBufferContents(i); - } - --i; - } - return sum; - } - - double EstimateProb(const vector& phrase) { - int len = phrase.size(); - buffer_.resize(len + 1); - buffer_[len] = kNONE; - int i = len - 1; - for (int j = 0; j < len; ++j,--i) - buffer_[i] = phrase[j]; - return ProbNoRemnant(len - 1, len); - } - - double EstimateProb(const void* state) { - int len = StateSize(state); - // cerr << "residual len: " << len << endl; - buffer_.resize(len + 1); - buffer_[len] = kNONE; - const int* astate = reinterpret_cast(state); - int i = len - 1; - for (int j = 0; j < len; ++j,--i) - buffer_[i] = 
astate[j]; - return ProbNoRemnant(len - 1, len); - } - - double FinalTraversalCost(const void* state) { - int slen = StateSize(state); - int len = slen + 2; - // cerr << "residual len: " << len << endl; - buffer_.resize(len + 1); - buffer_[len] = kNONE; - buffer_[len-1] = kSTART; - const int* astate = reinterpret_cast(state); - int i = len - 2; - for (int j = 0; j < slen; ++j,--i) - buffer_[i] = astate[j]; - buffer_[i] = kSTOP; - assert(i == 0); - return ProbNoRemnant(len - 1, len); - } - - double LookupWords(const TRule& rule, const vector& ant_states, void* vstate) { - int len = rule.ELength() - rule.Arity(); - for (int i = 0; i < ant_states.size(); ++i) - len += StateSize(ant_states[i]); - buffer_.resize(len + 1); - buffer_[len] = kNONE; - int i = len - 1; - const vector& e = rule.e(); - for (int j = 0; j < e.size(); ++j) { - if (e[j] < 1) { - const int* astate = reinterpret_cast(ant_states[-e[j]]); - int slen = StateSize(astate); - for (int k = 0; k < slen; ++k) - buffer_[i--] = astate[k]; - } else { - buffer_[i--] = e[j]; - } - } - - double sum = 0.0; - int* remnant = reinterpret_cast(vstate); - int j = 0; - i = len - 1; - int edge = len; - - while (i >= 0) { - if (buffer_[i] == kSTAR) { - edge = i; - } else if (edge-i >= order_) { - sum += LookupProbForBufferContents(i); - } else if (edge == len && remnant) { - remnant[j++] = buffer_[i]; - } - --i; - } - if (!remnant) return sum; - - if (edge != len || len >= order_) { - remnant[j++] = kSTAR; - if (order_-1 < edge) edge = order_-1; - for (int i = edge-1; i >= 0; --i) - remnant[j++] = buffer_[i]; - } - - SetStateSize(j, vstate); - return sum; - } - - static int OrderToStateSize(int order) { - return ((order-1) * 2 + 1) * sizeof(WordID) + 1; - } - - private: - Ngram ngram_; - vector buffer_; - const int order_; - const int state_size_; - const double floor_; - LMClient* client_; - - public: - const WordID kSTART; - const WordID kSTOP; - const WordID kUNKNOWN; - const WordID kNONE; - const WordID kSTAR; -}; - 
-LanguageModel::LanguageModel(const string& param) : - fid_(FD::Convert("LanguageModel")) { - vector argv; - int argc = SplitOnWhitespace(param, &argv); - int order = 3; - // TODO add support for -n FeatureName - string filename; - if (argc < 1) { cerr << "LanguageModel requires a filename, minimally!\n"; abort(); } - else if (argc == 1) { filename = argv[0]; } - else if (argc == 2 || argc > 3) { cerr << "Don't understand 'LanguageModel " << param << "'\n"; } - else if (argc == 3) { - if (argv[0] == "-o") { - order = atoi(argv[1].c_str()); - filename = argv[2]; - } else if (argv[1] == "-o") { - order = atoi(argv[2].c_str()); - filename = argv[0]; - } - } - SetStateSize(LanguageModelImpl::OrderToStateSize(order)); - pimpl_ = new LanguageModelImpl(order, filename); -} - -LanguageModel::~LanguageModel() { - delete pimpl_; -} - -string LanguageModel::DebugStateToString(const void* state) const{ - return pimpl_->DebugStateToString(state); -} - -void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - (void) smeta; - features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state)); - estimated_features->set_value(fid_, pimpl_->EstimateProb(state)); -} - -void LanguageModel::FinalTraversalFeatures(const void* ant_state, - SparseVector* features) const { - features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); -} - diff --git a/src/ff_lm.h b/src/ff_lm.h deleted file mode 100644 index cd717360..00000000 --- a/src/ff_lm.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _LM_FF_H_ -#define _LM_FF_H_ - -#include -#include - -#include "hg.h" -#include "ff.h" - -class LanguageModelImpl; - -class LanguageModel : public FeatureFunction { - public: - // param = "filename.lm [-o n]" - LanguageModel(const std::string& param); - ~LanguageModel(); - virtual void FinalTraversalFeatures(const void* context, - 
SparseVector* features) const; - std::string DebugStateToString(const void* state) const; - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - const int fid_; - mutable LanguageModelImpl* pimpl_; -}; - -#endif diff --git a/src/ff_test.cc b/src/ff_test.cc deleted file mode 100644 index babaf985..00000000 --- a/src/ff_test.cc +++ /dev/null @@ -1,134 +0,0 @@ -#include -#include -#include -#include -#include -#include "hg.h" -#include "ff_lm.h" -#include "ff.h" -#include "trule.h" -#include "sentence_metadata.h" - -using namespace std; - -LanguageModel* lm_ = NULL; -LanguageModel* lm3_ = NULL; - -class FFTest : public testing::Test { - public: - FFTest() : smeta(0,Lattice()) { - if (!lm_) { - static LanguageModel slm("-o 2 ./test_data/test_2gram.lm.gz"); - lm_ = &slm; - static LanguageModel slm3("./test_data/dummy.3gram.lm -o 3"); - lm3_ = &slm3; - } - } - protected: - virtual void SetUp() { } - virtual void TearDown() { } - SentenceMetadata smeta; -}; - -TEST_F(FFTest,LanguageModel) { - vector ms(1, lm_); - TRulePtr tr1(new TRule("[X] ||| [X,1] said")); - TRulePtr tr2(new TRule("[X] ||| the man said")); - TRulePtr tr3(new TRule("[X] ||| the fat man")); - Hypergraph hg; - const int lm_fid = FD::Convert("LanguageModel"); - vector w(lm_fid + 1,1); - ModelSet models(w, ms); - string state; - Hypergraph::Edge edge; - edge.rule_ = tr2; - models.AddFeaturesToEdge(smeta, hg, &edge, &state); - double lm1 = edge.feature_values_.dot(w); - cerr << "lm=" << edge.feature_values_[lm_fid] << endl; - - hg.nodes_.resize(1); - hg.edges_.resize(2); - hg.edges_[0].rule_ = tr3; - models.AddFeaturesToEdge(smeta, hg, &hg.edges_[0], &hg.nodes_[0].state_); - hg.edges_[1].tail_nodes_.push_back(0); - hg.edges_[1].rule_ = tr1; - string state2; - models.AddFeaturesToEdge(smeta, hg, &hg.edges_[1], 
&state2); - double tot = hg.edges_[1].feature_values_[lm_fid] + hg.edges_[0].feature_values_[lm_fid]; - cerr << "lm=" << tot << endl; - EXPECT_TRUE(state2 == state); - EXPECT_FALSE(state == hg.nodes_[0].state_); -} - -TEST_F(FFTest, Small) { - WordPenalty wp(""); - vector ms(2, lm_); - ms[1] = ℘ - TRulePtr tr1(new TRule("[X] ||| [X,1] said")); - TRulePtr tr2(new TRule("[X] ||| john said")); - TRulePtr tr3(new TRule("[X] ||| john")); - cerr << "RULE: " << tr1->AsString() << endl; - Hypergraph hg; - vector w(2); w[0]=1.0; w[1]=-2.0; - ModelSet models(w, ms); - string state; - Hypergraph::Edge edge; - edge.rule_ = tr2; - cerr << tr2->AsString() << endl; - models.AddFeaturesToEdge(smeta, hg, &edge, &state); - double s1 = edge.feature_values_.dot(w); - cerr << "lm=" << edge.feature_values_[0] << endl; - cerr << "wp=" << edge.feature_values_[1] << endl; - - hg.nodes_.resize(1); - hg.edges_.resize(2); - hg.edges_[0].rule_ = tr3; - models.AddFeaturesToEdge(smeta, hg, &hg.edges_[0], &hg.nodes_[0].state_); - double acc = hg.edges_[0].feature_values_.dot(w); - cerr << hg.edges_[0].feature_values_[0] << endl; - hg.edges_[1].tail_nodes_.push_back(0); - hg.edges_[1].rule_ = tr1; - string state2; - models.AddFeaturesToEdge(smeta, hg, &hg.edges_[1], &state2); - acc += hg.edges_[1].feature_values_.dot(w); - double tot = hg.edges_[1].feature_values_[0] + hg.edges_[0].feature_values_[0]; - cerr << "lm=" << tot << endl; - cerr << "acc=" << acc << endl; - cerr << " s1=" << s1 << endl; - EXPECT_TRUE(state2 == state); - EXPECT_FALSE(state == hg.nodes_[0].state_); - EXPECT_FLOAT_EQ(acc, s1); -} - -TEST_F(FFTest, LM3) { - int x = lm3_->NumBytesContext(); - Hypergraph::Edge edge1; - edge1.rule_.reset(new TRule("[X] ||| x y ||| one ||| 1.0 -2.4 3.0")); - Hypergraph::Edge edge2; - edge2.rule_.reset(new TRule("[X] ||| [X,1] a ||| [X,1] two ||| 1.0 -2.4 3.0")); - Hypergraph::Edge edge3; - edge3.rule_.reset(new TRule("[X] ||| [X,1] a ||| zero [X,1] two ||| 1.0 -2.4 3.0")); - vector ants1; - 
string state(x, '\0'); - SparseVector feats; - SparseVector est; - lm3_->TraversalFeatures(smeta, edge1, ants1, &feats, &est, (void *)&state[0]); - cerr << "returned " << feats << endl; - cerr << edge1.feature_values_ << endl; - cerr << lm3_->DebugStateToString((const void*)&state[0]) << endl; - EXPECT_EQ("[ one ]", lm3_->DebugStateToString((const void*)&state[0])); - ants1.push_back((const void*)&state[0]); - string state2(x, '\0'); - lm3_->TraversalFeatures(smeta, edge2, ants1, &feats, &est, (void *)&state2[0]); - cerr << lm3_->DebugStateToString((const void*)&state2[0]) << endl; - EXPECT_EQ("[ one two ]", lm3_->DebugStateToString((const void*)&state2[0])); - string state3(x, '\0'); - lm3_->TraversalFeatures(smeta, edge3, ants1, &feats, &est, (void *)&state3[0]); - cerr << lm3_->DebugStateToString((const void*)&state3[0]) << endl; - EXPECT_EQ("[ zero one <{STAR}> one two ]", lm3_->DebugStateToString((const void*)&state3[0])); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ff_wordalign.cc b/src/ff_wordalign.cc deleted file mode 100644 index e605ac8d..00000000 --- a/src/ff_wordalign.cc +++ /dev/null @@ -1,221 +0,0 @@ -#include "ff_wordalign.h" - -#include -#include - -#include "stringlib.h" -#include "sentence_metadata.h" -#include "hg.h" -#include "fdict.h" -#include "aligner.h" -#include "tdict.h" // Blunsom hack -#include "filelib.h" // Blunsom hack - -using namespace std; - -RelativeSentencePosition::RelativeSentencePosition(const string& param) : - fid_(FD::Convert("RelativeSentencePosition")) {} - -void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - // if the source word is either null or the generated word - // has no position in the reference - if (edge.i_ == -1 || edge.prev_i_ == -1) - return; - - 
assert(smeta.GetTargetLength() > 0); - const double val = fabs(static_cast(edge.i_) / smeta.GetSourceLength() - - static_cast(edge.prev_i_) / smeta.GetTargetLength()); - features->set_value(fid_, val); -// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; -} - -MarkovJump::MarkovJump(const string& param) : - FeatureFunction(1), - fid_(FD::Convert("MarkovJump")), - individual_params_per_jumpsize_(false), - condition_on_flen_(false) { - cerr << " MarkovJump: Blunsom&Cohn feature"; - vector argv; - int argc = SplitOnWhitespace(param, &argv); - if (argc > 0) { - if (argc != 1 || !(argv[0] == "-f" || argv[0] == "-i" || argv[0] == "-if")) { - cerr << "MarkovJump: expected parameters to be -f, -i, or -if\n"; - exit(1); - } - individual_params_per_jumpsize_ = (argv[0][1] == 'i'); - condition_on_flen_ = (argv[0][argv[0].size() - 1] == 'f'); - if (individual_params_per_jumpsize_) { - template_ = "Jump:000"; - cerr << ", individual jump parameters"; - if (condition_on_flen_) { - template_ += ":F00"; - cerr << " (split by f-length)"; - } - } - } - cerr << endl; -} - -void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - unsigned char& dpstate = *((unsigned char*)state); - if (edge.Arity() == 0) { - dpstate = static_cast(edge.i_); - } else if (edge.Arity() == 1) { - dpstate = *((unsigned char*)ant_states[0]); - } else if (edge.Arity() == 2) { - int left_index = *((unsigned char*)ant_states[0]); - int right_index = *((unsigned char*)ant_states[1]); - if (right_index == -1) - dpstate = static_cast(left_index); - else - dpstate = static_cast(right_index); - const int jumpsize = right_index - left_index; - features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def - - if 
(individual_params_per_jumpsize_) { - string fname = template_; - int param = jumpsize; - if (jumpsize < 0) { - param *= -1; - fname[5]='L'; - } else if (jumpsize > 0) { - fname[5]='R'; - } - if (param) { - fname[6] = '0' + (param / 10); - fname[7] = '0' + (param % 10); - } - if (condition_on_flen_) { - const int flen = smeta.GetSourceLength(); - fname[10] = '0' + (flen / 10); - fname[11] = '0' + (flen % 10); - } - features->set_value(FD::Convert(fname), 1.0); - } - } else { - assert(!"something really unexpected is happening"); - } -} - -AlignerResults::AlignerResults(const std::string& param) : - cur_sent_(-1), - cur_grid_(NULL) { - vector argv; - int argc = SplitOnWhitespace(param, &argv); - if (argc != 2) { - cerr << "Required format: AlignerResults [FeatureName] [file.pharaoh]\n"; - exit(1); - } - cerr << " feature: " << argv[0] << "\talignments: " << argv[1] << endl; - fid_ = FD::Convert(argv[0]); - ReadFile rf(argv[1]); - istream& in = *rf.stream(); int lc = 0; - while(in) { - string line; - getline(in, line); - if (!in) break; - ++lc; - is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line)); - } - cerr << " Loaded " << lc << " refs\n"; -} - -void AlignerResults::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - if (edge.i_ == -1 || edge.prev_i_ == -1) - return; - - if (cur_sent_ != smeta.GetSentenceID()) { - assert(smeta.HasReference()); - cur_sent_ = smeta.GetSentenceID(); - assert(cur_sent_ < is_aligned_.size()); - cur_grid_ = is_aligned_[cur_sent_].get(); - } - - //cerr << edge.rule_->AsString() << endl; - - int j = edge.i_; // source side (f) - int i = edge.prev_i_; // target side (e) - if (j < cur_grid_->height() && i < cur_grid_->width() && (*cur_grid_)(i, j)) { -// if (edge.rule_->e_[0] == smeta.GetReference()[i][0].label) { - features->set_value(fid_, 1.0); -// cerr << 
edge.rule_->AsString() << " (" << i << "," << j << ")\n"; -// } - } -} - -BlunsomSynchronousParseHack::BlunsomSynchronousParseHack(const string& param) : - FeatureFunction((100 / 8) + 1), fid_(FD::Convert("NotRef")), cur_sent_(-1) { - ReadFile rf(param); - istream& in = *rf.stream(); int lc = 0; - while(in) { - string line; - getline(in, line); - if (!in) break; - ++lc; - refs_.push_back(vector()); - TD::ConvertSentence(line, &refs_.back()); - } - cerr << " Loaded " << lc << " refs\n"; -} - -void BlunsomSynchronousParseHack::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_states, - SparseVector* features, - SparseVector* estimated_features, - void* state) const { - if (cur_sent_ != smeta.GetSentenceID()) { - // assert(smeta.HasReference()); - cur_sent_ = smeta.GetSentenceID(); - assert(cur_sent_ < refs_.size()); - cur_ref_ = &refs_[cur_sent_]; - cur_map_.clear(); - for (int i = 0; i < cur_ref_->size(); ++i) { - vector phrase; - for (int j = i; j < cur_ref_->size(); ++j) { - phrase.push_back((*cur_ref_)[j]); - cur_map_[phrase] = i; - } - } - } - //cerr << edge.rule_->AsString() << endl; - for (int i = 0; i < ant_states.size(); ++i) { - if (DoesNotBelong(ant_states[i])) { - //cerr << " ant " << i << " does not belong\n"; - return; - } - } - vector > ants(ant_states.size()); - vector* > pants(ant_states.size()); - for (int i = 0; i < ant_states.size(); ++i) { - AppendAntecedentString(ant_states[i], &ants[i]); - //cerr << " ant[" << i << "]: " << ((int)*(static_cast(ant_states[i]))) << " " << TD::GetString(ants[i]) << endl; - pants[i] = &ants[i]; - } - vector yield; - edge.rule_->ESubstitute(pants, &yield); - //cerr << "YIELD: " << TD::GetString(yield) << endl; - Vec2Int::iterator it = cur_map_.find(yield); - if (it == cur_map_.end()) { - features->set_value(fid_, 1); - //cerr << " BAD!\n"; - return; - } - SetStateMask(it->second, it->second + yield.size(), state); -} - diff --git a/src/ff_wordalign.h 
b/src/ff_wordalign.h deleted file mode 100644 index 1581641c..00000000 --- a/src/ff_wordalign.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef _FF_WORD_ALIGN_H_ -#define _FF_WORD_ALIGN_H_ - -#include "ff.h" -#include "array2d.h" - -class RelativeSentencePosition : public FeatureFunction { - public: - RelativeSentencePosition(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - const int fid_; -}; - -class MarkovJump : public FeatureFunction { - public: - MarkovJump(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - const int fid_; - bool individual_params_per_jumpsize_; - bool condition_on_flen_; - std::string template_; -}; - -class AlignerResults : public FeatureFunction { - public: - AlignerResults(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - int fid_; - std::vector > > is_aligned_; - mutable int cur_sent_; - const Array2D mutable* cur_grid_; -}; - -#include -#include -#include -class BlunsomSynchronousParseHack : public FeatureFunction { - public: - BlunsomSynchronousParseHack(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* out_context) const; - private: - inline bool DoesNotBelong(const void* state) 
const { - for (int i = 0; i < NumBytesContext(); ++i) { - if (*(static_cast(state) + i)) return false; - } - return true; - } - - inline void AppendAntecedentString(const void* state, std::vector* yield) const { - int i = 0; - int ind = 0; - while (i < NumBytesContext() && !(*(static_cast(state) + i))) { ++i; ind += 8; } - // std::cerr << i << " " << NumBytesContext() << std::endl; - assert(i != NumBytesContext()); - assert(ind < cur_ref_->size()); - int cur = *(static_cast(state) + i); - int comp = 1; - while (comp < 256 && (comp & cur) == 0) { comp <<= 1; ++ind; } - assert(ind < cur_ref_->size()); - assert(comp < 256); - do { - assert(ind < cur_ref_->size()); - yield->push_back((*cur_ref_)[ind]); - ++ind; - comp <<= 1; - if (comp == 256) { - comp = 1; - ++i; - cur = *(static_cast(state) + i); - } - } while (comp & cur); - } - - inline void SetStateMask(int start, int end, void* state) const { - assert((end / 8) < NumBytesContext()); - int i = 0; - int comp = 1; - for (int j = 0; j < start; ++j) { - comp <<= 1; - if (comp == 256) { - ++i; - comp = 1; - } - } - //std::cerr << "SM: " << i << "\n"; - for (int j = start; j < end; ++j) { - *(static_cast(state) + i) |= comp; - //std::cerr << " " << comp << "\n"; - comp <<= 1; - if (comp == 256) { - ++i; - comp = 1; - } - } - //std::cerr << " MASK: " << ((int)*(static_cast(state))) << "\n"; - } - - const int fid_; - mutable int cur_sent_; - typedef std::tr1::unordered_map, int, boost::hash > > Vec2Int; - mutable Vec2Int cur_map_; - const std::vector mutable * cur_ref_; - mutable std::vector > refs_; -}; - -#endif diff --git a/src/filelib.cc b/src/filelib.cc deleted file mode 100644 index 79ad2847..00000000 --- a/src/filelib.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "filelib.h" - -#include -#include - -using namespace std; - -bool FileExists(const std::string& fn) { - struct stat info; - int s = stat(fn.c_str(), &info); - return (s==0); -} - -bool DirectoryExists(const string& dir) { - if (access(dir.c_str(),0) == 0) { 
- struct stat status; - stat(dir.c_str(), &status); - if (status.st_mode & S_IFDIR) return true; - } - return false; -} - diff --git a/src/filelib.h b/src/filelib.h deleted file mode 100644 index 62cb9427..00000000 --- a/src/filelib.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef _FILELIB_H_ -#define _FILELIB_H_ - -#include -#include -#include -#include -#include "gzstream.h" - -// reads from standard in if filename is - -// uncompresses if file ends with .gz -// otherwise, reads from a normal file -class ReadFile { - public: - ReadFile(const std::string& filename) : - no_delete_on_exit_(filename == "-"), - in_(no_delete_on_exit_ ? static_cast(&std::cin) : - (EndsWith(filename, ".gz") ? - static_cast(new igzstream(filename.c_str())) : - static_cast(new std::ifstream(filename.c_str())))) { - if (!*in_) { - std::cerr << "Failed to open " << filename << std::endl; - abort(); - } - } - ~ReadFile() { - if (!no_delete_on_exit_) delete in_; - } - - inline std::istream* stream() { return in_; } - - private: - static bool EndsWith(const std::string& f, const std::string& suf) { - return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); - } - const bool no_delete_on_exit_; - std::istream* const in_; -}; - -class WriteFile { - public: - WriteFile(const std::string& filename) : - no_delete_on_exit_(filename == "-"), - out_(no_delete_on_exit_ ? static_cast(&std::cout) : - (EndsWith(filename, ".gz") ? 
- static_cast(new ogzstream(filename.c_str())) : - static_cast(new std::ofstream(filename.c_str())))) {} - ~WriteFile() { - (*out_) << std::flush; - if (!no_delete_on_exit_) delete out_; - } - - inline std::ostream* stream() { return out_; } - - private: - static bool EndsWith(const std::string& f, const std::string& suf) { - return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); - } - const bool no_delete_on_exit_; - std::ostream* const out_; -}; - -bool FileExists(const std::string& file_name); -bool DirectoryExists(const std::string& dir_name); - -#endif diff --git a/src/forest_writer.cc b/src/forest_writer.cc deleted file mode 100644 index a9117d18..00000000 --- a/src/forest_writer.cc +++ /dev/null @@ -1,23 +0,0 @@ -#include "forest_writer.h" - -#include - -#include - -#include "filelib.h" -#include "hg_io.h" -#include "hg.h" - -using namespace std; - -ForestWriter::ForestWriter(const std::string& path, int num) : - fname_(path + '/' + boost::lexical_cast(num) + ".json.gz"), used_(false) {} - -bool ForestWriter::Write(const Hypergraph& forest, bool minimal_rules) { - assert(!used_); - used_ = true; - cerr << " Writing forest to " << fname_ << endl; - WriteFile wf(fname_); - return HypergraphIO::WriteToJSON(forest, minimal_rules, wf.stream()); -} - diff --git a/src/forest_writer.h b/src/forest_writer.h deleted file mode 100644 index 819a8940..00000000 --- a/src/forest_writer.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _FOREST_WRITER_H_ -#define _FOREST_WRITER_H_ - -#include - -class Hypergraph; - -struct ForestWriter { - ForestWriter(const std::string& path, int num); - bool Write(const Hypergraph& forest, bool minimal_rules); - - const std::string fname_; - bool used_; -}; - -#endif diff --git a/src/freqdict.cc b/src/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/src/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace 
std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/src/freqdict.h b/src/freqdict.h deleted file mode 100644 index 9acf0c33..00000000 --- a/src/freqdict.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FREQDICT_H_ -#define _FREQDICT_H_ - -#include -#include -#include "wordid.h" - -class FreqDict { - public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; - return i->second; - } - private: - std::map counts_; -}; - -#endif diff --git a/src/fst_translator.cc b/src/fst_translator.cc deleted file mode 100644 index 57feb227..00000000 --- a/src/fst_translator.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "translator.h" - -#include -#include - -#include "sentence_metadata.h" -#include "filelib.h" -#include "hg.h" -#include "hg_io.h" -#include "earley_composer.h" -#include "phrasetable_fst.h" -#include "tdict.h" - -using namespace std; - -struct FSTTranslatorImpl { - FSTTranslatorImpl(const boost::program_options::variables_map& conf) : - goal_sym(conf["goal"].as()), - kGOAL_RULE(new TRule("[Goal] ||| [" + goal_sym + ",1] ||| [1]")), - kGOAL(TD::Convert("Goal") * -1), - add_pass_through_rules(conf.count("add_pass_through_rules")) { - fst.reset(LoadTextPhrasetable(conf["grammar"].as >())); - ec.reset(new EarleyComposer(fst.get())); - } - - bool Translate(const string& input, - const vector& weights, - Hypergraph* forest) { - bool composed = false; - if (input.find("{\"rules\"") 
== 0) { - istringstream is(input); - Hypergraph src_cfg_hg; - assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); - if (add_pass_through_rules) { - SparseVector feats; - feats.set_value(FD::Convert("PassThrough"), 1); - for (int i = 0; i < src_cfg_hg.edges_.size(); ++i) { - const vector& f = src_cfg_hg.edges_[i].rule_->f_; - for (int j = 0; j < f.size(); ++j) { - if (f[j] > 0) { - fst->AddPassThroughTranslation(f[j], feats); - } - } - } - } - composed = ec->Compose(src_cfg_hg, forest); - } else { - const string dummy_grammar("[" + goal_sym + "] ||| " + input + " ||| TOP=1"); - cerr << " Dummy grammar: " << dummy_grammar << endl; - istringstream is(dummy_grammar); - if (add_pass_through_rules) { - vector words; - TD::ConvertSentence(input, &words); - SparseVector feats; - feats.set_value(FD::Convert("PassThrough"), 1); - for (int i = 0; i < words.size(); ++i) - fst->AddPassThroughTranslation(words[i], feats); - } - composed = ec->Compose(&is, forest); - } - if (composed) { - Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); - Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1, ""); - Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); - forest->ConnectEdgeToHeadNode(hg_edge, goal); - forest->Reweight(weights); - } - if (add_pass_through_rules) - fst->ClearPassThroughTranslations(); - return composed; - } - - const string goal_sym; - const TRulePtr kGOAL_RULE; - const WordID kGOAL; - const bool add_pass_through_rules; - boost::shared_ptr ec; - boost::shared_ptr fst; -}; - -FSTTranslator::FSTTranslator(const boost::program_options::variables_map& conf) : - pimpl_(new FSTTranslatorImpl(conf)) {} - -bool FSTTranslator::Translate(const string& input, - SentenceMetadata* smeta, - const vector& weights, - Hypergraph* minus_lm_forest) { - smeta->SetSourceLength(0); // don't know how to compute this - return pimpl_->Translate(input, weights, minus_lm_forest); -} - diff --git a/src/grammar.cc b/src/grammar.cc deleted file mode 100644 
index e19bd344..00000000 --- a/src/grammar.cc +++ /dev/null @@ -1,164 +0,0 @@ -#include "grammar.h" - -#include -#include -#include - -#include "filelib.h" -#include "tdict.h" - -using namespace std; - -const vector Grammar::NO_RULES; - -RuleBin::~RuleBin() {} -GrammarIter::~GrammarIter() {} -Grammar::~Grammar() {} - -bool Grammar::HasRuleForSpan(int i, int j, int distance) const { - (void) i; - (void) j; - (void) distance; - return true; // always true by default -} - -struct TextRuleBin : public RuleBin { - int GetNumRules() const { - return rules_.size(); - } - TRulePtr GetIthRule(int i) const { - return rules_[i]; - } - void AddRule(TRulePtr t) { - rules_.push_back(t); - } - int Arity() const { - return rules_.front()->Arity(); - } - void Dump() const { - for (int i = 0; i < rules_.size(); ++i) - cerr << rules_[i]->AsString() << endl; - } - private: - vector rules_; -}; - -struct TextGrammarNode : public GrammarIter { - TextGrammarNode() : rb_(NULL) {} - ~TextGrammarNode() { - delete rb_; - } - const GrammarIter* Extend(int symbol) const { - map::const_iterator i = tree_.find(symbol); - if (i == tree_.end()) return NULL; - return &i->second; - } - - const RuleBin* GetRules() const { - if (rb_) { - //rb_->Dump(); - } - return rb_; - } - - map tree_; - TextRuleBin* rb_; -}; - -struct TGImpl { - TextGrammarNode root_; -}; - -TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {} -TextGrammar::TextGrammar(const string& file) : - max_span_(10), - pimpl_(new TGImpl) { - ReadFromFile(file); -} - -const GrammarIter* TextGrammar::GetRoot() const { - return &pimpl_->root_; -} - -void TextGrammar::AddRule(const TRulePtr& rule) { - if (rule->IsUnary()) { - rhs2unaries_[rule->f().front()].push_back(rule); - unaries_.push_back(rule); - } else { - TextGrammarNode* cur = &pimpl_->root_; - for (int i = 0; i < rule->f_.size(); ++i) - cur = &cur->tree_[rule->f_[i]]; - if (cur->rb_ == NULL) - cur->rb_ = new TextRuleBin; - cur->rb_->AddRule(rule); - } -} - -void 
TextGrammar::ReadFromFile(const string& filename) { - ReadFile in(filename); - istream& in_file = *in.stream(); - assert(in_file); - long long int rule_count = 0; - bool fl = false; - while(in_file) { - string line; - getline(in_file, line); - if (line.empty()) continue; - ++rule_count; - if (rule_count % 50000 == 0) { cerr << '.' << flush; fl = true; } - if (rule_count % 2000000 == 0) { cerr << " [" << rule_count << "]\n"; fl = false; } - TRulePtr rule(TRule::CreateRuleSynchronous(line)); - if (rule) { - AddRule(rule); - } else { - if (fl) { cerr << endl; } - cerr << "Skipping badly formatted rule in line " << rule_count << " of " << filename << endl; - fl = false; - } - } - if (fl) cerr << endl; - cerr << " " << rule_count << " rules read.\n"; -} - -bool TextGrammar::HasRuleForSpan(int i, int j, int distance) const { - return (max_span_ >= distance); -} - -GlueGrammar::GlueGrammar(const string& file) : TextGrammar(file) {} - -GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) { - TRulePtr stop_glue(new TRule("[" + goal_nt + "] ||| [" + default_nt + ",1] ||| [" + default_nt + ",1]")); - TRulePtr glue(new TRule("[" + goal_nt + "] ||| [" + goal_nt + ",1] [" - + default_nt + ",2] ||| [" + goal_nt + ",1] [" + default_nt + ",2] ||| Glue=1")); - - AddRule(stop_glue); - AddRule(glue); - //cerr << "GLUE: " << stop_glue->AsString() << endl; - //cerr << "GLUE: " << glue->AsString() << endl; -} - -bool GlueGrammar::HasRuleForSpan(int i, int j, int distance) const { - (void) j; - return (i == 0); -} - -PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat) : - has_rule_(input.size() + 1) { - for (int i = 0; i < input.size(); ++i) { - const vector& alts = input[i]; - for (int k = 0; k < alts.size(); ++k) { - const int j = alts[k].dist2next + i; - has_rule_[i].insert(j); - const string& src = TD::Convert(alts[k].label); - TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1")); - AddRule(pt); -// 
cerr << "PT: " << pt->AsString() << endl; - } - } -} - -bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const { - const set& hr = has_rule_[i]; - if (i == j) { return !hr.empty(); } - return (hr.find(j) != hr.end()); -} diff --git a/src/grammar.h b/src/grammar.h deleted file mode 100644 index 3471e3f1..00000000 --- a/src/grammar.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef GRAMMAR_H_ -#define GRAMMAR_H_ - -#include -#include -#include -#include - -#include "lattice.h" -#include "trule.h" - -struct RuleBin { - virtual ~RuleBin(); - virtual int GetNumRules() const = 0; - virtual TRulePtr GetIthRule(int i) const = 0; - virtual int Arity() const = 0; -}; - -struct GrammarIter { - virtual ~GrammarIter(); - virtual const RuleBin* GetRules() const = 0; - virtual const GrammarIter* Extend(int symbol) const = 0; -}; - -struct Grammar { - typedef std::map > Cat2Rules; - static const std::vector NO_RULES; - - virtual ~Grammar(); - virtual const GrammarIter* GetRoot() const = 0; - virtual bool HasRuleForSpan(int i, int j, int distance) const; - - // cat is the category to be rewritten - inline const std::vector& GetAllUnaryRules() const { - return unaries_; - } - - // get all the unary rules that rewrite category cat - inline const std::vector& GetUnaryRulesForRHS(const WordID& cat) const { - Cat2Rules::const_iterator found = rhs2unaries_.find(cat); - if (found == rhs2unaries_.end()) - return NO_RULES; - else - return found->second; - } - - protected: - Cat2Rules rhs2unaries_; // these must be filled in by subclasses! 
- std::vector unaries_; -}; - -typedef boost::shared_ptr GrammarPtr; - -class TGImpl; -struct TextGrammar : public Grammar { - TextGrammar(); - TextGrammar(const std::string& file); - void SetMaxSpan(int m) { max_span_ = m; } - virtual const GrammarIter* GetRoot() const; - void AddRule(const TRulePtr& rule); - void ReadFromFile(const std::string& filename); - virtual bool HasRuleForSpan(int i, int j, int distance) const; - const std::vector& GetUnaryRules(const WordID& cat) const; - private: - int max_span_; - boost::shared_ptr pimpl_; -}; - -struct GlueGrammar : public TextGrammar { - // read glue grammar from file - explicit GlueGrammar(const std::string& file); - GlueGrammar(const std::string& goal_nt, const std::string& default_nt); // "S", "X" - virtual bool HasRuleForSpan(int i, int j, int distance) const; -}; - -struct PassThroughGrammar : public TextGrammar { - PassThroughGrammar(const Lattice& input, const std::string& cat); - virtual bool HasRuleForSpan(int i, int j, int distance) const; - private: - std::vector > has_rule_; // index by [i][j] -}; - -#endif diff --git a/src/grammar_test.cc b/src/grammar_test.cc deleted file mode 100644 index 62b8f958..00000000 --- a/src/grammar_test.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include -#include -#include "trule.h" -#include "tdict.h" -#include "grammar.h" -#include "bottom_up_parser.h" -#include "ff.h" -#include "weights.h" - -using namespace std; - -class GrammarTest : public testing::Test { - public: - GrammarTest() { - wts.InitFromFile("test_data/weights.gt"); - } - protected: - virtual void SetUp() { } - virtual void TearDown() { } - Weights wts; -}; - -TEST_F(GrammarTest,TestTextGrammar) { - vector w; - vector ms; - ModelSet models(w, ms); - - TextGrammar g; - TRulePtr r1(new TRule("[X] ||| a b c ||| A B C ||| 0.1 0.2 0.3", true)); - TRulePtr r2(new TRule("[X] ||| a b c ||| 1 2 3 ||| 0.2 0.3 0.4", true)); - TRulePtr r3(new TRule("[X] ||| a b c d ||| A B C D ||| 0.1 0.2 0.3", 
true)); - cerr << r1->AsString() << endl; - g.AddRule(r1); - g.AddRule(r2); - g.AddRule(r3); -} - -TEST_F(GrammarTest,TestTextGrammarFile) { - GrammarPtr g(new TextGrammar("./test_data/grammar.prune")); - vector grammars(1, g); - - LatticeArc a(TD::Convert("ein"), 0.0, 1); - LatticeArc b(TD::Convert("haus"), 0.0, 1); - Lattice lattice(2); - lattice[0].push_back(a); - lattice[1].push_back(b); - Hypergraph forest; - ExhaustiveBottomUpParser parser("PHRASE", grammars); - parser.Parse(lattice, &forest); - forest.PrintGraphviz(); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/gzstream.cc b/src/gzstream.cc deleted file mode 100644 index 9703e6ad..00000000 --- a/src/gzstream.cc +++ /dev/null @@ -1,165 +0,0 @@ -// ============================================================================ -// gzstream, C++ iostream classes wrapping the zlib compression library. -// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// ============================================================================ -// -// File : gzstream.C -// Revision : $Revision: 1.1 $ -// Revision_date : $Date: 2006/03/30 04:05:52 $ -// Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The -// Standard C++ Library". -// ============================================================================ - -#include "gzstream.h" -#include -#include - -#ifdef GZSTREAM_NAMESPACE -namespace GZSTREAM_NAMESPACE { -#endif - -// ---------------------------------------------------------------------------- -// Internal classes to implement gzstream. See header file for user classes. -// ---------------------------------------------------------------------------- - -// -------------------------------------- -// class gzstreambuf: -// -------------------------------------- - -gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { - if ( is_open()) - return (gzstreambuf*)0; - mode = open_mode; - // no append nor read/write mode - if ((mode & std::ios::ate) || (mode & std::ios::app) - || ((mode & std::ios::in) && (mode & std::ios::out))) - return (gzstreambuf*)0; - char fmode[10]; - char* fmodeptr = fmode; - if ( mode & std::ios::in) - *fmodeptr++ = 'r'; - else if ( mode & std::ios::out) - *fmodeptr++ = 'w'; - *fmodeptr++ = 'b'; - *fmodeptr = '\0'; - file = gzopen( name, fmode); - if (file == 0) - return (gzstreambuf*)0; - opened = 1; - return this; -} - -gzstreambuf * gzstreambuf::close() { - if ( is_open()) { - sync(); - opened = 0; - if ( gzclose( file) == Z_OK) - return this; - } - return (gzstreambuf*)0; -} - -int gzstreambuf::underflow() { // used for input buffer only - if ( gptr() && ( gptr() < egptr())) - return * 
reinterpret_cast( gptr()); - - if ( ! (mode & std::ios::in) || ! opened) - return EOF; - // Josuttis' implementation of inbuf - int n_putback = gptr() - eback(); - if ( n_putback > 4) - n_putback = 4; - memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); - - int num = gzread( file, buffer+4, bufferSize-4); - if (num <= 0) // ERROR or EOF - return EOF; - - // reset buffer pointers - setg( buffer + (4 - n_putback), // beginning of putback area - buffer + 4, // read position - buffer + 4 + num); // end of buffer - - // return next character - return * reinterpret_cast( gptr()); -} - -int gzstreambuf::flush_buffer() { - // Separate the writing of the buffer from overflow() and - // sync() operation. - int w = pptr() - pbase(); - if ( gzwrite( file, pbase(), w) != w) - return EOF; - pbump( -w); - return w; -} - -int gzstreambuf::overflow( int c) { // used for output buffer only - if ( ! ( mode & std::ios::out) || ! opened) - return EOF; - if (c != EOF) { - *pptr() = c; - pbump(1); - } - if ( flush_buffer() == EOF) - return EOF; - return c; -} - -int gzstreambuf::sync() { - // Changed to use flush_buffer() instead of overflow( EOF) - // which caused improper behavior with std::endl and flush(), - // bug reported by Vincent Ricard. - if ( pptr() && pptr() > pbase()) { - if ( flush_buffer() == EOF) - return -1; - } - return 0; -} - -// -------------------------------------- -// class gzstreambase: -// -------------------------------------- - -gzstreambase::gzstreambase( const char* name, int mode) { - init( &buf); - open( name, mode); -} - -gzstreambase::~gzstreambase() { - buf.close(); -} - -void gzstreambase::open( const char* name, int open_mode) { - if ( ! buf.open( name, open_mode)) - clear( rdstate() | std::ios::badbit); -} - -void gzstreambase::close() { - if ( buf.is_open()) - if ( ! 
buf.close()) - clear( rdstate() | std::ios::badbit); -} - -#ifdef GZSTREAM_NAMESPACE -} // namespace GZSTREAM_NAMESPACE -#endif - -// ============================================================================ -// EOF // diff --git a/src/gzstream.h b/src/gzstream.h deleted file mode 100644 index ad9785fd..00000000 --- a/src/gzstream.h +++ /dev/null @@ -1,121 +0,0 @@ -// ============================================================================ -// gzstream, C++ iostream classes wrapping the zlib compression library. -// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// ============================================================================ -// -// File : gzstream.h -// Revision : $Revision: 1.1 $ -// Revision_date : $Date: 2006/03/30 04:05:52 $ -// Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The -// Standard C++ Library". 
-// ============================================================================ - -#ifndef GZSTREAM_H -#define GZSTREAM_H 1 - -// standard C++ with new header file names and std:: namespace -#include -#include -#include - -#ifdef GZSTREAM_NAMESPACE -namespace GZSTREAM_NAMESPACE { -#endif - -// ---------------------------------------------------------------------------- -// Internal classes to implement gzstream. See below for user classes. -// ---------------------------------------------------------------------------- - -class gzstreambuf : public std::streambuf { -private: - static const int bufferSize = 47+256; // size of data buff - // totals 512 bytes under g++ for igzstream at the end. - - gzFile file; // file handle for compressed file - char buffer[bufferSize]; // data buffer - char opened; // open/close state of stream - int mode; // I/O mode - - int flush_buffer(); -public: - gzstreambuf() : opened(0) { - setp( buffer, buffer + (bufferSize-1)); - setg( buffer + 4, // beginning of putback area - buffer + 4, // read position - buffer + 4); // end position - // ASSERT: both input & output capabilities will not be used together - } - int is_open() { return opened; } - gzstreambuf* open( const char* name, int open_mode); - gzstreambuf* close(); - ~gzstreambuf() { close(); } - - virtual int overflow( int c = EOF); - virtual int underflow(); - virtual int sync(); -}; - -class gzstreambase : virtual public std::ios { -protected: - gzstreambuf buf; -public: - gzstreambase() { init(&buf); } - gzstreambase( const char* name, int open_mode); - ~gzstreambase(); - void open( const char* name, int open_mode); - void close(); - gzstreambuf* rdbuf() { return &buf; } -}; - -// ---------------------------------------------------------------------------- -// User classes. Use igzstream and ogzstream analogously to ifstream and -// ofstream respectively. They read and write files based on the gz* -// function interface of the zlib. Files are compatible with gzip compression. 
-// ---------------------------------------------------------------------------- - -class igzstream : public gzstreambase, public std::istream { -public: - igzstream() : std::istream( &buf) {} - igzstream( const char* name, int open_mode = std::ios::in) - : gzstreambase( name, open_mode), std::istream( &buf) {} - gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } - void open( const char* name, int open_mode = std::ios::in) { - gzstreambase::open( name, open_mode); - } -}; - -class ogzstream : public gzstreambase, public std::ostream { -public: - ogzstream() : std::ostream( &buf) {} - ogzstream( const char* name, int mode = std::ios::out) - : gzstreambase( name, mode), std::ostream( &buf) {} - gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } - void open( const char* name, int open_mode = std::ios::out) { - gzstreambase::open( name, open_mode); - } -}; - -#ifdef GZSTREAM_NAMESPACE -} // namespace GZSTREAM_NAMESPACE -#endif - -#endif // GZSTREAM_H -// ============================================================================ -// EOF // - diff --git a/src/hg.cc b/src/hg.cc deleted file mode 100644 index 7bd79394..00000000 --- a/src/hg.cc +++ /dev/null @@ -1,486 +0,0 @@ -#include "hg.h" - -#include -#include -#include -#include -#include - -#include "viterbi.h" -#include "inside_outside.h" -#include "tdict.h" - -using namespace std; - -double Hypergraph::NumberOfPaths() const { - return Inside(*this); -} - -prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector* posts) const { - const ScaledEdgeProb weight(scale); - SparseVector pv; - const double inside = InsideOutside, - EdgeFeaturesWeightFunction>(*this, &pv, weight); - posts->resize(edges_.size()); - for (int i = 0; i < edges_.size(); ++i) - (*posts)[i] = prob_t(pv.value(i)); - return prob_t(inside); -} - -prob_t Hypergraph::ComputeBestPathThroughEdges(vector* post) const { - vector in(edges_.size()); - vector out(edges_.size()); - post->resize(edges_.size()); - - vector 
ins_node_best(nodes_.size()); - for (int i = 0; i < nodes_.size(); ++i) { - const Node& node = nodes_[i]; - prob_t& node_ins_best = ins_node_best[i]; - if (node.in_edges_.empty()) node_ins_best = prob_t::One(); - for (int j = 0; j < node.in_edges_.size(); ++j) { - const Edge& edge = edges_[node.in_edges_[j]]; - prob_t& in_edge_sco = in[node.in_edges_[j]]; - in_edge_sco = edge.edge_prob_; - for (int k = 0; k < edge.tail_nodes_.size(); ++k) - in_edge_sco *= ins_node_best[edge.tail_nodes_[k]]; - if (in_edge_sco > node_ins_best) node_ins_best = in_edge_sco; - } - } - const prob_t ins_sco = ins_node_best[nodes_.size() - 1]; - - // sanity check - int tots = 0; - for (int i = 0; i < nodes_.size(); ++i) { if (nodes_[i].out_edges_.empty()) tots++; } - assert(tots == 1); - - // compute outside scores, potentially using inside scores - vector out_node_best(nodes_.size()); - for (int i = nodes_.size() - 1; i >= 0; --i) { - const Node& node = nodes_[i]; - prob_t& node_out_best = out_node_best[node.id_]; - if (node.out_edges_.empty()) node_out_best = prob_t::One(); - for (int j = 0; j < node.out_edges_.size(); ++j) { - const Edge& edge = edges_[node.out_edges_[j]]; - prob_t sco = edge.edge_prob_ * out_node_best[edge.head_node_]; - for (int k = 0; k < edge.tail_nodes_.size(); ++k) { - if (edge.tail_nodes_[k] != i) - sco *= ins_node_best[edge.tail_nodes_[k]]; - } - if (sco > node_out_best) node_out_best = sco; - } - for (int j = 0; j < node.in_edges_.size(); ++j) { - out[node.in_edges_[j]] = node_out_best; - } - } - - for (int i = 0; i < in.size(); ++i) - (*post)[i] = in[i] * out[i]; - // for (int i = 0; i < in.size(); ++i) - // cerr << "edge " << i << ": " << log((*post)[i]) << endl; - - return ins_sco; -} - -void Hypergraph::PushWeightsToSource(double scale) { - vector posts; - ComputeEdgePosteriors(scale, &posts); - for (int i = 0; i < nodes_.size(); ++i) { - const Hypergraph::Node& node = nodes_[i]; - prob_t z = prob_t::Zero(); - for (int j = 0; j < node.out_edges_.size(); 
++j) - z += posts[node.out_edges_[j]]; - for (int j = 0; j < node.out_edges_.size(); ++j) { - edges_[node.out_edges_[j]].edge_prob_ = posts[node.out_edges_[j]] / z; - } - } -} - -void Hypergraph::PushWeightsToGoal(double scale) { - vector posts; - ComputeEdgePosteriors(scale, &posts); - for (int i = 0; i < nodes_.size(); ++i) { - const Hypergraph::Node& node = nodes_[i]; - prob_t z = prob_t::Zero(); - for (int j = 0; j < node.in_edges_.size(); ++j) - z += posts[node.in_edges_[j]]; - for (int j = 0; j < node.in_edges_.size(); ++j) { - edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z; - } - } -} - -void Hypergraph::PruneEdges(const std::vector& prune_edge) { - assert(prune_edge.size() == edges_.size()); - TopologicallySortNodesAndEdges(nodes_.size() - 1, &prune_edge); -} - -void Hypergraph::DensityPruneInsideOutside(const double scale, - const bool use_sum_prod_semiring, - const double density, - const vector* preserve_mask) { - assert(density >= 1.0); - const int plen = ViterbiPathLength(*this); - vector bp; - int rnum = min(static_cast(edges_.size()), static_cast(density * static_cast(plen))); - if (rnum == edges_.size()) { - cerr << "No pruning required: denisty already sufficient"; - return; - } - vector io(edges_.size()); - if (use_sum_prod_semiring) - ComputeEdgePosteriors(scale, &io); - else - ComputeBestPathThroughEdges(&io); - assert(edges_.size() == io.size()); - vector sorted = io; - nth_element(sorted.begin(), sorted.begin() + rnum, sorted.end(), greater()); - const double cutoff = sorted[rnum]; - vector prune(edges_.size()); - for (int i = 0; i < edges_.size(); ++i) { - prune[i] = (io[i] < cutoff); - if (preserve_mask && (*preserve_mask)[i]) prune[i] = false; - } - PruneEdges(prune); -} - -void Hypergraph::BeamPruneInsideOutside( - const double scale, - const bool use_sum_prod_semiring, - const double alpha, - const vector* preserve_mask) { - assert(alpha > 0.0); - assert(scale > 0.0); - vector io(edges_.size()); - if 
(use_sum_prod_semiring) - ComputeEdgePosteriors(scale, &io); - else - ComputeBestPathThroughEdges(&io); - assert(edges_.size() == io.size()); - prob_t best; // initializes to zero - for (int i = 0; i < io.size(); ++i) - if (io[i] > best) best = io[i]; - const prob_t aprob(exp(-alpha)); - const prob_t cutoff = best * aprob; - // cerr << "aprob = " << aprob << "\t CUTOFF=" << cutoff << endl; - vector prune(edges_.size()); - //cerr << preserve_mask.size() << " " << edges_.size() << endl; - int pc = 0; - for (int i = 0; i < io.size(); ++i) { - const bool prune_edge = (io[i] < cutoff); - if (prune_edge) ++pc; - prune[i] = (io[i] < cutoff); - if (preserve_mask && (*preserve_mask)[i]) prune[i] = false; - } - // cerr << "Beam pruning " << pc << "/" << io.size() << " edges\n"; - PruneEdges(prune); -} - -void Hypergraph::PrintGraphviz() const { - int ei = 0; - cerr << "digraph G {\n rankdir=LR;\n nodesep=.05;\n"; - for (vector::const_iterator i = edges_.begin(); - i != edges_.end(); ++i) { - const Edge& edge=*i; - ++ei; - static const string none = ""; - string rule = (edge.rule_ ? edge.rule_->AsString(false) : none); - - cerr << " A_" << ei << " [label=\"" << rule << " p=" << edge.edge_prob_ - << " F:" << edge.feature_values_ - << "\" shape=\"rect\"];\n"; - for (int i = 0; i < edge.tail_nodes_.size(); ++i) { - cerr << " " << edge.tail_nodes_[i] << " -> A_" << ei << ";\n"; - } - cerr << " A_" << ei << " -> " << edge.head_node_ << ";\n"; - } - for (vector::const_iterator ni = nodes_.begin(); - ni != nodes_.end(); ++ni) { - cerr << " " << ni->id_ << "[label=\"" << (ni->cat_ < 0 ? 
TD::Convert(ni->cat_ * -1) : "") - //cerr << " " << ni->id_ << "[label=\"" << ni->cat_ - << " n=" << ni->id_ -// << ",x=" << &*ni -// << ",in=" << ni->in_edges_.size() -// << ",out=" << ni->out_edges_.size() - << "\"];\n"; - } - cerr << "}\n"; -} - -void Hypergraph::Union(const Hypergraph& other) { - if (&other == this) return; - if (nodes_.empty()) { nodes_ = other.nodes_; edges_ = other.edges_; return; } - int noff = nodes_.size(); - int eoff = edges_.size(); - int ogoal = other.nodes_.size() - 1; - int cgoal = noff - 1; - // keep a single goal node, so add nodes.size - 1 - nodes_.resize(nodes_.size() + ogoal); - // add all edges - edges_.resize(edges_.size() + other.edges_.size()); - - for (int i = 0; i < ogoal; ++i) { - const Node& on = other.nodes_[i]; - Node& cn = nodes_[i + noff]; - cn.id_ = i + noff; - cn.in_edges_.resize(on.in_edges_.size()); - for (int j = 0; j < on.in_edges_.size(); ++j) - cn.in_edges_[j] = on.in_edges_[j] + eoff; - - cn.out_edges_.resize(on.out_edges_.size()); - for (int j = 0; j < on.out_edges_.size(); ++j) - cn.out_edges_[j] = on.out_edges_[j] + eoff; - } - - for (int i = 0; i < other.edges_.size(); ++i) { - const Edge& oe = other.edges_[i]; - Edge& ce = edges_[i + eoff]; - ce.id_ = i + eoff; - ce.rule_ = oe.rule_; - ce.feature_values_ = oe.feature_values_; - if (oe.head_node_ == ogoal) { - ce.head_node_ = cgoal; - nodes_[cgoal].in_edges_.push_back(ce.id_); - } else { - ce.head_node_ = oe.head_node_ + noff; - } - ce.tail_nodes_.resize(oe.tail_nodes_.size()); - for (int j = 0; j < oe.tail_nodes_.size(); ++j) - ce.tail_nodes_[j] = oe.tail_nodes_[j] + noff; - } - - TopologicallySortNodesAndEdges(cgoal); -} - -int Hypergraph::MarkReachable(const Node& node, - vector* rmap, - const vector* prune_edges) const { - int total = 0; - if (!(*rmap)[node.id_]) { - total = 1; - (*rmap)[node.id_] = true; - for (int i = 0; i < node.in_edges_.size(); ++i) { - if (!(prune_edges && (*prune_edges)[node.in_edges_[i]])) { - for (int j = 0; j < 
edges_[node.in_edges_[i]].tail_nodes_.size(); ++j) - total += MarkReachable(nodes_[edges_[node.in_edges_[i]].tail_nodes_[j]], rmap, prune_edges); - } - } - } - return total; -} - -void Hypergraph::PruneUnreachable(int goal_node_id) { - TopologicallySortNodesAndEdges(goal_node_id, NULL); -} - -void Hypergraph::RemoveNoncoaccessibleStates(int goal_node_id) { - if (goal_node_id < 0) goal_node_id += nodes_.size(); - assert(goal_node_id >= 0); - assert(goal_node_id < nodes_.size()); - - // TODO finish implementation - abort(); -} - -void Hypergraph::TopologicallySortNodesAndEdges(int goal_index, - const vector* prune_edges) { - vector sedges(edges_.size()); - // figure out which nodes are reachable from the goal - vector reachable(nodes_.size(), false); - int num_reachable = MarkReachable(nodes_[goal_index], &reachable, prune_edges); - vector snodes(num_reachable); snodes.clear(); - - // enumerate all reachable nodes in topologically sorted order - vector old_node_to_new_id(nodes_.size(), -1); - vector node_to_incount(nodes_.size(), -1); - vector node_processed(nodes_.size(), false); - typedef map > PQueue; - PQueue pri_q; - for (int i = 0; i < nodes_.size(); ++i) { - if (!reachable[i]) - continue; - const int inedges = nodes_[i].in_edges_.size(); - int incount = inedges; - for (int j = 0; j < inedges; ++j) - if (edges_[nodes_[i].in_edges_[j]].tail_nodes_.size() == 0 || - (prune_edges && (*prune_edges)[nodes_[i].in_edges_[j]])) - --incount; - // cerr << &nodes_[i] <<" : incount=" << incount << "\tout=" << nodes_[i].out_edges_.size() << "\t(in-edges=" << inedges << ")\n"; - assert(node_to_incount[i] == -1); - node_to_incount[i] = incount; - pri_q[incount].insert(i); - } - - int edge_count = 0; - while (!pri_q.empty()) { - PQueue::iterator iter = pri_q.find(0); - assert(iter != pri_q.end()); - assert(!iter->second.empty()); - - // get first node with incount = 0 - const int cur_index = *iter->second.begin(); - const Node& node = nodes_[cur_index]; - 
assert(reachable[cur_index]); - //cerr << "node: " << node << endl; - const int new_node_index = snodes.size(); - old_node_to_new_id[cur_index] = new_node_index; - snodes.push_back(node); - Node& new_node = snodes.back(); - new_node.id_ = new_node_index; - new_node.out_edges_.clear(); - - // fix up edges - we can now process the in edges and - // the out edges of their tails - int oi = 0; - for (int i = 0; i < node.in_edges_.size(); ++i, ++oi) { - if (prune_edges && (*prune_edges)[node.in_edges_[i]]) { - --oi; - continue; - } - new_node.in_edges_[oi] = edge_count; - Edge& edge = sedges[edge_count]; - edge.id_ = edge_count; - ++edge_count; - const Edge& old_edge = edges_[node.in_edges_[i]]; - edge.rule_ = old_edge.rule_; - edge.feature_values_ = old_edge.feature_values_; - edge.head_node_ = new_node_index; - edge.tail_nodes_.resize(old_edge.tail_nodes_.size()); - edge.edge_prob_ = old_edge.edge_prob_; - edge.i_ = old_edge.i_; - edge.j_ = old_edge.j_; - edge.prev_i_ = old_edge.prev_i_; - edge.prev_j_ = old_edge.prev_j_; - for (int j = 0; j < old_edge.tail_nodes_.size(); ++j) { - const Node& old_tail_node = nodes_[old_edge.tail_nodes_[j]]; - edge.tail_nodes_[j] = old_node_to_new_id[old_tail_node.id_]; - snodes[edge.tail_nodes_[j]].out_edges_.push_back(edge_count-1); - assert(edge.tail_nodes_[j] != new_node_index); - } - } - assert(oi <= new_node.in_edges_.size()); - new_node.in_edges_.resize(oi); - - for (int i = 0; i < node.out_edges_.size(); ++i) { - const Edge& edge = edges_[node.out_edges_[i]]; - const int next_index = edge.head_node_; - assert(cur_index != next_index); - if (!reachable[next_index]) continue; - if (prune_edges && (*prune_edges)[edge.id_]) continue; - - bool dontReduce = false; - for (int j = 0; j < edge.tail_nodes_.size() && !dontReduce; ++j) { - int tail_index = edge.tail_nodes_[j]; - dontReduce = (tail_index != cur_index) && !node_processed[tail_index]; - } - if (dontReduce) - continue; - - const int incount = node_to_incount[next_index]; - if 
(incount <= 0) { - cerr << "incount = " << incount << ", should be > 0!\n"; - cerr << "do you have a cycle in your hypergraph?\n"; - abort(); - } - PQueue::iterator it = pri_q.find(incount); - assert(it != pri_q.end()); - it->second.erase(next_index); - if (it->second.empty()) pri_q.erase(it); - - // reinsert node with reduced incount - pri_q[incount-1].insert(next_index); - --node_to_incount[next_index]; - } - - // remove node from set - iter->second.erase(cur_index); - if (iter->second.empty()) - pri_q.erase(iter); - node_processed[cur_index] = true; - } - - sedges.resize(edge_count); - nodes_.swap(snodes); - edges_.swap(sedges); - assert(nodes_.back().out_edges_.size() == 0); -} - -TRulePtr Hypergraph::kEPSRule; -TRulePtr Hypergraph::kUnaryRule; - -void Hypergraph::EpsilonRemove(WordID eps) { - if (!kEPSRule) { - kEPSRule.reset(new TRule("[X] ||| ||| ")); - kUnaryRule.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); - } - vector kill(edges_.size(), false); - for (int i = 0; i < edges_.size(); ++i) { - const Edge& edge = edges_[i]; - if (edge.tail_nodes_.empty() && - edge.rule_->f_.size() == 1 && - edge.rule_->f_[0] == eps) { - kill[i] = true; - if (!edge.feature_values_.empty()) { - Node& node = nodes_[edge.head_node_]; - if (node.in_edges_.size() != 1) { - cerr << "[WARNING] edge with features going into non-empty node - can't promote\n"; - // this *probably* means that there are multiple derivations of the - // same sequence via different paths through the input forest - // this needs to be investigated and fixed - } else { - for (int j = 0; j < node.out_edges_.size(); ++j) - edges_[node.out_edges_[j]].feature_values_ += edge.feature_values_; - // cerr << "PROMOTED " << edge.feature_values_ << endl; - } - } - } - } - bool created_eps = false; - PruneEdges(kill); - for (int i = 0; i < nodes_.size(); ++i) { - const Node& node = nodes_[i]; - if (node.in_edges_.empty()) { - for (int j = 0; j < node.out_edges_.size(); ++j) { - Edge& edge = edges_[node.out_edges_[j]]; - 
if (edge.rule_->Arity() == 2) { - assert(edge.rule_->f_.size() == 2); - assert(edge.rule_->e_.size() == 2); - edge.rule_ = kUnaryRule; - int cur = node.id_; - int t = -1; - assert(edge.tail_nodes_.size() == 2); - for (int i = 0; i < 2; ++i) if (edge.tail_nodes_[i] != cur) { t = edge.tail_nodes_[i]; } - assert(t != -1); - edge.tail_nodes_.resize(1); - edge.tail_nodes_[0] = t; - } else { - edge.rule_ = kEPSRule; - edge.rule_->f_[0] = eps; - edge.rule_->e_[0] = eps; - edge.tail_nodes_.clear(); - created_eps = true; - } - } - } - } - vector k2(edges_.size(), false); - PruneEdges(k2); - if (created_eps) EpsilonRemove(eps); -} - -struct EdgeWeightSorter { - const Hypergraph& hg; - EdgeWeightSorter(const Hypergraph& h) : hg(h) {} - bool operator()(int a, int b) const { - return hg.edges_[a].edge_prob_ > hg.edges_[b].edge_prob_; - } -}; - -void Hypergraph::SortInEdgesByEdgeWeights() { - for (int i = 0; i < nodes_.size(); ++i) { - Node& node = nodes_[i]; - sort(node.in_edges_.begin(), node.in_edges_.end(), EdgeWeightSorter(*this)); - } -} - diff --git a/src/hg.h b/src/hg.h deleted file mode 100644 index 7a2658b8..00000000 --- a/src/hg.h +++ /dev/null @@ -1,225 +0,0 @@ -#ifndef _HG_H_ -#define _HG_H_ - -#include -#include - -#include "small_vector.h" -#include "sparse_vector.h" -#include "wordid.h" -#include "trule.h" -#include "prob.h" - -// class representing an acyclic hypergraph -// - edges have 1 head, 0..n tails -class Hypergraph { - public: - Hypergraph() {} - - // SmallVector is a fast, small vector implementation for sizes <= 2 - typedef SmallVector TailNodeVector; - - // TODO get rid of state_ and cat_? 
- struct Node { - Node() : id_(), cat_() {} - int id_; // equal to this object's position in the nodes_ vector - WordID cat_; // non-terminal category if <0, 0 if not set - std::vector in_edges_; // contents refer to positions in edges_ - std::vector out_edges_; // contents refer to positions in edges_ - std::string state_; // opaque state - }; - - // TODO get rid of edge_prob_? (can be computed on the fly as the dot - // product of the weight vector and the feature values) - struct Edge { - Edge() : i_(-1), j_(-1), prev_i_(-1), prev_j_(-1) {} - inline int Arity() const { return tail_nodes_.size(); } - int head_node_; // refers to a position in nodes_ - TailNodeVector tail_nodes_; // contents refer to positions in nodes_ - TRulePtr rule_; - SparseVector feature_values_; - prob_t edge_prob_; // dot product of weights and feat_values - int id_; // equal to this object's position in the edges_ vector - - // span info. typically, i_ and j_ refer to indices in the source sentence - // if a synchronous parse has been executed i_ and j_ will refer to indices - // in the target sentence / lattice and prev_i_ prev_j_ will refer to - // positions in the source. Note: it is up to the translator implementation - // to properly set these values. For some models (like the Forest-input - // phrase based model) it may not be straightforward to do. if these values - // are not properly set, most things will work but alignment and any features - // that depend on them will be broken. 
- short int i_; - short int j_; - short int prev_i_; - short int prev_j_; - }; - - void swap(Hypergraph& other) { - other.nodes_.swap(nodes_); - other.edges_.swap(edges_); - } - - void ResizeNodes(int size) { - nodes_.resize(size); - for (int i = 0; i < size; ++i) nodes_[i].id_ = i; - } - - // reserves space in the nodes vector to prevent memory locations - // from changing - void ReserveNodes(size_t n, size_t e = 0) { - nodes_.reserve(n); - if (e) edges_.reserve(e); - } - - Edge* AddEdge(const TRulePtr& rule, const TailNodeVector& tail) { - edges_.push_back(Edge()); - Edge* edge = &edges_.back(); - edge->rule_ = rule; - edge->tail_nodes_ = tail; - edge->id_ = edges_.size() - 1; - for (int i = 0; i < edge->tail_nodes_.size(); ++i) - nodes_[edge->tail_nodes_[i]].out_edges_.push_back(edge->id_); - return edge; - } - - Node* AddNode(const WordID& cat, const std::string& state = "") { - nodes_.push_back(Node()); - nodes_.back().cat_ = cat; - nodes_.back().state_ = state; - nodes_.back().id_ = nodes_.size() - 1; - return &nodes_.back(); - } - - void ConnectEdgeToHeadNode(const int edge_id, const int head_id) { - edges_[edge_id].head_node_ = head_id; - nodes_[head_id].in_edges_.push_back(edge_id); - } - - // TODO remove this - use the version that takes indices - void ConnectEdgeToHeadNode(Edge* edge, Node* head) { - edge->head_node_ = head->id_; - head->in_edges_.push_back(edge->id_); - } - - // merge the goal node from other with this goal node - void Union(const Hypergraph& other); - - void PrintGraphviz() const; - - // compute the total number of paths in the forest - double NumberOfPaths() const; - - // BEWARE. this assumes that the source and target language - // strings are identical and that there are no loops. - // It assumes a bunch of other things about where the - // epsilons will be. It tries to assert failure if you - // break these assumptions, but it may not. 
- // TODO - make this work - void EpsilonRemove(WordID eps); - - // multiple the weights vector by the edge feature vector - // (inner product) to set the edge probabilities - template - void Reweight(const V& weights) { - for (int i = 0; i < edges_.size(); ++i) { - Edge& e = edges_[i]; - e.edge_prob_.logeq(e.feature_values_.dot(weights)); - } - } - - // computes inside and outside scores for each - // edge in the hypergraph - // alpha->size = edges_.size = beta->size - // returns inside prob of goal node - prob_t ComputeEdgePosteriors(double scale, - std::vector* posts) const; - - // find the score of the very best path passing through each edge - prob_t ComputeBestPathThroughEdges(std::vector* posts) const; - - // move weights as near to the source as possible, resulting in a - // stochastic automaton. ONLY FUNCTIONAL FOR *LATTICES*. - // See M. Mohri and M. Riley. A Weight Pushing Algorithm for Large - // Vocabulary Speech Recognition. 2001. - // the log semiring (NOT tropical) is used - void PushWeightsToSource(double scale = 1.0); - // same, except weights are pushed to the goal, works for HGs, - // not just lattices - void PushWeightsToGoal(double scale = 1.0); - - void SortInEdgesByEdgeWeights(); - - void PruneUnreachable(int goal_node_id); // DEPRECATED - - void RemoveNoncoaccessibleStates(int goal_node_id = -1); - - // remove edges from the hypergraph if prune_edge[edge_id] is true - void PruneEdges(const std::vector& prune_edge); - - // if you don't know, use_sum_prod_semiring should be false - void DensityPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double density, - const std::vector* preserve_mask = NULL); - - // prunes any edge whose score on the best path taking that edge is more than alpha away - // from the score of the global best past (or the highest edge posterior) - void BeamPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double alpha, - const std::vector* preserve_mask = NULL); - 
- void clear() { - nodes_.clear(); - edges_.clear(); - } - - inline size_t NumberOfEdges() const { return edges_.size(); } - inline size_t NumberOfNodes() const { return nodes_.size(); } - inline bool empty() const { return nodes_.empty(); } - - // nodes_ is sorted in topological order - std::vector nodes_; - // edges_ is not guaranteed to be in any particular order - std::vector edges_; - - // reorder nodes_ so they are in topological order - // source nodes at 0 sink nodes at size-1 - void TopologicallySortNodesAndEdges(int goal_idx, - const std::vector* prune_edges = NULL); - private: - // returns total nodes reachable - int MarkReachable(const Node& node, - std::vector* rmap, - const std::vector* prune_edges) const; - - static TRulePtr kEPSRule; - static TRulePtr kUnaryRule; -}; - -// common WeightFunctions, map an edge -> WeightType -// for generic Viterbi/Inside algorithms -struct EdgeProb { - inline const prob_t& operator()(const Hypergraph::Edge& e) const { return e.edge_prob_; } -}; - -struct ScaledEdgeProb { - ScaledEdgeProb(const double& alpha) : alpha_(alpha) {} - inline prob_t operator()(const Hypergraph::Edge& e) const { return e.edge_prob_.pow(alpha_); } - const double alpha_; -}; - -struct EdgeFeaturesWeightFunction { - inline const SparseVector& operator()(const Hypergraph::Edge& e) const { return e.feature_values_; } -}; - -struct TransitionEventWeightFunction { - inline SparseVector operator()(const Hypergraph::Edge& e) const { - SparseVector result; - result.set_value(e.id_, prob_t::One()); - return result; - } -}; - -struct TransitionCountWeightFunction { - inline double operator()(const Hypergraph::Edge& e) const { (void)e; return 1.0; } -}; - -#endif diff --git a/src/hg_intersect.cc b/src/hg_intersect.cc deleted file mode 100644 index a5e8913a..00000000 --- a/src/hg_intersect.cc +++ /dev/null @@ -1,121 +0,0 @@ -#include "hg_intersect.h" - -#include -#include -#include -#include - -#include "tdict.h" -#include "hg.h" -#include "trule.h" 
-#include "wordid.h" -#include "bottom_up_parser.h" - -using boost::lexical_cast; -using namespace std::tr1; -using namespace std; - -struct RuleFilter { - unordered_map, bool, boost::hash > > exists_; - bool true_lattice; - RuleFilter(const Lattice& target, int max_phrase_size) { - true_lattice = false; - for (int i = 0; i < target.size(); ++i) { - vector phrase; - int lim = min(static_cast(target.size()), i + max_phrase_size); - for (int j = i; j < lim; ++j) { - if (target[j].size() > 1) { true_lattice = true; break; } - phrase.push_back(target[j][0].label); - exists_[phrase] = true; - } - } - vector sos(1, TD::Convert("")); - exists_[sos] = true; - } - bool operator()(const TRule& r) const { - // TODO do some smarter filtering for lattices - if (true_lattice) return false; // don't filter "true lattice" input - const vector& e = r.e(); - for (int i = 0; i < e.size(); ++i) { - if (e[i] <= 0) continue; - vector phrase; - for (int j = i; j < e.size(); ++j) { - if (e[j] <= 0) break; - phrase.push_back(e[j]); - if (exists_.count(phrase) == 0) return true; - } - } - return false; - } -}; - -bool HG::Intersect(const Lattice& target, Hypergraph* hg) { - vector rem(hg->edges_.size(), false); - const RuleFilter filter(target, 15); // TODO make configurable - for (int i = 0; i < rem.size(); ++i) - rem[i] = filter(*hg->edges_[i].rule_); - hg->PruneEdges(rem); - - const int nedges = hg->edges_.size(); - const int nnodes = hg->nodes_.size(); - - TextGrammar* g = new TextGrammar; - GrammarPtr gp(g); - vector cats(nnodes); - // each node in the translation forest becomes a "non-terminal" in the new - // grammar, create the labels here - for (int i = 0; i < nnodes; ++i) - cats[i] = TD::Convert("CAT_" + lexical_cast(i)) * -1; - - // construct the grammar - for (int i = 0; i < nedges; ++i) { - const Hypergraph::Edge& edge = hg->edges_[i]; - const vector& tgt = edge.rule_->e(); - const vector& src = edge.rule_->f(); - TRulePtr rule(new TRule); - rule->prev_i = edge.i_; - 
rule->prev_j = edge.j_; - rule->lhs_ = cats[edge.head_node_]; - vector& f = rule->f_; - vector& e = rule->e_; - f.resize(tgt.size()); // swap source and target, since the parser - e.resize(src.size()); // parses using the source side! - Hypergraph::TailNodeVector tn(edge.tail_nodes_.size()); - int ntc = 0; - for (int j = 0; j < tgt.size(); ++j) { - const WordID& cur = tgt[j]; - if (cur > 0) { - f[j] = cur; - } else { - tn[ntc++] = cur; - f[j] = cats[edge.tail_nodes_[-cur]]; - } - } - ntc = 0; - for (int j = 0; j < src.size(); ++j) { - const WordID& cur = src[j]; - if (cur > 0) { - e[j] = cur; - } else { - e[j] = tn[ntc++]; - } - } - rule->scores_ = edge.feature_values_; - rule->parent_rule_ = edge.rule_; - rule->ComputeArity(); - //cerr << "ADD: " << rule->AsString() << endl; - - g->AddRule(rule); - } - g->SetMaxSpan(target.size() + 1); - const string& new_goal = TD::Convert(cats.back() * -1); - vector grammars(1, gp); - Hypergraph tforest; - ExhaustiveBottomUpParser parser(new_goal, grammars); - if (!parser.Parse(target, &tforest)) - return false; - else - hg->swap(tforest); - return true; -} - diff --git a/src/hg_intersect.h b/src/hg_intersect.h deleted file mode 100644 index 826bdaae..00000000 --- a/src/hg_intersect.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _HG_INTERSECT_H_ -#define _HG_INTERSECT_H_ - -#include - -#include "lattice.h" - -class Hypergraph; -struct HG { - static bool Intersect(const Lattice& target, Hypergraph* hg); -}; - -#endif diff --git a/src/hg_io.cc b/src/hg_io.cc deleted file mode 100644 index e21b1714..00000000 --- a/src/hg_io.cc +++ /dev/null @@ -1,598 +0,0 @@ -#include "hg_io.h" - -#include -#include - -#include - -#include "tdict.h" -#include "json_parse.h" -#include "hg.h" - -using namespace std; - -struct HGReader : public JSONParser { - HGReader(Hypergraph* g) : rp("[X] ||| "), state(-1), hg(*g), nodes_needed(true), edges_needed(true) { nodes = 0; edges = 0; } - - void CreateNode(const string& cat, const vector& in_edges) { - WordID 
c = TD::Convert("X") * -1; - if (!cat.empty()) c = TD::Convert(cat) * -1; - Hypergraph::Node* node = hg.AddNode(c, ""); - for (int i = 0; i < in_edges.size(); ++i) { - if (in_edges[i] >= hg.edges_.size()) { - cerr << "JSONParser: in_edges[" << i << "]=" << in_edges[i] - << ", but hg only has " << hg.edges_.size() << " edges!\n"; - abort(); - } - hg.ConnectEdgeToHeadNode(&hg.edges_[in_edges[i]], node); - } - } - void CreateEdge(const TRulePtr& rule, SparseVector* feats, const SmallVector& tail) { - Hypergraph::Edge* edge = hg.AddEdge(rule, tail); - feats->swap(edge->feature_values_); - } - - bool HandleJSONEvent(int type, const JSON_value* value) { - switch(state) { - case -1: - assert(type == JSON_T_OBJECT_BEGIN); - state = 0; - break; - case 0: - if (type == JSON_T_OBJECT_END) { - //cerr << "HG created\n"; // TODO, signal some kind of callback - } else if (type == JSON_T_KEY) { - string val = value->vu.str.value; - if (val == "features") { assert(fdict.empty()); state = 1; } - else if (val == "is_sorted") { state = 3; } - else if (val == "rules") { assert(rules.empty()); state = 4; } - else if (val == "node") { state = 8; } - else if (val == "edges") { state = 13; } - else { cerr << "Unexpected key: " << val << endl; return false; } - } - break; - - // features - case 1: - if(type == JSON_T_NULL) { state = 0; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 2; - break; - case 2: - if(type == JSON_T_ARRAY_END) { state = 0; break; } - assert(type == JSON_T_STRING); - fdict.push_back(FD::Convert(value->vu.str.value)); - break; - - // is_sorted - case 3: - assert(type == JSON_T_TRUE || type == JSON_T_FALSE); - is_sorted = (type == JSON_T_TRUE); - if (!is_sorted) { cerr << "[WARNING] is_sorted flag is ignored\n"; } - state = 0; - break; - - // rules - case 4: - if(type == JSON_T_NULL) { state = 0; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 5; - break; - case 5: - if(type == JSON_T_ARRAY_END) { state = 0; break; } - assert(type == JSON_T_INTEGER); 
- state = 6; - rule_id = value->vu.integer_value; - break; - case 6: - assert(type == JSON_T_STRING); - rules[rule_id] = TRulePtr(new TRule(value->vu.str.value)); - state = 5; - break; - - // Nodes - case 8: - assert(type == JSON_T_OBJECT_BEGIN); - ++nodes; - in_edges.clear(); - cat.clear(); - state = 9; break; - case 9: - if (type == JSON_T_OBJECT_END) { - //cerr << "Creating NODE\n"; - CreateNode(cat, in_edges); - state = 0; break; - } - assert(type == JSON_T_KEY); - cur_key = value->vu.str.value; - if (cur_key == "cat") { assert(cat.empty()); state = 10; break; } - if (cur_key == "in_edges") { assert(in_edges.empty()); state = 11; break; } - cerr << "Syntax error: unexpected key " << cur_key << " in node specification.\n"; - return false; - case 10: - assert(type == JSON_T_STRING || type == JSON_T_NULL); - cat = value->vu.str.value; - state = 9; break; - case 11: - if (type == JSON_T_NULL) { state = 9; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 12; break; - case 12: - if (type == JSON_T_ARRAY_END) { state = 9; break; } - assert(type == JSON_T_INTEGER); - //cerr << "in_edges: " << value->vu.integer_value << endl; - in_edges.push_back(value->vu.integer_value); - break; - - // "edges": [ { "tail": null, "feats" : [0,1.63,1,-0.54], "rule": 12}, - // { "tail": null, "feats" : [0,0.87,1,0.02], "rule": 17}, - // { "tail": [0], "feats" : [1,2.3,2,15.3,"ExtraFeature",1.2], "rule": 13}] - case 13: - assert(type == JSON_T_ARRAY_BEGIN); - state = 14; - break; - case 14: - if (type == JSON_T_ARRAY_END) { state = 0; break; } - assert(type == JSON_T_OBJECT_BEGIN); - //cerr << "New edge\n"; - ++edges; - cur_rule.reset(); feats.clear(); tail.clear(); - state = 15; break; - case 15: - if (type == JSON_T_OBJECT_END) { - CreateEdge(cur_rule, &feats, tail); - state = 14; break; - } - assert(type == JSON_T_KEY); - cur_key = value->vu.str.value; - //cerr << "edge key " << cur_key << endl; - if (cur_key == "rule") { assert(!cur_rule); state = 16; break; } - if (cur_key == 
"feats") { assert(feats.empty()); state = 17; break; } - if (cur_key == "tail") { assert(tail.empty()); state = 20; break; } - cerr << "Unexpected key " << cur_key << " in edge specification\n"; - return false; - case 16: // edge.rule - if (type == JSON_T_INTEGER) { - int rule_id = value->vu.integer_value; - if (rules.find(rule_id) == rules.end()) { - // rules list must come before the edge definitions! - cerr << "Rule_id " << rule_id << " given but only loaded " << rules.size() << " rules\n"; - return false; - } - cur_rule = rules[rule_id]; - } else if (type == JSON_T_STRING) { - cur_rule.reset(new TRule(value->vu.str.value)); - } else { - cerr << "Rule must be either a rule id or a rule string" << endl; - return false; - } - // cerr << "Edge: rule=" << cur_rule->AsString() << endl; - state = 15; - break; - case 17: // edge.feats - if (type == JSON_T_NULL) { state = 15; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 18; break; - case 18: - if (type == JSON_T_ARRAY_END) { state = 15; break; } - if (type != JSON_T_INTEGER && type != JSON_T_STRING) { - cerr << "Unexpected feature id type\n"; return false; - } - if (type == JSON_T_INTEGER) { - fid = value->vu.integer_value; - assert(fid < fdict.size()); - fid = fdict[fid]; - } else if (JSON_T_STRING) { - fid = FD::Convert(value->vu.str.value); - } else { abort(); } - state = 19; - break; - case 19: - { - assert(type == JSON_T_INTEGER || type == JSON_T_FLOAT); - double val = (type == JSON_T_INTEGER ? 
static_cast(value->vu.integer_value) : - strtod(value->vu.str.value, NULL)); - feats.set_value(fid, val); - state = 18; - break; - } - case 20: // edge.tail - if (type == JSON_T_NULL) { state = 15; break; } - assert(type == JSON_T_ARRAY_BEGIN); - state = 21; break; - case 21: - if (type == JSON_T_ARRAY_END) { state = 15; break; } - assert(type == JSON_T_INTEGER); - tail.push_back(value->vu.integer_value); - break; - } - return true; - } - string rp; - string cat; - SmallVector tail; - vector in_edges; - TRulePtr cur_rule; - map rules; - vector fdict; - SparseVector feats; - int state; - int fid; - int nodes; - int edges; - string cur_key; - Hypergraph& hg; - int rule_id; - bool nodes_needed; - bool edges_needed; - bool is_sorted; -}; - -bool HypergraphIO::ReadFromJSON(istream* in, Hypergraph* hg) { - hg->clear(); - HGReader reader(hg); - return reader.Parse(in); -} - -static void WriteRule(const TRule& r, ostream* out) { - if (!r.lhs_) { (*out) << "[X] ||| "; } - JSONParser::WriteEscapedString(r.AsString(), out); -} - -bool HypergraphIO::WriteToJSON(const Hypergraph& hg, bool remove_rules, ostream* out) { - map rid; - ostream& o = *out; - rid[NULL] = 0; - o << '{'; - if (!remove_rules) { - o << "\"rules\":["; - for (int i = 0; i < hg.edges_.size(); ++i) { - const TRule* r = hg.edges_[i].rule_.get(); - int &id = rid[r]; - if (!id) { - id=rid.size() - 1; - if (id > 1) o << ','; - o << id << ','; - WriteRule(*r, &o); - }; - } - o << "],"; - } - const bool use_fdict = FD::NumFeats() < 1000; - if (use_fdict) { - o << "\"features\":["; - for (int i = 1; i < FD::NumFeats(); ++i) { - o << (i==1 ? 
"":",") << '"' << FD::Convert(i) << '"'; - } - o << "],"; - } - vector edgemap(hg.edges_.size(), -1); // edges may be in non-topo order - int edge_count = 0; - for (int i = 0; i < hg.nodes_.size(); ++i) { - const Hypergraph::Node& node = hg.nodes_[i]; - if (i > 0) { o << ","; } - o << "\"edges\":["; - for (int j = 0; j < node.in_edges_.size(); ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - edgemap[edge.id_] = edge_count; - ++edge_count; - o << (j == 0 ? "" : ",") << "{"; - - o << "\"tail\":["; - for (int k = 0; k < edge.tail_nodes_.size(); ++k) { - o << (k > 0 ? "," : "") << edge.tail_nodes_[k]; - } - o << "],"; - - o << "\"feats\":["; - bool first = true; - for (SparseVector::const_iterator it = edge.feature_values_.begin(); it != edge.feature_values_.end(); ++it) { - if (!it->second) continue; - if (!first) o << ','; - if (use_fdict) - o << (it->first - 1); - else - o << '"' << FD::Convert(it->first) << '"'; - o << ',' << it->second; - first = false; - } - o << "]"; - if (!remove_rules) { o << ",\"rule\":" << rid[edge.rule_.get()]; } - o << "}"; - } - o << "],"; - - o << "\"node\":{\"in_edges\":["; - for (int j = 0; j < node.in_edges_.size(); ++j) { - int mapped_edge = edgemap[node.in_edges_[j]]; - assert(mapped_edge >= 0); - o << (j == 0 ? 
"" : ",") << mapped_edge; - } - o << "]"; - if (node.cat_ < 0) { o << ",\"cat\":\"" << TD::Convert(node.cat_ * -1) << '"'; } - o << "}"; - } - o << "}\n"; - return true; -} - -bool needs_escape[128]; -void InitEscapes() { - memset(needs_escape, false, 128); - needs_escape[static_cast('\'')] = true; - needs_escape[static_cast('\\')] = true; -} - -string HypergraphIO::Escape(const string& s) { - size_t len = s.size(); - for (int i = 0; i < s.size(); ++i) { - unsigned char c = s[i]; - if (c < 128 && needs_escape[c]) ++len; - } - if (len == s.size()) return s; - string res(len, ' '); - size_t o = 0; - for (int i = 0; i < s.size(); ++i) { - unsigned char c = s[i]; - if (c < 128 && needs_escape[c]) - res[o++] = '\\'; - res[o++] = c; - } - assert(o == len); - return res; -} - -string HypergraphIO::AsPLF(const Hypergraph& hg, bool include_global_parentheses) { - static bool first = true; - if (first) { InitEscapes(); first = false; } - if (hg.nodes_.empty()) return "()"; - ostringstream os; - if (include_global_parentheses) os << '('; - static const string EPS="*EPS*"; - for (int i = 0; i < hg.nodes_.size()-1; ++i) { - if (hg.nodes_[i].out_edges_.empty()) abort(); - const bool last_node = (i == hg.nodes_.size() - 2); - const int out_edges_size = hg.nodes_[i].out_edges_.size(); - // compound splitter adds an extra goal transition which we suppress with - // the following conditional - if (!last_node || out_edges_size != 1 || - hg.edges_[hg.nodes_[i].out_edges_[0]].rule_->EWords() == 1) { - os << '('; - for (int j = 0; j < out_edges_size; ++j) { - const Hypergraph::Edge& e = hg.edges_[hg.nodes_[i].out_edges_[j]]; - const string output = e.rule_->e_.size() ==2 ? 
Escape(TD::Convert(e.rule_->e_[1])) : EPS; - double prob = log(e.edge_prob_); - if (isinf(prob)) { prob = -9e20; } - if (isnan(prob)) { prob = 0; } - os << "('" << output << "'," << prob << "," << e.head_node_ - i << "),"; - } - os << "),"; - } - } - if (include_global_parentheses) os << ')'; - return os.str(); -} - -namespace PLF { - -const string chars = "'\\"; -const char& quote = chars[0]; -const char& slash = chars[1]; - -// safe get -inline char get(const std::string& in, int c) { - if (c < 0 || c >= (int)in.size()) return 0; - else return in[(size_t)c]; -} - -// consume whitespace -inline void eatws(const std::string& in, int& c) { - while (get(in,c) == ' ') { c++; } -} - -// from 'foo' return foo -std::string getEscapedString(const std::string& in, int &c) -{ - eatws(in,c); - if (get(in,c++) != quote) return "ERROR"; - std::string res; - char cur = 0; - do { - cur = get(in,c++); - if (cur == slash) { res += get(in,c++); } - else if (cur != quote) { res += cur; } - } while (get(in,c) != quote && (c < (int)in.size())); - c++; - eatws(in,c); - return res; -} - -// basically atof -float getFloat(const std::string& in, int &c) -{ - std::string tmp; - eatws(in,c); - while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') { - tmp += get(in,c++); - } - eatws(in,c); - if (tmp.empty()) { - cerr << "Syntax error while reading number! 
col=" << c << endl; - abort(); - } - return atof(tmp.c_str()); -} - -// basically atoi -int getInt(const std::string& in, int &c) -{ - std::string tmp; - eatws(in,c); - while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') { - tmp += get(in,c++); - } - eatws(in,c); - return atoi(tmp.c_str()); -} - -// maximum number of nodes permitted -#define MAX_NODES 100000000 -// parse ('foo', 0.23) -void ReadPLFEdge(const std::string& in, int &c, int cur_node, Hypergraph* hg) { - if (get(in,c++) != '(') { assert(!"PCN/PLF parse error: expected ( at start of cn alt block\n"); } - vector ewords(2, 0); - ewords[1] = TD::Convert(getEscapedString(in,c)); - TRulePtr r(new TRule(ewords)); - // cerr << "RULE: " << r->AsString() << endl; - if (get(in,c++) != ',') { assert(!"PCN/PLF parse error: expected , after string\n"); } - size_t cnNext = 1; - std::vector probs; - probs.push_back(getFloat(in,c)); - while (get(in,c) == ',') { - c++; - float val = getFloat(in,c); - probs.push_back(val); - // cerr << val << endl; //REMO - } - //if we read more than one prob, this was a lattice, last item was column increment - if (probs.size()>1) { - cnNext = static_cast(probs.back()); - probs.pop_back(); - if (cnNext < 1) { cerr << cnNext << endl; - assert(!"PCN/PLF parse error: bad link length at last element of cn alt block\n"); } - } - if (get(in,c++) != ')') { assert(!"PCN/PLF parse error: expected ) at end of cn alt block\n"); } - eatws(in,c); - Hypergraph::TailNodeVector tail(1, cur_node); - Hypergraph::Edge* edge = hg->AddEdge(r, tail); - //cerr << " <--" << cur_node << endl; - int head_node = cur_node + cnNext; - assert(head_node < MAX_NODES); // prevent malicious PLFs from using all the memory - if (hg->nodes_.size() < (head_node + 1)) { hg->ResizeNodes(head_node + 1); } - hg->ConnectEdgeToHeadNode(edge, &hg->nodes_[head_node]); - for (int i = 0; i < probs.size(); ++i) - edge->feature_values_.set_value(FD::Convert("Feature_" + boost::lexical_cast(i)), 
probs[i]); -} - -// parse (('foo', 0.23), ('bar', 0.77)) -void ReadPLFNode(const std::string& in, int &c, int cur_node, int line, Hypergraph* hg) { - //cerr << "PLF READING NODE " << cur_node << endl; - if (hg->nodes_.size() < (cur_node + 1)) { hg->ResizeNodes(cur_node + 1); } - if (get(in,c++) != '(') { cerr << line << ": Syntax error 1\n"; abort(); } - eatws(in,c); - while (1) { - if (c > (int)in.size()) { break; } - if (get(in,c) == ')') { - c++; - eatws(in,c); - break; - } - if (get(in,c) == ',' && get(in,c+1) == ')') { - c+=2; - eatws(in,c); - break; - } - if (get(in,c) == ',') { c++; eatws(in,c); } - ReadPLFEdge(in, c, cur_node, hg); - } -} - -} // namespace PLF - -void HypergraphIO::ReadFromPLF(const std::string& in, Hypergraph* hg, int line) { - hg->clear(); - int c = 0; - int cur_node = 0; - if (in[c++] != '(') { cerr << line << ": Syntax error!\n"; abort(); } - while (1) { - if (c > (int)in.size()) { break; } - if (PLF::get(in,c) == ')') { - c++; - PLF::eatws(in,c); - break; - } - if (PLF::get(in,c) == ',' && PLF::get(in,c+1) == ')') { - c+=2; - PLF::eatws(in,c); - break; - } - if (PLF::get(in,c) == ',') { c++; PLF::eatws(in,c); } - PLF::ReadPLFNode(in, c, cur_node, line, hg); - ++cur_node; - } - assert(cur_node == hg->nodes_.size() - 1); -} - -void HypergraphIO::PLFtoLattice(const string& plf, Lattice* pl) { - Lattice& l = *pl; - Hypergraph g; - ReadFromPLF(plf, &g, 0); - const int num_nodes = g.nodes_.size() - 1; - l.resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - vector& alts = l[i]; - const Hypergraph::Node& node = g.nodes_[i]; - const int num_alts = node.out_edges_.size(); - alts.resize(num_alts); - for (int j = 0; j < num_alts; ++j) { - const Hypergraph::Edge& edge = g.edges_[node.out_edges_[j]]; - alts[j].label = edge.rule_->e_[1]; - alts[j].cost = edge.feature_values_.value(FD::Convert("Feature_0")); - alts[j].dist2next = edge.head_node_ - node.id_; - } - } -} - -namespace B64 { - -static const char 
cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; - -static void encodeblock(const unsigned char* in, ostream* os, int len) { - char out[4]; - out[0] = cb64[ in[0] >> 2 ]; - out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; - out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); - out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); - os->write(out, 4); -} - -void b64encode(const char* data, const size_t size, ostream* out) { - size_t cur = 0; - while(cur < size) { - int len = min(static_cast(3), size - cur); - encodeblock(reinterpret_cast(&data[cur]), out, len); - cur += len; - } -} - -static void decodeblock(const unsigned char* in, unsigned char* out) { - out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); - out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); - out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); -} - -bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { - size_t cur = 0; - size_t ocur = 0; - unsigned char in[4]; - while(cur < insize) { - assert(ocur < outsize); - for (int i = 0; i < 4; ++i) { - unsigned char v = data[cur]; - v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]); - if (!v) { - cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; - return false; - } - v = (unsigned char) ((v == '$') ? 
'\0' : v - 61); - if (v) in[i] = v - 1; else in[i] = 0; - ++cur; - } - decodeblock(in, reinterpret_cast(&out[ocur])); - ocur += 3; - } - return true; -} -} - diff --git a/src/hg_io.h b/src/hg_io.h deleted file mode 100644 index 69a516c1..00000000 --- a/src/hg_io.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef _HG_IO_H_ -#define _HG_IO_H_ - -#include - -#include "lattice.h" -class Hypergraph; - -struct HypergraphIO { - - // the format is basically a list of nodes and edges in topological order - // any edge you read, you must have already read its tail nodes - // any node you read, you must have already read its incoming edges - // this may make writing a bit more challenging if your forest is not - // topologically sorted (but that probably doesn't happen very often), - // but it makes reading much more memory efficient. - // see test_data/small.json.gz for an email encoding - static bool ReadFromJSON(std::istream* in, Hypergraph* out); - - // if remove_rules is used, the hypergraph is serialized without rule information - // (so it only contains structure and feature information) - static bool WriteToJSON(const Hypergraph& hg, bool remove_rules, std::ostream* out); - - // serialization utils - static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0); - // return PLF string representation (undefined behavior on non-lattices) - static std::string AsPLF(const Hypergraph& hg, bool include_global_parentheses = true); - static void PLFtoLattice(const std::string& plf, Lattice* pl); - static std::string Escape(const std::string& s); // PLF helper -}; - -namespace B64 { - bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); - void b64encode(const char* data, const size_t size, std::ostream* out); -} - -#endif diff --git a/src/hg_test.cc b/src/hg_test.cc deleted file mode 100644 index ecd97508..00000000 --- a/src/hg_test.cc +++ /dev/null @@ -1,441 +0,0 @@ -#include -#include -#include -#include -#include -#include 
"tdict.h" - -#include "json_parse.h" -#include "filelib.h" -#include "hg.h" -#include "hg_io.h" -#include "hg_intersect.h" -#include "viterbi.h" -#include "kbest.h" -#include "inside_outside.h" - -using namespace std; - -class HGTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } - void CreateHG(Hypergraph* hg) const; - void CreateHG_int(Hypergraph* hg) const; - void CreateHG_tiny(Hypergraph* hg) const; - void CreateHGBalanced(Hypergraph* hg) const; - void CreateLatticeHG(Hypergraph* hg) const; - void CreateTinyLatticeHG(Hypergraph* hg) const; -}; - -void HGTest::CreateTinyLatticeHG(Hypergraph* hg) const { - const string json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] b\",4,\"[X] ||| [1] B'\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.2],\"rule\":1},{\"tail\":[0],\"feats\":[0,-0.6],\"rule\":2}],\"node\":{\"in_edges\":[0,1]},\"edges\":[{\"tail\":[1],\"feats\":[0,-0.1],\"rule\":3},{\"tail\":[1],\"feats\":[0,-0.9],\"rule\":4}],\"node\":{\"in_edges\":[2,3]}}"; - istringstream instr(json); - EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); -} - -void HGTest::CreateLatticeHG(Hypergraph* hg) const { - const string json = "{\"rules\":[1,\"[X] ||| [1] a\",2,\"[X] ||| [1] A\",3,\"[X] ||| [1] A A\",4,\"[X] ||| [1] b\",5,\"[X] ||| [1] c\",6,\"[X] ||| [1] B C\",7,\"[X] ||| [1] A B C\",8,\"[X] ||| [1] 
CC\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[],\"node\":{\"in_edges\":[]},\"edges\":[{\"tail\":[0],\"feats\":[2,-0.3],\"rule\":1},{\"tail\":[0],\"feats\":[2,-0.6],\"rule\":2},{\"tail\":[0],\"feats\":[2,-1.7],\"rule\":3}],\"node\":{\"in_edges\":[0,1,2]},\"edges\":[{\"tail\":[1],\"feats\":[2,-0.5],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[2],\"feats\":[2,-0.6],\"rule\":5},{\"tail\":[1],\"feats\":[2,-0.8],\"rule\":6},{\"tail\":[0],\"feats\":[2,-0.01],\"rule\":7},{\"tail\":[2],\"feats\":[2,-0.8],\"rule\":8}],\"node\":{\"in_edges\":[4,5,6,7]}}"; - istringstream instr(json); - EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); -} - -void HGTest::CreateHG_tiny(Hypergraph* hg) const { - const string json = "{\"rules\":[1,\"[X] ||| \",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}"; - istringstream instr(json); - EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); -} - -void HGTest::CreateHG_int(Hypergraph* hg) const { - const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] 
b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}"; - istringstream instr(json); - EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); -} - -void HGTest::CreateHG(Hypergraph* hg) const { - string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; - istringstream instr(json); - EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); -} - -void HGTest::CreateHGBalanced(Hypergraph* hg) const { - const string json = "{\"rules\":[1,\"[X] ||| i\",2,\"[X] ||| a\",3,\"[X] ||| b\",4,\"[X] ||| [1] [2]\",5,\"[X] ||| [1] [2]\",6,\"[X] ||| c\",7,\"[X] ||| d\",8,\"[X] ||| [1] [2]\",9,\"[X] ||| [1] [2]\",10,\"[X] ||| [1] [2]\",11,\"[X] ||| [1] [2]\",12,\"[X] ||| [1] [2]\",13,\"[X] 
||| [1] [2]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[1,2],\"feats\":[],\"rule\":4},{\"tail\":[2,1],\"feats\":[],\"rule\":5}],\"node\":{\"in_edges\":[3,4]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":6}],\"node\":{\"in_edges\":[5]},\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":7}],\"node\":{\"in_edges\":[6]},\"edges\":[{\"tail\":[4,5],\"feats\":[],\"rule\":8},{\"tail\":[5,4],\"feats\":[],\"rule\":9}],\"node\":{\"in_edges\":[7,8]},\"edges\":[{\"tail\":[3,6],\"feats\":[],\"rule\":10},{\"tail\":[6,3],\"feats\":[],\"rule\":11}],\"node\":{\"in_edges\":[9,10]},\"edges\":[{\"tail\":[7,0],\"feats\":[],\"rule\":12},{\"tail\":[0,7],\"feats\":[],\"rule\":13}],\"node\":{\"in_edges\":[11,12]}}"; - istringstream instr(json); - EXPECT_TRUE(HypergraphIO::ReadFromJSON(&instr, hg)); -} - -TEST_F(HGTest,Controlled) { - Hypergraph hg; - CreateHG_tiny(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 0.8); - hg.Reweight(wts); - vector trans; - prob_t prob = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "prob: " << prob << "\n"; - EXPECT_FLOAT_EQ(-80.839996, log(prob)); - EXPECT_EQ("X ", TD::GetString(trans)); - vector post; - hg.PrintGraphviz(); - prob_t c2 = Inside(hg, NULL, ScaledEdgeProb(0.6)); - EXPECT_FLOAT_EQ(-47.8577, log(c2)); -} - -TEST_F(HGTest,Union) { - Hypergraph hg1; - Hypergraph hg2; - CreateHG_tiny(&hg1); - CreateHG(&hg2); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 1.0); - hg1.Reweight(wts); - hg2.Reweight(wts); - prob_t c1,c2,c3,c4; - vector t1,t2,t3,t4; - c1 = 
ViterbiESentence(hg1, &t1); - c2 = ViterbiESentence(hg2, &t2); - int l2 = ViterbiPathLength(hg2); - cerr << c1 << "\t" << TD::GetString(t1) << endl; - cerr << c2 << "\t" << TD::GetString(t2) << endl; - hg1.Union(hg2); - hg1.Reweight(wts); - c3 = ViterbiESentence(hg1, &t3); - int l3 = ViterbiPathLength(hg1); - cerr << c3 << "\t" << TD::GetString(t3) << endl; - EXPECT_FLOAT_EQ(c2, c3); - EXPECT_EQ(TD::GetString(t2), TD::GetString(t3)); - EXPECT_EQ(l2, l3); - - wts.set_value(FD::Convert("f2"), -1); - hg1.Reweight(wts); - c4 = ViterbiESentence(hg1, &t4); - cerr << c4 << "\t" << TD::GetString(t4) << endl; - EXPECT_EQ("Z ", TD::GetString(t4)); - EXPECT_FLOAT_EQ(98.82, log(c4)); - - vector, prob_t> > list; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg1, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg1.nodes_.size() - 1, i); - if (!d) break; - list.push_back(make_pair(d->yield, d->score)); - } - EXPECT_TRUE(list[0].first == t4); - EXPECT_FLOAT_EQ(log(list[0].second), log(c4)); - EXPECT_EQ(list.size(), 6); - EXPECT_FLOAT_EQ(log(list.back().second / list.front().second), -97.7); -} - -TEST_F(HGTest,ControlledKBest) { - Hypergraph hg; - CreateHG(&hg); - vector w(2); w[0]=0.4; w[1]=0.8; - hg.Reweight(w); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - - int best = 0; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << TD::GetString(d->yield) << endl; - ++best; - } - EXPECT_EQ(4, best); -} - - -TEST_F(HGTest,InsideScore) { - SparseVector wts; - wts.set_value(FD::Convert("f1"), 1.0); - Hypergraph hg; - CreateTinyLatticeHG(&hg); - hg.Reweight(wts); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr 
<< TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - hg.PrintGraphviz(); - prob_t inside = Inside(hg); - EXPECT_FLOAT_EQ(1.7934048, inside); // computed by hand - vector post; - inside = hg.ComputeBestPathThroughEdges(&post); - EXPECT_FLOAT_EQ(-0.3, log(inside)); // computed by hand - EXPECT_EQ(post.size(), 4); - for (int i = 0; i < 4; ++i) { - cerr << "edge post: " << log(post[i]) << '\t' << hg.edges_[i].rule_->AsString() << endl; - } -} - - -TEST_F(HGTest,PruneInsideOutside) { - SparseVector wts; - wts.set_value(FD::Convert("Feature_1"), 1.0); - Hypergraph hg; - CreateLatticeHG(&hg); - hg.Reweight(wts); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - hg.PrintGraphviz(); - //hg.DensityPruneInsideOutside(0.5, false, 2.0); - hg.BeamPruneInsideOutside(0.5, false, 0.5); - cost = ViterbiESentence(hg, &trans); - cerr << "Ncst: " << cost << endl; - cerr << TD::GetString(trans) << "\n"; - hg.PrintGraphviz(); -} - -TEST_F(HGTest,TestPruneEdges) { - Hypergraph hg; - CreateLatticeHG(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 1.0); - hg.Reweight(wts); - hg.PrintGraphviz(); - vector prune(hg.edges_.size(), true); - prune[6] = false; - hg.PruneEdges(prune); - cerr << "Pruned:\n"; - hg.PrintGraphviz(); -} - -TEST_F(HGTest,TestIntersect) { - Hypergraph hg; - CreateHG_int(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 1.0); - hg.Reweight(wts); - hg.PrintGraphviz(); - - int best = 0; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << TD::GetString(d->yield) << endl; - ++best; - } - EXPECT_EQ(4, best); - - Lattice target(2); - target[0].push_back(LatticeArc(TD::Convert("a"), 0.0, 1)); - target[1].push_back(LatticeArc(TD::Convert("b"), 0.0, 1)); - 
HG::Intersect(target, &hg); - hg.PrintGraphviz(); -} - -TEST_F(HGTest,TestPrune2) { - Hypergraph hg; - CreateHG_int(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 1.0); - hg.Reweight(wts); - hg.PrintGraphviz(); - vector rem(hg.edges_.size(), false); - rem[0] = true; - rem[1] = true; - hg.PruneEdges(rem); - hg.PrintGraphviz(); - cerr << "TODO: fix this pruning behavior-- the resulting HG should be empty!\n"; -} - -TEST_F(HGTest,Sample) { - Hypergraph hg; - CreateLatticeHG(&hg); - SparseVector wts; - wts.set_value(FD::Convert("Feature_1"), 0.0); - hg.Reweight(wts); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - hg.PrintGraphviz(); -} - -TEST_F(HGTest,PLF) { - Hypergraph hg; - string inplf = "((('haupt',-2.06655,1),('hauptgrund',-5.71033,2),),(('grund',-1.78709,1),),(('für\\'',0.1,1),),)"; - HypergraphIO::ReadFromPLF(inplf, &hg); - SparseVector wts; - wts.set_value(FD::Convert("Feature_0"), 1.0); - hg.Reweight(wts); - hg.PrintGraphviz(); - string outplf = HypergraphIO::AsPLF(hg); - cerr << " IN: " << inplf << endl; - cerr << "OUT: " << outplf << endl; - assert(inplf == outplf); -} - -TEST_F(HGTest,PushWeightsToGoal) { - Hypergraph hg; - CreateHG(&hg); - vector w(2); w[0]=0.4; w[1]=0.8; - hg.Reweight(w); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - hg.PrintGraphviz(); - hg.PushWeightsToGoal(); - hg.PrintGraphviz(); -} - -TEST_F(HGTest,TestSpecialKBest) { - Hypergraph hg; - CreateHGBalanced(&hg); - vector w(1); w[0]=0; - hg.Reweight(w); - vector, prob_t> > list; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 100000); - for (int i = 0; i < 100000; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << TD::GetString(d->yield) << endl; - } - hg.PrintGraphviz(); -} - 
-TEST_F(HGTest, TestGenericViterbi) { - Hypergraph hg; - CreateHG_tiny(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 0.8); - hg.Reweight(wts); - vector trans; - const prob_t prob = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "prob: " << prob << "\n"; - EXPECT_FLOAT_EQ(-80.839996, log(prob)); - EXPECT_EQ("X ", TD::GetString(trans)); -} - -TEST_F(HGTest, TestGenericInside) { - Hypergraph hg; - CreateTinyLatticeHG(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 1.0); - hg.Reweight(wts); - vector inside; - prob_t ins = Inside(hg, &inside); - EXPECT_FLOAT_EQ(1.7934048, ins); // computed by hand - vector outside; - Outside(hg, inside, &outside); - EXPECT_EQ(3, outside.size()); - EXPECT_FLOAT_EQ(1.7934048, outside[0]); - EXPECT_FLOAT_EQ(1.3114071, outside[1]); - EXPECT_FLOAT_EQ(1.0, outside[2]); -} - -TEST_F(HGTest,TestGenericInside2) { - Hypergraph hg; - CreateHG(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 0.8); - hg.Reweight(wts); - vector inside, outside; - prob_t ins = Inside(hg, &inside); - Outside(hg, inside, &outside); - for (int i = 0; i < hg.nodes_.size(); ++i) - cerr << i << "\t" << log(inside[i]) << "\t" << log(outside[i]) << endl; - EXPECT_FLOAT_EQ(0, log(inside[0])); - EXPECT_FLOAT_EQ(-1.7861683, log(outside[0])); - EXPECT_FLOAT_EQ(-0.4, log(inside[1])); - EXPECT_FLOAT_EQ(-1.3861683, log(outside[1])); - EXPECT_FLOAT_EQ(-0.8, log(inside[2])); - EXPECT_FLOAT_EQ(-0.986168, log(outside[2])); - EXPECT_FLOAT_EQ(-0.96, log(inside[3])); - EXPECT_FLOAT_EQ(-0.8261683, log(outside[3])); - EXPECT_FLOAT_EQ(-1.562512, log(inside[4])); - EXPECT_FLOAT_EQ(-0.22365622, log(outside[4])); - EXPECT_FLOAT_EQ(-1.7861683, log(inside[5])); - EXPECT_FLOAT_EQ(0, log(outside[5])); -} - -TEST_F(HGTest,TestAddExpectations) { - Hypergraph hg; - CreateHG(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - 
wts.set_value(FD::Convert("f2"), 0.8); - hg.Reweight(wts); - SparseVector feat_exps; - InsideOutside, EdgeFeaturesWeightFunction>(hg, &feat_exps); - EXPECT_FLOAT_EQ(-2.5439765, feat_exps[FD::Convert("f1")]); - EXPECT_FLOAT_EQ(-2.6357865, feat_exps[FD::Convert("f2")]); - cerr << feat_exps << endl; - SparseVector posts; - InsideOutside, TransitionEventWeightFunction>(hg, &posts); -} - -TEST_F(HGTest, Small) { - ReadFile rf("test_data/small.json.gz"); - Hypergraph hg; - assert(HypergraphIO::ReadFromJSON(rf.stream(), &hg)); - SparseVector wts; - wts.set_value(FD::Convert("Model_0"), -2.0); - wts.set_value(FD::Convert("Model_1"), -0.5); - wts.set_value(FD::Convert("Model_2"), -1.1); - wts.set_value(FD::Convert("Model_3"), -1.0); - wts.set_value(FD::Convert("Model_4"), -1.0); - wts.set_value(FD::Convert("Model_5"), 0.5); - wts.set_value(FD::Convert("Model_6"), 0.2); - wts.set_value(FD::Convert("Model_7"), -3.0); - hg.Reweight(wts); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - vector post; - prob_t c2 = Inside(hg, NULL, ScaledEdgeProb(0.6)); - EXPECT_FLOAT_EQ(2.1431036, log(c2)); -} - -TEST_F(HGTest, JSONTest) { - ostringstream os; - JSONParser::WriteEscapedString("\"I don't know\", she said.", &os); - EXPECT_EQ("\"\\\"I don't know\\\", she said.\"", os.str()); - ostringstream os2; - JSONParser::WriteEscapedString("yes", &os2); - EXPECT_EQ("\"yes\"", os2.str()); -} - -TEST_F(HGTest, TestGenericKBest) { - Hypergraph hg; - CreateHG(&hg); - //CreateHGBalanced(&hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 1.0); - hg.Reweight(wts); - vector trans; - prob_t cost = ViterbiESentence(hg, &trans); - cerr << TD::GetString(trans) << "\n"; - cerr << "cost: " << cost << "\n"; - - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 1000); - for (int i = 0; i < 1000; ++i) { - const KBest::KBestDerivations, 
ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << TD::GetString(d->yield) << " F:" << d->feature_values << endl; - } -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/inside_outside.h b/src/inside_outside.h deleted file mode 100644 index 9114c9d7..00000000 --- a/src/inside_outside.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef _INSIDE_H_ -#define _INSIDE_H_ - -#include -#include -#include "hg.h" - -// run the inside algorithm and return the inside score -// if result is non-NULL, result will contain the inside -// score for each node -// NOTE: WeightType(0) must construct the semiring's additive identity -// WeightType(1) must construct the semiring's multiplicative identity -template -WeightType Inside(const Hypergraph& hg, - std::vector* result = NULL, - const WeightFunction& weight = WeightFunction()) { - const int num_nodes = hg.nodes_.size(); - std::vector dummy; - std::vector& inside_score = result ? 
*result : dummy; - inside_score.resize(num_nodes); - std::fill(inside_score.begin(), inside_score.end(), WeightType()); - for (int i = 0; i < num_nodes; ++i) { - const Hypergraph::Node& cur_node = hg.nodes_[i]; - WeightType* const cur_node_inside_score = &inside_score[i]; - const int num_in_edges = cur_node.in_edges_.size(); - if (num_in_edges == 0) { - *cur_node_inside_score = WeightType(1); - continue; - } - for (int j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; - WeightType score = weight(edge); - for (int k = 0; k < edge.tail_nodes_.size(); ++k) { - const int tail_node_index = edge.tail_nodes_[k]; - score *= inside_score[tail_node_index]; - } - *cur_node_inside_score += score; - } - } - return inside_score.back(); -} - -template -void Outside(const Hypergraph& hg, - std::vector& inside_score, - std::vector* result, - const WeightFunction& weight = WeightFunction()) { - assert(result); - const int num_nodes = hg.nodes_.size(); - assert(inside_score.size() == num_nodes); - std::vector& outside_score = *result; - outside_score.resize(num_nodes); - std::fill(outside_score.begin(), outside_score.end(), WeightType(0)); - outside_score.back() = WeightType(1); - for (int i = num_nodes - 1; i >= 0; --i) { - const Hypergraph::Node& cur_node = hg.nodes_[i]; - const WeightType& head_node_outside_score = outside_score[i]; - const int num_in_edges = cur_node.in_edges_.size(); - for (int j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; - const WeightType head_and_edge_weight = weight(edge) * head_node_outside_score; - const int num_tail_nodes = edge.tail_nodes_.size(); - for (int k = 0; k < num_tail_nodes; ++k) { - const int update_tail_node_index = edge.tail_nodes_[k]; - WeightType* const tail_outside_score = &outside_score[update_tail_node_index]; - WeightType inside_contribution = WeightType(1); - for (int l = 0; l < num_tail_nodes; ++l) { - const int 
other_tail_node_index = edge.tail_nodes_[l]; - if (update_tail_node_index != other_tail_node_index) - inside_contribution *= inside_score[other_tail_node_index]; - } - *tail_outside_score += head_and_edge_weight * inside_contribution; - } - } - } -} - -// this is the Inside-Outside optimization described in Li et al. (EMNLP 2009) -// for computing the inside algorithm over expensive semirings -// (such as expectations over features). See Figure 4. It is slightly different -// in that x/k is returned not (k,x) -// NOTE: RType * PType must be valid (and yield RType) -template -PType InsideOutside(const Hypergraph& hg, - RType* result_x, - const WeightFunction& weight1 = WeightFunction(), - const WeightFunction2& weight2 = WeightFunction2()) { - const int num_nodes = hg.nodes_.size(); - std::vector inside, outside; - const PType z = Inside(hg, &inside, weight1); - Outside(hg, inside, &outside, weight1); - RType& x = *result_x; - x = RType(); - for (int i = 0; i < num_nodes; ++i) { - const Hypergraph::Node& cur_node = hg.nodes_[i]; - const int num_in_edges = cur_node.in_edges_.size(); - for (int j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; - PType prob = outside[i]; - prob *= weight1(edge); - const int num_tail_nodes = edge.tail_nodes_.size(); - for (int k = 0; k < num_tail_nodes; ++k) - prob *= inside[edge.tail_nodes_[k]]; - prob /= z; - x += weight2(edge) * prob; - } - } - return z; -} - -#endif diff --git a/src/json_parse.cc b/src/json_parse.cc deleted file mode 100644 index f6fdfea8..00000000 --- a/src/json_parse.cc +++ /dev/null @@ -1,50 +0,0 @@ -#include "json_parse.h" - -#include -#include - -using namespace std; - -static const char *json_hex_chars = "0123456789abcdef"; - -void JSONParser::WriteEscapedString(const string& in, ostream* out) { - int pos = 0; - int start_offset = 0; - unsigned char c = 0; - (*out) << '"'; - while(pos < in.size()) { - c = in[pos]; - switch(c) { - case '\b': - case '\n': - case 
'\r': - case '\t': - case '"': - case '\\': - case '/': - if(pos - start_offset > 0) - (*out) << in.substr(start_offset, pos - start_offset); - if(c == '\b') (*out) << "\\b"; - else if(c == '\n') (*out) << "\\n"; - else if(c == '\r') (*out) << "\\r"; - else if(c == '\t') (*out) << "\\t"; - else if(c == '"') (*out) << "\\\""; - else if(c == '\\') (*out) << "\\\\"; - else if(c == '/') (*out) << "\\/"; - start_offset = ++pos; - break; - default: - if(c < ' ') { - cerr << "Warning, bad character (" << static_cast(c) << ") in string\n"; - if(pos - start_offset > 0) - (*out) << in.substr(start_offset, pos - start_offset); - (*out) << "\\u00" << json_hex_chars[c >> 4] << json_hex_chars[c & 0xf]; - start_offset = ++pos; - } else pos++; - } - } - if(pos - start_offset > 0) - (*out) << in.substr(start_offset, pos - start_offset); - (*out) << '"'; -} - diff --git a/src/json_parse.h b/src/json_parse.h deleted file mode 100644 index c3cba954..00000000 --- a/src/json_parse.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _JSON_WRAPPER_H_ -#define _JSON_WRAPPER_H_ - -#include -#include -#include "JSON_parser.h" - -class JSONParser { - public: - JSONParser() { - init_JSON_config(&config); - hack.mf = &JSONParser::Callback; - config.depth = 10; - config.callback_ctx = reinterpret_cast(this); - config.callback = hack.cb; - config.allow_comments = 1; - config.handle_floats_manually = 1; - jc = new_JSON_parser(&config); - } - virtual ~JSONParser() { - delete_JSON_parser(jc); - } - bool Parse(std::istream* in) { - int count = 0; - int lc = 1; - for (; in ; ++count) { - int next_char = in->get(); - if (!in->good()) break; - if (lc == '\n') { ++lc; } - if (!JSON_parser_char(jc, next_char)) { - std::cerr << "JSON_parser_char: syntax error, line " << lc << " (byte " << count << ")" << std::endl; - return false; - } - } - if (!JSON_parser_done(jc)) { - std::cerr << "JSON_parser_done: syntax error\n"; - return false; - } - return true; - } - static void WriteEscapedString(const std::string& in, 
std::ostream* out); - protected: - virtual bool HandleJSONEvent(int type, const JSON_value* value) = 0; - private: - int Callback(int type, const JSON_value* value) { - if (HandleJSONEvent(type, value)) return 1; - return 0; - } - JSON_parser_struct* jc; - JSON_config config; - typedef int (JSONParser::* MF)(int type, const struct JSON_value_struct* value); - union CBHack { - JSON_parser_callback cb; - MF mf; - } hack; -}; - -#endif diff --git a/src/kbest.h b/src/kbest.h deleted file mode 100644 index cd9b6c2b..00000000 --- a/src/kbest.h +++ /dev/null @@ -1,207 +0,0 @@ -#ifndef _HG_KBEST_H_ -#define _HG_KBEST_H_ - -#include -#include -#include - -#include - -#include "wordid.h" -#include "hg.h" - -namespace KBest { - // default, don't filter any derivations from the k-best list - struct NoFilter { - bool operator()(const std::vector& yield) { - (void) yield; - return false; - } - }; - - // optional, filter unique yield strings - struct FilterUnique { - std::tr1::unordered_set, boost::hash > > unique; - - bool operator()(const std::vector& yield) { - return !unique.insert(yield).second; - } - }; - - // utility class to lazily create the k-best derivations from a forest, uses - // the lazy k-best algorithm (Algorithm 3) from Huang and Chiang (IWPT 2005) - template - struct KBestDerivations { - KBestDerivations(const Hypergraph& hg, - const size_t k, - const Traversal& tf = Traversal(), - const WeightFunction& wf = WeightFunction()) : - traverse(tf), w(wf), g(hg), nds(g.nodes_.size()), k_prime(k) {} - - ~KBestDerivations() { - for (int i = 0; i < freelist.size(); ++i) - delete freelist[i]; - } - - struct Derivation { - Derivation(const Hypergraph::Edge& e, - const SmallVector& jv, - const WeightType& w, - const SparseVector& f) : - edge(&e), - j(jv), - score(w), - feature_values(f) {} - - // dummy constructor, just for query - Derivation(const Hypergraph::Edge& e, - const SmallVector& jv) : edge(&e), j(jv) {} - - T yield; - const Hypergraph::Edge* const edge; - const 
SmallVector j; - const WeightType score; - const SparseVector feature_values; - }; - struct HeapCompare { - bool operator()(const Derivation* a, const Derivation* b) const { - return a->score < b->score; - } - }; - struct DerivationCompare { - bool operator()(const Derivation* a, const Derivation* b) const { - return a->score > b->score; - } - }; - struct DerivationUniquenessHash { - size_t operator()(const Derivation* d) const { - size_t x = 5381; - x = ((x << 5) + x) ^ d->edge->id_; - for (int i = 0; i < d->j.size(); ++i) - x = ((x << 5) + x) ^ d->j[i]; - return x; - } - }; - struct DerivationUniquenessEquals { - bool operator()(const Derivation* a, const Derivation* b) const { - return (a->edge == b->edge) && (a->j == b->j); - } - }; - typedef std::vector CandidateHeap; - typedef std::vector DerivationList; - typedef std::tr1::unordered_set< - const Derivation*, DerivationUniquenessHash, DerivationUniquenessEquals> UniqueDerivationSet; - - struct NodeDerivationState { - CandidateHeap cand; - DerivationList D; - DerivationFilter filter; - UniqueDerivationSet ds; - explicit NodeDerivationState(const DerivationFilter& f = DerivationFilter()) : filter(f) {} - }; - - Derivation* LazyKthBest(int v, int k) { - NodeDerivationState& s = GetCandidates(v); - CandidateHeap& cand = s.cand; - DerivationList& D = s.D; - DerivationFilter& filter = s.filter; - bool add_next = true; - while (D.size() <= k) { - if (add_next && D.size() > 0) { - const Derivation* d = D.back(); - LazyNext(d, &cand, &s.ds); - } - add_next = false; - - if (cand.size() > 0) { - std::pop_heap(cand.begin(), cand.end(), HeapCompare()); - Derivation* d = cand.back(); - cand.pop_back(); - std::vector ants(d->edge->Arity()); - for (int j = 0; j < ants.size(); ++j) - ants[j] = &LazyKthBest(d->edge->tail_nodes_[j], d->j[j])->yield; - traverse(*d->edge, ants, &d->yield); - if (!filter(d->yield)) { - D.push_back(d); - add_next = true; - } - } else { - break; - } - } - if (k < D.size()) return D[k]; else return 
NULL; - } - - private: - // creates a derivation object with all fields set but the yield - // the yield is computed in LazyKthBest before the derivation is added to D - // returns NULL if j refers to derivation numbers larger than the - // antecedent structure define - Derivation* CreateDerivation(const Hypergraph::Edge& e, const SmallVector& j) { - WeightType score = w(e); - SparseVector feats = e.feature_values_; - for (int i = 0; i < e.Arity(); ++i) { - const Derivation* ant = LazyKthBest(e.tail_nodes_[i], j[i]); - if (!ant) { return NULL; } - score *= ant->score; - feats += ant->feature_values; - } - freelist.push_back(new Derivation(e, j, score, feats)); - return freelist.back(); - } - - NodeDerivationState& GetCandidates(int v) { - NodeDerivationState& s = nds[v]; - if (!s.D.empty() || !s.cand.empty()) return s; - - const Hypergraph::Node& node = g.nodes_[v]; - for (int i = 0; i < node.in_edges_.size(); ++i) { - const Hypergraph::Edge& edge = g.edges_[node.in_edges_[i]]; - SmallVector jv(edge.Arity(), 0); - Derivation* d = CreateDerivation(edge, jv); - assert(d); - s.cand.push_back(d); - } - - const int effective_k = std::min(k_prime, s.cand.size()); - const typename CandidateHeap::iterator kth = s.cand.begin() + effective_k; - std::nth_element(s.cand.begin(), kth, s.cand.end(), DerivationCompare()); - s.cand.resize(effective_k); - std::make_heap(s.cand.begin(), s.cand.end(), HeapCompare()); - - return s; - } - - void LazyNext(const Derivation* d, CandidateHeap* cand, UniqueDerivationSet* ds) { - for (int i = 0; i < d->j.size(); ++i) { - SmallVector j = d->j; - ++j[i]; - const Derivation* ant = LazyKthBest(d->edge->tail_nodes_[i], j[i]); - if (ant) { - Derivation query_unique(*d->edge, j); - if (ds->count(&query_unique) == 0) { - Derivation* new_d = CreateDerivation(*d->edge, j); - if (new_d) { - cand->push_back(new_d); - std::push_heap(cand->begin(), cand->end(), HeapCompare()); - assert(ds->insert(new_d).second); // insert into uniqueness set, sanity check 
- } - } - } - } - } - - const Traversal traverse; - const WeightFunction w; - const Hypergraph& g; - std::vector nds; - std::vector freelist; - const size_t k_prime; - }; -} - -#endif diff --git a/src/lattice.cc b/src/lattice.cc deleted file mode 100644 index 56bc9551..00000000 --- a/src/lattice.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include "lattice.h" - -#include "tdict.h" -#include "hg_io.h" - -using namespace std; - -static const int kUNREACHABLE = 99999999; - -void Lattice::ComputeDistances() { - const int n = this->size() + 1; - dist_.resize(n, n, kUNREACHABLE); - for (int i = 0; i < this->size(); ++i) { - const vector& alts = (*this)[i]; - for (int j = 0; j < alts.size(); ++j) - dist_(i, i + alts[j].dist2next) = 1; - } - for (int k = 0; k < n; ++k) { - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - const int dp = dist_(i,k) + dist_(k,j); - if (dist_(i,j) > dp) - dist_(i,j) = dp; - } - } - } - - for (int i = 0; i < n; ++i) { - int latest = kUNREACHABLE; - for (int j = n-1; j >= 0; --j) { - const int c = dist_(i,j); - if (c < kUNREACHABLE) - latest = c; - else - dist_(i,j) = latest; - } - } - // cerr << dist_ << endl; -} - -bool LatticeTools::LooksLikePLF(const string &line) { - return (line.size() > 5) && (line.substr(0,4) == "((('"); -} - -void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) { - Lattice& l = *pl; - vector ids; - TD::ConvertSentence(text, &ids); - l.resize(ids.size()); - for (int i = 0; i < l.size(); ++i) - l[i].push_back(LatticeArc(ids[i], 0.0, 1)); -} - -void LatticeTools::ConvertTextOrPLF(const string& text_or_plf, Lattice* pl) { - if (LooksLikePLF(text_or_plf)) - HypergraphIO::PLFtoLattice(text_or_plf, pl); - else - ConvertTextToLattice(text_or_plf, pl); - pl->ComputeDistances(); -} - diff --git a/src/lattice.h b/src/lattice.h deleted file mode 100644 index 71589b92..00000000 --- a/src/lattice.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef __LATTICE_H_ -#define __LATTICE_H_ - -#include -#include -#include 
"wordid.h" -#include "array2d.h" - -class Lattice; -struct LatticeTools { - static bool LooksLikePLF(const std::string &line); - static void ConvertTextToLattice(const std::string& text, Lattice* pl); - static void ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl); -}; - -struct LatticeArc { - WordID label; - double cost; - int dist2next; - LatticeArc() : label(), cost(), dist2next() {} - LatticeArc(WordID w, double c, int i) : label(w), cost(c), dist2next(i) {} -}; - -class Lattice : public std::vector > { - friend void LatticeTools::ConvertTextOrPLF(const std::string& text_or_plf, Lattice* pl); - public: - Lattice() {} - explicit Lattice(size_t t, const std::vector& v = std::vector()) : - std::vector >(t, v) {} - int Distance(int from, int to) const { - if (dist_.empty()) - return (to - from); - return dist_(from, to); - } - - private: - void ComputeDistances(); - Array2D dist_; -}; - -#endif diff --git a/src/lexcrf.cc b/src/lexcrf.cc deleted file mode 100644 index 33455a3d..00000000 --- a/src/lexcrf.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include "lexcrf.h" - -#include - -#include "filelib.h" -#include "hg.h" -#include "tdict.h" -#include "grammar.h" -#include "sentence_metadata.h" - -using namespace std; - -struct LexicalCRFImpl { - LexicalCRFImpl(const boost::program_options::variables_map& conf) : - use_null(false), - kXCAT(TD::Convert("X")*-1), - kNULL(TD::Convert("")), - kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")), - kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) { - vector gfiles = conf["grammar"].as >(); - assert(gfiles.size() == 1); - ReadFile rf(gfiles.front()); - TextGrammar *tg = new TextGrammar; - grammar.reset(tg); - istream* in = rf.stream(); - int lc = 0; - bool flag = false; - while(*in) { - string line; - getline(*in, line); - if (line.empty()) continue; - ++lc; - TRulePtr r(TRule::CreateRulePhrasetable(line)); - tg->AddRule(r); - if (lc % 50000 == 0) { cerr << '.'; flag = true; } - if (lc % 2000000 == 0) { cerr << " [" << 
lc << "]\n"; flag = false; } - } - if (flag) cerr << endl; - cerr << "Loaded " << lc << " rules\n"; - } - - void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) { - const int e_len = smeta.GetTargetLength(); - assert(e_len > 0); - const int f_len = lattice.size(); - // hack to tell the feature function system how big the sentence pair is - const int f_start = (use_null ? -1 : 0); - int prev_node_id = -1; - for (int i = 0; i < e_len; ++i) { // for each word in the *ref* - Hypergraph::Node* node = forest->AddNode(kXCAT); - const int new_node_id = node->id_; - for (int j = f_start; j < f_len; ++j) { // for each word in the source - const WordID src_sym = (j < 0 ? kNULL : lattice[j][0].label); - const GrammarIter* gi = grammar->GetRoot()->Extend(src_sym); - if (!gi) { - cerr << "No translations found for: " << TD::Convert(src_sym) << "\n"; - abort(); - } - const RuleBin* rb = gi->GetRules(); - assert(rb); - for (int k = 0; k < rb->GetNumRules(); ++k) { - TRulePtr rule = rb->GetIthRule(k); - Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); - edge->i_ = j; - edge->j_ = j+1; - edge->prev_i_ = i; - edge->prev_j_ = i+1; - edge->feature_values_ += edge->rule_->GetFeatureValues(); - forest->ConnectEdgeToHeadNode(edge->id_, new_node_id); - } - } - if (prev_node_id >= 0) { - const int comb_node_id = forest->AddNode(kXCAT)->id_; - Hypergraph::TailNodeVector tail(2, prev_node_id); - tail[1] = new_node_id; - const int edge_id = forest->AddEdge(kBINARY, tail)->id_; - forest->ConnectEdgeToHeadNode(edge_id, comb_node_id); - prev_node_id = comb_node_id; - } else { - prev_node_id = new_node_id; - } - } - Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); - Hypergraph::Node* goal = forest->AddNode(TD::Convert("[Goal]")*-1); - Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); - forest->ConnectEdgeToHeadNode(hg_edge, goal); - } - - private: - const bool use_null; - const WordID kXCAT; - const 
WordID kNULL; - const TRulePtr kBINARY; - const TRulePtr kGOAL_RULE; - GrammarPtr grammar; -}; - -LexicalCRF::LexicalCRF(const boost::program_options::variables_map& conf) : - pimpl_(new LexicalCRFImpl(conf)) {} - -bool LexicalCRF::Translate(const string& input, - SentenceMetadata* smeta, - const vector& weights, - Hypergraph* forest) { - Lattice lattice; - LatticeTools::ConvertTextToLattice(input, &lattice); - smeta->SetSourceLength(lattice.size()); - pimpl_->BuildTrellis(lattice, *smeta, forest); - forest->Reweight(weights); - return true; -} - diff --git a/src/lexcrf.h b/src/lexcrf.h deleted file mode 100644 index 99362c81..00000000 --- a/src/lexcrf.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _LEXCRF_H_ -#define _LEXCRF_H_ - -#include "translator.h" -#include "lattice.h" - -struct LexicalCRFImpl; -struct LexicalCRF : public Translator { - LexicalCRF(const boost::program_options::variables_map& conf); - bool Translate(const std::string& input, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* forest); - private: - boost::shared_ptr pimpl_; -}; - -#endif diff --git a/src/logval.h b/src/logval.h deleted file mode 100644 index a8ca620c..00000000 --- a/src/logval.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef LOGVAL_H_ -#define LOGVAL_H_ - -#include -#include - -template -class LogVal { - public: - LogVal() : v_(-std::numeric_limits::infinity()) {} - explicit LogVal(double x) : v_(std::log(x)) {} - LogVal(const LogVal& o) : v_(o.v_) {} - static LogVal One() { return LogVal(1); } - static LogVal Zero() { return LogVal(); } - - void logeq(const T& v) { v_ = v; } - - LogVal& operator+=(const LogVal& a) { - if (a.v_ == -std::numeric_limits::infinity()) return *this; - if (a.v_ < v_) { - v_ = v_ + log1p(std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(std::exp(v_ - a.v_)); - } - return *this; - } - - LogVal& operator*=(const LogVal& a) { - v_ += a.v_; - return *this; - } - - LogVal& operator*=(const T& a) { - v_ += log(a); - return *this; - } - - 
LogVal& operator/=(const LogVal& a) { - v_ -= a.v_; - return *this; - } - - LogVal& poweq(const T& power) { - if (power == 0) v_ = 0; else v_ *= power; - return *this; - } - - LogVal pow(const T& power) const { - LogVal res = *this; - res.poweq(power); - return res; - } - - operator T() const { - return std::exp(v_); - } - - T v_; -}; - -template -LogVal operator+(const LogVal& o1, const LogVal& o2) { - LogVal res(o1); - res += o2; - return res; -} - -template -LogVal operator*(const LogVal& o1, const LogVal& o2) { - LogVal res(o1); - res *= o2; - return res; -} - -template -LogVal operator*(const LogVal& o1, const T& o2) { - LogVal res(o1); - res *= o2; - return res; -} - -template -LogVal operator*(const T& o1, const LogVal& o2) { - LogVal res(o2); - res *= o1; - return res; -} - -template -LogVal operator/(const LogVal& o1, const LogVal& o2) { - LogVal res(o1); - res /= o2; - return res; -} - -template -T log(const LogVal& o) { - return o.v_; -} - -template -LogVal pow(const LogVal& b, const T& e) { - return b.pow(e); -} - -template -bool operator<(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ < rhs.v_); -} - -template -bool operator<=(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ <= rhs.v_); -} - -template -bool operator>(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ > rhs.v_); -} - -template -bool operator>=(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ >= rhs.v_); -} - -template -bool operator==(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ == rhs.v_); -} - -template -bool operator!=(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ != rhs.v_); -} - -#endif diff --git a/src/maxtrans_blunsom.cc b/src/maxtrans_blunsom.cc deleted file mode 100644 index 4a6680e0..00000000 --- a/src/maxtrans_blunsom.cc +++ /dev/null @@ -1,287 +0,0 @@ -#include "apply_models.h" - -#include -#include -#include -#include - -#include -#include - -#include "tdict.h" -#include "hg.h" -#include "ff.h" - -using boost::tuple; 
-using namespace std; -using namespace std::tr1; - -namespace Hack { - -struct Candidate; -typedef SmallVector JVector; -typedef vector CandidateHeap; -typedef vector CandidateList; - -// life cycle: candidates are created, placed on the heap -// and retrieved by their estimated cost, when they're -// retrieved, they're incorporated into the +LM hypergraph -// where they also know the head node index they are -// attached to. After they are added to the +LM hypergraph -// inside_prob_ and est_prob_ fields may be updated as better -// derivations are found (this happens since the successor's -// of derivation d may have a better score- they are -// explored lazily). However, the updates don't happen -// when a candidate is in the heap so maintaining the heap -// property is not an issue. -struct Candidate { - int node_index_; // -1 until incorporated - // into the +LM forest - const Hypergraph::Edge* in_edge_; // in -LM forest - Hypergraph::Edge out_edge_; - vector state_; - const JVector j_; - prob_t inside_prob_; // these are fixed until the cand - // is popped, then they may be updated - prob_t est_prob_; - - Candidate(const Hypergraph::Edge& e, - const JVector& j, - const vector& D, - bool is_goal) : - node_index_(-1), - in_edge_(&e), - j_(j) { - InitializeCandidate(D, is_goal); - } - - // used to query uniqueness - Candidate(const Hypergraph::Edge& e, - const JVector& j) : in_edge_(&e), j_(j) {} - - bool IsIncorporatedIntoHypergraph() const { - return node_index_ >= 0; - } - - void InitializeCandidate(const vector >& D, - const bool is_goal) { - const Hypergraph::Edge& in_edge = *in_edge_; - out_edge_.rule_ = in_edge.rule_; - out_edge_.feature_values_ = in_edge.feature_values_; - Hypergraph::TailNodeVector& tail = out_edge_.tail_nodes_; - tail.resize(j_.size()); - prob_t p = prob_t::One(); - // cerr << "\nEstimating application of " << in_edge.rule_->AsString() << endl; - vector* > ants(tail.size()); - for (int i = 0; i < tail.size(); ++i) { - const Candidate& 
ant = *D[in_edge.tail_nodes_[i]][j_[i]]; - ants[i] = &ant.state_; - assert(ant.IsIncorporatedIntoHypergraph()); - tail[i] = ant.node_index_; - p *= ant.inside_prob_; - } - prob_t edge_estimate = prob_t::One(); - if (is_goal) { - assert(tail.size() == 1); - out_edge_.edge_prob_ = in_edge.edge_prob_; - } else { - in_edge.rule_->ESubstitute(ants, &state_); - out_edge_.edge_prob_ = in_edge.edge_prob_; - } - inside_prob_ = out_edge_.edge_prob_ * p; - est_prob_ = inside_prob_ * edge_estimate; - } -}; - -ostream& operator<<(ostream& os, const Candidate& cand) { - os << "CAND["; - if (!cand.IsIncorporatedIntoHypergraph()) { os << "PENDING "; } - else { os << "+LM_node=" << cand.node_index_; } - os << " edge=" << cand.in_edge_->id_; - os << " j=<"; - for (int i = 0; i < cand.j_.size(); ++i) - os << (i==0 ? "" : " ") << cand.j_[i]; - os << "> vit=" << log(cand.inside_prob_); - os << " est=" << log(cand.est_prob_); - return os << ']'; -} - -struct HeapCandCompare { - bool operator()(const Candidate* l, const Candidate* r) const { - return l->est_prob_ < r->est_prob_; - } -}; - -struct EstProbSorter { - bool operator()(const Candidate* l, const Candidate* r) const { - return l->est_prob_ > r->est_prob_; - } -}; - -// the same candidate can be added multiple times if -// j is multidimensional (if you're going NW in Manhattan, you -// can first go north, then west, or you can go west then north) -// this is a hash function on the relevant variables from -// Candidate to enforce this. 
-struct CandidateUniquenessHash { - size_t operator()(const Candidate* c) const { - size_t x = 5381; - x = ((x << 5) + x) ^ c->in_edge_->id_; - for (int i = 0; i < c->j_.size(); ++i) - x = ((x << 5) + x) ^ c->j_[i]; - return x; - } -}; - -struct CandidateUniquenessEquals { - bool operator()(const Candidate* a, const Candidate* b) const { - return (a->in_edge_ == b->in_edge_) && (a->j_ == b->j_); - } -}; - -typedef unordered_set UniqueCandidateSet; -typedef unordered_map, Candidate*, boost::hash > > State2Node; - -class MaxTransBeamSearch { - -public: - MaxTransBeamSearch(const Hypergraph& i, int pop_limit, Hypergraph* o) : - in(i), - out(*o), - D(in.nodes_.size()), - pop_limit_(pop_limit) { - cerr << " Finding max translation (cube pruning, pop_limit = " << pop_limit_ << ')' << endl; - } - - void Apply() { - int num_nodes = in.nodes_.size(); - int goal_id = num_nodes - 1; - int pregoal = goal_id - 1; - assert(in.nodes_[pregoal].out_edges_.size() == 1); - cerr << " "; - for (int i = 0; i < in.nodes_.size(); ++i) { - cerr << '.'; - KBest(i, i == goal_id); - } - cerr << endl; - int best_node = D[goal_id].front()->in_edge_->tail_nodes_.front(); - Candidate& best = *D[best_node].front(); - cerr << " Best path: " << log(best.inside_prob_) - << "\t" << log(best.est_prob_) << endl; - cout << TD::GetString(D[best_node].front()->state_) << endl; - FreeAll(); - } - - private: - void FreeAll() { - for (int i = 0; i < D.size(); ++i) { - CandidateList& D_i = D[i]; - for (int j = 0; j < D_i.size(); ++j) - delete D_i[j]; - } - D.clear(); - } - - void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { - Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_.rule_, item->out_edge_.tail_nodes_); - new_edge->feature_values_ = item->out_edge_.feature_values_; - new_edge->edge_prob_ = item->out_edge_.edge_prob_; - Candidate*& o_item = (*s2n)[item->state_]; - if (!o_item) o_item = item; - - int& node_id = o_item->node_index_; - if (node_id < 0) { - 
Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_, ""); - node_id = new_node->id_; - } - Hypergraph::Node* node = &out.nodes_[node_id]; - out.ConnectEdgeToHeadNode(new_edge, node); - - if (item != o_item) { - assert(o_item->state_ == item->state_); // sanity check! - o_item->est_prob_ += item->est_prob_; - o_item->inside_prob_ += item->inside_prob_; - freelist->push_back(item); - } - } - - void KBest(const int vert_index, const bool is_goal) { - // cerr << "KBest(" << vert_index << ")\n"; - CandidateList& D_v = D[vert_index]; - assert(D_v.empty()); - const Hypergraph::Node& v = in.nodes_[vert_index]; - // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; - const vector& in_edges = v.in_edges_; - CandidateHeap cand; - CandidateList freelist; - cand.reserve(in_edges.size()); - UniqueCandidateSet unique_cands; - for (int i = 0; i < in_edges.size(); ++i) { - const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; - const JVector j(edge.tail_nodes_.size(), 0); - cand.push_back(new Candidate(edge, j, D, is_goal)); - assert(unique_cands.insert(cand.back()).second); // these should all be unique! 
- } -// cerr << " making heap of " << cand.size() << " candidates\n"; - make_heap(cand.begin(), cand.end(), HeapCandCompare()); - State2Node state2node; // "buf" in Figure 2 - int pops = 0; - while(!cand.empty() && pops < pop_limit_) { - pop_heap(cand.begin(), cand.end(), HeapCandCompare()); - Candidate* item = cand.back(); - cand.pop_back(); - // cerr << "POPPED: " << *item << endl; - PushSucc(*item, is_goal, &cand, &unique_cands); - IncorporateIntoPlusLMForest(item, &state2node, &freelist); - ++pops; - } - D_v.resize(state2node.size()); - int c = 0; - for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i) - D_v[c++] = i->second; - sort(D_v.begin(), D_v.end(), EstProbSorter()); - // cerr << " expanded to " << D_v.size() << " nodes\n"; - - for (int i = 0; i < cand.size(); ++i) - delete cand[i]; - // freelist is necessary since even after an item merged, it still stays in - // the unique set so it can't be deleted til now - for (int i = 0; i < freelist.size(); ++i) - delete freelist[i]; - } - - void PushSucc(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* cs) { - CandidateHeap& cand = *pcand; - for (int i = 0; i < item.j_.size(); ++i) { - JVector j = item.j_; - ++j[i]; - if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { - Candidate query_unique(*item.in_edge_, j); - if (cs->count(&query_unique) == 0) { - Candidate* new_cand = new Candidate(*item.in_edge_, j, D, is_goal); - cand.push_back(new_cand); - push_heap(cand.begin(), cand.end(), HeapCandCompare()); - assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check - } - } - } - } - - const Hypergraph& in; - Hypergraph& out; - - vector D; // maps nodes in in-HG to the - // equivalent nodes (many due to state - // splits) in the out-HG. 
- const int pop_limit_; -}; - -// each node in the graph has one of these, it keeps track of -void MaxTrans(const Hypergraph& in, - int beam_size) { - Hypergraph out; - MaxTransBeamSearch ma(in, beam_size, &out); - ma.Apply(); -} - -} diff --git a/src/parser_test.cc b/src/parser_test.cc deleted file mode 100644 index da1fbd89..00000000 --- a/src/parser_test.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include -#include -#include -#include -#include -#include "hg.h" -#include "trule.h" -#include "bottom_up_parser.h" -#include "tdict.h" - -using namespace std; - -class ChartTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -TEST_F(ChartTest,LanguageModel) { - LatticeArc a(TD::Convert("ein"), 0.0, 1); - LatticeArc b(TD::Convert("haus"), 0.0, 1); - Lattice lattice(2); - lattice[0].push_back(a); - lattice[1].push_back(b); - Hypergraph forest; - GrammarPtr g(new TextGrammar); - vector grammars(1, g); - ExhaustiveBottomUpParser parser("PHRASE", grammars); - parser.Parse(lattice, &forest); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/phrasebased_translator.cc b/src/phrasebased_translator.cc deleted file mode 100644 index 5eb70876..00000000 --- a/src/phrasebased_translator.cc +++ /dev/null @@ -1,206 +0,0 @@ -#include "phrasebased_translator.h" - -#include -#include -#include -#include - -#include -#include - -#include "sentence_metadata.h" -#include "tdict.h" -#include "hg.h" -#include "filelib.h" -#include "lattice.h" -#include "phrasetable_fst.h" -#include "array2d.h" - -using namespace std; -using namespace std::tr1; -using namespace boost::tuples; - -struct Coverage : public vector { - explicit Coverage(int n, bool v = false) : vector(n, v), first_gap() {} - void Cover(int i, int j) { - vector::iterator it = this->begin() + i; - vector::iterator end = this->begin() + j; - while (it != end) - *it++ = true; - if (first_gap == i) { - first_gap = 
j; - it = end; - while (*it && it != this->end()) { - ++it; - ++first_gap; - } - } - } - bool Collides(int i, int j) const { - vector::const_iterator it = this->begin() + i; - vector::const_iterator end = this->begin() + j; - while (it != end) - if (*it++) return true; - return false; - } - int GetFirstGap() const { return first_gap; } - private: - int first_gap; -}; -struct CoverageHash { - size_t operator()(const Coverage& cov) const { - return hasher_(static_cast&>(cov)); - } - private: - boost::hash > hasher_; -}; -ostream& operator<<(ostream& os, const Coverage& cov) { - os << '['; - for (int i = 0; i < cov.size(); ++i) - os << (cov[i] ? '*' : '.'); - return os << " gap=" << cov.GetFirstGap() << ']'; -} - -typedef unordered_map CoverageNodeMap; -typedef unordered_set UniqueCoverageSet; - -struct PhraseBasedTranslatorImpl { - PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) : - add_pass_through_rules(conf.count("add_pass_through_rules")), - max_distortion(conf["pb_max_distortion"].as()), - kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)), - kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)), - kNT_TYPE(TD::Convert("X") * -1) { - assert(max_distortion >= 0); - vector gfiles = conf["grammar"].as >(); - assert(gfiles.size() == 1); - cerr << "Reading phrasetable from " << gfiles.front() << endl; - ReadFile in(gfiles.front()); - fst.reset(LoadTextPhrasetable(in.stream())); - } - - struct State { - State(const Coverage& c, int _i, int _j, const FSTNode* q) : - coverage(c), i(_i), j(_j), fst(q) {} - Coverage coverage; - int i; - int j; - const FSTNode* fst; - }; - - // we keep track of unique coverages that have been extended since it's - // possible to "extend" the same coverage twice, e.g. translate "a b c" - // with phrases "a" "b" "a b" and "c". 
There are two ways to cover "a b" - void EnqueuePossibleContinuations(const Coverage& coverage, queue* q, UniqueCoverageSet* ucs) { - if (ucs->insert(coverage).second) { - const int gap = coverage.GetFirstGap(); - const int end = min(static_cast(coverage.size()), gap + max_distortion + 1); - for (int i = gap; i < end; ++i) - if (!coverage[i]) q->push(State(coverage, i, i, fst.get())); - } - } - - bool Translate(const std::string& input, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* minus_lm_forest) { - Lattice lattice; - LatticeTools::ConvertTextOrPLF(input, &lattice); - smeta->SetSourceLength(lattice.size()); - size_t est_nodes = lattice.size() * lattice.size() * (1 << max_distortion); - minus_lm_forest->ReserveNodes(est_nodes, est_nodes * 100); - if (add_pass_through_rules) { - SparseVector feats; - feats.set_value(FD::Convert("PassThrough"), 1); - for (int i = 0; i < lattice.size(); ++i) { - const vector& arcs = lattice[i]; - for (int j = 0; j < arcs.size(); ++j) { - fst->AddPassThroughTranslation(arcs[j].label, feats); - // TODO handle lattice edge features - } - } - } - CoverageNodeMap c; - queue q; - UniqueCoverageSet ucs; - const Coverage empty_cov(lattice.size(), false); - const Coverage goal_cov(lattice.size(), true); - EnqueuePossibleContinuations(empty_cov, &q, &ucs); - c[empty_cov] = 0; // have to handle the left edge specially - while(!q.empty()) { - const State s = q.front(); - q.pop(); - // cerr << "(" << s.i << "," << s.j << " ptr=" << s.fst << ") cov=" << s.coverage << endl; - const vector& arcs = lattice[s.j]; - if (s.fst->HasData()) { - Coverage new_cov = s.coverage; - new_cov.Cover(s.i, s.j); - EnqueuePossibleContinuations(new_cov, &q, &ucs); - const vector& phrases = s.fst->GetTranslations()->GetRules(); - const int phrase_head_index = minus_lm_forest->AddNode(kNT_TYPE)->id_; - for (int i = 0; i < phrases.size(); ++i) { - Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector()); - 
edge->feature_values_ = edge->rule_->scores_; - minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index); - } - CoverageNodeMap::iterator cit = c.find(s.coverage); - assert(cit != c.end()); - const int tail_node_plus1 = cit->second; - if (tail_node_plus1 == 0) { // left edge - c[new_cov] = phrase_head_index + 1; - } else { // not left edge - int& head_node_plus1 = c[new_cov]; - if (!head_node_plus1) - head_node_plus1 = minus_lm_forest->AddNode(kNT_TYPE)->id_ + 1; - Hypergraph::TailNodeVector tail(2, tail_node_plus1 - 1); - tail[1] = phrase_head_index; - const int concat_edge = minus_lm_forest->AddEdge(kCONCAT_RULE, tail)->id_; - minus_lm_forest->ConnectEdgeToHeadNode(concat_edge, head_node_plus1 - 1); - } - } - if (s.j == lattice.size()) continue; - for (int l = 0; l < arcs.size(); ++l) { - const LatticeArc& arc = arcs[l]; - - const FSTNode* next_fst_state = s.fst->Extend(arc.label); - const int next_j = s.j + arc.dist2next; - if (next_fst_state && - !s.coverage.Collides(s.i, next_j)) { - q.push(State(s.coverage, s.i, next_j, next_fst_state)); - } - } - } - if (add_pass_through_rules) - fst->ClearPassThroughTranslations(); - int pregoal_plus1 = c[goal_cov]; - if (pregoal_plus1 > 0) { - TRulePtr kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [X,1]")); - int goal = minus_lm_forest->AddNode(TD::Convert("Goal") * -1)->id_; - int gedge = minus_lm_forest->AddEdge(kGOAL_RULE, Hypergraph::TailNodeVector(1, pregoal_plus1 - 1))->id_; - minus_lm_forest->ConnectEdgeToHeadNode(gedge, goal); - // they are almost topo, but not quite always - minus_lm_forest->TopologicallySortNodesAndEdges(goal); - minus_lm_forest->Reweight(weights); - return true; - } else { - return false; // composition failed - } - } - - const bool add_pass_through_rules; - const int max_distortion; - TRulePtr kSOURCE_RULE; - const TRulePtr kCONCAT_RULE; - const WordID kNT_TYPE; - boost::shared_ptr fst; -}; - -PhraseBasedTranslator::PhraseBasedTranslator(const boost::program_options::variables_map& 
conf) : - pimpl_(new PhraseBasedTranslatorImpl(conf)) {} - -bool PhraseBasedTranslator::Translate(const std::string& input, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* minus_lm_forest) { - return pimpl_->Translate(input, smeta, weights, minus_lm_forest); -} diff --git a/src/phrasebased_translator.h b/src/phrasebased_translator.h deleted file mode 100644 index d42ce79c..00000000 --- a/src/phrasebased_translator.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _PHRASEBASED_TRANSLATOR_H_ -#define _PHRASEBASED_TRANSLATOR_H_ - -#include "translator.h" - -class PhraseBasedTranslatorImpl; -class PhraseBasedTranslator : public Translator { - public: - PhraseBasedTranslator(const boost::program_options::variables_map& conf); - bool Translate(const std::string& input, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* minus_lm_forest); - private: - boost::shared_ptr pimpl_; -}; - -#endif diff --git a/src/phrasetable_fst.cc b/src/phrasetable_fst.cc deleted file mode 100644 index f421e941..00000000 --- a/src/phrasetable_fst.cc +++ /dev/null @@ -1,141 +0,0 @@ -#include "phrasetable_fst.h" - -#include -#include -#include - -#include - -#include "filelib.h" -#include "tdict.h" - -using boost::shared_ptr; -using namespace std; - -TargetPhraseSet::~TargetPhraseSet() {} -FSTNode::~FSTNode() {} - -class TextTargetPhraseSet : public TargetPhraseSet { - public: - void AddRule(TRulePtr rule) { - rules_.push_back(rule); - } - const vector& GetRules() const { - return rules_; - } - - private: - // all rules must have arity 0 - vector rules_; -}; - -class TextFSTNode : public FSTNode { - public: - const TargetPhraseSet* GetTranslations() const { return data.get(); } - bool HasData() const { return (bool)data; } - bool HasOutgoingNonEpsilonEdges() const { return !ptr.empty(); } - const FSTNode* Extend(const WordID& t) const { - map::const_iterator it = ptr.find(t); - if (it == ptr.end()) return NULL; - return &it->second; - } - - void AddPhrase(const 
string& phrase); - - void AddPassThroughTranslation(const WordID& w, const SparseVector& feats); - void ClearPassThroughTranslations(); - private: - vector passthroughs; - shared_ptr data; - map ptr; -}; - -#ifdef DEBUG_CHART_PARSER -static string TrimRule(const string& r) { - size_t start = r.find(" |||") + 5; - size_t end = r.rfind(" |||"); - return r.substr(start, end - start); -} -#endif - -void TextFSTNode::AddPhrase(const string& phrase) { - vector words; - TRulePtr rule(TRule::CreateRulePhrasetable(phrase)); - if (!rule) { - static int err = 0; - ++err; - if (err > 2) { cerr << "TOO MANY PHRASETABLE ERRORS\n"; exit(1); } - return; - } - - TextFSTNode* fsa = this; - for (int i = 0; i < rule->FLength(); ++i) - fsa = &fsa->ptr[rule->f_[i]]; - - if (!fsa->data) - fsa->data.reset(new TextTargetPhraseSet); - static_cast(fsa->data.get())->AddRule(rule); -} - -void TextFSTNode::AddPassThroughTranslation(const WordID& w, const SparseVector& feats) { - TextFSTNode* next = &ptr[w]; - // current, rules are only added if the symbol is completely missing as a - // word starting the phrase. As a result, it is possible that some sentences - // won't parse. If this becomes a problem, fix it here. - if (!next->data) { - TextTargetPhraseSet* tps = new TextTargetPhraseSet; - next->data.reset(tps); - TRule* rule = new TRule; - rule->e_.resize(1, w); - rule->f_.resize(1, w); - rule->lhs_ = TD::Convert("___PHRASE") * -1; - rule->scores_ = feats; - rule->arity_ = 0; - tps->AddRule(TRulePtr(rule)); - passthroughs.push_back(w); - } -} - -void TextFSTNode::ClearPassThroughTranslations() { - for (int i = 0; i < passthroughs.size(); ++i) - ptr.erase(passthroughs[i]); - passthroughs.clear(); -} - -static void AddPhrasetableToFST(istream* in, TextFSTNode* fst) { - int lc = 0; - bool flag = false; - while(*in) { - string line; - getline(*in, line); - if (line.empty()) continue; - ++lc; - fst->AddPhrase(line); - if (lc % 10000 == 0) { flag = true; cerr << '.' 
<< flush; } - if (lc % 500000 == 0) { flag = false; cerr << " [" << lc << ']' << endl << flush; } - } - if (flag) cerr << endl; - cerr << "Loaded " << lc << " source phrases\n"; -} - -FSTNode* LoadTextPhrasetable(istream* in) { - TextFSTNode *fst = new TextFSTNode; - AddPhrasetableToFST(in, fst); - return fst; -} - -FSTNode* LoadTextPhrasetable(const vector& filenames) { - TextFSTNode* fst = new TextFSTNode; - for (int i = 0; i < filenames.size(); ++i) { - ReadFile rf(filenames[i]); - cerr << "Reading phrase from " << filenames[i] << endl; - AddPhrasetableToFST(rf.stream(), fst); - } - return fst; -} - -FSTNode* LoadBinaryPhrasetable(const string& fname_prefix) { - (void) fname_prefix; - assert(!"not implemented yet"); -} - diff --git a/src/phrasetable_fst.h b/src/phrasetable_fst.h deleted file mode 100644 index 477de1f7..00000000 --- a/src/phrasetable_fst.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _PHRASETABLE_FST_H_ -#define _PHRASETABLE_FST_H_ - -#include -#include - -#include "sparse_vector.h" -#include "trule.h" - -class TargetPhraseSet { - public: - virtual ~TargetPhraseSet(); - virtual const std::vector& GetRules() const = 0; -}; - -class FSTNode { - public: - virtual ~FSTNode(); - virtual const TargetPhraseSet* GetTranslations() const = 0; - virtual bool HasData() const = 0; - virtual bool HasOutgoingNonEpsilonEdges() const = 0; - virtual const FSTNode* Extend(const WordID& t) const = 0; - - // these should only be called on q_0: - virtual void AddPassThroughTranslation(const WordID& w, const SparseVector& feats) = 0; - virtual void ClearPassThroughTranslations() = 0; -}; - -// attn caller: you own the memory -FSTNode* LoadTextPhrasetable(const std::vector& filenames); -FSTNode* LoadTextPhrasetable(std::istream* in); -FSTNode* LoadBinaryPhrasetable(const std::string& fname_prefix); - -#endif diff --git a/src/prob.h b/src/prob.h deleted file mode 100644 index bc297870..00000000 --- a/src/prob.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _PROB_H_ -#define _PROB_H_ 
- -#include "logval.h" - -typedef LogVal prob_t; - -#endif diff --git a/src/sampler.h b/src/sampler.h deleted file mode 100644 index e5840f41..00000000 --- a/src/sampler.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef SAMPLER_H_ -#define SAMPLER_H_ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "prob.h" - -struct SampleSet; - -template -struct RandomNumberGenerator { - static uint32_t GetTrulyRandomSeed() { - uint32_t seed; - std::ifstream r("/dev/urandom"); - if (r) { - r.read((char*)&seed,sizeof(uint32_t)); - } - if (r.fail() || !r) { - std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl; - seed = time(NULL); - } - std::cerr << "Seeding random number sequence to " << seed << std::endl; - return seed; - } - - RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { - uint32_t seed = GetTrulyRandomSeed(); - m_generator.seed(seed); - } - explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { - if (!seed) seed = GetTrulyRandomSeed(); - m_generator.seed(seed); - } - - size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) { - if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; - } else { - assert(!"not implemented"); - } - } - - // T is the annealing temperature, if desired - size_t SelectSample(const SampleSet& ss, double T = 1.0); - - // draw a value from U(0,1) - double next() {return m_random();} - - // draw a value from N(mean,var) - double NextNormal(double mean, double var) { - return boost::normal_distribution(mean, var)(m_random); - } - - // draw a value from a Poisson distribution - // lambda must be greater than 0 - int NextPoisson(int lambda) { - return boost::poisson_distribution(lambda)(m_random); - } - - bool AcceptMetropolisHastings(const prob_t& p_cur, - const prob_t& p_prev, - const prob_t& q_cur, - const prob_t& q_prev) { 
- const prob_t a = (p_cur / p_prev) * (q_prev / q_cur); - if (log(a) >= 0.0) return true; - return (prob_t(this->next()) < a); - } - - private: - boost::uniform_real<> m_dist; - RNG m_generator; - boost::variate_generator > m_random; -}; - -typedef RandomNumberGenerator MT19937; - -class SampleSet { - public: - const prob_t& operator[](int i) const { return m_scores[i]; } - bool empty() const { return m_scores.empty(); } - void add(const prob_t& s) { m_scores.push_back(s); } - void clear() { m_scores.clear(); } - size_t size() const { return m_scores.size(); } - std::vector m_scores; -}; - -template -size_t RandomNumberGenerator::SelectSample(const SampleSet& ss, double T) { - assert(T > 0.0); - assert(ss.m_scores.size() > 0); - if (ss.m_scores.size() == 1) return 0; - const prob_t annealing_factor(1.0 / T); - const bool anneal = (annealing_factor != prob_t::One()); - prob_t sum = prob_t::Zero(); - if (anneal) { - for (int i = 0; i < ss.m_scores.size(); ++i) - sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T) - } else { - sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); - } - //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; - //std::cerr << std::endl; - - prob_t random(this->next()); // random number between 0 and 1 - random *= sum; // scale with normalization factor - //std::cerr << "Random number " << random << std::endl; - - //now figure out which sample - size_t position = 1; - sum = ss.m_scores[0]; - if (anneal) { - sum.poweq(annealing_factor); - for (; position < ss.m_scores.size() && sum < random; ++position) - sum += ss.m_scores[position].pow(annealing_factor); - } else { - for (; position < ss.m_scores.size() && sum < random; ++position) - sum += ss.m_scores[position]; - } - //std::cout << "random: " << random << " sample: " << position << std::endl; - //std::cerr << "Sample: " << position-1 << std::endl; - //exit(1); - return position-1; -} - -#endif diff --git 
a/src/scfg_translator.cc b/src/scfg_translator.cc deleted file mode 100644 index 03602c6b..00000000 --- a/src/scfg_translator.cc +++ /dev/null @@ -1,66 +0,0 @@ -#include "translator.h" - -#include - -#include "hg.h" -#include "grammar.h" -#include "bottom_up_parser.h" -#include "sentence_metadata.h" - -using namespace std; - -Translator::~Translator() {} - -struct SCFGTranslatorImpl { - SCFGTranslatorImpl(const boost::program_options::variables_map& conf) : - max_span_limit(conf["scfg_max_span_limit"].as()), - add_pass_through_rules(conf.count("add_pass_through_rules")), - goal(conf["goal"].as()), - default_nt(conf["scfg_default_nt"].as()) { - vector gfiles = conf["grammar"].as >(); - for (int i = 0; i < gfiles.size(); ++i) { - cerr << "Reading SCFG grammar from " << gfiles[i] << endl; - TextGrammar* g = new TextGrammar(gfiles[i]); - g->SetMaxSpan(max_span_limit); - grammars.push_back(GrammarPtr(g)); - } - if (!conf.count("scfg_no_hiero_glue_grammar")) - grammars.push_back(GrammarPtr(new GlueGrammar(goal, default_nt))); - if (conf.count("scfg_extra_glue_grammar")) - grammars.push_back(GrammarPtr(new GlueGrammar(conf["scfg_extra_glue_grammar"].as()))); - } - - const int max_span_limit; - const bool add_pass_through_rules; - const string goal; - const string default_nt; - vector grammars; - - bool Translate(const string& input, - SentenceMetadata* smeta, - const vector& weights, - Hypergraph* forest) { - vector glist = grammars; - Lattice lattice; - LatticeTools::ConvertTextOrPLF(input, &lattice); - smeta->SetSourceLength(lattice.size()); - if (add_pass_through_rules) - glist.push_back(GrammarPtr(new PassThroughGrammar(lattice, default_nt))); - ExhaustiveBottomUpParser parser(goal, glist); - if (!parser.Parse(lattice, forest)) - return false; - forest->Reweight(weights); - return true; - } -}; - -SCFGTranslator::SCFGTranslator(const boost::program_options::variables_map& conf) : - pimpl_(new SCFGTranslatorImpl(conf)) {} - -bool SCFGTranslator::Translate(const string& 
input, - SentenceMetadata* smeta, - const vector& weights, - Hypergraph* minus_lm_forest) { - return pimpl_->Translate(input, smeta, weights, minus_lm_forest); -} - diff --git a/src/sentence_metadata.h b/src/sentence_metadata.h deleted file mode 100644 index ef9eb388..00000000 --- a/src/sentence_metadata.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef _SENTENCE_METADATA_H_ -#define _SENTENCE_METADATA_H_ - -#include -#include "lattice.h" - -struct SentenceMetadata { - SentenceMetadata(int id, const Lattice& ref) : - sent_id_(id), - src_len_(-1), - has_reference_(ref.size() > 0), - trg_len_(ref.size()), - ref_(has_reference_ ? &ref : NULL) {} - - // this should be called by the Translator object after - // it has parsed the source - void SetSourceLength(int sl) { src_len_ = sl; } - - // this should be called if a separate model needs to - // specify how long the target sentence should be - void SetTargetLength(int tl) { - assert(!has_reference_); - trg_len_ = tl; - } - bool HasReference() const { return has_reference_; } - const Lattice& GetReference() const { return *ref_; } - int GetSourceLength() const { return src_len_; } - int GetTargetLength() const { return trg_len_; } - int GetSentenceID() const { return sent_id_; } - // this will be empty if the translator accepts non FS input! - const Lattice& GetSourceLattice() const { return src_lattice_; } - - private: - const int sent_id_; - // the following should be set, if possible, by the Translator - int src_len_; - public: - Lattice src_lattice_; // this will only be set if inputs are finite state! 
- private: - // you need to be very careful when depending on these values - // they will only be set during training / alignment contexts - const bool has_reference_; - int trg_len_; - const Lattice* const ref_; -}; - -#endif diff --git a/src/small_vector.h b/src/small_vector.h deleted file mode 100644 index 800c1df1..00000000 --- a/src/small_vector.h +++ /dev/null @@ -1,187 +0,0 @@ -#ifndef _SMALL_VECTOR_H_ - -#include // std::max - where to get this? -#include -#include - -#define __SV_MAX_STATIC 2 - -class SmallVector { - - public: - SmallVector() : size_(0) {} - - explicit SmallVector(size_t s, int v = 0) : size_(s) { - assert(s < 0x80); - if (s <= __SV_MAX_STATIC) { - for (int i = 0; i < s; ++i) data_.vals[i] = v; - } else { - capacity_ = s; - size_ = s; - data_.ptr = new int[s]; - for (int i = 0; i < size_; ++i) data_.ptr[i] = v; - } - } - - SmallVector(const SmallVector& o) : size_(o.size_) { - if (size_ <= __SV_MAX_STATIC) { - for (int i = 0; i < __SV_MAX_STATIC; ++i) data_.vals[i] = o.data_.vals[i]; - } else { - capacity_ = size_ = o.size_; - data_.ptr = new int[capacity_]; - std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int)); - } - } - - const SmallVector& operator=(const SmallVector& o) { - if (size_ <= __SV_MAX_STATIC) { - if (o.size_ <= __SV_MAX_STATIC) { - size_ = o.size_; - for (int i = 0; i < __SV_MAX_STATIC; ++i) data_.vals[i] = o.data_.vals[i]; - } else { - capacity_ = size_ = o.size_; - data_.ptr = new int[capacity_]; - std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(int)); - } - } else { - if (o.size_ <= __SV_MAX_STATIC) { - delete[] data_.ptr; - size_ = o.size_; - for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i]; - } else { - if (capacity_ < o.size_) { - delete[] data_.ptr; - capacity_ = o.size_; - data_.ptr = new int[capacity_]; - } - size_ = o.size_; - for (int i = 0; i < size_; ++i) - data_.ptr[i] = o.data_.ptr[i]; - } - } - return *this; - } - - ~SmallVector() { - if (size_ <= __SV_MAX_STATIC) return; - delete[] 
data_.ptr; - } - - void clear() { - if (size_ > __SV_MAX_STATIC) { - delete[] data_.ptr; - } - size_ = 0; - } - - bool empty() const { return size_ == 0; } - size_t size() const { return size_; } - - inline void ensure_capacity(unsigned char min_size) { - assert(min_size > __SV_MAX_STATIC); - if (min_size < capacity_) return; - unsigned char new_cap = std::max(static_cast(capacity_ << 1), min_size); - int* tmp = new int[new_cap]; - std::memcpy(tmp, data_.ptr, capacity_ * sizeof(int)); - delete[] data_.ptr; - data_.ptr = tmp; - capacity_ = new_cap; - } - - inline void copy_vals_to_ptr() { - capacity_ = __SV_MAX_STATIC * 2; - int* tmp = new int[capacity_]; - for (int i = 0; i < __SV_MAX_STATIC; ++i) tmp[i] = data_.vals[i]; - data_.ptr = tmp; - } - - inline void push_back(int v) { - if (size_ < __SV_MAX_STATIC) { - data_.vals[size_] = v; - ++size_; - return; - } else if (size_ == __SV_MAX_STATIC) { - copy_vals_to_ptr(); - } else if (size_ == capacity_) { - ensure_capacity(size_ + 1); - } - data_.ptr[size_] = v; - ++size_; - } - - int& back() { return this->operator[](size_ - 1); } - const int& back() const { return this->operator[](size_ - 1); } - int& front() { return this->operator[](0); } - const int& front() const { return this->operator[](0); } - - void resize(size_t s, int v = 0) { - if (s <= __SV_MAX_STATIC) { - if (size_ > __SV_MAX_STATIC) { - int tmp[__SV_MAX_STATIC]; - for (int i = 0; i < s; ++i) tmp[i] = data_.ptr[i]; - delete[] data_.ptr; - for (int i = 0; i < s; ++i) data_.vals[i] = tmp[i]; - size_ = s; - return; - } - if (s <= size_) { - size_ = s; - return; - } else { - for (int i = size_; i < s; ++i) - data_.vals[i] = v; - size_ = s; - return; - } - } else { - if (size_ <= __SV_MAX_STATIC) - copy_vals_to_ptr(); - if (s > capacity_) - ensure_capacity(s); - if (s > size_) { - for (int i = size_; i < s; ++i) - data_.ptr[i] = v; - } - size_ = s; - } - } - - int& operator[](size_t i) { - if (size_ <= __SV_MAX_STATIC) return data_.vals[i]; - return 
data_.ptr[i]; - } - - const int& operator[](size_t i) const { - if (size_ <= __SV_MAX_STATIC) return data_.vals[i]; - return data_.ptr[i]; - } - - bool operator==(const SmallVector& o) const { - if (size_ != o.size_) return false; - if (size_ <= __SV_MAX_STATIC) { - for (size_t i = 0; i < size_; ++i) - if (data_.vals[i] != o.data_.vals[i]) return false; - return true; - } else { - for (size_t i = 0; i < size_; ++i) - if (data_.ptr[i] != o.data_.ptr[i]) return false; - return true; - } - } - - private: - unsigned char capacity_; // only defined when size_ >= __SV_MAX_STATIC - unsigned char size_; - union StorageType { - int vals[__SV_MAX_STATIC]; - int* ptr; - }; - StorageType data_; - -}; - -inline bool operator!=(const SmallVector& a, const SmallVector& b) { - return !(a==b); -} - -#endif diff --git a/src/small_vector_test.cc b/src/small_vector_test.cc deleted file mode 100644 index 84237791..00000000 --- a/src/small_vector_test.cc +++ /dev/null @@ -1,129 +0,0 @@ -#include "small_vector.h" - -#include -#include -#include -#include - -using namespace std; - -class SVTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -TEST_F(SVTest, LargerThan2) { - SmallVector v; - SmallVector v2; - v.push_back(0); - v.push_back(1); - v.push_back(2); - assert(v.size() == 3); - assert(v[2] == 2); - assert(v[1] == 1); - assert(v[0] == 0); - v2 = v; - SmallVector copy(v); - assert(copy.size() == 3); - assert(copy[0] == 0); - assert(copy[1] == 1); - assert(copy[2] == 2); - assert(copy == v2); - copy[1] = 99; - assert(copy != v2); - assert(v2.size() == 3); - assert(v2[2] == 2); - assert(v2[1] == 1); - assert(v2[0] == 0); - v2[0] = -2; - v2[1] = -1; - v2[2] = 0; - assert(v2[2] == 0); - assert(v2[1] == -1); - assert(v2[0] == -2); - SmallVector v3(1,1); - assert(v3[0] == 1); - v2 = v3; - assert(v2.size() == 1); - assert(v2[0] == 1); - SmallVector v4(10, 1); - assert(v4.size() == 10); - assert(v4[5] == 1); - assert(v4[9] == 1); - v4 = 
v; - assert(v4.size() == 3); - assert(v4[2] == 2); - assert(v4[1] == 1); - assert(v4[0] == 0); - SmallVector v5(10, 2); - assert(v5.size() == 10); - assert(v5[7] == 2); - assert(v5[0] == 2); - assert(v.size() == 3); - v = v5; - assert(v.size() == 10); - assert(v[2] == 2); - assert(v[9] == 2); - SmallVector cc; - for (int i = 0; i < 33; ++i) - cc.push_back(i); - for (int i = 0; i < 33; ++i) - assert(cc[i] == i); - cc.resize(20); - assert(cc.size() == 20); - for (int i = 0; i < 20; ++i) - assert(cc[i] == i); - cc[0]=-1; - cc.resize(1, 999); - assert(cc.size() == 1); - assert(cc[0] == -1); - cc.resize(99, 99); - for (int i = 1; i < 99; ++i) { - cerr << i << " " << cc[i] << endl; - assert(cc[i] == 99); - } - cc.clear(); - assert(cc.size() == 0); -} - -TEST_F(SVTest, Small) { - SmallVector v; - SmallVector v1(1,0); - SmallVector v2(2,10); - SmallVector v1a(2,0); - EXPECT_TRUE(v1 != v1a); - EXPECT_TRUE(v1 == v1); - EXPECT_EQ(v1[0], 0); - EXPECT_EQ(v2[1], 10); - EXPECT_EQ(v2[0], 10); - ++v2[1]; - --v2[0]; - EXPECT_EQ(v2[0], 9); - EXPECT_EQ(v2[1], 11); - SmallVector v3(v2); - assert(v3[0] == 9); - assert(v3[1] == 11); - assert(!v3.empty()); - assert(v3.size() == 2); - v3.clear(); - assert(v3.empty()); - assert(v3.size() == 0); - assert(v3 != v2); - assert(v2 != v3); - v3 = v2; - assert(v3 == v2); - assert(v2 == v3); - assert(v3[0] == 9); - assert(v3[1] == 11); - assert(!v3.empty()); - assert(v3.size() == 2); - cerr << sizeof(SmallVector) << endl; - cerr << sizeof(vector) << endl; -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/src/sparse_vector.cc b/src/sparse_vector.cc deleted file mode 100644 index 4035b9ef..00000000 --- a/src/sparse_vector.cc +++ /dev/null @@ -1,98 +0,0 @@ -#include "sparse_vector.h" - -#include -#include - -#include "hg_io.h" - -using namespace std; - -namespace B64 { - -void Encode(double objective, const SparseVector& v, ostream* out) { - const int num_feats = v.num_active(); 
- size_t tot_size = 0; - const size_t off_objective = tot_size; - tot_size += sizeof(double); // objective - const size_t off_num_feats = tot_size; - tot_size += sizeof(int); // num_feats - const size_t off_data = tot_size; - tot_size += sizeof(unsigned char) * num_feats; // lengths of feature names; - typedef SparseVector::const_iterator const_iterator; - for (const_iterator it = v.begin(); it != v.end(); ++it) - tot_size += FD::Convert(it->first).size(); // feature names; - tot_size += sizeof(double) * num_feats; // gradient - const size_t off_magic = tot_size; - tot_size += 4; // magic - - // size_t b64_size = tot_size * 4 / 3; - // cerr << "Sparse vector binary size: " << tot_size << " (b64 size=" << b64_size << ")\n"; - char* data = new char[tot_size]; - *reinterpret_cast(&data[off_objective]) = objective; - *reinterpret_cast(&data[off_num_feats]) = num_feats; - char* cur = &data[off_data]; - assert(cur - data == off_data); - for (const_iterator it = v.begin(); it != v.end(); ++it) { - const string& fname = FD::Convert(it->first); - *cur++ = static_cast(fname.size()); // name len - memcpy(cur, &fname[0], fname.size()); - cur += fname.size(); - *reinterpret_cast(cur) = it->second; - cur += sizeof(double); - } - assert(cur - data == off_magic); - *reinterpret_cast(cur) = 0xBAABABBAu; - cur += sizeof(unsigned int); - assert(cur - data == tot_size); - b64encode(data, tot_size, out); - delete[] data; -} - -bool Decode(double* objective, SparseVector* v, const char* in, size_t size) { - v->clear(); - if (size % 4 != 0) { - cerr << "B64 error - line % 4 != 0\n"; - return false; - } - const size_t decoded_size = size * 3 / 4 - sizeof(unsigned int); - const size_t buf_size = decoded_size + sizeof(unsigned int); - if (decoded_size < 6) { cerr << "SparseVector decoding error: too short!\n"; return false; } - char* data = new char[buf_size]; - if (!b64decode(reinterpret_cast(in), size, data, buf_size)) { - delete[] data; - return false; - } - size_t cur = 0; - *objective 
= *reinterpret_cast(data); - cur += sizeof(double); - const int num_feats = *reinterpret_cast(&data[cur]); - cur += sizeof(int); - int fc = 0; - while(fc < num_feats && cur < decoded_size) { - ++fc; - const int fname_len = data[cur++]; - assert(fname_len > 0); - assert(fname_len < 256); - string fname(fname_len, '\0'); - memcpy(&fname[0], &data[cur], fname_len); - cur += fname_len; - const double val = *reinterpret_cast(&data[cur]); - cur += sizeof(double); - int fid = FD::Convert(fname); - v->set_value(fid, val); - } - if(num_feats != fc) { - cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n"; - delete[] data; - return false; - } - if (*reinterpret_cast(&data[cur]) != 0xBAABABBAu) { - cerr << "SparseVector decodeding error : magic does not match!\n"; - delete[] data; - return false; - } - delete[] data; - return true; -} - -} diff --git a/src/sparse_vector.h b/src/sparse_vector.h deleted file mode 100644 index 6a8c9bf4..00000000 --- a/src/sparse_vector.h +++ /dev/null @@ -1,264 +0,0 @@ -#ifndef _SPARSE_VECTOR_H_ -#define _SPARSE_VECTOR_H_ - -// this is a modified version of code originally written -// by Phil Blunsom - -#include -#include -#include -#include - -#include "fdict.h" - -template -class SparseVector { -public: - SparseVector() {} - - const T operator[](int index) const { - typename std::map::const_iterator found = _values.find(index); - if (found == _values.end()) - return T(0); - else - return found->second; - } - - void set_value(int index, const T &value) { - _values[index] = value; - } - - void add_value(int index, const T &value) { - _values[index] += value; - } - - T value(int index) const { - typename std::map::const_iterator found = _values.find(index); - if (found != _values.end()) - return found->second; - else - return T(0); - } - - void store(std::valarray* target) const { - (*target) *= 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) { - if (it->first >= target->size()) 
break; - (*target)[it->first] = it->second; - } - } - - int max_index() const { - if (_values.empty()) return 0; - typename std::map::const_iterator found =_values.end(); - --found; - return found->first; - } - - // dot product with a unit vector of the same length - // as the sparse vector - T dot() const { - T sum = 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - sum += it->second; - return sum; - } - - template - S dot(const SparseVector &vec) const { - S sum = 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - { - typename std::map::const_iterator - found = vec._values.find(it->first); - if (found != vec._values.end()) - sum += it->second * found->second; - } - return sum; - } - - template - S dot(const std::vector &vec) const { - S sum = 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - { - if (it->first < static_cast(vec.size())) - sum += it->second * vec[it->first]; - } - return sum; - } - - template - S dot(const S *vec) const { - // this is not range checked! 
- S sum = 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - sum += it->second * vec[it->first]; - std::cout << "dot(*vec) " << sum << std::endl; - return sum; - } - - T l1norm() const { - T sum = 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - sum += fabs(it->second); - return sum; - } - - T l2norm() const { - T sum = 0; - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - sum += it->second * it->second; - return sqrt(sum); - } - - SparseVector &operator+=(const SparseVector &other) { - for (typename std::map::const_iterator - it = other._values.begin(); it != other._values.end(); ++it) - { - T v = (_values[it->first] += it->second); - if (v == 0) - _values.erase(it->first); - } - return *this; - } - - SparseVector &operator-=(const SparseVector &other) { - for (typename std::map::const_iterator - it = other._values.begin(); it != other._values.end(); ++it) - { - T v = (_values[it->first] -= it->second); - if (v == 0) - _values.erase(it->first); - } - return *this; - } - - SparseVector &operator-=(const double &x) { - for (typename std::map::iterator - it = _values.begin(); it != _values.end(); ++it) - it->second -= x; - return *this; - } - - SparseVector &operator+=(const double &x) { - for (typename std::map::iterator - it = _values.begin(); it != _values.end(); ++it) - it->second += x; - return *this; - } - - SparseVector &operator/=(const double &x) { - for (typename std::map::iterator - it = _values.begin(); it != _values.end(); ++it) - it->second /= x; - return *this; - } - - SparseVector &operator*=(const T& x) { - for (typename std::map::iterator - it = _values.begin(); it != _values.end(); ++it) - it->second *= x; - return *this; - } - - SparseVector operator+(const double &x) const { - SparseVector result = *this; - return result += x; - } - - SparseVector operator-(const double &x) const { - SparseVector result = *this; 
- return result -= x; - } - - SparseVector operator/(const double &x) const { - SparseVector result = *this; - return result /= x; - } - - std::ostream &operator<<(std::ostream &out) const { - for (typename std::map::const_iterator - it = _values.begin(); it != _values.end(); ++it) - out << (it == _values.begin() ? "" : ";") - << FD::Convert(it->first) << '=' << it->second; - return out; - } - - bool operator<(const SparseVector &other) const { - typename std::map::const_iterator it = _values.begin(); - typename std::map::const_iterator other_it = other._values.begin(); - - for (; it != _values.end() && other_it != other._values.end(); ++it, ++other_it) - { - if (it->first < other_it->first) return true; - if (it->first > other_it->first) return false; - if (it->second < other_it->second) return true; - if (it->second > other_it->second) return false; - } - return _values.size() < other._values.size(); - } - - int num_active() const { return _values.size(); } - bool empty() const { return _values.empty(); } - - typedef typename std::map::const_iterator const_iterator; - const_iterator begin() const { return _values.begin(); } - const_iterator end() const { return _values.end(); } - - void clear() { - _values.clear(); - } - - void swap(SparseVector& other) { - _values.swap(other._values); - } - -private: - std::map _values; -}; - -template -SparseVector operator+(const SparseVector& a, const SparseVector& b) { - SparseVector result = a; - return result += b; -} - -template -SparseVector operator*(const SparseVector& a, const double& b) { - SparseVector result = a; - return result *= b; -} - -template -SparseVector operator*(const SparseVector& a, const T& b) { - SparseVector result = a; - return result *= b; -} - -template -SparseVector operator*(const double& a, const SparseVector& b) { - SparseVector result = b; - return result *= a; -} - -template -std::ostream &operator<<(std::ostream &out, const SparseVector &vec) -{ - return vec.operator<<(out); -} - 
-namespace B64 { - void Encode(double objective, const SparseVector& v, std::ostream* out); - // returns false if failed to decode - bool Decode(double* objective, SparseVector* v, const char* data, size_t size); -} - -#endif diff --git a/src/stringlib.cc b/src/stringlib.cc deleted file mode 100644 index 3ed74bef..00000000 --- a/src/stringlib.cc +++ /dev/null @@ -1,97 +0,0 @@ -#include "stringlib.h" - -#include -#include -#include -#include - -#include "lattice.h" - -using namespace std; - -void ParseTranslatorInput(const string& line, string* input, string* ref) { - size_t hint = 0; - if (line.find("{\"rules\":") == 0) { - hint = line.find("}}"); - if (hint == string::npos) { - cerr << "Syntax error: " << line << endl; - abort(); - } - hint += 2; - } - size_t pos = line.find("|||", hint); - if (pos == string::npos) { *input = line; return; } - ref->clear(); - *input = line.substr(0, pos - 1); - string rline = line.substr(pos + 4); - if (rline.size() > 0) { - assert(ref); - *ref = rline; - } -} - -void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { - string sref; - ParseTranslatorInput(line, input, &sref); - if (sref.size() > 0) { - assert(ref); - LatticeTools::ConvertTextOrPLF(sref, ref); - } -} - -void ProcessAndStripSGML(string* pline, map* out) { - map& meta = *out; - string& line = *pline; - string lline = LowercaseString(line); - if (lline.find(""); - if (close == string::npos) return; // error - size_t end = lline.find(""); - string seg = Trim(lline.substr(4, close-4)); - string text = line.substr(close+1, end - close - 1); - for (size_t i = 1; i < seg.size(); i++) { - if (seg[i] == '=' && seg[i-1] == ' ') { - string less = seg.substr(0, i-1) + seg.substr(i); - seg = less; i = 0; continue; - } - if (seg[i] == '=' && seg[i+1] == ' ') { - string less = seg.substr(0, i+1); - if (i+2 < seg.size()) less += seg.substr(i+2); - seg = less; i = 0; continue; - } - } - line = Trim(text); - if (seg == "") return; - for (size_t i = 1; i < 
seg.size(); i++) { - if (seg[i] == '=') { - string label = seg.substr(0, i); - string val = seg.substr(i+1); - if (val[0] == '"') { - val = val.substr(1); - size_t close = val.find('"'); - if (close == string::npos) { - cerr << "SGML parse error: missing \"\n"; - seg = ""; - i = 0; - } else { - seg = val.substr(close+1); - val = val.substr(0, close); - i = 0; - } - } else { - size_t close = val.find(' '); - if (close == string::npos) { - seg = ""; - i = 0; - } else { - seg = val.substr(close+1); - val = val.substr(0, close); - } - } - label = Trim(label); - seg = Trim(seg); - meta[label] = val; - } - } -} - diff --git a/src/stringlib.h b/src/stringlib.h deleted file mode 100644 index 76efee8f..00000000 --- a/src/stringlib.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef _STRINGLIB_H_ - -#include -#include -#include -#include - -// read line in the form of either: -// source -// source ||| target -// source will be returned as a string, target must be a sentence or -// a lattice (in PLF format) and will be returned as a Lattice object -void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); -struct Lattice; -void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); - -inline const std::string Trim(const std::string& str, const std::string& dropChars = " \t") { - std::string res = str; - res.erase(str.find_last_not_of(dropChars)+1); - return res.erase(0, res.find_first_not_of(dropChars)); -} - -inline void Tokenize(const std::string& str, char delimiter, std::vector* res) { - std::string s = str; - int last = 0; - res->clear(); - for (int i=0; i < s.size(); ++i) - if (s[i] == delimiter) { - s[i]=0; - if (last != i) { - res->push_back(&s[last]); - } - last = i + 1; - } - if (last != s.size()) - res->push_back(&s[last]); -} - -inline std::string LowercaseString(const std::string& in) { - std::string res(in.size(),' '); - for (int i = 0; i < in.size(); ++i) - res[i] = tolower(in[i]); - return res; -} - 
-inline int CountSubstrings(const std::string& str, const std::string& sub) { - size_t p = 0; - int res = 0; - while (p < str.size()) { - p = str.find(sub, p); - if (p == std::string::npos) break; - ++res; - p += sub.size(); - } - return res; -} - -inline int SplitOnWhitespace(const std::string& in, std::vector* out) { - out->clear(); - int i = 0; - int start = 0; - std::string cur; - while(i < in.size()) { - if (in[i] == ' ' || in[i] == '\t') { - if (i - start > 0) - out->push_back(in.substr(start, i - start)); - start = i + 1; - } - ++i; - } - if (i > start) - out->push_back(in.substr(start, i - start)); - return out->size(); -} - -inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) { - cmd->clear(); - param->clear(); - std::vector x; - SplitOnWhitespace(in, &x); - if (x.size() == 0) return; - *cmd = x[0]; - for (int i = 1; i < x.size(); ++i) { - if (i > 1) { *param += " "; } - *param += x[i]; - } -} - -void ProcessAndStripSGML(std::string* line, std::map* out); - -// given the first character of a UTF8 block, find out how wide it is -// see http://en.wikipedia.org/wiki/UTF-8 for more info -inline unsigned int UTF8Len(unsigned char x) { - if (x < 0x80) return 1; - else if ((x >> 5) == 0x06) return 2; - else if ((x >> 4) == 0x0e) return 3; - else if ((x >> 3) == 0x1e) return 4; - else return 0; -} - -#endif diff --git a/src/tdict.cc b/src/tdict.cc deleted file mode 100644 index c00d20b8..00000000 --- a/src/tdict.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include "Ngram.h" -#include "dict.h" -#include "tdict.h" -#include "Vocab.h" - -using namespace std; - -Vocab* TD::dict_ = new Vocab; - -static const string empty; -static const string space = " "; - -WordID TD::Convert(const std::string& s) { - return dict_->addWord((VocabString)s.c_str()); -} - -const char* TD::Convert(const WordID& w) { - return dict_->getWord((VocabIndex)w); -} - -void TD::GetWordIDs(const std::vector& strings, std::vector* ids) { - ids->clear(); - for 
(vector::const_iterator i = strings.begin(); i != strings.end(); ++i) - ids->push_back(TD::Convert(*i)); -} - -std::string TD::GetString(const std::vector& str) { - string res; - for (vector::const_iterator i = str.begin(); i != str.end(); ++i) - res += (i == str.begin() ? empty : space) + TD::Convert(*i); - return res; -} - -void TD::ConvertSentence(const std::string& sent, std::vector* ids) { - string s = sent; - int last = 0; - ids->clear(); - for (int i=0; i < s.size(); ++i) - if (s[i] == 32 || s[i] == '\t') { - s[i]=0; - if (last != i) { - ids->push_back(Convert(&s[last])); - } - last = i + 1; - } - if (last != s.size()) - ids->push_back(Convert(&s[last])); -} - diff --git a/src/tdict.h b/src/tdict.h deleted file mode 100644 index 9d4318fe..00000000 --- a/src/tdict.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _TDICT_H_ -#define _TDICT_H_ - -#include -#include -#include "wordid.h" - -class Vocab; - -struct TD { - static Vocab* dict_; - static void ConvertSentence(const std::string& sent, std::vector* ids); - static void GetWordIDs(const std::vector& strings, std::vector* ids); - static std::string GetString(const std::vector& str); - static WordID Convert(const std::string& s); - static const char* Convert(const WordID& w); -}; - -#endif diff --git a/src/test_data/dummy.3gram.lm b/src/test_data/dummy.3gram.lm deleted file mode 100644 index ae665284..00000000 --- a/src/test_data/dummy.3gram.lm +++ /dev/null @@ -1,2645 +0,0 @@ - -\data\ -ngram 1=490 -ngram 2=1023 -ngram 3=1119 - -\1-grams: --2.761928 ! -0.06284945 --1.91683 " -0.03559465 --2.761928 ' -0.06057167 --2.159868 ( -0.07742823 --2.159868 ) -0.05637721 --1.292106 , -0.04497077 --3.062958 - -0.06247065 --1.429489 . -0.08555528 --2.761928 12 -0.06473851 --3.062958 17 -0.06586801 --2.585837 2000 -0.05520994 --3.062958 2002 -0.06360606 --3.062958 2006 -0.0497812 --3.062958 2008 -0.06322792 --3.062958 2009 -0.0497812 --3.062958 200–400 -0.06549184 --3.062958 224 -0.06586801 --1.91683 --99 -0.0457003 --2.761928 ? 
-0.05751594 --1.720535 a -0.05548429 --2.460898 about -0.05211611 --3.062958 acquiesced -0.05942829 --3.062958 actually -0.04349266 --3.062958 addition -0.05980976 --3.062958 admit -0.06095213 --3.062958 affected -0.04071253 --2.761928 against -0.06549184 --3.062958 aging -0.06586801 --3.062958 ago -0.04349266 --3.062958 ahead -0.06586801 --2.761928 al -0.06284945 --2.761928 all -0.0590465 --3.062958 all-around -0.06586801 --3.062958 along -0.04071253 --2.761928 also -0.06322792 --2.761928 always -0.06436136 --2.363988 an -0.06436135 --3.062958 analysis -0.06473851 --1.631594 and 0.006203346 --3.062958 anti-divine -0.06586801 --3.062958 any -0.06549184 --3.062958 approach -0.05789908 --3.062958 archive -0.04071253 --3.062958 are -0.05789908 --2.761928 arkive -0.06549184 --2.585837 article -0.0228177 --2.21786 as -0.09020901 --3.062958 asked -0.06398387 --2.585837 at -0.03145044 --2.761928 attention -0.02612664 --3.062958 available -0.04349266 --3.062958 average -0.04349266 --3.062958 away -0.06322792 --3.062958 ayers -0.05597997 --3.062958 b -0.04349266 --3.062958 back-and-forth -0.06586801 --3.062958 bailie -0.0497812 --2.761928 be -0.06511534 --3.062958 because -0.06586801 --2.460898 been -0.06322791 --3.062958 before -0.04349266 --2.761928 begin -0.05520995 --3.062958 being -0.06586801 --2.585837 between -0.1350269 --2.460898 bias -0.04111077 --3.062958 biased -0.06511534 --3.062958 biblical -0.06586801 --3.062958 bill -0.06586801 --3.062958 blade -0.06436136 --3.062958 blood -0.04349266 --3.062958 bob -0.06549184 --3.062958 book -0.06436136 --2.159868 briffa -0.06804922 --2.761928 briffa's -0.06284945 --2.021565 but -0.01525023 --2.21786 by -0.07600738 --2.761928 ca -0.2166343 --2.761928 can -0.06473851 --3.062958 case -0.06511534 --3.062958 cast -0.06473851 --3.062958 catch -0.06511534 --3.062958 caught -0.06511534 --3.062958 caveats -0.06322792 --3.062958 centennial-scale -0.06549184 --3.062958 cf -0.0497812 --3.062958 change -0.06209152 --3.062958 changing 
-0.06360606 --3.062958 characterizes -0.06586801 --3.062958 checked -0.06586801 --2.159868 chronology -0.02240231 --3.062958 church -0.06398387 --3.062958 cocaine -0.06398387 --3.062958 collection -0.06586801 --3.062958 combination -0.06209152 --3.062958 combine -0.04071253 --3.062958 combined -0.06209152 --3.062958 comment -0.06360606 --3.062958 commentary -0.06322792 --3.062958 commenter -0.06586801 --3.062958 comments -0.06586801 --3.062958 compared -0.05789908 --3.062958 concerned -0.06473851 --3.062958 concrete -0.06095213 --3.062958 connection -0.06209152 --2.761928 conservatives -0.06360606 --3.062958 considered -0.06095213 --3.062958 consists -0.04349266 --3.062958 constructing -0.05789908 --2.761928 control -0.03991493 --2.585837 cores -0.0236473 --3.062958 corridor -0.06473851 --2.761928 crack -0.06436136 --3.062958 crossroads -0.0497812 --2.460898 cru -0.1318786 --3.062958 darkness -0.05597997 --2.108715 data -0.06845023 --2.761928 day -0.05674864 --2.761928 days -0.04939082 --3.062958 debt -0.04349266 --3.062958 decline -0.06095213 --3.062958 deep -0.06549184 --3.062958 deeper -0.06586801 --3.062958 delete -0.05789908 --3.062958 derived -0.06511534 --3.062958 described -0.05942829 --2.761928 did -0.06095213 --2.761928 difference -0.04860901 --2.761928 different -0.06247065 --2.761928 divergence -0.2166343 --2.761928 do -0.05559513 --3.062958 does -0.06247065 --3.062958 doing -0.06586801 --3.062958 don't -0.06586801 --3.062958 done -0.06586801 --3.062958 doubt -0.06360606 --3.062958 down -0.05789908 --3.062958 due -0.06473851 --3.062958 earlier -0.06019088 --3.062958 editors -0.06511534 --3.062958 energy -0.04349266 --3.062958 enormous -0.06586801 --2.761928 et -0.2166343 --3.062958 even -0.06586801 --3.062958 every -0.06586801 --3.062958 exactly -0.06360606 --3.062958 exception -0.05789908 --3.062958 excluding -0.06549184 --3.062958 expect -0.06511534 --3.062958 extension -0.05597997 --3.062958 factors -0.04349266 --3.062958 fantasy -0.06436136 
--3.062958 far -0.06511534 --2.585837 few -0.1590744 --2.585837 finally -0.06511533 --3.062958 first -0.04349266 --3.062958 flesh -0.05597997 --3.062958 following: -0.06095213 --3.062958 follows: -0.06095213 --2.284806 for -0.06171204 --3.062958 forests -0.0497812 --2.585837 from -0.05713245 --3.062958 fully -0.06586801 --2.585837 further -0.06511533 --3.062958 furthermore -0.04349266 --3.062958 future -0.0497812 --3.062958 generating -0.06586801 --2.761928 get -0.191855 --3.062958 ghastly -0.06586801 --3.062958 ghostwritten -0.06360606 --3.062958 gil -0.06586801 --3.062958 given -0.04071253 --3.062958 going -0.05789908 --3.062958 got -0.06436136 --2.761928 great -0.2166343 --3.062958 growing -0.0497812 --3.062958 grows -0.06511534 --2.363988 had -0.1033177 --2.585837 hantemirov -0.09654189 --2.761928 happening -0.06436136 --3.062958 happens -0.06549184 --3.062958 hard -0.05789908 --3.062958 hardly -0.06473851 --2.460898 has -0.03063563 --3.062958 hate -0.05789908 --2.284806 have -0.08108715 --3.062958 haven't -0.06586801 --2.363988 he -0.112982 --3.062958 here -0.06586801 --3.062958 highly -0.06586801 --2.761928 him -0.05751594 --2.585837 his -0.06511533 --3.062958 how -0.06586801 --2.761928 however -0.1946352 --3.062958 hs -0.06586801 --3.062958 humanity -0.06511534 --2.108715 i -0.05980975 --3.062958 i'd -0.06586801 --3.062958 i've -0.06586801 --2.761928 idea -0.02612664 --2.761928 if -0.03670979 --3.062958 illusion -0.05597997 --3.062958 immense -0.06586801 --3.062958 impact -0.06322792 --3.062958 important -0.06586801 --1.807685 in -0.04419087 --3.062958 included -0.06209152 --2.761928 including -0.0165447 --3.062958 indeed -0.06511534 --3.062958 individual -0.06511534 --3.062958 information -0.06511534 --3.062958 inhomogeneities -0.04349266 --3.062958 initial -0.06549184 --2.761928 instead -0.2109523 --3.062958 interannual -0.06549184 --2.761928 into -0.03991493 --3.062958 introduced -0.06360606 --1.91683 is -0.001109093 --2.062958 it -0.06621437 --2.460898 
it's -0.06019088 --3.062958 its -0.06586801 --2.761928 journal -0.06209152 --3.062958 jurisdiction -0.0497812 --2.460898 just -0.05520994 --3.062958 kaufman -0.06549184 --3.062958 keeps -0.06586801 --2.761928 khadyta -0.2166343 --2.460898 know -0.1105378 --3.062958 larch -0.06586801 --2.761928 larches -0.04743365 --3.062958 large-scale -0.06095213 --2.761928 like -0.06511534 --3.062958 limited -0.06586801 --3.062958 living -0.06549184 --3.062958 longest -0.05597997 --3.062958 looking -0.06549184 --3.062958 looks -0.06586801 --3.062958 love -0.05789908 --3.062958 made -0.06095213 --2.761928 mag -0.2143704 --3.062958 magnitude -0.05980976 --3.062958 magnus -0.0497812 --3.062958 makes -0.04071253 --3.062958 many -0.06586801 --3.062958 may -0.06586801 --3.062958 mean -0.06322792 --3.062958 measured -0.06360606 --2.761928 measurement -0.213992 --2.460898 method -0.03711172 --3.062958 methodology -0.06586801 --3.062958 mind -0.06511534 --3.062958 mix -0.06586801 --2.585837 more -0.05636447 --3.062958 morning -0.06284945 --2.585837 most -0.0647385 --2.761928 much -0.06473851 --3.062958 multi-parters -0.04349266 --3.062958 multiproxy -0.06586801 --3.062958 mundane -0.06511534 --2.585837 my -0.1598284 --3.062958 national -0.06586801 --3.062958 naughtiness -0.0497812 --3.062958 nettle -0.04349266 --3.062958 never -0.06586801 --3.062958 next -0.04349266 --3.062958 no -0.06586801 --3.062958 non-robustness -0.06586801 --3.062958 northern -0.06586801 --2.062958 not -0.0712041 --3.062958 noted -0.06586801 --3.062958 noticed -0.06095213 --3.062958 notwithstanding -0.06473851 --3.062958 now -0.04349266 --2.761928 obama -0.03791448 --3.062958 observed -0.06586801 --1.832509 of -0.04850956 --2.761928 old -0.06436136 --2.585837 older -0.1053004 --3.062958 oldie -0.04349266 --2.159868 on -0.09226183 --2.585837 one -0.04900008 --3.062958 online -0.0497812 --3.062958 only -0.06586801 --3.062958 or -0.06586801 --3.062958 originated -0.06209152 --3.062958 osborn -0.05597997 --3.062958 out 
-0.06322792 --3.062958 outright -0.06586801 --3.062958 own -0.06586801 --3.062958 paleoclimatologists -0.05597997 --3.062958 passage -0.06284945 --3.062958 passing -0.05597997 --3.062958 path -0.06095213 --3.062958 patterns -0.05942829 --3.062958 paul -0.06436136 --3.062958 people -0.06095213 --2.363988 perhaps -0.06259563 --2.761928 phil -0.2166343 --3.062958 picked -0.06511534 --3.062958 piece -0.06360606 --3.062958 place -0.0497812 --3.062958 placed -0.06586801 --3.062958 play -0.06322792 --3.062958 point -0.06095213 --3.062958 policy -0.06322792 --2.585837 politics -0.02571439 --2.363988 population -0.1001791 --3.062958 position -0.06095213 --3.062958 possible -0.05597997 --2.761928 potential -0.06436136 --3.062958 power -0.05789908 --3.062958 powers -0.05597997 --3.062958 precipitous -0.06586801 --3.062958 precisely -0.04071253 --3.062958 predictable -0.06586801 --3.062958 presented -0.06019088 --3.062958 preserve -0.06586801 --3.062958 previous -0.06549184 --3.062958 principalities -0.05980976 --3.062958 principles -0.05942829 --3.062958 prior -0.06511534 --3.062958 probable -0.06095213 --2.761928 problem -0.2120946 --3.062958 projected -0.06549184 --3.062958 properly -0.06586801 --3.062958 prove -0.06586801 --3.062958 provide -0.04071253 --3.062958 provided -0.05789908 --3.062958 provocative -0.06586801 --3.062958 published -0.05942829 --3.062958 push -0.06511534 --2.585837 rcs -0.06133225 --3.062958 react -0.05789908 --3.062958 read -0.06247065 --2.761928 readers -0.06398387 --3.062958 reading -0.04349266 --3.062958 real -0.06322792 --3.062958 really -0.06586801 --3.062958 realm -0.05980976 --2.761928 reason -0.06360606 --3.062958 recent -0.06511534 --2.761928 recently -0.1946352 --3.062958 reconstruction -0.0497812 --3.062958 refusal -0.05942829 --3.062958 refused -0.05789908 --3.062958 related -0.05789908 --3.062958 relevant -0.04349266 --3.062958 relied -0.06322792 --3.062958 religion -0.05597997 --3.062958 remained -0.06586801 --3.062958 remarked 
-0.06095213 --3.062958 reposting -0.06473851 --3.062958 requiring -0.06322792 --3.062958 response -0.05789908 --3.062958 resulting -0.06322792 --3.062958 rev -0.0497812 --2.460898 right -0.04821757 --3.062958 ring -0.06586801 --3.062958 ring-width -0.06511534 --2.761928 river -0.1946352 --3.062958 said -0.06436136 --3.062958 same -0.06473851 --3.062958 sample -0.06586801 --3.062958 sat -0.05942829 --2.460898 schweingruber -0.09101291 --3.062958 schweingruber's -0.06549184 --2.585837 science -0.1568045 --3.062958 script -0.06322792 --2.585837 see -0.1112577 --3.062958 seized -0.04071253 --2.761928 selected -0.04664831 --2.585837 selection -0.1491516 --3.062958 sensitive -0.06511534 --3.062958 sensitivity -0.06095213 --2.585837 series -0.1314228 --3.062958 set -0.05942829 --3.062958 several -0.06549184 --3.062958 shadow -0.06586801 --2.761928 shadows -0.04309659 --2.585837 shiyatov -0.06360605 --3.062958 should -0.06247065 --3.062958 similar -0.06473851 --3.062958 similarly -0.06586801 --3.062958 since -0.06019088 --3.062958 size -0.05597997 --3.062958 skimmed -0.06019088 --2.761928 slowly -0.04270015 --3.062958 small -0.06586801 --3.062958 so -0.06549184 --3.062958 some -0.06549184 --3.062958 someone -0.06586801 --3.062958 start -0.06549184 --3.062958 staunchly -0.06586801 --3.062958 struggling -0.06549184 --3.062958 studies -0.06095213 --2.761928 study -0.02612664 --3.062958 stumbled -0.06586801 --2.585837 subfossil -0.06171205 --3.062958 subsequent -0.06549184 --3.062958 subset -0.05942829 --3.062958 success -0.0497812 --3.062958 supplement -0.0497812 --3.062958 supplemented -0.06360606 --3.062958 surface -0.04349266 --3.062958 take -0.06436136 --3.062958 taken -0.05789908 --2.761928 taymir -0.06247065 --3.062958 temperature -0.04349266 --3.062958 tendency -0.05789908 --3.062958 terms -0.05980976 --3.062958 than -0.04071253 --1.91683 that -0.06692892 --1.243414 the -0.08813193 --3.062958 their -0.06511534 --2.761928 themselves -0.04111078 --3.062958 there's 
-0.06586801 --2.460898 these -0.05942829 --2.460898 they -0.06398387 --2.761928 things -0.06057167 --3.062958 think -0.06549184 --3.062958 thinking -0.06586801 --1.858838 this -0.08175352 --2.761928 those -0.06057167 --3.062958 thought -0.0497812 --3.062958 thousand -0.04349266 --3.062958 through -0.04071253 --2.761928 time -0.0326698 --1.720535 to -0.07930601 --2.761928 today -0.04821758 --3.062958 took -0.04071253 --3.062958 towards -0.06511534 --2.761928 trans -0.06549184 --2.460898 trees -0.04704115 --2.761928 trouble -0.213234 --3.062958 true -0.04349266 --3.062958 trying -0.05789908 --2.761928 two -0.2166343 --3.062958 unarchived -0.0497812 --3.062958 under -0.06549184 --3.062958 unintentional -0.06473851 --3.062958 unrepresentativeness -0.05980976 --3.062958 until -0.06549184 --3.062958 unveiled: -0.06586801 --2.761928 up -0.03185729 --3.062958 upon -0.06019088 --2.761928 use -0.2109523 --2.363988 used -0.0545155 --2.761928 using -0.02323271 --3.062958 usual -0.06586801 --3.062958 valid -0.06549184 --2.761928 variability -0.03911585 --2.761928 versions -0.04428373 --2.761928 very -0.06549184 --3.062958 violence -0.06586801 --3.062958 virtually -0.06586801 --3.062958 virtue -0.05980976 --3.062958 voted -0.06398387 --3.062958 warn -0.06549184 --3.062958 warnings -0.04349266 --2.363988 was -0.06171205 --3.062958 way -0.06549184 --3.062958 we -0.06549184 --3.062958 well -0.06398387 --2.284806 were -0.07866543 --2.21786 what -0.02364731 --3.062958 what's -0.06549184 --2.585837 when -0.06057167 --2.585837 where -0.05597997 --2.460898 which -0.0403139 --2.585837 while -0.03951557 --3.062958 whose -0.06586801 --3.062958 why -0.06586801 --3.062958 widths -0.05597997 --2.761928 will -0.06322792 --3.062958 wise -0.06549184 --2.021565 with -0.08912028 --3.062958 within -0.06549184 --3.062958 without -0.06586801 --3.062958 worth -0.06586801 --2.460898 would -0.1303614 --3.062958 wright's -0.06586801 --3.062958 wrote -0.04071253 --2.159868 yamal -0.0719028 --2.761928 year 
-0.04270015 --3.062958 years -0.06549184 --3.062958 yes -0.04349266 --3.062958 yesterday -0.06473851 --3.062958 yet -0.04349266 --3.062958 you -0.06511534 --2.761928 your -0.06511534 - -\2-grams: --1.15037 ! as -0.004049858 --1.15037 ! instead 0.2044696 --1.995468 " ( -0.005168174 --1.995468 " - 0.05332709 --1.995468 " --1.995468 " as -0.004049858 --1.995468 " concrete 0.05332709 --1.995468 " corridor 0.05332709 --1.249819 " divergence 0.1451325 --1.995468 " further 0.008061528 --1.995468 " i'd 0.05332709 --1.995468 " success 0.05332709 --1.995468 " that -0.008505944 --1.995468 " the -0.007702977 --1.995468 " used -0.0004517734 --1.15037 ' --1.15037 ' yes 0.05332709 --1.75243 ( and -0.01063527 --1.75243 ( in 0.006514465 --1.006781 ( mag 0.1451325 --1.75243 ( or 0.05332709 --1.75243 ( phil 0.2044696 --1.75243 ( which 0.00272119 --1.75243 ( while 0.008061528 --1.006781 ) , -0.002172916 --1.75243 ) --1.75243 ) acquiesced 0.05332709 --1.75243 ) and -0.002266581 --1.75243 ) had -0.0004517734 --1.75243 ) things 0.01894335 --1.75243 ) took 0.05332709 --2.620192 , 2008 0.05332709 --2.620192 , 224 0.05332709 --2.620192 , a -0.01011507 --2.620192 , all 0.01894335 --1.955229 , and -0.006035992 --2.620192 , as 0.0389223 --2.620192 , bob 0.05332709 --2.620192 , briffa -0.005168174 --0.8166095 , but 0.05114232 --2.620192 , cf 0.05332709 --2.620192 , cru 0.00272119 --2.620192 , delete 0.05332709 --2.620192 , for -0.002554279 --2.620192 , from 0.008061528 --2.620192 , he -0.0004517734 --2.620192 , his 0.008061528 --1.955229 , i 0.008061524 --2.620192 , if 0.01894335 --2.620192 , including 0.01894335 --2.620192 , is -0.008505944 --1.874543 , it -0.0004517762 --1.874543 , it's 0.01894334 --2.620192 , kaufman 0.05332709 --2.620192 , most 0.008061528 --2.620192 , notwithstanding 0.05332709 --2.620192 , of 0.007685009 --2.620192 , on -0.005168174 --2.620192 , perhaps 0.04797027 --2.620192 , requiring 0.05332709 --2.620192 , since 0.05332709 --1.955229 , the 0.02331641 --1.955229 , this 
0.01715922 --2.620192 , until 0.05332709 --2.620192 , using 0.01894335 --1.874543 , when 0.03010483 --2.620192 , where 0.008061528 --1.874543 , which 0.01894334 --2.620192 , while 0.008061528 --2.620192 , yamal -0.005168174 --0.8493397 - not -0.006728992 --2.482808 . " -0.008505944 --2.482808 . ' 0.01894335 --2.482808 . ( -0.005168174 --2.482808 . ) -0.005168174 --0.6792259 . --1.737159 . a 0.003078613 --2.482808 . actually 0.05332709 --2.482808 . and -0.01063527 --2.482808 . as -0.004049858 --1.737159 . briffa 0.03257156 --2.482808 . but -0.007295175 --2.482808 . changing 0.05332709 --2.482808 . first 0.05332709 --2.482808 . furthermore 0.05332709 --1.737159 . however 0.1451325 --2.482808 . i -0.006035987 --2.482808 . in -0.009490006 --2.482808 . it 0.0164606 --2.482808 . perhaps 0.04797027 --2.482808 . science 0.1193421 --2.482808 . several 0.05332709 --2.482808 . the -0.008591395 --1.737159 . these 0.01894334 --1.737159 . this 0.0130633 --2.482808 . violence 0.05332709 --2.482808 . what -0.004049858 --2.482808 . what's 0.05332709 --2.482808 . while 0.008061528 --2.482808 . with 0.05785327 --2.482808 . wright's 0.05332709 --1.15037 12 cores 0.008061528 --1.15037 12 picked 0.05332709 --0.8493397 17 ring-width 0.05332709 --1.326461 2000 and -0.01063527 --1.326461 2000 may 0.05332709 --1.326461 2000 presented 0.05332709 --0.8493397 2002 as -0.004049858 --0.8493397 2006 . -0.0114856 --0.8493397 2008 ) -0.005168174 --0.8493397 2009 . 0.08907277 --0.8493397 200–400 year 0.01894335 --0.8493397 224 individual 0.05332709 --1.995468 ' 0.01894335 --1.995468 as 0.0389223 --1.995468 briffa's 0.01894335 --1.995468 but -0.007295175 --1.995468 i -0.006035987 --1.995468 if 0.01894335 --1.995468 in -0.009490006 --1.995468 next 0.05332709 --1.249819 perhaps 0.06234263 --1.249819 the 0.0223057 --1.995468 this -0.009059753 --1.995468 what -0.004049858 --1.15037 ? " -0.008505944 --1.15037 ? 
i -0.006035987 --2.191762 a " 0.01222976 --2.191762 a case 0.05332709 --2.191762 a comment 0.05332709 --2.191762 a commenter 0.05332709 --2.191762 a different 0.01894335 --1.5268 a few 0.109396 --2.191762 a generating 0.05332709 --2.191762 a great 0.2044696 --2.191762 a mean 0.05332709 --2.191762 a prior 0.05332709 --2.191762 a provocative 0.05332709 --2.191762 a rcs 0.008061528 --2.191762 a science 0.008061528 --2.191762 a shadow 0.05332709 --2.191762 a similar 0.05332709 --2.191762 a small 0.05332709 --2.191762 a surface 0.05332709 --2.191762 a thousand 0.05332709 --2.191762 a time 0.01894335 --2.191762 a valid 0.05332709 --1.4514 about a -0.01011507 --1.4514 about my 0.008061528 --1.4514 about not -0.006728992 --1.4514 about potential 0.01894335 --0.8493397 acquiesced in -0.009490006 --0.8493397 actually , -0.01187418 --0.8493397 addition of -0.009287588 --0.8493397 admit that 0.04168737 --0.8493397 affected the -0.01198488 --1.15037 against flesh 0.05332709 --1.15037 against inhomogeneities 0.05332709 --0.8493397 aging patterns 0.05332709 --0.8493397 ago , -0.008075343 --0.8493397 ahead you 0.05332709 --1.15037 al ( -0.005168174 --1.15037 al 2009 0.05332709 --1.15037 all of -0.009287588 --1.15037 all those 0.01894335 --0.8493397 all-around naughtiness 0.05332709 --0.8493397 along the -0.01198488 --1.15037 also has 0.00272119 --1.15037 also know 0.08231446 --1.15037 always been 0.00272119 --1.15037 always worth 0.05332709 --1.54831 an exception 0.05332709 --1.54831 an extension 0.05332709 --1.54831 an immense 0.05332709 --1.54831 an important 0.05332709 --1.54831 an unintentional 0.05332709 --0.8493397 analysis has 0.00272119 --2.280704 and , -0.007080218 --2.280704 and all-around 0.05332709 --2.280704 and blood 0.05332709 --2.280704 and briffa -0.005168174 --2.280704 and even 0.05332709 --2.280704 and got 0.05332709 --2.280704 and hantemirov 0.09388901 --2.280704 and he 0.06152429 --2.280704 and i've 0.05332709 --2.280704 and it -0.006728992 --2.280704 and most 
0.008061528 --2.280704 and outright 0.05332709 --2.280704 and perhaps -0.0004517734 --2.280704 and politics 0.008061528 --2.280704 and potential 0.01894335 --2.280704 and principalities 0.05332709 --2.280704 and sat 0.05332709 --2.280704 and science 0.1193421 --1.615741 and shiyatov 0.05332708 --2.280704 and temperature 0.05332709 --2.280704 and that -0.008505944 --1.615741 and the -0.005814605 --2.280704 and they 0.00272119 --0.8493397 anti-divine powers 0.05332709 --0.8493397 any journal 0.01894335 --0.8493397 approach to -0.01011507 --0.8493397 archive the -0.01198488 --0.8493397 are to -0.01011507 --1.15037 arkive down 0.05332709 --1.15037 arkive under 0.05332709 --1.326461 article , -0.007080218 --1.326461 article . -0.004888296 --1.326461 article on -0.005168174 --1.694438 as a -0.01011507 --0.9487888 as ca 0.1451325 --1.694438 as compared 0.05332709 --1.694438 as follows: 0.05332709 --1.694438 as it 0.0164606 --1.694438 as noted 0.05332709 --0.8493397 asked for -0.002554279 --1.326461 at a -0.01011507 --1.326461 at precisely 0.05332709 --1.326461 at the -0.01198488 --1.15037 attention , 0.05896524 --1.15037 attention . -0.0114856 --0.8493397 available , -0.008075343 --0.8493397 average , -0.01187418 --0.8493397 away ) 0.03209379 --0.8493397 ayers and -0.01063527 --0.8493397 b , -0.01187418 --0.8493397 back-and-forth yesterday 0.05332709 --0.8493397 bailie . 
-0.0114856 --1.15037 be happening 0.01894335 --1.15037 be included 0.05332709 --0.8493397 because so 0.05332709 --1.4514 been an -0.0004517734 --1.4514 been concerned 0.05332709 --1.4514 been done 0.05332709 --1.4514 been projected 0.05332709 --0.8493397 before , -0.01187418 --1.15037 begin in -0.009490006 --1.15037 begin with -0.007295175 --0.8493397 being true 0.05332709 --1.326461 between ring 0.05332709 --0.580812 between the -0.06704012 --1.4514 bias , -0.007080218 --1.4514 bias introduced 0.05332709 --1.4514 bias towards 0.05332709 --1.4514 bias would 0.08231446 --0.8493397 biased selection 0.1193421 --0.8493397 biblical passage 0.05332709 --0.8493397 bill ayers 0.05332709 --0.8493397 blade was -0.0004517734 --0.8493397 blood , 0.05896524 --0.8493397 bob ? 0.01894335 --0.8493397 book was -0.0004517734 --1.087467 briffa 2000 0.05332708 --1.75243 briffa 2006 0.05332709 --1.75243 briffa asked 0.05332709 --1.75243 briffa et 0.2044696 --1.75243 briffa to -0.01011507 --1.75243 briffa used -0.0004517734 --1.15037 briffa's own 0.05332709 --1.15037 briffa's yamal -0.005168174 --1.890732 but , -0.01187418 --1.890732 but anti-divine 0.05332709 --1.890732 but because 0.05332709 --1.890732 but between 0.1193421 --1.890732 but given 0.05332709 --1.145083 but it -0.0004517762 --1.890732 but it's 0.00272119 --1.890732 but the -0.01198488 --1.890732 but this 0.009005655 --1.890732 but to 0.002916232 --1.694438 by bill 0.05332709 --1.694438 by gil 0.05332709 --1.694438 by hantemirov 0.09388901 --1.694438 by how 0.05332709 --1.694438 by magnus 0.05332709 --0.9487888 by the -0.01105098 --0.4047208 ca readers 0.05332709 --1.15037 can combine 0.05332709 --1.15037 can see 0.1193421 --0.8493397 case where 0.008061528 --0.8493397 cast these 0.00272119 --0.8493397 catch my 0.1193421 --0.8493397 caught my 0.1193421 --0.8493397 caveats on -0.005168174 --0.8493397 centennial-scale variability 0.01894335 --0.8493397 cf . 
-0.0114856 --0.8493397 change with -0.007295175 --0.8493397 changing what -0.004049858 --0.8493397 characterizes northern 0.05332709 --0.8493397 checked earlier 0.05332709 --1.75243 chronology , -0.01187418 --1.75243 chronology also 0.01894335 --1.75243 chronology briffa -0.005168174 --1.75243 chronology has 0.00272119 --1.75243 chronology in -0.009490006 --1.75243 chronology method 0.00272119 --1.75243 chronology was -0.0004517734 --1.75243 chronology with -0.007295175 --0.8493397 church for -0.002554279 --0.8493397 cocaine for -0.002554279 --0.8493397 collection does 0.05332709 --0.8493397 combination with 0.05785327 --0.8493397 combine the -0.01198488 --0.8493397 combined with 0.05785327 --0.8493397 comment by -0.004049858 --0.8493397 commentary on 0.03209379 --0.8493397 commenter remarked 0.05332709 --0.8493397 comments catch 0.05332709 --0.8493397 compared to 0.02102831 --0.8493397 concerned about 0.00272119 --0.8493397 concrete " -0.008505944 --0.8493397 connection with -0.007295175 --1.15037 conservatives said 0.05332709 --1.15037 conservatives were -0.002554279 --0.8493397 considered " -0.008505944 --0.8493397 consists , -0.01187418 --0.8493397 constructing a -0.01011507 --1.15037 control ! 0.01894335 --1.15037 control the -0.01198488 --1.326461 cores , -0.008075343 --1.326461 cores . -0.004888296 --1.326461 cores were 0.04819728 --0.8493397 corridor method 0.00272119 --1.15037 crack about 0.00272119 --1.15037 crack cocaine 0.05332709 --0.8493397 crossroads . -0.0114856 --0.7057508 cru population 0.07636014 --1.4514 cru selection 0.008061528 --1.4514 cru staunchly 0.05332709 --0.8493397 darkness and -0.01063527 --1.803582 data ( -0.005168174 --1.057933 data . -0.0100497 --1.803582 data policy 0.05332709 --1.803582 data remained 0.05332709 --1.803582 data set 0.05332709 --1.803582 data used 0.04797027 --1.803582 data was -0.0004517734 --1.803582 data were -0.002554279 --1.15037 day politics 0.008061528 --1.15037 day to -0.01011507 --1.15037 days . 
0.08907277 --1.15037 days ago 0.05332709 --0.8493397 debt , -0.007080218 --0.8493397 decline is -0.008505944 --0.8493397 deep into 0.01894335 --0.8493397 deeper principles 0.05332709 --0.8493397 delete a 0.0001907796 --0.8493397 derived from 0.008061528 --0.8493397 described in -0.009490006 --1.15037 did not -0.006728992 --1.15037 did they 0.00272119 --1.15037 difference . 0.08907277 --1.15037 difference between 0.1193421 --1.15037 different aging 0.05332709 --1.15037 different data -0.006035987 --0.4047208 divergence problem 0.1451325 --1.15037 do and -0.002266581 --1.15037 do indeed 0.05332709 --0.8493397 does not 0.0164606 --0.8493397 doing exactly 0.05332709 --0.8493397 don't really 0.05332709 --0.8493397 done without 0.05332709 --0.8493397 doubt what -0.004049858 --0.8493397 down to -0.01011507 --0.8493397 due just 0.00272119 --0.8493397 earlier this -0.009059753 --0.8493397 editors finally 0.008061528 --0.8493397 energy , 0.05896524 --0.8493397 enormous hs 0.05332709 --0.4047208 et al 0.05332709 --0.8493397 even probable 0.05332709 --0.8493397 every subsequent 0.05332709 --0.8493397 exactly what -0.004049858 --0.8493397 exception to 0.02102831 --0.8493397 excluding khadyta 0.2044696 --0.8493397 expect from 0.008061528 --0.8493397 extension and -0.01063527 --0.8493397 factors , 0.05896524 --0.8493397 fantasy had -0.0004517734 --0.8493397 far more 0.008061528 --1.326461 few at 0.008061528 --0.580812 few days 0.05332709 --1.326461 finally available 0.05332709 --1.326461 finally placed 0.05332709 --1.326461 finally seized 0.05332709 --0.8493397 first , -0.01187418 --0.8493397 flesh and -0.01063527 --0.8493397 following: --0.8493397 follows: --1.627491 for all 0.01894335 --1.627491 for an -0.0004517734 --1.627491 for excluding 0.05332709 --1.627491 for him 0.01894335 --1.627491 for paleoclimatologists 0.05332709 --1.627491 for we 0.05332709 --0.8493397 forests . 
-0.004888296 --1.326461 from 200–400 0.05332709 --1.326461 from a -0.01011507 --1.326461 from someone 0.05332709 --0.8493397 fully thinking 0.05332709 --1.326461 further ahead 0.05332709 --1.326461 further along 0.05332709 --1.326461 further away 0.05332709 --0.8493397 furthermore , -0.007080218 --0.8493397 future . -0.0114856 --0.8493397 generating script 0.05332709 --0.4047208 get the -0.06704012 --0.8493397 ghastly tendency 0.05332709 --0.8493397 ghostwritten by -0.004049858 --0.8493397 gil bailie 0.05332709 --0.8493397 given the -0.01198488 --0.8493397 going to -0.01011507 --0.8493397 got used 0.04797027 --0.4047208 great idea 0.05332709 --0.8493397 growing . 0.08907277 --0.8493397 grows more 0.008061528 --0.8026608 had a -0.007295178 --1.54831 had been 0.00272119 --1.54831 had in -0.009490006 --1.54831 had jurisdiction 0.05332709 --0.6614985 hantemirov and -0.5914098 --1.15037 happening deep 0.05332709 --1.15037 happening right 0.00272119 --0.8493397 happens today 0.01894335 --0.8493397 hard to -0.01011507 --0.8493397 hardly know 0.00272119 --1.4514 has a -0.01011507 --1.4514 has always 0.01894335 --1.4514 has only 0.05332709 --1.4514 has the -0.01198488 --0.8493397 hate to -0.01011507 --1.627491 have an -0.0004517734 --0.881842 have been 0.01894334 --1.627491 have relied 0.05332709 --1.627491 have similarly 0.05332709 --1.627491 have the -0.01198488 --0.8493397 haven't read 0.05332709 --0.8026608 he is -0.004049861 --1.54831 he made 0.05332709 --1.54831 he would 0.00272119 --1.54831 he wrote 0.05332709 --0.8493397 here prove 0.05332709 --0.8493397 highly possible 0.05332709 --1.15037 him hate 0.05332709 --1.15037 him to 0.002916232 --1.326461 his comments 0.05332709 --1.326461 his initial 0.05332709 --1.326461 his precipitous 0.05332709 --0.8493397 how their 0.05332709 --0.4047208 however , -0.01082908 --0.8493397 hs blade 0.05332709 --0.8493397 humanity at 0.008061528 --1.803582 i can 0.01894335 --1.803582 i checked 0.05332709 --1.803582 i had 0.06152429 
--1.803582 i hardly 0.05332709 --1.803582 i haven't 0.05332709 --1.803582 i know 0.00272119 --1.803582 i noticed 0.05332709 --1.803582 i skimmed 0.05332709 --1.803582 i stumbled 0.05332709 --0.8493397 i'd love 0.05332709 --0.8493397 i've provided 0.05332709 --1.15037 idea , -0.01187418 --1.15037 idea . -0.0114856 --1.15037 if it -0.006728992 --1.15037 if the -0.01198488 --0.8493397 illusion and -0.01063527 --0.8493397 immense energy 0.05332709 --0.8493397 impact on -0.005168174 --0.8493397 important impact 0.05332709 --1.358963 in a -0.007295178 --2.104612 in any 0.05332709 --2.104612 in briffa 0.02412629 --2.104612 in briffa's 0.01894335 --2.104612 in combination 0.05332709 --2.104612 in connection 0.05332709 --2.104612 in hantemirov 0.09388901 --2.104612 in mind 0.05332709 --2.104612 in one 0.008061528 --2.104612 in passing 0.05332709 --2.104612 in response 0.05332709 --2.104612 in rev 0.05332709 --2.104612 in terms 0.05332709 --1.358963 in the -0.007650165 --2.104612 in this -0.009059753 --2.104612 in virtually 0.05332709 --0.8493397 included with 0.05785327 --1.15037 including , -0.01187418 --1.15037 including the -0.007702977 --0.8493397 indeed see 0.1193421 --0.8493397 individual series 0.008061528 --0.8493397 information finally 0.008061528 --0.8493397 inhomogeneities , 0.05896524 --0.8493397 initial use 0.2044696 --0.4047208 instead of 0.01149127 --0.8493397 interannual variability 0.01894335 --1.15037 into him 0.01894335 --1.15037 into the -0.01198488 --0.8493397 introduced by -0.004049858 --1.995468 is , -0.007080218 --1.995468 is always 0.01894335 --1.995468 is considered 0.05332709 --1.995468 is derived 0.05332709 --1.995468 is doing 0.05332709 --1.995468 is happening 0.01894335 --1.995468 is highly 0.05332709 --1.995468 is measured 0.05332709 --1.995468 is no 0.05332709 --1.995468 is not -0.006728992 --1.995468 is related 0.05332709 --1.995468 is that -0.008505944 --1.995468 is the -0.01198488 --1.995468 is within 0.05332709 --1.84934 it grows 
0.05332709 --1.84934 it has 0.00272119 --1.184377 it is 0.0004524188 --1.84934 it just 0.00272119 --1.84934 it looks 0.05332709 --1.84934 it originated 0.05332709 --1.84934 it was -0.0004517734 --1.84934 it yet 0.05332709 --1.4514 it's like 0.01894335 --1.4514 it's much 0.01894335 --1.4514 it's not -0.006728992 --1.4514 it's very 0.01894335 --0.8493397 its enormous 0.05332709 --1.15037 journal ( -0.005168174 --1.15037 journal article 0.008061528 --0.8493397 jurisdiction . -0.004888296 --1.4514 just between 0.008061528 --1.4514 just keeps 0.05332709 --1.4514 just one 0.008061528 --1.4514 just to 0.02102831 --0.8493397 kaufman et 0.2044696 --0.8493397 keeps growing 0.05332709 --0.4047208 khadyta river 0.1451325 --1.4514 know ! 0.01894335 --0.7057508 know , -0.007021053 --1.4514 know where 0.008061528 --0.8493397 larch sample 0.05332709 --1.15037 larches . 0.08907277 --1.15037 larches were 0.04819728 --0.8493397 large-scale " 0.01222976 --1.15037 like crack 0.01894335 --1.15037 like trying 0.05332709 --0.8493397 limited size 0.05332709 --0.8493397 living larches 0.01894335 --0.8493397 longest and -0.01063527 --0.8493397 looking up 0.01894335 --0.8493397 looks relevant 0.05332709 --0.8493397 love to -0.01011507 --0.8493397 made that -0.008505944 --0.4047208 mag ) 0.002721187 --0.8493397 magnitude of -0.009287588 --0.8493397 magnus . -0.0114856 --0.8493397 makes the -0.01198488 --0.8493397 many multiproxy 0.05332709 --0.8493397 may well 0.05332709 --0.8493397 mean chronology -0.005168174 --0.8493397 measured by 0.0389223 --0.4047208 measurement data 0.0009555696 --1.4514 method " -0.008505944 --1.4514 method . 
-0.004888296 --1.4514 method that -0.008505944 --1.4514 method which 0.00272119 --0.8493397 methodology warn 0.05332709 --0.8493397 mind when 0.008061528 --0.8493397 mix religion 0.05332709 --1.326461 more " -0.008505944 --1.326461 more it 0.0164606 --1.326461 more slowly 0.01894335 --0.8493397 morning i -0.006035987 --1.326461 most recent 0.05332709 --1.326461 most recently 0.2044696 --1.326461 most sensitive 0.05332709 --1.15037 much further 0.008061528 --1.15037 much illusion 0.05332709 --0.8493397 multi-parters , -0.01187418 --0.8493397 multiproxy studies 0.05332709 --0.8493397 mundane politics 0.008061528 --0.580812 my attention 0.05332709 --1.326461 my ghastly 0.05332709 --0.8493397 national debt 0.05332709 --0.8493397 naughtiness . -0.0114856 --0.8493397 nettle , -0.01187418 --0.8493397 never properly 0.05332709 --0.8493397 next , -0.008075343 --0.8493397 no doubt 0.05332709 --0.8493397 non-robustness observed 0.05332709 --0.8493397 northern forests 0.05332709 --1.84934 not be 0.01894335 --1.84934 not due 0.05332709 --1.84934 not going 0.05332709 --1.184377 not have 0.07243546 --1.84934 not just 0.00272119 --1.84934 not preserve 0.05332709 --1.84934 not struggling 0.05332709 --1.84934 not using 0.01894335 --0.8493397 noted before 0.05332709 --0.8493397 noticed that 0.04168737 --0.8493397 notwithstanding these 0.00272119 --0.8493397 now , 0.05896524 --1.15037 obama , -0.007080218 --1.15037 obama is -0.008505944 --0.8493397 observed here 0.05332709 --2.079789 of 17 0.05332709 --2.079789 of a -0.01011507 --2.079789 of being 0.05332709 --2.079789 of commentary 0.05332709 --2.079789 of darkness 0.05332709 --2.079789 of deeper 0.05332709 --2.079789 of his 0.008061528 --2.079789 of interannual 0.05332709 --2.079789 of mundane 0.05332709 --2.079789 of old 0.01894335 --1.33414 of older 0.03455187 --2.079789 of reposting 0.05332709 --2.079789 of subfossil 0.008061528 --1.33414 of the -0.06704012 --2.079789 of this -0.009059753 --1.15037 old living 0.05332709 --1.15037 
old trees 0.00272119 --0.6614985 older trees 0.03579502 --0.8493397 oldie , -0.008075343 --1.006781 on a -0.007295178 --1.75243 on average 0.05332709 --1.75243 on many 0.05332709 --1.75243 on rcs 0.008061528 --1.75243 on the -0.007702977 --1.006781 on this -0.005168174 --1.326461 one . -0.0114856 --1.326461 one approach 0.05332709 --1.326461 one oldie 0.05332709 --0.8493397 online . -0.0114856 --0.8493397 only taken 0.05332709 --0.8493397 or real 0.05332709 --0.8493397 originated with -0.007295175 --0.8493397 osborn and -0.01063527 --0.8493397 out ( -0.005168174 --0.8493397 outright fantasy 0.05332709 --0.8493397 own caveats 0.05332709 --0.8493397 paleoclimatologists and -0.01063527 --0.8493397 passage i -0.006035987 --0.8493397 passing and -0.01063527 --0.8493397 path " -0.008505944 --0.8493397 patterns in 0.006514465 --0.8493397 paul had -0.0004517734 --0.8493397 people that -0.008505944 --0.8833473 perhaps the -0.01011507 --1.54831 perhaps there's 0.05332709 --1.54831 perhaps they 0.00272119 --0.4047208 phil trans 0.05332709 --0.8493397 picked cores 0.008061528 --0.8493397 piece by -0.004049858 --0.8493397 place . -0.0114856 --0.8493397 placed online 0.05332709 --0.8493397 play on 0.03209379 --0.8493397 point that -0.008505944 --0.8493397 policy ) -0.005168174 --1.326461 politics , -0.01187418 --1.326461 politics . -0.004888296 --1.326461 politics are 0.05332709 --0.8026608 population . 
-0.0100497 --1.54831 population as -0.004049858 --1.54831 population consists 0.05332709 --1.54831 population instead 0.2044696 --0.8493397 position that 0.04168737 --0.8493397 possible and -0.01063527 --1.15037 potential bias 0.00272119 --1.15037 potential unrepresentativeness 0.05332709 --0.8493397 power to -0.01011507 --0.8493397 powers and -0.01063527 --0.8493397 precipitous decline 0.05332709 --0.8493397 precisely the -0.007702977 --0.8493397 predictable factors 0.05332709 --0.8493397 presented this 0.009005655 --0.8493397 preserve centennial-scale 0.05332709 --0.8493397 previous journal 0.01894335 --0.8493397 principalities of -0.009287588 --0.8493397 principles in 0.006514465 --0.8493397 prior selection 0.1193421 --0.8493397 probable that 0.04168737 --0.4047208 problem " -0.004049861 --0.8493397 projected into 0.01894335 --0.8493397 properly published 0.05332709 --0.8493397 prove out 0.05332709 --0.8493397 provide the -0.01198488 --0.8493397 provided a -0.01011507 --0.8493397 provocative thought 0.05332709 --0.8493397 published in -0.009490006 --0.8493397 push at 0.008061528 --1.326461 rcs chronology -0.005168174 --1.326461 rcs method 0.00272119 --1.326461 rcs methodology 0.05332709 --0.8493397 react to 0.002916232 --0.8493397 read it -0.006728992 --1.15037 readers also 0.01894335 --1.15037 readers know 0.08231446 --0.8493397 reading , -0.01187418 --0.8493397 real ) -0.005168174 --0.8493397 really react 0.05332709 --0.8493397 realm of -0.009287588 --1.15037 reason for -0.002554279 --1.15037 reason why 0.05332709 --0.8493397 recent one 0.008061528 --0.4047208 recently , -0.01082908 --0.8493397 reconstruction . 
-0.0114856 --0.8493397 refusal in -0.009490006 --0.8493397 refused to -0.01011507 --0.8493397 related to -0.01011507 --0.8493397 relevant , -0.008075343 --0.8493397 relied on 0.03209379 --0.8493397 religion and -0.01063527 --0.8493397 remained unarchived 0.05332709 --0.8493397 remarked that -0.008505944 --0.8493397 reposting just 0.00272119 --0.8493397 requiring briffa -0.005168174 --0.8493397 response to 0.02102831 --0.8493397 resulting yamal 0.02412629 --0.8493397 rev . -0.0114856 --1.4514 right . -0.0114856 --1.4514 right now 0.05332709 --1.4514 right place 0.05332709 --1.4514 right time 0.01894335 --0.8493397 ring widths 0.05332709 --0.8493397 ring-width series 0.1193421 --0.4047208 river , -0.01082908 --0.8493397 said he -0.0004517734 --0.8493397 same bias 0.00272119 --0.8493397 sample should 0.05332709 --0.8493397 sat in -0.009490006 --1.4514 schweingruber data -0.006035987 --0.7864373 schweingruber population 0.09172077 --0.8493397 schweingruber's khadyta 0.2044696 --0.580812 science ( -0.02724335 --1.326461 science article 0.008061528 --0.8493397 script ) 0.03209379 --1.326461 see , -0.008075343 --0.580812 see the -0.01105098 --0.8493397 seized the -0.01198488 --1.15037 selected . -0.004888296 --1.15037 selected on 0.03209379 --1.326461 selection is -0.008505944 --0.580812 selection of 0.01149127 --0.8493397 sensitive series 0.1193421 --0.8493397 sensitivity is -0.008505944 --0.580812 series , -0.01082908 --1.326461 series of -0.009287588 --0.8493397 set in -0.009490006 --0.8493397 several things 0.01894335 --0.8493397 shadow play 0.05332709 --1.15037 shadows . 
-0.0114856 --1.15037 shadows of -0.009287588 --1.326461 shiyatov 2002 0.05332709 --1.326461 shiyatov themselves 0.01894335 --1.326461 shiyatov would 0.08231446 --0.8493397 should not -0.006728992 --0.8493397 similar schweingruber 0.00272119 --0.8493397 similarly affected 0.05332709 --0.8493397 since this -0.009059753 --0.8493397 size and -0.01063527 --0.8493397 skimmed this -0.009059753 --1.15037 slowly , -0.01187418 --1.15037 slowly get 0.2044696 --0.8493397 small push 0.05332709 --0.8493397 so much 0.01894335 --0.8493397 some reason 0.01894335 --0.8493397 someone whose 0.05332709 --0.8493397 start today 0.01894335 --0.8493397 staunchly refused 0.05332709 --0.8493397 struggling against 0.01894335 --0.8493397 studies that -0.008505944 --1.15037 study , -0.01187418 --1.15037 study . 0.08907277 --0.8493397 stumbled upon 0.05332709 --1.326461 subfossil collection 0.05332709 --1.326461 subfossil data 0.02685598 --1.326461 subfossil larches 0.01894335 --0.8493397 subsequent study 0.01894335 --0.8493397 subset in -0.009490006 --0.8493397 success . -0.0114856 --0.8493397 supplement . 
0.08907277 --0.8493397 supplemented by 0.0389223 --0.8493397 surface , -0.008075343 --0.8493397 take an -0.0004517734 --0.8493397 taken a 0.0001907796 --1.15037 taymir data -0.006035987 --1.15037 taymir supplement 0.05332709 --0.8493397 temperature , 0.05896524 --0.8493397 tendency to -0.01011507 --0.8493397 terms of -0.009287588 --0.8493397 than the -0.008591395 --1.995468 that " -0.008505944 --1.995468 that cast 0.05332709 --1.995468 that characterizes 0.05332709 --1.995468 that have -0.002554279 --1.995468 that he 0.06152429 --1.995468 that his 0.008061528 --0.9275748 that the 0.03271748 --1.995468 that they 0.00272119 --1.995468 that voted 0.05332709 --1.995468 that way 0.05332709 --1.995468 that wise 0.05332709 --2.668884 the " -0.008505944 --1.923235 the 12 0.05332709 --2.668884 the addition 0.05332709 --1.923235 the arkive 0.05332709 --2.668884 the back-and-forth 0.05332709 --2.668884 the biased 0.05332709 --2.668884 the biblical 0.05332709 --2.668884 the chronology -0.005168174 --1.923235 the conservatives 0.05332709 --2.668884 the crossroads 0.05332709 --2.003921 the cru 0.0632299 --2.668884 the data 0.02685598 --2.668884 the day 0.01894335 --2.668884 the difference 0.01894335 --2.668884 the far 0.05332709 --2.668884 the following: 0.05332709 --2.668884 the further 0.008061528 --2.668884 the future 0.05332709 --2.668884 the information 0.05332709 --2.668884 the large-scale 0.05332709 --2.668884 the longest 0.05332709 --2.668884 the magnitude 0.05332709 --2.668884 the measurement 0.2044696 --2.668884 the more 0.008061528 --2.668884 the most 0.008061528 --2.668884 the multi-parters 0.05332709 --2.668884 the national 0.05332709 --2.668884 the nettle 0.05332709 --2.668884 the non-robustness 0.05332709 --2.668884 the path 0.05332709 --2.668884 the people 0.05332709 --2.668884 the phil 0.2044696 --2.668884 the point 0.05332709 --2.668884 the position 0.05332709 --2.668884 the previous 0.05332709 --2.668884 the rcs 0.008061528 --2.668884 the realm 0.05332709 
--2.668884 the resulting 0.05332709 --1.923235 the right 0.01894334 --2.668884 the same 0.05332709 --2.003921 the schweingruber -0.5245172 --2.668884 the shadows 0.01894335 --2.668884 the subfossil 0.008061528 --1.923235 the taymir 0.05332709 --1.923235 the trouble 0.1451325 --1.923235 the two 0.1451325 --2.668884 the use 0.2044696 --2.668884 the usual 0.05332709 --2.668884 the very 0.01894335 --2.668884 the virtue 0.05332709 --1.120574 the yamal 0.02719982 --0.8493397 their cores 0.008061528 --1.15037 themselves , -0.01187418 --1.15037 themselves were -0.002554279 --0.8493397 there's some 0.05332709 --1.4514 these data -0.006035987 --1.4514 these shadows 0.01894335 --1.4514 these warnings 0.05332709 --1.4514 these were -0.002554279 --1.4514 they can 0.01894335 --1.4514 they don't 0.05332709 --1.4514 they expect 0.05332709 --1.4514 they themselves 0.01894335 --1.15037 things caught 0.05332709 --1.15037 things that -0.008505944 --0.8493397 think up 0.01894335 --0.8493397 thinking through 0.05332709 --2.05346 this analysis 0.05332709 --2.05346 this article 0.008061528 --2.05346 this bias 0.00272119 --1.307811 this chronology 0.002721187 --2.05346 this difference 0.01894335 --1.307811 this is -0.004049861 --2.05346 this method 0.00272119 --2.05346 this morning 0.05332709 --2.05346 this piece 0.05332709 --2.05346 this refusal 0.05332709 --2.05346 this study 0.01894335 --2.05346 this subset 0.05332709 --2.05346 this will 0.01894335 --2.05346 this year 0.01894335 --1.15037 those " -0.008505944 --1.15037 those years 0.05332709 --0.8493397 thought . 
-0.0114856 --0.8493397 thousand , 0.05896524 --0.8493397 through the -0.01198488 --1.15037 time , -0.008075343 --1.15037 time and -0.002266581 --2.191762 to about 0.00272119 --2.191762 to admit 0.05332709 --2.191762 to archive 0.05332709 --1.446113 to begin 0.05332709 --2.191762 to change 0.05332709 --2.191762 to constructing 0.05332709 --2.191762 to control 0.01894335 --2.191762 to day 0.01894335 --2.191762 to different 0.01894335 --2.191762 to get 0.2044696 --2.191762 to mix 0.05332709 --2.191762 to provide 0.05332709 --2.191762 to start 0.05332709 --1.123869 to the -0.005761562 --2.191762 to think 0.05332709 --2.191762 to those 0.01894335 --1.446113 to what 0.005001867 --1.15037 today . -0.0114856 --1.15037 today would 0.00272119 --0.8493397 took the -0.01198488 --0.8493397 towards older 0.09388901 --1.15037 trans b 0.05332709 --1.15037 trans editors 0.05332709 --1.4514 trees . -0.0114856 --1.4514 trees an -0.0004517734 --1.4514 trees described 0.05332709 --1.4514 trees than 0.05332709 --0.4047208 trouble with -0.03998877 --0.8493397 true , -0.01187418 --0.8493397 trying to -0.01011507 --0.4047208 two versions 0.05332709 --0.8493397 unarchived . -0.004888296 --0.8493397 under control 0.01894335 --0.8493397 unintentional bias 0.00272119 --0.8493397 unrepresentativeness of 0.007685009 --0.8493397 until recently 0.2044696 --0.8493397 unveiled: humanity 0.05332709 --1.15037 up a -0.01011507 --1.15037 up the -0.01198488 --0.8493397 upon this -0.009059753 --0.4047208 use of -0.005627823 --1.54831 used by -0.004049858 --0.8833473 used in 0.01371272 --1.54831 used the -0.01198488 --1.15037 using . 0.08907277 --1.15037 using the -0.008591395 --0.8493397 usual predictable 0.05332709 --0.8493397 valid reason 0.01894335 --1.15037 variability . -0.004888296 --1.15037 variability and -0.01063527 --1.15037 versions . 
0.08907277 --1.15037 versions is -0.008505944 --1.15037 very hard 0.05332709 --1.15037 very limited 0.05332709 --0.8493397 violence unveiled: 0.05332709 --0.8493397 virtually every 0.05332709 --0.8493397 virtue of -0.009287588 --0.8493397 voted for -0.002554279 --0.8493397 warn against 0.01894335 --0.8493397 warnings , -0.01187418 --1.54831 was finally 0.008061528 --1.54831 was ghostwritten 0.05332709 --1.54831 was like 0.01894335 --1.54831 was never 0.05332709 --1.54831 was used 0.04797027 --0.8493397 way slowly 0.01894335 --0.8493397 we do 0.01894335 --0.8493397 well have 0.04819728 --1.627491 were not -0.006728992 --1.627491 were right 0.00272119 --0.881842 were selected 0.05332709 --1.627491 were supplemented 0.05332709 --1.627491 were the -0.01198488 --1.694438 what a -0.01011507 --1.694438 what did 0.01894335 --1.694438 what happens 0.05332709 --1.694438 what is -0.008505944 --1.694438 what paul 0.05332709 --1.694438 what the -0.007702977 --1.694438 what will 0.01894335 --0.8493397 what's your 0.01894335 --1.326461 when combined 0.05332709 --1.326461 when he -0.0004517734 --1.326461 when i -0.006035987 --1.326461 where it's 0.00272119 --1.326461 where sensitivity 0.05332709 --1.326461 where to 0.002916232 --1.4514 which , -0.01187418 --1.4514 which did 0.01894335 --1.4514 which had 0.06152429 --1.4514 which makes 0.05332709 --1.326461 while including 0.01894335 --1.326461 while looking 0.05332709 --1.326461 while the 0.02129733 --0.8493397 whose book 0.05332709 --0.8493397 why schweingruber's 0.05332709 --0.8493397 widths and -0.01063527 --1.15037 will be 0.01894335 --1.15037 will have -0.002554279 --0.8493397 wise crack 0.01894335 --1.890732 with . 
-0.004888296 --1.890732 with a -0.01011507 --1.890732 with briffa 0.02412629 --1.890732 with its 0.05332709 --1.145083 with obama 0.05332709 --1.890732 with osborn 0.05332709 --0.8228394 with the 0.02898683 --0.8493397 within your 0.01894335 --0.8493397 without fully 0.05332709 --0.8493397 worth reading 0.05332709 --1.4514 would do 0.01894335 --0.7057508 would not -0.04287655 --1.4514 would take 0.05332709 --0.8493397 wright's church 0.05332709 --0.8493397 wrote the -0.01198488 --1.087467 yamal chronology 0.01075652 --1.75243 yamal data -0.006035987 --1.75243 yamal larch 0.05332709 --1.75243 yamal measurement 0.2044696 --1.75243 yamal reconstruction 0.05332709 --1.75243 yamal subfossil 0.008061528 --1.15037 year , -0.008075343 --1.15037 year old 0.01894335 --0.8493397 years ? 0.01894335 --0.8493397 yes , -0.01187418 --0.8493397 yesterday about 0.00272119 --0.8493397 yet , 0.05896524 --0.8493397 you see 0.008061528 --1.15037 your great 0.2044696 --1.15037 your power 0.05332709 - -\3-grams: --1.533073 control ! as --1.533073 know ! instead --1.533073 . " i'd --1.533073 ? " --1.533073 a " divergence --1.533073 concrete " ( --1.533073 considered " success --1.533073 large-scale " divergence --1.533073 method " used --1.533073 more " concrete --1.533073 path " as --1.834103 problem " - --1.834103 problem " that --1.533073 that " the --1.533073 the " corridor --1.533073 those " further --1.533073 . ' --1.533073 ' yes --1.533073 " ( or --1.533073 . ( while --1.533073 al ( phil --1.533073 data ( in --1.533073 journal ( which --1.533073 out ( and --0.8145491 science ( mag --1.533073 . 
) --1.533073 2008 ) and --1.533073 away ) , --1.834103 mag ) acquiesced --1.834103 mag ) took --1.533073 policy ) had --1.533073 real ) things --1.533073 script ) , --1.834103 ) , it's --1.834103 ) , this --1.533073 actually , all --1.533073 ago , i --1.533073 and , when --1.533073 article , it --1.533073 attention , but --1.533073 available , this --1.533073 average , of --1.533073 b , 2008 --1.533073 before , briffa --1.533073 bias , when --1.533073 blood , but --1.533073 but , notwithstanding --1.533073 chronology , 224 --1.533073 consists , on --1.533073 cores , this --1.533073 debt , which --1.533073 energy , but --1.533073 factors , but --1.533073 first , a --1.533073 furthermore , it --1.834103 however , as --1.834103 however , using --1.533073 idea , bob --1.533073 including , most --1.533073 inhomogeneities , but --1.533073 is , it's --1.834103 know , the --1.834103 know , until --1.533073 multi-parters , delete --1.533073 nettle , requiring --1.533073 next , i --1.533073 now , but --1.533073 obama , which --1.533073 oldie , i --1.533073 politics , he --1.533073 reading , cf --1.834103 recently , cru --1.834103 recently , kaufman --1.533073 relevant , and --1.834103 river , while --1.834103 river , yamal --1.533073 see , the --1.834103 series , from --1.834103 series , where --1.533073 slowly , is --1.533073 study , including --1.533073 surface , and --1.533073 temperature , but --1.533073 themselves , since --1.533073 thousand , but --1.533073 time , and --1.533073 true , for --1.533073 warnings , his --1.533073 which , if --1.533073 year , the --1.533073 yes , perhaps --1.533073 yet , but --1.533073 " - not --1.533073 2006 . while --1.533073 2009 . --1.533073 article . however --1.533073 attention . first --1.533073 bailie . i --1.533073 cf . violence --1.533073 cores . briffa --1.533073 crossroads . ) --1.834103 data . as --1.834103 data . but --1.533073 days . --1.533073 difference . --1.533073 forests . however --1.533073 future . 
changing --1.533073 growing . --1.533073 idea . what's --1.533073 jurisdiction . briffa --1.533073 larches . --1.533073 magnus . actually --1.533073 method . this --1.533073 naughtiness . ( --1.533073 one . in --1.533073 online . with --1.533073 place . ' --1.533073 politics . this --1.834103 population . it --1.834103 population . the --1.533073 reconstruction . science --1.533073 rev . wright's --1.533073 right . what --1.533073 selected . these --1.533073 shadows . and --1.533073 study . --1.533073 success . " --1.533073 supplement . --1.533073 thought . furthermore --1.533073 today . several --1.533073 trees . perhaps --1.533073 unarchived . a --1.533073 using . --1.533073 variability . these --1.533073 versions . --1.533073 with . a --1.834103 the 12 cores --1.834103 the 12 picked --1.533073 of 17 ring-width --2.010194 briffa 2000 and --2.010194 briffa 2000 may --2.010194 briffa 2000 presented --1.533073 shiyatov 2002 as --1.533073 briffa 2006 . --1.533073 , 2008 ) --1.533073 al 2009 . --1.533073 from 200–400 year --1.533073 , 224 individual --1.533073 bob ? i --1.533073 years ? " --1.533073 , a comment --1.834103 . a commenter --1.834103 . a few --1.533073 about a thousand --1.533073 as a shadow --1.533073 at a time --1.533073 constructing a mean --1.533073 delete a few --1.533073 from a prior --1.834103 had a different --1.834103 had a great --1.533073 has a " --1.834103 in a case --1.834103 in a science --1.533073 of a similar --1.834103 on a rcs --1.834103 on a surface --1.533073 provided a generating --1.533073 taken a few --1.533073 up a valid --1.533073 what a provocative --1.533073 with a small --1.533073 concerned about potential --1.533073 crack about not --1.533073 to about a --1.533073 yesterday about my --1.533073 ) acquiesced in --1.533073 . 
actually , --1.533073 the addition of --1.533073 to admit that --1.533073 similarly affected the --1.533073 struggling against flesh --1.533073 warn against inhomogeneities --1.533073 different aging patterns --1.533073 days ago , --1.533073 further ahead you --1.834103 et al ( --1.834103 et al 2009 --1.533073 , all of --1.533073 for all those --1.533073 and all-around naughtiness --1.533073 further along the --1.533073 chronology also has --1.533073 readers also know --1.533073 has always been --1.533073 is always worth --1.533073 been an exception --1.533073 for an extension --1.533073 have an important --1.533073 take an immense --1.533073 trees an unintentional --1.533073 this analysis has --1.533073 ( and i've --1.533073 ) and the --2.010194 , and he --2.010194 , and that --2.010194 , and they --1.533073 . and perhaps --1.533073 2000 and science --1.533073 ayers and sat --1.533073 darkness and all-around --1.533073 do and the --1.533073 extension and , --1.533073 flesh and blood --0.1249387 hantemirov and shiyatov --1.533073 illusion and outright --1.533073 longest and most --1.533073 osborn and briffa --1.533073 paleoclimatologists and got --1.533073 passing and it --1.533073 possible and even --1.533073 powers and principalities --1.533073 religion and politics --1.533073 size and potential --1.533073 time and the --1.533073 variability and hantemirov --1.533073 widths and temperature --1.533073 but anti-divine powers --1.533073 in any journal --1.533073 one approach to --1.533073 to archive the --1.533073 politics are to --1.834103 the arkive down --1.834103 the arkive under --1.533073 journal article . --1.533073 science article , --1.533073 this article on --1.533073 ! as it --1.533073 " as a --1.533073 , as ca --1.533073 . 
as noted --1.533073 2002 as follows: --1.533073 as ca --1.533073 population as compared --1.533073 briffa asked for --1.533073 few at a --1.533073 humanity at the --1.533073 push at precisely --1.834103 my attention , --1.834103 my attention . --1.533073 finally available , --1.533073 on average , --1.533073 further away ) --1.533073 bill ayers and --1.533073 trans b , --1.533073 the back-and-forth yesterday --1.533073 gil bailie . --1.533073 not be included --1.533073 will be happening --1.533073 but because so --1.533073 always been an --1.533073 had been projected --1.834103 have been concerned --1.834103 have been done --1.533073 noted before , --1.834103 to begin in --1.834103 to begin with --1.533073 of being true --1.533073 but between the --1.533073 difference between the --1.533073 just between ring --1.533073 potential bias introduced --1.533073 same bias towards --1.533073 this bias would --1.533073 unintentional bias , --1.533073 the biased selection --1.533073 the biblical passage --1.533073 by bill ayers --1.533073 hs blade was --1.533073 and blood , --1.533073 , bob ? --1.533073 whose book was --1.533073 , briffa asked --1.834103 . briffa 2000 --1.834103 . briffa used --1.533073 and briffa 2006 --1.533073 chronology briffa et --1.533073 in briffa 2000 --1.533073 requiring briffa to --1.533073 with briffa 2000 --1.533073 briffa's own --1.533073 in briffa's yamal --2.487315 , but , --2.487315 , but anti-divine --2.487315 , but because --2.487315 , but between --1.467762 , but it --2.487315 , but the --2.487315 , but this --2.487315 , but to --1.533073 . 
but given --1.533073 but it's --1.533073 comment by magnus --1.533073 ghostwritten by bill --1.533073 introduced by how --1.533073 measured by the --1.533073 piece by gil --1.533073 supplemented by the --1.533073 used by hantemirov --0.8145491 as ca readers --1.533073 i can combine --1.533073 they can see --1.533073 a case where --1.533073 that cast these --1.533073 comments catch my --1.533073 things caught my --1.533073 own caveats on --1.533073 preserve centennial-scale variability --1.533073 , cf . --1.533073 to change with --1.533073 . changing what --1.533073 that characterizes northern --1.533073 i checked earlier --1.533073 mean chronology , --1.533073 rcs chronology method --1.533073 the chronology briffa --1.834103 this chronology also --1.834103 this chronology in --2.010194 yamal chronology has --2.010194 yamal chronology was --2.010194 yamal chronology with --1.533073 wright's church for --1.533073 crack cocaine for --1.533073 subfossil collection does --1.533073 in combination with --1.533073 can combine the --1.533073 when combined with --1.533073 a comment by --1.533073 of commentary on --1.533073 a commenter remarked --1.533073 his comments catch --1.533073 as compared to --1.533073 been concerned about --1.533073 " concrete " --1.533073 in connection with --1.834103 the conservatives said --1.834103 the conservatives were --1.533073 is considered " --1.533073 population consists , --1.533073 to constructing a --1.533073 to control the --1.533073 under control ! --1.533073 12 cores . --1.533073 picked cores , --1.533073 their cores were --1.533073 " corridor method --1.533073 like crack cocaine --1.533073 wise crack about --1.533073 the crossroads . --1.533073 , cru staunchly --0.9906404 the cru population --2.010194 the cru selection --1.533073 of darkness and --1.533073 different data policy --1.834103 measurement data remained --1.834103 measurement data used --1.533073 schweingruber data set --1.533073 subfossil data . 
--1.533073 taymir data ( --1.533073 the data . --1.533073 these data were --1.533073 yamal data was --1.533073 the day to --1.533073 to day politics --1.834103 few days . --1.834103 few days ago --1.533073 national debt , --1.533073 precipitous decline is --1.533073 happening deep into --1.533073 of deeper principles --1.533073 , delete a --1.533073 is derived from --1.533073 trees described in --1.533073 what did they --1.533073 which did not --1.533073 the difference between --1.533073 this difference . --1.533073 a different data --1.533073 to different aging --0.8145491 " divergence problem --1.533073 we do indeed --1.533073 would do and --1.533073 collection does not --1.533073 is doing exactly --1.533073 they don't really --1.533073 been done without --1.533073 no doubt what --1.533073 arkive down to --1.533073 not due just --1.533073 checked earlier this --1.533073 trans editors finally --1.533073 immense energy , --1.533073 its enormous hs --1.533073 briffa et al --1.533073 kaufman et al --1.533073 and even probable --1.533073 virtually every subsequent --1.533073 doing exactly what --1.533073 an exception to --1.533073 for excluding khadyta --1.533073 they expect from --1.533073 an extension and --1.533073 predictable factors , --1.533073 outright fantasy had --1.533073 the far more --2.010194 a few at --0.9906404 a few days --1.533073 editors finally seized --1.533073 information finally available --1.533073 was finally placed --1.533073 . first , --1.533073 against flesh and --1.533073 the following: --1.533073 as follows: --1.533073 , for we --1.533073 asked for an --1.533073 church for all --1.533073 cocaine for paleoclimatologists --1.533073 reason for excluding --1.533073 voted for him --1.533073 northern forests . --1.533073 , from 200–400 --1.533073 derived from a --1.533073 expect from someone --1.533073 without fully thinking --1.533073 " further along --1.533073 much further away --1.533073 the further ahead --1.533073 . 
furthermore , --1.533073 the future . --1.533073 a generating script --1.533073 slowly get the --1.533073 to get the --1.533073 my ghastly tendency --1.533073 was ghostwritten by --1.533073 by gil bailie --1.533073 but given the --1.533073 not going to --1.533073 and got used --1.533073 a great idea --1.533073 your great idea --1.533073 keeps growing . --1.533073 it grows more --1.533073 ) had jurisdiction --1.533073 fantasy had been --1.533073 i had a --1.533073 paul had in --1.533073 which had a --1.533073 and hantemirov and --1.533073 by hantemirov and --1.533073 in hantemirov and --1.533073 be happening deep --1.533073 is happening right --1.533073 what happens today --1.533073 very hard to --1.533073 i hardly know --1.533073 also has a --1.533073 analysis has only --1.533073 chronology has always --1.533073 it has the --1.533073 him hate to --2.010194 not have been --2.010194 not have similarly --2.010194 not have the --1.533073 that have relied --1.533073 well have been --1.533073 will have an --1.533073 i haven't read --1.533073 , he wrote --1.533073 and he is --1.533073 said he would --1.533073 that he is --1.533073 when he made --1.533073 observed here prove --1.533073 is highly possible --1.533073 for him hate --1.533073 into him to --1.533073 , his initial --1.533073 of his comments --1.533073 that his precipitous --1.533073 by how their --0.8145491 . however , --1.533073 enormous hs blade --1.533073 unveiled: humanity at --2.010194 , i can --2.010194 , i noticed --2.010194 , i skimmed --1.533073 . i haven't --1.533073 i hardly --1.533073 ? i know --1.533073 morning i had --1.533073 passage i stumbled --1.533073 when i checked --1.533073 " i'd love --1.533073 and i've provided --1.834103 great idea , --1.834103 great idea . --1.533073 , if it --1.533073 if the --1.533073 much illusion and --1.533073 an immense energy --1.533073 important impact on --1.533073 an important impact --1.533073 ( in a --1.533073 . 
in response --1.533073 in one --1.533073 acquiesced in this --1.533073 begin in terms --1.533073 chronology in passing --1.533073 described in hantemirov --1.533073 had in mind --1.533073 patterns in the --1.533073 principles in the --1.533073 published in any --1.533073 refusal in connection --1.533073 sat in rev --1.533073 set in combination --1.533073 subset in briffa --2.010194 used in a --2.010194 used in briffa's --2.010194 used in virtually --1.533073 be included with --1.533073 , including , --1.533073 while including the --1.533073 do indeed see --1.533073 224 individual series --1.533073 the information finally --1.533073 against inhomogeneities , --1.533073 his initial use --1.533073 ! instead of --1.533073 population instead of --1.533073 of interannual variability --1.533073 deep into the --1.533073 projected into him --1.533073 bias introduced by --1.533073 , is considered --1.533073 decline is not --1.834103 he is always --1.834103 he is doing --2.010194 it is , --2.010194 it is highly --2.010194 it is within --1.533073 obama is that --1.533073 selection is derived --1.533073 sensitivity is measured --1.834103 this is no --1.834103 this is the --1.533073 versions is related --1.533073 what is happening --1.834103 , it has --1.834103 , it originated --1.533073 . it is --1.533073 and it was --1.533073 as it is --1.834103 but it just --1.834103 but it looks --1.533073 if it grows --1.533073 more it is --1.533073 read it yet --1.834103 , it's like --1.834103 , it's very --1.533073 but it's not --1.533073 where it's much --1.533073 with its enormous --1.533073 any journal article --1.533073 previous journal ( --1.533073 had jurisdiction . --1.533073 due just to --1.533073 it just keeps --1.533073 not just between --1.533073 reposting just one --1.533073 , kaufman et --1.533073 just keeps growing --1.533073 excluding khadyta river --1.533073 schweingruber's khadyta river --1.533073 also know , --1.533073 hardly know where --1.533073 i know ! 
--1.533073 readers know , --1.533073 yamal larch sample --1.533073 living larches . --1.533073 subfossil larches were --1.533073 the large-scale " --1.533073 it's like trying --1.533073 was like crack --1.533073 very limited size --1.533073 old living larches --1.533073 the longest and --1.533073 while looking up --1.533073 it looks relevant --1.533073 i'd love to --1.533073 he made that --0.8145491 ( mag ) --1.533073 the magnitude of --1.533073 by magnus . --1.533073 which makes the --1.533073 on many multiproxy --1.533073 2000 may well --1.533073 a mean chronology --1.533073 is measured by --1.533073 the measurement data --1.533073 yamal measurement data --1.533073 chronology method that --1.533073 corridor method " --1.533073 rcs method . --1.533073 this method which --1.533073 rcs methodology warn --1.533073 in mind when --1.533073 to mix religion --1.533073 far more " --1.533073 grows more slowly --1.533073 the more it --1.533073 this morning i --1.533073 , most recently --1.533073 and most sensitive --1.533073 the most recent --1.533073 it's much further --1.533073 so much illusion --1.533073 the multi-parters , --1.533073 many multiproxy studies --1.533073 of mundane politics --1.533073 about my ghastly --1.533073 catch my attention --1.533073 caught my attention --1.533073 the national debt --1.533073 all-around naughtiness . 
--1.533073 the nettle , --1.533073 was never properly --1.533073 next , --1.533073 is no doubt --1.533073 the non-robustness observed --1.533073 characterizes northern forests --1.533073 - not just --1.533073 about not struggling --1.533073 did not preserve --1.533073 does not have --1.533073 is not due --1.533073 it's not going --1.533073 should not be --1.533073 were not using --0.8145491 would not have --1.533073 as noted before --1.533073 i noticed that --1.533073 , notwithstanding these --1.533073 right now , --1.834103 with obama , --1.834103 with obama is --1.533073 non-robustness observed here --1.533073 , of older --1.533073 addition of 17 --1.533073 all of his --1.834103 instead of reposting --1.834103 instead of the --1.533073 magnitude of interannual --1.533073 principalities of darkness --1.533073 realm of mundane --1.834103 selection of old --1.834103 selection of older --1.533073 series of subfossil --1.533073 shadows of deeper --1.533073 terms of commentary --1.533073 unrepresentativeness of the --1.834103 use of a --1.834103 use of this --1.533073 virtue of being --1.533073 of old trees --1.533073 year old living --0.8145491 of older trees --1.533073 towards older trees --1.533073 one oldie , --1.533073 , on average --1.533073 article on the --1.533073 caveats on rcs --1.533073 commentary on this --1.533073 impact on many --1.533073 play on a --1.533073 relied on this --1.533073 selected on a --1.533073 in one approach --1.533073 just one oldie --1.533073 recent one . --1.533073 placed online . --1.533073 has only taken --1.533073 ( or real --1.533073 it originated with --1.533073 with osborn and --1.533073 prove out ( --1.533073 and outright fantasy --1.533073 briffa's own caveats --1.533073 for paleoclimatologists and --1.533073 biblical passage i --1.533073 in passing and --1.533073 the path " --1.533073 aging patterns in --1.533073 what paul had --1.533073 the people that --1.533073 , perhaps the --1.533073 . 
perhaps the --1.834103 perhaps the --1.834103 perhaps there's --1.533073 and perhaps they --1.533073 ( phil trans --1.533073 the phil trans --1.533073 12 picked cores --1.533073 this piece by --1.533073 right place . --1.533073 finally placed online --1.533073 shadow play on --1.533073 the point that --1.533073 data policy ) --1.533073 and politics , --1.533073 day politics are --1.533073 mundane politics . --1.834103 cru population . --1.834103 cru population consists --2.010194 schweingruber population . --2.010194 schweingruber population as --2.010194 schweingruber population instead --1.533073 the position that --1.533073 highly possible and --1.533073 about potential bias --1.533073 and potential unrepresentativeness --1.533073 your power to --1.533073 anti-divine powers and --1.533073 his precipitous decline --1.533073 at precisely the --1.533073 usual predictable factors --1.533073 2000 presented this --1.533073 not preserve centennial-scale --1.533073 the previous journal --1.533073 and principalities of --1.533073 deeper principles in --1.533073 a prior selection --1.533073 even probable that --0.8145491 divergence problem " --1.533073 been projected into --1.533073 never properly published --1.533073 here prove out --1.533073 to provide the --1.533073 i've provided a --1.533073 a provocative thought --1.533073 properly published in --1.533073 small push at --1.533073 a rcs chronology --1.533073 on rcs methodology --1.533073 the rcs method --1.533073 really react to --1.533073 haven't read it --1.834103 ca readers also --1.834103 ca readers know --1.533073 worth reading , --1.533073 or real ) --1.533073 don't really react --1.533073 the realm of --1.533073 some reason why --1.533073 valid reason for --1.533073 most recent one --1.533073 most recently , --1.533073 until recently , --1.533073 yamal reconstruction . 
--1.533073 this refusal in --1.533073 staunchly refused to --1.533073 is related to --1.533073 looks relevant , --1.533073 have relied on --1.533073 mix religion and --1.533073 data remained unarchived --1.533073 commenter remarked that --1.533073 of reposting just --1.533073 , requiring briffa --1.533073 in response to --1.533073 the resulting yamal --1.533073 in rev . --1.533073 happening right now --1.834103 the right place --1.834103 the right time --1.533073 were right . --1.533073 between ring widths --1.533073 17 ring-width series --0.8145491 khadyta river , --1.533073 conservatives said he --1.533073 the same bias --1.533073 larch sample should --1.533073 and sat in --1.533073 similar schweingruber data --0.1249387 the schweingruber population --1.533073 why schweingruber's khadyta --1.533073 . science ( --1.533073 a science article --1.533073 and science ( --1.533073 generating script ) --1.533073 can see the --1.533073 indeed see the --1.533073 you see , --1.533073 finally seized the --1.834103 were selected . --1.834103 were selected on --1.533073 biased selection of --1.533073 cru selection is --1.533073 prior selection of --1.533073 most sensitive series --1.533073 where sensitivity is --1.533073 individual series of --1.533073 ring-width series , --1.533073 sensitive series , --1.533073 data set in --1.533073 . several things --1.533073 a shadow play --1.533073 the shadows of --1.533073 these shadows . 
--2.010194 and shiyatov 2002 --2.010194 and shiyatov themselves --2.010194 and shiyatov would --1.533073 sample should not --1.533073 a similar schweingruber --1.533073 have similarly affected --1.533073 , since this --1.533073 limited size and --1.533073 i skimmed this --1.533073 more slowly , --1.533073 way slowly get --1.533073 a small push --1.533073 because so much --1.533073 there's some reason --1.533073 from someone whose --1.533073 to start today --1.533073 cru staunchly refused --1.533073 not struggling against --1.533073 multiproxy studies that --1.533073 subsequent study , --1.533073 this study . --1.533073 i stumbled upon --1.533073 of subfossil larches --1.533073 the subfossil collection --1.533073 yamal subfossil data --1.533073 every subsequent study --1.533073 this subset in --1.533073 " success . --1.533073 taymir supplement . --1.533073 were supplemented by --1.533073 a surface , --1.533073 would take an --1.533073 only taken a --1.834103 the taymir data --1.834103 the taymir supplement --1.533073 and temperature , --1.533073 ghastly tendency to --1.533073 in terms of --1.533073 trees than the --1.533073 " that characterizes --1.533073 admit that the --1.533073 and that way --1.533073 is that he --1.533073 made that wise --1.533073 method that they --1.533073 noticed that the --1.533073 people that voted --1.533073 point that his --1.533073 position that the --1.533073 probable that the --1.533073 remarked that " --1.533073 studies that have --1.533073 things that cast --1.533073 " the trouble --2.010194 , the more --2.010194 , the resulting --2.010194 , the yamal --1.533073 . 
the cru --1.834103 the subfossil --1.834103 the yamal --1.533073 affected the " --1.533073 along the path --2.010194 and the people --2.010194 and the phil --2.010194 and the right --1.533073 archive the data --1.533073 at the crossroads --0.8145491 between the two --1.533073 but the further --1.834103 by the addition --1.834103 by the magnitude --1.533073 combine the multi-parters --1.533073 control the national --0.8145491 get the arkive --1.533073 given the use --1.533073 has the virtue --1.533073 have the same --1.533073 if the non-robustness --1.834103 in the realm --1.834103 in the schweingruber --1.533073 including the taymir --1.533073 into the future --1.533073 is the most --1.533073 makes the point --0.8145491 of the 12 --1.533073 on the trouble --2.010194 perhaps the biased --2.010194 perhaps the day --2.010194 perhaps the difference --1.533073 precisely the right --1.533073 provide the measurement --1.834103 see the far --1.834103 see the shadows --1.533073 seized the nettle --1.533073 than the schweingruber --2.135133 that the conservatives --2.135133 that the cru --2.135133 that the previous --2.135133 that the yamal --1.533073 through the very --2.135133 to the back-and-forth --2.135133 to the cru --2.135133 to the large-scale --2.135133 to the usual --1.533073 took the position --1.533073 up the biblical --1.533073 used the chronology --1.533073 using the schweingruber --1.533073 were the longest --1.533073 what the conservatives --1.533073 while the yamal --2.135133 with the information --2.135133 with the rcs --2.135133 with the taymir --2.135133 with the yamal --1.533073 wrote the following: --1.533073 how their cores --1.533073 shiyatov themselves , --1.533073 they themselves were --1.533073 perhaps there's some --1.834103 . these data --1.834103 . 
these were --1.533073 cast these shadows --1.533073 notwithstanding these warnings --1.533073 and they can --1.533073 did they expect --1.533073 perhaps they don't --1.533073 that they themselves --1.533073 ) things that --1.533073 several things caught --1.533073 to think up --1.533073 fully thinking through --2.010194 , this analysis --2.010194 , this chronology --2.010194 , this will --1.834103 . this bias --1.834103 . this is --1.533073 this morning --1.533073 but this is --1.533073 earlier this year --1.533073 in this refusal --1.533073 of this subset --1.834103 on this difference --1.834103 on this study --1.533073 presented this chronology --1.533073 since this method --1.533073 skimmed this article --1.533073 upon this piece --1.533073 all those years --1.533073 to those " --1.533073 provocative thought . --1.533073 a thousand , --1.533073 thinking through the --1.533073 a time , --1.533073 right time and --1.533073 approach to constructing --1.533073 are to those --1.533073 briffa to archive --1.533073 but to what --1.533073 compared to the --1.533073 day to day --1.533073 down to about --1.533073 exception to the --1.533073 going to start --1.533073 hard to think --1.533073 hate to admit --1.533073 him to begin --1.533073 just to the --1.533073 love to get --1.533073 power to change --1.533073 react to what --1.533073 refused to provide --1.533073 related to different --1.533073 response to the --1.533073 tendency to mix --1.533073 trying to control --1.533073 where to begin --1.533073 happens today would --1.533073 start today . --1.533073 ) took the --1.533073 bias towards older --1.834103 phil trans b --1.834103 phil trans editors --1.533073 old trees described --2.010194 older trees . --2.010194 older trees an --2.010194 older trees than --0.8145491 the trouble with --1.533073 being true , --1.533073 like trying to --0.8145491 the two versions --1.533073 remained unarchived . 
--1.533073 arkive under control --1.533073 an unintentional bias --1.533073 potential unrepresentativeness of --1.533073 , until recently --1.533073 violence unveiled: humanity --1.533073 looking up the --1.533073 think up a --1.533073 stumbled upon this --1.533073 initial use of --1.533073 the use of --1.533073 " used by --1.533073 briffa used the --1.533073 data used in --1.533073 got used in --1.533073 was used in --1.533073 , using the --1.533073 not using . --1.533073 the usual predictable --1.533073 a valid reason --1.533073 centennial-scale variability and --1.533073 interannual variability . --1.834103 two versions . --1.834103 two versions is --1.533073 it's very hard --1.533073 the very limited --1.533073 . violence unveiled: --1.533073 in virtually every --1.533073 the virtue of --1.533073 that voted for --1.533073 methodology warn against --1.533073 these warnings , --1.533073 blade was like --1.533073 book was ghostwritten --1.533073 chronology was used --1.533073 data was finally --1.533073 it was never --1.533073 that way slowly --1.533073 for we do --1.533073 may well have --1.533073 conservatives were right --1.533073 cores were selected --1.533073 data were supplemented --1.533073 larches were selected --1.533073 themselves were not --1.533073 these were the --1.533073 . what did --1.533073 what a --1.533073 changing what happens --1.533073 doubt what paul --1.533073 exactly what the --1.834103 to what is --1.834103 to what will --1.533073 . what's your --1.834103 , when combined --1.834103 , when i --1.533073 mind when he --1.533073 , where sensitivity --1.533073 case where it's --1.533073 know where to --1.533073 ( which had --1.834103 , which , --1.834103 , which makes --1.533073 method which did --1.533073 ( while looking --1.533073 , while including --1.533073 . 
while the --1.533073 someone whose book --1.533073 reason why schweingruber's --1.533073 ring widths and --1.533073 this will have --1.533073 what will be --1.533073 that wise crack --1.533073 . with the --1.533073 begin with . --1.533073 change with a --1.533073 chronology with its --1.533073 combination with the --1.533073 combined with the --1.533073 connection with osborn --1.533073 included with the --1.533073 originated with briffa --0.8145491 trouble with obama --1.533073 is within your --1.533073 done without fully --1.533073 always worth reading --1.533073 bias would not --1.533073 he would do --1.533073 shiyatov would not --1.533073 today would take --1.533073 . wright's church --1.533073 he wrote the --1.533073 , yamal larch --1.533073 briffa's yamal reconstruction --1.533073 resulting yamal chronology --1.212489 the yamal chronology --2.232043 the yamal data --2.232043 the yamal measurement --2.232043 the yamal subfossil --1.533073 200–400 year old --1.533073 this year , --1.533073 those years ? 
--1.533073 ' yes , --1.533073 back-and-forth yesterday about --1.533073 it yet , --1.533073 ahead you see --1.533073 what's your great --1.533073 within your power - -\end\ diff --git a/src/test_data/grammar.prune b/src/test_data/grammar.prune deleted file mode 100644 index 4ebcb509..00000000 --- a/src/test_data/grammar.prune +++ /dev/null @@ -1,196 +0,0 @@ -[PHRASE] ||| [PHRASE,1] haus ||| [PHRASE,1] house ||| 1.86183 0 0 0 0.0211892 -[PHRASE] ||| [PHRASE,1] haus ist ||| is [PHRASE,1] house ||| 2.58883 0.311249 0 0.348455 0.0211893 -[PHRASE] ||| [PHRASE,1] haus gibt ||| is [PHRASE,1] house ||| 2.56863 0.291046 0 0.258278 0.0211893 -[PHRASE] ||| [PHRASE,1] ein haus ist ||| [PHRASE,1] is a house ||| 3.16286 0 0 0.576934 0.0211893 -[PHRASE] ||| [PHRASE,1] ist ||| [PHRASE,1] is ||| 2.94101 0 0.676694 0.348455 0 -[PHRASE] ||| [PHRASE,1] ist ||| is [PHRASE,1] ||| 2.36698 0.649056 0.102662 0.348455 0 -[PHRASE] ||| [PHRASE,1] klein ist ||| [PHRASE,1] is small ||| 2.58883 0.124939 0 0.78211 0 -[PHRASE] ||| [PHRASE,1] maus ||| [PHRASE,1] mouse ||| 2.09592 0 0 0 0 -[PHRASE] ||| [PHRASE,1] maus gibt ||| is [PHRASE,1] mouse ||| 2.44865 0 0 0.258278 0 -[PHRASE] ||| [PHRASE,1] kleines ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.579784 0 -[PHRASE] ||| [PHRASE,1] kleines haus ||| [PHRASE,1] small house ||| 3.24204 0 0 0.579784 0.0211893 -[PHRASE] ||| [PHRASE,1] kleines haus gibt ||| is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893 -[PHRASE] ||| [PHRASE,1] kleine ||| [PHRASE,1] small ||| 2.94101 0.439333 0 0.500602 0 -[PHRASE] ||| [PHRASE,1] kleine maus ||| [PHRASE,1] small mouse ||| 3.24204 0 0 0.500602 0 -[PHRASE] ||| [PHRASE,1] kleine maus gibt ||| is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0 -[PHRASE] ||| [PHRASE,1] gelb ||| [PHRASE,1] yellow ||| 2.63998 0 0 0 0 -[PHRASE] ||| [PHRASE,1] gelb haus ||| [PHRASE,1] yellow house ||| 3.24204 0 0 0 0.0211893 -[PHRASE] ||| [PHRASE,1] gelb haus gibt ||| is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893 
-[PHRASE] ||| [PHRASE,1] gelb maus ||| [PHRASE,1] yellow mouse ||| 3.24204 0 0 0 0 -[PHRASE] ||| [PHRASE,1] gelb maus gibt ||| is [PHRASE,1] yellow mouse ||| 3.30899 0 0 0.258278 0 -[PHRASE] ||| [PHRASE,1] gibt ||| is [PHRASE,1] ||| 1.82827 0.110339 0 0.258278 0 -[PHRASE] ||| haus ||| small yellow mouse house ||| 2.46389 0.845098 1.30103 0.278754 1.34341 -[PHRASE] ||| haus ||| house ||| Phrase_0=1.18514 Phrase_2=0.0222764 Phrase_4=0.0211893 -[PHRASE] ||| haus [PHRASE,1] ||| house [PHRASE,1] ||| 2.2878 0 0 0 0.0211893 -[PHRASE] ||| haus ist ||| house is ||| 2.46389 0 0 0.348455 0.0211893 -[PHRASE] ||| haus klein ist ||| house is small ||| 2.2878 0 0 0.78211 0.0211893 -[PHRASE] ||| ein ||| a ||| Phrase_0=1.34995 Phrase_1=0.228479 Phrase_3=0.228479 -[PHRASE] ||| ein [PHRASE,1] ||| a [PHRASE,1] ||| 2.03792 0.290035 0 0.228479 0 -[PHRASE] ||| ein [PHRASE,1] haus ||| a [PHRASE,1] house ||| 2.94101 0 0 0.228479 0.0211893 -[PHRASE] ||| ein [PHRASE,1] haus gibt ||| is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893 -[PHRASE] ||| ein [PHRASE,1] ist ||| is a [PHRASE,1] ||| 2.58883 0.535113 0 0.576934 0 -[PHRASE] ||| ein [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.56863 0.51491 0 0.486757 0 -[PHRASE] ||| ein haus ||| a house ||| 1.76492 0 0.0791813 0.228479 0.0211893 -[PHRASE] ||| ein haus ||| a small house ||| 2.46389 0.30103 0.778151 0.507233 1.34341 -[PHRASE] ||| ein haus ist ||| is a house ||| 2.76492 0.477121 0 0.576934 0.0211893 -[PHRASE] ||| ein haus gibt ||| is a house ||| 2.46389 0.176091 0.176091 0.486757 0.0211893 -[PHRASE] ||| ein haus gibt ||| is a small house ||| 2.76492 0.39794 0.477121 0.765511 1.34341 -[PHRASE] ||| ein kleines ||| a small ||| 1.86183 0.243038 0 0.808263 0 -[PHRASE] ||| ein kleines [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.808263 0 -[PHRASE] ||| ein kleines [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0 -[PHRASE] ||| ein kleines haus ||| a small house ||| 2.46389 0.30103 0 0.808263 0.0211893 
-[PHRASE] ||| ein kleines haus ist ||| is a small house ||| 2.76492 0.39794 0 1.15672 0.0211893 -[PHRASE] ||| ein kleines haus gibt ||| is a small house ||| 3.06595 0.69897 0 1.06654 0.0211893 -[PHRASE] ||| ein kleines gelb ||| a small yellow ||| 2.94101 0.30103 0 0.808263 0 -[PHRASE] ||| ein kleines gelb haus ||| a small yellow house ||| 3.24204 0 0 0.808263 0.0211893 -[PHRASE] ||| ein kleines gelb haus gibt ||| is a small yellow house ||| 3.30899 0 0 1.06654 0.0211893 -[PHRASE] ||| ein gelb ||| a yellow ||| 1.98677 0.221849 0 0.228479 0 -[PHRASE] ||| ein gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.228479 0 -[PHRASE] ||| ein gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0 -[PHRASE] ||| ein gelb haus ||| a yellow house ||| 2.63998 0 0 0.228479 0.0211893 -[PHRASE] ||| ein gelb haus ist ||| is a yellow house ||| 3.06595 0.30103 0 0.576934 0.0211893 -[PHRASE] ||| ein gelb haus gibt ||| is a yellow house ||| 3.06595 0.30103 0 0.486757 0.0211893 -[PHRASE] ||| ein gelb kleines ||| a yellow small ||| 2.94101 0.30103 0 0.808263 0 -[PHRASE] ||| ein gelb kleines haus ||| a yellow small house ||| 3.24204 0 0 0.808263 0.0211893 -[PHRASE] ||| ein gelb kleines haus gibt ||| is a yellow small house ||| 3.30899 0 0 1.06654 0.0211893 -[PHRASE] ||| ist ||| is ||| 1.34995 0.348455 0 0.348455 0 -[PHRASE] ||| klein ||| small ||| 1.61879 0.410174 0 0.433656 0 -[PHRASE] ||| klein [PHRASE,1] ||| [PHRASE,1] small ||| 3.06595 0.564271 0 0.433656 0 -[PHRASE] ||| klein [PHRASE,1] ist ||| [PHRASE,1] is small ||| 3.06595 0.60206 0 0.78211 0 -[PHRASE] ||| klein ist ||| is small ||| 1.68574 0 0 0.78211 0 -[PHRASE] ||| klein das [PHRASE,1] ||| the [PHRASE,1] small ||| 3.06595 0 0 0.433656 0.30103 -[PHRASE] ||| klein das haus ist ||| the house is small ||| 3.06595 0.477121 0 0.78211 0.322219 -[PHRASE] ||| maus ||| mouse ||| 1.50965 0 0 0 0 -[PHRASE] ||| maus [PHRASE,1] ||| mouse [PHRASE,1] ||| 2.94101 0 0 0 0 -[PHRASE] ||| maus [PHRASE,1] ist ||| 
mouse is [PHRASE,1] ||| 2.94101 0 0 0.348455 0 -[PHRASE] ||| maus ein haus ist ||| mouse is a house ||| 2.94101 0 0 0.576934 0.0211893 -[PHRASE] ||| kleines ||| small ||| 1.76492 0.556302 0 0.579784 0 -[PHRASE] ||| kleines [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.579784 0 -[PHRASE] ||| kleines haus ||| small house ||| 1.86183 0.243038 0 0.579784 0.0211893 -[PHRASE] ||| kleines gelb ||| small yellow ||| 2.46389 0.30103 0 0.579784 0 -[PHRASE] ||| kleines gelb haus ||| small yellow house ||| 2.94101 0 0 0.579784 0.0211893 -[PHRASE] ||| kleine ||| small ||| 1.68574 0.477121 0 0.500602 0 -[PHRASE] ||| kleine [PHRASE,1] ||| small [PHRASE,1] ||| 2.94101 0.30103 0 0.500602 0 -[PHRASE] ||| kleine haus ||| small house ||| 2.16286 0.544068 0 0.500602 0.0211893 -[PHRASE] ||| kleine maus ||| small mouse ||| 1.98677 0 0 0.500602 0 -[PHRASE] ||| kleine gelb ||| small yellow ||| 2.46389 0.30103 0 0.500602 0 -[PHRASE] ||| kleine gelb maus ||| small yellow mouse ||| 2.94101 0 0 0.500602 0 -[PHRASE] ||| gelb ||| yellow ||| 1.61879 0 0 0 0 -[PHRASE] ||| gelb [PHRASE,1] ||| yellow [PHRASE,1] ||| 2.63998 0 0 0 0 -[PHRASE] ||| gelb haus ||| yellow house ||| 1.98677 0 0 0 0.0211893 -[PHRASE] ||| gelb maus ||| yellow mouse ||| 2.16286 0 0 0 0 -[PHRASE] ||| gelb kleines ||| yellow small ||| 2.46389 0.30103 0 0.579784 0 -[PHRASE] ||| gelb kleines haus ||| yellow small house ||| 2.94101 0 0 0.579784 0.0211893 -[PHRASE] ||| gelb kleine ||| yellow small ||| 2.46389 0.30103 0 0.500602 0 -[PHRASE] ||| gelb kleine maus ||| yellow small mouse ||| 2.94101 0 0 0.500602 0 -[PHRASE] ||| eine ||| a ||| 1.50965 0.38818 0 0.38818 0 -[PHRASE] ||| eine [PHRASE,1] ||| a [PHRASE,1] ||| 2.0602 0.312311 0 0.38818 0 -[PHRASE] ||| eine [PHRASE,1] maus ||| a [PHRASE,1] mouse ||| 2.94101 0 0 0.38818 0 -[PHRASE] ||| eine [PHRASE,1] maus gibt ||| is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0 -[PHRASE] ||| eine [PHRASE,1] gibt ||| is a [PHRASE,1] ||| 2.44865 0.394934 0 0.646458 0 -[PHRASE] ||| eine 
maus ||| a mouse ||| 1.98677 0 0 0.38818 0 -[PHRASE] ||| eine maus [PHRASE,1] ||| a mouse [PHRASE,1] ||| 3.16286 0 0 0.38818 0 -[PHRASE] ||| eine maus [PHRASE,1] ist ||| a mouse is [PHRASE,1] ||| 3.16286 0 0 0.736635 0 -[PHRASE] ||| eine maus ein haus ist ||| a mouse is a house ||| 3.16286 0 0 0.965114 0.0211893 -[PHRASE] ||| eine maus gibt ||| is a mouse ||| 2.46389 0 0 0.646458 0 -[PHRASE] ||| eine kleine ||| a small ||| 1.98677 0.367977 0 0.888783 0 -[PHRASE] ||| eine kleine [PHRASE,1] ||| a small [PHRASE,1] ||| 3.24204 0.30103 0 0.888783 0 -[PHRASE] ||| eine kleine [PHRASE,1] gibt ||| is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0 -[PHRASE] ||| eine kleine maus ||| a small mouse ||| 2.63998 0 0 0.888783 0 -[PHRASE] ||| eine kleine maus gibt ||| is a small mouse ||| 2.76492 0 0 1.14706 0 -[PHRASE] ||| eine kleine gelb ||| a small yellow ||| 2.94101 0.30103 0 0.888783 0 -[PHRASE] ||| eine kleine gelb maus ||| a small yellow mouse ||| 3.24204 0 0 0.888783 0 -[PHRASE] ||| eine kleine gelb maus gibt ||| is a small yellow mouse ||| 3.30899 0 0 1.14706 0 -[PHRASE] ||| eine gelb ||| a yellow ||| 2.16286 0.39794 0 0.38818 0 -[PHRASE] ||| eine gelb [PHRASE,1] ||| a yellow [PHRASE,1] ||| 3.24204 0.30103 0 0.38818 0 -[PHRASE] ||| eine gelb [PHRASE,1] gibt ||| is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0 -[PHRASE] ||| eine gelb maus ||| a yellow mouse ||| 2.94101 0 0 0.38818 0 -[PHRASE] ||| eine gelb maus gibt ||| is a yellow mouse ||| 3.06595 0 0 0.646458 0 -[PHRASE] ||| eine gelb kleine ||| a yellow small ||| 2.94101 0.30103 0 0.888783 0 -[PHRASE] ||| eine gelb kleine maus ||| a yellow small mouse ||| 3.24204 0 0 0.888783 0 -[PHRASE] ||| eine gelb kleine maus gibt ||| is a yellow small mouse ||| 3.30899 0 0 1.14706 0 -[PHRASE] ||| eine gruen ||| a green ||| 2.46389 0 0 0.38818 0 -[PHRASE] ||| eine gruen maus ||| a green mouse ||| 2.94101 0 0 0.38818 0 -[PHRASE] ||| gruen ||| green ||| 2.16286 0 0 0 0 -[PHRASE] ||| gruen maus ||| green mouse ||| 2.46389 0 
0 0 0 -[PHRASE] ||| tages ||| day ||| 2.46389 0 0 0 0 -[PHRASE] ||| gibt ||| is ||| 1.25977 0.258278 0 0.258278 0 -[PHRASE] ||| meins ||| mine ||| 2.16286 0 0 0 0 -[PHRASE] ||| meins [PHRASE,1] ||| mine [PHRASE,1] ||| 2.76492 0 0 0 0 -[PHRASE] ||| meins ist ||| is mine ||| 2.46389 0 0 0.348455 0 -[PHRASE] ||| meins klein ist ||| mine is small ||| 2.76492 0 0 0.78211 0 -[PHRASE] ||| geld ||| money ||| 1.98677 0 0 0 0 -[PHRASE] ||| geld ist ||| is money ||| 2.46389 0.30103 0 0.348455 0 -[PHRASE] ||| geld gibt ||| is money ||| 2.46389 0.30103 0 0.258278 0 -[PHRASE] ||| keins ||| none ||| 1.98677 0 0 0 0 -[PHRASE] ||| keins [PHRASE,1] ||| none [PHRASE,1] ||| 2.76492 0 0 0 0 -[PHRASE] ||| keins klein ist ||| none is small ||| 2.76492 0 0 0.78211 0 -[PHRASE] ||| keins gibt ||| is none ||| 2.46389 0 0 0.258278 0 -[PHRASE] ||| dem haeuschen ||| of control ||| 2.46389 0 0 0.681241 0.425969 -[PHRASE] ||| eines ||| one ||| 2.46389 0.30103 0 0.30103 0 -[PHRASE] ||| eines tages ||| one day ||| 2.46389 0 0 0.30103 0 -[PHRASE] ||| eins ||| one ||| 2.46389 0.30103 0 0.30103 0 -[PHRASE] ||| aus ||| out ||| 2.46389 0 0.477121 0 0.221849 -[PHRASE] ||| aus ||| out of ||| 2.16286 0 0.176091 0.0791812 0.619789 -[PHRASE] ||| aus [PHRASE,1] ||| out [PHRASE,1] ||| 2.76492 0 0.367977 0 0.221849 -[PHRASE] ||| aus [PHRASE,1] ||| out of [PHRASE,1] ||| 2.63998 0 0.243038 0.0791812 0.619789 -[PHRASE] ||| aus ein ||| out of a ||| 2.46389 0 0 0.307661 0.619789 -[PHRASE] ||| aus ein haus ||| out of a house ||| 2.94101 0 0 0.307661 0.640978 -[PHRASE] ||| aus dem haeuschen ||| out of control ||| 2.76492 0 0 0.681241 0.647817 -[PHRASE] ||| aus das ||| out of the ||| 2.46389 0 0 0.0791812 0.920819 -[PHRASE] ||| aus das haus ||| out of the house ||| 2.94101 0 0 0.0791812 0.942008 -[PHRASE] ||| das ||| the ||| 1.76492 0 0.30103 0 0.30103 -[PHRASE] ||| das ||| that ||| 1.76492 0 0.30103 0 0.30103 -[PHRASE] ||| das [PHRASE,1] ||| the [PHRASE,1] ||| 2.39695 0 0.41972 0 0.30103 -[PHRASE] ||| das [PHRASE,1] 
||| that [PHRASE,1] ||| 2.18514 0 0.207913 0 0.30103 -[PHRASE] ||| das [PHRASE,1] haus ist ||| that is [PHRASE,1] house ||| 2.86183 0 0 0.348455 0.322219 -[PHRASE] ||| das [PHRASE,1] ist ||| that is [PHRASE,1] ||| 2.86183 0 0 0.348455 0.30103 -[PHRASE] ||| das haus ||| the house ||| 1.86183 0 0 0 0.322219 -[PHRASE] ||| das haus [PHRASE,1] ||| the house [PHRASE,1] ||| 2.76492 0 0 0 0.322219 -[PHRASE] ||| das haus ist ||| the house is ||| 2.94101 0 0 0.348455 0.322219 -[PHRASE] ||| das haus klein ist ||| the house is small ||| 2.76492 0.176091 0 0.78211 0.322219 -[PHRASE] ||| das ein [PHRASE,1] ist ||| that is a [PHRASE,1] ||| 2.86183 0 0 0.576934 0.30103 -[PHRASE] ||| das ein kleines haus ist ||| that is a small house ||| 3.16286 0 0 1.15672 0.322219 -[PHRASE] ||| das ein gelb haus ist ||| that is a yellow house ||| 3.16286 0 0 0.576934 0.322219 -[PHRASE] ||| das klein ist ||| that is small ||| 2.76492 0 0 0.78211 0.30103 -[PHRASE] ||| das kleine ||| the small ||| 2.46389 0 0 0.500602 0.30103 -[PHRASE] ||| das kleine haus ||| the small house ||| 2.94101 0 0 0.500602 0.322219 -[PHRASE] ||| das meins ist ||| that is mine ||| 2.76492 0 0 0.348455 0.30103 -[PHRASE] ||| das geld ist ||| that is money ||| 2.76492 0 0 0.348455 0.30103 -[PHRASE] ||| es ||| there ||| 1.25977 0 0 0 0 -[PHRASE] ||| es [PHRASE,1] ||| there [PHRASE,1] ||| 1.83672 0 0 0 0 -[PHRASE] ||| es [PHRASE,1] haus gibt ||| there is [PHRASE,1] house ||| 2.62775 0 0 0.258278 0.0211893 -[PHRASE] ||| es [PHRASE,1] maus gibt ||| there is [PHRASE,1] mouse ||| 2.5166 0 0 0.258278 0 -[PHRASE] ||| es [PHRASE,1] kleines haus gibt ||| there is [PHRASE,1] small house ||| 3.30899 0 0 0.838062 0.0211893 -[PHRASE] ||| es [PHRASE,1] kleine maus gibt ||| there is [PHRASE,1] small mouse ||| 3.30899 0 0 0.75888 0 -[PHRASE] ||| es [PHRASE,1] gelb haus gibt ||| there is [PHRASE,1] yellow house ||| 3.30899 0 0 0.258278 0.0211893 -[PHRASE] ||| es [PHRASE,1] gelb maus gibt ||| there is [PHRASE,1] yellow mouse ||| 3.30899 0 0 
0.258278 0 -[PHRASE] ||| es [PHRASE,1] gibt ||| there is [PHRASE,1] ||| 1.9536 0 0 0.258278 0 -[PHRASE] ||| es ein [PHRASE,1] haus gibt ||| there is a [PHRASE,1] house ||| 3.00796 0 0 0.486757 0.0211893 -[PHRASE] ||| es ein [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.62775 0.360151 0 0.486757 0 -[PHRASE] ||| es ein haus gibt ||| there is a house ||| 2.63998 0 0.176091 0.486757 0.0211893 -[PHRASE] ||| es ein haus gibt ||| there is a small house ||| 2.94101 0.20412 0.477121 0.765511 1.34341 -[PHRASE] ||| es ein kleines [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.06654 0 -[PHRASE] ||| es ein kleines haus gibt ||| there is a small house ||| 3.16286 0.425969 0 1.06654 0.0211893 -[PHRASE] ||| es ein gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.486757 0 -[PHRASE] ||| es ein gelb haus gibt ||| there is a yellow house ||| 3.16286 0 0 0.486757 0.0211893 -[PHRASE] ||| es eine [PHRASE,1] maus gibt ||| there is a [PHRASE,1] mouse ||| 3.00796 0 0 0.646458 0 -[PHRASE] ||| es eine [PHRASE,1] gibt ||| there is a [PHRASE,1] ||| 2.5166 0.249001 0 0.646458 0 -[PHRASE] ||| es eine maus gibt ||| there is a mouse ||| 2.63998 0 0 0.646458 0 -[PHRASE] ||| es eine kleine [PHRASE,1] gibt ||| there is a small [PHRASE,1] ||| 3.30899 0.30103 0 1.14706 0 -[PHRASE] ||| es eine kleine maus gibt ||| there is a small mouse ||| 2.86183 0 0 1.14706 0 -[PHRASE] ||| es eine gelb [PHRASE,1] gibt ||| there is a yellow [PHRASE,1] ||| 3.30899 0.30103 0 0.646458 0 -[PHRASE] ||| es eine gelb maus gibt ||| there is a yellow mouse ||| 3.16286 0 0 0.646458 0 -[PHRASE] ||| es geld gibt ||| there is money ||| 2.76492 0 0 0.258278 0 -[PHRASE] ||| es keins gibt ||| there is none ||| 2.76492 0 0 0.258278 0 -[PHRASE] ||| dieses ||| this ||| 1.98677 0 0 0 0 -[PHRASE] ||| dieses [PHRASE,1] ||| this [PHRASE,1] ||| 2.56995 0 0 0 0 -[PHRASE] ||| dieses [PHRASE,1] haus ist ||| this is [PHRASE,1] house ||| 3.16286 0 0 0.348455 0.0211893 -[PHRASE] ||| dieses 
[PHRASE,1] ist ||| this is [PHRASE,1] ||| 3.16286 0 0 0.348455 0 -[PHRASE] ||| dieses haus ||| this house ||| 2.46389 0 0 0 0.0211893 -[PHRASE] ||| dieses haus [PHRASE,1] ||| this house [PHRASE,1] ||| 3.06595 0 0 0 0.0211893 -[PHRASE] ||| dieses haus klein ist ||| this house is small ||| 3.06595 0 0 0.78211 0.0211893 -[PHRASE] ||| dieses ein [PHRASE,1] ist ||| this is a [PHRASE,1] ||| 3.16286 0 0 0.576934 0 -[PHRASE] ||| dieses ein kleines haus ist ||| this is a small house ||| 3.16286 0 0 1.15672 0.0211893 -[PHRASE] ||| dieses kleine ||| this small ||| 2.46389 0 0 0.500602 0 -[PHRASE] ||| dieses kleine haus ||| this small house ||| 2.94101 0 0 0.500602 0.0211893 diff --git a/src/test_data/small.json.gz b/src/test_data/small.json.gz deleted file mode 100644 index 892ba360..00000000 Binary files a/src/test_data/small.json.gz and /dev/null differ diff --git a/src/test_data/test_2gram.lm.gz b/src/test_data/test_2gram.lm.gz deleted file mode 100644 index aafa7274..00000000 Binary files a/src/test_data/test_2gram.lm.gz and /dev/null differ diff --git a/src/test_data/weights b/src/test_data/weights deleted file mode 100644 index ea70229c..00000000 --- a/src/test_data/weights +++ /dev/null @@ -1,8 +0,0 @@ -# hiero -WordPenalty -0.387029 -LanguageModel 0.253195 -PhraseModel_0 0.142926 -PhraseModel_1 0.465119 -PhraseModel_2 0.079503 -CNPosteriorProbability 0.09259 -Inf -inf diff --git a/src/test_data/weights.gt b/src/test_data/weights.gt deleted file mode 100644 index 08931049..00000000 --- a/src/test_data/weights.gt +++ /dev/null @@ -1,4 +0,0 @@ -Phrase_0 1.0 -Phrase_1 0.5 -Phrase_2 0.3 -Phrase_3 0.2 diff --git a/src/timing_stats.cc b/src/timing_stats.cc deleted file mode 100644 index 85b95de5..00000000 --- a/src/timing_stats.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include "timing_stats.h" - -#include - -using namespace std; - -map Timer::stats; - -Timer::Timer(const string& timername) : start_t(clock()), cur(stats[timername]) {} - -Timer::~Timer() { - ++cur.calls; - const 
clock_t end_t = clock(); - const double elapsed = (end_t - start_t) / 1000000.0; - cur.total_time += elapsed; -} - -void Timer::Summarize() { - for (map::iterator it = stats.begin(); it != stats.end(); ++it) { - cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n"; - } - stats.clear(); -} - diff --git a/src/timing_stats.h b/src/timing_stats.h deleted file mode 100644 index 0a9f7656..00000000 --- a/src/timing_stats.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _TIMING_STATS_H_ -#define _TIMING_STATS_H_ - -#include -#include - -struct TimerInfo { - int calls; - double total_time; - TimerInfo() : calls(), total_time() {} -}; - -struct Timer { - Timer(const std::string& info); - ~Timer(); - static void Summarize(); - private: - static std::map stats; - clock_t start_t; - TimerInfo& cur; - Timer(const Timer& other); - const Timer& operator=(const Timer& other); -}; - -#endif diff --git a/src/translator.h b/src/translator.h deleted file mode 100644 index 194efbaa..00000000 --- a/src/translator.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _TRANSLATOR_H_ -#define _TRANSLATOR_H_ - -#include -#include -#include -#include - -class Hypergraph; -class SentenceMetadata; - -class Translator { - public: - virtual ~Translator(); - // returns true if goal reached, false otherwise - // minus_lm_forest will contain the unpruned forest. the - // feature values from the phrase table / grammar / etc - // should be in the forest already - the "late" features - // should not just copy values that are available without - // any context or computation. - // SentenceMetadata contains information about the sentence, - // but it is an input/output parameter since the Translator - // is also responsible for setting the value of src_len. 
- virtual bool Translate(const std::string& src, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* minus_lm_forest) = 0; -}; - -class SCFGTranslatorImpl; -class SCFGTranslator : public Translator { - public: - SCFGTranslator(const boost::program_options::variables_map& conf); - bool Translate(const std::string& src, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* minus_lm_forest); - private: - boost::shared_ptr pimpl_; -}; - -class FSTTranslatorImpl; -class FSTTranslator : public Translator { - public: - FSTTranslator(const boost::program_options::variables_map& conf); - bool Translate(const std::string& src, - SentenceMetadata* smeta, - const std::vector& weights, - Hypergraph* minus_lm_forest); - private: - boost::shared_ptr pimpl_; -}; - -#endif diff --git a/src/trule.cc b/src/trule.cc deleted file mode 100644 index b8f6995e..00000000 --- a/src/trule.cc +++ /dev/null @@ -1,237 +0,0 @@ -#include "trule.h" - -#include - -#include "stringlib.h" -#include "tdict.h" - -using namespace std; - -static WordID ConvertTrgString(const string& w) { - int len = w.size(); - WordID id = 0; - // [X,0] or [0] - // for target rules, we ignore the category, just keep the index - if (len > 2 && w[0]=='[' && w[len-1]==']' && w[len-2] > '0' && w[len-2] <= '9' && - (len == 3 || (len > 4 && w[len-3] == ','))) { - id = w[len-2] - '0'; - id = 1 - id; - } else { - id = TD::Convert(w); - } - return id; -} - -static WordID ConvertSrcString(const string& w, bool mono = false) { - int len = w.size(); - // [X,0] - // for source rules, we keep the category and ignore the index (source rules are - // always numbered 1, 2, 3... - if (mono) { - if (len > 2 && w[0]=='[' && w[len-1]==']') { - if (len > 4 && w[len-3] == ',') { - cerr << "[ERROR] Monolingual rules mut not have non-terminal indices:\n " - << w << endl; - exit(1); - } - // TODO check that source indices go 1,2,3,etc. 
- return TD::Convert(w.substr(1, len-2)) * -1; - } else { - return TD::Convert(w); - } - } else { - if (len > 4 && w[0]=='[' && w[len-1]==']' && w[len-3] == ',' && w[len-2] > '0' && w[len-2] <= '9') { - return TD::Convert(w.substr(1, len-4)) * -1; - } else { - return TD::Convert(w); - } - } -} - -static WordID ConvertLHS(const string& w) { - if (w[0] == '[') { - int len = w.size(); - if (len < 3) { cerr << "Format error: " << w << endl; exit(1); } - return TD::Convert(w.substr(1, len-2)) * -1; - } else { - return TD::Convert(w) * -1; - } -} - -TRule* TRule::CreateRuleSynchronous(const std::string& rule) { - TRule* res = new TRule; - if (res->ReadFromString(rule, true, false)) return res; - cerr << "[ERROR] Failed to creating rule from: " << rule << endl; - delete res; - return NULL; -} - -TRule* TRule::CreateRulePhrasetable(const string& rule) { - // TODO make this faster - // TODO add configuration for default NT type - if (rule[0] == '[') { - cerr << "Phrasetable rules shouldn't have a LHS / non-terminals:\n " << rule << endl; - return NULL; - } - TRule* res = new TRule("[X] ||| " + rule, true, false); - if (res->Arity() != 0) { - cerr << "Phrasetable rules should have arity 0:\n " << rule << endl; - delete res; - return NULL; - } - return res; -} - -TRule* TRule::CreateRuleMonolingual(const string& rule) { - return new TRule(rule, false, true); -} - -bool TRule::ReadFromString(const string& line, bool strict, bool mono) { - e_.clear(); - f_.clear(); - scores_.clear(); - - string w; - istringstream is(line); - int format = CountSubstrings(line, "|||"); - if (strict && format < 2) { - cerr << "Bad rule format in strict mode:\n" << line << endl; - return false; - } - if (format >= 2 || (mono && format == 1)) { - while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } - while(is>>w && w!="|||") { f_.push_back(ConvertSrcString(w, mono)); } - if (!mono) { - while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } - } - int fv = 0; - if (is) { - string ss; - 
getline(is, ss); - //cerr << "L: " << ss << endl; - int start = 0; - const int len = ss.size(); - while (start < len) { - while(start < len && (ss[start] == ' ' || ss[start] == ';')) - ++start; - if (start == len) break; - int end = start + 1; - while(end < len && (ss[end] != '=' && ss[end] != ' ' && ss[end] != ';')) - ++end; - if (end == len || ss[end] == ' ' || ss[end] == ';') { - //cerr << "PROC: '" << ss.substr(start, end - start) << "'\n"; - // non-named features - if (end != len) { ss[end] = 0; } - string fname = "PhraseModel_X"; - if (fv > 9) { cerr << "Too many phrasetable scores - used named format\n"; abort(); } - fname[12]='0' + fv; - ++fv; - scores_.set_value(FD::Convert(fname), atof(&ss[start])); - //cerr << "F: " << fname << " VAL=" << scores_.value(FD::Convert(fname)) << endl; - } else { - const int fid = FD::Convert(ss.substr(start, end - start)); - start = end + 1; - end = start + 1; - while(end < len && (ss[end] != ' ' && ss[end] != ';')) - ++end; - if (end < len) { ss[end] = 0; } - assert(start < len); - scores_.set_value(fid, atof(&ss[start])); - //cerr << "F: " << FD::Convert(fid) << " VAL=" << scores_.value(fid) << endl; - } - start = end + 1; - } - } - } else if (format == 1) { - while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } - while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } - f_ = e_; - int x = ConvertLHS("[X]"); - for (int i = 0; i < f_.size(); ++i) - if (f_[i] <= 0) { f_[i] = x; } - } else { - cerr << "F: " << format << endl; - cerr << "[ERROR] Don't know how to read:\n" << line << endl; - } - if (mono) { - e_ = f_; - int ci = 0; - for (int i = 0; i < e_.size(); ++i) - if (e_[i] < 0) - e_[i] = ci--; - } - ComputeArity(); - return SanityCheck(); -} - -bool TRule::SanityCheck() const { - vector used(f_.size(), 0); - int ac = 0; - for (int i = 0; i < e_.size(); ++i) { - int ind = e_[i]; - if (ind > 0) continue; - ind = -ind; - if ((++used[ind]) != 1) { - cerr << "[ERROR] e-side variable index " << (ind+1) << " used more 
than once!\n"; - return false; - } - ac++; - } - if (ac != Arity()) { - cerr << "[ERROR] e-side arity mismatches f-side\n"; - return false; - } - return true; -} - -void TRule::ComputeArity() { - int min = 1; - for (vector::const_iterator i = e_.begin(); i != e_.end(); ++i) - if (*i < min) min = *i; - arity_ = 1 - min; -} - -static string AnonymousStrVar(int i) { - string res("[v]"); - if(!(i <= 0 && i >= -8)) { - cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; - abort(); - } - res[1] = '1' - i; - return res; -} - -string TRule::AsString(bool verbose) const { - ostringstream os; - int idx = 0; - if (lhs_ && verbose) { - os << '[' << TD::Convert(lhs_ * -1) << "] |||"; - for (int i = 0; i < f_.size(); ++i) { - const WordID& w = f_[i]; - if (w < 0) { - int wi = w * -1; - ++idx; - os << " [" << TD::Convert(wi) << ',' << idx << ']'; - } else { - os << ' ' << TD::Convert(w); - } - } - os << " ||| "; - } - if (idx > 9) { - cerr << "Too many non-terminals!\n partial: " << os.str() << endl; - exit(1); - } - for (int i =0; i -#include -#include -#include - -#include "sparse_vector.h" -#include "wordid.h" - -class TRule; -typedef boost::shared_ptr TRulePtr; -struct SpanInfo; - -// Translation rule -class TRule { - public: - TRule() : lhs_(0), prev_i(-1), prev_j(-1) { } - explicit TRule(const std::vector& e) : e_(e), lhs_(0), prev_i(-1), prev_j(-1) {} - TRule(const std::vector& e, const std::vector& f, const WordID& lhs) : - e_(e), f_(f), lhs_(lhs), prev_i(-1), prev_j(-1) {} - - // deprecated - this will be private soon - explicit TRule(const std::string& text, bool strict = false, bool mono = false) { - ReadFromString(text, strict, mono); - } - - // make a rule from a hiero-like rule table, e.g. 
- // [X] ||| [X,1] DE [X,2] ||| [X,2] of the [X,1] - // if misformatted, returns NULL - static TRule* CreateRuleSynchronous(const std::string& rule); - - // make a rule from a phrasetable entry (i.e., one that has no LHS type), e.g: - // el gato ||| the cat ||| Feature_2=0.34 - static TRule* CreateRulePhrasetable(const std::string& rule); - - // make a rule from a non-synchrnous CFG representation, e.g.: - // [LHS] ||| term1 [NT] term2 [OTHER_NT] [YET_ANOTHER_NT] - static TRule* CreateRuleMonolingual(const std::string& rule); - - void ESubstitute(const std::vector* >& var_values, - std::vector* result) const { - int vc = 0; - result->clear(); - for (std::vector::const_iterator i = e_.begin(); i != e_.end(); ++i) { - const WordID& c = *i; - if (c < 1) { - ++vc; - const std::vector& var_value = *var_values[-c]; - std::copy(var_value.begin(), - var_value.end(), - std::back_inserter(*result)); - } else { - result->push_back(c); - } - } - assert(vc == var_values.size()); - } - - void FSubstitute(const std::vector* >& var_values, - std::vector* result) const { - int vc = 0; - result->clear(); - for (std::vector::const_iterator i = f_.begin(); i != f_.end(); ++i) { - const WordID& c = *i; - if (c < 1) { - const std::vector& var_value = *var_values[vc++]; - std::copy(var_value.begin(), - var_value.end(), - std::back_inserter(*result)); - } else { - result->push_back(c); - } - } - assert(vc == var_values.size()); - } - - bool ReadFromString(const std::string& line, bool strict = false, bool monolingual = false); - - bool Initialized() const { return e_.size(); } - - std::string AsString(bool verbose = true) const; - - static TRule DummyRule() { - TRule res; - res.e_.resize(1, 0); - return res; - } - - const std::vector& f() const { return f_; } - const std::vector& e() const { return e_; } - - int EWords() const { return ELength() - Arity(); } - int FWords() const { return FLength() - Arity(); } - int FLength() const { return f_.size(); } - int ELength() const { return 
e_.size(); } - int Arity() const { return arity_; } - bool IsUnary() const { return (Arity() == 1) && (f_.size() == 1); } - const SparseVector& GetFeatureValues() const { return scores_; } - double Score(int i) const { return scores_[i]; } - WordID GetLHS() const { return lhs_; } - void ComputeArity(); - - // 0 = first variable, -1 = second variable, -2 = third ... - std::vector e_; - // < 0: *-1 = encoding of category of variable - std::vector f_; - WordID lhs_; - SparseVector scores_; - char arity_; - TRulePtr parent_rule_; // usually NULL, except when doing constrained decoding - - // this is only used when doing synchronous parsing - short int prev_i; - short int prev_j; - - private: - bool SanityCheck() const; -}; - -#endif diff --git a/src/trule_test.cc b/src/trule_test.cc deleted file mode 100644 index 02a70764..00000000 --- a/src/trule_test.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "trule.h" - -#include -#include -#include -#include "tdict.h" - -using namespace std; - -class TRuleTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -TEST_F(TRuleTest,TestFSubstitute) { - TRule r1("[X] ||| ob [X,1] [X,2] sah . ||| whether [X,1] saw [X,2] . 
||| 0.99"); - TRule r2("[X] ||| ich ||| i ||| 1.0"); - TRule r3("[X] ||| ihn ||| him ||| 1.0"); - vector*> ants; - vector res2; - r2.FSubstitute(ants, &res2); - assert(TD::GetString(res2) == "ich"); - vector res3; - r3.FSubstitute(ants, &res3); - assert(TD::GetString(res3) == "ihn"); - ants.push_back(&res2); - ants.push_back(&res3); - vector res; - r1.FSubstitute(ants, &res); - cerr << TD::GetString(res) << endl; - assert(TD::GetString(res) == "ob ich ihn sah ."); -} - -TEST_F(TRuleTest,TestPhrasetableRule) { - TRulePtr t(TRule::CreateRulePhrasetable("gato ||| cat ||| PhraseModel_0=-23.2;Foo=1;Bar=12")); - cerr << t->AsString() << endl; - assert(t->scores_.num_active() == 3); -}; - - -TEST_F(TRuleTest,TestMonoRule) { - TRulePtr m(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3]")); - assert(m->Arity() == 3); - cerr << m->AsString() << endl; - TRulePtr m2(TRule::CreateRuleMonolingual("[LHS] ||| term1 [NT] term2 [NT2] [NT3] ||| Feature1=0.23")); - assert(m2->Arity() == 3); - cerr << m2->AsString() << endl; - EXPECT_FLOAT_EQ(m2->scores_.value(FD::Convert("Feature1")), 0.23); -} - -TEST_F(TRuleTest,TestRuleR) { - TRule t6; - t6.ReadFromString("[X] ||| den [X,1] sah [X,2] . ||| [X,2] saw the [X,1] . 
||| 0.12321 0.23232 0.121"); - cerr << "TEXT: " << t6.AsString() << endl; - EXPECT_EQ(t6.Arity(), 2); - EXPECT_EQ(t6.e_[0], -1); - EXPECT_EQ(t6.e_[3], 0); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/src/ttables.cc b/src/ttables.cc deleted file mode 100644 index 2ea960f0..00000000 --- a/src/ttables.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "ttables.h" - -#include - -#include "dict.h" - -using namespace std; -using namespace std::tr1; - -void TTable::DeserializeProbsFromText(std::istream* in) { - int c = 0; - while(*in) { - string e; - string f; - double p; - (*in) >> e >> f >> p; - if (e.empty()) break; - ++c; - ttable[TD::Convert(e)][TD::Convert(f)] = prob_t(p); - } - cerr << "Loaded " << c << " translation parameters.\n"; -} - -void TTable::SerializeHelper(string* out, const Word2Word2Double& o) { - assert(!"not implemented"); -} - -void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) { - assert(!"not implemented"); -} - diff --git a/src/ttables.h b/src/ttables.h deleted file mode 100644 index 3ffc238a..00000000 --- a/src/ttables.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef _TTABLES_H_ -#define _TTABLES_H_ - -#include -#include - -#include "wordid.h" -#include "prob.h" -#include "tdict.h" - -class TTable { - public: - TTable() {} - typedef std::map Word2Double; - typedef std::map Word2Word2Double; - inline const prob_t prob(const int& e, const int& f) const { - const Word2Word2Double::const_iterator cit = ttable.find(e); - if (cit != ttable.end()) { - const Word2Double& cpd = cit->second; - const Word2Double::const_iterator it = cpd.find(f); - if (it == cpd.end()) return prob_t(0.00001); - return prob_t(it->second); - } else { - return prob_t(0.00001); - } - } - inline void Increment(const int& e, const int& f) { - counts[e][f] += 1.0; - } - inline void Increment(const int& e, const int& f, double x) { - counts[e][f] += x; - } - void Normalize() { - ttable.swap(counts); 
- for (Word2Word2Double::iterator cit = ttable.begin(); - cit != ttable.end(); ++cit) { - double tot = 0; - Word2Double& cpd = cit->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - tot += it->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second /= tot; - } - counts.clear(); - } - // adds counts from another TTable - probabilities remain unchanged - TTable& operator+=(const TTable& rhs) { - for (Word2Word2Double::const_iterator it = rhs.counts.begin(); - it != rhs.counts.end(); ++it) { - const Word2Double& cpd = it->second; - Word2Double& tgt = counts[it->first]; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - tgt[j->first] += j->second; - } - } - return *this; - } - void ShowTTable() { - for (Word2Word2Double::iterator it = ttable.begin(); it != ttable.end(); ++it) { - Word2Double& cpd = it->second; - for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) { - std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; - } - } - } - void ShowCounts() { - for (Word2Word2Double::iterator it = counts.begin(); it != counts.end(); ++it) { - Word2Double& cpd = it->second; - for (Word2Double::iterator j = cpd.begin(); j != cpd.end(); ++j) { - std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; - } - } - } - void DeserializeProbsFromText(std::istream* in); - void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); } - void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); } - void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); } - void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); } - private: - static void SerializeHelper(std::string*, const Word2Word2Double& o); - static void DeserializeHelper(const std::string&, Word2Word2Double* o); - public: - 
Word2Word2Double ttable; - Word2Word2Double counts; -}; - -#endif diff --git a/src/viterbi.cc b/src/viterbi.cc deleted file mode 100644 index 82b2ce6d..00000000 --- a/src/viterbi.cc +++ /dev/null @@ -1,39 +0,0 @@ -#include "viterbi.h" - -#include -#include "hg.h" - -using namespace std; - -string ViterbiETree(const Hypergraph& hg) { - vector tmp; - const prob_t p = Viterbi, ETreeTraversal, prob_t, EdgeProb>(hg, &tmp); - return TD::GetString(tmp); -} - -string ViterbiFTree(const Hypergraph& hg) { - vector tmp; - const prob_t p = Viterbi, FTreeTraversal, prob_t, EdgeProb>(hg, &tmp); - return TD::GetString(tmp); -} - -prob_t ViterbiESentence(const Hypergraph& hg, vector* result) { - return Viterbi, ESentenceTraversal, prob_t, EdgeProb>(hg, result); -} - -prob_t ViterbiFSentence(const Hypergraph& hg, vector* result) { - return Viterbi, FSentenceTraversal, prob_t, EdgeProb>(hg, result); -} - -int ViterbiELength(const Hypergraph& hg) { - int len = -1; - Viterbi(hg, &len); - return len; -} - -int ViterbiPathLength(const Hypergraph& hg) { - int len = -1; - Viterbi(hg, &len); - return len; -} - diff --git a/src/viterbi.h b/src/viterbi.h deleted file mode 100644 index 46a4f528..00000000 --- a/src/viterbi.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef _VITERBI_H_ -#define _VITERBI_H_ - -#include -#include "prob.h" -#include "hg.h" -#include "tdict.h" - -// V must implement: -// void operator()(const vector& ants, T* result); -template -WeightType Viterbi(const Hypergraph& hg, - T* result, - const Traversal& traverse = Traversal(), - const WeightFunction& weight = WeightFunction()) { - const int num_nodes = hg.nodes_.size(); - std::vector vit_result(num_nodes); - std::vector vit_weight(num_nodes, WeightType::Zero()); - - for (int i = 0; i < num_nodes; ++i) { - const Hypergraph::Node& cur_node = hg.nodes_[i]; - WeightType* const cur_node_best_weight = &vit_weight[i]; - T* const cur_node_best_result = &vit_result[i]; - - const int num_in_edges = cur_node.in_edges_.size(); - if 
(num_in_edges == 0) { - *cur_node_best_weight = WeightType(1); - continue; - } - for (int j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[cur_node.in_edges_[j]]; - WeightType score = weight(edge); - std::vector ants(edge.tail_nodes_.size()); - for (int k = 0; k < edge.tail_nodes_.size(); ++k) { - const int tail_node_index = edge.tail_nodes_[k]; - score *= vit_weight[tail_node_index]; - ants[k] = &vit_result[tail_node_index]; - } - if (*cur_node_best_weight < score) { - *cur_node_best_weight = score; - traverse(edge, ants, cur_node_best_result); - } - } - } - std::swap(*result, vit_result.back()); - return vit_weight.back(); -} - -struct PathLengthTraversal { - void operator()(const Hypergraph::Edge& edge, - const std::vector& ants, - int* result) const { - (void) edge; - *result = 1; - for (int i = 0; i < ants.size(); ++i) *result += *ants[i]; - } -}; - -struct ESentenceTraversal { - void operator()(const Hypergraph::Edge& edge, - const std::vector*>& ants, - std::vector* result) const { - edge.rule_->ESubstitute(ants, result); - } -}; - -struct ELengthTraversal { - void operator()(const Hypergraph::Edge& edge, - const std::vector& ants, - int* result) const { - *result = edge.rule_->ELength() - edge.rule_->Arity(); - for (int i = 0; i < ants.size(); ++i) *result += *ants[i]; - } -}; - -struct FSentenceTraversal { - void operator()(const Hypergraph::Edge& edge, - const std::vector*>& ants, - std::vector* result) const { - edge.rule_->FSubstitute(ants, result); - } -}; - -// create a strings of the form (S (X the man) (X said (X he (X would (X go))))) -struct ETreeTraversal { - ETreeTraversal() : left("("), space(" "), right(")") {} - const std::string left; - const std::string space; - const std::string right; - void operator()(const Hypergraph::Edge& edge, - const std::vector*>& ants, - std::vector* result) const { - std::vector tmp; - edge.rule_->ESubstitute(ants, &tmp); - const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1); 
- if (cat == "Goal") - result->swap(tmp); - else - TD::ConvertSentence(left + cat + space + TD::GetString(tmp) + right, - result); - } -}; - -struct FTreeTraversal { - FTreeTraversal() : left("("), space(" "), right(")") {} - const std::string left; - const std::string space; - const std::string right; - void operator()(const Hypergraph::Edge& edge, - const std::vector*>& ants, - std::vector* result) const { - std::vector tmp; - edge.rule_->FSubstitute(ants, &tmp); - const std::string cat = TD::Convert(edge.rule_->GetLHS() * -1); - if (cat == "Goal") - result->swap(tmp); - else - TD::ConvertSentence(left + cat + space + TD::GetString(tmp) + right, - result); - } -}; - -prob_t ViterbiESentence(const Hypergraph& hg, std::vector* result); -std::string ViterbiETree(const Hypergraph& hg); -prob_t ViterbiFSentence(const Hypergraph& hg, std::vector* result); -std::string ViterbiFTree(const Hypergraph& hg); -int ViterbiELength(const Hypergraph& hg); -int ViterbiPathLength(const Hypergraph& hg); - -#endif diff --git a/src/weights.cc b/src/weights.cc deleted file mode 100644 index bb0a878f..00000000 --- a/src/weights.cc +++ /dev/null @@ -1,73 +0,0 @@ -#include "weights.h" - -#include - -#include "fdict.h" -#include "filelib.h" - -using namespace std; - -void Weights::InitFromFile(const std::string& filename, vector* feature_list) { - cerr << "Reading weights from " << filename << endl; - ReadFile in_file(filename); - istream& in = *in_file.stream(); - assert(in); - int weight_count = 0; - bool fl = false; - while (in) { - double val = 0; - string buf; - getline(in, buf); - if (buf.size() == 0) continue; - if (buf[0] == '#') continue; - for (int i = 0; i < buf.size(); ++i) - if (buf[i] == '=') buf[i] = ' '; - int start = 0; - while(start < buf.size() && buf[start] == ' ') ++start; - int end = 0; - while(end < buf.size() && buf[end] != ' ') ++end; - int fid = FD::Convert(buf.substr(start, end - start)); - while(end < buf.size() && buf[end] == ' ') ++end; - val = 
strtod(&buf.c_str()[end], NULL); - if (wv_.size() <= fid) - wv_.resize(fid + 1); - wv_[fid] = val; - if (feature_list) { feature_list->push_back(FD::Convert(fid)); } - ++weight_count; - if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } - if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } - } - if (fl) { cerr << endl; } - cerr << "Loaded " << weight_count << " feature weights\n"; -} - -void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const { - WriteFile out(fname); - ostream& o = *out.stream(); - assert(o); - o.precision(17); - const int num_feats = FD::NumFeats(); - for (int i = 1; i < num_feats; ++i) { - const double val = (i < wv_.size() ? wv_[i] : 0.0); - if (hide_zero_value_features && val == 0.0) continue; - o << FD::Convert(i) << ' ' << val << endl; - } -} - -void Weights::InitVector(std::vector* w) const { - *w = wv_; -} - -void Weights::InitSparseVector(SparseVector* w) const { - for (int i = 1; i < wv_.size(); ++i) { - const double& weight = wv_[i]; - if (weight) w->set_value(i, weight); - } -} - -void Weights::InitFromVector(const std::vector& w) { - wv_ = w; - if (wv_.size() > FD::NumFeats()) - cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; - wv_.resize(FD::NumFeats(), 0); -} diff --git a/src/weights.h b/src/weights.h deleted file mode 100644 index f19aa3ce..00000000 --- a/src/weights.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _WEIGHTS_H_ -#define _WEIGHTS_H_ - -#include -#include -#include -#include "sparse_vector.h" - -class Weights { - public: - Weights() {} - void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); - void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const; - void InitVector(std::vector* w) const; - void InitSparseVector(SparseVector* w) const; - void InitFromVector(const std::vector& w); - private: - std::vector wv_; -}; - -#endif diff 
--git a/src/weights_test.cc b/src/weights_test.cc deleted file mode 100644 index aa6b3db2..00000000 --- a/src/weights_test.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include -#include -#include -#include -#include -#include "weights.h" -#include "tdict.h" -#include "hg.h" - -using namespace std; - -class WeightsTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - - -TEST_F(WeightsTest,Load) { - Weights w; - w.InitFromFile("test_data/weights"); - w.WriteToFile("-"); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/wordid.h b/src/wordid.h deleted file mode 100644 index fb50bcc1..00000000 --- a/src/wordid.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _WORD_ID_H_ -#define _WORD_ID_H_ - -typedef int WordID; - -#endif diff --git a/tests/run-system-tests.pl b/tests/run-system-tests.pl index 738000dc..8555ef78 100755 --- a/tests/run-system-tests.pl +++ b/tests/run-system-tests.pl @@ -8,7 +8,7 @@ my $TEMP_DIR = tempdir( CLEANUP => 1 ); #my $cwd = cwd(); #die "Sanity failed: $cwd" unless -d $cwd; -my $DECODER = "$script_dir/../src/cdec"; +my $DECODER = "$script_dir/../decoder/cdec"; my $FILTER = "$script_dir/tools/filter-stderr.pl"; my $COMPARE_STATS = "$script_dir/tools/compare-statistics.pl"; diff --git a/training/Makefile.am b/training/Makefile.am index c4c22fa2..944c75f7 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -10,32 +10,32 @@ bin_PROGRAMS = \ optimize_test atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/src/libhg.a -lz +atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz model1_SOURCES = model1.cc -model1_LDADD = $(top_srcdir)/src/libhg.a -lz +model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/src/libhg.a -lz +grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz optimize_test_SOURCES = optimize_test.cc optimize.cc 
-optimize_test_LDADD = $(top_srcdir)/src/libhg.a -lz +optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/src/libhg.a -lz +collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/src/libhg.a -lz +lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc -mr_optimize_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_em_train_SOURCES = mr_em_train.cc -mr_em_train_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/src/libhg.a -lz +plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) diff --git a/training/atools.cc b/training/atools.cc index bac73859..a18250f7 100644 --- a/training/atools.cc +++ b/training/atools.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -105,6 +106,99 @@ struct IntersectCommand : public Command { } }; +struct UnionCommand : public Command { + string Name() const { return "union"; } + bool RequiresTwoOperands() const { return true; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + EnsureSize(a, b, x); + Array2D& res = *x; + for (int i = 0; i < res.width(); ++i) + for (int j = 0; j < res.height(); ++j) + res(i, j) = Safe(a, i, j) || Safe(b, i, j); + } +}; + +struct RefineCommand : public Command { + RefineCommand() { + neighbors_.push_back(make_pair(1,0)); + neighbors_.push_back(make_pair(-1,0)); + neighbors_.push_back(make_pair(0,1)); + 
neighbors_.push_back(make_pair(0,-1)); + } + bool RequiresTwoOperands() const { return true; } + protected: + void InitRefine( + const Array2D& a, + const Array2D& b, + Array2D* x) { + EnsureSize(a, b, x); + in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear(); + EnsureSize(a, b, &in_); + EnsureSize(a, b, &un_); + is_i_aligned_.resize(x->width(), false); + is_j_aligned_.resize(x->height(), false); + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) { + un_(i, j) = Safe(a, i, j) || Safe(b, i, j); + in_(i, j) = Safe(a, i, j) && Safe(b, i, j); + } + } + // "grow" the intersection alignment with neighboring points + // from the union alignment + void Grow(Array2D* x) { + Array2D& res = *x; + queue > q; + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) + if (in_(i, j)) { + Align(i, j, x); + q.push(make_pair(i, j)); + } + while(!q.empty()) { + const pair point = q.front(); + q.pop(); + for (int k = 0; k < neighbors_.size(); ++k) { + const int test_i = neighbors_[k].first + point.first; + const int test_j = neighbors_[k].second + point.second; + if (Safe(un_, test_i, test_j) && !res(test_i, test_j)) { + Align(test_i, test_j, x); + q.push(make_pair(test_i, test_j)); + } + } + } + } + void Final(bool do_and, Array2D* x) { + } + void Align(int i, int j, Array2D* x) { + (*x)(i, j) = true; + is_i_aligned_[i] = true; + is_j_aligned_[j] = true; + } + Array2D in_; // intersection alignment + Array2D un_; // union alignment + vector is_i_aligned_; + vector is_j_aligned_; + vector > neighbors_; +}; + +struct DiagCommand : public RefineCommand { + DiagCommand() { + neighbors_.push_back(make_pair(1,1)); + neighbors_.push_back(make_pair(-1,1)); + neighbors_.push_back(make_pair(1,-1)); + neighbors_.push_back(make_pair(-1,-1)); + } +}; + +struct GDFCommand : public DiagCommand { + string Name() const { return "gdf"; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + InitRefine(a, b, x); + 
Grow(x); + Final(false, x); + } +}; + map > commands; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -163,6 +257,8 @@ int main(int argc, char **argv) { AddCommand(); AddCommand(); AddCommand(); + AddCommand(); + AddCommand(); AddCommand(); po::variables_map conf; InitCommandLine(argc, argv, &conf); diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl index 9f7c1569..8b06f162 100755 --- a/training/cluster-ptrain.pl +++ b/training/cluster-ptrain.pl @@ -8,7 +8,7 @@ my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluati my $CWD=`pwd`; chomp $CWD; my $BIN_DIR = $SCRIPT_DIR; my $OPTIMIZER = "$BIN_DIR/mr_optimize_reduce"; -my $DECODER = "$BIN_DIR/../src/cdec"; +my $DECODER = "$BIN_DIR/../decoder/cdec"; my $COMBINER_CACHE_SIZE = 150; # This is a hack to run this on a weird cluster, # eventually, I'll provide Hadoop scripts. @@ -19,32 +19,35 @@ my $restart = ''; if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } my $pmem="2500mb"; -my $nodes = 36; +my $nodes = 1; my $max_iteration = 1000; my $PRIOR_FLAG = ""; my $parallel = 1; my $CFLAG = "-C 1"; my $LOCAL; +my $DISTRIBUTED; my $PRIOR; my $OALG = "lbfgs"; my $sigsq = 1; my $means_file; -GetOptions("decoder=s" => \$DECODER, +GetOptions("cdec=s" => \$DECODER, "run_locally" => \$LOCAL, - "gaussian_prior" => \$PRIOR, + "distributed" => \$DISTRIBUTED, "sigma_squared=f" => \$sigsq, "means=s" => \$means_file, "optimizer=s" => \$OALG, + "jobs=i" => \$nodes, "pmem=s" => \$pmem ) or usage(); usage() unless scalar @ARGV==3; my $config_file = shift @ARGV; my $training_corpus = shift @ARGV; my $initial_weights = shift @ARGV; +unless ($DISTRIBUTED) { $LOCAL = 1; } die "Can't find $config_file" unless -f $config_file; die "Can't find $DECODER" unless -f $DECODER; die "Can't execute $DECODER" unless -x $DECODER; -if ($LOCAL) { print STDERR "Will running LOCALLY.\n"; $parallel = 0; } +if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } 
if ($PRIOR) { $PRIOR_FLAG="-p --sigma_squared $sigsq"; if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } @@ -56,20 +59,23 @@ if ($parallel) { } unless ($parallel) { $CFLAG = "-C 500"; } unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } +my $clines = num_lines($training_corpus); print STDERR <) { $lines++; } + close $fh; + return $lines; } diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl index 0e290492..8cdf7718 100755 --- a/training/make-lexcrf-grammar.pl +++ b/training/make-lexcrf-grammar.pl @@ -17,23 +17,27 @@ while() { } my $ADD_MODEL1 = 0; # found that model1 hurts performance -my $IS_FRENCH_F = 0; # indicates that the f language is french -my $IS_ARABIC_F = 1; # indicates that the f language is arabic +my $IS_FRENCH_F = 1; # indicates that the f language is french +my $IS_ARABIC_F = 0; # indicates that the f language is arabic +my $IS_URDU_F = 0; # indicates that the f language is arabic my $ADD_PREFIX_ID = 0; my $ADD_LEN = 1; -my $ADD_LD = 0; +my $ADD_SIM = 1; my $ADD_DICE = 1; my $ADD_111 = 1; my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NUM_MM = 1; my $ADD_NULL = 1; +my $ADD_STEM_ID = 1; my $BEAM_RATIO = 50; my %fdict; my %fcounts; my %ecounts; +my %sdict; + while() { chomp; my ($f, $e) = split /\s*\|\|\|\s*/; @@ -56,10 +60,11 @@ print STDERR "PuncMiss 0\n" if $ADD_PUNC; print STDERR "IsNull 0\n" if $ADD_NULL; print STDERR "Model1 0\n" if $ADD_MODEL1; print STDERR "DLen 0\n" if $ADD_LEN; -print STDERR "NumMM 0\n" if $ADD_NUM_MM; -print STDERR "Level 0\n" if $ADD_LD; +print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM; +print STDERR "OrthoSim 0\n" if $ADD_SIM; print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID); my $fc = 1000000; +my $sids = 1000000; for my $f (sort keys %fdict) { my $re = $fdict{$f}; my $max; @@ -72,7 +77,6 @@ for my $f (sort keys %fdict) { my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); my $feats = "F$fc=1"; my $oe = $e; - my $len_e = length($oe); my $of = $f; # 
normalized form if ($IS_FRENCH_F) { # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French @@ -85,7 +89,27 @@ for my $f (sort keys %fdict) { if (length($of) > 1 && !($of =~ /\d/)) { $of =~ s/\$/sh/g; } + } elsif ($IS_URDU_F) { + if (length($of) > 1 && !($of =~ /\d/)) { + $of =~ s/\$/sh/g; + } + $oe =~ s/^-e-//; + $oe =~ s/^al-/al/; + $of =~ s/([a-z])\~/$1$1/g; + $of =~ s/E/'/g; + $of =~ s/^Aw/o/g; + $of =~ s/\|/a/g; + $of =~ s/@/h/g; + $of =~ s/c/ch/g; + $of =~ s/x/kh/g; + $of =~ s/\*/dh/g; + $of =~ s/w/o/g; + $of =~ s/Z/dh/g; + $of =~ s/y/i/g; + $of =~ s/Y/a/g; + $of = lc $of; } + my $len_e = length($oe); my $len_f = length($of); $feats .= " Model1=$m1" if ($ADD_MODEL1); $feats .= " Dice=$dice" if $ADD_DICE; @@ -100,12 +124,35 @@ for my $f (sort keys %fdict) { $feats .= " DLen=$dlen"; } } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/); # this matches *two digit* and more numbers - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/); + my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); + my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); my $both_non_numeric = (!$e_num && !$f_num); if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) { $feats .= " NumMM=1"; } + if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) { + $feats .= " NumMatch=1"; + } + if ($ADD_STEM_ID) { + my $el = 4; + my $fl = 4; + if ($oe =~ /^al|re|co/) { $el++; } + if ($of =~ /^al|re|co/) { $fl++; } + if ($oe =~ /^trans|inter/) { $el+=2; } + if ($of =~ /^trans|inter/) { $fl+=2; } + if ($fl > length($of)) { $fl = length($of); } + if ($el > length($oe)) { $el = length($oe); } + my $sf = substr $of, 0, $fl; + my $se = substr $oe, 0, $el; + my $id = $sdict{$sf}->{$se}; + if (!$id) { + $sids++; + $sdict{$sf}->{$se} = $sids; + $id = $sids; + print STDERR "S$sids 0\n" + } + $feats .= " S$id=1"; + } if ($ADD_PREFIX_ID) { if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { my $pe = substr $oe, 0, 3; @@ -113,12 +160,14 @@ for my $f (sort keys %fdict) { if ($pe eq $pf) 
{ $feats .= " PfxIdentical=1"; } } } - if ($ADD_LD) { + if ($ADD_SIM) { my $ld = 0; - if ($is_null) { $ld = length($e); } else { - $ld = levenshtein($e, $f); + my $eff = $len_e; + if ($eff < $len_f) { $eff = $len_f; } + if (!$is_null) { + $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); } - $feats .= " Leven=$ld"; + $feats .= " OrthoSim=$ld"; } my $ident = ($e eq $f); if ($ident && $ADD_ID) { $feats .= " Identical=1"; } diff --git a/vest/Makefile.am b/vest/Makefile.am index 87c2383a..d7d08133 100644 --- a/vest/Makefile.am +++ b/vest/Makefile.am @@ -8,25 +8,25 @@ bin_PROGRAMS = \ union_forests union_forests_SOURCES = union_forests.cc -union_forests_LDADD = $(top_srcdir)/src/libhg.a -lz +union_forests_LDADD = $(top_srcdir)/decoder/libcdec.a -lz fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc scorer.cc viterbi_envelope.cc -fast_score_LDADD = $(top_srcdir)/src/libhg.a -lz +fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_vest_reduce_SOURCES = error_surface.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz scorer_test_SOURCES = scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz +scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz lo_test_SOURCES = lo_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc 
error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 5528838c..31dbc61f 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -1,17 +1,16 @@ #!/usr/bin/env perl +use strict; +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } use Getopt::Long; use IPC::Open2; use strict; use POSIX ":sys_wait_h"; -my $mydir = `dirname $0`; -chomp $mydir; # Default settings -my $srcFile = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt"; -my $refFiles = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*"; -my $bin_dir = "/fs/clip-software/cdec/bin"; -$bin_dir = "/Users/redpony/cdyer-svn-root/cdec/vest/bin_dir"; +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; my $FAST_SCORE="$bin_dir/fast_score"; die "Can't find $FAST_SCORE" unless -x $FAST_SCORE; @@ -22,7 +21,7 @@ my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; my $forestUnion = "$bin_dir/union_forests"; die "Can't find $forestUnion" unless -x $forestUnion; -my $cdec = "$bin_dir/cdec"; +my $cdec = "$bin_dir/../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; my $decoder = $cdec; my $lines_per_mapper = 440; @@ -153,7 +152,7 @@ $SIG{HUP} = "cleanup"; my $decoderBase = `basename $decoder`; chomp $decoderBase; my $newIniFile = "$dir/$decoderBase.ini"; -my $parallelize = "$mydir/parallelize.pl"; +my $parallelize = '/chomes/redpony/svn-trunk/sa-utils/parallelize.pl'; 
my $inputFileName = "$dir/input"; my $user = $ENV{"USER"}; @@ -254,15 +253,18 @@ while (1){ print LOGFILE "\nUNION FORESTS\n"; print LOGFILE `date`; my $mergeLog="$logdir/prune-merge.log.$iteration"; - $cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize"; - print LOGFILE "COMMAND:\n$cmd\n"; - $result = system($cmd); + `rm -rf $dir/hgs`; + `mv $dir/hgs-current $dir/hgs`; + #$cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize"; + #print LOGFILE "COMMAND:\n$cmd\n"; + #$result = system($cmd); unless ($result == 0){ cleanup(); print LOGFILE "ERROR: merge command returned non-zero exit code $result\n"; die; } `rm -f $dir/hgs-current/*.json.gz`; # clean up old HGs, they've been moved to the repository + `mkdir -p $dir/hgs-current`; my $score = 0; my $icc = 0; @@ -303,7 +305,7 @@ while (1){ my $mapoutput = $shard; my $client_name = $shard; $client_name =~ s/mapinput.//; - $client_name = "fmert.$client_name"; + $client_name = "vest.$client_name"; $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; @@ -548,36 +550,9 @@ Options: --decoder Decoder binary to use. - --decode-nodes - A list of nodes used for parallel decoding. If specific nodes - are not desired, use "1" for each node requested. Defaults to - "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", which indicates a request for - 15 nodes. - - --dont-clean - If present, this flag prevents intermediate files, including - run files and cumulative files, from being automatically removed - after a successful optimization run (these files are left if the - run fails for any reason). If used, a makefile containing - cleanup commands is written to the directory. To clean up - the intermediate files, invoke make without any arguments. - - --dry-run - Prints out the settings and exits without doing anything. 
- - --epsilon - Require that the dev set BLEU score improve by at least - within iterations (controlled by parameter --interval). - If not specified, defaults to .002. - --help Print this message and exit. - --interval - Require that the dev set BLEU score improve by at least - (controlled by parameter --epsilon) within iterations. - If not specified, defaults to 5. - --iteration Starting iteration number. If not specified, defaults to 1. @@ -586,18 +561,15 @@ Options: to 10. --pmem - Amount of physical memory requested for parallel decoding jobs, - in the format expected by qsub. If not specified, defaults to - 2g. + Amount of physical memory requested for parallel decoding jobs. --ref-files Dev set ref files. This option takes only a single string argument. To use multiple files (including file globbing), this argument should - be quoted. If not specified, defaults to - /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.* + be quoted. --metric - Metric to optimize. See fmert's --metric option for values. + Metric to optimize. Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi --normalize @@ -609,8 +581,7 @@ Options: set this parameter to explore other directions. Defaults to 5. --source-file - Dev set source file. If not specified, defaults to - /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt + Dev set source file. --weights A file specifying initial feature weights. The format is -- cgit v1.2.3