From 9fae7b75c64f4dfb6e45ff8f2ec61b597f37d225 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 3 Dec 2009 16:49:30 -0500 Subject: fix build in new layout --- m4/ax_boost_base.m4 | 223 +++++++++ m4/ax_boost_program_options.m4 | 106 +++++ m4/ax_boost_serialization.m4 | 115 +++++ m4/gtest.m4 | 61 +++ src/JSON_parser.c | 1012 ++++++++++++++++++++++++++++++++++++++++ src/collapse_weights.cc | 102 ---- src/ibm_model1.cc | 4 - training/collapse_weights.cc | 102 ++++ vest/comb_scorer.h | 17 + vest/error_surface.h | 24 + vest/line_optimizer.h | 44 ++ vest/scorer.h | 51 ++ vest/ter.h | 18 + vest/viterbi_envelope.h | 79 ++++ 14 files changed, 1852 insertions(+), 106 deletions(-) create mode 100644 m4/ax_boost_base.m4 create mode 100644 m4/ax_boost_program_options.m4 create mode 100644 m4/ax_boost_serialization.m4 create mode 100644 m4/gtest.m4 create mode 100644 src/JSON_parser.c delete mode 100644 src/collapse_weights.cc delete mode 100644 src/ibm_model1.cc create mode 100644 training/collapse_weights.cc create mode 100644 vest/comb_scorer.h create mode 100644 vest/error_surface.h create mode 100644 vest/line_optimizer.h create mode 100644 vest/scorer.h create mode 100644 vest/ter.h create mode 100644 vest/viterbi_envelope.h diff --git a/m4/ax_boost_base.m4 b/m4/ax_boost_base.m4 new file mode 100644 index 00000000..2e5afd09 --- /dev/null +++ b/m4/ax_boost_base.m4 @@ -0,0 +1,223 @@ +# =========================================================================== +# http://autoconf-archive.cryp.to/ax_boost_base.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_BOOST_BASE([MINIMUM-VERSION]) +# +# DESCRIPTION +# +# Test for the Boost C++ libraries of a particular version (or newer) +# +# If no path to the installed boost library is given the macro searchs +# under /usr, /usr/local, /opt and /opt/local and evaluates the +# $BOOST_ROOT environment variable. Further documentation is available at +# . +# +# This macro calls: +# +# AC_SUBST(BOOST_CPPFLAGS) / AC_SUBST(BOOST_LDFLAGS) +# +# And sets: +# +# HAVE_BOOST +# +# LAST MODIFICATION +# +# 2008-04-12 +# +# COPYLEFT +# +# Copyright (c) 2008 Thomas Porschberg +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. + +AC_DEFUN([AX_BOOST_BASE], +[ +AC_ARG_WITH([boost], + AS_HELP_STRING([--with-boost@<:@=DIR@:>@], [use boost (default is yes) - it is possible to specify the root directory for boost (optional)]), + [ + if test "$withval" = "no"; then + want_boost="no" + elif test "$withval" = "yes"; then + want_boost="yes" + ac_boost_path="" + else + want_boost="yes" + ac_boost_path="$withval" + fi + ], + [want_boost="yes"]) + + +AC_ARG_WITH([boost-libdir], + AS_HELP_STRING([--with-boost-libdir=LIB_DIR], + [Force given directory for boost libraries. Note that this will overwrite library path detection, so use this parameter only if default library detection fails and you know exactly where your boost libraries are located.]), + [ + if test -d $withval + then + ac_boost_lib_path="$withval" + else + AC_MSG_ERROR(--with-boost-libdir expected directory name) + fi + ], + [ac_boost_lib_path=""] +) + +if test "x$want_boost" = "xyes"; then + boost_lib_version_req=ifelse([$1], ,1.20.0,$1) + boost_lib_version_req_shorten=`expr $boost_lib_version_req : '\([[0-9]]*\.[[0-9]]*\)'` + boost_lib_version_req_major=`expr $boost_lib_version_req : '\([[0-9]]*\)'` + boost_lib_version_req_minor=`expr $boost_lib_version_req : '[[0-9]]*\.\([[0-9]]*\)'` + boost_lib_version_req_sub_minor=`expr $boost_lib_version_req : '[[0-9]]*\.[[0-9]]*\.\([[0-9]]*\)'` + if test "x$boost_lib_version_req_sub_minor" = "x" ; then + boost_lib_version_req_sub_minor="0" + fi + WANT_BOOST_VERSION=`expr $boost_lib_version_req_major \* 100000 \+ $boost_lib_version_req_minor \* 100 \+ $boost_lib_version_req_sub_minor` + AC_MSG_CHECKING(for boostlib >= $boost_lib_version_req) + succeeded=no + + dnl first we check the system location for boost libraries + dnl this location ist chosen if boost libraries are installed with the --layout=system option + dnl or if you install boost with RPM + if test "$ac_boost_path" != ""; then + BOOST_LDFLAGS="-L$ac_boost_path/lib" + BOOST_CPPFLAGS="-I$ac_boost_path/include" + else + for ac_boost_path_tmp in /usr /usr/local /opt /opt/local ; do + if test -d "$ac_boost_path_tmp/include/boost" && test -r "$ac_boost_path_tmp/include/boost"; then + BOOST_LDFLAGS="-L$ac_boost_path_tmp/lib" + BOOST_CPPFLAGS="-I$ac_boost_path_tmp/include" + break; + fi + done + fi + + dnl overwrite ld flags if we have required special directory with + dnl --with-boost-libdir parameter + if test "$ac_boost_lib_path" != ""; then + BOOST_LDFLAGS="-L$ac_boost_lib_path" + fi + + CPPFLAGS_SAVED="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + export CPPFLAGS + + LDFLAGS_SAVED="$LDFLAGS" + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + export LDFLAGS + + AC_LANG_PUSH(C++) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + @%:@include + ]], [[ + #if BOOST_VERSION >= $WANT_BOOST_VERSION + // Everything is okay + #else + # error Boost version is too old + #endif + ]])],[ + AC_MSG_RESULT(yes) + succeeded=yes + found_system=yes + ],[ + ]) + AC_LANG_POP([C++]) + + + + dnl if we found no boost with system layout we search for boost libraries + dnl built and installed without the --layout=system option or for a staged(not installed) version + if test "x$succeeded" != "xyes"; then + _version=0 + if test "$ac_boost_path" != ""; then + if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then + for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do + _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's/\/include\/boost-//' | sed 's/_/./'` + V_CHECK=`expr $_version_tmp \> $_version` + if test "$V_CHECK" = "1" ; then + _version=$_version_tmp + fi + VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` + BOOST_CPPFLAGS="-I$ac_boost_path/include/boost-$VERSION_UNDERSCORE" + done + fi + else + for ac_boost_path in /usr /usr/local /opt /opt/local ; do + if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then + for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do + _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's/\/include\/boost-//' | sed 's/_/./'` + V_CHECK=`expr $_version_tmp \> $_version` + if test "$V_CHECK" = "1" ; then + _version=$_version_tmp + best_path=$ac_boost_path + fi + done + fi + done + + VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` + BOOST_CPPFLAGS="-I$best_path/include/boost-$VERSION_UNDERSCORE" + if test "$ac_boost_lib_path" = "" + then + BOOST_LDFLAGS="-L$best_path/lib" + fi + + if test "x$BOOST_ROOT" != "x"; then + if test -d "$BOOST_ROOT" && test -r "$BOOST_ROOT" && test -d "$BOOST_ROOT/stage/lib" && test -r "$BOOST_ROOT/stage/lib"; then + version_dir=`expr //$BOOST_ROOT : '.*/\(.*\)'` + stage_version=`echo $version_dir | sed 's/boost_//' | sed 's/_/./g'` + stage_version_shorten=`expr $stage_version : '\([[0-9]]*\.[[0-9]]*\)'` + V_CHECK=`expr $stage_version_shorten \>\= $_version` + if test "$V_CHECK" = "1" -a "$ac_boost_lib_path" = "" ; then + AC_MSG_NOTICE(We will use a staged boost library from $BOOST_ROOT) + BOOST_CPPFLAGS="-I$BOOST_ROOT" + BOOST_LDFLAGS="-L$BOOST_ROOT/stage/lib" + fi + fi + fi + fi + + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + export CPPFLAGS + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + export LDFLAGS + + AC_LANG_PUSH(C++) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + @%:@include + ]], [[ + #if BOOST_VERSION >= $WANT_BOOST_VERSION + // Everything is okay + #else + # error Boost version is too old + #endif + ]])],[ + AC_MSG_RESULT(yes) + succeeded=yes + found_system=yes + ],[ + ]) + AC_LANG_POP([C++]) + fi + + if test "$succeeded" != "yes" ; then + if test "$_version" = "0" ; then + AC_MSG_ERROR([[We could not detect the boost libraries (version $boost_lib_version_req_shorten or higher). If you have a staged boost library (still not installed) please specify \$BOOST_ROOT in your environment and do not give a PATH to --with-boost option. If you are sure you have boost installed, then check your version number looking in . See http://randspringer.de/boost for more documentation.]]) + else + AC_MSG_NOTICE([Your boost libraries seems to old (version $_version).]) + fi + else + AC_SUBST(BOOST_CPPFLAGS) + AC_SUBST(BOOST_LDFLAGS) + AC_DEFINE(HAVE_BOOST,,[define if the Boost library is available]) + fi + + CPPFLAGS="$CPPFLAGS_SAVED" + LDFLAGS="$LDFLAGS_SAVED" +fi + +]) diff --git a/m4/ax_boost_program_options.m4 b/m4/ax_boost_program_options.m4 new file mode 100644 index 00000000..303015cc --- /dev/null +++ b/m4/ax_boost_program_options.m4 @@ -0,0 +1,106 @@ +# =========================================================================== +# http://autoconf-archive.cryp.to/ax_boost_program_options.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_BOOST_PROGRAM_OPTIONS +# +# DESCRIPTION +# +# Test for program options library from the Boost C++ libraries. The macro +# requires a preceding call to AX_BOOST_BASE. Further documentation is +# available at . +# +# This macro calls: +# +# AC_SUBST(BOOST_PROGRAM_OPTIONS_LIB) +# +# And sets: +# +# HAVE_BOOST_PROGRAM_OPTIONS +# +# LAST MODIFICATION +# +# 2008-04-12 +# +# COPYLEFT +# +# Copyright (c) 2008 Thomas Porschberg +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. + +AC_DEFUN([AX_BOOST_PROGRAM_OPTIONS], +[ + AC_ARG_WITH([boost-program-options], + AS_HELP_STRING([--with-boost-program-options@<:@=special-lib@:>@], + [use the program options library from boost - it is possible to specify a certain library for the linker + e.g. --with-boost-program-options=boost_program_options-gcc-mt-1_33_1 ]), + [ + if test "$withval" = "no"; then + want_boost="no" + elif test "$withval" = "yes"; then + want_boost="yes" + ax_boost_user_program_options_lib="" + else + want_boost="yes" + ax_boost_user_program_options_lib="$withval" + fi + ], + [want_boost="yes"] + ) + + if test "x$want_boost" = "xyes"; then + AC_REQUIRE([AC_PROG_CC]) + export want_boost + CPPFLAGS_SAVED="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + export CPPFLAGS + LDFLAGS_SAVED="$LDFLAGS" + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + export LDFLAGS + AC_CACHE_CHECK([whether the Boost::Program_Options library is available], + ax_cv_boost_program_options, + [AC_LANG_PUSH(C++) + AC_COMPILE_IFELSE(AC_LANG_PROGRAM([[@%:@include + ]], + [[boost::program_options::options_description generic("Generic options"); + return 0;]]), + ax_cv_boost_program_options=yes, ax_cv_boost_program_options=no) + AC_LANG_POP([C++]) + ]) + if test "$ax_cv_boost_program_options" = yes; then + AC_DEFINE(HAVE_BOOST_PROGRAM_OPTIONS,,[define if the Boost::PROGRAM_OPTIONS library is available]) + BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + if test "x$ax_boost_user_program_options_lib" = "x"; then + for libextension in `ls $BOOSTLIBDIR/libboost_program_options*.{so,a}* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_program_options.*\)\.so.*$;\1;' -e 's;^lib\(boost_program_options.*\)\.a*$;\1;'` ; do + ax_lib=${libextension} + AC_CHECK_LIB($ax_lib, exit, + [BOOST_PROGRAM_OPTIONS_LIB="-l$ax_lib"; AC_SUBST(BOOST_PROGRAM_OPTIONS_LIB) link_program_options="yes"; break], + [link_program_options="no"]) + done + if test "x$link_program_options" != "xyes"; then + for libextension in `ls $BOOSTLIBDIR/boost_program_options*.{dll,a}* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_program_options.*\)\.dll.*$;\1;' -e 's;^\(boost_program_options.*\)\.a*$;\1;'` ; do + ax_lib=${libextension} + AC_CHECK_LIB($ax_lib, exit, + [BOOST_PROGRAM_OPTIONS_LIB="-l$ax_lib"; AC_SUBST(BOOST_PROGRAM_OPTIONS_LIB) link_program_options="yes"; break], + [link_program_options="no"]) + done + fi + else + for ax_lib in $ax_boost_user_program_options_lib boost_program_options-$ax_boost_user_program_options_lib; do + AC_CHECK_LIB($ax_lib, main, + [BOOST_PROGRAM_OPTIONS_LIB="-l$ax_lib"; AC_SUBST(BOOST_PROGRAM_OPTIONS_LIB) link_program_options="yes"; break], + [link_program_options="no"]) + done + fi + if test "x$link_program_options" != "xyes"; then + AC_MSG_ERROR([Could not link against [$ax_lib] !]) + fi + fi + CPPFLAGS="$CPPFLAGS_SAVED" + LDFLAGS="$LDFLAGS_SAVED" + fi +]) diff --git a/m4/ax_boost_serialization.m4 b/m4/ax_boost_serialization.m4 new file mode 100644 index 00000000..1fac364b --- /dev/null +++ b/m4/ax_boost_serialization.m4 @@ -0,0 +1,115 @@ +# =========================================================================== +# http://autoconf-archive.cryp.to/ax_boost_serialization.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_BOOST_SERIALIZATION +# +# DESCRIPTION +# +# Test for Serialization library from the Boost C++ libraries. The macro +# requires a preceding call to AX_BOOST_BASE. Further documentation is +# available at . +# +# This macro calls: +# +# AC_SUBST(BOOST_SERIALIZATION_LIB) +# +# And sets: +# +# HAVE_BOOST_SERIALIZATION +# +# LAST MODIFICATION +# +# 2008-04-12 +# +# COPYLEFT +# +# Copyright (c) 2008 Thomas Porschberg +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. + +AC_DEFUN([AX_BOOST_SERIALIZATION], +[ + AC_ARG_WITH([boost-serialization], + AS_HELP_STRING([--with-boost-serialization@<:@=special-lib@:>@], + [use the Serialization library from boost - it is possible to specify a certain library for the linker + e.g. --with-boost-serialization=boost_serialization-gcc-mt-d-1_33_1 ]), + [ + if test "$withval" = "no"; then + want_boost="no" + elif test "$withval" = "yes"; then + want_boost="yes" + ax_boost_user_serialization_lib="" + else + want_boost="yes" + ax_boost_user_serialization_lib="$withval" + fi + ], + [want_boost="yes"] + ) + + if test "x$want_boost" = "xyes"; then + AC_REQUIRE([AC_PROG_CC]) + CPPFLAGS_SAVED="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + AC_MSG_WARN(BOOST_CPPFLAGS $BOOST_CPPFLAGS) + export CPPFLAGS + + LDFLAGS_SAVED="$LDFLAGS" + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + export LDFLAGS + + AC_CACHE_CHECK(whether the Boost::Serialization library is available, + ax_cv_boost_serialization, + [AC_LANG_PUSH([C++]) + AC_COMPILE_IFELSE(AC_LANG_PROGRAM([[@%:@include + @%:@include + @%:@include + ]], + [[std::ofstream ofs("filename"); + boost::archive::text_oarchive oa(ofs); + return 0; + ]]), + ax_cv_boost_serialization=yes, ax_cv_boost_serialization=no) + AC_LANG_POP([C++]) + ]) + if test "x$ax_cv_boost_serialization" = "xyes"; then + AC_DEFINE(HAVE_BOOST_SERIALIZATION,,[define if the Boost::Serialization library is available]) + BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + if test "x$ax_boost_user_serialization_lib" = "x"; then + for libextension in `ls $BOOSTLIBDIR/libboost_serialization*.{so,a}* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_serialization.*\)\.so.*$;\1;' -e 's;^lib\(boost_serialization.*\)\.a*$;\1;'` ; do + ax_lib=${libextension} + AC_CHECK_LIB($ax_lib, exit, + [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break], + [link_serialization="no"]) + done + if test "x$link_serialization" != "xyes"; then + for libextension in `ls $BOOSTLIBDIR/boost_serialization*.{dll,a}* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_serialization.*\)\.dll.*$;\1;' -e 's;^\(boost_serialization.*\)\.a*$;\1;'` ; do + ax_lib=${libextension} + AC_CHECK_LIB($ax_lib, exit, + [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break], + [link_serialization="no"]) + done + fi + + else + for ax_lib in $ax_boost_user_serialization_lib boost_serialization-$ax_boost_user_serialization_lib; do + AC_CHECK_LIB($ax_lib, main, + [BOOST_SERIALIZATION_LIB="-l$ax_lib"; AC_SUBST(BOOST_SERIALIZATION_LIB) link_serialization="yes"; break], + [link_serialization="no"]) + done + + fi + if test "x$link_serialization" != "xyes"; then + AC_MSG_ERROR(Could not link against $ax_lib !) + fi + fi + + CPPFLAGS="$CPPFLAGS_SAVED" + LDFLAGS="$LDFLAGS_SAVED" + fi +]) diff --git a/m4/gtest.m4 b/m4/gtest.m4 new file mode 100644 index 00000000..ebb488a8 --- /dev/null +++ b/m4/gtest.m4 @@ -0,0 +1,61 @@ +dnl GTEST_LIB_CHECK([minimum version [, +dnl action if found [,action if not found]]]) +dnl +dnl Check for the presence of the Google Test library, optionally at a minimum +dnl version, and indicate a viable version with the HAVE_GTEST flag. It defines +dnl standard variables for substitution including GTEST_CPPFLAGS, +dnl GTEST_CXXFLAGS, GTEST_LDFLAGS, and GTEST_LIBS. It also defines +dnl GTEST_VERSION as the version of Google Test found. Finally, it provides +dnl optional custom action slots in the event GTEST is found or not. +AC_DEFUN([GTEST_LIB_CHECK], +[ +dnl Provide a flag to enable or disable Google Test usage. +AC_ARG_ENABLE([gtest], + [AS_HELP_STRING([--enable-gtest], + [Enable tests using the Google C++ Testing Framework.] + [(Default is enabled.)])], + [], + [enable_gtest=check]) +AC_ARG_VAR([GTEST_CONFIG], + [The exact path of Google Test's 'gtest-config' script.]) +AC_ARG_VAR([GTEST_CPPFLAGS], + [C-like preprocessor flags for Google Test.]) +AC_ARG_VAR([GTEST_CXXFLAGS], + [C++ compile flags for Google Test.]) +AC_ARG_VAR([GTEST_LDFLAGS], + [Linker path and option flags for Google Test.]) +AC_ARG_VAR([GTEST_LIBS], + [Library linking flags for Google Test.]) +AC_ARG_VAR([GTEST_VERSION], + [The version of Google Test available.]) +HAVE_GTEST="no" +AS_IF([test "x$enable_gtest" != "xno"], + [AC_PATH_PROG([GTEST_CONFIG], [gtest-config]) + AS_IF([test -x "$GTEST_CONFIG"], + [AS_IF([test "x$1" != "x"], + [_min_version="--min-version=$1" + AC_MSG_CHECKING([for Google Test at least version >= $1])], + [_min_version="--min-version=0" + AC_MSG_CHECKING([for Google Test])]) + AS_IF([$GTEST_CONFIG $_min_version], + [AC_MSG_RESULT([yes]) + HAVE_GTEST="yes"], + [AC_MSG_RESULT([no])])]) + AS_IF([test "x$HAVE_GTEST" = "xyes"], + [GTEST_CPPFLAGS=$($GTEST_CONFIG --cppflags) + GTEST_CXXFLAGS=$($GTEST_CONFIG --cxxflags) + GTEST_LDFLAGS=$($GTEST_CONFIG --ldflags) + GTEST_LIBS=$($GTEST_CONFIG --libs) + GTEST_VERSION=$($GTEST_CONFIG --version) + AC_DEFINE([HAVE_GTEST],[1],[Defined when Google Test is available.])], + [AS_IF([test "x$enable_gtest" = "xyes"], + [AC_MSG_ERROR([ + The Google C++ Testing Framework was explicitly enabled, but a viable version + could not be found on the system. +])])])]) +AC_SUBST([HAVE_GTEST]) +AM_CONDITIONAL([HAVE_GTEST],[test "x$HAVE_GTEST" = "xyes"]) +AS_IF([test "x$HAVE_GTEST" = "xyes"], + [AS_IF([test "x$2" != "x"],[$2],[:])], + [AS_IF([test "x$3" != "x"],[$3],[:])]) +]) diff --git a/src/JSON_parser.c b/src/JSON_parser.c new file mode 100644 index 00000000..175b7cc9 --- /dev/null +++ b/src/JSON_parser.c @@ -0,0 +1,1012 @@ +/* JSON_parser.c */ + +/* 2007-08-24 */ + +/* +Copyright (c) 2005 JSON.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* + Callbacks, comments, Unicode handling by Jean Gressmann (jean@0x42.de), 2007-2009. + + For the added features the license above applies also. + + Changelog: + 2009-05-17 + Incorporated benrudiak@googlemail.com fix for UTF16 decoding. + + 2009-05-14 + Fixed float parsing bug related to a locale being set that didn't + use '.' as decimal point character (charles@transmissionbt.com). + + 2008-10-14 + Renamed states.IN to states.IT to avoid name clash which IN macro + defined in windef.h (alexey.pelykh@gmail.com) + + 2008-07-19 + Removed some duplicate code & debugging variable (charles@transmissionbt.com) + + 2008-05-28 + Made JSON_value structure ansi C compliant. This bug was report by + trisk@acm.jhu.edu + + 2008-05-20 + Fixed bug reported by charles@transmissionbt.com where the switching + from static to dynamic parse buffer did not copy the static parse + buffer's content. +*/ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "JSON_parser.h" + +#ifdef _MSC_VER +# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */ +# pragma warning(disable:4996) // unsecure sscanf +# endif +#endif + + +#define true 1 +#define false 0 +#define __ -1 /* the universal error code */ + +/* values chosen so that the object size is approx equal to one page (4K) */ +#ifndef JSON_PARSER_STACK_SIZE +# define JSON_PARSER_STACK_SIZE 128 +#endif + +#ifndef JSON_PARSER_PARSE_BUFFER_SIZE +# define JSON_PARSER_PARSE_BUFFER_SIZE 3500 +#endif + +typedef unsigned short UTF16; + +struct JSON_parser_struct { + JSON_parser_callback callback; + void* ctx; + signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually; + UTF16 utf16_high_surrogate; + long depth; + long top; + signed char* stack; + long stack_capacity; + char decimal_point; + char* parse_buffer; + size_t parse_buffer_capacity; + size_t parse_buffer_count; + size_t comment_begin_offset; + signed char static_stack[JSON_PARSER_STACK_SIZE]; + char static_parse_buffer[JSON_PARSER_PARSE_BUFFER_SIZE]; +}; + +#define COUNTOF(x) (sizeof(x)/sizeof(x[0])) + +/* + Characters are mapped into these character classes. This allows for + a significant reduction in the size of the state transition table. +*/ + + + +enum classes { + C_SPACE, /* space */ + C_WHITE, /* other whitespace */ + C_LCURB, /* { */ + C_RCURB, /* } */ + C_LSQRB, /* [ */ + C_RSQRB, /* ] */ + C_COLON, /* : */ + C_COMMA, /* , */ + C_QUOTE, /* " */ + C_BACKS, /* \ */ + C_SLASH, /* / */ + C_PLUS, /* + */ + C_MINUS, /* - */ + C_POINT, /* . */ + C_ZERO , /* 0 */ + C_DIGIT, /* 123456789 */ + C_LOW_A, /* a */ + C_LOW_B, /* b */ + C_LOW_C, /* c */ + C_LOW_D, /* d */ + C_LOW_E, /* e */ + C_LOW_F, /* f */ + C_LOW_L, /* l */ + C_LOW_N, /* n */ + C_LOW_R, /* r */ + C_LOW_S, /* s */ + C_LOW_T, /* t */ + C_LOW_U, /* u */ + C_ABCDF, /* ABCDF */ + C_E, /* E */ + C_ETC, /* everything else */ + C_STAR, /* * */ + NR_CLASSES +}; + +static int ascii_class[128] = { +/* + This array maps the 128 ASCII characters into character classes. + The remaining Unicode characters should be mapped to C_ETC. + Non-whitespace control characters are errors. +*/ + __, __, __, __, __, __, __, __, + __, C_WHITE, C_WHITE, __, __, C_WHITE, __, __, + __, __, __, __, __, __, __, __, + __, __, __, __, __, __, __, __, + + C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_STAR, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, + C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, + C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + + C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, + + C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, + C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC +}; + + +/* + The state codes. +*/ +enum states { + GO, /* start */ + OK, /* ok */ + OB, /* object */ + KE, /* key */ + CO, /* colon */ + VA, /* value */ + AR, /* array */ + ST, /* string */ + ES, /* escape */ + U1, /* u1 */ + U2, /* u2 */ + U3, /* u3 */ + U4, /* u4 */ + MI, /* minus */ + ZE, /* zero */ + IT, /* integer */ + FR, /* fraction */ + E1, /* e */ + E2, /* ex */ + E3, /* exp */ + T1, /* tr */ + T2, /* tru */ + T3, /* true */ + F1, /* fa */ + F2, /* fal */ + F3, /* fals */ + F4, /* false */ + N1, /* nu */ + N2, /* nul */ + N3, /* null */ + C1, /* / */ + C2, /* / * */ + C3, /* * */ + FX, /* *.* *eE* */ + D1, /* second UTF-16 character decoding started by \ */ + D2, /* second UTF-16 character proceeded by u */ + NR_STATES +}; + +enum actions +{ + CB = -10, /* comment begin */ + CE = -11, /* comment end */ + FA = -12, /* false */ + TR = -13, /* false */ + NU = -14, /* null */ + DE = -15, /* double detected by exponent e E */ + DF = -16, /* double detected by fraction . */ + SB = -17, /* string begin */ + MX = -18, /* integer detected by minus */ + ZX = -19, /* integer detected by zero */ + IX = -20, /* integer detected by 1-9 */ + EX = -21, /* next char is escaped */ + UC = -22 /* Unicode character read */ +}; + + +static int state_transition_table[NR_STATES][NR_CLASSES] = { +/* + The state transition table takes the current state and the current symbol, + and returns either a new state or an action. An action is represented as a + negative number. A JSON text is accepted if at the end of the text the + state is OK and if the mode is MODE_DONE. + + white 1-9 ABCDF etc + space | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | * */ +/*start GO*/ {GO,GO,-6,__,-5,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ok OK*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*object OB*/ {OB,OB,__,-9,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*key KE*/ {KE,KE,__,__,__,__,__,__,SB,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*colon CO*/ {CO,CO,__,__,__,__,-2,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*value VA*/ {VA,VA,-6,__,-5,__,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*array AR*/ {AR,AR,-6,__,-5,-7,__,__,SB,__,CB,__,MX,__,ZX,IX,__,__,__,__,__,FA,__,NU,__,__,TR,__,__,__,__,__}, +/*string ST*/ {ST,__,ST,ST,ST,ST,ST,ST,-4,EX,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST,ST}, +/*escape ES*/ {__,__,__,__,__,__,__,__,ST,ST,ST,__,__,__,__,__,__,ST,__,__,__,ST,__,ST,ST,__,ST,U1,__,__,__,__}, +/*u1 U1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U2,U2,U2,U2,U2,U2,U2,U2,__,__,__,__,__,__,U2,U2,__,__}, +/*u2 U2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U3,U3,U3,U3,U3,U3,U3,U3,__,__,__,__,__,__,U3,U3,__,__}, +/*u3 U3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,U4,U4,U4,U4,U4,U4,U4,U4,__,__,__,__,__,__,U4,U4,__,__}, +/*u4 U4*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,UC,UC,UC,UC,UC,UC,UC,UC,__,__,__,__,__,__,UC,UC,__,__}, +/*minus MI*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,ZE,IT,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*zero ZE*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*int IT*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,DF,IT,IT,__,__,__,__,DE,__,__,__,__,__,__,__,__,DE,__,__}, +/*frac FR*/ {OK,OK,__,-8,__,-7,__,-3,__,__,CB,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*e E1*/ {__,__,__,__,__,__,__,__,__,__,__,E2,E2,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*ex E2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*exp E3*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,E3,E3,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*tr T1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T2,__,__,__,__,__,__,__}, +/*tru T2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,T3,__,__,__,__}, +/*true T3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*fa F1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*fal F2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F3,__,__,__,__,__,__,__,__,__}, +/*fals F3*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,F4,__,__,__,__,__,__}, +/*false F4*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__,__,__}, +/*nu N1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N2,__,__,__,__}, +/*nul N2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,N3,__,__,__,__,__,__,__,__,__}, +/*null N3*/ {__,__,__,__,__,__,__,__,__,__,CB,__,__,__,__,__,__,__,__,__,__,__,OK,__,__,__,__,__,__,__,__,__}, +/*/ C1*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,C2}, +/*/* C2*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/** C3*/ {C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,CE,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C3}, +/*_. FX*/ {OK,OK,__,-8,__,-7,__,-3,__,__,__,__,__,__,FR,FR,__,__,__,__,E1,__,__,__,__,__,__,__,__,E1,__,__}, +/*\ D1*/ {__,__,__,__,__,__,__,__,__,D2,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__}, +/*\ D2*/ {__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,U1,__,__,__,__}, +}; + + +/* + These modes can be pushed on the stack. +*/ +enum modes { + MODE_ARRAY = 1, + MODE_DONE = 2, + MODE_KEY = 3, + MODE_OBJECT = 4 +}; + +static int +push(JSON_parser jc, int mode) +{ +/* + Push a mode onto the stack. Return false if there is overflow. +*/ + jc->top += 1; + if (jc->depth < 0) { + if (jc->top >= jc->stack_capacity) { + size_t bytes_to_allocate; + jc->stack_capacity *= 2; + bytes_to_allocate = jc->stack_capacity * sizeof(jc->static_stack[0]); + if (jc->stack == &jc->static_stack[0]) { + jc->stack = (signed char*)malloc(bytes_to_allocate); + memcpy(jc->stack, jc->static_stack, sizeof(jc->static_stack)); + } else { + jc->stack = (signed char*)realloc(jc->stack, bytes_to_allocate); + } + } + } else { + if (jc->top >= jc->depth) { + return false; + } + } + + jc->stack[jc->top] = mode; + return true; +} + + +static int +pop(JSON_parser jc, int mode) +{ +/* + Pop the stack, assuring that the current mode matches the expectation. + Return false if there is underflow or if the modes mismatch. +*/ + if (jc->top < 0 || jc->stack[jc->top] != mode) { + return false; + } + jc->top -= 1; + return true; +} + + +#define parse_buffer_clear(jc) \ + do {\ + jc->parse_buffer_count = 0;\ + jc->parse_buffer[0] = 0;\ + } while (0) + +#define parse_buffer_pop_back_char(jc)\ + do {\ + assert(jc->parse_buffer_count >= 1);\ + --jc->parse_buffer_count;\ + jc->parse_buffer[jc->parse_buffer_count] = 0;\ + } while (0) + +void delete_JSON_parser(JSON_parser jc) +{ + if (jc) { + if (jc->stack != &jc->static_stack[0]) { + free((void*)jc->stack); + } + if (jc->parse_buffer != &jc->static_parse_buffer[0]) { + free((void*)jc->parse_buffer); + } + free((void*)jc); + } +} + + +JSON_parser +new_JSON_parser(JSON_config* config) +{ +/* + new_JSON_parser starts the checking process by constructing a JSON_parser + object. It takes a depth parameter that restricts the level of maximum + nesting. + + To continue the process, call JSON_parser_char for each character in the + JSON text, and then call JSON_parser_done to obtain the final result. + These functions are fully reentrant. +*/ + + int depth = 0; + JSON_config default_config; + + JSON_parser jc = (JSON_parser)malloc(sizeof(struct JSON_parser_struct)); + + memset(jc, 0, sizeof(*jc)); + + + /* initialize configuration */ + init_JSON_config(&default_config); + + /* set to default configuration if none was provided */ + if (config == NULL) { + config = &default_config; + } + + depth = config->depth; + + /* We need to be able to push at least one object */ + if (depth == 0) { + depth = 1; + } + + jc->state = GO; + jc->top = -1; + + /* Do we want non-bound stack? */ + if (depth > 0) { + jc->stack_capacity = depth; + jc->depth = depth; + if (depth <= (int)COUNTOF(jc->static_stack)) { + jc->stack = &jc->static_stack[0]; + } else { + jc->stack = (signed char*)malloc(jc->stack_capacity * sizeof(jc->static_stack[0])); + } + } else { + jc->stack_capacity = COUNTOF(jc->static_stack); + jc->depth = -1; + jc->stack = &jc->static_stack[0]; + } + + /* set parser to start */ + push(jc, MODE_DONE); + + /* set up the parse buffer */ + jc->parse_buffer = &jc->static_parse_buffer[0]; + jc->parse_buffer_capacity = COUNTOF(jc->static_parse_buffer); + parse_buffer_clear(jc); + + /* set up callback, comment & float handling */ + jc->callback = config->callback; + jc->ctx = config->callback_ctx; + jc->allow_comments = config->allow_comments != 0; + jc->handle_floats_manually = config->handle_floats_manually != 0; + + /* set up decimal point */ + jc->decimal_point = *localeconv()->decimal_point; + + return jc; +} + +static void grow_parse_buffer(JSON_parser jc) +{ + size_t bytes_to_allocate; + jc->parse_buffer_capacity *= 2; + bytes_to_allocate = jc->parse_buffer_capacity * sizeof(jc->parse_buffer[0]); + if (jc->parse_buffer == &jc->static_parse_buffer[0]) { + jc->parse_buffer = (char*)malloc(bytes_to_allocate); + memcpy(jc->parse_buffer, jc->static_parse_buffer, jc->parse_buffer_count); + } else { + jc->parse_buffer = (char*)realloc(jc->parse_buffer, bytes_to_allocate); + } +} + +#define parse_buffer_push_back_char(jc, c)\ + do {\ + if (jc->parse_buffer_count + 1 >= jc->parse_buffer_capacity) grow_parse_buffer(jc);\ + jc->parse_buffer[jc->parse_buffer_count++] = c;\ + jc->parse_buffer[jc->parse_buffer_count] = 0;\ + } while (0) + +#define assert_is_non_container_type(jc) \ + assert( \ + jc->type == JSON_T_NULL || \ + jc->type == JSON_T_FALSE || \ + jc->type == JSON_T_TRUE || \ + jc->type == JSON_T_FLOAT || \ + jc->type == JSON_T_INTEGER || \ + jc->type == JSON_T_STRING) + + +static int parse_parse_buffer(JSON_parser jc) +{ + if (jc->callback) { + JSON_value value, *arg = NULL; + + if (jc->type != JSON_T_NONE) { + assert_is_non_container_type(jc); + + switch(jc->type) { + case JSON_T_FLOAT: + arg = &value; + if (jc->handle_floats_manually) { + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + } else { + /*sscanf(jc->parse_buffer, "%Lf", &value.vu.float_value);*/ + + /* not checking with end pointer b/c there may be trailing ws */ + value.vu.float_value = strtold(jc->parse_buffer, NULL); + } + break; + case JSON_T_INTEGER: + arg = &value; + sscanf(jc->parse_buffer, JSON_PARSER_INTEGER_SSCANF_TOKEN, &value.vu.integer_value); + break; + case JSON_T_STRING: + arg = &value; + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + break; + } + + if (!(*jc->callback)(jc->ctx, jc->type, arg)) { + return false; + } + } + } + + parse_buffer_clear(jc); + + return true; +} + +#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) +#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00) +#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) +static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 }; + +static int decode_unicode_char(JSON_parser jc) +{ + int i; + unsigned uc = 0; + char* p; + int trail_bytes; + + assert(jc->parse_buffer_count >= 6); + + p = &jc->parse_buffer[jc->parse_buffer_count - 4]; + + for (i = 12; i >= 0; i -= 4, ++p) { + unsigned x = *p; + + if (x >= 'a') { + x -= ('a' - 10); + } else if (x >= 'A') { + x -= ('A' - 10); + } else { + x &= ~0x30u; + } + + assert(x < 16); + + uc |= x << i; + } + + /* clear UTF-16 char from buffer */ + jc->parse_buffer_count -= 6; + jc->parse_buffer[jc->parse_buffer_count] = 0; + + /* attempt decoding ... */ + if (jc->utf16_high_surrogate) { + if (IS_LOW_SURROGATE(uc)) { + uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc); + trail_bytes = 3; + jc->utf16_high_surrogate = 0; + } else { + /* high surrogate without a following low surrogate */ + return false; + } + } else { + if (uc < 0x80) { + trail_bytes = 0; + } else if (uc < 0x800) { + trail_bytes = 1; + } else if (IS_HIGH_SURROGATE(uc)) { + /* save the high surrogate and wait for the low surrogate */ + jc->utf16_high_surrogate = uc; + return true; + } else if (IS_LOW_SURROGATE(uc)) { + /* low surrogate without a preceding high surrogate */ + return false; + } else { + trail_bytes = 2; + } + } + + jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]); + + for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) { + jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80); + } + + jc->parse_buffer[jc->parse_buffer_count] = 0; + + return true; +} + +static int add_escaped_char_to_parse_buffer(JSON_parser jc, int next_char) +{ + jc->escaped = 0; + /* remove the backslash */ + parse_buffer_pop_back_char(jc); + switch(next_char) { + case 'b': + parse_buffer_push_back_char(jc, '\b'); + break; + case 'f': + parse_buffer_push_back_char(jc, '\f'); + break; + case 'n': + parse_buffer_push_back_char(jc, '\n'); + break; + case 'r': + parse_buffer_push_back_char(jc, '\r'); + break; + case 't': + parse_buffer_push_back_char(jc, '\t'); + break; + case '"': + parse_buffer_push_back_char(jc, '"'); + break; + case '\\': + parse_buffer_push_back_char(jc, '\\'); + break; + case '/': + parse_buffer_push_back_char(jc, '/'); + break; + case 'u': + parse_buffer_push_back_char(jc, '\\'); + parse_buffer_push_back_char(jc, 'u'); + break; + default: + return false; + } + + return true; +} + +#define add_char_to_parse_buffer(jc, next_char, next_class) \ + do { \ + if (jc->escaped) { \ + if (!add_escaped_char_to_parse_buffer(jc, next_char)) \ + return false; \ + } else if (!jc->comment) { \ + if ((jc->type != JSON_T_NONE) | !((next_class == C_SPACE) | (next_class == C_WHITE)) /* non-white-space */) { \ + parse_buffer_push_back_char(jc, (char)next_char); \ + } \ + } \ + } while (0) + + +#define assert_type_isnt_string_null_or_bool(jc) \ + assert(jc->type != JSON_T_FALSE); \ + assert(jc->type != JSON_T_TRUE); \ + assert(jc->type != JSON_T_NULL); \ + assert(jc->type != JSON_T_STRING) + + +int +JSON_parser_char(JSON_parser jc, int next_char) +{ +/* + After calling new_JSON_parser, call this function for each character (or + partial character) in your JSON text. It can accept UTF-8, UTF-16, or + UTF-32. It returns true if things are looking ok so far. If it rejects the + text, it returns false. +*/ + int next_class, next_state; + +/* + Determine the character's class. +*/ + if (next_char < 0) { + return false; + } + if (next_char >= 128) { + next_class = C_ETC; + } else { + next_class = ascii_class[next_char]; + if (next_class <= __) { + return false; + } + } + + add_char_to_parse_buffer(jc, next_char, next_class); + +/* + Get the next state from the state transition table. +*/ + next_state = state_transition_table[jc->state][next_class]; + if (next_state >= 0) { +/* + Change the state. +*/ + jc->state = next_state; + } else { +/* + Or perform one of the actions. +*/ + switch (next_state) { +/* Unicode character */ + case UC: + if(!decode_unicode_char(jc)) { + return false; + } + /* check if we need to read a second UTF-16 char */ + if (jc->utf16_high_surrogate) { + jc->state = D1; + } else { + jc->state = ST; + } + break; +/* escaped char */ + case EX: + jc->escaped = 1; + jc->state = ES; + break; +/* integer detected by minus */ + case MX: + jc->type = JSON_T_INTEGER; + jc->state = MI; + break; +/* integer detected by zero */ + case ZX: + jc->type = JSON_T_INTEGER; + jc->state = ZE; + break; +/* integer detected by 1-9 */ + case IX: + jc->type = JSON_T_INTEGER; + jc->state = IT; + break; + +/* floating point number detected by exponent*/ + case DE: + assert_type_isnt_string_null_or_bool(jc); + jc->type = JSON_T_FLOAT; + jc->state = E1; + break; + +/* floating point number detected by fraction */ + case DF: + assert_type_isnt_string_null_or_bool(jc); + if (!jc->handle_floats_manually) { +/* + Some versions of strtod (which underlies sscanf) don't support converting + C-locale formated floating point values. +*/ + assert(jc->parse_buffer[jc->parse_buffer_count-1] == '.'); + jc->parse_buffer[jc->parse_buffer_count-1] = jc->decimal_point; + } + jc->type = JSON_T_FLOAT; + jc->state = FX; + break; +/* string begin " */ + case SB: + parse_buffer_clear(jc); + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_STRING; + jc->state = ST; + break; + +/* n */ + case NU: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_NULL; + jc->state = N1; + break; +/* f */ + case FA: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_FALSE; + jc->state = F1; + break; +/* t */ + case TR: + assert(jc->type == JSON_T_NONE); + jc->type = JSON_T_TRUE; + jc->state = T1; + break; + +/* closing comment */ + case CE: + jc->comment = 0; + assert(jc->parse_buffer_count == 0); + assert(jc->type == JSON_T_NONE); + jc->state = jc->before_comment_state; + break; + +/* opening comment */ + case CB: + if (!jc->allow_comments) { + return false; + } + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + assert(jc->parse_buffer_count == 0); + assert(jc->type != JSON_T_STRING); + switch (jc->stack[jc->top]) { + case MODE_ARRAY: + case MODE_OBJECT: + switch(jc->state) { + case VA: + case AR: + jc->before_comment_state = jc->state; + break; + default: + jc->before_comment_state = OK; + break; + } + break; + default: + jc->before_comment_state = jc->state; + break; + } + jc->type = JSON_T_NONE; + jc->state = C1; + jc->comment = 1; + break; +/* empty } */ + case -9: + parse_buffer_clear(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { + return false; + } + if (!pop(jc, MODE_KEY)) { + return false; + } + jc->state = OK; + break; + +/* } */ case -8: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_END, NULL)) { + return false; + } + if (!pop(jc, MODE_OBJECT)) { + return false; + } + jc->type = JSON_T_NONE; + jc->state = OK; + break; + +/* ] */ case -7: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_END, NULL)) { + return false; + } + if (!pop(jc, MODE_ARRAY)) { + return false; + } + + jc->type = JSON_T_NONE; + jc->state = OK; + break; + +/* { */ case -6: + parse_buffer_pop_back_char(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_OBJECT_BEGIN, NULL)) { + return false; + } + if (!push(jc, MODE_KEY)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = OB; + break; + +/* [ */ case -5: + parse_buffer_pop_back_char(jc); + if (jc->callback && !(*jc->callback)(jc->ctx, JSON_T_ARRAY_BEGIN, NULL)) { + return false; + } + if (!push(jc, MODE_ARRAY)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = AR; + break; + +/* string end " */ case -4: + parse_buffer_pop_back_char(jc); + switch (jc->stack[jc->top]) { + case MODE_KEY: + assert(jc->type == JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = CO; + + if (jc->callback) { + JSON_value value; + value.vu.str.value = jc->parse_buffer; + value.vu.str.length = jc->parse_buffer_count; + if (!(*jc->callback)(jc->ctx, JSON_T_KEY, &value)) { + return false; + } + } + parse_buffer_clear(jc); + break; + case MODE_ARRAY: + case MODE_OBJECT: + assert(jc->type == JSON_T_STRING); + if (!parse_parse_buffer(jc)) { + return false; + } + jc->type = JSON_T_NONE; + jc->state = OK; + break; + default: + return false; + } + break; + +/* , */ case -3: + parse_buffer_pop_back_char(jc); + if (!parse_parse_buffer(jc)) { + return false; + } + switch (jc->stack[jc->top]) { + case MODE_OBJECT: +/* + A comma causes a flip from object mode to key mode. +*/ + if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { + return false; + } + assert(jc->type != JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = KE; + break; + case MODE_ARRAY: + assert(jc->type != JSON_T_STRING); + jc->type = JSON_T_NONE; + jc->state = VA; + break; + default: + return false; + } + break; + +/* : */ case -2: +/* + A colon causes a flip from key mode to object mode. +*/ + parse_buffer_pop_back_char(jc); + if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { + return false; + } + assert(jc->type == JSON_T_NONE); + jc->state = VA; + break; +/* + Bad action. +*/ + default: + return false; + } + } + return true; +} + + +int +JSON_parser_done(JSON_parser jc) +{ + const int result = jc->state == OK && pop(jc, MODE_DONE); + + return result; +} + + +int JSON_parser_is_legal_white_space_string(const char* s) +{ + int c, char_class; + + if (s == NULL) { + return false; + } + + for (; *s; ++s) { + c = *s; + + if (c < 0 || c >= 128) { + return false; + } + + char_class = ascii_class[c]; + + if (char_class != C_SPACE && char_class != C_WHITE) { + return false; + } + } + + return true; +} + + + +void init_JSON_config(JSON_config* config) +{ + if (config) { + memset(config, 0, sizeof(*config)); + + config->depth = JSON_PARSER_STACK_SIZE - 1; + } +} diff --git a/src/collapse_weights.cc b/src/collapse_weights.cc deleted file mode 100644 index 5e0f3f72..00000000 --- a/src/collapse_weights.cc +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "prob.h" -#include "filelib.h" -#include "trule.h" -#include "weights.h" - -namespace po = boost::program_options; -using namespace std; - -typedef std::tr1::unordered_map, prob_t, boost::hash > > MarginalMap; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("grammar,g", po::value(), "Grammar file") - ("weights,w", po::value(), "Weights file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - const string cfg = (*conf)["config"].as(); - cerr << "Configuration file: " << cfg << endl; - ifstream config(cfg.c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string wfile = conf["weights"].as(); - const string gfile = conf["grammar"].as(); - Weights wm; - wm.InitFromFile(wfile); - vector w; - wm.InitVector(&w); - MarginalMap e_tots; - MarginalMap f_tots; - prob_t tot; - { - ReadFile rf(gfile); - assert(*rf.stream()); - istream& in = *rf.stream(); - cerr << "Computing marginals...\n"; - int lc = 0; - while(in) { - string line; - getline(in, line); - ++lc; - if (line.empty()) continue; - TRule tr(line, true); - if (tr.GetFeatureValues().empty()) - cerr << "Line " << lc << ": empty features - may introduce bias\n"; - prob_t prob; - prob.logeq(tr.GetFeatureValues().dot(w)); - e_tots[tr.e_] += prob; - f_tots[tr.f_] += prob; - tot += prob; - } - } - bool normalized = (fabs(log(tot)) < 0.001); - cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl; - ReadFile rf(gfile); - istream&in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - TRule tr(line, true); - const double lp = tr.GetFeatureValues().dot(w); - if (isinf(lp)) { continue; } - tr.scores_.clear(); - - cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot); - if (!normalized) { - cout << ";ZF_and_E=" << lp; - } - cout << ";F_given_E=" << lp - log(e_tots[tr.e_]) - << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl; - } - return 0; -} - diff --git a/src/ibm_model1.cc b/src/ibm_model1.cc deleted file mode 100644 index eb9c617c..00000000 --- a/src/ibm_model1.cc +++ /dev/null @@ -1,4 +0,0 @@ -#include - - - diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc new file mode 100644 index 00000000..5e0f3f72 --- /dev/null +++ b/training/collapse_weights.cc @@ -0,0 +1,102 @@ +#include +#include +#include + +#include +#include +#include + +#include "prob.h" +#include "filelib.h" +#include "trule.h" +#include "weights.h" + +namespace po = boost::program_options; +using namespace std; + +typedef std::tr1::unordered_map, prob_t, boost::hash > > MarginalMap; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("grammar,g", po::value(), "Grammar file") + ("weights,w", po::value(), "Weights file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + const string cfg = (*conf)["config"].as(); + cerr << "Configuration file: " << cfg << endl; + ifstream config(cfg.c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string wfile = conf["weights"].as(); + const string gfile = conf["grammar"].as(); + Weights wm; + wm.InitFromFile(wfile); + vector w; + wm.InitVector(&w); + MarginalMap e_tots; + MarginalMap f_tots; + prob_t tot; + { + ReadFile rf(gfile); + assert(*rf.stream()); + istream& in = *rf.stream(); + cerr << "Computing marginals...\n"; + int lc = 0; + while(in) { + string line; + getline(in, line); + ++lc; + if (line.empty()) continue; + TRule tr(line, true); + if (tr.GetFeatureValues().empty()) + cerr << "Line " << lc << ": empty features - may introduce bias\n"; + prob_t prob; + prob.logeq(tr.GetFeatureValues().dot(w)); + e_tots[tr.e_] += prob; + f_tots[tr.f_] += prob; + tot += prob; + } + } + bool normalized = (fabs(log(tot)) < 0.001); + cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl; + ReadFile rf(gfile); + istream&in = *rf.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + TRule tr(line, true); + const double lp = tr.GetFeatureValues().dot(w); + if (isinf(lp)) { continue; } + tr.scores_.clear(); + + cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot); + if (!normalized) { + cout << ";ZF_and_E=" << lp; + } + cout << ";F_given_E=" << lp - log(e_tots[tr.e_]) + << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl; + } + return 0; +} + diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h new file mode 100644 index 00000000..70b1ec75 --- /dev/null +++ b/vest/comb_scorer.h @@ -0,0 +1,17 @@ +#ifndef _COMB_SCORER_ +#define _COMB_SCORER_ + +#include "scorer.h" + +class BLEUTERCombinationScorer : public SentenceScorer { + public: + BLEUTERCombinationScorer(const std::vector >& refs); + ~BLEUTERCombinationScorer(); + Score* ScoreCandidate(const std::vector& hyp) const; + static Score* ScoreFromString(const std::string& in); + private: + SentenceScorer* bleu_; + SentenceScorer* ter_; +}; + +#endif diff --git a/vest/error_surface.h b/vest/error_surface.h new file mode 100644 index 00000000..a8734f54 --- /dev/null +++ b/vest/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include +#include + +#include "scorer.h" + +class Score; + +struct ErrorSegment { + double x; + Score* delta; + ErrorSegment() : x(0), delta(NULL) {} +}; + +class ErrorSurface : public std::vector { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(ScoreType type, const std::string& in); +}; + +#endif diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h new file mode 100644 index 00000000..43164360 --- /dev/null +++ b/vest/line_optimizer.h @@ -0,0 +1,44 @@ +#ifndef LINE_OPTIMIZER_H_ +#define LINE_OPTIMIZER_H_ + +#include + +#include "error_surface.h" +#include "sampler.h" + +template class SparseVector; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error surface and find (the middle of) the best segment + static double LineOptimize( + const std::vector& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. + static void RandomUnitVector(const std::vector& dimensions, + SparseVector* axis, + RandomNumberGenerator* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. + static void CreateOptimizationDirections( + const std::vector& primary, + int additional_random_directions, + RandomNumberGenerator* rng, + std::vector >* dirs); + +}; + +#endif diff --git a/vest/scorer.h b/vest/scorer.h new file mode 100644 index 00000000..b0bba640 --- /dev/null +++ b/vest/scorer.h @@ -0,0 +1,51 @@ +#ifndef SCORER_H_ +#define SCORER_H_ + +#include +#include + +#include "wordid.h" + +class ViterbiEnvelope; +class ErrorSurface; + +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER }; +ScoreType ScoreTypeFromString(const std::string& st); + +class Score { + public: + virtual ~Score(); + virtual float ComputeScore() const = 0; + virtual void ScoreDetails(std::string* details) const = 0; + virtual void PlusEquals(const Score& rhs) = 0; + virtual void Subtract(const Score& rhs, Score* res) const = 0; + virtual Score* GetZero() const = 0; + virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta + // to another score results in no score change + // under any circumstances + virtual void Encode(std::string* out) const = 0; +}; + +class SentenceScorer { + public: + virtual ~SentenceScorer(); + void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es) const; + virtual Score* ScoreCandidate(const std::vector& hyp) const = 0; + static Score* CreateScoreFromString(const ScoreType type, const std::string& in); + static SentenceScorer* CreateSentenceScorer(const ScoreType type, + const std::vector >& refs); +}; + +class DocScorer { + public: + ~DocScorer(); + DocScorer( + const ScoreType type, + const std::vector& ref_files); + int size() const { return scorers_.size(); } + const SentenceScorer* operator[](size_t i) const { return scorers_[i]; } + private: + std::vector scorers_; +}; + +#endif diff --git a/vest/ter.h b/vest/ter.h new file mode 100644 index 00000000..fe4ba36c --- /dev/null +++ b/vest/ter.h @@ -0,0 +1,18 @@ +#ifndef _TER_H_ +#define _TER_H_ + +#include "scorer.h" + +class TERScorerImpl; + +class TERScorer : public SentenceScorer { + public: + TERScorer(const std::vector >& references); + ~TERScorer(); + Score* ScoreCandidate(const std::vector& hyp) const; + static Score* ScoreFromString(const std::string& data); + private: + std::vector impl_; +}; + +#endif diff --git a/vest/viterbi_envelope.h b/vest/viterbi_envelope.h new file mode 100644 index 00000000..39d2b537 --- /dev/null +++ b/vest/viterbi_envelope.h @@ -0,0 +1,79 @@ +#ifndef _VITERBI_ENVELOPE_H_ +#define _VITERBI_ENVELOPE_H_ + +#include +#include +#include + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits::infinity(); +static const double kPlusInfinity = std::numeric_limits::infinity(); + +struct Segment { + Segment() : x(), m(), b() {} + Segment(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b) {} + Segment(double _x, double _m, double _b, const boost::shared_ptr& p1_, const boost::shared_ptr& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_) {} + Segment(double _m, double _b, TRulePtr _rule) : + x(kMinusInfinity), m(_m), b(_b), rule(_rule) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr p1; + boost::shared_ptr p2; + + // only Segments created from an edge using the ViterbiEnvelopeWeightFunction + // have rules + TRulePtr rule; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ViterbiEnvelope + void ConstructTranslation(std::vector* trans) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ViterbiEnvelope { + // create semiring zero + ViterbiEnvelope() : is_sorted(true) {} // zero + // for debugging: + ViterbiEnvelope(const std::vector >& s) : segs(s) { Sort(); } + // create semiring 1 or 0 + explicit ViterbiEnvelope(int i); + ViterbiEnvelope(int n, Segment* seg) : is_sorted(true), segs(n, boost::shared_ptr(seg)) {} + const ViterbiEnvelope& operator+=(const ViterbiEnvelope& other); + const ViterbiEnvelope& operator*=(const ViterbiEnvelope& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (segs[0]->b == 0.0 && segs[0]->m == 0.0) && (!segs[0]->rule); } + const std::vector >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return segs; + } + size_t size() const { return segs.size(); } + + private: + bool IsEdgeEnvelope() const { + return segs.size() == 1 && segs[0]->rule; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector > segs; +}; +std::ostream& operator<<(std::ostream& os, const ViterbiEnvelope& env); + +struct ViterbiEnvelopeWeightFunction { + ViterbiEnvelopeWeightFunction(const SparseVector& ori, + const SparseVector& dir) : origin(ori), direction(dir) {} + ViterbiEnvelope operator()(const Hypergraph::Edge& e) const; + const SparseVector origin; + const SparseVector direction; +}; + +#endif -- cgit v1.2.3