From 9d7167751a3712a79ad356764d803106a71ce5e3 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 15 Jan 2013 01:20:00 -0500 Subject: corpus files --- corpus/add-self-translations.pl | 2 +- corpus/filter-length.pl | 6 ++++-- corpus/paste-files.pl | 12 +++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'corpus') diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl index 153bc454..d707ce29 100755 --- a/corpus/add-self-translations.pl +++ b/corpus/add-self-translations.pl @@ -6,7 +6,7 @@ use strict; my %df; my %def; while(<>) { - print; +# print; chomp; my ($sf, $se) = split / \|\|\| /; die "Format error: $_\n" unless defined $sf && defined $se; diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 70032ca7..3cfa40cc 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,8 +3,8 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 99; # discard a sentence if it is longer than this -my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? ############################################################################## @@ -128,6 +128,8 @@ while() { next; } print; + } else { + print; } $to++; } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 24c70599..0b788386 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -17,6 +17,7 @@ for my $file (@ARGV) { binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $bad = 0; my $lc = 0; my $done = 0; my $fl = 0; @@ -34,7 +35,15 @@ while(1) { last; } chomp $r; - die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + if ($r =~ /\|\|\|/) { + $r = ''; + $bad++; + } + warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + $r =~ s/\|\|\|/ /g; + $r =~ s/ +//g; + $r =~ s/^ //; + $r =~ s/ $//; $anum++; push @line, $r; } @@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) { my $r = <$fh>; die "Mismatched number of lines.\n" if defined $r; } +print STDERR "Bad lines containing ||| were $bad\n"; -- cgit v1.2.3 From 9e36263f64d6f5150f1b552dd77bde971d605376 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 19 Jan 2013 19:09:48 -0500 Subject: updated version of boost.m4 and automatically build kenneth's LM builder --- Makefile.am | 2 + configure.ac | 7 +- corpus/cut-corpus.pl | 2 +- klm/lm/builder/Makefile.am | 28 +++ klm/util/Makefile.am | 2 +- klm/util/double-conversion/Makefile.am | 2 +- klm/util/stream/Makefile.am | 20 ++ klm/util/stream/sort.hh | 3 +- m4/boost.m4 | 322 +++++++++++++++++++++++++-------- 9 files changed, 311 insertions(+), 77 deletions(-) create mode 100644 klm/lm/builder/Makefile.am create mode 100644 klm/util/stream/Makefile.am (limited to 'corpus') diff --git a/Makefile.am b/Makefile.am index c2444928..17190d27 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,8 +5,10 @@ SUBDIRS = \ utils \ mteval \ klm/util/double-conversion \ + klm/util/stream \ klm/util \ klm/lm \ + klm/lm/builder \ klm/search \ decoder \ training \ diff --git a/configure.ac b/configure.ac index d6030752..a1e5ad84 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[2013-01-15]) +AC_INIT([cdec],[2013-01-19]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -15,7 +15,10 @@ BOOST_REQUIRE([1.44]) BOOST_PROGRAM_OPTIONS BOOST_SYSTEM BOOST_SERIALIZATION +BOOST_CHRONO +BOOST_TIMER BOOST_TEST +BOOST_THREADS AM_PATH_PYTHON AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H)) AC_CHECK_LIB(dl, dlopen) @@ -111,8 +114,10 @@ AC_CONFIG_FILES([word-aligner/Makefile]) # KenLM stuff AC_CONFIG_FILES([klm/util/double-conversion/Makefile]) +AC_CONFIG_FILES([klm/util/stream/Makefile]) AC_CONFIG_FILES([klm/util/Makefile]) AC_CONFIG_FILES([klm/lm/Makefile]) +AC_CONFIG_FILES([klm/lm/builder/Makefile]) AC_CONFIG_FILES([klm/search/Makefile]) # training stuff diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index 7daac0e2..0af3b23c 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -22,7 +22,7 @@ for my $ff (@ind) { while(<>) { chomp; - my @fields = split / \|\|\| /; + my @fields = split /\s*\|\|\|\s*/; my @sf; for my $i (@o) { my $y = $fields[$i]; diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am new file mode 100644 index 00000000..00444256 --- /dev/null +++ b/klm/lm/builder/Makefile.am @@ -0,0 +1,28 @@ +bin_PROGRAMS = builder + +builder_SOURCES = \ + main.cc \ + adjust_counts.cc \ + adjust_counts.hh \ + corpus_count.cc \ + corpus_count.hh \ + discount.hh \ + header_info.hh \ + initial_probabilities.cc \ + initial_probabilities.hh \ + interpolate.cc \ + interpolate.hh \ + joint_order.hh \ + multi_stream.hh \ + ngram.hh \ + ngram_stream.hh \ + pipeline.cc \ + pipeline.hh \ + print.cc \ + print.hh \ + sort.hh + +builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_TIMER_LIBS) $(BOOST_CHRONO_LIBS) $(BOOST_THREAD_LIBS) + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 294ebc0a..248cc844 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -54,4 +54,4 @@ libklm_util_a_SOURCES = \ string_piece.cc \ usage.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/double-conversion/Makefile.am b/klm/util/double-conversion/Makefile.am index eb6616f7..dfcfb009 100644 --- a/klm/util/double-conversion/Makefile.am +++ b/klm/util/double-conversion/Makefile.am @@ -20,4 +20,4 @@ libklm_util_double_a_SOURCES = \ fixed-dtoa.cc \ strtod.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/Makefile.am b/klm/util/stream/Makefile.am new file mode 100644 index 00000000..f18cbedb --- /dev/null +++ b/klm/util/stream/Makefile.am @@ -0,0 +1,20 @@ +noinst_LIBRARIES = libklm_util_stream.a + +libklm_util_stream_a_SOURCES = \ + block.hh \ + chain.cc \ + chain.hh \ + config.hh \ + io.cc \ + io.hh \ + line_input.cc \ + line_input.hh \ + multi_progress.cc \ + multi_progress.hh \ + sort.hh \ + stream.hh \ + timer.hh + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + +#-I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index be6c11ea..df57fa41 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,7 +259,8 @@ template class MergingReader { while (in_offsets_->RemainingBlocks()) { // Use bigger buffers if there's less remaining. - uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks()); + uint64_t per_buffer = std::max(static_cast(buffer_size_), + static_cast(total_memory_ / in_offsets_->RemainingBlocks())); per_buffer -= per_buffer % entry_size; assert(per_buffer); diff --git a/m4/boost.m4 b/m4/boost.m4 index 7e0ed075..027e039b 100644 --- a/m4/boost.m4 +++ b/m4/boost.m4 @@ -1,5 +1,5 @@ # boost.m4: Locate Boost headers and libraries for autoconf-based projects. -# Copyright (C) 2007, 2008, 2009 Benoit Sigoure +# Copyright (C) 2007, 2008, 2009, 2010, 2011 Benoit Sigoure # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ # along with this program. If not, see . m4_define([_BOOST_SERIAL], [m4_translit([ -# serial 12 +# serial 16 ], [# ], [])]) @@ -45,15 +45,19 @@ m4_define([_BOOST_SERIAL], [m4_translit([ # Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL. If you don't, don't worry, # simply read the README, it will show you what to do step by step. -m4_pattern_forbid([^_?BOOST_]) +m4_pattern_forbid([^_?(BOOST|Boost)_]) # _BOOST_SED_CPP(SED-PROGRAM, PROGRAM, # [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # -------------------------------------------------------- # Same as AC_EGREP_CPP, but leave the result in conftest.i. -# PATTERN is *not* overquoted, as in AC_EGREP_CPP. It could be useful -# to turn this into a macro which extracts the value of any macro. +# +# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP. It is expanded +# in double-quotes, so escape your double quotes. +# +# It could be useful to turn this into a macro which extracts the +# value of any macro. m4_define([_BOOST_SED_CPP], [AC_LANG_PREPROC_REQUIRE()dnl AC_REQUIRE([AC_PROG_SED])dnl @@ -98,6 +102,7 @@ set x $boost_version_req 0 0 0 IFS=$boost_save_IFS shift boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"` +boost_version_req_string=$[1].$[2].$[3] AC_ARG_WITH([boost], [AS_HELP_STRING([--with-boost=DIR], [prefix of Boost $1 @<:@guess@:>@])])dnl @@ -113,9 +118,9 @@ if test x"$BOOST_ROOT" != x; then fi fi AC_SUBST([DISTCHECK_CONFIGURE_FLAGS], - ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"]) + ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl boost_save_CPPFLAGS=$CPPFLAGS - AC_CACHE_CHECK([for Boost headers version >= $boost_version_req], + AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string], [boost_cv_inc_path], [boost_cv_inc_path=no AC_LANG_PUSH([C++])dnl @@ -183,24 +188,25 @@ AC_LANG_POP([C++])dnl ]) case $boost_cv_inc_path in #( no) - boost_errmsg="cannot find Boost headers version >= $boost_version_req" + boost_errmsg="cannot find Boost headers version >= $boost_version_req_string" m4_if([$2], [], [AC_MSG_ERROR([$boost_errmsg])], [AC_MSG_NOTICE([$boost_errmsg])]) $2 ;;#( yes) BOOST_CPPFLAGS= - AC_DEFINE([HAVE_BOOST], [1], - [Defined if the requested minimum BOOST version is satisfied]) ;;#( *) - AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"]) + AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl ;; esac + if test x"$boost_cv_inc_path" != xno; then + AC_DEFINE([HAVE_BOOST], [1], + [Defined if the requested minimum BOOST version is satisfied]) AC_CACHE_CHECK([for Boost's header version], [boost_cv_lib_version], [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl - _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;g;}], + _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}], [#include boost-lib-version = BOOST_LIB_VERSION], [boost_cv_lib_version=`cat conftest.i`])]) @@ -211,6 +217,7 @@ boost-lib-version = BOOST_LIB_VERSION], AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version]) ;; esac +fi CPPFLAGS=$boost_save_CPPFLAGS ])# BOOST_REQUIRE @@ -220,7 +227,7 @@ CPPFLAGS=$boost_save_CPPFLAGS # on the command line, static versions of the libraries will be looked up. AC_DEFUN([BOOST_STATIC], [AC_ARG_ENABLE([static-boost], - [AC_HELP_STRING([--enable-static-boost], + [AS_HELP_STRING([--enable-static-boost], [Prefer the static boost libraries over the shared ones [no]])], [enable_static_boost=yes], [enable_static_boost=no])])# BOOST_STATIC @@ -290,6 +297,7 @@ dnl The else branch is huge and wasn't intended on purpose. AC_LANG_PUSH([C++])dnl AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl +AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl BOOST_FIND_HEADER([$3]) boost_save_CPPFLAGS=$CPPFLAGS @@ -371,8 +379,8 @@ for boost_rtopt_ in $boost_rtopt '' -d; do boost_tmp_lib=$with_boost test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include} for boost_ldpath in "$boost_tmp_lib/lib" '' \ - /opt/local/lib /usr/local/lib /opt/lib /usr/lib \ - "$with_boost" C:/Boost/lib /lib /usr/lib64 /lib64 + /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \ + "$with_boost" C:/Boost/lib /lib* do test -e "$boost_ldpath" || continue boost_save_LDFLAGS=$LDFLAGS @@ -395,7 +403,16 @@ dnl generated only once above (before we start the for loops). LDFLAGS=$boost_save_LDFLAGS LIBS=$boost_save_LIBS if test x"$Boost_lib" = xyes; then - Boost_lib_LDFLAGS="-L$boost_ldpath -R$boost_ldpath" + # Because Boost is often installed in non-standard locations we want to + # hardcode the path to the library (with rpath). Here we assume that + # Libtool's macro was already invoked so we can steal its variable + # hardcode_libdir_flag_spec in order to get the right flags for ld. + boost_save_libdir=$libdir + libdir=$boost_ldpath + eval boost_rpath=\"$hardcode_libdir_flag_spec\" + libdir=$boost_save_libdir + Boost_lib_LDFLAGS="-L$boost_ldpath $boost_rpath" + Boost_lib_LDPATH="$boost_ldpath" break 6 else boost_failed_libs="$boost_failed_libs@$boost_lib@" @@ -410,14 +427,17 @@ rm -f conftest.$ac_objext ]) case $Boost_lib in #( no) _AC_MSG_LOG_CONFTEST - AC_MSG_ERROR([cannot not find the flags to link with Boost $1]) + AC_MSG_ERROR([cannot find the flags to link with Boost $1]) ;; esac -AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS]) -AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS]) +AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl +AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl CPPFLAGS=$boost_save_CPPFLAGS AS_VAR_POPDEF([Boost_lib])dnl AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl +AS_VAR_POPDEF([Boost_lib_LDPATH])dnl AS_VAR_POPDEF([Boost_lib_LIBS])dnl AC_LANG_POP([C++])dnl fi @@ -432,17 +452,31 @@ fi # The page http://beta.boost.org/doc/libs is useful: it gives the first release # version of each library (among other things). +# BOOST_DEFUN(LIBRARY, CODE) +# -------------------------- +# Define BOOST_ as a macro that runs CODE. +# +# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN. +m4_define([BOOST_DEFUN], +[m4_indir([AC_DEFUN], + m4_toupper([BOOST_$1]), +[m4_pushdef([BOOST_Library], [$1])dnl +$2 +m4_popdef([BOOST_Library])dnl +]) +]) + # BOOST_ARRAY() # ------------- # Look for Boost.Array -AC_DEFUN([BOOST_ARRAY], +BOOST_DEFUN([Array], [BOOST_FIND_HEADER([boost/array.hpp])]) # BOOST_ASIO() # ------------ # Look for Boost.Asio (new in Boost 1.35). -AC_DEFUN([BOOST_ASIO], +BOOST_DEFUN([Asio], [AC_REQUIRE([BOOST_SYSTEM])dnl BOOST_FIND_HEADER([boost/asio.hpp])]) @@ -450,14 +484,41 @@ BOOST_FIND_HEADER([boost/asio.hpp])]) # BOOST_BIND() # ------------ # Look for Boost.Bind -AC_DEFUN([BOOST_BIND], +BOOST_DEFUN([Bind], [BOOST_FIND_HEADER([boost/bind.hpp])]) +# BOOST_CHRONO() +# ------------------ +# Look for Boost.Chrono +BOOST_DEFUN([Chrono], +[# Do we have to check for Boost.System? This link-time dependency was +# added as of 1.35.0. If we have a version <1.35, we must not attempt to +# find Boost.System as it didn't exist by then. +if test $boost_major_version -ge 135; then + BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([chrono], [$1], + [boost/chrono.hpp], + [boost::chrono::system_clock::time_point d = boost::chrono::system_clock::now();]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS + +])# BOOST_CHRONO + + # BOOST_CONVERSION() # ------------------ # Look for Boost.Conversion (cast / lexical_cast) -AC_DEFUN([BOOST_CONVERSION], +BOOST_DEFUN([Conversion], [BOOST_FIND_HEADER([boost/cast.hpp]) BOOST_FIND_HEADER([boost/lexical_cast.hpp]) ])# BOOST_CONVERSION @@ -467,12 +528,31 @@ BOOST_FIND_HEADER([boost/lexical_cast.hpp]) # ----------------------------------- # Look for Boost.Date_Time. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_DATE_TIME], +BOOST_DEFUN([Date_Time], [BOOST_FIND_LIB([date_time], [$1], [boost/date_time/posix_time/posix_time.hpp], [boost::posix_time::ptime t;]) ])# BOOST_DATE_TIME +# BOOST_TIMER([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Timer. For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Timer], +[#check for Boost.System +BOOST_SYSTEM([$1]) +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([timer], [$1], + [boost/timer/timer.hpp], + [boost::timer::auto_cpu_timer t;]) +AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS +])# BOOST_TIMER # BOOST_FILESYSTEM([PREFERRED-RT-OPT]) # ------------------------------------ @@ -480,7 +560,7 @@ AC_DEFUN([BOOST_DATE_TIME], # the documentation of BOOST_FIND_LIB above. # Do not check for boost/filesystem.hpp because this file was introduced in # 1.34. -AC_DEFUN([BOOST_FILESYSTEM], +BOOST_DEFUN([Filesystem], [# Do we have to check for Boost.System? This link-time dependency was # added as of 1.35.0. If we have a version <1.35, we must not attempt to # find Boost.System as it didn't exist by then. @@ -494,6 +574,9 @@ LIBS="$LIBS $BOOST_SYSTEM_LIBS" LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" BOOST_FIND_LIB([filesystem], [$1], [boost/filesystem/path.hpp], [boost::filesystem::path p;]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_FILESYSTEM_LIBS], ["$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi LIBS=$boost_filesystem_save_LIBS LDFLAGS=$boost_filesystem_save_LDFLAGS ])# BOOST_FILESYSTEM @@ -502,7 +585,7 @@ LDFLAGS=$boost_filesystem_save_LDFLAGS # BOOST_FOREACH() # --------------- # Look for Boost.Foreach -AC_DEFUN([BOOST_FOREACH], +BOOST_DEFUN([Foreach], [BOOST_FIND_HEADER([boost/foreach.hpp])]) @@ -513,14 +596,14 @@ AC_DEFUN([BOOST_FOREACH], # standalone. It can't be compiled because it triggers the following error: # boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std' # does not name a type -AC_DEFUN([BOOST_FORMAT], +BOOST_DEFUN([Format], [BOOST_FIND_HEADER([boost/format.hpp])]) # BOOST_FUNCTION() # ---------------- # Look for Boost.Function -AC_DEFUN([BOOST_FUNCTION], +BOOST_DEFUN([Function], [BOOST_FIND_HEADER([boost/function.hpp])]) @@ -528,37 +611,60 @@ AC_DEFUN([BOOST_FUNCTION], # ------------------------------- # Look for Boost.Graphs. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_GRAPH], +BOOST_DEFUN([Graph], [BOOST_FIND_LIB([graph], [$1], [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;]) ])# BOOST_GRAPH # BOOST_IOSTREAMS([PREFERRED-RT-OPT]) -# ------------------------------- +# ----------------------------------- # Look for Boost.IOStreams. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_IOSTREAMS], +BOOST_DEFUN([IOStreams], [BOOST_FIND_LIB([iostreams], [$1], [boost/iostreams/device/file_descriptor.hpp], - [boost::iostreams::file_descriptor fd(0); fd.close();]) + [boost::iostreams::file_descriptor fd; fd.close();]) ])# BOOST_IOSTREAMS # BOOST_HASH() # ------------ # Look for Boost.Functional/Hash -AC_DEFUN([BOOST_HASH], +BOOST_DEFUN([Hash], [BOOST_FIND_HEADER([boost/functional/hash.hpp])]) # BOOST_LAMBDA() # -------------- # Look for Boost.Lambda -AC_DEFUN([BOOST_LAMBDA], +BOOST_DEFUN([Lambda], [BOOST_FIND_HEADER([boost/lambda/lambda.hpp])]) +# BOOST_LOG([PREFERRED-RT-OPT]) +# ----------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log], +[BOOST_FIND_LIB([log], [$1], + [boost/log/core/core.hpp], + [boost::log::attribute a; a.get_value();]) +])# BOOST_LOG + + +# BOOST_LOG_SETUP([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log_Setup], +[AC_REQUIRE([BOOST_LOG])dnl +BOOST_FIND_LIB([log_setup], [$1], + [boost/log/utility/init/from_settings.hpp], + [boost::log::basic_settings bs; bs.empty();]) +])# BOOST_LOG_SETUP + + # BOOST_MATH() # ------------ # Look for Boost.Math @@ -567,21 +673,21 @@ AC_DEFUN([BOOST_LAMBDA], # libboost_math_c99f, libboost_math_c99l, libboost_math_tr1, # libboost_math_tr1f, libboost_math_tr1l). This macro must be fixed to do the # right thing anyway. -AC_DEFUN([BOOST_MATH], +BOOST_DEFUN([Math], [BOOST_FIND_HEADER([boost/math/special_functions.hpp])]) # BOOST_MULTIARRAY() # ------------------ # Look for Boost.MultiArray -AC_DEFUN([BOOST_MULTIARRAY], +BOOST_DEFUN([MultiArray], [BOOST_FIND_HEADER([boost/multi_array.hpp])]) # BOOST_NUMERIC_CONVERSION() # -------------------------- # Look for Boost.NumericConversion (policy-based numeric conversion) -AC_DEFUN([BOOST_NUMERIC_CONVERSION], +BOOST_DEFUN([Numeric_Conversion], [BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp]) ])# BOOST_NUMERIC_CONVERSION @@ -589,32 +695,76 @@ AC_DEFUN([BOOST_NUMERIC_CONVERSION], # BOOST_OPTIONAL() # ---------------- # Look for Boost.Optional -AC_DEFUN([BOOST_OPTIONAL], +BOOST_DEFUN([Optional], [BOOST_FIND_HEADER([boost/optional.hpp])]) # BOOST_PREPROCESSOR() # -------------------- # Look for Boost.Preprocessor -AC_DEFUN([BOOST_PREPROCESSOR], +BOOST_DEFUN([Preprocessor], [BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])]) +# BOOST_UNORDERED() +# ----------------- +# Look for Boost.Unordered +BOOST_DEFUN([Unordered], +[BOOST_FIND_HEADER([boost/unordered_map.hpp])]) + + +# BOOST_UUID() +# ------------ +# Look for Boost.Uuid +BOOST_DEFUN([Uuid], +[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])]) + + # BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT]) # ----------------------------------------- -# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, see -# the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_PROGRAM_OPTIONS], +# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Program_Options], [BOOST_FIND_LIB([program_options], [$1], [boost/program_options.hpp], [boost::program_options::options_description d("test");]) ])# BOOST_PROGRAM_OPTIONS + +# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG) +# ------------------------------------ +# Save VARIABLE, and define it via `python-config --FLAG`. +# Substitute BOOST_PYTHON_VARIABLE. +m4_define([_BOOST_PYTHON_CONFIG], +[AC_SUBST([BOOST_PYTHON_$1], + [`python-config --$2 2>/dev/null`])dnl +boost_python_save_$1=$$1 +$1="$$1 $BOOST_PYTHON_$1"]) + + +# BOOST_PYTHON([PREFERRED-RT-OPT]) +# -------------------------------- +# Look for Boost.Python. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Python], +[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes]) +_BOOST_PYTHON_CONFIG([LDFLAGS], [ldflags]) +_BOOST_PYTHON_CONFIG([LIBS], [libs]) +m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl +BOOST_FIND_LIB([python], [$1], + [boost/python.hpp], + [], [BOOST_PYTHON_MODULE(empty) {}]) +CPPFLAGS=$boost_python_save_CPPFLAGS +LDFLAGS=$boost_python_save_LDFLAGS +LIBS=$boost_python_save_LIBS +])# BOOST_PYTHON + + # BOOST_REF() # ----------- # Look for Boost.Ref -AC_DEFUN([BOOST_REF], +BOOST_DEFUN([Ref], [BOOST_FIND_HEADER([boost/ref.hpp])]) @@ -622,7 +772,7 @@ AC_DEFUN([BOOST_REF], # ------------------------------- # Look for Boost.Regex. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_REGEX], +BOOST_DEFUN([Regex], [BOOST_FIND_LIB([regex], [$1], [boost/regex.hpp], [boost::regex exp("*"); boost::regex_match("foo", exp);]) @@ -633,19 +783,19 @@ AC_DEFUN([BOOST_REGEX], # --------------------------------------- # Look for Boost.Serialization. For the documentation of PREFERRED-RT-OPT, see # the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SERIALIZATION], +BOOST_DEFUN([Serialization], [BOOST_FIND_LIB([serialization], [$1], [boost/archive/text_oarchive.hpp], [std::ostream* o = 0; // Cheap way to get an ostream... boost::archive::text_oarchive t(*o);]) -])# BOOST_SIGNALS +])# BOOST_SERIALIZATION # BOOST_SIGNALS([PREFERRED-RT-OPT]) # --------------------------------- # Look for Boost.Signals. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SIGNALS], +BOOST_DEFUN([Signals], [BOOST_FIND_LIB([signals], [$1], [boost/signal.hpp], [boost::signal s;]) @@ -655,7 +805,7 @@ AC_DEFUN([BOOST_SIGNALS], # BOOST_SMART_PTR() # ----------------- # Look for Boost.SmartPtr -AC_DEFUN([BOOST_SMART_PTR], +BOOST_DEFUN([Smart_Ptr], [BOOST_FIND_HEADER([boost/scoped_ptr.hpp]) BOOST_FIND_HEADER([boost/shared_ptr.hpp]) ]) @@ -664,14 +814,14 @@ BOOST_FIND_HEADER([boost/shared_ptr.hpp]) # BOOST_STATICASSERT() # -------------------- # Look for Boost.StaticAssert -AC_DEFUN([BOOST_STATICASSERT], +BOOST_DEFUN([StaticAssert], [BOOST_FIND_HEADER([boost/static_assert.hpp])]) # BOOST_STRING_ALGO() # ------------------- # Look for Boost.StringAlgo -AC_DEFUN([BOOST_STRING_ALGO], +BOOST_DEFUN([String_Algo], [BOOST_FIND_HEADER([boost/algorithm/string.hpp]) ]) @@ -681,7 +831,7 @@ AC_DEFUN([BOOST_STRING_ALGO], # Look for Boost.System. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. This library was introduced in Boost # 1.35.0. -AC_DEFUN([BOOST_SYSTEM], +BOOST_DEFUN([System], [BOOST_FIND_LIB([system], [$1], [boost/system/error_code.hpp], [boost::system::error_code e; e.clear();]) @@ -692,7 +842,7 @@ AC_DEFUN([BOOST_SYSTEM], # ------------------------------ # Look for Boost.Test. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_TEST], +BOOST_DEFUN([Test], [m4_pattern_allow([^BOOST_CHECK$])dnl BOOST_FIND_LIB([unit_test_framework], [$1], [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);], @@ -707,25 +857,49 @@ BOOST_FIND_LIB([unit_test_framework], [$1], # Look for Boost.Thread. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. # FIXME: Provide an alias "BOOST_THREAD". -AC_DEFUN([BOOST_THREADS], +BOOST_DEFUN([Threads], [dnl Having the pthread flag is required at least on GCC3 where dnl boost/thread.hpp would complain if we try to compile without dnl -pthread on GNU/Linux. AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl boost_threads_save_LIBS=$LIBS +boost_threads_save_LDFLAGS=$LDFLAGS boost_threads_save_CPPFLAGS=$CPPFLAGS -LIBS="$LIBS $boost_cv_pthread_flag" +# Link-time dependency from thread to system was added as of 1.49.0. +if test $boost_major_version -ge 149; then +BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" # Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3, # boost/thread.hpp will trigger a #error if -pthread isn't used: # boost/config/requires_threads.hpp:47:5: #error "Compiler threading support # is not turned on. Please set the correct command line options for # threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)" CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag" -BOOST_FIND_LIB([thread], [$1], - [boost/thread.hpp], [boost::thread t; boost::mutex m;]) -BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $boost_cv_pthread_flag" + +# When compiling for the Windows platform, the threads library is named +# differently. +case $host_os in + (*mingw*) + BOOST_FIND_LIB([thread_win32], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + BOOST_THREAD_LDFLAGS=$BOOST_THREAD_WIN32_LDFLAGS + BOOST_THREAD_LDPATH=$BOOST_THREAD_WIN32_LDPATH + BOOST_THREAD_LIBS=$BOOST_THREAD_WIN32_LIBS + ;; + (*) + BOOST_FIND_LIB([thread], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + ;; +esac + +BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +BOOST_THREAD_LDFLAGS="$BOOST_SYSTEM_LDFLAGS" BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag" LIBS=$boost_threads_save_LIBS +LDFLAGS=$boost_threads_save_LDFLAGS CPPFLAGS=$boost_threads_save_CPPFLAGS ])# BOOST_THREADS @@ -733,14 +907,14 @@ CPPFLAGS=$boost_threads_save_CPPFLAGS # BOOST_TOKENIZER() # ----------------- # Look for Boost.Tokenizer -AC_DEFUN([BOOST_TOKENIZER], +BOOST_DEFUN([Tokenizer], [BOOST_FIND_HEADER([boost/tokenizer.hpp])]) # BOOST_TRIBOOL() # --------------- # Look for Boost.Tribool -AC_DEFUN([BOOST_TRIBOOL], +BOOST_DEFUN([Tribool], [BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp]) BOOST_FIND_HEADER([boost/logic/tribool.hpp]) ]) @@ -749,14 +923,14 @@ BOOST_FIND_HEADER([boost/logic/tribool.hpp]) # BOOST_TUPLE() # ------------- # Look for Boost.Tuple -AC_DEFUN([BOOST_TUPLE], +BOOST_DEFUN([Tuple], [BOOST_FIND_HEADER([boost/tuple/tuple.hpp])]) # BOOST_TYPETRAITS() # -------------------- # Look for Boost.TypeTraits -AC_DEFUN([BOOST_TYPETRAITS], +BOOST_DEFUN([TypeTraits], [BOOST_FIND_HEADER([boost/type_traits.hpp])]) @@ -764,14 +938,14 @@ AC_DEFUN([BOOST_TYPETRAITS], # --------------- # Look for Boost.Utility (noncopyable, result_of, base-from-member idiom, # etc.) -AC_DEFUN([BOOST_UTILITY], +BOOST_DEFUN([Utility], [BOOST_FIND_HEADER([boost/utility.hpp])]) # BOOST_VARIANT() # --------------- # Look for Boost.Variant. -AC_DEFUN([BOOST_VARIANT], +BOOST_DEFUN([Variant], [BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp]) BOOST_FIND_HEADER([boost/variant.hpp])]) @@ -782,15 +956,15 @@ BOOST_FIND_HEADER([boost/variant.hpp])]) # call BOOST_THREADS first. # Look for Boost.Wave. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_WAVE], +BOOST_DEFUN([Wave], [AC_REQUIRE([BOOST_FILESYSTEM])dnl AC_REQUIRE([BOOST_DATE_TIME])dnl boost_wave_save_LIBS=$LIBS boost_wave_save_LDFLAGS=$LDFLAGS m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl -LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS\ +LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \ $BOOST_THREAD_LIBS" -LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS\ +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \ $BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS" BOOST_FIND_LIB([wave], [$1], [boost/wave.hpp], @@ -803,7 +977,7 @@ LDFLAGS=$boost_wave_save_LDFLAGS # BOOST_XPRESSIVE() # ----------------- # Look for Boost.Xpressive (new since 1.36.0). -AC_DEFUN([BOOST_XPRESSIVE], +BOOST_DEFUN([Xpressive], [BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])]) @@ -893,8 +1067,9 @@ AC_DEFUN([_BOOST_FIND_COMPILER_TAG], [AC_REQUIRE([AC_PROG_CXX])dnl AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag], -[AC_LANG_PUSH([C++])dnl - boost_cv_lib_tag=unknown +[boost_cv_lib_tag=unknown +if test x$boost_cv_inc_path != xno; then + AC_LANG_PUSH([C++])dnl # The following tests are mostly inspired by boost/config/auto_link.hpp # The list is sorted to most recent/common to oldest compiler (in order # to increase the likelihood of finding the right compiler with the @@ -908,8 +1083,12 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] # como, edg, kcc, bck, mp, sw, tru, xlc # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines # the same defines as GCC's). - # TODO: Move the test on GCC 4.4 up once it's released. for i in \ + _BOOST_gcc_test(4, 8) \ + _BOOST_gcc_test(4, 7) \ + _BOOST_gcc_test(4, 6) \ + _BOOST_gcc_test(4, 5) \ + _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(4, 3) \ _BOOST_gcc_test(4, 2) \ _BOOST_gcc_test(4, 1) \ @@ -929,7 +1108,6 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] "defined __ICC && (defined __unix || defined __unix__) @ il" \ "defined __ICL @ iw" \ "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \ - _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(2, 95) \ "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \ "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \ @@ -969,7 +1147,7 @@ AC_LANG_POP([C++])dnl boost_cv_lib_tag= ;; esac -])dnl end of AC_CACHE_CHECK +fi])dnl end of AC_CACHE_CHECK ])# _BOOST_FIND_COMPILER_TAG -- cgit v1.2.3 From 816632a5d1e3a5a24c9b3aacb4210ed8b28a9c62 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 20 Jan 2013 01:38:42 -0500 Subject: control max len --- corpus/filter-length.pl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'corpus') diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 3cfa40cc..38837f14 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,20 +3,30 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 150; # discard a sentence if it is longer than this my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? ############################################################################## -die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1; +die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2; binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +if (scalar @ARGV == 2) { + my $fp = shift @ARGV; + die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/; + $MAX_LENGTH=$1; +} + my $corpus = shift @ARGV; + die "Cannot read from STDIN\n" if $corpus eq '-'; my $ff = "<$corpus"; $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/; +print STDERR "Max line length (monolingual): $MAX_LENGTH\n"; +print STDERR " Parallel corpus: $corpus\n"; + open F,$ff or die "Can't read $corpus: $!"; binmode(F,":utf8"); -- cgit v1.2.3 From 55a1914e8998b2dc613d0f1e452a714b51169953 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 21 Jan 2013 16:53:05 -0500 Subject: a little bit of cleanup --- corpus/filter-length.pl | 4 ++-- corpus/paste-files.pl | 8 ++++---- corpus/support/tokenizer.pl | 1 - 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'corpus') diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 38837f14..2e257cda 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -44,11 +44,11 @@ while() { $lines++; if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; } elsif ($lines % 2500 == 0) { print STDERR "."; } - my ($sf, $se, @d) = split / \|\|\| /; + my ($sf, $se, @d) = split /\s*\|\|\|\s*/; if (scalar @d != 0 or !defined $se) { $bad_format++; if ($bad_format > 100 && ($bad_format / $lines) > 0.02) { - die "Corpus appears to be incorretly formatted, example: $_"; + die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_"; } next; } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 0b788386..4cb424ad 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -41,9 +41,9 @@ while(1) { } warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; $r =~ s/\|\|\|/ /g; - $r =~ s/ +//g; - $r =~ s/^ //; - $r =~ s/ $//; + $r =~ s/\s+/ /g; + $r =~ s/^ +//; + $r =~ s/ +$//; $anum++; push @line, $r; } @@ -56,5 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) { my $r = <$fh>; die "Mismatched number of lines.\n" if defined $r; } -print STDERR "Bad lines containing ||| were $bad\n"; +print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0; diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e9c3a37d..b5190858 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -147,7 +147,6 @@ while(){ print STDOUT " $new_line\n"; } -print STDERR "\n"; ######################################################################## -- cgit v1.2.3 From fba0f3db9aa896807e3932f9d3323767cedd9338 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 21 Jan 2013 17:28:19 -0500 Subject: tokenizer support for utf8 patterns --- corpus/support/token_patterns | 1 + corpus/support/tokenizer.pl | 2 ++ 2 files changed, 3 insertions(+) (limited to 'corpus') diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index 8e69432b..b25ac6de 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,4 @@ /^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ /^(\d|\d\d|\d\d\d)\.$/ diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index b5190858..0350a894 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -73,6 +73,7 @@ my $dict_file = "$workdir/token_list"; my $word_patt_file = "$workdir/token_patterns"; open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8"); # read in the list of words that should not be segmented, ## e.g.,"I.B.M.", co-operation. @@ -89,6 +90,7 @@ while(<$dict_fp>){ } open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8"); my @word_patts = (); my $word_patt_num = 0; while(<$patt_fp>){ -- cgit v1.2.3 From d30e63f84f836fa3223cd01ea3168f282c280be9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 22 Jan 2013 20:08:28 -0500 Subject: russian abbrevs --- corpus/support/quote-norm.pl | 14 ++++++++ corpus/support/token_list | 82 +++++++++++++++++++++++++++++++++++++++++++ corpus/support/token_patterns | 1 + 3 files changed, 97 insertions(+) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 72b0064d..e4e5055e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -27,6 +27,20 @@ while() { s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + + s/ п. п. / п.п. /g; + s/ ст. л. / ст.л. /g; + s/ т. е. / т.е. /g; + s/ т. к. / т.к. /g; + s/ т. ч. / т.ч. /g; + s/ т. д. / т.д. /g; + s/ т. п. / т.п. /g; + s/ и. о. / и.о. /g; + s/ с. г. / с.г. /g; + s/ г. р. / г.р. /g; + s/ т. н. / т.н. /g; + s/ т. ч. / т.ч. /g; + s/ н. э. / н.э. /g; # â s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; diff --git a/corpus/support/token_list b/corpus/support/token_list index 28eb4396..d470cb22 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -210,3 +210,85 @@ W. X. Y. Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index b25ac6de..de64fb2a 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,4 +1,5 @@ /^(al|el|ul|e)\-[a-z]+$/ /^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/ /^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3