#!/bin/bash
#
# preprocess_no_lower -- preprocessing without lowercasing.
#
# Provenance (from the originating patch):
#   Commit:  1033fec5e067a8f573f700bcca385dfe2b3c74cd
#   Author:  Patrick Simianer
#   Date:    Thu, 12 Nov 2015 13:56:02 +0100
#   Subject: preprocessing without lowercasing
#
# Usage:
#   preprocess_no_lower LANGUAGE < input > output
#
# Reads text from stdin and writes the processed text to stdout. The input
# is piped, in order, through these stages; every helper is looked up in
# the directory this script lives in (symlinks resolved):
#
#   1. no_non_printables
#   2. sed: collapse runs of four or more characters from the set
#      '-', ',', '.' into a literal "..."
#   3. htmlentities
#   4. normalize_punctuation
#   5. tokenizer-no-escape.perl -a -b -threads 1 -l LANGUAGE
#
# Arguments:
#   $1 - language code; passed to the tokenizer via -l and used in the
#        names of the error logs.
#
# Outputs:
#   stdout - processed text
#   ./htmlentities.LANGUAGE.err, ./normalize-punctuation.LANGUAGE.err,
#   ./tokenizer.LANGUAGE.err - stderr of stages 3-5, written to the
#   current working directory.
#
# Returns:
#   2 on missing argument, 1 if the script directory cannot be resolved,
#   otherwise the status of the pipeline (see pipefail below).

# Report a failure of any pipeline stage, not only of the final tokenizer.
set -o pipefail

if [[ $# -lt 1 || -z $1 ]]; then
  printf 'Usage: %s LANGUAGE < input > output\n' "${0##*/}" >&2
  exit 2
fi

# Physical directory containing this script and its helper programs.
script_dir=$(cd -- "$(dirname -- "$0")" && pwd -P) || {
  printf '%s: cannot resolve script directory\n' "${0##*/}" >&2
  exit 1
}
readonly script_dir

# Deliberately NOT named LANG: LANG is the locale environment variable and
# is usually exported, so assigning a bare language code to it would change
# the locale seen by every child process in the pipeline.
readonly lang=$1

# NOTE(review): inside a bracket expression a backslash is an ordinary
# character, so '[-,\.]' also matches a literal '\'. The expression is kept
# exactly as it was to preserve existing output -- confirm whether matching
# backslashes is intended.
"${script_dir}/no_non_printables" \
  | sed 's|[-,\.]\{4,\}|...|g' \
  | "${script_dir}/htmlentities" 2>"htmlentities.${lang}.err" \
  | "${script_dir}/normalize_punctuation" 2>"normalize-punctuation.${lang}.err" \
  | "${script_dir}/tokenizer-no-escape.perl" -a -b -threads 1 -l "${lang}" 2>"tokenizer.${lang}.err"