diff options
author | Patrick Simianer <p@simianer.de> | 2015-11-12 13:56:02 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-11-12 13:56:02 +0100 |
commit | 1033fec5e067a8f573f700bcca385dfe2b3c74cd (patch) | |
tree | 831c22b868563d5fca0e92a0f7f2893e8dad06c7 | |
parent | 159eac6e781d228bce720b9afd6a2934b8d909d5 (diff) |
preprocessing without lowercasing
-rwxr-xr-x | preprocess_no_lower | 9 |
1 file changed, 9 insertions, 0 deletions
```diff
diff --git a/preprocess_no_lower b/preprocess_no_lower
new file mode 100755
index 0000000..3a4d358
--- /dev/null
+++ b/preprocess_no_lower
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pushd `dirname $0` > /dev/null
+P=`pwd -P`
+popd > /dev/null
+
+LANG=$1
+$P/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize_punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err
+
```