summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-11-12 13:56:02 +0100
committer Patrick Simianer <p@simianer.de> 2015-11-12 13:56:02 +0100
commit 1033fec5e067a8f573f700bcca385dfe2b3c74cd (patch)
tree 831c22b868563d5fca0e92a0f7f2893e8dad06c7
parent 159eac6e781d228bce720b9afd6a2934b8d909d5 (diff)
preprocessing without lowercasing
-rwxr-xr-x preprocess_no_lower 9
1 files changed, 9 insertions, 0 deletions
diff --git a/preprocess_no_lower b/preprocess_no_lower
new file mode 100755
index 0000000..3a4d358
--- /dev/null
+++ b/preprocess_no_lower
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pushd `dirname $0` > /dev/null
+P=`pwd -P`
+popd > /dev/null
+
+LANG=$1
+$P/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize_punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err
+