summary | refs | log | tree | commit | diff
path: root/corpus/tokenize-anything.sh
diff options
context:
space:
mode:
author Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-05 14:15:43 -0700
committer Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-05 14:15:43 -0700
commit bef5c40937619e8360a87f988f204ba0ff2ad300 (patch)
tree e14ee7764e1031fc4a319102d278167fc127423e /corpus/tokenize-anything.sh
parent 40688c6c8ac48c809e1b4f1fa10d93144620dead (diff)
Slower but correct (wrt buffered) unbuffered version.
Diffstat (limited to 'corpus/tokenize-anything.sh')
-rwxr-xr-x corpus/tokenize-anything.sh 6
1 files changed, 3 insertions, 3 deletions
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 52739e81..a20a022f 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -4,12 +4,12 @@ ROOTDIR=`dirname $0`
SUPPORT=$ROOTDIR/support
if [[ $# == 1 && $1 == '-u' ]] ; then
- NORMCMD=cat
+ NORMARGS="--batchline"
else
- NORMCMD=$SUPPORT/utf8-normalize.sh
+ NORMARGS=""
fi
-$NORMCMD |
+$SUPPORT/utf8-normalize.sh $NORMARGS |
$SUPPORT/quote-norm.pl |
$SUPPORT/tokenizer.pl |
sed -u -e 's/ al - / al-/g' |