From 2783f837303ae07c4a1d676302bca779abbb1296 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Sat, 14 Jun 2014 14:43:14 +0200
Subject: steal tokenizer from moses' scripts

---
 preprocess | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'preprocess')

diff --git a/preprocess b/preprocess
index b034e48..c4eeb39 100755
--- a/preprocess
+++ b/preprocess
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+pushd `dirname $0` > /dev/null
+P=`pwd -P`
+popd > /dev/null
+
 LANG=$1
 
-/toolbox/scripts/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | /toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err
+$P/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize_punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | $P/lowercase.perl 2>lowercase.$LANG.err
 
-- 
cgit v1.2.3