summary refs log tree commit diff
path: root/preprocess
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-06-14 14:43:14 +0200
committer Patrick Simianer <p@simianer.de> 2014-06-14 14:43:14 +0200
commit 2783f837303ae07c4a1d676302bca779abbb1296 (patch)
tree e388dda12d6d31285b32663b937a8d55ecc909c5 /preprocess
parent 85ea0fc5e3ae7ea646cc6e843d01939b4d8e4dbf (diff)
steal tokenizer from moses' scripts
Diffstat (limited to 'preprocess')
-rwxr-xr-x preprocess 6
1 files changed, 5 insertions, 1 deletions
diff --git a/preprocess b/preprocess
index b034e48..c4eeb39 100755
--- a/preprocess
+++ b/preprocess
@@ -1,5 +1,9 @@
#!/bin/bash
+pushd `dirname $0` > /dev/null
+P=`pwd -P`
+popd > /dev/null
+
LANG=$1
-/toolbox/scripts/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | /toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err
+$P/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize_punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | $P/lowercase.perl 2>lowercase.$LANG.err