summaryrefslogtreecommitdiff
path: root/preprocess
blob: 69eaa54cf1bf2857206eddb1ee7afb1c6f10cb43 (plain)
1
2
3
4
5
#!/bin/bash
#
# Filter: reads text on stdin, pushes it through a fixed chain of toolbox
# scripts, and writes the result to stdout.
#
# Usage: preprocess LANGCODE < input > output
#
# Arguments:
#   $1 - language code; passed to the tokenizer via `-l` and embedded in the
#        names of the per-stage error logs.
#
# Side effects:
#   Creates/truncates these files in the current working directory, each
#   capturing the stderr of one pipeline stage:
#     htmlentities.LANGCODE.err
#     normalize-punctuation.LANGCODE.err
#     tokenizer.LANGCODE.err
#     lowercase.LANGCODE.err
#
# Exit status:
#   2 if no language code is given; otherwise the status of the pipeline
#   (non-zero if any stage fails, because pipefail is enabled).

# Without pipefail only the last stage's status would be reported, so a crash
# in an earlier stage would go unnoticed.
set -o pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s LANGCODE < input > output\n' "${0##*/}" >&2
  exit 2
fi

# Deliberately NOT named LANG: LANG is the POSIX locale variable, and assigning
# a bare language code to it would change the locale seen by every child
# process in the pipeline below.
lang=$1

# Stage notes:
#   - sed: replaces a run of four or more characters from the bracket set with
#     "...". There is no `g` flag, so only the first such run on each line is
#     replaced, and per POSIX the backslash inside the bracket expression is a
#     literal member of the set.
#     NOTE(review): confirm both of these are intended.
#   - The remaining stages are opaque helper scripts; judging by their names
#     they presumably strip non-printable characters, handle HTML entities,
#     normalize punctuation, tokenize and lowercase -- verify against the
#     scripts themselves.
/toolbox/scripts/no_non_printables \
  | sed "s|[-,\.]\{4,\}|...|" \
  | /toolbox/scripts/htmlentities 2>"htmlentities.${lang}.err" \
  | /toolbox/scripts/normalize_punctuation 2>"normalize-punctuation.${lang}.err" \
  | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l "${lang}" 2>"tokenizer.${lang}.err" \
  | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>"lowercase.${lang}.err"