From 1bce604809399a0adc581fb0102bff11decf3436 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Thu, 8 Jan 2015 21:35:29 -0500
Subject: Stop BOMbs before they decrease quality

---
 corpus/utf8-normalize.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'corpus')

diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
index dcf8bc59..7c0db611 100755
--- a/corpus/utf8-normalize.sh
+++ b/corpus/utf8-normalize.sh
@@ -7,7 +7,7 @@
 
 if which uconv > /dev/null
 then
-  CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+  CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip --remove-signature"
 else
   echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
   CMD="iconv -f utf8 -t utf8 -c"
-- 
cgit v1.2.3