summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2015-01-08 21:35:29 -0500
committer Kenneth Heafield <github@kheafield.com> 2015-01-08 21:35:29 -0500
commit c485a6b8b1230e319b69adbb46788405d4e48c89 (patch)
tree f72bdbf4b1476ea56a4b6bfe2c0c04eae3cb61ad /corpus
parent 379284b264f3246b2849369bfec7bb16cc701af3 (diff)
Stop BOMbs before they decrease quality
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/utf8-normalize.sh 2
1 files changed, 1 insertions, 1 deletions
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
index dcf8bc59..7c0db611 100755
--- a/corpus/utf8-normalize.sh
+++ b/corpus/utf8-normalize.sh
@@ -7,7 +7,7 @@
if which uconv > /dev/null
then
- CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip --remove-signature"
else
echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
CMD="iconv -f utf8 -t utf8 -c"