summary | refs | log | tree | commit | diff
path: root/corpus/utf8-normalize.sh
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2015-01-08 21:35:29 -0500
committer Kenneth Heafield <github@kheafield.com> 2015-01-08 21:35:29 -0500
commit 1bce604809399a0adc581fb0102bff11decf3436 (patch)
tree fda1224fbee8de2ad37246f8b02e6572c6e244f9 /corpus/utf8-normalize.sh
parent 992556cc0931b255b9e299c0f489c3b449b22ab4 (diff)
Stop BOMbs before they decrease quality
Diffstat (limited to 'corpus/utf8-normalize.sh')
-rwxr-xr-x corpus/utf8-normalize.sh 2
1 files changed, 1 insertions, 1 deletions
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
index dcf8bc59..7c0db611 100755
--- a/corpus/utf8-normalize.sh
+++ b/corpus/utf8-normalize.sh
@@ -7,7 +7,7 @@
if which uconv > /dev/null
then
- CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip --remove-signature"
else
echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
CMD="iconv -f utf8 -t utf8 -c"