From 5276e368ad643434fd73527329ed70507ee49dfc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 28 Jul 2012 12:11:30 -0400 Subject: a couple of tools for cleaning corpora --- corpus/utf8-normalize.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100755 corpus/utf8-normalize.sh (limited to 'corpus/utf8-normalize.sh') diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh new file mode 100755 index 00000000..dcf8bc59 --- /dev/null +++ b/corpus/utf8-normalize.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script uses ICU uconv (http://site.icu-project.org/), if it's available +# to normalize UTF8 text into a standard form. For information about this +# process, refer to http://en.wikipedia.org/wiki/Unicode_equivalence#Normalization +# Escape characters between 0x00-0x1F are removed + +if which uconv > /dev/null +then + CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip" +else + echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2 + CMD="iconv -f utf8 -t utf8 -c" +fi + +$CMD | /usr/bin/perl -w -e ' + while (<>) { + chomp; + s/[\x00-\x1F]+/ /g; + s/ +/ /g; + s/^ //; + s/ $//; + print "$_\n"; + }' + -- cgit v1.2.3