summaryrefslogtreecommitdiff
path: root/corpus/support/utf8-normalize.sh
blob: 2f34785487780cce59de1c4ab90db16af7cdefc7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/bash

# Location of a private ICU build. This is where it lives on malbec; on
# another machine ICU may be installed in /usr or /usr/local, in which case
# uconv is looked up on PATH instead.
ICU_DIR=/usr0/tools/icu
UCONV_BIN=$ICU_DIR/bin/uconv
UCONV_LIB=$ICU_DIR/lib

#######################################
# Choose the UTF-8 clean-up command and store it, as one string, in CMD.
# Preference order: the private uconv given by $1, then a uconv found on
# PATH, then iconv (which is only asked to drop unconvertible input via -c).
# Globals:   CMD (written); LD_LIBRARY_PATH (extended and exported when the
#            private uconv is selected)
# Arguments: $1 - path to the private uconv binary
#            $2 - directory holding that uconv's shared libraries
# Outputs:   diagnostics on stderr
# Returns:   0 when CMD was set; 1 when the private uconv exists but cannot
#            be executed
#######################################
select_converter() {
  local uconv_bin=$1
  local uconv_lib=$2

  if [ -e "$uconv_bin" ] && [ -d "$uconv_lib" ]; then
    if [ ! -x "$uconv_bin" ]; then
      echo "$0: Cannot execute $uconv_bin! Please fix." 1>&2
      return 1
    fi
    # Append the ICU lib dir without creating an empty leading entry when
    # LD_LIBRARY_PATH was unset: the loader treats an empty entry as the
    # current working directory.
    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$uconv_lib"
    CMD="$uconv_bin -f utf8 -t utf8 -x Any-NFKC --callback skip"
  elif command -v uconv > /dev/null 2>&1; then
    CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
  else
    echo "$0: Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Quality may suffer." 1>&2
    CMD="iconv -f utf8 -t utf8 -c"
  fi
  return 0
}

# A broken private install is a hard error: stop with a failing exit status
# so callers can detect it.
select_converter "$UCONV_BIN" "$UCONV_LIB" || exit 1

# Three-stage filter from stdin to stdout.
#
# Stage 1 rewrites each carriage return, together with any newlines that
# directly follow it, into a single newline.
crlf_to_lf='s/\r\n*/\n/g; print;'

# Stage 3 tidies each line: runs of control characters (0x00-0x1F) become one
# space, repeated spaces are collapsed, and one leading / one trailing space
# is removed before the line is printed with a newline.
tidy_line='chomp; s/[\x00-\x1F]+/ /g; s/  +/ /g; s/^ //; s/ $//; print "$_\n";'

# Stage 2 is the converter chosen earlier; $CMD is left unquoted on purpose so
# that the command string is split into the program name and its arguments.
perl -n -e "$crlf_to_lf" \
  | $CMD \
  | /usr/bin/perl -w -n -e "$tidy_line"