diff options
Diffstat (limited to 'corpus/support')
| -rwxr-xr-x | corpus/support/utf8-normalize-batch.pl | 28 | ||||
| -rwxr-xr-x | corpus/support/utf8-normalize.sh | 26 | 
2 files changed, 44 insertions, 10 deletions
diff --git a/corpus/support/utf8-normalize-batch.pl b/corpus/support/utf8-normalize-batch.pl new file mode 100755 index 00000000..e574f861 --- /dev/null +++ b/corpus/support/utf8-normalize-batch.pl @@ -0,0 +1,28 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +$|++; + +if (scalar(@ARGV) != 1) { +    print STDERR "usage: $0 \"CMD\"\n"; +    exit(2); +} + +$CMD = $ARGV[0]; + +while (<STDIN>) { +    s/\r\n*/\n/g; +    $PID = open2(*SOUT, *SIN, $CMD); +    print SIN "$_\n"; +    close(SIN); +    $_ = <SOUT>; +    close(SOUT); +    waitpid($PID, 0); +    chomp; +    s/[\x00-\x1F]+/ /g; +    s/  +/ /g; +    s/^ //; +    s/ $//; +    print "$_\n"; +} diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh index c85ae9f7..af9895ba 100755 --- a/corpus/support/utf8-normalize.sh +++ b/corpus/support/utf8-normalize.sh @@ -25,13 +25,19 @@ else    fi  fi -perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' | $CMD | /usr/bin/perl -w -e ' - $|++; - while (<>) { -     chomp; -      s/[\x00-\x1F]+/ /g; -      s/  +/ /g; -      s/^ //; -      s/ $//; -      print "$_\n"; -    }' +if [[ $# == 1 && $1 == "--batchline" ]]; then +    perl $(dirname $0)/utf8-normalize-batch.pl "$CMD" +else +    perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' \ +    |$CMD \ +    |/usr/bin/perl -w -e ' +        $|++; +        while (<>) { +            chomp; +            s/[\x00-\x1F]+/ /g; +            s/  +/ /g; +            s/^ //; +            s/ $//; +            print "$_\n"; +        }' +fi  | 
