diff options
author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-09-05 14:15:43 -0700 |
---|---|---|
committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-09-05 14:15:43 -0700 |
commit | c55f37fee9f43c0a13b47aac512804ecf9f5bd48 (patch) | |
tree | 5b29abbc119dab32d1e991fc4fd55594803b6850 | |
parent | 208fabfbbe19c1ba2ee744e9d16b54805ec8b141 (diff) |
Slower but correct (wrt buffered) unbuffered version.
-rwxr-xr-x | corpus/support/utf8-normalize-batch.pl | 28 | ||||
-rwxr-xr-x | corpus/support/utf8-normalize.sh | 26 | ||||
-rwxr-xr-x | corpus/tokenize-anything.sh | 6 |
3 files changed, 47 insertions, 13 deletions
```diff
diff --git a/corpus/support/utf8-normalize-batch.pl b/corpus/support/utf8-normalize-batch.pl
new file mode 100755
index 00000000..e574f861
--- /dev/null
+++ b/corpus/support/utf8-normalize-batch.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+
+use IPC::Open2;
+
+$|++;
+
+if (scalar(@ARGV) != 1) {
+  print STDERR "usage: $0 \"CMD\"\n";
+  exit(2);
+}
+
+$CMD = $ARGV[0];
+
+while (<STDIN>) {
+  s/\r\n*/\n/g;
+  $PID = open2(*SOUT, *SIN, $CMD);
+  print SIN "$_\n";
+  close(SIN);
+  $_ = <SOUT>;
+  close(SOUT);
+  waitpid($PID, 0);
+  chomp;
+  s/[\x00-\x1F]+/ /g;
+  s/ +/ /g;
+  s/^ //;
+  s/ $//;
+  print "$_\n";
+}
diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh
index c85ae9f7..af9895ba 100755
--- a/corpus/support/utf8-normalize.sh
+++ b/corpus/support/utf8-normalize.sh
@@ -25,13 +25,19 @@ else
   fi
 fi
 
-perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' | $CMD | /usr/bin/perl -w -e '
-  $|++;
-  while (<>) {
-    chomp;
-    s/[\x00-\x1F]+/ /g;
-    s/ +/ /g;
-    s/^ //;
-    s/ $//;
-    print "$_\n";
-  }'
+if [[ $# == 1 && $1 == "--batchline" ]]; then
+  perl $(dirname $0)/utf8-normalize-batch.pl "$CMD"
+else
+  perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' \
+    |$CMD \
+    |/usr/bin/perl -w -e '
+      $|++;
+      while (<>) {
+        chomp;
+        s/[\x00-\x1F]+/ /g;
+        s/ +/ /g;
+        s/^ //;
+        s/ $//;
+        print "$_\n";
+      }'
+fi
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 52739e81..a20a022f 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -4,12 +4,12 @@ ROOTDIR=`dirname $0`
 SUPPORT=$ROOTDIR/support
 
 if [[ $# == 1 && $1 == '-u' ]] ; then
-  NORMCMD=cat
+  NORMARGS="--batchline"
 else
-  NORMCMD=$SUPPORT/utf8-normalize.sh
+  NORMARGS=""
 fi
 
-$NORMCMD |
+$SUPPORT/utf8-normalize.sh $NORMARGS |
   $SUPPORT/quote-norm.pl |
   $SUPPORT/tokenizer.pl |
   sed -u -e 's/ al - / al-/g' |
```