summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
authorMichael Denkowski <mdenkows@cs.cmu.edu>2013-09-05 14:15:43 -0700
committerMichael Denkowski <mdenkows@cs.cmu.edu>2013-09-05 14:15:43 -0700
commitc55f37fee9f43c0a13b47aac512804ecf9f5bd48 (patch)
tree5b29abbc119dab32d1e991fc4fd55594803b6850 /corpus
parent208fabfbbe19c1ba2ee744e9d16b54805ec8b141 (diff)
Slower but correct (wrt buffered) unbuffered version.
Diffstat (limited to 'corpus')
-rwxr-xr-xcorpus/support/utf8-normalize-batch.pl28
-rwxr-xr-xcorpus/support/utf8-normalize.sh26
-rwxr-xr-xcorpus/tokenize-anything.sh6
3 files changed, 47 insertions, 13 deletions
diff --git a/corpus/support/utf8-normalize-batch.pl b/corpus/support/utf8-normalize-batch.pl
new file mode 100755
index 00000000..e574f861
--- /dev/null
+++ b/corpus/support/utf8-normalize-batch.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+# Per-line variant of utf8-normalize.sh: runs CMD afresh for every STDIN line.
+use strict;
+use warnings;
+use IPC::Open2;
+$|++;    # autoflush STDOUT so every result is visible immediately
+
+if (scalar(@ARGV) != 1) {
+ print STDERR "usage: $0 \"CMD\"\n";
+ exit(2);
+}
+my $cmd = $ARGV[0];
+while (my $line = <STDIN>) {
+ $line =~ s/\r\n*/\n/g;    # a CR plus any following LFs becomes one LF
+ my $pid = open2(my $from_cmd, my $to_cmd, $cmd);
+ print {$to_cmd} "$line\n";
+ close($to_cmd);    # EOF on its stdin lets CMD finish
+ my $out = <$from_cmd>;    # only the first line CMD prints is used
+ close($from_cmd);
+ waitpid($pid, 0);    # reap the child before starting the next one
+ $out = '' unless defined $out;    # CMD printed nothing: emit an empty line
+ chomp $out;
+ $out =~ s/[\x00-\x1F]+/ /g;    # runs of control characters become one space
+ $out =~ s/ +/ /g;
+ $out =~ s/^ //;
+ $out =~ s/ $//;
+ print "$out\n";
+}
diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh
index c85ae9f7..af9895ba 100755
--- a/corpus/support/utf8-normalize.sh
+++ b/corpus/support/utf8-normalize.sh
@@ -25,13 +25,19 @@ else
fi
fi
-perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' | $CMD | /usr/bin/perl -w -e '
- $|++;
- while (<>) {
- chomp;
- s/[\x00-\x1F]+/ /g;
- s/ +/ /g;
- s/^ //;
- s/ $//;
- print "$_\n";
- }'
+if [[ $# == 1 && $1 == "--batchline" ]]; then
+ perl $(dirname $0)/utf8-normalize-batch.pl "$CMD"
+else
+ perl -e '$|++; while(<>){s/\r\n*/\n/g; print;}' \
+ |$CMD \
+ |/usr/bin/perl -w -e '
+ $|++;
+ while (<>) {
+ chomp;
+ s/[\x00-\x1F]+/ /g;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ print "$_\n";
+ }'
+fi
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 52739e81..a20a022f 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -4,12 +4,12 @@ ROOTDIR=`dirname $0`
SUPPORT=$ROOTDIR/support
if [[ $# == 1 && $1 == '-u' ]] ; then
- NORMCMD=cat
+ NORMARGS="--batchline"
else
- NORMCMD=$SUPPORT/utf8-normalize.sh
+ NORMARGS=""
fi
-$NORMCMD |
+$SUPPORT/utf8-normalize.sh $NORMARGS |
$SUPPORT/quote-norm.pl |
$SUPPORT/tokenizer.pl |
sed -u -e 's/ al - / al-/g' |