summary | refs | log | tree | commit | diff
path: root/corpus/support/utf8-normalize-batch.pl
diff options
context:
space:
mode:
author: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-05 14:15:43 -0700
committer: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-05 14:15:43 -0700
commit: bef5c40937619e8360a87f988f204ba0ff2ad300 (patch)
tree: e14ee7764e1031fc4a319102d278167fc127423e /corpus/support/utf8-normalize-batch.pl
parent: 40688c6c8ac48c809e1b4f1fa10d93144620dead (diff)
Slower but correct (wrt buffered) unbuffered version.
Diffstat (limited to 'corpus/support/utf8-normalize-batch.pl')
-rwxr-xr-x corpus/support/utf8-normalize-batch.pl 28
1 files changed, 28 insertions, 0 deletions
diff --git a/corpus/support/utf8-normalize-batch.pl b/corpus/support/utf8-normalize-batch.pl
new file mode 100755
index 00000000..e574f861
--- /dev/null
+++ b/corpus/support/utf8-normalize-batch.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+
+# Pipe STDIN through an external command one line at a time.
+#
+# usage: utf8-normalize-batch.pl "CMD"
+#
+# A fresh CMD process is started for every input line: the line is written
+# to its stdin, its stdout is collected, and exactly one cleaned-up line is
+# printed to STDOUT. Running the command per line is slow, but it keeps the
+# output aligned one-to-one with the input regardless of how CMD buffers or
+# splits its output.
+#
+# Exit status: 2 when the argument count is wrong.
+
+use strict;
+use warnings;
+
+use IPC::Open2;
+
+# Unbuffer STDOUT so each result is emitted as soon as it is ready.
+$| = 1;
+
+if (scalar(@ARGV) != 1) {
+    print STDERR "usage: $0 \"CMD\"\n";
+    exit(2);
+}
+
+my $CMD = $ARGV[0];
+
+while (my $line = <STDIN>) {
+    # Fold CR, CRLF and CR followed by any run of LFs into a single LF.
+    $line =~ s/\r\n*/\n/g;
+
+    # open2 raises an exception itself if the child cannot be forked.
+    my $pid = open2(my $from_child, my $to_child, $CMD);
+    print {$to_child} "$line\n";
+    close($to_child);
+
+    # Read ALL of the child's output, not just its first line. An embedded
+    # CR has become a newline above, so the command may legitimately answer
+    # with several lines; reading only one would silently drop the rest.
+    # The control-character collapse below folds them back into one line.
+    my $result = do { local $/; <$from_child> };
+    close($from_child);
+
+    waitpid($pid, 0);
+    if ($? != 0) {
+        # Keep going (best effort), but do not hide a failing command.
+        warn "$0: command exited with status " . ($? >> 8)
+            . " on input line $.\n";
+    }
+
+    # The command may have printed nothing at all.
+    $result = '' unless defined $result;
+
+    chomp $result;
+    $result =~ s/[\x00-\x1F]+/ /g;    # control characters (incl. newlines) -> space
+    $result =~ s/  +/ /g;             # squeeze runs of spaces
+    $result =~ s/^ //;                # trim one leading space
+    $result =~ s/ $//;                # trim one trailing space
+    print "$result\n";
+}