summary refs log tree commit diff
path: root/corpus/support/quote-norm.pl
diff options
context:
space:
mode:
Diffstat (limited to 'corpus/support/quote-norm.pl')
-rwxr-xr-x corpus/support/quote-norm.pl 64
1 files changed, 64 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
new file mode 100755
index 00000000..0c5b9c26
--- /dev/null
+++ b/corpus/support/quote-norm.pl
@@ -0,0 +1,64 @@
+#!/usr/bin/perl -w
+# Normalizes quotes, dashes, HTML entities and currency marks in text.
+# Filter: reads UTF-8 lines on STDIN and prints each one, cleaned up and
+# whitespace-collapsed, on STDOUT.
+use strict;
+use utf8;    # the patterns below contain literal non-ASCII characters
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while (<STDIN>) {
+  chomp;
+  $_ = " $_ ";    # pad so rules anchored on a space or \W also fire at line edges
+  # HTML entities, tolerating stray inner whitespace. &amp; is decoded last,
+  # so "&amp;lt;" is unescaped by one level only.
+  s/&\s*lt\s*;/</gi;
+  s/&\s*gt\s*;/>/gi;
+  s/&\s*squot\s*;/'/gi;
+  s/&\s*quot\s*;/"/gi;
+  s/&\s*amp\s*;/&/gi;
+  s/ (\d\d): (\d\d)/ $1:$2/g;    # " 12: 30" -> " 12:30"
+  # Currency. The class used to read [\x{20a0}]\x{20ac}], which matched only
+  # the sequence U+20A0 U+20AC "]" and therefore never a lone euro sign.
+  s/[\x{20a0}\x{20ac}]/ EUR /g;
+  s/\x{00A3}/ GBP /g;
+  s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;    # "US$5" -> "US$ 5"
+  s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;      # "euro5" -> "EUR 5"
+  # Numeric hyphen entities: a pair and a leftover single both become "--".
+  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
+  s/&\s*#45\s*;/--/g;
+  # Double quotes. The two-character forms run before the apostrophe rules,
+  # so look-alikes that only become '' further down stay as two apostrophes.
+  s/ ,,/ "/g;
+  s/``|''/"/g;
+  s/[〃¨“”«»]/"/g;
+  s/([¡¿])/ $1 /g;    # set inverted punctuation off as separate tokens
+  # Apostrophe look-alikes.
+  s/[ˇ´`’‘ˉ]/'/g;
+  s/ ́/'/g;    # a space carrying a combining accent mark
+  s/β/ß/g; # WMT 2010 error
+  # Fullwidth forms U+FF01..U+FF5E -> ASCII "!".."~". This line read tr/!-~/!-~/
+  # (a no-op). NOTE(review): fullwidth source range is presumed -- confirm.
+  tr/\x{ff01}-\x{ff5e}/!-~/;
+  s/、/,/g;
+  s/。/./g;
+  s/…/.../g;
+  s/[―–─—]/--/g;
+  s/•/ * /g;
+  s/\*/ * /g;
+  s/،/,/g;
+  s/؟/?/g;
+  s/ـ/ /g;
+  # NOTE(review): the next three look like mojibake repairs; the middle one
+  # repeats the ’ rule above and cannot match here -- confirm the patterns.
+  s/Ã ̄/i/g;
+  s/’/'/g;
+  s/â€"/"/g;
+  s/؛/;/g;
+  # Collapse whitespace runs, trim the padding, drop leftover control chars.
+  s/\s+/ /g;
+  s/^\s+//;
+  s/\s+$//;
+  s/[\x{00}-\x{1f}]//g;
+  print "$_\n";
+}
+