diff options
Diffstat (limited to 'corpus/support/quote-norm.pl')
-rwxr-xr-x | corpus/support/quote-norm.pl | 64 |
1 files changed, 64 insertions, 0 deletions
#!/usr/bin/perl
#
# quote-norm.pl -- normalise quotes, dashes, HTML entities and assorted
# punctuation in running text.
#
# Reads UTF-8 text from STDIN line by line and prints one normalised line
# to STDOUT per input line.
#
use strict;
use warnings;
use utf8;    # the patterns below contain UTF-8 literals

# normalize_line($line)
#
# Takes one line of text (without its trailing newline) and returns the
# normalised line: entities decoded, typographic quotes and dashes mapped to
# ASCII, currency signs spelled out, whitespace collapsed and trimmed, and
# control characters removed.  The argument is not modified.
sub normalize_line {
    my ($line) = @_;

    # Pad with spaces so rules that need a leading space or \W also fire at
    # the very start of the line; the padding is trimmed again at the end.
    my $text = " $line ";

    for ($text) {
        # HTML entities (tolerating stray whitespace inside the entity).
        s/&\s*lt\s*;/</gi;
        s/&\s*gt\s*;/>/gi;
        s/&\s*squot\s*;/'/gi;
        s/&\s*quot\s*;/"/gi;
        s/&\s*amp\s*;/&/gi;

        # "12: 30" -> "12:30"
        s/ (\d\d): (\d\d)/ $1:$2/g;

        # Currency: pound sign, then split "USD10" / "US$10" / "euro5"
        # into code and amount.
        s/[\x{00A3}]/ GBP /g;
        s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
        s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;

        # Numeric entity for hyphen-minus.
        # NOTE(review): a lone &#45; also becomes "--", not "-" -- confirm
        # that this is intended.
        s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
        s/&\s*#45\s*;/--/g;

        # Double-quote look-alikes.
        s/ ,,/ "/g;
        s/``/"/g;
        s/''/"/g;
        s/〃/"/g;
        s/¨/"/g;

        # Inverted punctuation gets split off as its own token.
        s/¡/ ¡ /g;
        s/¿/ ¿ /g;

        # Single-quote look-alikes.
        s/ˇ/'/g;
        s/´/'/g;
        s/`/'/g;
        s/’/'/g;
        s/ ́/'/g;    # space followed by a combining acute accent
        s/‘/'/g;
        s/ˉ/'/g;
        s/β/ß/g; # WMT 2010 error
        s/“/"/g;
        s/”/"/g;
        s/«/"/g;
        s/»/"/g;

        # Fullwidth forms U+FF01..U+FF5E map one-to-one onto ASCII '!'..'~'.
        # The previous tr/!-~/!-~/ mapped the range onto itself and did
        # nothing; presumably the fullwidth literals were lost in
        # transcoding.
        tr/\x{FF01}-\x{FF5E}/!-~/;

        # CJK punctuation, ellipsis, dashes, bullets.
        s/、/,/g;
        s/。/./g;
        s/…/.../g;
        s/―/--/g;
        s/–/--/g;
        s/─/--/g;
        s/—/--/g;
        s/•/ * /g;
        s/\*/ * /g;

        # Arabic comma, question mark and tatweel.
        s/،/,/g;
        s/؟/?/g;
        s/ـ/ /g;

        # Mojibake repairs.
        s/à ̄/i/g;
        # NOTE(review): the next rule repeats one that already ran above and
        # can no longer match; possibly a mojibake pattern that was damaged
        # in transcoding -- confirm against the upstream file.
        s/’/'/g;
        s/â€"/"/g;
        s/؛/;/g;

        # Euro signs.  The previous pattern "[\x{20a0}]\x{20ac}]" had a
        # stray "]" and only matched the three-character sequence
        # U+20A0 U+20AC "]", so a plain euro sign was never rewritten.
        # This rule runs after the mojibake repairs because the "â€" rule
        # above must still see the raw U+20AC.
        s/[\x{20a0}\x{20ac}]/ EUR /g;

        # Whitespace and control characters.  Whitespace (including tabs)
        # is turned into single spaces first, then the remaining control
        # characters are deleted.  Deleting one can leave two spaces next to
        # each other or a space at either end, so collapse once more before
        # trimming.
        s/\s+/ /g;
        s/[\x{00}-\x{1f}]//g;
        s/\s+/ /g;
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}

# Filter STDIN to STDOUT, treating both as UTF-8.
sub main {
    binmode(STDIN,  ':utf8');
    binmode(STDOUT, ':utf8');

    while (my $line = <STDIN>) {
        chomp $line;
        print normalize_line($line) . "\n";
    }
    return;
}

# Run as a filter when executed directly; stay silent when loaded with
# require/do so normalize_line() can be reused.
main() unless caller;