summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x corpus/support/quote-norm.pl 8
1 files changed, 4 insertions, 4 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while(<STDIN>) {
s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
s/&\s*#45\s*;/--/g; # hyphen-minus
+ # Convert arbitrary hex or decimal HTML entities to actual characters:
+ s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+ s/&\#([0-9]+);/pack("U", $1)/ge;
+
# Convert other Windows 1252 characters to UTF-8
s/\x{80}/\x{20ac}/g; # euro sign
s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while(<STDIN>) {
s/\x{fb03}/ffi/g; # "ffi" ligature
s/\x{fb04}/ffi/g; # "ffl" ligature
- # Convert arbitrary hex or decimal HTML entities to actual characters:
- s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
- s/&\#([0-9]+);/pack("U", $1)/ge;
-
s/β/ß/g; # WMT 2010 error
# Strip extra spaces: