From a6786e51676f53fd4a729a9a072902ea8cd3d404 Mon Sep 17 00:00:00 2001
From: armatthews
Date: Thu, 23 Jan 2014 00:22:52 -0500
Subject: Reordered HTML entity blocks

---
 corpus/support/quote-norm.pl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while() {
   s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
   s/&\s*#45\s*;/--/g; # hyphen-minus
 
+  # Convert arbitrary hex or decimal HTML entities to actual characters:
+  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+  s/&\#([0-9]+);/pack("U", $1)/ge;
+
   # Convert other Windows 1252 characters to UTF-8
   s/\x{80}/\x{20ac}/g; # euro sign
   s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while() {
   s/\x{fb03}/ffi/g; # "ffi" ligature
   s/\x{fb04}/ffi/g; # "ffl" ligature
 
-  # Convert arbitrary hex or decimal HTML entities to actual characters:
-  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
-  s/&\#([0-9]+);/pack("U", $1)/ge;
-
   s/β/ß/g; # WMT 2010 error
 
   # Strip extra spaces:
-- 
cgit v1.2.3