diff options
Diffstat (limited to 'corpus/support')
-rwxr-xr-x | corpus/support/quote-norm.pl | 8 |
1 file changed, 4 insertions, 4 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while(<STDIN>) {
   s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
   s/&\s*#45\s*;/--/g; # hyphen-minus
 
+  # Convert arbitrary hex or decimal HTML entities to actual characters:
+  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+  s/&\#([0-9]+);/pack("U", $1)/ge;
+
   # Convert other Windows 1252 characters to UTF-8
   s/\x{80}/\x{20ac}/g; # euro sign
   s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while(<STDIN>) {
   s/\x{fb03}/ffi/g; # "ffi" ligature
   s/\x{fb04}/ffi/g; # "ffl" ligature
 
-  # Convert arbitrary hex or decimal HTML entities to actual characters:
-  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
-  s/&\#([0-9]+);/pack("U", $1)/ge;
-
   s/β/ß/g; # WMT 2010 error
 
   # Strip extra spaces: