diff options
author | armatthews <armatthe@cmu.edu> | 2014-01-23 00:22:52 -0500 |
---|---|---|
committer | armatthews <armatthe@cmu.edu> | 2014-01-23 00:22:52 -0500 |
commit | a6786e51676f53fd4a729a9a072902ea8cd3d404 (patch) | |
tree | c0f5392f26c62530f8d9df2c74dd582dafc84783 | |
parent | 45cfb892abccf0874e8023564568a999f3fef356 (diff) |
Reordered HTML entity blocks
-rwxr-xr-x | corpus/support/quote-norm.pl | 8 |
1 files changed, 4 insertions, 4 deletions
```diff
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while(<STDIN>) {
  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
  s/&\s*#45\s*;/--/g; # hyphen-minus
 
+ # Convert arbitrary hex or decimal HTML entities to actual characters:
+ s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+ s/&\#([0-9]+);/pack("U", $1)/ge;
+
  # Convert other Windows 1252 characters to UTF-8
  s/\x{80}/\x{20ac}/g; # euro sign
  s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while(<STDIN>) {
  s/\x{fb03}/ffi/g; # "ffi" ligature
  s/\x{fb04}/ffi/g; # "ffl" ligature
 
- # Convert arbitrary hex or decimal HTML entities to actual characters:
- s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
- s/&\#([0-9]+);/pack("U", $1)/ge;
-
  s/β/ß/g; # WMT 2010 error
 
  # Strip extra spaces:
```