diff options
author | armatthews <armatthe@cmu.edu> | 2014-01-23 00:22:52 -0500 |
---|---|---|
committer | armatthews <armatthe@cmu.edu> | 2014-01-23 00:22:52 -0500 |
commit | e2f66f6014a0bc50de89b0ce4640d6aac7355e7f (patch) | |
tree | 0e47e5743b0973a9774017385779275fa52016a4 | |
parent | aee1c55f0c4c5f4f7d3c0814197bba73b37fc92e (diff) |
Reordered HTML entity blocks
-rwxr-xr-x | corpus/support/quote-norm.pl | 8 |
1 file changed, 4 insertions, 4 deletions
```diff
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while(<STDIN>) {
   s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
   s/&\s*#45\s*;/--/g; # hyphen-minus
 
+  # Convert arbitrary hex or decimal HTML entities to actual characters:
+  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+  s/&\#([0-9]+);/pack("U", $1)/ge;
+
   # Convert other Windows 1252 characters to UTF-8
   s/\x{80}/\x{20ac}/g; # euro sign
   s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while(<STDIN>) {
   s/\x{fb03}/ffi/g; # "ffi" ligature
   s/\x{fb04}/ffi/g; # "ffl" ligature
 
-  # Convert arbitrary hex or decimal HTML entities to actual characters:
-  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
-  s/&\#([0-9]+);/pack("U", $1)/ge;
-
   s/β/ß/g; # WMT 2010 error
 
   # Strip extra spaces:
```