summary | refs | log | tree | commit | diff
path: root/corpus/support
diff options
context:
space:
mode:
author armatthews <armatthe@cmu.edu> 2014-01-23 00:22:52 -0500
committer armatthews <armatthe@cmu.edu> 2014-01-23 00:22:52 -0500
commit a6786e51676f53fd4a729a9a072902ea8cd3d404 (patch)
tree c0f5392f26c62530f8d9df2c74dd582dafc84783 /corpus/support
parent 45cfb892abccf0874e8023564568a999f3fef356 (diff)
Reordered HTML entity blocks
Diffstat (limited to 'corpus/support')
-rwxr-xr-x corpus/support/quote-norm.pl 8
1 files changed, 4 insertions, 4 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while(<STDIN>) {
s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
s/&\s*#45\s*;/--/g; # hyphen-minus
+ # Convert arbitrary hex or decimal HTML entities to actual characters:
+ s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+ s/&\#([0-9]+);/pack("U", $1)/ge;
+
# Convert other Windows 1252 characters to UTF-8
s/\x{80}/\x{20ac}/g; # euro sign
s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while(<STDIN>) {
s/\x{fb03}/ffi/g; # "ffi" ligature
s/\x{fb04}/ffi/g; # "ffl" ligature
- # Convert arbitrary hex or decimal HTML entities to actual characters:
- s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
- s/&\#([0-9]+);/pack("U", $1)/ge;
-
s/β/ß/g; # WMT 2010 error
# Strip extra spaces: