summary | refs | log | tree | commit | diff
path: root/corpus/support
diff options
context:
space:
mode:
author armatthews <armatthe@cmu.edu> 2014-01-23 00:22:52 -0500
committer armatthews <armatthe@cmu.edu> 2014-01-23 00:22:52 -0500
commit e2f66f6014a0bc50de89b0ce4640d6aac7355e7f (patch)
tree 0e47e5743b0973a9774017385779275fa52016a4 /corpus/support
parent aee1c55f0c4c5f4f7d3c0814197bba73b37fc92e (diff)
Reordered HTML entity blocks
Diffstat (limited to 'corpus/support')
-rwxr-xr-x corpus/support/quote-norm.pl 8
1 files changed, 4 insertions, 4 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index c99c2355..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -37,6 +37,10 @@ while(<STDIN>) {
s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
s/&\s*#45\s*;/--/g; # hyphen-minus
+ # Convert arbitrary hex or decimal HTML entities to actual characters:
+ s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+ s/&\#([0-9]+);/pack("U", $1)/ge;
+
# Convert other Windows 1252 characters to UTF-8
s/\x{80}/\x{20ac}/g; # euro sign
s/\x{95}/\x{2022}/g; # bullet
@@ -147,10 +151,6 @@ while(<STDIN>) {
s/\x{fb03}/ffi/g; # "ffi" ligature
s/\x{fb04}/ffi/g; # "ffl" ligature
- # Convert arbitrary hex or decimal HTML entities to actual characters:
- s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
- s/&\#([0-9]+);/pack("U", $1)/ge;
-
s/β/ß/g; # WMT 2010 error
# Strip extra spaces: