diff options
Diffstat (limited to 'corpus')
-rwxr-xr-x | corpus/support/quote-norm.pl | 49 |
1 files changed, 32 insertions, 17 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index f677df66..bed0844e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -8,20 +8,8 @@ while(<STDIN>) { chomp; $_ = " $_ "; - # Regularlize spaces: - s/\x{a0}/ /g; # non-breaking space - s/\x{2009}/ /g; # thin space - s/\x{2028}/ /g; # "line separator" - s/\x{2029}/ /g; # "paragraph separator" - s/\x{202a}/ /g; # "left-to-right embedding" - s/\x{202b}/ /g; # "right-to-left embedding" - s/\x{202c}/ /g; # "pop directional formatting" - s/\x{202d}/ /g; # "left-to-right override" - s/\x{202e}/ /g; # "right-to-left override" - s/\x{85}/ /g; # "next line" - s/\x{fffd}/ /g; # "replacement character" - s/\x{feff}/ /g; # byte-order mark - s/\x{fdd3}/ /g; # "unicode non-character" + # Delete control characters: + s/[\x{00}-\x{1f}]//g; # Regularize named HTML/XML escapes: s/&\s*lt\s*;/</gi; # HTML opening angle bracket @@ -41,6 +29,21 @@ while(<STDIN>) { s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge; s/&\#([0-9]+);/pack("U", $1)/ge; + # Regularize spaces: + s/\x{a0}/ /g; # non-breaking space + s/\x{2009}/ /g; # thin space + s/\x{2028}/ /g; # "line separator" + s/\x{2029}/ /g; # "paragraph separator" + s/\x{202a}/ /g; # "left-to-right embedding" + s/\x{202b}/ /g; # "right-to-left embedding" + s/\x{202c}/ /g; # "pop directional formatting" + s/\x{202d}/ /g; # "left-to-right override" + s/\x{202e}/ /g; # "right-to-left override" + s/\x{85}/ /g; # "next line" + s/\x{fffd}/ /g; # "replacement character" + s/\x{feff}/ /g; # byte-order mark + s/\x{fdd3}/ /g; # "unicode non-character" + # Convert other Windows 1252 characters to UTF-8 s/\x{80}/\x{20ac}/g; # euro sign s/\x{95}/\x{2022}/g; # bullet @@ -53,7 +56,7 @@ while(<STDIN>) { s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; - # Ridiculous double conversions(?) 
(news commentary and Giga-FrEn): + # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8: s/�c/--/g; # long dash s/\x{e2}\x{20ac}oe/\"/g; # opening double quote s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote @@ -63,6 +66,19 @@ while(<STDIN>) { s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash? s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote? s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote? + s/\x{c3}\x{9f}/\x{df}/g; # eszett + s/\x{c3}\x{0178}/\x{df}/g; # eszett + s/\x{c3}\x{a4}/\x{e4}/g; # a umlaut + s/\x{c3}\x{b6}/\x{f6}/g; # o umlaut + s/\x{c3}\x{bc}/\x{fc}/g; # u umlaut + s/\x{c3}\x{84}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{201e}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{96}/\x{d6}/g; # O umlaut + s/\x{c3}\x{2013}/\x{d6}/g; # O umlaut + s/\x{c3}\x{bc}/\x{dc}/g; # U umlaut (FIXME: repeats the u-umlaut pattern c3 bc, so it never matches; UTF-8 for U+00DC is c3 9c) + s/\x{80}/\x{20ac}/g; # euro sign + s/\x{95}/\x{2022}/g; # bullet + s/\x{99}/\x{2122}/g; # trademark sign # Regularize quotes: s/ˇ/'/g; # caron @@ -130,6 +146,7 @@ while(<STDIN>) { s/–/--/g; s/─/--/g; s/—/--/g; + s/\x{97}/--/g; s/•/ * /g; s/\*/ * /g; s/،/,/g; @@ -158,8 +175,6 @@ while(<STDIN>) { s/^\s+//; s/\s+$//; - # Delete control characters: - s/[\x{00}-\x{1f}]//g; print "$_\n"; } |