summary refs log tree commit diff
diff options
context:
space:
mode:
author armatthews <armatthe@cmu.edu> 2014-01-23 00:20:37 -0500
committer armatthews <armatthe@cmu.edu> 2014-01-23 00:20:37 -0500
commit aee1c55f0c4c5f4f7d3c0814197bba73b37fc92e (patch)
tree 041c8621d867e27e2b5b081619d00a56d75a2a22
parent 5f9405ebda4810c2740ed4b89753e8a24f6e48c6 (diff)
Merged quote-norm with Greg's WMT normalization script
-rwxr-xr-x corpus/support/quote-norm.pl 143
1 files changed, 108 insertions, 35 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 7fe33db4..c99c2355 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -7,31 +7,94 @@ binmode(STDOUT,"utf8");
while(<STDIN>) {
chomp;
$_ = " $_ ";
- s/&\s*lt\s*;/</gi;
- s/&\s*gt\s*;/>/gi;
- s/&\s*squot\s*;/'/gi;
- s/&\s*quot\s*;/"/gi;
- s/&\s*amp\s*;/&/gi;
- s/&\s*nbsp\s*;/&/gi;
- s/&\s*#\s*160\s*;/ /gi;
+
+ # Regularize spaces:
+ s/\x{a0}/ /g; # non-breaking space
+ s/\x{2009}/ /g; # thin space
+ s/\x{2028}/ /g; # "line separator"
+ s/\x{2029}/ /g; # "paragraph separator"
+ s/\x{202a}/ /g; # "left-to-right embedding"
+ s/\x{202b}/ /g; # "right-to-left embedding"
+ s/\x{202c}/ /g; # "pop directional formatting"
+ s/\x{202d}/ /g; # "left-to-right override"
+ s/\x{202e}/ /g; # "right-to-left override"
+ s/\x{85}/ /g; # "next line"
+ s/\x{fffd}/ /g; # "replacement character"
+ s/\x{feff}/ /g; # byte-order mark
+ s/\x{fdd3}/ /g; # "unicode non-character"
+
+ # Regularize named HTML/XML escapes:
+ s/&\s*lt\s*;/</gi; # HTML opening angle bracket
+ s/&\s*gt\s*;/>/gi; # HTML closing angle bracket
+ s/&\s*squot\s*;/'/gi; # HTML single quote
+ s/&\s*quot\s*;/"/gi; # HTML double quote
+ s/&\s*nbsp\s*;/ /gi; # HTML non-breaking space
+ s/&apos;/\'/g; # HTML apostrophe
+ s/&\s*amp\s*;/&/gi; # HTML ampersand (last)
+
+ # Regularize known HTML numeric codes:
+ s/&\s*#\s*160\s*;/ /gi; # no-break space
+ s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
+ s/&\s*#45\s*;/--/g; # hyphen-minus
+
+ # Convert other Windows 1252 characters to UTF-8
+ s/\x{80}/\x{20ac}/g; # euro sign
+ s/\x{95}/\x{2022}/g; # bullet
+ s/\x{99}/\x{2122}/g; # trademark sign
+
+ # Currency and measure conversions:
s/ (\d\d): (\d\d)/ $1:$2/g; # rejoin "HH: MM" where a space follows the colon
s/[\x{20a0}\x{20ac}]/ EUR /g; # euro-currency sign (U+20A0) or euro sign (U+20AC); the old class had a stray "]" and never matched a lone sign
s/[\x{00A3}]/ GBP /g; # pound sign
s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; # split an upper-case code (optionally ending in "$") fused to an amount, e.g. "US$5" -> "US$ 5"
s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; # "eur"/"euro" fused to an amount -> "EUR <amount>"
- s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
- s/&\s*#45\s*;/--/g;
- s/�c/--/g;
- s/ ,,/ "/g;
- s/„/"/g;
- s/``/"/g;
- s/''/"/g;
- s/[「」]/"/g;
- s/〃/"/g;
- s/¨/"/g;
+
+ # Ridiculous double conversions(?) (news commentary and Giga-FrEn):
+ s/�c/--/g; # long dash
+ s/\x{e2}\x{20ac}oe/\"/g; # opening double quote
+ s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote
+ s/\x{e2}\x{20ac}\x{9d}/\"/g; # closing double quote
+ s/\x{e2}\x{20ac}\x{2122}/\'/g; # apostrophe
+ s/\x{e2}\x{20ac}\x{201c}/ -- /g; # en dash?
+ s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash?
+ s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote?
+ s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote?
+
+ # Regularize quotes:
+ s/ˇ/'/g; # caron
+ s/´/'/g; # acute accent
+ s/`/'/g; # grave accent
+ s/ˉ/'/g; # modified letter macron
+ s/ ,,/ "/g; # ghetto low-99 quote
+ s/``/"/g; # latex-style left quote
+ s/''/"/g; # latex-style right quote
+ s/\x{300c}/"/g; # left corner bracket
+ s/\x{300d}/"/g; # right corner bracket
+ s/\x{3003}/"/g; # ditto mark
+ s/\x{00a8}/"/g; # diaeresis
+ s/\x{92}/\'/g; # curly apostrophe
+ s/\x{2019}/\'/g; # curly apostrophe
+ s/\x{f03d}/\'/g; # curly apostrophe
+ s/\x{b4}/\'/g; # curly apostrophe
+ s/\x{2018}/\'/g; # curly single open quote
+ s/\x{201a}/\'/g; # low-9 quote
+ s/\x{93}/\"/g; # curly left quote
+ s/\x{201c}/\"/g; # curly left quote
+ s/\x{94}/\"/g; # curly right quote
+ s/\x{201d}/\"/g; # curly right quote
+ s/\x{2033}/\"/g; # curly right quote
+ s/\x{201e}/\"/g; # low-99 quote
+ s/\x{84}/\"/g; # low-99 quote (bad enc)
+ s/\x{201f}/\"/g; # high-rev-99 quote
+ s/\x{ab}/\"/g; # opening guillemet
+ s/\x{bb}/\"/g; # closing guillemet
+ s/\x{0301}/'/g; # combining acute accent
+
+ # Space inverted punctuation:
s/¡/ ¡ /g;
s/¿/ ¿ /g;
+ # Russian abbreviations:
s/ п. п. / п.п. /g;
s/ ст. л. / ст.л. /g;
s/ т. е. / т.е. /g;
@@ -45,29 +108,19 @@ while(<STDIN>) {
s/ т. н. / т.н. /g;
s/ т. ч. / т.ч. /g;
s/ н. э. / н.э. /g;
- # â<U+0080><U+0099>
- s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
- s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
- s/ˇ/'/g;
- s/´/'/g;
- s/`/'/g;
- s/’/'/g;
- s/ ́/'/g;
- s/‘/'/g;
- s/ˉ/'/g;
- s/β/ß/g; # WMT 2010 error
- s/“/"/g;
- s/”/"/g;
- s/«/"/g;
- s/»/"/g;
- tr/!-~/!-~/;
+
+ # Convert foreign numerals into Arabic numerals
tr/०-९/0-9/; # devanagari
tr/౦-౯/0-9/; # telugu
tr/೦-೯/0-9/; # kannada
tr/೦-௯/0-9/; # tamil
tr/൦-൯/0-9/; # malayalam
+
+ # Random punctuation:
+ tr/!-~/!-~/;
s/、/,/g;
# s/。/./g;
+ s/\x{85}/.../g;
s/…/.../g;
s/―/--/g;
s/–/--/g;
@@ -82,11 +135,31 @@ while(<STDIN>) {
s/’/'/g;
s/â€"/"/g;
s/؛/;/g;
-
+
+ # Regularize ligatures (expand each one to its plain letter sequence):
+ s/\x{9c}/oe/g; # "oe" ligature (presumably the Windows-1252 code for U+0153 -- confirm)
+ s/\x{0153}/oe/g; # "oe" ligature
+ s/\x{8c}/Oe/g; # "OE" ligature (presumably the Windows-1252 code for U+0152 -- confirm)
+ s/\x{0152}/Oe/g; # "OE" ligature
+ s/\x{fb00}/ff/g; # "ff" ligature
+ s/\x{fb01}/fi/g; # "fi" ligature
+ s/\x{fb02}/fl/g; # "fl" ligature
+ s/\x{fb03}/ffi/g; # "ffi" ligature
+ s/\x{fb04}/ffl/g; # "ffl" ligature (was wrongly expanded to "ffi")
+
+ # Convert arbitrary hex or decimal HTML entities to actual characters:
+ s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+ s/&\#([0-9]+);/pack("U", $1)/ge;
+
+ s/β/ß/g; # WMT 2010 error
+
+ # Strip extra spaces:
s/\s+/ /g;
s/^\s+//;
s/\s+$//;
- s/[\x{00}-\x{1f}]//g;
+
+ # Delete control characters:
+ s/[\x{00}-\x{1f}]//g;
print "$_\n";
}