diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/support/quote-norm.pl | 143 | 
1 file changed, 108 insertions, 35 deletions
| diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 7fe33db4..c99c2355 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -7,31 +7,94 @@ binmode(STDOUT,"utf8");  while(<STDIN>) {    chomp;    $_ = " $_ "; -  s/&\s*lt\s*;/</gi; -  s/&\s*gt\s*;/>/gi; -  s/&\s*squot\s*;/'/gi; -  s/&\s*quot\s*;/"/gi; -  s/&\s*amp\s*;/&/gi; -  s/&\s*nbsp\s*;/&/gi; -  s/&\s*#\s*160\s*;/ /gi; + +  # Regularlize spaces: +  s/\x{a0}/ /g;       # non-breaking space +  s/\x{2009}/ /g;     # thin space +  s/\x{2028}/ /g;     # "line separator" +  s/\x{2029}/ /g;     # "paragraph separator" +  s/\x{202a}/ /g;     # "left-to-right embedding" +  s/\x{202b}/ /g;     # "right-to-left embedding" +  s/\x{202c}/ /g;     # "pop directional formatting" +  s/\x{202d}/ /g;     # "left-to-right override" +  s/\x{202e}/ /g;     # "right-to-left override" +  s/\x{85}/ /g;       # "next line" +  s/\x{fffd}/ /g;     # "replacement character" +  s/\x{feff}/ /g;     # byte-order mark +  s/\x{fdd3}/ /g;     # "unicode non-character" + +  # Regularize named HTML/XML escapes: +  s/&\s*lt\s*;/</gi;    # HTML opening angle bracket +  s/&\s*gt\s*;/>/gi;    # HTML closing angle bracket +  s/&\s*squot\s*;/'/gi; # HTML single quote +  s/&\s*quot\s*;/"/gi;  # HTML double quote +  s/&\s*nbsp\s*;/ /gi;  # HTML non-breaking space +  s/'/\'/g;        # HTML apostrophe +  s/&\s*amp\s*;/&/gi;   # HTML ampersand (last) + +  # Regularize known HTML numeric codes: +  s/&\s*#\s*160\s*;/ /gi;           # no-break space +  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus +  s/&\s*#45\s*;/--/g;               # hyphen-minus + +  # Convert other Windows 1252 characters to UTF-8  +  s/\x{80}/\x{20ac}/g;    # euro sign +  s/\x{95}/\x{2022}/g;    # bullet +  s/\x{99}/\x{2122}/g;    # trademark sign + +  # Currency and measure conversions:    s/ (\d\d): (\d\d)/ $1:$2/g;    s/[\x{20a0}]\x{20ac}]/ EUR /g;    s/[\x{00A3}]/ GBP /g;    s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 
$3/g;    s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; -  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; -  s/&\s*#45\s*;/--/g; -  s/�c/--/g; -  s/ ,,/ "/g; -  s/„/"/g; -  s/``/"/g; -  s/''/"/g; -  s/[「」]/"/g; -  s/〃/"/g; -  s/¨/"/g; + +  # Ridiculous double conversions(?) (news commentary and Giga-FrEn): +  s/�c/--/g;                        # long dash +  s/\x{e2}\x{20ac}oe/\"/g;            # opening double quote +  s/\x{e2}\x{20ac}\x{9c}/\"/g;        # opening double quote +  s/\x{e2}\x{20ac}\x{9d}/\"/g;        # closing double quote +  s/\x{e2}\x{20ac}\x{2122}/\'/g;      # apostrophe +  s/\x{e2}\x{20ac}\x{201c}/ -- /g;    # en dash? +  s/\x{e2}\x{20ac}\x{201d}/ -- /g;    # em dash?  +  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote? +  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote? + +  # Regularize quotes: +  s/ˇ/'/g;            # caron +  s/´/'/g;            # acute accent +  s/`/'/g;            # grave accent +  s/ˉ/'/g;            # modified letter macron +  s/ ,,/ "/g;         # ghetto low-99 quote +  s/``/"/g;           # latex-style left quote +  s/''/"/g;           # latex-style right quote +  s/\x{300c}/"/g;     # left corner bracket +  s/\x{300d}/"/g;     # right corner bracket +  s/\x{3003}/"/g;     # ditto mark +  s/\x{00a8}/"/g;     # diaeresis +  s/\x{92}/\'/g;      # curly apostrophe +  s/\x{2019}/\'/g;    # curly apostrophe +  s/\x{f03d}/\'/g;    # curly apostrophe +  s/\x{b4}/\'/g;      # curly apostrophe +  s/\x{2018}/\'/g;    # curly single open quote +  s/\x{201a}/\'/g;    # low-9 quote +  s/\x{93}/\"/g;      # curly left quote +  s/\x{201c}/\"/g;    # curly left quote +  s/\x{94}/\"/g;      # curly right quote +  s/\x{201d}/\"/g;    # curly right quote +  s/\x{2033}/\"/g;    # curly right quote +  s/\x{201e}/\"/g;    # low-99 quote +  s/\x{84}/\"/g;      # low-99 quote (bad enc) +  s/\x{201f}/\"/g;    # high-rev-99 quote +  s/\x{ab}/\"/g;      # opening guillemet +  s/\x{bb}/\"/g;      # closing guillemet +  s/\x{0301}/'/g;     # combining acute 
accent + +  # Space inverted punctuation:    s/¡/ ¡ /g;    s/¿/ ¿ /g; +  # Russian abbreviations:    s/ п. п. / п.п. /g;    s/ ст. л. / ст.л. /g;    s/ т. е. / т.е. /g; @@ -45,29 +108,19 @@ while(<STDIN>) {    s/ т. н. / т.н. /g;    s/ т. ч. / т.ч. /g;    s/ н. э. / н.э. /g; -  # â<U+0080><U+0099> -  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; -  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; -  s/ˇ/'/g; -  s/´/'/g; -  s/`/'/g; -  s/’/'/g; -  s/ ́/'/g; -  s/‘/'/g; -  s/ˉ/'/g; -  s/β/ß/g; # WMT 2010 error -  s/“/"/g; -  s/”/"/g; -  s/«/"/g; -  s/»/"/g; -  tr/!-~/!-~/; + +  # Convert foreign numerals into Arabic numerals    tr/०-९/0-9/; # devangari    tr/౦-౯/0-9/; # telugu    tr/೦-೯/0-9/; # kannada    tr/೦-௯/0-9/; # tamil    tr/൦-൯/0-9/; # malayalam + +  # Random punctuation: +  tr/!-~/!-~/;    s/、/,/g;    # s/。/./g; +  s/\x{85}/.../g;    s/…/.../g;    s/―/--/g;    s/–/--/g; @@ -82,11 +135,31 @@ while(<STDIN>) {    s/’/'/g;    s/â€"/"/g;    s/؛/;/g; -		     + +  # Regularize ligatures: +  s/\x{9c}/oe/g;      # "oe" ligature  +  s/\x{0153}/oe/g;    # "oe" ligature  +  s/\x{8c}/Oe/g;      # "OE" ligature +  s/\x{0152}/Oe/g;    # "OE" ligature +  s/\x{fb00}/ff/g;    # "ff" ligature +  s/\x{fb01}/fi/g;    # "fi" ligature +  s/\x{fb02}/fl/g;    # "fl" ligature +  s/\x{fb03}/ffi/g;   # "ffi" ligature +  s/\x{fb04}/ffi/g;   # "ffl" ligature + +  # Convert arbitrary hex or decimal HTML entities to actual characters: +  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge; +  s/&\#([0-9]+);/pack("U", $1)/ge; + +  s/β/ß/g; # WMT 2010 error + +  # Strip extra spaces:     s/\s+/ /g;    s/^\s+//;    s/\s+$//; -  s/[\x{00}-\x{1f}]//g; + +  # Delete control characters: +  s/[\x{00}-\x{1f}]//g;     print "$_\n";  } | 
