From 426f08f9d2d2e76b83f024721f49e61b24dd425f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 15 Feb 2014 23:22:02 -0500 Subject: fix for missing angle quote form --- corpus/support/quote-norm.pl | 2 ++ 1 file changed, 2 insertions(+) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index f677df66..7bdcee67 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -93,6 +93,8 @@ while() { s/\x{ab}/\"/g; # opening guillemet s/\x{bb}/\"/g; # closing guillemet s/\x{0301}/'/g; # combining acute accent + s/\x{203a}/\"/g; # angle quotation mark + s/\x{2039}/\"/g; # angle quotation mark # Space inverted punctuation: s/¡/ ¡ /g; -- cgit v1.2.3 From e84a9a146495ea3d42f555dac875ff9f74ad4c08 Mon Sep 17 00:00:00 2001 From: armatthews Date: Thu, 20 Feb 2014 22:21:49 -0500 Subject: slight beautification and more sane ordering --- corpus/support/quote-norm.pl | 49 +++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index f677df66..bed0844e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -8,20 +8,8 @@ while() { chomp; $_ = " $_ "; - # Regularlize spaces: - s/\x{a0}/ /g; # non-breaking space - s/\x{2009}/ /g; # thin space - s/\x{2028}/ /g; # "line separator" - s/\x{2029}/ /g; # "paragraph separator" - s/\x{202a}/ /g; # "left-to-right embedding" - s/\x{202b}/ /g; # "right-to-left embedding" - s/\x{202c}/ /g; # "pop directional formatting" - s/\x{202d}/ /g; # "left-to-right override" - s/\x{202e}/ /g; # "right-to-left override" - s/\x{85}/ /g; # "next line" - s/\x{fffd}/ /g; # "replacement character" - s/\x{feff}/ /g; # byte-order mark - s/\x{fdd3}/ /g; # "unicode non-character" + # Delete control characters: + s/[\x{00}-\x{1f}]//g; # Regularize named HTML/XML escapes: s/&\s*lt\s*;/) { s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge; s/&\#([0-9]+);/pack("U", $1)/ge; + # Regularlize spaces: + s/\x{a0}/ /g; # non-breaking space + s/\x{2009}/ /g; # thin space + s/\x{2028}/ /g; # "line separator" + s/\x{2029}/ /g; # "paragraph separator" + s/\x{202a}/ /g; # "left-to-right embedding" + s/\x{202b}/ /g; # "right-to-left embedding" + s/\x{202c}/ /g; # "pop directional formatting" + s/\x{202d}/ /g; # "left-to-right override" + s/\x{202e}/ /g; # "right-to-left override" + s/\x{85}/ /g; # "next line" + s/\x{fffd}/ /g; # "replacement character" + s/\x{feff}/ /g; # byte-order mark + s/\x{fdd3}/ /g; # "unicode non-character" + # Convert other Windows 1252 characters to UTF-8 s/\x{80}/\x{20ac}/g; # euro sign s/\x{95}/\x{2022}/g; # bullet @@ -53,7 +56,7 @@ while() { s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; - # Ridiculous double conversions(?) (news commentary and Giga-FrEn): + # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8: s/�c/--/g; # long dash s/\x{e2}\x{20ac}oe/\"/g; # opening double quote s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote @@ -63,6 +66,19 @@ while() { s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash? s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote? s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote? + s/\x{c3}\x{9f}/\x{df}/g; # esset + s/\x{c3}\x{0178}/\x{df}/g; # esset + s/\x{c3}\x{a4}/\x{e4}/g; # a umlaut + s/\x{c3}\x{b6}/\x{f6}/g; # o umlaut + s/\x{c3}\x{bc}/\x{fc}/g; # u umlaut + s/\x{c3}\x{84}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{201e}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{96}/\x{d6}/g; # O umlaut + s/\x{c3}\x{2013}/\x{d6}/g; # O umlaut + s/\x{c3}\x{bc}/\x{dc}/g; # U umlaut + s/\x{80}/\x{20ac}/g; # euro sign + s/\x{95}/\x{2022}/g; # bullet + s/\x{99}/\x{2122}/g; # trademark sign # Regularize quotes: s/ˇ/'/g; # caron @@ -130,6 +146,7 @@ while() { s/–/--/g; s/─/--/g; s/—/--/g; + s/\x{97}/--/g; s/•/ * /g; s/\*/ * /g; s/،/,/g; @@ -158,8 +175,6 @@ while() { s/^\s+//; s/\s+$//; - # Delete control characters: - s/[\x{00}-\x{1f}]//g; print "$_\n"; } -- cgit v1.2.3 From ed56625e5edeadbe9297680b07e269c42b7ea420 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 27 Feb 2014 19:45:08 -0500 Subject: ptb to normal --- corpus/support/quote-norm.pl | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 1d9bb96f..33604027 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -11,6 +11,15 @@ while() { # Delete control characters: s/[\x{00}-\x{1f}]//g; + # PTB --> normal + s/-LRB-/(/g; + s/-RRB-/)/g; + s/-LSB-/[/g; + s/-RSB-/]/g; + s/-LCB-/{/g; + s/-RCB-/}/g; + s/ gon na / gonna /g; + # Regularize named HTML/XML escapes: s/&\s*lt\s*;//gi; # HTML closing angle bracket -- cgit v1.2.3