From 284383880f043edb2d67afbe2f64237c466245c1 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 10 Mar 2014 18:40:13 -0400
Subject: few tokenization bugs

---
 corpus/support/quote-norm.pl | 1 +
 1 file changed, 1 insertion(+)

(limited to 'corpus/support/quote-norm.pl')

diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 33604027..0366fad5 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -39,6 +39,7 @@ while() {
   s/&\#([0-9]+);/pack("U", $1)/ge;
 
   # Regularlize spaces:
+  s/\x{ad}//g; # soft hyphen
   s/\x{a0}/ /g; # non-breaking space
   s/\x{2009}/ /g; # thin space
   s/\x{2028}/ /g; # "line separator"
-- 
cgit v1.2.3