summary | refs | log | tree | commit | diff
path: root/corpus/support
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2014-06-03 16:58:29 -0400
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2014-06-03 16:58:29 -0400
commit: b66e838ed52decc0be1eb5817b2a77c3840db2c5 (patch)
tree: 5e3646d827d0932399d0930e9c65ae572f16c662 /corpus/support
parent: dc372570c906d1b7d4c856132f8be925fd7ba8b0 (diff)
fix for nonjoining chars
Diffstat (limited to 'corpus/support')
-rwxr-xr-x corpus/support/quote-norm.pl 1
1 files changed, 1 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0366fad5..3eee0666 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -40,6 +40,7 @@ while(<STDIN>) {
# Regularlize spaces:
s/\x{ad}//g; # soft hyphen
+ s/\x{200C}//g; # zero-width non-joiner
s/\x{a0}/ /g; # non-breaking space
s/\x{2009}/ /g; # thin space
s/\x{2028}/ /g; # "line separator"