summary | refs | log | tree | commit | diff
path: root/corpus/support/quote-norm.pl
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-03-16 17:48:48 +0100
committer: Patrick Simianer <p@simianer.de> 2014-03-16 17:48:48 +0100
commit: 5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (patch)
tree: f1401c1fd3eeae8671e59baf0d2169d1eb721cb7 /corpus/support/quote-norm.pl
parent: 3eedf96b5a08b3e3414888d328c505814b84d8db (diff)
parent: cc87bfed0697583b7c11243913254dde3c0047d4 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus/support/quote-norm.pl')
-rwxr-xr-x corpus/support/quote-norm.pl 1
1 files changed, 1 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 33604027..0366fad5 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -39,6 +39,7 @@ while(<STDIN>) {
s/&\#([0-9]+);/pack("U", $1)/ge;
# Regularize spaces:
+ s/\x{ad}//g; # soft hyphen
s/\x{a0}/ /g; # non-breaking space
s/\x{2009}/ /g; # thin space
s/\x{2028}/ /g; # "line separator"