summary | refs | log | tree | commit | diff
path: root/corpus/support/quote-norm.pl
diff options
context:
space:
mode:
author: mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-11 15:47:04 -0400
committer: mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-11 15:47:04 -0400
commit: 48b44d3eeeb997abccda12149af584b196698316 (patch)
tree: 43b654e42749ba767d882ab77dbeef96107e1beb /corpus/support/quote-norm.pl
parent: ba0a7d0cd688ee136c7c7b7776e68cb9603585b2 (diff)
parent: de8ffd4598d6c1e45273b50642870a661b4bcad4 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'corpus/support/quote-norm.pl')
-rwxr-xr-x corpus/support/quote-norm.pl 1
1 files changed, 1 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 33604027..0366fad5 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -39,6 +39,7 @@ while(<STDIN>) {
s/&\#([0-9]+);/pack("U", $1)/ge;
# Regularlize spaces:
+ s/\x{ad}//g; # soft hyphen
s/\x{a0}/ /g; # non-breaking space
s/\x{2009}/ /g; # thin space
s/\x{2028}/ /g; # "line separator"