summary | refs | log | tree | commit | diff
path: root/corpus/support
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-03-16 17:48:48 +0100
committer: Patrick Simianer <p@simianer.de> 2014-03-16 17:48:48 +0100
commit: 62bd9a4bdcea606d6ff2031fa4b207ef20caac31 (patch)
tree: 5a97415cff8287398becc602a1ca16c937a43253 /corpus/support
parent: 7112976f89f0082f7af48829dd5deee61a3e6d16 (diff)
parent: 6c04595f968f7a9c047c5941113752a7c7280b45 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus/support')
-rwxr-xr-x corpus/support/fix-eos.pl 2
-rwxr-xr-x corpus/support/quote-norm.pl 1
-rw-r--r-- corpus/support/token_list 6
3 files changed, 7 insertions, 2 deletions
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
index 584f8b46..fe03727b 100755
--- a/corpus/support/fix-eos.pl
+++ b/corpus/support/fix-eos.pl
@@ -1,4 +1,6 @@
#!/usr/bin/perl -w
+$|++;
+
use strict;
use utf8;
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 33604027..0366fad5 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -39,6 +39,7 @@ while(<STDIN>) {
s/&\#([0-9]+);/pack("U", $1)/ge;
# Regularlize spaces:
+ s/\x{ad}//g; # soft hyphen
s/\x{a0}/ /g; # non-breaking space
s/\x{2009}/ /g; # thin space
s/\x{2028}/ /g; # "line separator"
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 228663f6..d38638cf 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -156,8 +156,9 @@ Mass.
Md.
Mfg.
Mgr.
-Mexican-U.S.
-Mich.
+Mio.
+Mrd.
+Bio.
Minn.
Mo.
Mon.
@@ -187,6 +188,7 @@ Rd.
Rev.
R.J.
C.L
+Rs.
Rte.
Sat.
W.T