diff options
author | Patrick Simianer <p@simianer.de> | 2014-03-16 17:48:48 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-03-16 17:48:48 +0100 |
commit | 62bd9a4bdcea606d6ff2031fa4b207ef20caac31 (patch) | |
tree | 5a97415cff8287398becc602a1ca16c937a43253 /corpus/support | |
parent | 7112976f89f0082f7af48829dd5deee61a3e6d16 (diff) | |
parent | 6c04595f968f7a9c047c5941113752a7c7280b45 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus/support')
-rwxr-xr-x | corpus/support/fix-eos.pl | 2 | ||||
-rwxr-xr-x | corpus/support/quote-norm.pl | 1 | ||||
-rw-r--r-- | corpus/support/token_list | 6 |
3 files changed, 7 insertions, 2 deletions
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
index 584f8b46..fe03727b 100755
--- a/corpus/support/fix-eos.pl
+++ b/corpus/support/fix-eos.pl
@@ -1,4 +1,6 @@
 #!/usr/bin/perl -w
+$|++;
+
 use strict;
 use utf8;
 
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 33604027..0366fad5 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -39,6 +39,7 @@ while(<STDIN>) {
 s/&\#([0-9]+);/pack("U", $1)/ge;
 
 # Regularlize spaces:
+s/\x{ad}//g; # soft hyphen
 s/\x{a0}/ /g; # non-breaking space
 s/\x{2009}/ /g; # thin space
 s/\x{2028}/ /g; # "line separator"
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 228663f6..d38638cf 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -156,8 +156,9 @@ Mass.
 Md.
 Mfg.
 Mgr.
-Mexican-U.S.
-Mich.
+Mio.
+Mrd.
+Bio.
 Minn.
 Mo.
 Mon.
@@ -187,6 +188,7 @@ Rd.
 Rev.
 R.J.
 C.L
+Rs.
 Rte.
 Sat.
 W.T