From 284383880f043edb2d67afbe2f64237c466245c1 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 10 Mar 2014 18:40:13 -0400 Subject: few tokenization bugs --- corpus/support/quote-norm.pl | 1 + corpus/support/token_list | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 33604027..0366fad5 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -39,6 +39,7 @@ while() { s/&\#([0-9]+);/pack("U", $1)/ge; # Regularlize spaces: + s/\x{ad}//g; # soft hyphen s/\x{a0}/ /g; # non-breaking space s/\x{2009}/ /g; # thin space s/\x{2028}/ /g; # "line separator" diff --git a/corpus/support/token_list b/corpus/support/token_list index 228663f6..d38638cf 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -156,8 +156,9 @@ Mass. Md. Mfg. Mgr. -Mexican-U.S. -Mich. +Mio. +Mrd. +Bio. Minn. Mo. Mon. @@ -187,6 +188,7 @@ Rd. Rev. R.J. C.L +Rs. Rte. Sat. W.T -- cgit v1.2.3