From cf61f8bc83ef280292a141ac9f8148eaf28596cc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 21 May 2015 21:39:08 -0400 Subject: deal with curly quotes --- corpus/support/tokenizer.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 718d78cc..6cc9f4e1 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -388,15 +388,18 @@ sub deep_proc_token { ##### step 1: separate by punct T2 on the boundary my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○'; if($line =~ s/^(($t2)+)/$1 /){ + $line =~ s/"/“/; return proc_line($line); } if($line =~ s/(($t2)+)$/ $1/){ + $line =~ s/"/”/; return proc_line($line); } ## step 2: separate by punct T2 in any position if($line =~ s/(($t2)+)/ $1 /g){ + $line =~ s/"/”/g; # probably before punctuation char return proc_line($line); } -- cgit v1.2.3