summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2015-05-21 21:39:08 -0400
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2015-05-21 21:39:08 -0400
commit: cf61f8bc83ef280292a141ac9f8148eaf28596cc (patch)
tree: b84754b6fd17404fc51d21663ab2fde6f5133c5a /corpus
parent: d3c9c3620a12e59a362b892d726197118950b9d2 (diff)
deal with curly quotes
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/support/tokenizer.pl 3
1 files changed, 3 insertions, 0 deletions
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 718d78cc..6cc9f4e1 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -388,15 +388,18 @@ sub deep_proc_token {
##### step 1: separate by punct T2 on the boundary
my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○';
if($line =~ s/^(($t2)+)/$1 /){
+ $line =~ s/"/“/;
return proc_line($line);
}
if($line =~ s/(($t2)+)$/ $1/){
+ $line =~ s/"/”/;
return proc_line($line);
}
## step 2: separate by punct T2 in any position
if($line =~ s/(($t2)+)/ $1 /g){
+ $line =~ s/"/”/g; # probably before punctuation char
return proc_line($line);
}