diff options
| author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2015-05-21 21:39:08 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2015-05-21 21:39:08 -0400 | 
| commit | cf61f8bc83ef280292a141ac9f8148eaf28596cc (patch) | |
| tree | b84754b6fd17404fc51d21663ab2fde6f5133c5a | |
| parent | d3c9c3620a12e59a362b892d726197118950b9d2 (diff) | |
deal with curly quotes
| -rwxr-xr-x | corpus/support/tokenizer.pl | 3 | 
1 files changed, 3 insertions, 0 deletions
```diff
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 718d78cc..6cc9f4e1 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -388,15 +388,18 @@ sub deep_proc_token {
     ##### step 1: separate by punct T2 on the boundary
     my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○';
     if($line =~ s/^(($t2)+)/$1 /){
+        $line =~ s/"/“/;
 	return proc_line($line);
     }
 
     if($line =~ s/(($t2)+)$/ $1/){
+        $line =~ s/"/”/;
 	return proc_line($line);
     }	
 
     ## step 2: separate by punct T2 in any position
     if($line =~ s/(($t2)+)/ $1 /g){
+        $line =~ s/"/”/g;  # probably before punctuation char
 	return proc_line($line);
     }
 
```
