diff options
author | Patrick Simianer <p@simianer.de> | 2015-11-23 10:37:32 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-11-23 10:37:32 +0100 |
commit | 735ddbdb08ca0b1fce61ab08d8d2976bceee9829 (patch) | |
tree | b7fabdb873b8242c6c3f4616aca5b5f51724021a | |
parent | 625cdbab9f1c7d6e4851d270242ee2ee6b895b8d (diff) |
fixed/tweaked tokenization truecasing
-rw-r--r-- | TODO | 1 | ||||
-rwxr-xr-x | external/detokenizer.perl | 3 | ||||
-rwxr-xr-x | external/truecase.perl | 3 |
3 files changed, 4 insertions, 3 deletions
@@ -1 +0,0 @@
-fix upper case if first thing is a comma or .
diff --git a/external/detokenizer.perl b/external/detokenizer.perl
index a8de7e8..dc12609 100755
--- a/external/detokenizer.perl
+++ b/external/detokenizer.perl
@@ -192,7 +192,8 @@ sub detokenize {
   #add trailing break
   $text .= "\n" unless $text =~ /\n$/;
 
-  $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+  #$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+  $text =~ s/^([\.:\?\!;\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
 
   return $text;
 }
diff --git a/external/truecase.perl b/external/truecase.perl
index 0a4d366..b724510 100755
--- a/external/truecase.perl
+++ b/external/truecase.perl
@@ -28,7 +28,8 @@ while(<MODEL>) {
 close(MODEL);
 
 my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+#my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+my %DELAYED_SENTENCE_START = ("\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1);
 
 while(<STDIN>) {
   chop;