From 735ddbdb08ca0b1fce61ab08d8d2976bceee9829 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Mon, 23 Nov 2015 10:37:32 +0100
Subject: fixed/tweaked tokenization truecasing

---
 TODO                      | 1 -
 external/detokenizer.perl | 3 ++-
 external/truecase.perl    | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/TODO b/TODO
index 591c3ef..e69de29 100644
--- a/TODO
+++ b/TODO
@@ -1 +0,0 @@
-fix upper case if first thing is a comma or .
diff --git a/external/detokenizer.perl b/external/detokenizer.perl
index a8de7e8..dc12609 100755
--- a/external/detokenizer.perl
+++ b/external/detokenizer.perl
@@ -192,7 +192,8 @@ sub detokenize {
     #add trailing break
     $text .= "\n" unless $text =~ /\n$/;
 
-    $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+    #$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+    $text =~ s/^([\.:\?\!;\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
 
     return $text;
 }
diff --git a/external/truecase.perl b/external/truecase.perl
index 0a4d366..b724510 100755
--- a/external/truecase.perl
+++ b/external/truecase.perl
@@ -28,7 +28,8 @@ while(<MODEL>) {
 close(MODEL);
 
 my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+#my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+my %DELAYED_SENTENCE_START = ("\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1);
 
 while(<STDIN>) {
   chop;
-- 
cgit v1.2.3