summary | refs | log | tree | commit | diff
path: root/external
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-11-23 10:37:32 +0100
committer Patrick Simianer <p@simianer.de> 2015-11-23 10:37:32 +0100
commit 735ddbdb08ca0b1fce61ab08d8d2976bceee9829 (patch)
tree b7fabdb873b8242c6c3f4616aca5b5f51724021a /external
parent 625cdbab9f1c7d6e4851d270242ee2ee6b895b8d (diff)
fixed/tweaked tokenization truecasing
Diffstat (limited to 'external')
-rwxr-xr-x external/detokenizer.perl 3
-rwxr-xr-x external/truecase.perl 3
2 files changed, 4 insertions, 2 deletions
diff --git a/external/detokenizer.perl b/external/detokenizer.perl
index a8de7e8..dc12609 100755
--- a/external/detokenizer.perl
+++ b/external/detokenizer.perl
@@ -192,7 +192,8 @@ sub detokenize {
#add trailing break
$text .= "\n" unless $text =~ /\n$/;
- $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+ #$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+ $text =~ s/^([\.:\?\!;\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
return $text;
}
diff --git a/external/truecase.perl b/external/truecase.perl
index 0a4d366..b724510 100755
--- a/external/truecase.perl
+++ b/external/truecase.perl
@@ -28,7 +28,8 @@ while(<MODEL>) {
close(MODEL);
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+#my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+my %DELAYED_SENTENCE_START = ("\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1);
while(<STDIN>) {
chop;