summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2015-11-23 10:37:32 +0100
committer: Patrick Simianer <p@simianer.de> 2015-11-23 10:37:32 +0100
commit: 735ddbdb08ca0b1fce61ab08d8d2976bceee9829 (patch)
tree: b7fabdb873b8242c6c3f4616aca5b5f51724021a
parent: 625cdbab9f1c7d6e4851d270242ee2ee6b895b8d (diff)
fixed/tweaked tokenization truecasing
-rw-r--r-- TODO 1
-rwxr-xr-x external/detokenizer.perl 3
-rwxr-xr-x external/truecase.perl 3
3 files changed, 4 insertions, 3 deletions
diff --git a/TODO b/TODO
index 591c3ef..e69de29 100644
--- a/TODO
+++ b/TODO
@@ -1 +0,0 @@
-fix upper case if first thing is a comma or .
diff --git a/external/detokenizer.perl b/external/detokenizer.perl
index a8de7e8..dc12609 100755
--- a/external/detokenizer.perl
+++ b/external/detokenizer.perl
@@ -192,7 +192,8 @@ sub detokenize {
#add trailing break
$text .= "\n" unless $text =~ /\n$/;
- $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+ #$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+ $text =~ s/^([\.:\?\!;\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
return $text;
}
diff --git a/external/truecase.perl b/external/truecase.perl
index 0a4d366..b724510 100755
--- a/external/truecase.perl
+++ b/external/truecase.perl
@@ -28,7 +28,8 @@ while(<MODEL>) {
close(MODEL);
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+#my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+my %DELAYED_SENTENCE_START = ("\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1);
while(<STDIN>) {
chop;