From 735ddbdb08ca0b1fce61ab08d8d2976bceee9829 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Mon, 23 Nov 2015 10:37:32 +0100
Subject: fixed/tweaked detokenization and truecasing
---
TODO | 1 -
external/detokenizer.perl | 3 ++-
external/truecase.perl | 3 ++-
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/TODO b/TODO
index 591c3ef..e69de29 100644
--- a/TODO
+++ b/TODO
@@ -1 +0,0 @@
-fix upper case if first thing is a comma or .
diff --git a/external/detokenizer.perl b/external/detokenizer.perl
index a8de7e8..dc12609 100755
--- a/external/detokenizer.perl
+++ b/external/detokenizer.perl
@@ -192,7 +192,8 @@ sub detokenize {
#add trailing break
$text .= "\n" unless $text =~ /\n$/;
- $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+ #$text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+ $text =~ s/^([\.:\?\!;\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
return $text;
}
diff --git a/external/truecase.perl b/external/truecase.perl
index 0a4d366..b724510 100755
--- a/external/truecase.perl
+++ b/external/truecase.perl
@@ -28,7 +28,8 @@ while(