Merge branch 'master' of https://github.com/redpony/cdec

author: Chris Dyer <redpony@gmail.com> 2014-01-17 04:07:22 -0500
committer: Chris Dyer <redpony@gmail.com> 2014-01-17 04:07:22 -0500
commit: 1bc1a92c0f72fe4266182f9cb467b75e670a1dac (patch)
tree: 46b0b6a278c82252aecccca4e8a2b5ecbb5bf728
parent: 8bce53d2a076ef76f3fea34b2bddb0a713ec4ea8 (diff)
parent: c13f5b9f6b5e80cfe7719b25f90d8d73d5ef3098 (diff)
2 files changed, 12 insertions, 6 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 57f4ad77..7fe33db4 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -61,6 +61,11 @@ while(<STDIN>) {
   s/«/"/g;
   s/»/"/g;
   tr/！-～/!-~/;
+  tr/०-९/0-9/; # devanagari digits (U+0966-U+096F)
+  tr/౦-౯/0-9/; # telugu digits (U+0C66-U+0C6F)
+  tr/೦-೯/0-9/; # kannada digits (U+0CE6-U+0CEF)
+  tr/௦-௯/0-9/; # tamil digits (U+0BE6-U+0BEF); range must start at the Tamil zero, not the Kannada one
+  tr/൦-൯/0-9/; # malayalam digits (U+0D66-U+0D6F)
   s/、/,/g;
   # s/。/./g;
   s/…/.../g;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index e0df16a7..079f45b6 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -65,7 +65,7 @@ my $Split_AposD  = 1;  ## 'd
 
 
 ### some patterns
-my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-';
+my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-';  # alternation of right-attaching punctuation; \x{0964} is the Devanagari danda (single-quoted, so presumably expanded only when interpolated into a regex -- verify at use sites)
 
 #### step 1: read files
 
@@ -112,7 +112,6 @@ my $new_token_total = 0;
 
 while(<STDIN>){
     chomp();
-
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
 	## markup
 	print STDOUT "$_\n";
@@ -121,7 +120,7 @@ while(<STDIN>){
 
     my $orig_num = 0;
     my $deep_proc_num = 0;
-
+    s/(\x{0964}+)/ $1/g;  # Devanagari danda (U+0964, end-of-sentence mark): insert a space before each run of dandas
     my $new_line = proc_line($_, \$orig_num, \$deep_proc_num);
 
     $orig_token_total += $orig_num;
@@ -148,7 +147,8 @@ while(<STDIN>){
 	$new_line =~ s/(set|src|tgt|trg)/ $1/g;
     }
 
-    print STDOUT " $new_line\n";
+    chomp $new_line;
+    print STDOUT "$new_line\n";
 }
 
 ########################################################################
@@ -228,6 +228,7 @@ sub proc_token {
 
     ## step 1: check the most common case
     if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){
+    #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
 	### most common cases
 	return $token;
     }
@@ -363,7 +364,7 @@ sub deep_proc_token {
 
     ##### step 0: if it mades up of all puncts, remove one punct at a time.
     if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){
-	if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){
+	if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){
 	    ## ++ @@@@ !!! ....
 	    return $line;
 	}
@@ -454,7 +455,7 @@ sub deep_proc_token {
 	### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't
 
 	##  'there => ' there   '98 => the same
-	$suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi);
+	$suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi);
 	
 	##  note that \' and \. could interact: e.g.,  U.S.'s;   're.
 	if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){
author	Chris Dyer <redpony@gmail.com>	2014-01-17 04:07:22 -0500
committer	Chris Dyer <redpony@gmail.com>	2014-01-17 04:07:22 -0500
commit	1bc1a92c0f72fe4266182f9cb467b75e670a1dac (patch)
tree	46b0b6a278c82252aecccca4e8a2b5ecbb5bf728
parent	8bce53d2a076ef76f3fea34b2bddb0a713ec4ea8 (diff)
parent	c13f5b9f6b5e80cfe7719b25f90d8d73d5ef3098 (diff)