From 9f57df1eb1346196b6ea755eab9fb81a8a57b915 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 15 Jan 2014 23:33:43 -0500 Subject: deal with hindi --- corpus/support/tokenizer.pl | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'corpus/support/tokenizer.pl') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e0df16a7..079f45b6 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -65,7 +65,7 @@ my $Split_AposD = 1; ## 'd ### some patterns -my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-'; +my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-'; #### step 1: read files @@ -112,7 +112,6 @@ my $new_token_total = 0; while(){ chomp(); - if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^){ my $orig_num = 0; my $deep_proc_num = 0; - + s/(\x{0964}+)/ $1/g; # Devanagari end of sentence my $new_line = proc_line($_, \$orig_num, \$deep_proc_num); $orig_token_total += $orig_num; @@ -148,7 +147,8 @@ while(){ $new_line =~ s/(set|src|tgt|trg)/ $1/g; } - print STDOUT " $new_line\n"; + chomp $new_line; + print STDOUT "$new_line\n"; } ######################################################################## @@ -228,6 +228,7 @@ sub proc_token { ## step 1: check the most common case if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){ + #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){ ### most common cases return $token; } @@ -363,7 +364,7 @@ sub deep_proc_token { ##### step 0: if it mades up of all puncts, remove one punct at a time. if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){ - if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){ ## ++ @@@@ !!! .... 
return $line; } @@ -454,7 +455,7 @@ sub deep_proc_token { ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't ## 'there => ' there '98 => the same - $suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi); + $suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi); ## note that \' and \. could interact: e.g., U.S.'s; 're. if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){ -- cgit v1.2.3 From 1915f029aa02f3528dafcd7b80a62a9c890b462b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 20 Jan 2014 00:11:27 -0500 Subject: hindi edits --- corpus/support/fix-eos.pl | 10 +++++++++ corpus/support/token_list | 51 +++++++++++++++++++++++++++++++++++++++++++++ corpus/support/tokenizer.pl | 1 + corpus/tokenize-anything.sh | 1 + 4 files changed, 63 insertions(+) create mode 100755 corpus/support/fix-eos.pl (limited to 'corpus/support/tokenizer.pl') diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl new file mode 100755 index 00000000..584f8b46 --- /dev/null +++ b/corpus/support/fix-eos.pl @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +while() { + s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s; + print; +} diff --git a/corpus/support/token_list b/corpus/support/token_list index 43dd80d9..0de72821 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -1,6 +1,57 @@ ##################### hyphenated words added by Fei since 3/7/05 ##X-ray +# hindi abbreviation patterns +डी.एल. +डी.टी.ओ. +डी.ए. +ए.एस.आई. +डी.टी.ओ. +एम.एस.आर.टी.सी. +बी.बी.एम.बी. +डी.एस.पी. +सी.आर.पी. +एस.डी.एम. +सी.डी.पी.ओ. +बी.डी.ओ. +एस.डी.ओ. +एम.पी.पी. +पी.एच.ई. +एस.एच.ओ. +ए.सी.पी. +यू.पी. +पी.एम. +आर.बी.डी. +वी.पी. +सी.ए.डी.पी. +ए. +बी. +सी. +डी. +ई. +एफ. +जी. +एच. +आई. +जे. +के. +एल. +एम. +एन. +ओ. +पी. +क़यू. +आर. +एस. +टी. +यू. +वी. +डबल्यू. +एक्स. +वाई. +ज़ेड. +ज़ी. 
+ ##################### words made of punct only :- :-) diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 079f45b6..475b9a1c 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -112,6 +112,7 @@ my $new_token_total = 0; while(){ chomp(); + s/\x{0970}/./g; # dev abbreviation character if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^ Date: Mon, 20 Jan 2014 22:19:15 -0500 Subject: deal with acronyms in hindi --- corpus/support/tokenizer.pl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'corpus/support/tokenizer.pl') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 475b9a1c..7771201f 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -666,10 +666,10 @@ sub deep_proc_token { return $line; } - if($line =~ /^(([a-z]\.)+)(\.*)$/i){ + if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){ ## I.B.M. - my $t1 = $1; - my $t3 = $3; + my $t1 = $1 . $5; + my $t3 = $6; return $t1 . " ". proc_token($t3); } @@ -703,10 +703,3 @@ sub deep_proc_token { return $line; } - - - - - - - -- cgit v1.2.3