From 8d20b30a68b17c64d231ab7182efba4145774d69 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 20 Jan 2014 22:19:15 -0500 Subject: deal with acronyms in hindi --- corpus/support/tokenizer.pl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'corpus/support') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 475b9a1c..7771201f 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -666,10 +666,10 @@ sub deep_proc_token { return $line; } - if($line =~ /^(([a-z]\.)+)(\.*)$/i){ + if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){ ## I.B.M. - my $t1 = $1; - my $t3 = $3; + my $t1 = $1 . $5; + my $t3 = $6; return $t1 . " ". proc_token($t3); } @@ -703,10 +703,3 @@ sub deep_proc_token { return $line; } - - - - - - - -- cgit v1.2.3