summary | refs | log | tree | commit | diff
path: root/corpus/support/tokenizer.pl
diff options
context:
space:
mode:
author: Chris Dyer <redpony@gmail.com> 2014-01-20 22:19:15 -0500
committer: Chris Dyer <redpony@gmail.com> 2014-01-20 22:19:15 -0500
commit: 8d20b30a68b17c64d231ab7182efba4145774d69 (patch)
tree: b42d4d1334436dbcf8feb53b3639fa61bbd69817 /corpus/support/tokenizer.pl
parent: e1cadaf9cdf9bfcd079133d0b97222c7f0963246 (diff)
deal with acronyms in hindi
Diffstat (limited to 'corpus/support/tokenizer.pl')
-rwxr-xr-x corpus/support/tokenizer.pl 13
1 files changed, 3 insertions, 10 deletions
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 475b9a1c..7771201f 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -666,10 +666,10 @@ sub deep_proc_token {
return $line;
}
- if($line =~ /^(([a-z]\.)+)(\.*)$/i){
+ if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){
## I.B.M.
- my $t1 = $1;
- my $t3 = $3;
+ my $t1 = $1 . $5;
+ my $t3 = $6;
return $t1 . " ". proc_token($t3);
}
@@ -703,10 +703,3 @@ sub deep_proc_token {
return $line;
}
-
-
-
-
-
-
-