From 9f57df1eb1346196b6ea755eab9fb81a8a57b915 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 15 Jan 2014 23:33:43 -0500 Subject: deal with hindi --- corpus/support/tokenizer.pl | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'corpus/support/tokenizer.pl') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e0df16a7..079f45b6 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -65,7 +65,7 @@ my $Split_AposD = 1; ## 'd ### some patterns -my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-'; +my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-'; #### step 1: read files @@ -112,7 +112,6 @@ my $new_token_total = 0; while(){ chomp(); - if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^){ my $orig_num = 0; my $deep_proc_num = 0; - + s/(\x{0964}+)/ $1/g; # Devanagari end of sentence my $new_line = proc_line($_, \$orig_num, \$deep_proc_num); $orig_token_total += $orig_num; @@ -148,7 +147,8 @@ while(){ $new_line =~ s/(set|src|tgt|trg)/ $1/g; } - print STDOUT " $new_line\n"; + chomp $new_line; + print STDOUT "$new_line\n"; } ######################################################################## @@ -228,6 +228,7 @@ sub proc_token { ## step 1: check the most common case if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){ + #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){ ### most common cases return $token; } @@ -363,7 +364,7 @@ sub deep_proc_token { ##### step 0: if it mades up of all puncts, remove one punct at a time. if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){ - if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){ ## ++ @@@@ !!! .... 
return $line; } @@ -454,7 +455,7 @@ sub deep_proc_token { ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't ## 'there => ' there '98 => the same - $suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi); + $suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi); ## note that \' and \. could interact: e.g., U.S.'s; 're. if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){ -- cgit v1.2.3 From 1915f029aa02f3528dafcd7b80a62a9c890b462b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 20 Jan 2014 00:11:27 -0500 Subject: hindi edits --- corpus/support/fix-eos.pl | 10 +++++++++ corpus/support/token_list | 51 +++++++++++++++++++++++++++++++++++++++++++++ corpus/support/tokenizer.pl | 1 + corpus/tokenize-anything.sh | 1 + 4 files changed, 63 insertions(+) create mode 100755 corpus/support/fix-eos.pl (limited to 'corpus/support/tokenizer.pl') diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl new file mode 100755 index 00000000..584f8b46 --- /dev/null +++ b/corpus/support/fix-eos.pl @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +while() { + s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s; + print; +} diff --git a/corpus/support/token_list b/corpus/support/token_list index 43dd80d9..0de72821 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -1,6 +1,57 @@ ##################### hyphenated words added by Fei since 3/7/05 ##X-ray +# hindi abbreviation patterns +डी.एल. +डी.टी.ओ. +डी.ए. +ए.एस.आई. +डी.टी.ओ. +एम.एस.आर.टी.सी. +बी.बी.एम.बी. +डी.एस.पी. +सी.आर.पी. +एस.डी.एम. +सी.डी.पी.ओ. +बी.डी.ओ. +एस.डी.ओ. +एम.पी.पी. +पी.एच.ई. +एस.एच.ओ. +ए.सी.पी. +यू.पी. +पी.एम. +आर.बी.डी. +वी.पी. +सी.ए.डी.पी. +ए. +बी. +सी. +डी. +ई. +एफ. +जी. +एच. +आई. +जे. +के. +एल. +एम. +एन. +ओ. +पी. +क़यू. +आर. +एस. +टी. +यू. +वी. +डबल्यू. +एक्स. +वाई. +ज़ेड. +ज़ी. 
+ ##################### words made of punct only :- :-) diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 079f45b6..475b9a1c 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -112,6 +112,7 @@ my $new_token_total = 0; while(){ chomp(); + s/\x{0970}/./g; # dev abbreviation character if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^ Date: Mon, 20 Jan 2014 22:19:15 -0500 Subject: deal with acronyms in hindi --- corpus/support/tokenizer.pl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'corpus/support/tokenizer.pl') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 475b9a1c..7771201f 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -666,10 +666,10 @@ sub deep_proc_token { return $line; } - if($line =~ /^(([a-z]\.)+)(\.*)$/i){ + if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){ ## I.B.M. - my $t1 = $1; - my $t3 = $3; + my $t1 = $1 . $5; + my $t3 = $6; return $t1 . " ". proc_token($t3); } @@ -703,10 +703,3 @@ sub deep_proc_token { return $line; } - - - - - - - -- cgit v1.2.3