diff options
author | Chris Dyer <cdyer@Chriss-MacBook-Air.local> | 2013-04-19 17:06:35 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@Chriss-MacBook-Air.local> | 2013-04-19 17:06:35 -0400 |
commit | f729d92ab891a714ced95bd0d4cd8e5d8470a52d (patch) | |
tree | 834deff8895c8ba8a4f17f4cee7f32aa206c42fc | |
parent | 5555d37ada60ea64ce7d8f25e40827a79ef003d8 (diff) |
hindi
-rwxr-xr-x | corpus/support/tokenizer.pl | 8 |
1 files changed, 4 insertions, 4 deletions
```diff
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 0350a894..acc537fb 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -226,7 +226,7 @@ sub proc_token {
     }

     ## step 1: check the most common case
-    if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
+    if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){
         ### most common cases
         return $token;
     }
@@ -246,7 +246,7 @@ sub proc_token {
         ## number
         return $token;
     }
-    if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+.*$/){
+    if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+.*$/){
         ## twitter hashtag or address
         return proc_rightpunc($token);
     }
@@ -277,7 +277,7 @@ sub proc_token {
     }

     #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]';
-    my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]';
+    my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]';
     if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){
         ### /nls/p/....
         return $token;
@@ -361,7 +361,7 @@ sub deep_proc_token {
     }

     ##### step 0: if it mades up of all puncts, remove one punct at a time.
-    if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}a-zA-Z\d]/){
+    if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){
         if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){
             ## ++ @@@@ !!! ....
             return $line;
```