summary | refs | log | tree | commit | diff
path: root/corpus/support/tokenizer.pl
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2013-05-02 09:09:59 +0200
committer Patrick Simianer <p@simianer.de> 2013-05-02 09:09:59 +0200
commit 0ce66778da6079506896739e9d97dc7dff83cd72 (patch)
tree f435457bb23dab0c566c9896f9d38cece9d15885 /corpus/support/tokenizer.pl
parent b6754386f1109b960b05cdf2eabbc97bdd38e8df (diff)
parent b7ea2615bc9bb69031ff714ddce1539c9f1bda2d (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus/support/tokenizer.pl')
-rwxr-xr-x corpus/support/tokenizer.pl | 8
1 files changed, 4 insertions, 4 deletions
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 0350a894..acc537fb 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -226,7 +226,7 @@ sub proc_token {
}
## step 1: check the most common case
- if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
+ if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){
### most common cases
return $token;
}
@@ -246,7 +246,7 @@ sub proc_token {
## number
return $token;
}
- if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+.*$/){
+ if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+.*$/){
## twitter hashtag or address
return proc_rightpunc($token);
}
@@ -277,7 +277,7 @@ sub proc_token {
}
#my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]';
- my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]';
+ my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]';
if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){
### /nls/p/....
return $token;
@@ -361,7 +361,7 @@ sub deep_proc_token {
}
##### step 0: if it is made up entirely of punctuation, remove one punct at a time.
- if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}a-zA-Z\d]/){
+ if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){
if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){
## ++ @@@@ !!! ....
return $line;