summary | refs | log | tree | commit | diff
path: root/corpus/support
diff options
context:
space:
mode:
Diffstat (limited to 'corpus/support')
-rw-r--r-- corpus/support/token_list 49
-rw-r--r-- corpus/support/token_patterns 2
-rwxr-xr-x corpus/support/tokenizer.pl 8
3 files changed, 55 insertions, 4 deletions
diff --git a/corpus/support/token_list b/corpus/support/token_list
index d38638cf..00daa82b 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -1,6 +1,55 @@
##################### hyphenated words added by Fei since 3/7/05
##X-ray
+# Finnish
+eaa.
+ap.
+arv.
+ay.
+eKr.
+em.
+engl.
+esim.
+fil.
+lis.
+fil.
+maist.
+fil.toht.
+harv.
+ilt.
+jatk.
+jKr.
+jms.
+jne.
+joht.
+klo
+ko.
+ks.
+leht.
+lv.
+lyh.
+mm.
+mon.
+nim.
+nro.
+ns.
+nti.
+os.
+oy.
+pj.
+pnä.
+puh.
+pvm.
+rva.
+tms.
+ts.
+vars.
+vrt.
+ym.
+yms.
+yo.
+
+
# hindi abbreviation patterns
जन.
फर.
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index de64fb2a..12558cdd 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,4 +1,6 @@
/^(al|el|ul|e)\-[a-z]+$/
+/\.(fi|fr|es|co\.uk|de)$/
+/:[a-zä]+$/
/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/
/^\p{Cyrillic}\.\p{Cyrillic}\.$/
/^(\d|\d\d|\d\d\d)\.$/
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index aa285be4..718d78cc 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -415,7 +415,7 @@ sub deep_proc_token {
}
## remove the ending periods that follow number etc.
- if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){
+ if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|€|\-|\_|\/|\\|\$|\'))(\.+)$/){
## 12~13. => 12~13 .
my $t1 = $1;
my $t3 = $3;
@@ -600,12 +600,12 @@ sub deep_proc_token {
## deal with "%"
- if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){
+ if(($line =~ /\%|€/) && ($Split_On_PercentSign > 0)){
my $suc = 0;
if($Split_On_PercentSign >= 2){
- $suc += ($line =~ s/(\D)(\%+)/$1 $2/g);
+ $suc += ($line =~ s/(\D)(\%+|€+)/$1 $2/g);
}else{
- $suc += ($line =~ s/(\%+)/ $1 /g);
+ $suc += ($line =~ s/(\%+|€+)/ $1 /g);
}
if($suc){