Merge branch 'master' of https://github.com/redpony/cdec

author: Paul Baltescu <pauldb89@gmail.com> 2013-04-24 17:18:10 +0100
committer: Paul Baltescu <pauldb89@gmail.com> 2013-04-24 17:18:10 +0100
commit: ba206aaac1d95e76126443c9e7ccc5941e879849 (patch)
tree: 13a918da3f3983fd8e4cb74e7cdc3f5e1fc01cd1 /corpus
parent: c2aede0f19b7a5e43581768b8c4fbfae8b92c68c (diff)
parent: db960a8bba81df3217660ec5a96d73e0d6baa01b (diff)
6 files changed, 164 insertions, 6 deletions
diff --git a/corpus/lowercase.pl b/corpus/lowercase.pl
index 688e493b..9fd91dac 100755
--- a/corpus/lowercase.pl
+++ b/corpus/lowercase.pl
@@ -2,7 +2,7 @@
 use strict;
 binmode(STDIN,":utf8");
 binmode(STDOUT,":utf8");
-while(<>) {
+while(<STDIN>) {
   $_ = lc $_;
   print;
 }
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 4cb424ad..ef2cd937 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -34,6 +34,7 @@ while(1) {
       $done = 1;
       last;
     }
+    $r =~ s/\r//g;
     chomp $r;
     if ($r =~ /\|\|\|/) {
       $r = '';
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index e4e5055e..b104e73c 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -11,6 +11,8 @@ while(<STDIN>) {
   s/&\s*squot\s*;/'/gi;
   s/&\s*quot\s*;/"/gi;
   s/&\s*amp\s*;/&/gi;
+  s/&\s*nbsp\s*;/ /gi;  # &nbsp; is a no-break space: map it to a plain space, like &#160; on the next line
+  s/&\s*#\s*160\s*;/ /gi;
   s/ (\d\d): (\d\d)/ $1:$2/g;
   s/[\x{20a0}]\x{20ac}]/ EUR /g;
   s/[\x{00A3}]/ GBP /g;
@@ -20,6 +22,7 @@ while(<STDIN>) {
   s/&\s*#45\s*;/--/g;
   s/ï¿½c/--/g;
   s/ ,,/ "/g;
+  s/„/"/g;
   s/``/"/g;
   s/''/"/g;
   s/[「」]/"/g;
diff --git a/corpus/support/token_list b/corpus/support/token_list
index d470cb22..43dd80d9 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -37,6 +37,8 @@ tel.
 10.
 
 ##################### abbreviation: words that contain period.
+EE.UU.
+ee.uu.
 U.A.E
 Ala.
 Ph.D.
@@ -292,3 +294,155 @@ Z.
 т.н.
 т.ч.
 н.э.
+# Swahili
+A.D.
+Afr.
+A.G.
+agh.
+A.H.
+A.M.
+a.s.
+B.A.
+B.C.
+Bi.
+B.J.
+B.K.
+B.O.M.
+Brig.
+Bro.
+bt.
+bw.
+Bw.
+Cap.
+C.C.
+cCM.
+C.I.A.
+cit.
+C.M.S.
+Co.
+Corp.
+C.S.Sp.
+C.W.
+D.C.
+Dk.
+Dkt.
+Dk.B.
+Dr.
+E.C.
+e.g.
+E.M.
+E.n.
+etc.
+Feb.
+F.F.U.
+F.M.
+Fr.
+F.W.
+I.C.O.
+i.e.
+I.L.C.
+Inc.
+Jan.
+J.F.
+Jr.
+J.S.
+J.V.W.A.
+K.A.R.
+K.A.U.
+K.C.M.C.
+K.k.
+K.K.
+k.m.
+km.
+K.m.
+K.N.C.U.
+K.O.
+K.S.
+Ksh.
+kt.
+kumb.
+k.v.
+kv.
+L.G.
+ltd.
+Ltd.
+M.A.
+M.D.
+mf.
+Mh.
+Mhe.
+mil.
+m.m.
+M.m.
+Mm.
+M.M.
+Mr.
+Mrs.
+M.S.
+Mt.
+Mw.
+M.W.
+Mwl.
+na.
+Na.
+N.F.
+N.J.
+n.k.
+nk.
+n.k.w.
+N.N.
+Nov.
+O.C.D.
+op.
+P.C.
+Phd.
+Ph.D.
+P.J.
+P.o.
+P.O.
+P.O.P.
+P.P.F.
+Prof.
+P.s.
+P.S.
+Q.C.
+Rd.
+s.a.w.
+S.A.W.
+S.D.
+Sept.
+sh.
+Sh.
+SH.
+shs.
+Shs.
+S.J.
+S.L.
+S.L.P.
+S.s.
+S.S.
+St.
+s.w.
+s.w.T.
+taz.
+Taz.
+T.C.
+T.E.C.
+T.L.P.
+T.O.H.S.
+Tsh.
+T.V.
+tz.
+uk.
+Uk.
+U.M.C.A.
+U.N.
+U.S.
+Ush.
+U.W.T.
+Viii.
+Vol.
+V.T.C.
+W.H.
+yamb.
+Y.M.C.A.
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 0350a894..acc537fb 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -226,7 +226,7 @@ sub proc_token {
     }
 
     ## step 1: check the most common case
-    if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
+    if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){
 	### most common cases
 	return $token;
     }
@@ -246,7 +246,7 @@ sub proc_token {
 	## number
 	return $token;
     }
-    if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+.*$/){
+    if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+.*$/){
         ## twitter hashtag or address
         return proc_rightpunc($token);
     }
@@ -277,7 +277,7 @@ sub proc_token {
     }
 
     #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]';
-    my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]';
+    my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]';
     if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){
 	### /nls/p/....
 	return $token;
@@ -361,7 +361,7 @@ sub deep_proc_token {
     }
 
     ##### step 0: if it mades up of all puncts, remove one punct at a time.
-    if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}a-zA-Z\d]/){
+    if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){
 	if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){
 	    ## ++ @@@@ !!! ....
 	    return $line;
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 1a24193d..028992cf 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -9,5 +9,5 @@ $SUPPORT/utf8-normalize.sh |
   sed -e 's/ al - / al-/g' |
   $SUPPORT/fix-contract.pl |
   sed -e 's/^ //' | sed -e 's/ $//' |
-  perl -e 'while(<>){s/(\d+)(\.+)$/$1 ./;print;}'
+  perl -e 'while(<>){s/(\d+)(\.+)$/$1 ./; s/(\d+)(\.+) \|\|\|/$1 . |||/;  print;}'
author	Paul Baltescu <pauldb89@gmail.com>	2013-04-24 17:18:10 +0100
committer	Paul Baltescu <pauldb89@gmail.com>	2013-04-24 17:18:10 +0100
commit	ba206aaac1d95e76126443c9e7ccc5941e879849 (patch)
tree	13a918da3f3983fd8e4cb74e7cdc3f5e1fc01cd1 /corpus
parent	c2aede0f19b7a5e43581768b8c4fbfae8b92c68c (diff)
parent	db960a8bba81df3217660ec5a96d73e0d6baa01b (diff)