diff options
Diffstat (limited to 'corpus/support')
| -rw-r--r-- | corpus/support/README | 2 | ||||
| -rwxr-xr-x | corpus/support/fix-contract.pl | 10 | ||||
| -rwxr-xr-x | corpus/support/quote-norm.pl | 86 | ||||
| -rw-r--r-- | corpus/support/token_list | 448 | ||||
| -rw-r--r-- | corpus/support/token_patterns | 5 | ||||
| -rwxr-xr-x | corpus/support/tokenizer.pl | 709 | ||||
| -rwxr-xr-x | corpus/support/utf8-normalize.sh | 36 | 
7 files changed, 1296 insertions, 0 deletions
# ==== corpus/support/README ====
# Run ./tokenize.sh to tokenize text
# Edit eng_token_patterns and eng_token_list to add rules for things not to segment

# ==== corpus/support/fix-contract.pl ====
# Re-attaches English contractions that the tokenizer split apart:
# " ' s " -> " 's ", "n ' t " -> " n't ". Reads stdin, writes stdout.
#!/usr/bin/perl -w
use strict;
while(<>) {
  #s/ (pre|anti|re|pro|inter|intra|multi|e|x|neo) - / $1- /ig;
  #s/ - (year) - (old)/ -$1-$2/ig;
  s/ ' (s|m|ll|re|d|ve) / '$1 /ig;
  s/n ' t / n't /ig;
  print;
}

# ==== corpus/support/quote-norm.pl (first half; continues on the next source line) ====
# Normalizes quotes, HTML entities, currency symbols and assorted Unicode
# punctuation to plain ASCII-ish forms, one line of UTF-8 text at a time.
#!/usr/bin/perl -w
use strict;
use utf8;
binmode(STDIN,"utf8");
binmode(STDOUT,"utf8");
while(<STDIN>) {
  chomp;
  # Pad with spaces so word-boundary-style rules (" xx ") match at line edges.
  $_ = " $_ ";
  # Unescape (possibly space-mangled) HTML entities.
  s/&\s*lt\s*;/</gi;
  s/&\s*gt\s*;/>/gi;
  s/&\s*squot\s*;/'/gi;
  s/&\s*quot\s*;/"/gi;
  s/&\s*amp\s*;/&/gi;
  # BUG FIX: &nbsp; is a non-breaking space (U+00A0) and must become a space,
  # not "&" (the original was a copy-paste of the &amp; rule above; note the
  # &#160; rule below — the numeric form of the same character — maps to space).
  s/&\s*nbsp\s*;/ /gi;
  s/&\s*#\s*160\s*;/ /gi;
  # Rejoin times split around the colon: "12: 30" -> "12:30".
  s/ (\d\d): (\d\d)/ $1:$2/g;
  # BUG FIX: the original class "[\x{20a0}]\x{20ac}]" was malformed (stray "]")
  # and could only match the literal sequence "₠€]", i.e. never fired on real
  # text. Intent: map the euro-currency signs U+20A0 / U+20AC to " EUR ".
  s/[\x{20a0}\x{20ac}]/ EUR /g;
  s/[\x{00A3}]/ GBP /g;
  # Split currency prefix from amount: "US$120" -> "US$ 120".
  s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
  # "euro120" / "eur120" -> "EUR 120" (note $2 is deliberately dropped).
  s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
  # &#45; is hyphen-minus; doubled or single entity becomes a dash pair.
  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
  s/&\s*#45\s*;/--/g;
  s/�c/--/g;
  # Low-9 / CJK / spacing quote variants all normalize to ASCII double quote.
  s/ ,,/ "/g;
  s/„/"/g;
  s/``/"/g;
  s/''/"/g;
  s/[「」]/"/g;
  s/〃/"/g;
  s/¨/"/g;
  # Inverted Spanish punctuation becomes its own token.
  s/¡/ ¡ /g;
  s/¿/ ¿ /g;

  # Rejoin common Russian abbreviations split around internal periods,
  # e.g. "т. е." -> "т.е." (cf. the matching entries in token_list).
  s/ п. п. / п.п. /g;
  s/ ст. л. / ст.л. /g;
  s/ т. е. / т.е. /g;
  s/ т. к. / т.к. /g;
  s/ т. ч. / т.ч. /g;
  s/ т. д. / т.д. /g;
  s/ т. п. / т.п. /g;
  s/ и. о. / и.о. /g;
  s/ с. г. / с.г. /g;
  s/ г. р. / г.р. /g;
  s/ т. н. / т.н. /g;
  s/ т. ч. / т.ч. /g;
  s/ н. э. / н.э. 
/g; +  # â<U+0080><U+0099> +  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; +  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; +  s/ˇ/'/g; +  s/´/'/g; +  s/`/'/g; +  s/’/'/g; +  s/ ́/'/g; +  s/‘/'/g; +  s/ˉ/'/g; +  s/β/ß/g; # WMT 2010 error +  s/“/"/g; +  s/”/"/g; +  s/«/"/g; +  s/»/"/g; +  tr/!-~/!-~/; +  s/、/,/g; +  # s/。/./g; +  s/…/.../g; +  s/―/--/g; +  s/–/--/g; +  s/─/--/g; +  s/—/--/g; +  s/•/ * /g; +  s/\*/ * /g; +  s/،/,/g; +  s/؟/?/g; +  s/ـ/ /g; +  s/à ̄/i/g; +  s/’/'/g; +  s/â€"/"/g; +  s/؛/;/g; +		     +  s/\s+/ /g; +  s/^\s+//; +  s/\s+$//; +  s/[\x{00}-\x{1f}]//g; +  print "$_\n"; +} + diff --git a/corpus/support/token_list b/corpus/support/token_list new file mode 100644 index 00000000..43dd80d9 --- /dev/null +++ b/corpus/support/token_list @@ -0,0 +1,448 @@ +##################### hyphenated words added by Fei since 3/7/05 +##X-ray + +##################### words made of punct only +:- +:-) +:-( ++= +-= +.= +*= +>= +<= +== +&& +|| +=> +-> +<- +:) +:( +;) + +#################### abbr added by Fei +oz. +fl. +tel. +1. +2. +3. +4. +5. +6. +7. +8. +9. +10. + +##################### abbreviation: words that contain period. +EE.UU. +ee.uu. +U.A.E +Ala. +Ph.D. +min. +max. +z.B. +d.h. +ggf. +ca. +bzw. +bzgl. +Eng. +i.e. +a.m. +am. +A.M. +Apr. +Ariz. +Ark. +Aug. +B.A.T. +B.A.T +Calif. +Co. +Conn. +Corp. +Cos. +D.C. +Dec. +Dept. +Dr. +Drs. +Feb. +Fla. +Fri. +Ga. +Gen. +gen. +GEN. +Gov. +Govt. +Ill. +Inc. +Jan. +Jr. +Jul. +Jun. +Kan. +L.A. +Lieut. +Lt. +Ltd. +Ma. +Mar. +Mass. +Md. +Mfg. +Mgr. +Mexican-U.S. +Mich. +Minn. +Mo. +Mon. +Mr. +Mrs. +Ms. +Mt. +N.D. +Neb. +Nev. +No. +Nos. +Nov. +Oct. +Okla. +Op. +Ore. +Pa. +p.m +p.m. +I.B.C. +N.T.V +Pres. +Prof. +Prop. +Rd. +Rev. +R.J. +C.L +Rte. +Sat. +W.T +Sen. +Sep. +Sept. +Sgt. +Sr. +SR. +St. +Ste. +Sun. +Tenn. +Tex. +Thu. +Tue. +Univ. +Va. +Vt. +Wed. +approx. +dept. +e.g. +E.G. +eg. +est. +etc. +ex. +ext. +ft. +hon. +hr. +hrs. +lab. +lb. +lbs. +mass. +misc. +no. +nos. +nt. +para. +paras. +pct. +prod. +rec. +ref. +rel. +rep. +sq. +st. 
+stg. +vol. +vs. +U.S. +J.S. +U.N. +u.n. +A. +B. +C. +D. +E. +F. +G. +H. +I. +J. +K. +L. +M. +N. +O. +P. +Q. +R. +S. +T. +U. +V. +W. +X. +Y. +Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. +# Swahili +A.D. +Afr. +A.G. +agh. +A.H. +A.M. +a.s. +B.A. +B.C. +Bi. +B.J. +B.K. +B.O.M. +Brig. +Bro. +bt. +bw. +Bw. +Cap. +C.C. +cCM. +C.I.A. +cit. +C.M.S. +Co. +Corp. +C.S.Sp. +C.W. +D.C. +Dk. +Dkt. +Dk.B. +Dr. +E.C. +e.g. +E.M. +E.n. +etc. +Feb. +F.F.U. +F.M. +Fr. +F.W. +I.C.O. +i.e. +I.L.C. +Inc. +Jan. +J.F. +Jr. +J.S. +J.V.W.A. +K.A.R. +K.A.U. +K.C.M.C. +K.k. +K.K. +k.m. +km. +K.m. +K.N.C.U. +K.O. +K.S. +Ksh. +kt. +kumb. +k.v. +kv. +L.G. +ltd. +Ltd. +M.A. +M.D. +mf. +Mh. +Mhe. +mil. +m.m. +M.m. +Mm. +M.M. +Mr. +Mrs. +M.S. +Mt. +Mw. +M.W. +Mwl. +na. +Na. +N.F. +N.J. +n.k. +nk. +n.k.w. +N.N. +Nov. +O.C.D. +op. +P.C. +Phd. +Ph.D. +P.J. +P.o. +P.O. +P.O.P. +P.P.F. +Prof. +P.s. +P.S. +Q.C. +Rd. +s.a.w. +S.A.W. +S.D. +Sept. +sh. +Sh. +SH. +shs. +Shs. +S.J. +S.L. +S.L.P. +S.s. +S.S. +St. +s.w. +s.w.T. +taz. +Taz. +T.C. +T.E.C. +T.L.P. +T.O.H.S. +Tsh. +T.V. +tz. +uk. +Uk. +U.M.C.A. +U.N. +U.S. +Ush. +U.W.T. +Viii. +Vol. +V.T.C. +W.H. +yamb. +Y.M.C.A. 
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns new file mode 100644 index 00000000..de64fb2a --- /dev/null +++ b/corpus/support/token_patterns @@ -0,0 +1,5 @@ +/^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/ +/^(\d|\d\d|\d\d\d)\.$/ + diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl new file mode 100755 index 00000000..0350a894 --- /dev/null +++ b/corpus/support/tokenizer.pl @@ -0,0 +1,709 @@ +#!/usr/bin/env perl + +my $script_dir; +BEGIN {$^W = 1; use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }   + +use strict; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +my $debug = 0; + + +############ options:  +### for all options: +### 0 means no split on that symbol +### 1 means split on that symbol in all cases. +### 2 means do not split in condition 1. +### n means do not split in any of the conditions in the set {1, 2, ..., n-1}. + + +### prefix +## for "#": #90 +my $Split_On_SharpSign = 2; # 2: do not split on Num, e.g., "#90" + + +############## "infix" +my $Split_On_Tilde = 2;  # 2: do not split on Num, e.g., "12~13". + +my $Split_On_Circ = 2;   # 2: do not split on Num, e.g, "2^3" + +## for "&" +my $Split_On_AndSign = 2;  # 2: do not split on short Name, e.g., "AT&T". + +## for hyphen: 1990-1992 +my $Split_On_Dash = 2;  ## 2: do not split on number, e.g., "22-23". +my $Split_On_Underscore = 0;  ## 0: do not split by underline + +## for ":": 5:4 +my $Split_On_Semicolon = 2; ## 2: don't split for num, e.g., "5:4" + +###########  suffix +## for percent sign: 5% +my $Split_On_PercentSign = 1;      ## 2: don't split num, e.g., 5% + +############# others +## for slash: 1/4 +my $Split_On_Slash = 2;  ## 2: don't split on number, e.g., 1/4. 
+my $Split_On_BackSlash = 0;  ## 0: do not split on "\", e.g., \t + +### for "$": US$120 +my $Split_On_DollarSign = 2; ### 2: US$120 => "US$ 120" +                             ### 1: US$120 => "US $ 120" +## for 's etc. +my $Split_NAposT = 1;  ## n't    +my $Split_AposS = 1;   ## 's +my $Split_AposM = 1;   ## 'm +my $Split_AposRE = 1;  ## 're +my $Split_AposVE = 1;  ## 've +my $Split_AposLL = 1;  ## 'll +my $Split_AposD  = 1;  ## 'd + + +### some patterns +my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-'; + +#### step 1: read files + +my $workdir = $script_dir; +my $dict_file = "$workdir/token_list"; +my $word_patt_file = "$workdir/token_patterns"; + +open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8"); + +# read in the list of words that should not be segmented,  +##  e.g.,"I.B.M.", co-operation. +my %dict_hash = (); +my $dict_entry = 0; +while(<$dict_fp>){ +    chomp; +    next if /^\s*$/; +    s/^\s+//; +    s/\s+$//; +    tr/A-Z/a-z/; +    $dict_hash{$_} = 1; +    $dict_entry ++; +} + +open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8"); +my @word_patts = (); +my $word_patt_num = 0; +while(<$patt_fp>){ +    chomp; +    next if /^\s*$/; +    s/^\s+//; +    s/\s+$//; +    s/^\/(.+)\/$/$1/;   # remove / / around the pattern +    push(@word_patts, $_); +    $word_patt_num ++; +} + + +###### step 2: process the input file +my $orig_token_total = 0; +my $deep_proc_token_total = 0; +my $new_token_total = 0; + +while(<STDIN>){ +    chomp(); + +    if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) { +	## markup +	print STDOUT "$_\n"; +	next; +    } + +    my $orig_num = 0; +    my $deep_proc_num = 0; + +    my $new_line = proc_line($_, \$orig_num, \$deep_proc_num); + +    $orig_token_total += $orig_num; +    $deep_proc_token_total += $deep_proc_num; + +    $new_line =~ s/\s+$//; +    $new_line =~ s/^\s+//; +    my @parts = split(/\s+/, $new_line); +    $new_token_total += scalar @parts; + 
+    $new_line =~ s/\s+/ /g; +# fix sgm-markup tokenization +    $new_line =~ s/\s*<\s+seg\s+id\s+=\s+(\d+)\s+>/<seg id=$1>/; +    $new_line =~ s/\s*<\s+(p|hl)\s+>/<$1>/; +    $new_line =~ s/\s*<\s+\/\s+(p|hl|DOC)\s+>/<\/$1>/; +    $new_line =~ s/<\s+\/\s+seg\s+>/<\/seg>/; +    if ($new_line =~ /^\s*<\s+DOC\s+/) { +	$new_line =~ s/\s+//g; +	$new_line =~ s/DOC/DOC /; +	$new_line =~ s/sys/ sys/; +    } +    if ($new_line =~ /^\s*<\s+(refset|srcset)\s+/) { +	$new_line =~ s/\s+//g; +	$new_line =~ s/(set|src|tgt|trg)/ $1/g; +    } + +    print STDOUT " $new_line\n"; +} + +######################################################################## +    +### tokenize a line.  +sub proc_line { +    my @params = @_; +    my $param_num = scalar @params; + +    if(($param_num < 1) || ($param_num > 3)){ +	die "wrong number of params for proc_line: $param_num\n"; +    } + +    my $orig_line = $params[0]; + +    $orig_line =~ s/^\s+//; +    $orig_line =~ s/\s+$//; + +    my @parts = split(/\s+/, $orig_line); + +    if($param_num >= 2){ +	my $orig_num_ptr = $params[1]; +	$$orig_num_ptr = scalar @parts; +    } + +    my $new_line = ""; + +    my $deep_proc_token = 0; +    foreach my $part (@parts){ +	my $flag = -1; +	$new_line .= proc_token($part, \$flag) . " "; +	$deep_proc_token += $flag; +    } + +    if($param_num == 3){ +	my $deep_num_ptr = $params[2]; +	$$deep_num_ptr = $deep_proc_token; +    } + +    return $new_line; +} + + + +## Tokenize a str that does not contain " ", return the new string +## The function handles the cases that the token needs not be segmented. 
+## for other cases, it calls deep_proc_token() +sub proc_token { +    my @params = @_; +    my $param_num = scalar @params; +    if($param_num > 2){ +	die "proc_token: wrong number of params: $param_num\n"; +    } + +    my $token = $params[0]; + +    if(!defined($token)){ +	return ""; +    } + +    my $deep_proc_flag;  + +    if($param_num == 2){ +	$deep_proc_flag = $params[1]; +        $$deep_proc_flag = 0; +    } + +    if($debug){ +	print STDERR "pro_token:+$token+\n"; +    } + +    ### step 0: it has only one char +    if(($token eq "") || ($token=~ /^.$/)){ +	## print STDERR "see +$token+\n"; +	return $token; +    } + +    ## step 1: check the most common case +    if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){ +	### most common cases +	return $token; +    } + +    ## step 2: check whether it is some NE entity +    ### 1.2.4.6 +    if($token =~ /^\d+(.\d+)+$/){ +	return $token; +    } + +    ## 1,234,345.34 +    if($token =~ /^\d+(\.\d{3})*,\d+$/){ +	## number +	return $token; +    } +    if($token =~ /^\d+(,\d{3})*\.\d+$/){ +	## number +	return $token; +    } +    if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+.*$/){ +        ## twitter hashtag or address +        return proc_rightpunc($token); +    } + +    if($token =~ /^[a-z0-9\_\-]+\@[a-z\d\_\-]+(\.[a-z\d\_\-]+)*(.*)$/i){ +	### email address: xxx@yy.zz +	return proc_rightpunc($token); +    } + +    if($token =~ /^(mailto|http|https|ftp|gopher|telnet|file)\:\/{0,2}([^\.]+)(\.(.+))*$/i){ +	### URL: http://xx.yy.zz +	return proc_rightpunc($token); +    } + +    if($token =~ /^(www)(\.(.+))+$/i){ +	###  www.yy.dd/land/ +	return proc_rightpunc($token); +    } + +    if($token =~ /^(\w+\.)+(com|co|edu|org|gov|ly|cz|ru|eu)(\.[a-z]{2,3})?\:{0,2}(\/\S*)?$/i){ +	### URL: upenn.edu/~xx +	return proc_rightpunc($token); +    } + +    if($token =~ /^\(\d{3}\)\d{3}(\-\d{4})($common_right_punc)*$/){ +	## only handle American phone numbers: e.g., 
(914)244-4567 +	return proc_rightpunc($token); +    } + +    #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]'; +    my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]'; +    if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){ +	### /nls/p/.... +	return $token; +    } + +    if($token =~ /^\\(($t1)+\\)+($t1)+\\?$/i){ +	### \nls\p\.... +	return $token; +    } +	 +    ## step 3: check the dictionary +    my $token_lc = $token; +    $token_lc =~ tr/A-Z/a-z/; + +    if(defined($dict_hash{$token_lc})){ +	return $token; +    } + +    ## step 4: check word_patterns +    my $i=1; +    foreach my $patt (@word_patts){ +	if($token_lc =~ /$patt/){ +	    if($debug){ +		print STDERR "+$token+ match pattern $i: +$patt+\n"; +	    } +	    return $token; +	}else{ +	    $i++; +	} +    } + +    ## step 5: call deep tokenization +    if($param_num == 2){ +	$$deep_proc_flag = 1; +    } +    return deep_proc_token($token); +} + + +### remove punct on the right side +### e.g., xxx@yy.zz, => xxx@yy.zz , +sub proc_rightpunc { +    my ($token) = @_; + +    $token =~ s/(($common_right_punc)+)$/ $1 /; +    if($token =~ /\s/){ +	return proc_line($token); +    }else{ +	return $token; +    } +} +     + + +####################################### +### return the new token:  +###   types of punct: +##      T1 (2):   the punct is always a token by itself no matter where it +###           appears:   " ;   +##      T2 (15):  the punct that can be a part of words made of puncts only. +##               ` ! @ + = [ ] ( ) { } | < > ?  +##      T3 (15):  the punct can be part of a word that contains [a-z\d] +##        T3: ~ ^ & : , # * % - _ \ / . $ ' +##             infix: ~ (12~13), ^ (2^3), & (AT&T),  : ,   +##             prefix: # (#9),  * (*3),  +##             suffix: % (10%),  +##             infix+prefix: - (-5), _ (_foo),  +##             more than one position: \ /  . $ +##             Appos: 'm n't ... + +##   1. separate by puncts in T1 +##   2. separate by puncts in T2  +##   3. 
deal with punct T3 one by one according to options  +##   4. if the token remains unchanged after step 1-3, return the token + +## $line contains at least 2 chars, and no space. +sub deep_proc_token { +    my ($line) = @_; +    if($debug){ +	print STDERR "deep_proc_token: +$line+\n"; +    } + +    ##### step 0: if it mades up of all puncts, remove one punct at a time. +    if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}a-zA-Z\d]/){ +	if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){ +	    ## ++ @@@@ !!! .... +	    return $line; +	} + +	if($line =~ /^(.)(.+)$/){ +	    my $t1 = $1; +	    my $t2 = $2; +	    return $t1 . " " . proc_token($t2); +	}else{ +	    ### one char only +	    print STDERR "deep_proc_token: this should not happen: +$line+\n"; +	    return $line; +	} +    } + +    ##### step 1: separate by punct T2 on the boundary +    my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;'; +    if($line =~ s/^(($t2)+)/$1 /){ +	return proc_line($line); +    } +	 +    if($line =~ s/(($t2)+)$/ $1/){ +	return proc_line($line); +    }	 + +    ## step 2: separate by punct T2 in any position +    if($line =~ s/(($t2)+)/ $1 /g){ +	return proc_line($line); +    } + +    ##### step 3: deal with special puncts in T3. +    if($line =~ /^(\,+)(.+)$/){ +	my $t1 = $1; +	my $t2 = $2; +	return proc_token($t1) . " " . proc_token($t2); +    } + +    if($line =~ /^(.*[^\,]+)(\,+)$/){ +	## 19.3,,, => 19.3 ,,, +	my $t1 = $1; +	my $t2 = $2; +	return proc_token($t1) . " " . proc_token($t2); +    } + +    ## remove the ending periods that follow number etc. +    if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){ +	##    12~13. => 12~13 . +	my $t1 = $1; +	my $t3 = $3; +	return proc_token($t1) . " " . 
proc_token($t3); +    } + +    ###  deal with "$" +    if(($line =~ /\$/) && ($Split_On_DollarSign > 0)){ +	my $suc = 0; +	if($Split_On_DollarSign == 1){ +	    ## split on all occasation +	    $suc = ($line =~ s/(\$+)/ $1 /g); +	}else{ +	    ## split only between $ and number +	    $suc = ($line =~ s/(\$+)(\d)/$1 $2/g); +	} + +	if($suc){ +	    return proc_line($line); +	} +    } + +    ## deal with "#" +    if(($line =~ /\#/) && ($Split_On_SharpSign > 0)){ +	my $suc = 0; +	if($Split_On_SharpSign >= 2){ +	    ### keep #50 as a token +	    $suc = ($line =~ s/(\#+)(\D)/ $1 $2/gi); +	}else{ +	    $suc = ($line =~ s/(\#+)/ $1 /gi); +	} + +	if($suc){ +	    return proc_line($line); +	} +    } + +    ## deal with ' +    if($line =~ /\'/){ +	my $suc = ($line =~ s/([^\'])([\']+)$/$1 $2/g);  ## xxx'' => xxx ''  +	 +	### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't + +	##  'there => ' there   '98 => the same +	$suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi); +	 +	##  note that \' and \. could interact: e.g.,  U.S.'s;   're. +	if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){ +	    ## doesn't => does n't +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . proc_token($t3); +	} + +	## 's, 't, 'm,  'll, 're, 've: they've => they 've  +        ## 1950's => 1950 's     Co.'s => Co. 's +	if($Split_AposS && ($line =~ /^(.+)(\'s)(\W*)$/i)){ +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . proc_token($t3); +	} + +	if($Split_AposM && ($line =~ /^(.*[a-z]+)(\'m)(\.*)$/i)){ +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . proc_token($t3); +	} + + +	if($Split_AposRE && ($line =~ /^(.*[a-z]+)(\'re)(\.*)$/i)){ +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . 
proc_token($t3); +	} + +	if($Split_AposVE && ($line =~ /^(.*[a-z]+)(\'ve)(\.*)$/i)){ +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . proc_token($t3); +	} + +	if($Split_AposLL && ($line =~ /^(.*[a-z]+)(\'ll)(\.*)$/i)){ +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . proc_token($t3); +	} + +	if($Split_AposD && ($line =~ /^(.*[a-z]+)(\'d)(\.*)$/i)){ +	    my $t1 = $1; +	    my $t2 = $2; +	    my $t3 = $3; +	    return proc_token($t1) . " " . $t2 . " " . proc_token($t3);	     +	} +	 +	if($suc){ +	    return proc_line($line); +	} +    } + + +    ## deal with "~" +    if(($line =~ /\~/) && ($Split_On_Tilde > 0)){ +	my $suc = 0; +	if($Split_On_Tilde >= 2){ +	    ## keep 12~13 as one token +	    $suc += ($line =~ s/(\D)(\~+)/$1 $2 /g); +	    $suc += ($line =~ s/(\~+)(\D)/ $1 $2/g); +	    $suc += ($line =~ s/^(\~+)(\d)/$1 $2/g); +	    $suc += ($line =~ s/(\d)(\~+)$/$1 $2/g); +	}else{ +	    $suc += ($line =~ s/(\~+)/ $1 /g);	 +	} +	if($suc){ +	    return proc_line($line); +	} +    } + +    ## deal with "^" +    if(($line =~ /\^/) && ($Split_On_Circ > 0)){ +	my $suc = 0; +	if($Split_On_Circ >= 2){ +	    ## keep 12~13 as one token +	    $suc += ($line =~ s/(\D)(\^+)/$1 $2 /g); +	    $suc += ($line =~ s/(\^+)(\D)/ $1 $2/g); +	}else{ +	    $suc = ($line =~ s/(\^+)/ $1 /g);	 +	} +	if($suc){ +	    return proc_line($line); +	} +    } + +    ## deal with ":" +    if(($line =~ /\:/) && ($Split_On_Semicolon > 0)){ +	## 2: => 2 : +	my $suc = ($line =~ s/^(\:+)/$1 /); +	$suc += ($line =~ s/(\:+)$/ $1/); +	if($Split_On_Semicolon >= 2){ +	    ## keep 5:4 as one token +	    $suc += ($line =~ s/(\D)(\:+)/$1 $2 /g); +	    $suc += ($line =~ s/(\:+)(\D)/ $1 $2/g); +	}else{ +	    $suc += ($line =~ s/(\:+)/ $1 /g);	 +	} + +	if($suc){ +	    return proc_line($line); +	} +    } + +    ###  deal with hyphen: 1992-1993. 
21st-24th +    if(($line =~ /\-/) && ($Split_On_Dash > 0)){ +	my $suc = ($line =~ s/(\-{2,})/ $1 /g); +	if($Split_On_Dash >= 2){ +	    ## keep 1992-1993 as one token +	    $suc += ($line =~ s/(\D)(\-+)/$1 $2 /g); +	    $suc += ($line =~ s/(\-+)(\D)/ $1 $2/g); +	}else{ +	    ### always split on "-" +	    $suc += ($line =~ s/([\-]+)/ $1 /g); +	} + +	if($suc){ +	    return proc_line($line); +	} +    } + +    ## deal with "_" +    if(($line =~ /\_/) && ($Split_On_Underscore > 0)){ +	### always split on "-" +	if($line =~ s/([\_]+)/ $1 /g){ +	    return proc_line($line); +	} +    } + + + +    ## deal with "%" +    if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){ +	my $suc = 0; +	if($Split_On_PercentSign >= 2){ +	    $suc += ($line =~ s/(\D)(\%+)/$1 $2/g); +	}else{ +	    $suc += ($line =~ s/(\%+)/ $1 /g); +	} + +	if($suc){ +	    return proc_line($line); +	} +    } +	 + +    ###  deal with "/": 4/5 +    if(($line =~ /\//) && ($Split_On_Slash > 0)){ +	my $suc = 0; +	if($Split_On_Slash >= 2){ +	    $suc += ($line =~ s/(\D)(\/+)/$1 $2 /g); +	    $suc += ($line =~ s/(\/+)(\D)/ $1 $2/g); +	}else{ +	    $suc += ($line =~ s/(\/+)/ $1 /g); +	} + +	if($suc){ +	    return proc_line($line); +	} +    } + + +    ### deal with comma: 123,456 +    if($line =~ /\,/){ +	my $suc = 0; +	$suc += ($line =~ s/([^\d]),/$1 , /g);     ## xxx, 1923 => xxx , 1923 +	$suc += ($line =~ s/\,\s*([^\d])/ , $1/g); ## 1923, xxx => 1923 , xxx + +	$suc += ($line =~ s/,([\d]{1,2}[^\d])/ , $1/g);  ## 1,23 => 1 , 23 +	$suc += ($line =~ s/,([\d]{4,}[^\d])/ , $1/g);  ## 1,2345 => 1 , 2345 + +	$suc += ($line =~ s/,([\d]{1,2})$/ , $1/g);  ## 1,23 => 1 , 23 +	$suc += ($line =~ s/,([\d]{4,})$/ , $1/g);  ## 1,2345 => 1 , 2345 + +	if($suc){ +	    return proc_line($line); +	} +    } +     + +    ##  deal with "&" +    if(($line =~ /\&/) && ($Split_On_AndSign > 0)){ +	my $suc = 0; +	if($Split_On_AndSign >= 2){ +	    $suc += ($line =~ s/([a-z]{3,})(\&+)/$1 $2 /gi); +	    $suc += ($line =~ s/(\&+)([a-z]{3,})/ $1 $2/gi); 
+	}else{ +	    $suc += ($line =~ s/(\&+)/ $1 /g); +	} + +	if($suc){ +	    return proc_line($line); +	} +    } +	 +    ## deal with period +    if($line =~ /\./){ +	if($line =~ /^(([\+|\-])*(\d+\,)*\d*\.\d+\%*)$/){ +	    ### numbers: 3.5 +	    return $line; +	} + +	if($line =~ /^(([a-z]\.)+)(\.*)$/i){ +	    ## I.B.M. +	    my $t1 = $1; +	    my $t3 = $3; +	    return $t1 . " ". proc_token($t3); +	} + +	## Feb.. => Feb. . +	if($line =~ /^(.*[^\.])(\.)(\.*)$/){ +	    my $p1 = $1; +	    my $p2 = $2; +	    my $p3 = $3; +	     +	    my $p1_lc = $p1; +	    $p1_lc =~ tr/A-Z/a-z/; + +	    if(defined($dict_hash{$p1_lc . $p2})){ +		## Dec.. => Dec. . +		return $p1 . $p2 . " " . proc_token($p3); +	    }elsif(defined($dict_hash{$p1_lc})){ +		return $p1 . " " . proc_token($p2 . $p3); +	    }else{ +		## this. => this . +		return proc_token($p1) . " " . proc_token($p2 . $p3); +	    } +	} + +	if($line =~ s/(\.+)(.+)/$1 $2/g){ +	    return proc_line($line); +	} +    } + + +    ## no pattern applies +    return $line; +} + + + + + + +		    + diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh new file mode 100755 index 00000000..2f347854 --- /dev/null +++ b/corpus/support/utf8-normalize.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# this is the location on malbec, if you want to run on another machine +# ICU may be installed in /usr or /usr/local +ICU_DIR=/usr0/tools/icu +UCONV_BIN=$ICU_DIR/bin/uconv +UCONV_LIB=$ICU_DIR/lib + +if [ -e $UCONV_BIN ] && [ -d $UCONV_LIB ] +then +  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$UCONV_LIB +  if [ ! -x $UCONV_BIN ] +  then +    echo "$0: Cannot execute $UCONV_BIN! Please fix." 1>&2 +    exit +  fi +  CMD="$UCONV_BIN -f utf8 -t utf8 -x Any-NFKC --callback skip" +else +  if which uconv > /dev/null +  then +    CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip" +  else +    echo "$0: Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Quality may suffer." 
1>&2 +    CMD="iconv -f utf8 -t utf8 -c" +  fi +fi + +perl -e 'while(<>){s/\r\n*/\n/g; print;}' | $CMD | /usr/bin/perl -w -e ' + while (<>) { +     chomp; +      s/[\x00-\x1F]+/ /g; +      s/  +/ /g; +      s/^ //; +      s/ $//; +      print "$_\n"; +    }' | 
