diff options
Diffstat (limited to 'corpus/support')
-rwxr-xr-x | corpus/support/fix-eos.pl | 10 | ||||
-rwxr-xr-x | corpus/support/quote-norm.pl | 146 | ||||
-rw-r--r-- | corpus/support/token_list | 59 | ||||
-rwxr-xr-x | corpus/support/tokenizer.pl | 27 |
4 files changed, 192 insertions, 50 deletions
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl new file mode 100755 index 00000000..584f8b46 --- /dev/null +++ b/corpus/support/fix-eos.pl @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +while(<STDIN>) { + s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s; + print; +} diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 57f4ad77..f677df66 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -7,31 +7,98 @@ binmode(STDOUT,"utf8"); while(<STDIN>) { chomp; $_ = " $_ "; - s/&\s*lt\s*;/</gi; - s/&\s*gt\s*;/>/gi; - s/&\s*squot\s*;/'/gi; - s/&\s*quot\s*;/"/gi; - s/&\s*amp\s*;/&/gi; - s/&\s*nbsp\s*;/&/gi; - s/&\s*#\s*160\s*;/ /gi; + + # Regularlize spaces: + s/\x{a0}/ /g; # non-breaking space + s/\x{2009}/ /g; # thin space + s/\x{2028}/ /g; # "line separator" + s/\x{2029}/ /g; # "paragraph separator" + s/\x{202a}/ /g; # "left-to-right embedding" + s/\x{202b}/ /g; # "right-to-left embedding" + s/\x{202c}/ /g; # "pop directional formatting" + s/\x{202d}/ /g; # "left-to-right override" + s/\x{202e}/ /g; # "right-to-left override" + s/\x{85}/ /g; # "next line" + s/\x{fffd}/ /g; # "replacement character" + s/\x{feff}/ /g; # byte-order mark + s/\x{fdd3}/ /g; # "unicode non-character" + + # Regularize named HTML/XML escapes: + s/&\s*lt\s*;/</gi; # HTML opening angle bracket + s/&\s*gt\s*;/>/gi; # HTML closing angle bracket + s/&\s*squot\s*;/'/gi; # HTML single quote + s/&\s*quot\s*;/"/gi; # HTML double quote + s/&\s*nbsp\s*;/ /gi; # HTML non-breaking space + s/'/\'/g; # HTML apostrophe + s/&\s*amp\s*;/&/gi; # HTML ampersand (last) + + # Regularize known HTML numeric codes: + s/&\s*#\s*160\s*;/ /gi; # no-break space + s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus + s/&\s*#45\s*;/--/g; # hyphen-minus + + # Convert arbitrary hex or decimal HTML entities to actual characters: + s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge; + s/&\#([0-9]+);/pack("U", $1)/ge; + + # Convert other Windows 1252 characters to UTF-8 + s/\x{80}/\x{20ac}/g; # euro sign + s/\x{95}/\x{2022}/g; # bullet + s/\x{99}/\x{2122}/g; # trademark sign + + # Currency and measure conversions: s/ (\d\d): (\d\d)/ $1:$2/g; s/[\x{20a0}]\x{20ac}]/ EUR /g; s/[\x{00A3}]/ GBP /g; s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; - s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; - s/&\s*#45\s*;/--/g; - s/�c/--/g; - s/ ,,/ "/g; - s/„/"/g; - s/``/"/g; - s/''/"/g; - s/[「」]/"/g; - s/〃/"/g; - s/¨/"/g; + + # Ridiculous double conversions(?) (news commentary and Giga-FrEn): + s/�c/--/g; # long dash + s/\x{e2}\x{20ac}oe/\"/g; # opening double quote + s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote + s/\x{e2}\x{20ac}\x{9d}/\"/g; # closing double quote + s/\x{e2}\x{20ac}\x{2122}/\'/g; # apostrophe + s/\x{e2}\x{20ac}\x{201c}/ -- /g; # en dash? + s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash? + s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote? + s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote? + + # Regularize quotes: + s/ˇ/'/g; # caron + s/´/'/g; # acute accent + s/`/'/g; # grave accent + s/ˉ/'/g; # modified letter macron + s/ ,,/ "/g; # ghetto low-99 quote + s/``/"/g; # latex-style left quote + s/''/"/g; # latex-style right quote + s/\x{300c}/"/g; # left corner bracket + s/\x{300d}/"/g; # right corner bracket + s/\x{3003}/"/g; # ditto mark + s/\x{00a8}/"/g; # diaeresis + s/\x{92}/\'/g; # curly apostrophe + s/\x{2019}/\'/g; # curly apostrophe + s/\x{f03d}/\'/g; # curly apostrophe + s/\x{b4}/\'/g; # curly apostrophe + s/\x{2018}/\'/g; # curly single open quote + s/\x{201a}/\'/g; # low-9 quote + s/\x{93}/\"/g; # curly left quote + s/\x{201c}/\"/g; # curly left quote + s/\x{94}/\"/g; # curly right quote + s/\x{201d}/\"/g; # curly right quote + s/\x{2033}/\"/g; # curly right quote + s/\x{201e}/\"/g; # low-99 quote + s/\x{84}/\"/g; # low-99 quote (bad enc) + s/\x{201f}/\"/g; # high-rev-99 quote + s/\x{ab}/\"/g; # opening guillemet + s/\x{bb}/\"/g; # closing guillemet + s/\x{0301}/'/g; # combining acute accent + + # Space inverted punctuation: s/¡/ ¡ /g; s/¿/ ¿ /g; + # Russian abbreviations: s/ п. п. / п.п. /g; s/ ст. л. / ст.л. /g; s/ т. е. / т.е. /g; @@ -45,24 +112,19 @@ while(<STDIN>) { s/ т. н. / т.н. /g; s/ т. ч. / т.ч. /g; s/ н. э. / н.э. /g; - # â<U+0080><U+0099> - s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; - s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; - s/ˇ/'/g; - s/´/'/g; - s/`/'/g; - s/’/'/g; - s/ ́/'/g; - s/‘/'/g; - s/ˉ/'/g; - s/β/ß/g; # WMT 2010 error - s/“/"/g; - s/”/"/g; - s/«/"/g; - s/»/"/g; + + # Convert foreign numerals into Arabic numerals + tr/०-९/0-9/; # devangari + tr/౦-౯/0-9/; # telugu + tr/೦-೯/0-9/; # kannada + tr/೦-௯/0-9/; # tamil + tr/൦-൯/0-9/; # malayalam + + # Random punctuation: tr/!-~/!-~/; s/、/,/g; # s/。/./g; + s/\x{85}/.../g; s/…/.../g; s/―/--/g; s/–/--/g; @@ -77,11 +139,27 @@ while(<STDIN>) { s/’/'/g; s/â€"/"/g; s/؛/;/g; - + + # Regularize ligatures: + s/\x{9c}/oe/g; # "oe" ligature + s/\x{0153}/oe/g; # "oe" ligature + s/\x{8c}/Oe/g; # "OE" ligature + s/\x{0152}/Oe/g; # "OE" ligature + s/\x{fb00}/ff/g; # "ff" ligature + s/\x{fb01}/fi/g; # "fi" ligature + s/\x{fb02}/fl/g; # "fl" ligature + s/\x{fb03}/ffi/g; # "ffi" ligature + s/\x{fb04}/ffi/g; # "ffl" ligature + + s/β/ß/g; # WMT 2010 error + + # Strip extra spaces: s/\s+/ /g; s/^\s+//; s/\s+$//; - s/[\x{00}-\x{1f}]//g; + + # Delete control characters: + s/[\x{00}-\x{1f}]//g; print "$_\n"; } diff --git a/corpus/support/token_list b/corpus/support/token_list index 43dd80d9..228663f6 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -1,6 +1,65 @@ ##################### hyphenated words added by Fei since 3/7/05 ##X-ray +# hindi abbreviation patterns +जन. +फर. +अग. +सित. +अक्टू. +अक्तू. +नव. +दिस. +डी.एल. +डी.टी.ओ. +डी.ए. +ए.एस.आई. +डी.टी.ओ. +एम.एस.आर.टी.सी. +बी.बी.एम.बी. +डी.एस.पी. +सी.आर.पी. +एस.डी.एम. +सी.डी.पी.ओ. +बी.डी.ओ. +एस.डी.ओ. +एम.पी.पी. +पी.एच.ई. +एस.एच.ओ. +ए.सी.पी. +यू.पी. +पी.एम. +आर.बी.डी. +वी.पी. +सी.ए.डी.पी. +ए. +बी. +सी. +डी. +ई. +एफ. +जी. +एच. +आई. +जे. +के. +एल. +एम. +एन. +ओ. +पी. +क़यू. +आर. +एस. +टी. +यू. +वी. +डबल्यू. +एक्स. +वाई. +ज़ेड. +ज़ी. + ##################### words made of punct only :- :-) diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e0df16a7..7771201f 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -65,7 +65,7 @@ my $Split_AposD = 1; ## 'd ### some patterns -my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-'; +my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-'; #### step 1: read files @@ -112,7 +112,7 @@ my $new_token_total = 0; while(<STDIN>){ chomp(); - + s/\x{0970}/./g; # dev abbreviation character if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) { ## markup print STDOUT "$_\n"; @@ -121,7 +121,7 @@ while(<STDIN>){ my $orig_num = 0; my $deep_proc_num = 0; - + s/(\x{0964}+)/ $1/g; # Devangari end of sentence my $new_line = proc_line($_, \$orig_num, \$deep_proc_num); $orig_token_total += $orig_num; @@ -148,7 +148,8 @@ while(<STDIN>){ $new_line =~ s/(set|src|tgt|trg)/ $1/g; } - print STDOUT " $new_line\n"; + chomp $new_line; + print STDOUT "$new_line\n"; } ######################################################################## @@ -228,6 +229,7 @@ sub proc_token { ## step 1: check the most common case if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){ + #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){ ### most common cases return $token; } @@ -363,7 +365,7 @@ sub deep_proc_token { ##### step 0: if it mades up of all puncts, remove one punct at a time. if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){ - if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){ + if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){ ## ++ @@@@ !!! .... return $line; } @@ -454,7 +456,7 @@ sub deep_proc_token { ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't ## 'there => ' there '98 => the same - $suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi); + $suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi); ## note that \' and \. could interact: e.g., U.S.'s; 're. if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){ @@ -664,10 +666,10 @@ sub deep_proc_token { return $line; } - if($line =~ /^(([a-z]\.)+)(\.*)$/i){ + if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){ ## I.B.M. - my $t1 = $1; - my $t3 = $3; + my $t1 = $1 . $5; + my $t3 = $6; return $t1 . " ". proc_token($t3); } @@ -701,10 +703,3 @@ sub deep_proc_token { return $line; } - - - - - - - |