Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-01-27 10:39:36 +0100
committer: Patrick Simianer <p@simianer.de> 2014-01-27 10:39:36 +0100
commit: 64e135092c140310345bb7fcf3dffc8072e652d3 (patch)
tree: 127636a2820fb9a7a522157ea4de55cdc0c3c0bd /corpus
parent: 58be95f557d2f6a006cc98a23de98125e6b83a32 (diff)
parent: 5ab6eb44d67a48ea5b366d8b2878f3da7ef960e4 (diff)
5 files changed, 193 insertions, 50 deletions
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
new file mode 100755
index 00000000..584f8b46
--- /dev/null
+++ b/corpus/support/fix-eos.pl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+# Filter STDIN to STDOUT: after Devanagari text, turn a space-separated "." into a danda (U+0964).
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {  # each input line is read into $_ and printed back, changed or not
+  s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s;  # the "." must be followed by end of line or by " |||"; no /g, so only the first match per line is rewritten
+  print;
+}
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 57f4ad77..f677df66 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -7,31 +7,98 @@ binmode(STDOUT,"utf8");
 while(<STDIN>) {
   chomp;
   $_ = " $_ ";
-  s/&\s*lt\s*;/</gi;
-  s/&\s*gt\s*;/>/gi;
-  s/&\s*squot\s*;/'/gi;
-  s/&\s*quot\s*;/"/gi;
-  s/&\s*amp\s*;/&/gi;
-  s/&\s*nbsp\s*;/&/gi;
-  s/&\s*#\s*160\s*;/ /gi;
+
+  # Regularize spaces:
+  s/\x{a0}/ /g;       # non-breaking space
+  s/\x{2009}/ /g;     # thin space
+  s/\x{2028}/ /g;     # "line separator"
+  s/\x{2029}/ /g;     # "paragraph separator"
+  s/\x{202a}/ /g;     # "left-to-right embedding"
+  s/\x{202b}/ /g;     # "right-to-left embedding"
+  s/\x{202c}/ /g;     # "pop directional formatting"
+  s/\x{202d}/ /g;     # "left-to-right override"
+  s/\x{202e}/ /g;     # "right-to-left override"
+  s/\x{85}/ /g;       # "next line"
+  s/\x{fffd}/ /g;     # "replacement character"
+  s/\x{feff}/ /g;     # byte-order mark
+  s/\x{fdd3}/ /g;     # "unicode non-character"
+
+  # Regularize named HTML/XML escapes:
+  s/&\s*lt\s*;/</gi;    # HTML opening angle bracket
+  s/&\s*gt\s*;/>/gi;    # HTML closing angle bracket
+  s/&\s*squot\s*;/'/gi; # HTML single quote
+  s/&\s*quot\s*;/"/gi;  # HTML double quote
+  s/&\s*nbsp\s*;/ /gi;  # HTML non-breaking space
+  s/&apos;/\'/g;        # HTML apostrophe
+  s/&\s*amp\s*;/&/gi;   # HTML ampersand (last)
+
+  # Regularize known HTML numeric codes:
+  s/&\s*#\s*160\s*;/ /gi;           # no-break space
+  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; # hyphen-minus hyphen-minus
+  s/&\s*#45\s*;/--/g;               # hyphen-minus
+
+  # Convert arbitrary hex or decimal HTML entities to actual characters:
+  s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
+  s/&\#([0-9]+);/pack("U", $1)/ge;
+
+  # Convert other Windows 1252 characters to UTF-8 
+  s/\x{80}/\x{20ac}/g;    # euro sign
+  s/\x{95}/\x{2022}/g;    # bullet
+  s/\x{99}/\x{2122}/g;    # trademark sign
+
+  # Currency and measure conversions:
   s/ (\d\d): (\d\d)/ $1:$2/g;
   s/[\x{20a0}]\x{20ac}]/ EUR /g;
   s/[\x{00A3}]/ GBP /g;
   s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
   s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
-  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
-  s/&\s*#45\s*;/--/g;
-  s/ï¿½c/--/g;
-  s/ ,,/ "/g;
-  s/„/"/g;
-  s/``/"/g;
-  s/''/"/g;
-  s/[「」]/"/g;
-  s/〃/"/g;
-  s/¨/"/g;
+
+  # Ridiculous double conversions(?) (news commentary and Giga-FrEn):
+  s/ï¿½c/--/g;                        # long dash
+  s/\x{e2}\x{20ac}oe/\"/g;            # opening double quote
+  s/\x{e2}\x{20ac}\x{9c}/\"/g;        # opening double quote
+  s/\x{e2}\x{20ac}\x{9d}/\"/g;        # closing double quote
+  s/\x{e2}\x{20ac}\x{2122}/\'/g;      # apostrophe
+  s/\x{e2}\x{20ac}\x{201c}/ -- /g;    # en dash?
+  s/\x{e2}\x{20ac}\x{201d}/ -- /g;    # em dash? 
+  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote?
+  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote?
+
+  # Regularize quotes:
+  s/ˇ/'/g;            # caron
+  s/´/'/g;            # acute accent
+  s/`/'/g;            # grave accent
+  s/ˉ/'/g;            # modified letter macron
+  s/ ,,/ "/g;         # ghetto low-99 quote
+  s/``/"/g;           # latex-style left quote
+  s/''/"/g;           # latex-style right quote
+  s/\x{300c}/"/g;     # left corner bracket
+  s/\x{300d}/"/g;     # right corner bracket
+  s/\x{3003}/"/g;     # ditto mark
+  s/\x{00a8}/"/g;     # diaeresis
+  s/\x{92}/\'/g;      # curly apostrophe
+  s/\x{2019}/\'/g;    # curly apostrophe
+  s/\x{f03d}/\'/g;    # curly apostrophe
+  s/\x{b4}/\'/g;      # curly apostrophe
+  s/\x{2018}/\'/g;    # curly single open quote
+  s/\x{201a}/\'/g;    # low-9 quote
+  s/\x{93}/\"/g;      # curly left quote
+  s/\x{201c}/\"/g;    # curly left quote
+  s/\x{94}/\"/g;      # curly right quote
+  s/\x{201d}/\"/g;    # curly right quote
+  s/\x{2033}/\"/g;    # curly right quote
+  s/\x{201e}/\"/g;    # low-99 quote
+  s/\x{84}/\"/g;      # low-99 quote (bad enc)
+  s/\x{201f}/\"/g;    # high-rev-99 quote
+  s/\x{ab}/\"/g;      # opening guillemet
+  s/\x{bb}/\"/g;      # closing guillemet
+  s/\x{0301}/'/g;     # combining acute accent
+
+  # Space inverted punctuation:
   s/¡/ ¡ /g;
   s/¿/ ¿ /g;
 
+  # Russian abbreviations:
   s/ п. п. / п.п. /g;
   s/ ст. л. / ст.л. /g;
   s/ т. е. / т.е. /g;
@@ -45,24 +112,19 @@ while(<STDIN>) {
   s/ т. н. / т.н. /g;
   s/ т. ч. / т.ч. /g;
   s/ н. э. / н.э. /g;
-  # â<U+0080><U+0099>
-  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
-  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
-  s/ˇ/'/g;
-  s/´/'/g;
-  s/`/'/g;
-  s/’/'/g;
-  s/ ́/'/g;
-  s/‘/'/g;
-  s/ˉ/'/g;
-  s/β/ß/g; # WMT 2010 error
-  s/“/"/g;
-  s/”/"/g;
-  s/«/"/g;
-  s/»/"/g;
+
+  # Convert foreign numerals into Arabic numerals (each tr range must stay within one script, in ascending code-point order)
+  tr/०-९/0-9/; # devanagari
+  tr/౦-౯/0-9/; # telugu
+  tr/೦-೯/0-9/; # kannada
+  tr/\x{0be6}-\x{0bef}/0-9/; # tamil; written as escapes because the Tamil and Kannada zero glyphs are easily confused
+  tr/൦-൯/0-9/; # malayalam
+
+  # Random punctuation:
   tr/！-～/!-~/;
   s/、/,/g;
   # s/。/./g;
+  s/\x{85}/.../g;
   s/…/.../g;
   s/―/--/g;
   s/–/--/g;
@@ -77,11 +139,27 @@ while(<STDIN>) {
   s/â€™/'/g;
   s/â€"/"/g;
   s/؛/;/g;
-		    
+
+  # Regularize ligatures (expand each single-character ligature into its plain letters):
+  s/\x{9c}/oe/g;      # "oe" ligature (Windows-1252 position)
+  s/\x{0153}/oe/g;    # "oe" ligature
+  s/\x{8c}/Oe/g;      # "OE" ligature (Windows-1252 position)
+  s/\x{0152}/Oe/g;    # "OE" ligature
+  s/\x{fb00}/ff/g;    # "ff" ligature
+  s/\x{fb01}/fi/g;    # "fi" ligature
+  s/\x{fb02}/fl/g;    # "fl" ligature
+  s/\x{fb03}/ffi/g;   # "ffi" ligature
+  s/\x{fb04}/ffl/g;   # "ffl" ligature
+
+  s/β/ß/g; # WMT 2010 error
+
+  # Strip extra spaces: 
   s/\s+/ /g;
   s/^\s+//;
   s/\s+$//;
-  s/[\x{00}-\x{1f}]//g;
+
+  # Delete control characters:
+  s/[\x{00}-\x{1f}]//g; 
   print "$_\n";
 }
 
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 43dd80d9..228663f6 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -1,6 +1,65 @@
 ##################### hyphenated words added by Fei since 3/7/05
 ##X-ray
 
+# hindi abbreviation patterns
+जन.
+फर.
+अग.
+सित.
+अक्टू.
+अक्तू.
+नव.
+दिस.
+डी.एल.
+डी.टी.ओ.
+डी.ए.
+ए.एस.आई.
+डी.टी.ओ.
+एम.एस.आर.टी.सी.
+बी.बी.एम.बी.
+डी.एस.पी.
+सी.आर.पी.
+एस.डी.एम.
+सी.डी.पी.ओ.
+बी.डी.ओ.
+एस.डी.ओ.
+एम.पी.पी.
+पी.एच.ई.
+एस.एच.ओ.
+ए.सी.पी.
+यू.पी.
+पी.एम.
+आर.बी.डी.
+वी.पी.
+सी.ए.डी.पी.
+ए.
+बी.
+सी.
+डी.
+ई.
+एफ.
+जी.
+एच.
+आई.
+जे.
+के.
+एल.
+एम.
+एन.
+ओ.
+पी.
+क़यू.
+आर.
+एस.
+टी.
+यू.
+वी.
+डबल्यू.
+एक्स.
+वाई.
+ज़ेड.
+ज़ी.
+
 ##################### words made of punct only
 :-
 :-)
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index e0df16a7..7771201f 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -65,7 +65,7 @@ my $Split_AposD  = 1;  ## 'd
 
 
 ### some patterns
-my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-';
+my $common_right_punc = '\x{0964}|\.|\,|\;|\!|:|\?|\"|\)|\]|\}|\>|\-';
 
 #### step 1: read files
 
@@ -112,7 +112,7 @@ my $new_token_total = 0;
 
 while(<STDIN>){
     chomp();
-
+    s/\x{0970}/./g;  # dev abbreviation character
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
 	## markup
 	print STDOUT "$_\n";
@@ -121,7 +121,7 @@ while(<STDIN>){
 
     my $orig_num = 0;
     my $deep_proc_num = 0;
-
+    s/(\x{0964}+)/ $1/g;  # Devanagari end of sentence
     my $new_line = proc_line($_, \$orig_num, \$deep_proc_num);
 
     $orig_token_total += $orig_num;
@@ -148,7 +148,8 @@ while(<STDIN>){
 	$new_line =~ s/(set|src|tgt|trg)/ $1/g;
     }
 
-    print STDOUT " $new_line\n";
+    chomp $new_line;
+    print STDOUT "$new_line\n";
 }
 
 ########################################################################
@@ -228,6 +229,7 @@ sub proc_token {
 
     ## step 1: check the most common case
     if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+$/i){
+    #if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
 	### most common cases
 	return $token;
     }
@@ -363,7 +365,7 @@ sub deep_proc_token {
 
     ##### step 0: if it mades up of all puncts, remove one punct at a time.
     if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}a-zA-Z\d]/){
-	if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){
+	if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\x{0964}+|\.+|\-+|\_+|\&+)$/){
 	    ## ++ @@@@ !!! ....
 	    return $line;
 	}
@@ -454,7 +456,7 @@ sub deep_proc_token {
 	### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't
 
 	##  'there => ' there   '98 => the same
-	$suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi);
+	$suc += ($line =~ s/^(\'+)([a-z\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}\p{Devanagari}]+)/ $1 $2/gi);
 	
 	##  note that \' and \. could interact: e.g.,  U.S.'s;   're.
 	if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){
@@ -664,10 +666,10 @@ sub deep_proc_token {
 	    return $line;
 	}
 
-	if($line =~ /^(([a-z]\.)+)(\.*)$/i){
+        if ($line =~ /^(([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी)(\.([a-z]|ए|बी|सी|डी|ई|एफ|जी|एच|आई|जे|के|एल|एम|एन|ओ|पी|क़यू|आर|एस|टी|यू|वी|डबल्यू|एक्स|वाई|ज़ेड|ज़ी))+)(\.?)(\.*)$/i){
 	    ## I.B.M.
-	    my $t1 = $1;
-	    my $t3 = $3;
+	    my $t1 = $1 . $5;
+	    my $t3 = $6;
 	    return $t1 . " ". proc_token($t3);
 	}
 
@@ -701,10 +703,3 @@ sub deep_proc_token {
     return $line;
 }
 
-
-
-
-
-
-		   
-
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 5b7933d8..bca954d1 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -14,6 +14,7 @@ fi
 $SUPPORT/utf8-normalize.sh $NORMARGS |
   $SUPPORT/quote-norm.pl |
   $SUPPORT/tokenizer.pl |
+  $SUPPORT/fix-eos.pl |
   sed $SEDFLAGS -e 's/ al - / al-/g' |
   $SUPPORT/fix-contract.pl |
   sed $SEDFLAGS -e 's/^ //' | sed $SEDFLAGS -e 's/ $//' |
author	Patrick Simianer <p@simianer.de>	2014-01-27 10:39:36 +0100
committer	Patrick Simianer <p@simianer.de>	2014-01-27 10:39:36 +0100
commit	64e135092c140310345bb7fcf3dffc8072e652d3 (patch)
tree	127636a2820fb9a7a522157ea4de55cdc0c3c0bd /corpus
parent	58be95f557d2f6a006cc98a23de98125e6b83a32 (diff)
parent	5ab6eb44d67a48ea5b366d8b2878f3da7ef960e4 (diff)