From 7928695272b000de7142b91e05959a8fab6b1d2a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 14 Nov 2012 20:33:51 -0500 Subject: major mert clean up, stuff for simple system demo --- corpus/support/quote-norm.pl | 64 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 corpus/support/quote-norm.pl (limited to 'corpus/support/quote-norm.pl') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl new file mode 100755 index 00000000..0c5b9c26 --- /dev/null +++ b/corpus/support/quote-norm.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl -w +use strict; +use utf8; +binmode(STDIN,"utf8"); +binmode(STDOUT,"utf8"); +while() { + chomp; + $_ = " $_ "; + s/&\s*lt\s*;//gi; + s/&\s*squot\s*;/'/gi; + s/&\s*quot\s*;/"/gi; + s/&\s*amp\s*;/&/gi; + s/ (\d\d): (\d\d)/ $1:$2/g; + s/[\x{20a0}]\x{20ac}]/ EUR /g; + s/[\x{00A3}]/ GBP /g; + s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; + s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; + s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; + s/&\s*#45\s*;/--/g; + s/ ,,/ "/g; + s/``/"/g; + s/''/"/g; + s/〃/"/g; + s/¨/"/g; + s/¡/ ¡ /g; + s/¿/ ¿ /g; + s/ˇ/'/g; + s/´/'/g; + s/`/'/g; + s/’/'/g; + s/ ́/'/g; + s/‘/'/g; + s/ˉ/'/g; + s/β/ß/g; # WMT 2010 error + s/“/"/g; + s/”/"/g; + s/«/"/g; + s/»/"/g; + tr/!-~/!-~/; + s/、/,/g; + s/。/./g; + s/…/.../g; + s/―/--/g; + s/–/--/g; + s/─/--/g; + s/—/--/g; + s/•/ * /g; + s/\*/ * /g; + s/،/,/g; + s/؟/?/g; + s/ـ/ /g; + s/à ̄/i/g; + s/’/'/g; + s/â€"/"/g; + s/؛/;/g; + + s/\s+/ /g; + s/^\s+//; + s/\s+$//; + s/[\x{00}-\x{1f}]//g; + print "$_\n"; +} + -- cgit v1.2.3 From a86a37cbe2fb6ffdcf4374f180010a118fed1063 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 5 Dec 2012 20:27:30 -0500 Subject: remove logging, you should be using pv --- corpus/support/quote-norm.pl | 7 ++++++- corpus/support/tokenizer.pl | 9 --------- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'corpus/support/quote-norm.pl') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 0c5b9c26..72b0064d 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -18,13 +18,18 @@ while() { s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; s/&\s*#45\s*;/--/g; + s/�c/--/g; s/ ,,/ "/g; s/``/"/g; s/''/"/g; + s/[「」]/"/g; s/〃/"/g; s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + # â + s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; + s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; s/ˇ/'/g; s/´/'/g; s/`/'/g; @@ -39,7 +44,7 @@ while() { s/»/"/g; tr/!-~/!-~/; s/、/,/g; - s/。/./g; + # s/。/./g; s/…/.../g; s/―/--/g; s/–/--/g; diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 23be00a5..e9c3a37d 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -107,24 +107,15 @@ my $orig_token_total = 0; my $deep_proc_token_total = 0; my $new_token_total = 0; -my $line_total = 0; -my $content_line_total = 0; - while(){ chomp(); - $line_total ++; - if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; } - elsif ($line_total % 2500 == 0) { print STDERR "."; } - if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^ Date: Tue, 22 Jan 2013 20:08:28 -0500 Subject: russian abbrevs --- corpus/support/quote-norm.pl | 14 ++++++++ corpus/support/token_list | 82 +++++++++++++++++++++++++++++++++++++++++++ corpus/support/token_patterns | 1 + 3 files changed, 97 insertions(+) (limited to 'corpus/support/quote-norm.pl') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 72b0064d..e4e5055e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -27,6 +27,20 @@ while() { s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + + s/ п. п. / п.п. /g; + s/ ст. л. / ст.л. /g; + s/ т. е. / т.е. /g; + s/ т. к. / т.к. /g; + s/ т. ч. / т.ч. /g; + s/ т. д. / т.д. /g; + s/ т. п. / т.п. /g; + s/ и. о. / и.о. /g; + s/ с. г. / с.г. /g; + s/ г. р. / г.р. /g; + s/ т. н. / т.н. /g; + s/ т. ч. / т.ч. /g; + s/ н. э. / н.э. /g; # â s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; diff --git a/corpus/support/token_list b/corpus/support/token_list index 28eb4396..d470cb22 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -210,3 +210,85 @@ W. X. Y. Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index b25ac6de..de64fb2a 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,4 +1,5 @@ /^(al|el|ul|e)\-[a-z]+$/ /^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/ /^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3 From 9f0109076f1c95170cbe46e1708597bc6e0f9fd4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 23 Feb 2013 04:23:48 -0500 Subject: one missing quote type --- corpus/support/quote-norm.pl | 1 + 1 file changed, 1 insertion(+) (limited to 'corpus/support/quote-norm.pl') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index e4e5055e..d2980092 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -20,6 +20,7 @@ while() { s/&\s*#45\s*;/--/g; s/�c/--/g; s/ ,,/ "/g; + s/„/"/g; s/``/"/g; s/''/"/g; s/[「」]/"/g; -- cgit v1.2.3 From 3a162d28033d1b9d5241e31f32978dba4eba6296 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 8 Mar 2013 22:44:49 -0500 Subject: few preproc fixes --- corpus/paste-files.pl | 1 + corpus/support/quote-norm.pl | 2 ++ corpus/support/token_list | 2 ++ 3 files changed, 5 insertions(+) (limited to 'corpus/support/quote-norm.pl') diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 4cb424ad..ef2cd937 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -34,6 +34,7 @@ while(1) { $done = 1; last; } + $r =~ s/\r//g; chomp $r; if ($r =~ /\|\|\|/) { $r = ''; diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index d2980092..b104e73c 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -11,6 +11,8 @@ while() { s/&\s*squot\s*;/'/gi; s/&\s*quot\s*;/"/gi; s/&\s*amp\s*;/&/gi; + s/&\s*nbsp\s*;/&/gi; + s/&\s*#\s*160\s*;/ /gi; s/ (\d\d): (\d\d)/ $1:$2/g; s/[\x{20a0}]\x{20ac}]/ EUR /g; s/[\x{00A3}]/ GBP /g; diff --git a/corpus/support/token_list b/corpus/support/token_list index d470cb22..366cd7ff 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -37,6 +37,8 @@ tel. 10. ##################### abbreviation: words that contain period. +EE.UU. +ee.uu. U.A.E Ala. Ph.D. -- cgit v1.2.3