diff options
| author | Paul Baltescu <pauldb89@gmail.com> | 2013-02-21 14:13:55 +0000 | 
|---|---|---|
| committer | Paul Baltescu <pauldb89@gmail.com> | 2013-02-21 14:13:55 +0000 | 
| commit | b5491898549c61bd799d199aa9178a8394a1ef69 (patch) | |
| tree | fb2686a2aae03ff07bcdf4cd47e8c3191eff8d1e /corpus | |
| parent | 0187447a643c3ea262b13b3052cb1531990eafe6 (diff) | |
| parent | c17d9c23d023a5c08656376944f636180f0a437b (diff) | |
Merge branch 'master' of https://github.com/pauldb89/cdec
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/add-self-translations.pl | 2 | ||||
| -rwxr-xr-x | corpus/cut-corpus.pl | 2 | ||||
| -rwxr-xr-x | corpus/filter-length.pl | 22 | ||||
| -rwxr-xr-x | corpus/paste-files.pl | 12 | ||||
| -rwxr-xr-x | corpus/support/quote-norm.pl | 14 | ||||
| -rw-r--r-- | corpus/support/token_list | 82 | ||||
| -rw-r--r-- | corpus/support/token_patterns | 2 | ||||
| -rwxr-xr-x | corpus/support/tokenizer.pl | 3 | 
8 files changed, 130 insertions, 9 deletions
| diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl index 153bc454..d707ce29 100755 --- a/corpus/add-self-translations.pl +++ b/corpus/add-self-translations.pl @@ -6,7 +6,7 @@ use strict;  my %df;  my %def;  while(<>) { -  print; +#  print;    chomp;    my ($sf, $se) = split / \|\|\| /;    die "Format error: $_\n" unless defined $sf && defined $se; diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index 7daac0e2..0af3b23c 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -22,7 +22,7 @@ for my $ff (@ind) {  while(<>) {    chomp; -  my @fields = split / \|\|\| /; +  my @fields = split /\s*\|\|\|\s*/;    my @sf;    for my $i (@o) {      my $y = $fields[$i]; diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 70032ca7..2e257cda 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,20 +3,30 @@ use strict;  use utf8;  ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 99;  # discard a sentence if it is longer than this -my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include  my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?  ############################################################################## -die "Usage: $0 corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n  or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1; +die "Usage: $0 [-NNN] corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than NNN words (where NNN\n  is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n  mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;  binmode(STDOUT,":utf8");  binmode(STDERR,":utf8"); +my $MAX_LENGTH = 150;  # discard a sentence if it is longer than this +if (scalar @ARGV == 2) { +  my $fp = shift @ARGV; +  die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/; +  $MAX_LENGTH=$1; +} +  my $corpus = shift @ARGV; +  die "Cannot read from STDIN\n" if $corpus eq '-';  my $ff = "<$corpus";  $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/; +print STDERR "Max line length (monolingual): $MAX_LENGTH\n"; +print STDERR "              Parallel corpus: $corpus\n"; +  open F,$ff or die "Can't read $corpus: $!";  binmode(F,":utf8"); @@ -34,11 +44,11 @@ while(<F>) {    $lines++;    if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }    elsif ($lines % 2500 == 0) { print STDERR "."; } -  my ($sf, $se, @d) = split / \|\|\| /; +  my ($sf, $se, @d) = split /\s*\|\|\|\s*/;    if (scalar @d != 0 or !defined $se) {      $bad_format++;      if ($bad_format > 100 && ($bad_format / $lines) > 0.02) { -      die "Corpus appears to be incorretly formatted, example: $_"; +      die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";      }      next;    } @@ -128,6 +138,8 @@ while(<F>) {        next;      }      print; +  } else { +    print;    }    $to++;  } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 24c70599..4cb424ad 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -17,6 +17,7 @@ for my $file (@ARGV) {  binmode(STDOUT,":utf8");  binmode(STDERR,":utf8"); +my $bad = 0;  my $lc = 0;  my $done = 0;  my $fl = 0; @@ -34,7 +35,15 @@ while(1) {        last;      }      chomp $r; -    die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; +    if ($r =~ /\|\|\|/) { +      $r = ''; +      $bad++; +    } +    warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; +    $r =~ s/\|\|\|/ /g; +    $r =~ s/\s+/ /g; +    $r =~ s/^ +//; +    $r =~ s/ +$//;      $anum++;      push @line, $r;    } @@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) {    my $r = <$fh>;    die "Mismatched number of lines.\n" if defined $r;  } +print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0; diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 72b0064d..e4e5055e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -27,6 +27,20 @@ while(<STDIN>) {    s/¨/"/g;    s/¡/ ¡ /g;    s/¿/ ¿ /g; + +  s/ п. п. / п.п. /g; +  s/ ст. л. / ст.л. /g; +  s/ т. е. / т.е. /g; +  s/ т. к. / т.к. /g; +  s/ т. ч. / т.ч. /g; +  s/ т. д. / т.д. /g; +  s/ т. п. / т.п. /g; +  s/ и. о. / и.о. /g; +  s/ с. г. / с.г. /g; +  s/ г. р. / г.р. /g; +  s/ т. н. / т.н. /g; +  s/ т. ч. / т.ч. /g; +  s/ н. э. / н.э. /g;    # â<U+0080><U+0099>    s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;    s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; diff --git a/corpus/support/token_list b/corpus/support/token_list index 28eb4396..d470cb22 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -210,3 +210,85 @@ W.  X.  Y.  Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index 8e69432b..de64fb2a 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,5 @@  /^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/  /^(\d|\d\d|\d\d\d)\.$/ diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e9c3a37d..0350a894 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -73,6 +73,7 @@ my $dict_file = "$workdir/token_list";  my $word_patt_file = "$workdir/token_patterns";  open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8");  # read in the list of words that should not be segmented,   ##  e.g.,"I.B.M.", co-operation. @@ -89,6 +90,7 @@ while(<$dict_fp>){  }  open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8");  my @word_patts = ();  my $word_patt_num = 0;  while(<$patt_fp>){ @@ -147,7 +149,6 @@ while(<STDIN>){      print STDOUT " $new_line\n";  } -print STDERR "\n";  ######################################################################## | 
