diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/filter-length.pl | 130 | ||||
| -rwxr-xr-x | corpus/paste-files.pl | 50 | ||||
| -rwxr-xr-x | corpus/utf8-normalize.sh | 25 | 
3 files changed, 205 insertions, 0 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
new file mode 100755
index 00000000..d7eacdd7
--- /dev/null
+++ b/corpus/filter-length.pl
@@ -0,0 +1,130 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+# Two-pass filter over a corpus of "f ||| e" lines: pass 1 gathers length-ratio statistics, pass 2 prints the survivors.
+##### EDIT THESE SETTINGS ####################################################
+my $MAX_LENGTH = 99;  # discard a sentence if it is longer than this
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
+##############################################################################
+
+die "Usage: $0 corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n  or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $corpus = shift @ARGV;
+die "Cannot read from STDIN\n" if $corpus eq '-';  # the corpus is opened and read twice
+my ($mode, @src) = ('<', $corpus);
+($mode, @src) = ('-|', 'gunzip', '-c', $corpus) if $corpus =~ /\.gz$/;
+# 3-arg / list-form open: the file name is passed as data, not parsed as a mode or by a shell
+open(F, $mode, @src) or die "Can't read $corpus: $!";
+binmode(F,":utf8");
+
+my $rat_max = log(9);  # hard cap on |log(flen/elen)|, applied before any statistics
+my $lrm = 0;           # running sum of the log ratios collected in pass 1
+my $zerof = 0;
+my $zeroe = 0;
+my $absbadrat = 0;
+my $overlene = 0;
+my $overlenf = 0;
+my $lines = 0;
+my @lograts = ();
+while(<F>) {
+  $lines++;
+  if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+  elsif ($lines % 2500 == 0) { print STDERR "."; }
+  my ($sf, $se, @d) = split / \|\|\| /;
+  die "Bad format: $_" if scalar @d != 0 or !defined $se;
+  my @fs = split ' ', $sf;  # split ' ' skips leading whitespace, so no empty first token is counted
+  my @es = split ' ', $se;
+  my $flen = scalar @fs;
+  my $elen = scalar @es;
+  if ($flen == 0) {
+    $zerof++;
+    next;
+  }
+  if ($elen == 0) {
+    $zeroe++;
+    next;
+  }
+  if ($flen > $MAX_LENGTH) {
+    $overlenf++;
+    next;
+  }
+  if ($elen > $MAX_LENGTH) {
+    $overlene++;
+    next;
+  }
+  if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+      $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+    my $lograt = log($flen) - log($elen);
+    if (abs($lograt) > $rat_max) {
+      $absbadrat++;
+      next;
+    }
+    $lrm += $lograt;
+    push @lograts, $lograt;
+  }
+}
+close F;
+
+print STDERR "\nComputing statistics...\n";
+my $lmean = @lograts ? $lrm / scalar @lograts : 0;  # no scored pairs: avoid dividing by zero
+
+my $lsd = 0;
+for my $lr (@lograts) {
+  $lsd += ($lr - $lmean)**2;
+}
+$lsd = @lograts ? sqrt($lsd / scalar @lograts) : 0;
+@lograts = ();
+
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf;
+my $discard_rate = $lines ? int(10000 * $pass1_discard / $lines) / 100 : 0;  # empty corpus: rate is 0
+print STDERR "      Total lines: $lines\n";
+print STDERR " Already discared: $pass1_discard\t(discard rate = $discard_rate%)\n";
+print STDERR "   Mean F:E ratio: " . exp($lmean) . "\n";
+print STDERR " StdDev F:E ratio: " . exp($lsd) . "\n";
+print STDERR "Writing...\n";
+open(F, $mode, @src) or die "Can't reread $corpus: $!";
+binmode(F,":utf8");
+my $to = 0;
+my $zviol = 0;
+my $worstz = -1;
+my $worst = "\n";
+$lines = 0;
+while(<F>) {
+  $lines++;
+  if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+  elsif ($lines % 2500 == 0) { print STDERR "."; }
+  my ($sf, $se) = split / \|\|\| /;
+  my @fs = split ' ', $sf;
+  my @es = split ' ', $se;
+  my $flen = scalar @fs;
+  my $elen = scalar @es;
+  next if ($flen == 0);
+  next if ($elen == 0);
+  next if ($flen > $MAX_LENGTH);
+  next if ($elen > $MAX_LENGTH);
+  if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+      $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+    my $lograt = log($flen) - log($elen);
+    if (abs($lograt) > $rat_max) {
+      # this pair was already counted in $absbadrat during pass 1
+      next;
+    }
+    my $zscore = $lsd > 0 ? abs($lograt - $lmean) / $lsd : 0;  # zero spread: nothing is an outlier
+    if ($elen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN &&
+        $flen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN && $zscore > $worstz) { $worstz = $zscore; $worst = $_; }
+    if ($zscore > $MAX_ZSCORE) {
+      $zviol++;
+      next;
+    }
+  }
+  print;  # reached by scored pairs that passed and by pairs with both sides under the threshold
+  $to++;
+}
+my $discard_rate2 = $lines > $pass1_discard ? int(10000 * $zviol / ($lines - $pass1_discard)) / 100 : 0;
+print STDERR "\n    Lines printed: $to\n Ratio violations: $zviol\t(discard rate = $discard_rate2%)\n";
+print STDERR "    Worst z-score: $worstz\n         sentence: $worst";
+exit 0;
+
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
new file mode 100755
index 00000000..24c70599
--- /dev/null
+++ b/corpus/paste-files.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+use strict;
+# Joins line N of every input file into one output line, with the fields separated by ' ||| '.
+die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n  Performs a per-line concatenation of all files using the ||| seperator.\n\n" unless scalar @ARGV > 1;
+
+my @fhs = ();
+for my $file (@ARGV) {
+  my $fh;
+  if ($file =~ /\.gz$/) {
+    open $fh, '-|', 'gunzip', '-c', $file or die "Can't fork gunzip -c $file: $!";  # list form: no shell involved
+  } else {
+    open $fh, '<', $file or die "Can't read $file: $!";
+  }
+  binmode($fh,":utf8");
+  push @fhs, $fh;
+}
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $lc = 0;    # 1-based number of the line currently being assembled
+my $done = 0;  # set once the first file is exhausted
+my $fl = 0;    # true while a row of progress dots still needs its closing newline
+while(1) {
+  my @line;
+  $lc++;
+  if ($lc % 100000 == 0) { print STDERR " [$lc]\n"; $fl = 0; }
+  elsif ($lc % 2500 == 0) { print STDERR "."; $fl = 1; }
+  my $anum = 0;  # index into @ARGV of the file being read, used in error messages
+  for my $fh (@fhs) {
+    my $r = <$fh>;
+    if (!defined $r) {
+      die "Mismatched number of lines.\n" if scalar @line > 0;  # a later file ended before the first one
+      $done = 1;
+      last;
+    }
+    chomp $r;
+    die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+    $anum++;
+    push @line, $r;
+  }
+  last if $done;
+  print STDOUT join(' ||| ', @line) . "\n";
+}
+print STDERR "\n" if $fl;
+for (my $i = 1; $i < scalar @fhs; $i++) {  # the first file is exhausted; no other file may have lines left
+  my $fh = $fhs[$i];
+  my $r = <$fh>;
+  die "Mismatched number of lines.\n" if defined $r;
+}
+
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
new file mode 100755
index 00000000..dcf8bc59
--- /dev/null
+++ b/corpus/utf8-normalize.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This script uses ICU uconv (http://site.icu-project.org/), if it's available
+# to normalize UTF8 text into a standard form. For information about this
+# process, refer to http://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
+# Runs of control characters (0x00-0x1F) become one space; runs of spaces are squeezed and edge spaces trimmed
+
+if command -v uconv > /dev/null 2>&1
+then
+  CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+else
+  echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
+  CMD="iconv -f utf8 -t utf8 -c"
+fi
+
+$CMD | /usr/bin/perl -w -e '
+ while (<>) {
+     chomp;
+      s/[\x00-\x1F]+/ /g;
+      s/  +/ /g;
+      s/^ //;
+      s/ $//;
+      print "$_\n";
+    }'
+
