summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-07-28 12:11:30 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2012-07-28 12:11:30 -0400
commit5276e368ad643434fd73527329ed70507ee49dfc (patch)
tree3735753fbb3ee873176ab508d4c1a7578431edc8
parent733e1b1507d27d4f53055f740e8098f56215ab8f (diff)
a couple of tools for cleaning corpora
-rwxr-xr-xcorpus/filter-length.pl130
-rwxr-xr-xcorpus/utf8-normalize.sh25
2 files changed, 155 insertions, 0 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
new file mode 100755
index 00000000..d7eacdd7
--- /dev/null
+++ b/corpus/filter-length.pl
@@ -0,0 +1,130 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+##### EDIT THESE SETTINGS ####################################################
+my $MAX_LENGTH = 99; # discard a sentence if it is longer than this
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
+##############################################################################
+
+die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $corpus = shift @ARGV;
+die "Cannot read from STDIN\n" if $corpus eq '-';
+my $ff = "<$corpus";
+$ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/;
+
+open F,$ff or die "Can't read $corpus: $!";
+binmode(F,":utf8");
+
+my $rat_max = log(9);
+my $lrm = 0;
+my $zerof = 0;
+my $zeroe = 0;
+my $absbadrat = 0;
+my $overlene = 0;
+my $overlenf = 0;
+my $lines = 0;
+my @lograts = ();
+while(<F>) {
+ $lines++;
+ if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+ elsif ($lines % 2500 == 0) { print STDERR "."; }
+ my ($sf, $se, @d) = split / \|\|\| /;
+ die "Bad format: $_" if scalar @d != 0 or !defined $se;
+ my @fs = split /\s+/, $sf;
+ my @es = split /\s+/, $se;
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ if ($flen == 0) {
+ $zerof++;
+ next;
+ }
+ if ($elen == 0) {
+ $zeroe++;
+ next;
+ }
+ if ($flen > $MAX_LENGTH) {
+ $overlenf++;
+ next;
+ }
+ if ($elen > $MAX_LENGTH) {
+ $overlene++;
+ next;
+ }
+ if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+ $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+ my $lograt = log($flen) - log($elen);
+ if (abs($lograt) > $rat_max) {
+ $absbadrat++;
+ next;
+ }
+ $lrm += $lograt;
+ push @lograts, $lograt;
+ }
+}
+close F;
+
+print STDERR "\nComputing statistics...\n";
+my $lmean = $lrm / scalar @lograts;
+
+my $lsd = 0;
+for my $lr (@lograts) {
+ $lsd += ($lr - $lmean)**2;
+}
+$lsd = sqrt($lsd / scalar @lograts);
+@lograts = ();
+
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf;
+my $discard_rate = int(10000 * $pass1_discard / $lines) / 100;
+print STDERR " Total lines: $lines\n";
+print STDERR " Already discared: $pass1_discard\t(discard rate = $discard_rate%)\n";
+print STDERR " Mean F:E ratio: " . exp($lmean) . "\n";
+print STDERR " StdDev F:E ratio: " . exp($lsd) . "\n";
+print STDERR "Writing...\n";
+open F,$ff or die "Can't reread $corpus: $!";
+binmode(F,":utf8");
+my $to = 0;
+my $zviol = 0;
+my $worstz = -1;
+my $worst = "\n";
+$lines = 0;
+while(<F>) {
+ $lines++;
+ if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+ elsif ($lines % 2500 == 0) { print STDERR "."; }
+ my ($sf, $se) = split / \|\|\| /;
+ my @fs = split /\s+/, $sf;
+ my @es = split /\s+/, $se;
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ next if ($flen == 0);
+ next if ($elen == 0);
+ next if ($flen > $MAX_LENGTH);
+ next if ($elen > $MAX_LENGTH);
+ if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+ $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+ my $lograt = log($flen) - log($elen);
+ if (abs($lograt) > $rat_max) {
+ $absbadrat++;
+ next;
+ }
+ my $zscore = abs($lograt - $lmean) / $lsd;
+ if ($elen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN &&
+ $flen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN && $zscore > $worstz) { $worstz = $zscore; $worst = $_; }
+ if ($zscore > $MAX_ZSCORE) {
+ $zviol++;
+ next;
+ }
+ print;
+ }
+ $to++;
+}
+my $discard_rate2 = int(10000 * $zviol / ($lines - $pass1_discard)) / 100;
+print STDERR "\n Lines printed: $to\n Ratio violations: $zviol\t(discard rate = $discard_rate2%)\n";
+print STDERR " Worst z-score: $worstz\n sentence: $worst";
+exit 0;
+
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
new file mode 100755
index 00000000..dcf8bc59
--- /dev/null
+++ b/corpus/utf8-normalize.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This script uses ICU uconv (http://site.icu-project.org/), if it's available
+# to normalize UTF8 text into a standard form. For information about this
+# process, refer to http://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
+# Escape characters between 0x00-0x1F are removed
+
+if which uconv > /dev/null
+then
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+else
+ echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
+ CMD="iconv -f utf8 -t utf8 -c"
+fi
+
+$CMD | /usr/bin/perl -w -e '
+ while (<>) {
+ chomp;
+ s/[\x00-\x1F]+/ /g;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ print "$_\n";
+ }'
+