summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-08-03 07:46:54 -0400
committerKenneth Heafield <github@kheafield.com>2012-08-03 07:46:54 -0400
commit122f46c31102b683eaab3ad81a3a98accbc694bb (patch)
tree8d499d789b159ebed25bb23b6983813d064a6296 /corpus
parentac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff)
parent193d137056c3c4f73d66f8db84691d63307de894 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'corpus')
-rwxr-xr-xcorpus/filter-length.pl130
-rwxr-xr-xcorpus/paste-files.pl50
-rwxr-xr-xcorpus/utf8-normalize.sh25
3 files changed, 205 insertions, 0 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
new file mode 100755
index 00000000..d7eacdd7
--- /dev/null
+++ b/corpus/filter-length.pl
@@ -0,0 +1,166 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+# Filters a parallel corpus whose lines look like "source ||| target".
+#
+# Pass 1 reads the whole corpus to count the pairs that are rejected outright
+# (an empty side, a side longer than $MAX_LENGTH words, or a length ratio beyond
+# 9:1) and to collect the log length ratio of every remaining pair in which at
+# least one side has $AUTOMATIC_INCLUDE_IF_SHORTER_THAN words or more.
+# Pass 2 rereads the corpus and prints to STDOUT every pair that survives those
+# checks and whose log ratio lies within $MAX_ZSCORE standard deviations of the
+# mean; pairs in which both sides are short are printed without a ratio test.
+# Progress and summary statistics go to STDERR.
+
+##### EDIT THESE SETTINGS ####################################################
+my $MAX_LENGTH = 99; # discard a sentence if it is longer than this
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
+##############################################################################
+
+die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $corpus = shift @ARGV;
+die "Cannot read from STDIN\n" if $corpus eq '-'; # the corpus is read twice
+
+# Opens $path for reading as UTF-8 and returns the handle; *.gz files are
+# streamed through gunzip. The list form of open keeps the file name away from
+# the shell. $verb ("read" or "reread") only appears in the error message.
+sub open_corpus {
+  my ($path, $verb) = @_;
+  my $fh;
+  if ($path =~ /\.gz$/) {
+    open $fh, '-|', 'gunzip', '-c', $path or die "Can't $verb $path: $!";
+  } else {
+    open $fh, '<', $path or die "Can't $verb $path: $!";
+  }
+  binmode($fh, ":utf8");
+  return $fh;
+}
+
+# Prints a dot to STDERR every 2500 lines and the running count every 100000.
+sub report_progress {
+  my ($count) = @_;
+  if ($count % 100000 == 0) { print STDERR " [$count]\n"; }
+  elsif ($count % 2500 == 0) { print STDERR "."; }
+}
+
+# Returns the number of whitespace-separated words in $text. split ' ' skips
+# leading whitespace, so an indented sentence is not counted one word too long.
+sub count_words {
+  my ($text) = @_;
+  my @words = split ' ', $text;
+  return scalar @words;
+}
+
+my $rat_max = log(9); # absolute cap on |log(flen/elen)|, i.e. a 9:1 ratio
+my $lrm = 0;          # running sum of the collected log ratios
+my $zerof = 0;
+my $zeroe = 0;
+my $absbadrat = 0;
+my $overlene = 0;
+my $overlenf = 0;
+my $lines = 0;
+my @lograts = ();
+my $in = open_corpus($corpus, 'read');
+while (my $line = <$in>) {
+  $lines++;
+  report_progress($lines);
+  my ($sf, $se, @d) = split / \|\|\| /, $line;
+  die "Bad format: $line" if scalar @d != 0 or !defined $se;
+  my $flen = count_words($sf);
+  my $elen = count_words($se);
+  if ($flen == 0) {
+    $zerof++;
+    next;
+  }
+  if ($elen == 0) {
+    $zeroe++;
+    next;
+  }
+  if ($flen > $MAX_LENGTH) {
+    $overlenf++;
+    next;
+  }
+  if ($elen > $MAX_LENGTH) {
+    $overlene++;
+    next;
+  }
+  if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+      $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+    my $lograt = log($flen) - log($elen);
+    if (abs($lograt) > $rat_max) {
+      $absbadrat++;
+      next;
+    }
+    $lrm += $lograt;
+    push @lograts, $lograt;
+  }
+}
+close $in;
+
+print STDERR "\nComputing statistics...\n";
+# A corpus made up only of short or rejected pairs yields no ratios at all;
+# fall back to 0 rather than dividing by zero.
+my $nrat = scalar @lograts;
+my $lmean = $nrat ? $lrm / $nrat : 0;
+
+my $lsd = 0;
+for my $lr (@lograts) {
+  $lsd += ($lr - $lmean)**2;
+}
+$lsd = $nrat ? sqrt($lsd / $nrat) : 0;
+@lograts = ();
+
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf;
+my $discard_rate = $lines ? int(10000 * $pass1_discard / $lines) / 100 : 0;
+print STDERR " Total lines: $lines\n";
+print STDERR " Already discared: $pass1_discard\t(discard rate = $discard_rate%)\n";
+print STDERR " Mean F:E ratio: " . exp($lmean) . "\n";
+print STDERR " StdDev F:E ratio: " . exp($lsd) . "\n";
+print STDERR "Writing...\n";
+$in = open_corpus($corpus, 'reread');
+my $to = 0;
+my $zviol = 0;
+my $worstz = -1;
+my $worst = "\n";
+$lines = 0;
+while (my $line = <$in>) {
+  $lines++;
+  report_progress($lines);
+  my ($sf, $se) = split / \|\|\| /, $line;
+  my $flen = count_words($sf);
+  my $elen = count_words($se);
+  next if ($flen == 0);
+  next if ($elen == 0);
+  next if ($flen > $MAX_LENGTH);
+  next if ($elen > $MAX_LENGTH);
+  if ($elen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN ||
+      $flen >= $AUTOMATIC_INCLUDE_IF_SHORTER_THAN) {
+    my $lograt = log($flen) - log($elen);
+    next if abs($lograt) > $rat_max; # already counted during pass 1
+    # With zero spread every collected ratio equals the mean, so use 0.
+    my $zscore = $lsd > 0 ? abs($lograt - $lmean) / $lsd : 0;
+    if ($elen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN &&
+        $flen > $AUTOMATIC_INCLUDE_IF_SHORTER_THAN && $zscore > $worstz) { $worstz = $zscore; $worst = $line; }
+    if ($zscore > $MAX_ZSCORE) {
+      $zviol++;
+      next;
+    }
+  }
+  # Reached by pairs that passed the ratio test and by pairs too short to be
+  # tested, so both kinds are written and counted.
+  print $line;
+  $to++;
+}
+close $in;
+# Lines that survived pass 1; zero when pass 1 rejected everything.
+my $pass2_lines = $lines - $pass1_discard;
+my $discard_rate2 = $pass2_lines > 0 ? int(10000 * $zviol / $pass2_lines) / 100 : 0;
+print STDERR "\n Lines printed: $to\n Ratio violations: $zviol\t(discard rate = $discard_rate2%)\n";
+print STDERR " Worst z-score: $worstz\n sentence: $worst";
+exit 0;
+
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
new file mode 100755
index 00000000..24c70599
--- /dev/null
+++ b/corpus/paste-files.pl
@@ -0,0 +1,60 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Joins the input files line by line: output line N is line N of every file,
+# in argument order, separated by " ||| ". Files ending in .gz are read through
+# gunzip. Dies if the files differ in line count or if an input line already
+# contains "|||".
+
+die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n Performs a per-line concatenation of all files using the ||| seperator.\n\n" unless scalar @ARGV > 1;
+
+my @fhs = ();
+for my $file (@ARGV) {
+  my $fh;
+  if ($file =~ /\.gz$/) {
+    # List-form pipe open: the file name never passes through a shell.
+    open $fh, '-|', 'gunzip', '-c', $file or die "Can't fork gunzip -c $file: $!";
+  } else {
+    # Three-argument open: the name is taken literally, so leading/trailing
+    # whitespace or characters such as '>' and '|' cannot change the mode.
+    open $fh, '<', $file or die "Can't read $file: $!";
+  }
+  binmode($fh,":utf8");
+  push @fhs, $fh;
+}
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $lc = 0;   # 1-based number of the line being assembled
+my $done = 0; # set once the first file is exhausted
+my $fl = 0;   # true while the progress output ends in a dot, not a newline
+while(1) {
+  my @line;
+  $lc++;
+  if ($lc % 100000 == 0) { print STDERR " [$lc]\n"; $fl = 0; }
+  elsif ($lc % 2500 == 0) { print STDERR "."; $fl = 1; }
+  my $anum = 0; # index into @ARGV of the file being read, for error messages
+  for my $fh (@fhs) {
+    my $r = <$fh>;
+    if (!defined $r) {
+      # EOF on the first file is the normal end; on a later file it means
+      # that file is shorter than the ones before it.
+      die "Mismatched number of lines.\n" if scalar @line > 0;
+      $done = 1;
+      last;
+    }
+    chomp $r;
+    die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+    $anum++;
+    push @line, $r;
+  }
+  last if $done;
+  print STDOUT join(' ||| ', @line) . "\n";
+}
+print STDERR "\n" if $fl;
+# The first file hit EOF; any other file that still has a line is longer.
+for my $fh (@fhs[1 .. $#fhs]) {
+  my $r = <$fh>;
+  die "Mismatched number of lines.\n" if defined $r;
+}
+
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
new file mode 100755
index 00000000..dcf8bc59
--- /dev/null
+++ b/corpus/utf8-normalize.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# This script uses ICU uconv (http://site.icu-project.org/), if it's available
+# to normalize UTF8 text into a standard form. For information about this
+# process, refer to http://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
+# Runs of control characters between 0x00-0x1F are replaced by a single space,
+# repeated spaces are collapsed, and leading/trailing spaces are stripped.
+# Reads STDIN, writes STDOUT.
+
+# command -v is a shell builtin and, with both streams silenced, prints nothing
+# when uconv is missing.
+if command -v uconv > /dev/null 2>&1
+then
+  CMD=(uconv -f utf8 -t utf8 -x Any-NFKC --callback skip)
+else
+  echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
+  CMD=(iconv -f utf8 -t utf8 -c)
+fi
+
+"${CMD[@]}" | /usr/bin/perl -w -e '
+  while (<>) {
+    chomp;
+    s/[\x00-\x1F]+/ /g;
+    s/ +/ /g;
+    s/^ //;
+    s/ $//;
+    print "$_\n";
+  }'
+