summary refs log tree commit diff
path: root/corpus/filter-length.pl
diff options
context:
space:
mode:
author Avneesh Saluja <asaluja@gmail.com> 2013-03-28 18:28:16 -0700
committer Avneesh Saluja <asaluja@gmail.com> 2013-03-28 18:28:16 -0700
commit 3d8d656fa7911524e0e6885647173474524e0784 (patch)
tree 81b1ee2fcb67980376d03f0aa48e42e53abff222 /corpus/filter-length.pl
parent be7f57fdd484e063775d7abf083b9fa4c403b610 (diff)
parent 96fedabebafe7a38a6d5928be8fff767e411d705 (diff)
fixed conflicts
Diffstat (limited to 'corpus/filter-length.pl')
-rwxr-xr-x corpus/filter-length.pl 34
1 files changed, 27 insertions, 7 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index d7eacdd7..2e257cda 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -3,20 +3,30 @@ use strict;
use utf8;
##### EDIT THESE SETTINGS ####################################################
-my $MAX_LENGTH = 99; # discard a sentence if it is longer than this
-my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
##############################################################################
-die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+if (scalar @ARGV == 2) {
+ my $fp = shift @ARGV;
+ die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/;
+ $MAX_LENGTH=$1;
+}
+
my $corpus = shift @ARGV;
+
die "Cannot read from STDIN\n" if $corpus eq '-';
my $ff = "<$corpus";
$ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/;
+print STDERR "Max line length (monolingual): $MAX_LENGTH\n";
+print STDERR " Parallel corpus: $corpus\n";
+
open F,$ff or die "Can't read $corpus: $!";
binmode(F,":utf8");
@@ -24,6 +34,7 @@ my $rat_max = log(9);
my $lrm = 0;
my $zerof = 0;
my $zeroe = 0;
+my $bad_format = 0;
my $absbadrat = 0;
my $overlene = 0;
my $overlenf = 0;
@@ -33,8 +44,14 @@ while(<F>) {
$lines++;
if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
elsif ($lines % 2500 == 0) { print STDERR "."; }
- my ($sf, $se, @d) = split / \|\|\| /;
- die "Bad format: $_" if scalar @d != 0 or !defined $se;
+ my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
+ if (scalar @d != 0 or !defined $se) {
+ $bad_format++;
+ if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
+ die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";
+ }
+ next;
+ }
my @fs = split /\s+/, $sf;
my @es = split /\s+/, $se;
my $flen = scalar @fs;
@@ -78,7 +95,7 @@ for my $lr (@lograts) {
$lsd = sqrt($lsd / scalar @lograts);
@lograts = ();
-my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf;
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf + $bad_format;
my $discard_rate = int(10000 * $pass1_discard / $lines) / 100;
print STDERR " Total lines: $lines\n";
print STDERR " Already discared: $pass1_discard\t(discard rate = $discard_rate%)\n";
@@ -96,7 +113,8 @@ while(<F>) {
$lines++;
if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
elsif ($lines % 2500 == 0) { print STDERR "."; }
- my ($sf, $se) = split / \|\|\| /;
+ my ($sf, $se, @d) = split / \|\|\| /;
+ if (scalar @d != 0 or !defined $se) { next; }
my @fs = split /\s+/, $sf;
my @es = split /\s+/, $se;
my $flen = scalar @fs;
@@ -120,6 +138,8 @@ while(<F>) {
next;
}
print;
+ } else {
+ print;
}
$to++;
}