summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
Diffstat (limited to 'corpus')
-rwxr-xr-xcorpus/filter-length.pl14
1 files changed, 12 insertions, 2 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 3cfa40cc..38837f14 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -3,20 +3,30 @@ use strict;
use utf8;
##### EDIT THESE SETTINGS ####################################################
-my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
##############################################################################
-die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+if (scalar @ARGV == 2) {
+ my $fp = shift @ARGV;
+ die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/;
+ $MAX_LENGTH=$1;
+}
+
my $corpus = shift @ARGV;
+
die "Cannot read from STDIN\n" if $corpus eq '-';
my $ff = "<$corpus";
$ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/;
+print STDERR "Max line length (monolingual): $MAX_LENGTH\n";
+print STDERR " Parallel corpus: $corpus\n";
+
open F,$ff or die "Can't read $corpus: $!";
binmode(F,":utf8");