diff options
-rwxr-xr-x | corpus/filter-length.pl | 14 |
1 files changed, 12 insertions, 2 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 3cfa40cc..38837f14 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,20 +3,30 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 150; # discard a sentence if it is longer than this my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? ############################################################################## -die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1; +die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2; binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +if (scalar @ARGV == 2) { + my $fp = shift @ARGV; + die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/; + $MAX_LENGTH=$1; +} + my $corpus = shift @ARGV; + die "Cannot read from STDIN\n" if $corpus eq '-'; my $ff = "<$corpus"; $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/; +print STDERR "Max line length (monolingual): $MAX_LENGTH\n"; +print STDERR " Parallel corpus: $corpus\n"; + open F,$ff or die "Can't read $corpus: $!"; binmode(F,":utf8"); |