diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/filter-length.pl | 14 | 
1 files changed, 12 insertions, 2 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 3cfa40cc..38837f14 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,20 +3,30 @@ use strict;  use utf8;  ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 150;  # discard a sentence if it is longer than this  my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include  my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?  ############################################################################## -die "Usage: $0 corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n  or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1; +die "Usage: $0 [-NNN] corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than NNN words (where NNN\n  is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n  mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;  binmode(STDOUT,":utf8");  binmode(STDERR,":utf8"); +my $MAX_LENGTH = 150;  # discard a sentence if it is longer than this +if (scalar @ARGV == 2) { +  my $fp = shift @ARGV; +  die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/; +  $MAX_LENGTH=$1; +} +  my $corpus = shift @ARGV; +  die "Cannot read from STDIN\n" if $corpus eq '-';  my $ff = "<$corpus";  $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/; +print STDERR "Max line length (monolingual): $MAX_LENGTH\n"; +print STDERR "              Parallel corpus: $corpus\n"; +  open F,$ff or die "Can't read $corpus: $!";  binmode(F,":utf8");  | 
