diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-01-15 01:20:00 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-01-15 01:20:00 -0500 |
commit | bae5fe99037ae7e101953ad0df118127191c711c (patch) | |
tree | 6a5330da2ae70b7b10fede5dc80065c2cce0bd02 /corpus/filter-length.pl | |
parent | 47f36b8062fb2c72144682401dea35b4f4d1333d (diff) |
corpus files
Diffstat (limited to 'corpus/filter-length.pl')
-rwxr-xr-x | corpus/filter-length.pl | 6 |
1 file changed, 4 insertions, 2 deletions
```diff
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 70032ca7..3cfa40cc 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -3,8 +3,8 @@ use strict;
 use utf8;
 
 ##### EDIT THESE SETTINGS ####################################################
-my $MAX_LENGTH = 99; # discard a sentence if it is longer than this
-my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
 my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
 ##############################################################################
 
@@ -128,6 +128,8 @@ while(<F>) {
       next;
     }
     print;
+  } else {
+    print;
   }
   $to++;
 }
```