summary | refs | log | tree | commit | diff
path: root/corpus/filter-length.pl
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-01-15 01:20:00 -0500
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-01-15 01:20:00 -0500
commit: bae5fe99037ae7e101953ad0df118127191c711c (patch)
tree: 6a5330da2ae70b7b10fede5dc80065c2cce0bd02 /corpus/filter-length.pl
parent: 47f36b8062fb2c72144682401dea35b4f4d1333d (diff)
corpus files
Diffstat (limited to 'corpus/filter-length.pl')
-rwxr-xr-x  corpus/filter-length.pl  6
1 files changed, 4 insertions, 2 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 70032ca7..3cfa40cc 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -3,8 +3,8 @@ use strict;
use utf8;
##### EDIT THESE SETTINGS ####################################################
-my $MAX_LENGTH = 99; # discard a sentence if it is longer than this
-my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
##############################################################################
@@ -128,6 +128,8 @@ while(<F>) {
next;
}
print;
+ } else {
+ print;
}
$to++;
}