diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-01-21 16:53:05 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-01-21 16:53:05 -0500 |
commit | e1f71a6ce868d116f04082b697a8d61afcd625f1 (patch) | |
tree | 0b026f8f44b5193ff6d9561990e6eada19f7e863 /corpus/filter-length.pl | |
parent | 7cac43b858f3b681555bf0578f54b1f822c43207 (diff) |
a little bit of cleanup
Diffstat (limited to 'corpus/filter-length.pl')
-rwxr-xr-x | corpus/filter-length.pl | 4 |
1 files changed, 2 insertions, 2 deletions
```diff
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 38837f14..2e257cda 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -44,11 +44,11 @@ while(<F>) {
   $lines++;
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
-  my ($sf, $se, @d) = split / \|\|\| /;
+  my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
   if (scalar @d != 0 or !defined $se) {
     $bad_format++;
     if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
-      die "Corpus appears to be incorretly formatted, example: $_";
+      die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";
     }
     next;
   }
```