From e1f71a6ce868d116f04082b697a8d61afcd625f1 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 21 Jan 2013 16:53:05 -0500
Subject: a little bit of cleanup

---
 corpus/filter-length.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'corpus/filter-length.pl')

diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 38837f14..2e257cda 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -44,11 +44,11 @@ while() {
   $lines++;
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
-  my ($sf, $se, @d) = split / \|\|\| /;
+  my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
   if (scalar @d != 0 or !defined $se) {
     $bad_format++;
     if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
-      die "Corpus appears to be incorretly formatted, example: $_";
+      die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";
     }
     next;
   }
-- 
cgit v1.2.3