summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x corpus/filter-length.pl 4
-rwxr-xr-x corpus/paste-files.pl 8
-rwxr-xr-x corpus/support/tokenizer.pl 1
3 files changed, 6 insertions, 7 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 38837f14..2e257cda 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -44,11 +44,11 @@ while(<F>) {
$lines++;
if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
elsif ($lines % 2500 == 0) { print STDERR "."; }
- my ($sf, $se, @d) = split / \|\|\| /;
+ my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
if (scalar @d != 0 or !defined $se) {
$bad_format++;
if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
- die "Corpus appears to be incorretly formatted, example: $_";
+ die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";
}
next;
}
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 0b788386..4cb424ad 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -41,9 +41,9 @@ while(1) {
}
warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
$r =~ s/\|\|\|/ /g;
- $r =~ s/ +//g;
- $r =~ s/^ //;
- $r =~ s/ $//;
+ $r =~ s/\s+/ /g;
+ $r =~ s/^ +//;
+ $r =~ s/ +$//;
$anum++;
push @line, $r;
}
@@ -56,5 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) {
my $r = <$fh>;
die "Mismatched number of lines.\n" if defined $r;
}
-print STDERR "Bad lines containing ||| were $bad\n";
+print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index e9c3a37d..b5190858 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -147,7 +147,6 @@ while(<STDIN>){
print STDOUT " $new_line\n";
}
-print STDERR "\n";
########################################################################