summary refs log tree commit diff
path: root/corpus/paste-files.pl
diff options
context:
space:
mode:
author Paul Baltescu <pauldb89@gmail.com> 2013-02-21 14:13:55 +0000
committer Paul Baltescu <pauldb89@gmail.com> 2013-02-21 14:13:55 +0000
commit bca26d953a774b8efca12f30407390b3f5eef9d0 (patch)
tree fe922de5c89b1844f677d550dcc24e87edd67a55 /corpus/paste-files.pl
parent 54a1c0e2bde259e3acc9c0a8ec8da3c7704e80ca (diff)
parent 95c364f2cb002241c4a62bedb1c5ef6f1e9a7f22 (diff)
Merge branch 'master' of https://github.com/pauldb89/cdec
Diffstat (limited to 'corpus/paste-files.pl')
-rwxr-xr-x corpus/paste-files.pl 12
1 files changed, 11 insertions, 1 deletions
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 24c70599..4cb424ad 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -17,6 +17,7 @@ for my $file (@ARGV) {
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");
+my $bad = 0;
my $lc = 0;
my $done = 0;
my $fl = 0;
@@ -34,7 +35,15 @@ while(1) {
last;
}
chomp $r;
- die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+ if ($r =~ /\|\|\|/) {
+ $r = '';
+ $bad++;
+ }
+ warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+ $r =~ s/\|\|\|/ /g;
+ $r =~ s/\s+/ /g;
+ $r =~ s/^ +//;
+ $r =~ s/ +$//;
$anum++;
push @line, $r;
}
@@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) {
my $r = <$fh>;
die "Mismatched number of lines.\n" if defined $r;
}
+print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0;