diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-07-28 12:41:01 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-07-28 12:41:01 -0400 |
commit | b81b2e85bdfd5e9dda98a6e448e6354ca0c6d26b (patch) | |
tree | 0b6eec2d899f69300012a6d2944115123686b7ac /corpus | |
parent | 306e0ba4754c6c4f460536cfe8c3f118dc1cc175 (diff) |
script to paste files together with the triple pipe separator
Diffstat (limited to 'corpus')
-rwxr-xr-x | corpus/paste-files.pl | 50 |
1 files changed, 50 insertions, 0 deletions
#!/usr/bin/perl -w
#
# paste-files.pl - per-line concatenation of parallel text files.
#
# Reads every file named on the command line in lockstep and prints one
# output line per input line, joining the corresponding lines with the
# ' ||| ' separator.  Files whose names end in .gz are decompressed through
# gunzip; the name '-' reads standard input.  Input, STDOUT and STDERR are
# all put into :utf8 mode.
#
# Dies when fewer than two files are given, when a file cannot be opened,
# when the files do not all have the same number of lines, or when an input
# line already contains the '|||' separator.  Progress is reported on
# STDERR: a '.' every 2500 lines and the running count every 100000 lines.
use strict;
use warnings;

die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n Performs a per-line concatenation of all files using the ||| separator.\n\n" unless scalar @ARGV > 1;

# One input handle per command-line argument, in argument order.
my @fhs = ();
for my $file (@ARGV) {
    my $fh;
    if ($file =~ /\.gz$/) {
        # List-form pipe open: the file name reaches gunzip as a single
        # argument and never passes through a shell, so names containing
        # spaces or shell metacharacters are safe.  '--' stops a name that
        # starts with '-' from being parsed as an option.
        open $fh, '-|', 'gunzip', '-c', '--', $file
            or die "Can't fork gunzip -c $file: $!";
    }
    elsif ($file eq '-') {
        # Two-argument open used to map '-' to standard input; keep that
        # behaviour now that the general case uses three-argument open.
        open $fh, '<&', \*STDIN or die "Can't read $file: $!";
    }
    else {
        # Three-argument open: the name is taken literally (no whitespace
        # stripping, no mode characters interpreted from the name).
        open $fh, '<', $file or die "Can't read $file: $!";
    }
    binmode($fh, ":utf8");
    push @fhs, $fh;
}
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $lc   = 0;    # 1-based number of the line currently being read
my $done = 0;    # set once the first file reaches end-of-file
my $fl   = 0;    # true while a row of progress dots is left unterminated
while (1) {
    my @line;
    $lc++;
    my $anum = 0;    # index into @ARGV of the file being read (for messages)
    for my $fh (@fhs) {
        my $r = <$fh>;
        if (!defined $r) {
            # EOF on a later file after earlier files still produced a line
            # for this row means the files differ in length.
            die "Mismatched number of lines.\n" if scalar @line > 0;
            $done = 1;
            last;
        }
        chomp $r;
        die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
        $anum++;
        push @line, $r;
    }
    last if $done;
    print STDOUT join(' ||| ', @line) . "\n";

    # Report progress only for rows that were actually written, so no tick
    # is printed for the read attempt that merely discovers end-of-file.
    if ($lc % 100000 == 0) {
        print STDERR " [$lc]\n";
        $fl = 0;
    }
    elsif ($lc % 2500 == 0) {
        print STDERR ".";
        $fl = 1;
    }
}
print STDERR "\n" if $fl;

# The loop above stops as soon as the FIRST file is exhausted, without
# touching the remaining files for that row.  Any line still available in
# files 1..n-1 therefore means those files are longer than the first one.
for my $i (1 .. $#fhs) {
    my $fh = $fhs[$i];
    my $r  = <$fh>;
    die "Mismatched number of lines.\n" if defined $r;
}

# Close every input.  For a gunzip pipe, close() returns false when the
# child exited non-zero, which is the only sign that decompression failed
# rather than the stream ending normally.  Only warn: the output has already
# been written and the exit status is left as before.
for my $i (0 .. $#fhs) {
    next if close $fhs[$i];
    my $why = $! ? "$!" : 'exit status ' . ($? >> 8);
    warn "Warning: problem closing $ARGV[$i]: $why\n";
}