summary refs log tree commit diff
path: root/corpus
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2012-07-28 12:41:01 -0400
committer Chris Dyer <cdyer@cs.cmu.edu> 2012-07-28 12:41:01 -0400
commit b81b2e85bdfd5e9dda98a6e448e6354ca0c6d26b (patch)
tree 0b6eec2d899f69300012a6d2944115123686b7ac /corpus
parent 306e0ba4754c6c4f460536cfe8c3f118dc1cc175 (diff)
script to paste files together with the triple pipe separator
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/paste-files.pl 50
1 files changed, 50 insertions, 0 deletions
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
new file mode 100755
index 00000000..24c70599
--- /dev/null
+++ b/corpus/paste-files.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+# Pastes two or more files together line by line, joining corresponding lines
+# with ' ||| '.  Files named *.gz are read through gunzip.  Dies if an input
+# line already contains '|||' or if the files differ in their number of lines.
+use strict;
+
+die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n Performs a per-line concatenation of all files using the ||| separator.\n\n" unless scalar @ARGV > 1;
+
+my @fhs = ();
+for my $file (@ARGV) {
+  my $fh;
+  # Explicit-mode opens: no shell is run and the name is never parsed as a mode.
+  if ($file =~ /\.gz$/) {
+    open $fh, '-|', 'gunzip', '-c', $file or die "Can't fork gunzip -c $file: $!";
+  } else {
+    open $fh, '<', $file or die "Can't read $file: $!";
+  }
+  binmode($fh,":utf8");
+  push @fhs, $fh;
+}
+binmode(STDOUT,":utf8");
+binmode(STDERR,":utf8");
+
+my $lc = 0;  # 1-based number of the line currently being read
+my $fl = 0;  # true while the progress output ends in an unterminated '.'
+LINE: while (1) {
+  my @line;
+  $lc++;
+  for my $i (0 .. $#fhs) {
+    my $r = readline($fhs[$i]);
+    if (!defined $r) {
+      # EOF on the first file ends the loop; EOF on a later file means it is shorter.
+      die "Mismatched number of lines.\n" if @line;
+      last LINE;
+    }
+    chomp $r;
+    die "$ARGV[$i]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+    push @line, $r;
+  }
+  print STDOUT join(' ||| ', @line) . "\n";
+  # Report progress only for lines that were actually written.
+  if ($lc % 100000 == 0) { print STDERR " [$lc]\n"; $fl = 0; }
+  elsif ($lc % 2500 == 0) { print STDERR "."; $fl = 1; }
+}
+print STDERR "\n" if $fl;
+for my $i (0 .. $#fhs) {
+  # File 0 is already at EOF; the rest must be too.  close surfaces gunzip failures.
+  die "Mismatched number of lines.\n" if $i > 0 && defined readline($fhs[$i]);
+  close $fhs[$i] or die "Error reading $ARGV[$i] (status $?): $!\n";
+}