#!/usr/bin/perl -w
#
# paste-files.pl - per-line concatenation of parallel text files.
#
# Reads two or more files in lock step and writes one line to STDOUT per
# input line, joining the corresponding lines with the " ||| " separator.
# Files whose names end in ".gz" are decompressed on the fly with
# `gunzip -c`; the name "-" reads standard input.  Input, STDOUT and STDERR
# all get the :utf8 layer.
#
# The script aborts when
#   * the inputs do not all have the same number of lines,
#   * an input line already contains "|||",
#   * an input cannot be opened, or a gunzip child exits unsuccessfully,
#   * the output cannot be written.
#
# Progress goes to STDERR: a "." every 2500 lines and " [N]" followed by a
# newline every 100000 lines.
#
# Provenance: commit b81b2e85bdfd5e9dda98a6e448e6354ca0c6d26b by Chris Dyer,
# 2012-07-28, "script to paste files together with the triple pipe
# separator" (corpus/paste-files.pl).
use strict;
use warnings;

die "Usage: $0 file1.txt file2.txt [file3.txt ...]\n\n Performs a per-line concatenation of all files using the ||| separator.\n\n" unless scalar @ARGV > 1;

# Open one input for reading.
#   $file   - path to open; "*.gz" is piped through gunzip, "-" is STDIN.
# Returns ($fh, $is_pipe), where $is_pipe is true when $fh is the read end
# of a gunzip child process.  Dies if the input cannot be opened.
sub open_input {
    my ($file) = @_;
    my $fh;
    my $is_pipe = 0;
    if ($file =~ /\.gz$/) {
        # List-form pipe open: the file name goes to gunzip as a single
        # argument and is never interpreted by a shell.
        open($fh, '-|', 'gunzip', '-c', $file)
            or die "Can't fork gunzip -c $file: $!";
        $is_pipe = 1;
    }
    elsif ($file eq '-') {
        # Two-argument open("<-") used to mean standard input; keep that
        # behaviour explicitly now that three-argument open is used.
        open($fh, '<&', \*STDIN) or die "Can't read $file: $!";
    }
    else {
        open($fh, '<', $file) or die "Can't read $file: $!";
    }
    binmode($fh, ":utf8");
    return ($fh, $is_pipe);
}

my @fhs     = ();    # one read handle per command-line argument, same order
my @is_pipe = ();    # parallel to @fhs: true where the handle is a gunzip pipe
for my $file (@ARGV) {
    my ($fh, $piped) = open_input($file);
    push @fhs,     $fh;
    push @is_pipe, $piped;
}
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $lc = 0;    # 1-based number of the line currently being assembled
my $fl = 0;    # true while the progress output on STDERR ends in a bare "."

LINE:
while (1) {
    my @line;
    $lc++;
    if    ($lc % 100000 == 0) { print STDERR " [$lc]\n"; $fl = 0; }
    elsif ($lc % 2500 == 0)   { print STDERR ".";        $fl = 1; }

    for my $anum (0 .. $#fhs) {
        my $fh = $fhs[$anum];
        my $r  = <$fh>;
        if (!defined $r) {
            # EOF on a later file while earlier files still produced a line
            # for this row means the inputs have different lengths.
            die "Mismatched number of lines.\n" if scalar @line > 0;

            # EOF on the first file: stop; the remaining files are verified
            # to be exhausted after the loop.
            last LINE;
        }
        chomp $r;
        die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
        push @line, $r;
    }
    print STDOUT join(' ||| ', @line) . "\n";
}

# Terminate a dangling row of progress dots.
print STDERR "\n" if $fl;

# The first file hit EOF; every other file must be at EOF as well.
for my $i (1 .. $#fhs) {
    my $fh = $fhs[$i];
    my $r  = <$fh>;
    die "Mismatched number of lines.\n" if defined $r;
}

# Closing a pipe handle waits for the child and fails when it exited
# unsuccessfully, so a gunzip error is not mistaken for a short or empty
# input.
for my $i (0 .. $#fhs) {
    next if close $fhs[$i];
    die $is_pipe[$i]
        ? "gunzip -c $ARGV[$i] failed (wait status $?)\n"
        : "Error closing $ARGV[$i]: $!\n";
}

# Buffered write errors (e.g. a full disk) only surface when STDOUT is closed.
close(STDOUT) or die "Error writing output: $!\n";