summary | refs | log | tree | commit | diff
path: root/corpus/cut-corpus.pl
diff options
context:
space:
mode:
Diffstat (limited to 'corpus/cut-corpus.pl')
-rwxr-xr-x corpus/cut-corpus.pl 35
1 files changed, 35 insertions, 0 deletions
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
new file mode 100755
index 00000000..0af3b23c
--- /dev/null
+++ b/corpus/cut-corpus.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl -w
+# Print selected fields from lines whose fields are separated by '|||'.
+# The first argument is a comma-separated list of 1-based field numbers and
+# inclusive ranges (e.g. "1,3-4"); input is read from the remaining file
+# arguments, or from STDIN when there are none.
+use strict;
+die "Usage: $0 N[,M,A-B,...]\nSplits a corpus separated by ||| symbols and returns the listed (1-based) fields\n" unless scalar @ARGV > 0;
+
+my $x = shift @ARGV;
+my @o = ();    # zero-based indices of the requested fields, in request order
+for my $ff (split /,/, $x) {
+    if ($ff =~ /^\d+$/) {
+        # Field 0 would become index -1 and silently pick the last field.
+        die "Field numbers start at 1 in input: $x\n" if $ff < 1;
+        push @o, $ff - 1;
+    } elsif ($ff =~ /^(\d+)-(\d+)$/) {
+        # Not named $a/$b: those are the package variables sort() uses.
+        my ($lo, $hi) = ($1, $2);
+        die "$lo-$hi is a bad range in input: $x\n" if $lo < 1 || $hi <= $lo;
+        for (my $i = $lo; $i <= $hi; $i++) {
+            push @o, $i - 1;
+        }
+    } else {
+        die "Bad input: $x\n";
+    }
+}
+die "Bad input: $x\n" unless @o;    # nothing selected, e.g. an empty list
+
+while (<>) {
+    chomp;
+    my @fields = split /\s*\|\|\|\s*/;
+    # A field the line does not have is printed as an empty string.
+    my @sf = map { defined $fields[$_] ? $fields[$_] : '' } @o;
+    print join(' ||| ', @sf) . "\n";
+}