summary refs log tree commit diff
diff options
context:
space:
mode:
author Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-12-04 22:01:32 -0500
committer Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-12-04 22:01:32 -0500
commit c81b0dc240f2233c3e5ecccd8982218115476f9a (patch)
tree 1da702f90cef22215d8505337dc72a35d5bada09
parent 9b3306332a27f23a36e96a93b5ff97caee1b6e3c (diff)
more flexible corpus cutting
-rwxr-xr-x corpus/cut-corpus.pl 29
1 files changed, 24 insertions, 5 deletions
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
index fc9cce3b..7daac0e2 100755
--- a/corpus/cut-corpus.pl
+++ b/corpus/cut-corpus.pl
@@ -3,14 +3,33 @@ use strict;
die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
my $x = shift @ARGV;
-die "N must be numeric" unless $x =~ /^\d+$/;
-$x--;
+my @ind = split /,/, $x;
+my @o = ();
+for my $ff (@ind) {
+ if ($ff =~ /^\d+$/) {
+ push @o, $ff - 1;
+ } elsif ($ff =~ /^(\d+)-(\d+)$/) {
+ my $a = $1;
+ my $b = $2;
+ die "$a-$b is a bad range in input: $x\n" unless $b > $a;
+ for (my $i=$a; $i <= $b; $i++) {
+ push @o, $i - 1;
+ }
+ } else {
+ die "Bad input: $x\n";
+ }
+}
while(<>) {
chomp;
my @fields = split / \|\|\| /;
- my $y = $fields[$x];
- if (!defined $y) { $y= ''; }
- print "$y\n";
+ my @sf;
+ for my $i (@o) {
+ my $y = $fields[$i];
+ if (!defined $y) { $y= ''; }
+ push @sf, $y;
+ }
+ print join(' ||| ', @sf) . "\n";
}
+