diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-12-04 22:01:32 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-12-04 22:01:32 -0500 |
commit | d7b60765395a4fe6da23992f45afd18dfc093aeb (patch) | |
tree | 44787690ed92013224d6c92bd54be1e5bf5edc4c | |
parent | 6b2989db1f56257eae8209b0d8ddca2aa9b66c6d (diff) |
more flexible corpus cutting
-rwxr-xr-x | corpus/cut-corpus.pl | 29 |
1 files changed, 24 insertions, 5 deletions
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
index fc9cce3b..7daac0e2 100755
--- a/corpus/cut-corpus.pl
+++ b/corpus/cut-corpus.pl
@@ -3,14 +3,33 @@ use strict;
 die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
 my $x = shift @ARGV;
-die "N must be numeric" unless $x =~ /^\d+$/;
-$x--;
+my @ind = split /,/, $x;
+my @o = ();
+for my $ff (@ind) {
+  if ($ff =~ /^\d+$/) {
+    push @o, $ff - 1;
+  } elsif ($ff =~ /^(\d+)-(\d+)$/) {
+    my $a = $1;
+    my $b = $2;
+    die "$a-$b is a bad range in input: $x\n" unless $b > $a;
+    for (my $i=$a; $i <= $b; $i++) {
+      push @o, $i - 1;
+    }
+  } else {
+    die "Bad input: $x\n";
+  }
+}
 while(<>) {
   chomp;
   my @fields = split / \|\|\| /;
-  my $y = $fields[$x];
-  if (!defined $y) { $y= ''; }
-  print "$y\n";
+  my @sf;
+  for my $i (@o) {
+    my $y = $fields[$i];
+    if (!defined $y) { $y= ''; }
+    push @sf, $y;
+  }
+  print join(' ||| ', @sf) . "\n";
 }
+