#!/usr/bin/env perl
#
# cut-corpus.pl -- select fields from a corpus whose fields are separated
# by " ||| ".
#
# Provenance: corpus/cut-corpus.pl as of commit
# c81b0dc240f2233c3e5ecccd8982218115476f9a ("more flexible corpus cutting",
# Chris Dyer, Tue, 4 Dec 2012 22:01:32 -0500), reformatted and hardened.
#
# Usage: cut-corpus.pl SPEC [FILE...]
#
# SPEC is a comma-separated list of 1-based field numbers and/or inclusive
# ranges, e.g. "2", "1,3" or "1,4-6".  For every input line (from FILE... or
# standard input) the selected fields are printed in the order given, joined
# by " ||| ".  A field that does not exist on a line is printed as an empty
# string.
use strict;
use warnings;

# Parse a field specification into a list of zero-based field indices.
#
# Parameters:
#   $spec - the specification string, e.g. "1,4-6".
# Returns:
#   The list of zero-based indices, in the order they appear in $spec.
# Dies:
#   With a message ending in a newline when an item is neither a number nor
#   a range, when a range is not strictly ascending, when a field number is
#   smaller than 1, or when the specification selects no field at all.
sub parse_field_spec {
    my ($spec) = @_;

    my @indices;
    for my $item (split /,/, $spec) {
        my ($first, $last);
        if ($item =~ /\A(\d+)\z/) {
            ($first, $last) = ($1, $1);
        }
        elsif ($item =~ /\A(\d+)-(\d+)\z/) {
            ($first, $last) = ($1, $2);
            die "$first-$last is a bad range in input: $spec\n"
                unless $last > $first;
        }
        else {
            die "Bad input: $spec\n";
        }

        # Field numbers are 1-based.  Without this check, 0 would become
        # index -1 and silently select the LAST field of every line.
        die "Field numbers start at 1 in input: $spec\n" if $first < 1;

        push @indices, map { $_ - 1 } $first .. $last;
    }

    # An empty selection (e.g. "" or ",") would only ever print blank lines.
    die "Bad input: $spec\n" unless @indices;

    return @indices;
}

# Extract the requested fields from one (already chomped) corpus line.
#
# Parameters:
#   $line    - the input line, without its trailing newline.
#   $indices - array reference of zero-based field indices.
# Returns:
#   The selected fields joined by " ||| "; missing fields become ''.
sub cut_fields {
    my ($line, $indices) = @_;

    my @fields = split / \|\|\| /, $line;

    # split drops trailing empty fields and short lines have fewer fields,
    # so an index may point past the end: emit '' for those.
    return join ' ||| ',
        map { defined $fields[$_] ? $fields[$_] : '' } @{$indices};
}

# Command-line entry point: read the specification from @ARGV, then filter
# the remaining files (or standard input) line by line.
sub main {
    die "Usage: $0 N[,M,A-B,...] [FILE...]\n"
        . "Splits a corpus separated by ||| symbols and returns the selected fields\n"
        . "(1-based field numbers and inclusive A-B ranges, comma-separated)\n"
        unless @ARGV > 0;

    my $spec    = shift @ARGV;
    my @indices = parse_field_spec($spec);

    while (my $line = <>) {
        chomp $line;
        print cut_fields($line, \@indices), "\n";
    }

    return;
}

# Run only when executed as a script, so the subs can be loaded and tested.
main() unless caller;

1;