diff options
Diffstat (limited to 'gi/pipeline/scripts')
-rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 53 |
1 files changed, 53 insertions, 0 deletions
#!/usr/bin/perl -w
# Provenance: gi/pipeline/scripts/remove-tags-from-corpus.pl
# (added as a new file, mode 100755).
#
# Filter that reduces LEFT_RIGHT tokens in a corpus to one of their halves.
#
# Input is read from the files named on the command line or from STDIN.
# Every line must consist of exactly two tab-separated columns:
#
#     PHRASE <TAB> FIELD0 ||| FIELD1 ||| FIELD2 ||| FIELD3 ...
#
# * PHRASE is a whitespace-separated list of tokens.  Each token must split
#   into exactly two parts at the tag separator; otherwise the script dies.
# * The even-numbered fields (FIELD0, FIELD2, ...) are treated as contexts:
#   tokens that split into exactly two parts are rewritten, all other tokens
#   are kept as they are.
# * The odd-numbered fields are copied through untouched
#   (presumably counts/features -- confirm against the corpus producer).
#
# Options:
#   --phrase=tok|tag    which half of each phrase token to keep  (default tok)
#   --context=tok|tag   which half of each context token to keep (default tag)
# 'tok' keeps the part before the separator, 'tag' the part after it.  Any
# other value leaves the affected tokens unchanged.
#
# On output, tokens are re-joined with single spaces and fields with ' ||| '.
use strict;

use Getopt::Long "GetOptions";

my $PHRASE  = 'tok';
my $CONTEXT = 'tag';

GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT)
    or die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus";

# An underscore that is not directly followed by another underscore.  The
# look-ahead lets a literal "_" token survive: "__X" splits into ("_", "X").
# The same separator is used for phrase and context tokens so that both
# columns interpret a token identically.
my $TAG_SEPARATOR = qr/_(?!_)/;

# pick_part($token, $mode, @parts)
# Returns $parts[0] for mode 'tok', $parts[1] for mode 'tag', and the
# original $token for any other mode.
sub pick_part {
    my ($token, $mode, @parts) = @_;
    return $parts[0] if $mode eq 'tok';
    return $parts[1] if $mode eq 'tag';
    return $token;
}

my $lno = 0;    # 1-based number of the current input line, for diagnostics
while (my $line = <>) {
    $lno++;
    chomp $line;

    my @top = split /\t/, $line;
    die "input line $lno: expected 2 tab-separated columns, found "
        . scalar(@top)
        unless @top == 2;

    # Phrase column: every token is required to be well formed.
    my @pwords = split /\s+/, $top[0];
    foreach my $token (@pwords) {    # $token aliases the array element
        my @parts = split /$TAG_SEPARATOR/, $token;
        die "input line $lno: malformed phrase token '$token'"
            unless @parts == 2;
        $token = pick_part($token, $PHRASE, @parts);
    }

    # Context column: only the even-indexed fields are rewritten.  Stepping
    # by two visits each of them exactly once, whatever the field count.
    my @fields = split / \|\|\| /, $top[1];
    for (my $i = 0; $i < @fields; $i += 2) {
        my @cwords = split /\s+/, $fields[$i];
        foreach my $token (@cwords) {
            my @parts = split /$TAG_SEPARATOR/, $token;
            # Tokens without exactly two parts are passed through as is.
            next unless @parts == 2;
            $token = pick_part($token, $CONTEXT, @parts);
        }
        $fields[$i] = join ' ', @cwords;
    }

    print join(' ', @pwords), "\t", join(' ||| ', @fields), "\n";
}