summaryrefslogtreecommitdiff
path: root/gi/pipeline/scripts/remove-tags-from-corpus.pl
blob: 5460db9514a7e334bc8b7cb076e90520451849d3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/perl -w
use strict;

use Getopt::Long "GetOptions";

# Which part of each two-part "first_second" token to keep: 'tok' selects the
# part before the separator, 'tag' the part after it.  $PHRASE applies to the
# first tab-separated column of every input line, $CONTEXT to the second.
# Any other value is accepted and leaves the tokens of that column unchanged.
my $PHRASE = 'tok';
my $CONTEXT = 'tag';

# GetOptions returns false when the command line cannot be parsed.  The
# trailing newline keeps Perl from appending " at <script> line N." to the
# usage text.
GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT)
    or die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus\n";

# Filter: read lines of the form
#
#     <phrase tokens> TAB <field> ||| <field> ||| <field> ||| ...
#
# and rewrite every two-part token, keeping only the part selected by
# $PHRASE (first column) or $CONTEXT (even-numbered fields of the second
# column).  Odd-numbered fields of the second column are copied through
# untouched.  The rewritten line is printed to STDOUT in the same layout.

# Index of the token part to keep for each recognised option value.  An
# unrecognised value has no entry, so tokens are left as they are.
my %PART_INDEX = (tok => 0, tag => 1);

# The separator is an underscore that is not followed by another underscore,
# so a token whose first part is itself "_" (e.g. "__X") still splits in two.
# The same pattern is used for both columns so they agree on such tokens.
my $SEPARATOR = qr/_(?!_)/;

my $lno = 0;    # number of input lines read so far, used in error messages
while (my $line = <>) {
    $lno++;
    chomp $line;

    my @top = split /\t/, $line;
    die "line $lno: expected 2 tab-separated columns, found "
        . scalar(@top) . "\n"
        unless @top == 2;

    # Phrase column: every token must consist of exactly two parts.
    my $phrase_index = $PART_INDEX{$PHRASE};
    my @pwords = split /\s+/, $top[0];
    foreach my $token (@pwords) {    # $token aliases the array element
        my @parts = split $SEPARATOR, $token;
        die "line $lno: phrase token '$token' does not have exactly two parts\n"
            unless @parts == 2;
        $token = $parts[$phrase_index] if defined $phrase_index;
    }

    # Context column: only the even-numbered fields (0, 2, 4, ...) hold
    # tokens to rewrite.  Stepping by two visits every such field, including
    # a trailing one with no following partner.
    my $context_index = $PART_INDEX{$CONTEXT};
    my @fields = split / \|\|\| /, $top[1];
    for (my $i = 0; $i < @fields; $i += 2) {
        my @cwords = split /\s+/, $fields[$i];
        foreach my $token (@cwords) {
            my @parts = split $SEPARATOR, $token;
            # Tokens that do not split into exactly two parts are passed
            # through unchanged rather than treated as errors.
            next unless @parts == 2;
            $token = $parts[$context_index] if defined $context_index;
        }
        $fields[$i] = join ' ', @cwords;
    }

    print join(' ', @pwords), "\t", join(' ||| ', @fields), "\n";
}