diff options
Diffstat (limited to 'gi')
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 4 |
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 53 |
2 files changed, 55 insertions, 2 deletions
Reviewed revision of the patch. The index lines keep the blob ids of the
originally committed files. The first hunk assumes $LANGUAGE is declared
earlier in local-gi-pipeline.pl (not shown here) -- confirm before applying.

diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index f72637af..46062c8d 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -94,9 +94,9 @@ $HIER_CAT = ( $NUM_TOPICS_COARSE ? 1 : 0 );
 
 print STDERR "   Output: $OUTPUT\n";
 my $DATA_DIR = $OUTPUT . '/corpora';
-my $LEX_NAME = 'corpus.f_e_a.lex';
+my $LEX_NAME = 'corpus.f_e_a.' . $LANGUAGE . '.lex';
 my $CORPUS_LEX = $DATA_DIR . '/' . $LEX_NAME;  # corpus used to extract rules
-my $CORPUS_CLUSTER = $DATA_DIR . '/corpus.f_e_a.cluster'; # corpus used for clustering (often identical)
+my $CORPUS_CLUSTER = $DATA_DIR . '/corpus.f_e_a.' . $LANGUAGE . '.cluster'; # corpus used for clustering (often identical)
 
 my $CONTEXT_DIR = $OUTPUT . '/' . context_dir();
 my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir();
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
new file mode 100755
index 00000000..5460db95
--- /dev/null
+++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl
@@ -0,0 +1,53 @@
+#!/usr/bin/perl -w
+use strict;
+use Getopt::Long "GetOptions";
+
+# Reads lines "PHRASE<TAB>FIELD ||| FIELD ||| ..." made of word_tag tokens and
+# reduces each token to its word ('tok') or its tag ('tag'). Any other mode
+# value leaves tokens unchanged. Only even-indexed fields are rewritten.
+my $PHRASE = 'tok';
+my $CONTEXT = 'tag';
+
+die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus\n"
+    unless GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+
+# Split on an underscore that is not followed by another underscore, so a
+# token such as "__SYM" yields ("_", "SYM") instead of three parts.
+sub split_token { return split /_(?!_)/, $_[0] }
+
+# Return the part of a two-part token chosen by $mode, or $token itself when
+# $mode is neither 'tok' nor 'tag'.
+sub select_part {
+    my ($mode, $token, @parts) = @_;
+    return $parts[0] if $mode eq 'tok';
+    return $mode eq 'tag' ? $parts[1] : $token;
+}
+
+while (my $line = <>) {
+    chomp $line;
+    my @top = split /\t/, $line;
+    die "line $.: expected 2 tab-separated columns, got " . scalar(@top) . "\n"
+        unless @top == 2;
+
+    my @pwords = split /\s+/, $top[0];
+    foreach my $token (@pwords) {
+        my @parts = split_token($token);
+        die "line $.: phrase token '$token' is not of the form word_tag\n"
+            unless @parts == 2;
+        $token = select_part($PHRASE, $token, @parts);
+    }
+
+    my @fields = split / \|\|\| /, $top[1];
+    # The range end may be fractional; the range operator truncates it.
+    foreach my $i (0 .. (scalar(@fields) / 2 - 1)) {
+        my @cwords = split /\s+/, $fields[2*$i];
+        foreach my $token (@cwords) {
+            my @parts = split_token($token);
+            # Tokens that do not split into exactly two parts pass through.
+            $token = select_part($CONTEXT, $token, @parts) if @parts == 2;
+        }
+        $fields[2*$i] = join ' ', @cwords;
+    }
+
+    print join(' ', @pwords), "\t", join(' ||| ', @fields), "\n";
+}
