diff options
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 4 | ||||
-rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 53 |
2 files changed, 55 insertions, 2 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index f72637af..46062c8d 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -94,9 +94,9 @@ $HIER_CAT = ( $NUM_TOPICS_COARSE ? 1 : 0 ); print STDERR " Output: $OUTPUT\n"; my $DATA_DIR = $OUTPUT . '/corpora'; -my $LEX_NAME = 'corpus.f_e_a.lex'; +my $LEX_NAME = 'corpus.f_e_a.$LANGUAGE.lex'; my $CORPUS_LEX = $DATA_DIR . '/' . $LEX_NAME; # corpus used to extract rules -my $CORPUS_CLUSTER = $DATA_DIR . '/corpus.f_e_a.cluster'; # corpus used for clustering (often identical) +my $CORPUS_CLUSTER = $DATA_DIR . '/corpus.f_e_a.$LANGUAGE.cluster'; # corpus used for clustering (often identical) my $CONTEXT_DIR = $OUTPUT . '/' . context_dir(); my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir(); diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl new file mode 100755 index 00000000..5460db95 --- /dev/null +++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long "GetOptions"; + +my $PHRASE = 'tok'; +my $CONTEXT = 'tag'; + +die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" + unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); + +my $lno = 0; +while(my $line = <>) { + $lno++; + chomp $line; + my @top = split /\t/, $line; + die unless (scalar @top == 2); + + my @pwords = split /\s+/, $top[0]; + foreach my $token (@pwords) { + #print $token . "\n"; + my @parts = split /_(?!_)/, $token; + die unless (scalar @parts == 2); + if ($PHRASE eq "tok") { + $token = $parts[0] + } elsif ($PHRASE eq "tag") { + $token = $parts[1] + } + } + + my @fields = split / \|\|\| /, $top[1]; + foreach my $i (0..((scalar @fields) / 2 - 1)) { + #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; + my @cwords = split /\s+/, $fields[2*$i]; + foreach my $token (@cwords) { + #print $i . ": " . $token . "\n"; + my @parts = split /_/, $token; + if (scalar @parts == 2) { + if ($CONTEXT eq "tok") { + $token = $parts[0] + } elsif ($CONTEXT eq "tag") { + $token = $parts[1] + } + } + } + $fields[2*$i] = join ' ', @cwords; + } + + print join ' ', @pwords; + print "\t"; + print join ' ||| ', @fields; + print "\n"; +} |