blob: e9e7906329bf42bc0c73a9eb8f4ca59c2b5d39f2 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
#!/usr/bin/perl -w
#
# Applies a vocabulary map (word -> class) to a whitespace-tokenised corpus.
#
# Usage: <script> [-unk] classes.txt corpus.txt
#
#   classes.txt  one entry per line: a word, whitespace, then its class.
#                Any further columns on the line are ignored.
#   corpus.txt   text whose tokens are replaced by their classes.
#   -unk         print "UNK" for tokens that have no class instead of dying.
#
# The mapped corpus is written to STDOUT, one output line per input line,
# with classes separated by single spaces.  The number of entries loaded
# from classes.txt is reported on STDERR.
use strict;
use warnings;

my $usage = "Usage: $0 [-unk] classes.txt corpus.txt\n Applies a vocabulary map to a corpus\n";

# Strip the optional flag first, then validate what is left; this way
# "-unk classes.txt" (corpus missing) is rejected with the usage message
# instead of failing later on an undefined filename.
my $unk = (@ARGV && $ARGV[0] eq '-unk') ? 1 : 0;
shift @ARGV if $unk;
die $usage unless @ARGV == 2;
my ($class, $text) = @ARGV;

# Open both inputs up front so that a bad corpus path is reported before
# any time is spent loading the class map.  Three-argument open keeps
# characters such as '>' or '|' in a filename from being taken as a mode.
open my $class_fh, '<', $class or die "Can't read $class: $!";
open my $text_fh,  '<', $text  or die "Can't read $text: $!";

# Load the word -> class dictionary.  A later entry for the same word
# overwrites an earlier one; $cc counts entries read, not distinct words.
my %dict;
my $cc = 0;
while (my $entry = <$class_fh>) {
    chomp $entry;
    # split ' ' (awk mode) skips leading whitespace, so an indented entry
    # does not shift the word into the class column.
    my ($word, $cat) = split ' ', $entry;
    die "Malformed entry in $class at line $.: '$entry'\n"
        unless defined $word && defined $cat;
    $dict{$word} = $cat;
    $cc++;
}
close $class_fh;
print STDERR "Loaded classes for $cc words\n";

# Map every corpus line token by token.
while (my $line = <$text_fh>) {
    chomp $line;
    my @cats;
    for my $token (split ' ', $line) {
        my $cat = $dict{$token};
        # Test definedness, not truth: "0" is a perfectly valid class id
        # and must not be mistaken for a missing entry.
        if (!defined $cat) {
            die "Undefined class for $token" unless $unk;
            $cat = "UNK";
        }
        push @cats, $cat;
    }
    print "@cats\n";
}
close $text_fh;
|