summaryrefslogtreecommitdiff
path: root/word-aligner/support/classify.pl
blob: e9e7906329bf42bc0c73a9eb8f4ca59c2b5d39f2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/perl -w
use strict;

# Applies a word-to-class vocabulary map to a whitespace-tokenised corpus.
#
# Usage: classify.pl [-unk] classes.txt corpus.txt
#   classes.txt  one "word class" pair per line, separated by whitespace
#   corpus.txt   the text to convert, processed line by line
#   -unk         emit the literal class UNK for words missing from the map
#                instead of aborting
#
# For every corpus line the space-joined class sequence is written to STDOUT;
# a one-line summary of the loaded map is written to STDERR.
die "Usage: $0 [-unk] classes.txt corpus.txt\n Applies a vocabulary map to a corpus\n" unless scalar @ARGV == 2 || (scalar @ARGV == 3 && $ARGV[0] eq '-unk');

# Indexing $ARGV[0] is safe: the usage check above guarantees >= 2 arguments.
my $unk = $ARGV[0] eq '-unk';
shift @ARGV if $unk;

my ($class, $text) = @ARGV;
# Three-argument open takes the file names literally, and lexical handles
# avoid package-global barewords.
open my $class_fh, '<', $class or die "Can't read $class: $!";
open my $text_fh, '<', $text or die "Can't read $text: $!";

my %dict = ();
my $cc = 0;
while (my $entry = <$class_fh>) {
  chomp $entry;
  # split ' ' skips leading whitespace, so an indented entry does not yield
  # an empty "word" with the real word mistaken for its class.
  my ($word, $cat) = split ' ', $entry;
  # Report the offending location instead of interpolating undefined values.
  die "Malformed entry at $class line $.: expected 'word class'\n" unless (defined $word && defined $cat);
  $dict{$word} = $cat;
  $cc++;   # counts entries read; a repeated word is counted each time
}
close $class_fh;
print STDERR "Loaded classes for $cc words\n";

while (my $sentence = <$text_fh>) {
  chomp $sentence;
  my @cats;
  # split ' ' ignores leading/trailing whitespace, so no phantom empty token
  # is looked up for lines that start with a space or tab.
  for my $tok (split ' ', $sentence) {
    if (exists $dict{$tok}) {
      # Test key existence, not truthiness: a class named "0" is valid.
      push @cats, $dict{$tok};
    } elsif ($unk) {
      push @cats, "UNK";
    } else {
      die "Undefined class for $tok";
    }
  }
  print "@cats\n";
}
close $text_fh;