#!/usr/bin/perl -w
#
# Applies a vocabulary map to a corpus.
#
# Usage: SCRIPT [-unk] classes.txt corpus.txt
#
#   classes.txt  One "word class" pair per line, separated by whitespace.
#                Fields after the second are ignored. If a word occurs more
#                than once, the last entry wins.
#   corpus.txt   Whitespace-tokenised text, one sentence per line.
#   -unk         Replace tokens that have no class with "UNK" instead of
#                aborting.
#
# Every corpus line is written to STDOUT with each token replaced by its
# class, tokens joined by single spaces. The number of class-file lines
# loaded is reported on STDERR.
#
# Dies when the arguments are wrong, when either file cannot be opened, when
# a class-file line has fewer than two fields, or (without -unk) when a
# corpus token has no class.

use strict;
use warnings;

my $usage = "Usage: $0 [-unk] classes.txt corpus.txt\n Applies a vocabulary map to a corpus\n";

# Consume the optional flag first so that the positional-argument count is
# validated against what is actually left ("-unk classes.txt" alone must be
# rejected rather than leaving the corpus name undefined).
my $unk = (@ARGV && $ARGV[0] eq '-unk') ? 1 : 0;
shift @ARGV if $unk;
die $usage unless @ARGV == 2;

my ($class, $text) = @ARGV;

# Three-argument open: the file name can never be interpreted as a mode.
open my $class_fh, '<', $class or die "Can't read $class: $!";
open my $text_fh,  '<', $text  or die "Can't read $text: $!";

# Load the word -> class map.
my %dict;
my $cc = 0;
while (my $line = <$class_fh>) {
    chomp $line;

    # split ' ' skips leading whitespace, so an indented line does not
    # produce an empty first field that would be stored as the word.
    my ($word, $cat) = split ' ', $line;
    die "Malformed line $. of $class: expected 'word class', got '$line'\n"
        unless defined $word && defined $cat;

    $dict{$word} = $cat;
    $cc++;
}
close $class_fh;
print STDERR "Loaded classes for $cc words\n";

# Rewrite the corpus, one output line per input line.
while (my $line = <$text_fh>) {
    chomp $line;

    my @cats;
    for my $token (split ' ', $line) {
        my $cat = $dict{$token};

        # Test definedness, not truth: a class named "0" is a valid class.
        if (!defined $cat) {
            die "Undefined class for $token" unless $unk;
            $cat = 'UNK';
        }
        push @cats, $cat;
    }
    print "@cats\n";
}
close $text_fh;