blob: ef94b7d6c385d4e4f3a4df6a7e8dfb163ea57f50 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
#!/usr/bin/perl -w
use strict;

# Vocabulary extractor.
#
# Reads whitespace-tokenised text from the files named on the command line
# (or from STDIN when none are given), counts how often each distinct token
# occurs, and prints one token type per line on STDOUT, most frequent first.
# A progress banner and the final type/token totals are written to STDERR.

# Add the tokens of one input line to the running counts.
#   $line  - a line of text; a trailing newline is harmless
#   $count - hash ref mapping token => occurrences, updated in place
# Returns the number of tokens found on the line.
sub tally_line {
    my ($line, $count) = @_;

    # split ' ' (awk mode) skips leading whitespace, so an indented line
    # never yields the empty first field that split /\s+/ would produce.
    my @tokens = split ' ', $line;
    $count->{$_}++ for @tokens;
    return scalar @tokens;
}

# Return the token types recorded in %$count, most frequent first.
# Ties are broken by plain string comparison so that the output is the
# same on every run, independent of Perl's hash ordering.
sub vocab_by_frequency {
    my ($count) = @_;
    return sort { $count->{$b} <=> $count->{$a} or $a cmp $b } keys %$count;
}

# Script entry point: count tokens from <>, print the vocabulary, report totals.
sub main {
    print STDERR "Extracting vocabulary...\n";
    my %dict = ();
    my $wc = 0;
    while (my $line = <>) {
        $wc += tally_line($line, \%dict);
    }
    my @types = vocab_by_frequency(\%dict);
    print "$_\n" for @types;
    my $tc = scalar @types;
    print STDERR "$tc types / $wc tokens\n";
    return;
}

# Run only when executed as a script, so the helpers can be loaded on their own.
main() unless caller;
|