summary refs log tree commit diff
path: root/word-aligner/support/extract_vocab.pl
blob: 070d4202be46c3103bc9d8f1a82ea8e0878f7dd7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/usr/bin/perl -w
use strict;

# Build a frequency-ranked vocabulary from whitespace-tokenized text.
#
# Input:  lines read via <> (files named in @ARGV, or STDIN if none).
# Output: one word type per line on STDOUT, most frequent first; words with
#         equal counts are ordered by string comparison so the output is
#         reproducible from run to run.
# STDERR: a progress message and a final "types / tokens" summary.
print STDERR "Extracting vocabulary...\n";
my %dict = ();  # word type -> number of occurrences
my $wc = 0;     # total number of tokens seen
while (my $line = <>) {
  # split ' ' (awk mode) skips leading whitespace, so an indented line does
  # not yield an empty-string "word" the way split /\s+/ does. It also treats
  # the trailing newline as a separator, which makes chomp unnecessary.
  for my $word (split ' ', $line) {
    $wc++;
    $dict{$word}++;
  }
}

my $tc = 0;     # number of distinct word types printed
# Primary key: descending frequency. Secondary key: ascending string order,
# so ties do not depend on Perl's randomized hash ordering.
for my $word (sort { $dict{$b} <=> $dict{$a} or $a cmp $b } keys %dict) {
  print "$word\n";
  $tc++;
}

print STDERR "$tc types / $wc tokens\n";