summaryrefslogtreecommitdiff
path: root/compound-split/make-dict.pl
blob: 71f2b9288858ec3afea9413d6c784504a0cda2ec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/perl -w
use strict;
use utf8;
# Builds a word-cost dictionary from text on STDIN.
#
# Reads UTF-8 text from STDIN, counts lowercased word tokens, and prints one
# "word cost" line per distinct word to STDOUT, where
#
#   cost = log(total tokens) - log(count of word)
#
# i.e. the negative natural log of the word's relative frequency.  Lines are
# ordered by descending count; equal counts are ordered alphabetically so the
# output is reproducible from run to run (hash key order is not).
# Input that yields no tokens produces no output.

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# tokenize($line) -> list of lowercased word tokens found in one input line.
# Runs of the listed punctuation characters and ASCII digits act as
# separators, the same as whitespace.
sub tokenize {
  my ($line) = @_;
  chomp $line;
  $line =~ s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
  # split yields an empty leading field when the line starts with a
  # separator; grep drops it so callers only ever see real tokens.
  return grep { length($_) > 0 } split /\s+/, lc($line);
}

# ranked_words(\%count_of) -> words sorted by descending count, with ties
# broken by ascending string order.
sub ranked_words {
  my ($count_of) = @_;
  return sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
         keys %$count_of;
}

my %d;      # word => number of occurrences
my $z = 0;  # total number of tokens seen
while (my $line = <STDIN>) {
  for my $w (tokenize($line)) {
    $d{$w}++;
    $z++;
  }
}

# log(0) is a fatal error in Perl, so only compute costs when at least one
# token was read.
if ($z > 0) {
  my $lz = log($z);
  for my $w (ranked_words(\%d)) {
    my $c = $lz - log($d{$w});
    print "$w $c\n";
  }
}