diff options
author | Avneesh Saluja <asaluja@gmail.com> | 2013-03-28 18:28:16 -0700 |
---|---|---|
committer | Avneesh Saluja <asaluja@gmail.com> | 2013-03-28 18:28:16 -0700 |
commit | 5b8253e0e1f1393a509fb9975ba8c1347af758ed (patch) | |
tree | 1790470b1d07a0b4973ebce19192e896566ea60b /compound-split/make-dict.pl | |
parent | 2389a5a8a43dda87c355579838559515b0428421 (diff) | |
parent | b203f8c5dc8cff1b9c9c2073832b248fcad0765a (diff) |
fixed conflicts
Diffstat (limited to 'compound-split/make-dict.pl')
-rwxr-xr-x | compound-split/make-dict.pl | 24 |
1 files changed, 24 insertions, 0 deletions
#!/usr/bin/perl
# Source: compound-split/make-dict.pl (new file, mode 100755, blob 71f2b928)
#
# Build a unigram dictionary from raw text.
#
# Reads text on STDIN (decoded with the :utf8 layer), replaces every run of
# punctuation and ASCII digits with a single space, lowercases the line, and
# counts the whitespace-separated tokens.  Writes one "word cost" line per
# distinct token to STDOUT, most frequent first, where
#
#     cost = ln(total token count) - ln(count of this token)
#
# i.e. the negative natural log of the token's relative frequency.
#
# Usage: make-dict.pl < corpus.txt > dict.txt
use strict;
use warnings;
use utf8;    # the punctuation class below contains literal non-ASCII characters

# Split one line of text into lowercased tokens.
#   $line   - a single input line (a trailing newline is allowed)
# Returns the list of non-empty tokens, in order of appearance.
sub _tokenize {
    my ($line) = @_;
    chomp $line;

    # Punctuation and digits act as token separators, not token content.
    $line =~ s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;

    # split /\s+/ yields an empty leading field when the line starts with
    # whitespace; grep drops it so it is never counted as a word.
    return grep { length $_ } split /\s+/, lc($line);
}

# Count every token readable from a filehandle.
#   $fh     - filehandle to read lines from
# Returns (\%count_of, $total): per-token counts and the overall token count.
sub _count_words {
    my ($fh) = @_;
    my %count_of;
    my $total = 0;

    while (my $line = <$fh>) {
        for my $word (_tokenize($line)) {
            $count_of{$word}++;
            $total++;
        }
    }
    return (\%count_of, $total);
}

# Render the dictionary as output lines.
#   $count_of - hashref of token => count
#   $total    - total number of tokens counted
# Returns a list of "word cost\n" strings ordered by descending count; tokens
# with equal counts are ordered by string comparison so that the output is
# reproducible from run to run.  Returns an empty list when $total is 0.
sub _dictionary_lines {
    my ($count_of, $total) = @_;

    # log(0) is a fatal error in Perl, so an empty corpus must yield an empty
    # dictionary instead of reaching the logarithm.
    return if $total == 0;

    my $log_total = log($total);
    my @words = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
        keys %{$count_of};

    return map {
        my $cost = $log_total - log($count_of->{$_});
        "$_ $cost\n"
    } @words;
}

# Script entry point: STDIN -> dictionary on STDOUT.
sub main {
    binmode(STDIN,  ":utf8");
    binmode(STDOUT, ":utf8");

    my ($count_of, $total) = _count_words(\*STDIN);
    print STDOUT _dictionary_lines($count_of, $total);
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. with require) without consuming STDIN.
main() unless caller;