diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
commit | dfbc278c1057555fda9312291c8024049e00b7d8 (patch) | |
tree | e922651d48b1c9f73857f0dabd31c55a3ce8a74b /gi/pf/make-freq-bins.pl | |
parent | 289f96779e665ba24adca3461a624c68aa37bd99 (diff) |
frequency-based binning
Diffstat (limited to 'gi/pf/make-freq-bins.pl')
-rwxr-xr-x | gi/pf/make-freq-bins.pl | 26 |
1 files changed, 26 insertions, 0 deletions
#!/usr/bin/perl -w
use strict;

# gi/pf/make-freq-bins.pl -- frequency-based binning of word types.
#
# Reads whitespace-tokenized text from the files named on the command line
# (or from STDIN when none are given), counts how often each distinct word
# occurs, and prints one line per word in the form
#
#     <word> <bin>
#
# ordered from most to least frequent.  <bin> is a non-negative integer
# computed from the word's relative frequency p = count / total_tokens:
#
#     bin = max(0, -int(log_BASE(p) + CUTOFF))
#
# so every word with p > BASE**-(CUTOFF+1) lands in bin 0, and each further
# bin covers words that are up to BASE times rarer than the previous one.

my $BASE   = 6;    # base of the logarithm used to space the bins
my $CUTOFF = 3;    # offset added to log_BASE(p) before truncation

# Add the tokens of one input line to the running counts.
#   $count_of : hash ref mapping word -> occurrences so far (updated in place)
#   $line     : raw input line (trailing newline allowed)
# Returns the number of tokens found on the line.
sub _tally_line {
    my ($count_of, $line) = @_;

    # split ' ' (the awk-style special case) skips leading whitespace, so an
    # indented line does not produce a spurious empty-string "word" the way
    # split /\s+/ does.  It also treats the trailing newline as whitespace,
    # which makes a separate chomp unnecessary.
    my @words = split ' ', $line;
    $count_of->{$_}++ for @words;
    return scalar @words;
}

# Map a word's count to its frequency bin.
#   $count : occurrences of the word (must be > 0)
#   $total : total number of tokens seen (must be > 0)
# Returns the bin number, an integer >= 0.
sub _freq_bin {
    my ($count, $total) = @_;
    my $bin = -int(log($count / $total) / log($BASE) + $CUTOFF);

    # Frequent words give a negative (or negative-zero) value; clamp to 0.
    return $bin > 0 ? $bin : 0;
}

# Count all input tokens, then print "<word> <bin>" for every distinct word.
sub main {
    my %count_of;
    my $total = 0;
    while (my $line = <>) {
        $total += _tally_line(\%count_of, $line);
    }

    # Highest count first.  Hash key order differs between runs, so ties are
    # broken alphabetically to keep the output reproducible.
    my @vocab = sort { $count_of{$b} <=> $count_of{$a} or $a cmp $b }
                keys %count_of;

    for my $word (@vocab) {
        my $bin = _freq_bin($count_of{$word}, $total);
        print "$word $bin\n";
    }
    return;
}

# Run only when executed as a script, not when loaded by another file.
main() unless caller;