diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
commit | a45af4a3704531a8382cd231f6445b3a33b598a3 (patch) | |
tree | cb6be837287be58fcb9834da4118b03dca213962 /gi/pf/make-freq-bins.pl | |
parent | 280d5aa74b6a41f8f6deb5dd374140b7e3ab2703 (diff) |
frequency-based binning
Diffstat (limited to 'gi/pf/make-freq-bins.pl')
-rwxr-xr-x | gi/pf/make-freq-bins.pl | 26 |
1 files changed, 26 insertions, 0 deletions
#!/usr/bin/perl -w
# gi/pf/make-freq-bins.pl -- frequency-based binning of a vocabulary.
#
# Reads text from the files named on the command line (or from STDIN when
# none are given), counts every whitespace-separated token, and prints one
# line per distinct word in the form
#
#     <word> <bin>
#
# Words are listed from most to least frequent; words with equal counts are
# listed in string order so the output is reproducible between runs.
use strict;
use warnings;

# Base of the logarithm applied to a word's relative frequency.  For bins
# >= 1, each bin covers a factor of $BASE in relative frequency.
my $BASE = 6;

# Offset added to the log relative frequency before truncation.  Every word
# whose relative frequency exceeds $BASE ** -($CUTOFF + 1) lands in bin 0.
my $CUTOFF = 3;

# tokenize($line) -> list of words
#
# Splits a line on runs of whitespace.  The special pattern ' ' also skips
# leading whitespace (and the trailing newline), so an indented line never
# produces an empty "word" that would be counted and printed.
sub tokenize {
    my ($line) = @_;
    return split ' ', $line;
}

# freq_bin($count, $total) -> non-negative integer bin
#
# $count is the number of occurrences of one word and $total the number of
# tokens seen overall ($total must be > 0 and $count >= 1).  The bin is
# -int(log_BASE($count / $total) + $CUTOFF), clamped at 0; int() truncates
# toward zero.  Larger bins mean rarer words.
sub freq_bin {
    my ($count, $total) = @_;
    my $bin = -int(log($count / $total) / log($BASE) + $CUTOFF);
    return $bin < 0 ? 0 : $bin;
}

# main() -- count tokens from <> and print "<word> <bin>" for each word.
sub main {
    my %count_of;
    my $total = 0;

    while (my $line = <>) {
        for my $word (tokenize($line)) {
            $count_of{$word}++;
            $total++;
        }
    }

    # Most frequent first; break ties by word so the order does not depend
    # on hash iteration order.
    my @vocab = sort { $count_of{$b} <=> $count_of{$a} or $a cmp $b }
                keys %count_of;

    for my $word (@vocab) {
        my $bin = freq_bin($count_of{$word}, $total);
        print "$word $bin\n";
    }
    return;
}

# Run only when executed as a script, so the helpers can be loaded and
# called on their own.
main() unless caller;

1;