From 899032c9728c7a1c9c97f624ba0cc49b0814277b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 3 Jan 2016 16:17:35 -0500 Subject: corpus stats script --- corpus/corpus-stats.pl | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100755 corpus/corpus-stats.pl diff --git a/corpus/corpus-stats.pl b/corpus/corpus-stats.pl new file mode 100755 index 00000000..0bbd49b4 --- /dev/null +++ b/corpus/corpus-stats.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w +use strict; + +my $f = <>; +my $IS_PARALLEL = ($f =~ / \|\|\| /); +if ($IS_PARALLEL) { + die "This script is only valid for monolingual corpora, but file contains |||\n"; +} + +my %d; +my $tc = 0; +my $lc = 0; +while($f) { + $lc++; + chomp $f; + my @toks = split /\s+/, $f; + for my $t (@toks) { + $d{$t}++; + $tc++; + } + $f=<>; +} + +my $types = scalar keys %d; +my $ttr = $tc / $types; +my @mfts; +for my $k (sort {$d{$b} <=> $d{$a}} keys %d) { + push @mfts, $k; + last if scalar @mfts > 24; +} +my $sing = 0; +for my $k (keys %d) { + if ($d{$k} == 1) { $sing++; } +} +my $stypes = sqrt($types); + +print <