diff options
Diffstat (limited to 'corpus/corpus-stats.pl')
-rwxr-xr-x | corpus/corpus-stats.pl | 50 |
1 files changed, 50 insertions, 0 deletions
#!/usr/bin/perl -w
use strict;
use warnings;

# corpus-stats.pl -- print summary statistics for a monolingual corpus.
#
# Usage: corpus-stats.pl [FILE ...]      (reads STDIN when no FILE is given)
#
# Each input line is split on whitespace into tokens.  The report lists the
# number of lines, tokens and distinct types, sqrt(types), the ratio
# tokens / types, the number of singleton types (seen exactly once) and the
# 25 most frequent types.
#
# The script refuses to run on parallel corpora: if the first line contains
# the field separator " ||| " it dies with an explanatory message.

main() unless caller;

# Read the corpus from the files named in @ARGV (or STDIN) and print the report.
sub main {
    my ($count_of, $line_count, $token_count)
        = collect_counts(sub { scalar(<>) });
    my $stats = summarize($count_of, $line_count, $token_count);
    print format_report($stats);
    return;
}

# Count lines, tokens and per-type frequencies.
#
#   $next_line - code ref returning the next input line, or undef at end of
#                input.
#
# Returns ($count_of, $line_count, $token_count), where $count_of is a hash
# ref mapping each type to its frequency.
# Dies if the first line looks like a parallel corpus (contains " ||| ").
sub collect_counts {
    my ($next_line) = @_;

    my %count_of;
    my $line_count  = 0;
    my $token_count = 0;

    # Test definedness, not truth: a final line consisting of "0" without a
    # trailing newline is false but is still a line of the corpus.
    while (defined(my $line = $next_line->())) {
        # Only the first line is inspected for the parallel-corpus separator.
        if ($line_count == 0 && $line =~ / \|\|\| /) {
            die "This script is only valid for monolingual corpora, but file contains |||\n";
        }
        $line_count++;
        chomp $line;

        # split ' ' skips leading whitespace, so an indented line does not
        # contribute a bogus empty-string token.
        for my $token (split ' ', $line) {
            $count_of{$token}++;
            $token_count++;
        }
    }

    return (\%count_of, $line_count, $token_count);
}

# Derive the reported statistics from the raw counts.
#
#   $count_of    - hash ref: type => frequency
#   $line_count  - number of lines read
#   $token_count - number of tokens read
#   $top_n       - optional; how many most-frequent types to list (default 25)
#
# Returns a hash ref with keys lines, tokens, types, sqrt_types, ratio,
# singletons and top (array ref of the most frequent types).
sub summarize {
    my ($count_of, $line_count, $token_count, $top_n) = @_;
    $top_n = 25 unless defined $top_n;

    my $types = scalar keys %{$count_of};

    # NOTE: although labelled "Type-tok ratio" in the report, the value has
    # always been tokens / types; that is kept for compatibility.
    # An empty corpus has no types, so report 0 instead of dividing by zero.
    my $ratio = $types ? $token_count / $types : 0;

    # Most frequent first; ties are broken alphabetically so the output is
    # reproducible regardless of hash ordering.
    my @by_freq = sort {
        $count_of->{$b} <=> $count_of->{$a} or $a cmp $b
    } keys %{$count_of};
    splice @by_freq, $top_n if @by_freq > $top_n;

    my $singletons = grep { $count_of->{$_} == 1 } keys %{$count_of};

    return {
        lines      => $line_count,
        tokens     => $token_count,
        types      => $types,
        sqrt_types => sqrt($types),
        ratio      => $ratio,
        singletons => $singletons,
        top        => \@by_freq,
    };
}

# Render the statistics hash produced by summarize() as the text report.
sub format_report {
    my ($stats) = @_;
    my @top = @{ $stats->{top} };

    return <<EOT;
CORPUS STATISTICS

 Lines: $stats->{lines}
 Tokens: $stats->{tokens}
 Types: $stats->{types}
 sqrt(types): $stats->{sqrt_types}
 Type-tok ratio: $stats->{ratio}
 Singletons: $stats->{singletons}

Most freq types: @top

EOT
}