diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/corpus-stats.pl | 50 | 
1 files changed, 50 insertions, 0 deletions
| diff --git a/corpus/corpus-stats.pl b/corpus/corpus-stats.pl new file mode 100755 index 00000000..0bbd49b4 --- /dev/null +++ b/corpus/corpus-stats.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w +use strict; + +my $f = <>; +my $IS_PARALLEL = ($f =~ / \|\|\| /); +if ($IS_PARALLEL) { +  die "This script is only valid for monolingual corpora, but file contains |||\n"; +} + +my %d; +my $tc = 0; +my $lc = 0; +while($f) { +  $lc++; +  chomp $f; +  my @toks = split /\s+/, $f; +  for my $t (@toks) { +    $d{$t}++; +    $tc++; +  } +  $f=<>; +} + +my $types = scalar keys %d; +my $ttr = $tc / $types; +my @mfts; +for my $k (sort {$d{$b} <=> $d{$a}} keys %d) { +  push @mfts, $k; +  last if scalar @mfts > 24; +} +my $sing = 0; +for my $k (keys %d) { +  if ($d{$k} == 1) { $sing++; } +} +my $stypes = sqrt($types); + +print <<EOT; +CORPUS STATISTICS + +          Lines: $lc +         Tokens: $tc +          Types: $types +    sqrt(types): $stypes + Type-tok ratio: $ttr +     Singletons: $sing + +Most freq types: @mfts + +EOT + | 
