diff options
author | Chris Dyer <cdyer@moto.clab.cs.cmu.edu> | 2016-01-03 16:17:35 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@moto.clab.cs.cmu.edu> | 2016-01-03 16:17:35 -0500 |
commit | 899032c9728c7a1c9c97f624ba0cc49b0814277b (patch) | |
tree | f58d13dd6f2762eb46be30f9fc4d46873a2461b7 | |
parent | 4b65af355930e2a6e5cdaea729a11a7cc8cbd0c9 (diff) |
corpus stats script
-rwxr-xr-x | corpus/corpus-stats.pl | 50 |
1 files changed, 50 insertions, 0 deletions
#!/usr/bin/perl -w
use strict;
use warnings;

# corpus-stats.pl -- print summary statistics for a monolingual corpus.
#
# Usage: corpus-stats.pl [FILE ...]     (reads STDIN when no FILE is given)
#
# Every input line is split on whitespace into tokens. The report lists the
# number of lines, tokens and distinct types, sqrt(types), a token/type
# ratio, the number of singleton types (types seen exactly once) and the
# most frequent types.
#
# The script dies if the first input line contains " ||| ", which is taken
# as the marker of a parallel corpus.

# Maximum number of types shown on the "Most freq types" line.
use constant MAX_FREQ_TYPES => 25;

# Read all lines from the filehandle $fh and return a hash reference with
# the keys: lines, tokens, types, sqrt_types, ratio, singletons and
# most_frequent (array reference, most frequent type first).
# Dies when the first line contains the " ||| " separator.
sub collect_stats {
    my ($fh) = @_;

    my %count_of;
    my $token_count = 0;
    my $line_count  = 0;

    # Test definedness rather than truth so that a final line consisting of
    # "0" without a trailing newline is still counted.
    while (defined(my $line = <$fh>)) {
        $line_count++;

        # Only the first line is inspected for the parallel-corpus marker.
        if ($line_count == 1 && $line =~ / \|\|\| /) {
            die "This script is only valid for monolingual corpora, but file contains |||\n";
        }

        # split ' ' skips leading whitespace (and the trailing newline), so
        # an indented line does not contribute an empty-string token.
        for my $token (split ' ', $line) {
            $count_of{$token}++;
            $token_count++;
        }
    }

    my $type_count = scalar keys %count_of;

    # Break frequency ties alphabetically so the output is reproducible
    # regardless of hash ordering.
    my @most_frequent =
        sort { $count_of{$b} <=> $count_of{$a} or $a cmp $b } keys %count_of;
    splice @most_frequent, MAX_FREQ_TYPES if @most_frequent > MAX_FREQ_TYPES;

    my $singleton_count = grep { $_ == 1 } values %count_of;

    return {
        lines      => $line_count,
        tokens     => $token_count,
        types      => $type_count,
        sqrt_types => sqrt($type_count),
        # NOTE(review): this is tokens / types although the report labels it
        # "Type-tok ratio"; the value is kept as is -- confirm which of the
        # two is intended. Empty input yields 0 instead of dividing by zero.
        ratio         => ($type_count ? $token_count / $type_count : 0),
        singletons    => $singleton_count,
        most_frequent => \@most_frequent,
    };
}

# Render the hash reference produced by collect_stats() as the report text.
sub format_report {
    my ($stats) = @_;

    my @mfts = @{ $stats->{most_frequent} };

    return <<EOT;
CORPUS STATISTICS

 Lines: $stats->{lines}
 Tokens: $stats->{tokens}
 Types: $stats->{types}
 sqrt(types): $stats->{sqrt_types}
 Type-tok ratio: $stats->{ratio}
 Singletons: $stats->{singletons}

Most freq types: @mfts

EOT
}

# Entry point: read the files named on the command line (or STDIN) through
# the magic ARGV handle and print the report to STDOUT.
sub main {
    print format_report(collect_stats(\*ARGV));
    return;
}

# Run only when executed as a script, not when loaded by another file.
main() unless caller;