diff options
author | Patrick Simianer <p@simianer.de> | 2016-04-12 10:59:34 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2016-04-12 10:59:34 +0200 |
commit | 833d84354b9c57068723e9d7a2e87a409eddd329 (patch) | |
tree | 8e9d55934ef666f1acd0bcb1317943ff80598e97 /corpus | |
parent | d0613843f2ce5628aa6728f3672d59877ef85833 (diff) | |
parent | ee4f3c5581e43510d98de1274c6c1c2984c87faf (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus')
-rwxr-xr-x | corpus/corpus-stats.pl | 50 |
1 files changed, 50 insertions, 0 deletions
#!/usr/bin/perl -w
#
# corpus/corpus-stats.pl - print summary statistics for a monolingual corpus.
#
# Usage: corpus-stats.pl [FILE ...]     (reads STDIN when no FILE is given)
#
# Each input line is split on whitespace into tokens.  The report lists the
# number of lines, tokens, distinct types, sqrt(types), the token/type ratio,
# the number of singleton types (types seen exactly once) and the most
# frequent types.  The script refuses to run on parallel corpora, which are
# recognised by a " ||| " separator on the first line.
use strict;
use warnings;

# How many of the most frequent types are listed in the report.
use constant TOP_N => 25;

# Split one corpus line into tokens and add them to the frequency table.
#   $line - raw input line (a trailing newline is removed)
#   $freq - hash ref mapping type => occurrence count, updated in place
# Returns the number of tokens found on the line.
sub count_tokens {
    my ($line, $freq) = @_;
    chomp $line;

    # split ' ' skips leading whitespace; split /\s+/ would produce a
    # spurious empty first token for lines that start with whitespace.
    my @tokens = split ' ', $line;
    $freq->{$_}++ for @tokens;
    return scalar @tokens;
}

# Read a whole corpus from the given filehandle.
# Dies if the first line contains the parallel-corpus separator " ||| ".
# Returns a hash ref: { lines => N, tokens => N, freq => { type => count } }.
sub read_corpus {
    my ($fh) = @_;
    my %freq;
    my $line_count  = 0;
    my $token_count = 0;

    # Test for definedness, not truth: a final line consisting of "0"
    # without a trailing newline is false but is still valid input.
    while (defined(my $line = <$fh>)) {
        # Only the first line is inspected for the separator.
        if ($line_count == 0 && $line =~ / \|\|\| /) {
            die "This script is only valid for monolingual corpora, but file contains |||\n";
        }
        $line_count++;
        $token_count += count_tokens($line, \%freq);
    }

    return { lines => $line_count, tokens => $token_count, freq => \%freq };
}

# Return up to $limit types ordered by descending frequency.  Ties are
# broken alphabetically so the result does not depend on hash ordering.
sub most_frequent {
    my ($freq, $limit) = @_;
    my @ranked = sort { $freq->{$b} <=> $freq->{$a} or $a cmp $b } keys %$freq;
    splice @ranked, $limit if @ranked > $limit;
    return @ranked;
}

# Build the textual report from the structure returned by read_corpus().
sub format_report {
    my ($stats) = @_;
    my $freq  = $stats->{freq};
    my $lc    = $stats->{lines};
    my $tc    = $stats->{tokens};
    my $types = scalar keys %$freq;

    # NOTE: despite the "Type-tok ratio" label this is tokens per type (the
    # inverse of the conventional type/token ratio); the value is kept as is
    # so existing output stays comparable.  An empty corpus has no types, so
    # guard against dividing by zero.
    my $ttr    = $types ? $tc / $types : 0;
    my @mfts   = most_frequent($freq, TOP_N);
    my $sing   = grep { $_ == 1 } values %$freq;
    my $stypes = sqrt($types);

    return <<EOT;
CORPUS STATISTICS

          Lines: $lc
         Tokens: $tc
          Types: $types
    sqrt(types): $stypes
 Type-tok ratio: $ttr
     Singletons: $sing

Most freq types: @mfts

EOT
}

# Entry point: read the files named on the command line (or STDIN) through
# the magic ARGV handle and print the report to STDOUT.
sub run {
    print format_report(read_corpus(\*ARGV));
    return;
}

# Run only when executed as a script, not when loaded from other code.
run() unless caller;