summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2016-04-12 10:59:34 +0200
committer: Patrick Simianer <p@simianer.de> 2016-04-12 10:59:34 +0200
commit: 833d84354b9c57068723e9d7a2e87a409eddd329 (patch)
tree: 8e9d55934ef666f1acd0bcb1317943ff80598e97 /corpus
parent: d0613843f2ce5628aa6728f3672d59877ef85833 (diff)
parent: ee4f3c5581e43510d98de1274c6c1c2984c87faf (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/corpus-stats.pl 50
1 files changed, 50 insertions, 0 deletions
diff --git a/corpus/corpus-stats.pl b/corpus/corpus-stats.pl
new file mode 100755
index 00000000..0bbd49b4
--- /dev/null
+++ b/corpus/corpus-stats.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+# Print summary statistics (lines, tokens, types, singletons, 25 most
+# frequent types) for a whitespace-tokenized monolingual corpus from ARGV/STDIN.
+use strict;
+
+my $f = <>;
+# Only the first line is inspected; guard against undef on empty input.
+if (defined $f && $f =~ / \|\|\| /) {
+    die "This script is only valid for monolingual corpora, but file contains |||\n";
+}
+
+my %d;       # type => number of occurrences
+my $tc = 0;  # tokens seen so far
+my $lc = 0;  # lines seen so far
+# Loop on definedness, not truth: a last line "0" with no newline is false.
+while (defined $f) {
+    $lc++;
+    # split ' ' skips leading whitespace, so an indented line does not
+    # contribute an empty-string token the way split /\s+/ does.
+    for my $t (split ' ', $f) {
+        $d{$t}++;
+        $tc++;
+    }
+    $f = <>;
+}
+
+my $types = scalar keys %d;
+# NOTE(review): labelled "Type-tok ratio" below but computed as tokens/types.
+# Report 0 for an empty corpus rather than dying on division by zero.
+my $ttr = $types ? $tc / $types : 0;
+# Most frequent first; equal counts are ordered by name for stable output.
+my @mfts = sort { $d{$b} <=> $d{$a} || $a cmp $b } keys %d;
+splice @mfts, 25 if @mfts > 25;
+my $sing = grep { $_ == 1 } values %d;  # types that occur exactly once
+my $stypes = sqrt($types);
+
+print <<EOT;
+CORPUS STATISTICS
+
+ Lines: $lc
+ Tokens: $tc
+ Types: $types
+ sqrt(types): $stypes
+ Type-tok ratio: $ttr
+ Singletons: $sing
+
+Most freq types: @mfts
+
+EOT
+