summary | refs | log | tree | commit | diff
path: root/word-aligner/extract_vocab.pl
diff options
context:
space:
mode:
Diffstat (limited to 'word-aligner/extract_vocab.pl')
-rwxr-xr-x  word-aligner/extract_vocab.pl  20
1 files changed, 20 insertions, 0 deletions
diff --git a/word-aligner/extract_vocab.pl b/word-aligner/extract_vocab.pl
new file mode 100755
index 00000000..070d4202
--- /dev/null
+++ b/word-aligner/extract_vocab.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+# Reads whitespace-tokenized text from the files named on the command line (or
+# STDIN) and prints one word type per line to STDOUT, most frequent first.
+use strict;
+
+print STDERR "Extracting vocabulary...\n";
+my %dict = ();  # word type => number of occurrences
+my $wc = 0;     # running count of tokens
+while(<>) {
+  # split ' ' skips leading whitespace, so an indented line yields no empty
+  # "word" (split /\s+/ would emit one); it also drops the trailing newline.
+  for my $word (split ' ') { $wc++; $dict{$word}++; }
+}
+
+# Break frequency ties by string order so the output is reproducible.
+my @types = sort { $dict{$b} <=> $dict{$a} or $a cmp $b } keys %dict;
+print "$_\n" for @types;
+my $tc = scalar @types;
+print STDERR "$tc types / $wc tokens\n";
+