summary refs log tree commit diff
path: root/word-aligner/extract_vocab.pl
diff options
context:
space:
mode:
author Chris Dyer <redpony@gmail.com> 2010-01-29 15:56:59 +0000
committer Chris Dyer <redpony@gmail.com> 2010-01-29 15:56:59 +0000
commit da222df300e4f87ad185a7decbf119ad56aa34e0 (patch)
tree 1137deefefd28b1a89f6b2b339883801cc12cb29 /word-aligner/extract_vocab.pl
parent ee4383b3bc67e2d8ce113fce716050dc2e1b8572 (diff)
word aligner checkin
Diffstat (limited to 'word-aligner/extract_vocab.pl')
-rwxr-xr-x word-aligner/extract_vocab.pl 20
1 files changed, 20 insertions, 0 deletions
diff --git a/word-aligner/extract_vocab.pl b/word-aligner/extract_vocab.pl
new file mode 100755
index 00000000..070d4202
--- /dev/null
+++ b/word-aligner/extract_vocab.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Reads whitespace-separated text from the files named on the command line
+# (or from STDIN when none are given) and prints every distinct word to
+# STDOUT, one per line, most frequent first.  A progress message and the
+# final type/token counts are written to STDERR.
+
+print STDERR "Extracting vocabulary...\n";
+my %dict = ();    # word => number of occurrences seen so far
+my $wc = 0;       # total number of tokens read
+while (my $line = <>) {
+    chomp $line;
+    # split ' ' (awk mode) ignores leading whitespace.  split /\s+/ would
+    # return an empty first field for a line that starts with whitespace,
+    # which counted "" as a token and emitted a blank vocabulary entry.
+    for my $word (split ' ', $line) {
+        $wc++;
+        $dict{$word}++;
+    }
+}
+
+my $tc = 0;       # number of distinct words (types) printed
+# Descending frequency; ties are broken by the word itself so the output
+# does not depend on hash iteration order.
+for my $word (sort { $dict{$b} <=> $dict{$a} || $a cmp $b } keys %dict) {
+    print "$word\n";
+    $tc++;
+}
+
+print STDERR "$tc types / $wc tokens\n";
+