summaryrefslogtreecommitdiff
path: root/word-aligner/support/extract_vocab.pl
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2010-02-01 17:38:39 -0500
committerChris Dyer <redpony@gmail.com>2010-02-01 17:38:39 -0500
commitc97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch)
tree3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner/support/extract_vocab.pl
parentda222df300e4f87ad185a7decbf119ad56aa34e0 (diff)
word aligner cleanup, new features
Diffstat (limited to 'word-aligner/support/extract_vocab.pl')
-rwxr-xr-xword-aligner/support/extract_vocab.pl20
1 files changed, 20 insertions, 0 deletions
diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl
new file mode 100755
index 00000000..070d4202
--- /dev/null
+++ b/word-aligner/support/extract_vocab.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+use strict;
+
+print STDERR "Extracting vocabulary...\n";
+my %dict = ();
+my $wc = 0;
+while(<>) {
+ chomp;
+ my @words = split /\s+/;
+ for my $word (@words) { $wc++; $dict{$word}++; }
+}
+
+my $tc = 0;
+for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) {
+ print "$word\n";
+ $tc++;
+}
+
+print STDERR "$tc types / $wc tokens\n";
+