summary | refs | log | tree | commit | diff
path: root/compound-split
diff options
context:
space:
mode:
author Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-14 20:33:51 -0500
committer Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-14 20:33:51 -0500
commit f8d9ff4aaeb1d1f773bacfe9ee75d1d1778ec26b (patch)
tree cfd9cd1e19e3fa33888626c204a4e0b73ca2edc4 /compound-split
parent df5b25f73c12ef03482bd902ee0155a56789e6b9 (diff)
major mert clean up, stuff for simple system demo
Diffstat (limited to 'compound-split')
-rw-r--r-- compound-split/README.md (renamed from compound-split/README) 0
-rwxr-xr-x compound-split/make-dict.pl 24
2 files changed, 24 insertions, 0 deletions
diff --git a/compound-split/README b/compound-split/README.md
index b7491007..b7491007 100644
--- a/compound-split/README
+++ b/compound-split/README.md
diff --git a/compound-split/make-dict.pl b/compound-split/make-dict.pl
new file mode 100755
index 00000000..71f2b928
--- /dev/null
+++ b/compound-split/make-dict.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl -w
+# Reads UTF-8 text on STDIN and prints "word cost" for each distinct
+# lowercased token, where cost = log(total tokens) - log(token count).
+use strict;
+use utf8;
+my %d;
+my $z = 0;
+binmode($_, ":utf8") for \*STDIN, \*STDOUT;
+while(<STDIN>) {
+  chomp;
+  # Punctuation and digits only separate tokens; they are never counted.
+  s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
+  # grep drops the empty field that split yields for leading whitespace.
+  my @words = grep { length $_ } split /\s+/, lc $_;
+  $d{$_}++ for @words;
+  $z += @words;
+}
+exit 0 unless $z;  # no tokens: log(0) is a fatal error, so print nothing
+my $lz = log($z);
+# Most frequent first; ties broken by word so the order is reproducible.
+for my $w (sort {$d{$b} <=> $d{$a} or $a cmp $b} keys %d) {
+  print "$w ", $lz-log($d{$w}), "\n";
+}
+