diff options
| author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-14 20:33:51 -0500 | 
|---|---|---|
| committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-14 20:33:51 -0500 | 
| commit | f8d9ff4aaeb1d1f773bacfe9ee75d1d1778ec26b (patch) | |
| tree | cfd9cd1e19e3fa33888626c204a4e0b73ca2edc4 /compound-split | |
| parent | df5b25f73c12ef03482bd902ee0155a56789e6b9 (diff) | |
major mert clean up, stuff for simple system demo
Diffstat (limited to 'compound-split')
| -rw-r--r-- | compound-split/README.md (renamed from compound-split/README) | 0 | ||||
| -rwxr-xr-x | compound-split/make-dict.pl | 24 | 
2 files changed, 24 insertions, 0 deletions
#!/usr/bin/perl
#
# compound-split/make-dict.pl
#
# Provenance: added as a new file (mode 100755) in commit
# f8d9ff4aaeb1d1f773bacfe9ee75d1d1778ec26b. The same commit renames
# compound-split/README to compound-split/README.md without changing
# its content.
#
# Builds a unigram dictionary from UTF-8 text on STDIN.
#
# Every input line has runs of punctuation, symbols and ASCII digits
# replaced by a space, is lowercased, and is split on whitespace.  Each
# resulting word is counted.  For every distinct word one line is written
# to STDOUT:
#
#     <word> <cost>
#
# where cost = log(total number of words) - log(count of this word),
# using the natural logarithm.  Lines are ordered by descending count;
# words with equal counts are ordered by string comparison so that the
# output is reproducible from run to run.
#
# If the input contains no words at all, a warning is printed to STDERR
# and nothing is written to STDOUT.
use strict;
use warnings;
use utf8;    # the separator class in tokenize() contains literal UTF-8

# tokenize($line) -> list of lowercase words found in $line.
#
# Runs of the listed punctuation/symbol characters and of the digits 0-9
# act as separators.  Returns an empty list when $line has no words.
sub tokenize {
    my ($line) = @_;

    $line =~ s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;

    # A leading separator leaves a leading space, which makes split
    # produce an empty first field; grep drops it.
    return grep { length } split /\s+/, lc $line;
}

# word_costs(\%count_of, $total) -> list of [ $word, $cost ] pairs.
#
# $count_of maps each word to its (positive) number of occurrences and
# $total is the sum of all counts.  The pairs are ordered by descending
# count, ties broken by string comparison of the words.  Returns an
# empty list when $total is not positive, because log(0) is a fatal
# error in Perl.
sub word_costs {
    my ($count_of, $total) = @_;

    return if $total <= 0;

    my $log_total = log($total);
    my @ranked    = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
                    keys %{$count_of};

    return map { [ $_, $log_total - log($count_of->{$_}) ] } @ranked;
}

# main() reads STDIN, counts words, and prints the dictionary to STDOUT.
sub main {
    # :encoding(UTF-8) validates the byte stream; plain :utf8 does not.
    binmode STDIN,  ':encoding(UTF-8)';
    binmode STDOUT, ':encoding(UTF-8)';

    my %count_of;
    my $total = 0;

    while (my $line = <STDIN>) {
        chomp $line;
        for my $word (tokenize($line)) {
            $count_of{$word}++;
            $total++;
        }
    }

    if ($total == 0) {
        warn "make-dict: no words found on STDIN; nothing written\n";
        return;
    }

    for my $entry (word_costs(\%count_of, $total)) {
        my ($word, $cost) = @{$entry};
        print "$word $cost\n";
    }

    return;
}

# Run as a script, but stay quiet when loaded from another file.
main() unless caller;
