blob: 87938aede029badd5c8ad86dd93c8e93bce8f98f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
#!/usr/bin/env ruby
require 'zipf'
# Reads segments (one per line) from STDIN, computes an n-gram repetition
# score over them and prints it as a percentage rounded to two decimals.
#
# Segments are grouped into consecutive, non-overlapping windows; a window is
# closed once it holds at least `window_tokens` whitespace-separated tokens.
# For every n-gram order 1..max_order and every window, the script counts
#   * the distinct n-grams that occur more than once in the window, and
#   * all distinct n-grams in the window,
# sums both over all windows, and takes the geometric mean of the per-order
# ratios.
#
# `ngrams` is not defined in this script; it is presumably provided by the
# `zipf` gem required at the top of the file. The indexing below relies on it
# yielding token arrays of 1..max_order elements -- TODO confirm.
window_tokens = 1000
max_order = 4

windows = []
cur = []
cur_sz = 0
while (line = STDIN.gets)
  cur << line.strip
  cur_sz += cur.last.split.size
  # Close the window as soon as it is full, so that a window completed by the
  # very last input line is scored too instead of being silently dropped.
  next if cur_sz < window_tokens

  windows << cur
  cur = []
  cur_sz = 0
end
# NOTE(review): segments still in `cur` at this point form an incomplete window
# and are not scored. With no complete window at all, every ratio below is
# 0.0 / 0.0 and the score is not a number -- confirm this is intended.

enums = [0.0] * max_order
denoms = [0.0] * max_order
windows.each do |w|
  # One counter table per n-gram order. The block form of Array.new builds a
  # separate Hash for each slot; `[{}] * 4` would repeat a single shared Hash
  # and pool the counts of all orders together.
  ng_by_n = Array.new(max_order) { Hash.new(0) }
  w.each do |seg|
    ngrams(seg, max_order) { |ng| ng_by_n[ng.size - 1][ng] += 1 }
  end
  ng_by_n.each_with_index do |counts, j|
    # Numerator: distinct n-grams seen more than once (i.e. non-singletons).
    enums[j] += counts.count { |_, v| v > 1 }
    # Denominator: all distinct n-grams of this order in the window.
    denoms[j] += counts.size.to_f
  end
end

# Geometric mean of the per-order ratios, reported as a percentage.
rr = 1.0
enums.each_with_index { |e, j| rr *= e / denoms[j] }
score = (rr**(1.0 / max_order)) * 100
puts score.round(2)
|