summaryrefslogtreecommitdiff
path: root/repetition-rate
blob: 87938aede029badd5c8ad86dd93c8e93bce8f98f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env ruby

require 'zipf'

windows = []
cur = []
cur_sz = 0
while line = STDIN.gets
  if cur_sz >= 1000
    windows << cur
    cur = []
    cur_sz = 0 
  end
  cur << line.strip
  cur_sz += cur.last.split.size
end

enums = [0.0]*4
denoms = [0.0]*4
windows.each { |w|
  ng_by_n = [{}]*4
  w.each { |seg|
    ngrams(seg, 4) { |ng|
      if ng_by_n[ng.size-1].has_key? ng
        ng_by_n[ng.size-1][ng] += 1
      else
        ng_by_n[ng.size-1][ng] = 1
      end
    }
  }
  ng_by_n.each_with_index { |ng,j|
    singletons = ng.reject { |k,v| v > 1 }.size
    enums[j] += ng.size - singletons
    denoms[j] += ng.size.to_f
  }
}

rr = 1.0
enums.each_with_index { |i,j|
  rr *= i/denoms[j] 
}

puts ((rr**0.25)*100).round 2