summary | refs | log | tree | commit | diff
path: root/repetition-rate
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2017-11-11 12:04:57 +0100
committer: Patrick Simianer <p@simianer.de> 2017-11-11 12:04:57 +0100
commit: 4bf6ab567b2358122139130dc02932048a2882e8 (patch)
tree: 41df47bf583652f7346338bb91f7a7a34272a73b /repetition-rate
parent: c9c9f14ee768be723013ad850473541fabfdbe13 (diff)
repetition rate
Diffstat (limited to 'repetition-rate')
-rwxr-xr-x repetition-rate 48
1 files changed, 33 insertions, 15 deletions
diff --git a/repetition-rate b/repetition-rate
index b821782..87938ae 100755
--- a/repetition-rate
+++ b/repetition-rate
@@ -2,25 +2,43 @@
require 'zipf'
-ng = [{},{},{},{}]
-
+windows = [] # completed windows; each is an array of stripped input lines
+cur = [] # lines of the window currently being filled
+cur_sz = 0 # whitespace-separated token count of cur
while line = STDIN.gets
- ngrams(line, 4) { |g|
- if ng[g.size-1].has_key? g
- ng[g.size-1][g] += 1
- else
- ng[g.size-1][g] = 1
- end
- }
+ cur << line.strip
+ cur_sz += cur.last.split.size
+ if cur_sz >= 1000 # close the window as soon as it is full, so a full window ending at EOF is kept
+ windows << cur
+ cur = []
+ cur_sz = 0
+ end
end
-rr = 1.0
-ng.each_with_index { |h,j|
- singletons = ng[j].reject { |k,v| v > 1 }.size
- rr *= (ng[j].size - singletons).to_f/ng[j].size.to_f
+enums = [0.0]*4 # per order: summed count of distinct n-grams seen more than once in a window
+denoms = [0.0]*4 # per order: summed count of distinct n-grams in a window
+windows.each { |w|
+ ng_by_n = Array.new(4) { {} } # one count table per order; [{}]*4 would alias a single shared Hash
+ w.each { |seg|
+ ngrams(seg, 4) { |ng| # assumes zipf's ngrams yields n-grams of size 1..4 -- TODO confirm
+ if ng_by_n[ng.size-1].has_key? ng
+ ng_by_n[ng.size-1][ng] += 1
+ else
+ ng_by_n[ng.size-1][ng] = 1
+ end
+ }
+ }
+ ng_by_n.each_with_index { |ng,j|
+ singletons = ng.reject { |k,v| v > 1 }.size
+ enums[j] += ng.size - singletons
+ denoms[j] += ng.size.to_f
+ }
}
-rr = rr**0.25
+rr = 1.0
+enums.each_with_index { |i,j|
+ rr *= i/denoms[j] # 0.0/0.0 gives NaN when no window was completed or an order has no n-grams
+}
-puts rr
+puts ((rr**0.25)*100).round 2