blob: 13a40a310f19230be95dac5b9112d994119e70e4 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
# Term-frequency / inverse-document-frequency helpers.
#
# All methods are module-level and can be called as TFIDF.tf(...) or
# TFIDF::tf(...).
module TFIDF
  # Counts how often each element occurs in +array+.
  #
  # array     - enumerable of terms (anything responding to #each).
  # stopwords - collection of terms to leave out of the result; membership
  #             is tested with #include?.
  #
  # Returns a Hash mapping each non-stopword term to its raw frequency as a
  # Float, in order of first occurrence. Looking up a missing term yields 0.
  def self.tf(array, stopwords = [])
    counts = Hash.new(0)
    # Single counting pass instead of one Array#count scan per unique term.
    array.each { |term| counts[term] += 1.0 }
    # Filter once per distinct term; delete_if returns the hash itself.
    counts.delete_if { |term, _freq| stopwords.include?(term) }
  end

  # Smooths the raw frequencies produced by tf() in place:
  #
  #   value = a + (1 - a) * (value / max_value)
  #
  # hash - Hash of term => frequency; it is modified and returned.
  # a    - smoothing term (default 0.4).
  #
  # An empty hash is returned unchanged. Assumes the largest frequency is
  # non-zero; otherwise the division does not produce a usable number.
  def self.ntf(hash, a = 0.4)
    max = hash.values.max.to_f
    # Only values of existing keys are reassigned, which is safe while
    # iterating; each_pair returns the receiver.
    hash.each_pair do |term, freq|
      hash[term] = a + (1 - a) * (freq / max)
    end
  end

  # Computes the inverse document frequency log(N / df) of every term, where
  # N is the number of documents and df is the number of documents that
  # contain the term at least once.
  #
  # list_of_hashes - the corpus: either a Hash of document id => document or
  #                  a plain Array of documents. Each document may be a
  #                  Hash keyed by term (e.g. the result of tf()), an Array
  #                  of terms (nested arrays are flattened), or a single
  #                  term.
  #
  # Returns a Hash of term => idf (natural logarithm), in order of first
  # appearance in the corpus.
  def self.idf(list_of_hashes)
    documents = list_of_hashes.respond_to?(:values) ? list_of_hashes.values : list_of_hashes.to_a
    n = documents.size.to_f

    document_frequency = Hash.new(0)
    documents.each do |document|
      terms = document.respond_to?(:keys) ? document.keys : Array(document).flatten
      # uniq: a term counts once per document, however often it repeats.
      terms.uniq.each { |term| document_frequency[term] += 1 }
    end

    idf = {}
    document_frequency.each { |term, df| idf[term] = Math.log(n / df) }
    idf
  end
end # module
|