blob: 84d55a5cf597ec165962be6f360806967898c085 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# Builds a raw term-frequency table for an array-like object.
#
# array     - collection of terms; only #each is required
# stopwords - terms to leave out of the result (default: none)
#
# Returns a Hash mapping each non-stopword term to its number of
# occurrences as a Float, keyed in order of first appearance.
# Looking up a term that is not in the table yields 0.
# Terms are grouped by hash equality (eql?), as Hash keys always are.
def tf array, stopwords=[]
  counts = Hash.new(0)
  # one pass over the input instead of one array.count per distinct term
  array.each { |term| counts[term] += 1.0 }
  # filter afterwards so every distinct term is checked only once;
  # delete_if works in place, keeps the default of 0 and returns the hash
  counts.delete_if { |term, _| stopwords.include? term }
end
# Normalizes the raw frequencies produced by tf() in place.
#
# hash - term => raw frequency; its values are overwritten
# a    - smoothing term (default 0.4)
#
# Every value v becomes a + (1 - a) * (v / largest value).
# Returns the same (modified) hash.
def ntf hash, a=0.4
  peak = hash.values.max.to_f
  scale = 1 - a
  # only existing keys are reassigned, which is safe while iterating
  hash.each_key do |term|
    hash[term] = a + scale * (hash[term] / peak)
  end
end
# Returns the inverse document frequency of every term in a vocabulary.
#
# list_of_hashes - despite the name, this must respond to #values and
#                  #size (e.g. a Hash of document id => list of terms);
#                  each value is treated as one document
#
# Returns a Hash term => Math.log(n / df), where n is the number of
# documents and df is the number of documents containing the term.
# Terms appear in order of first occurrence.
def idf list_of_hashes
  n = list_of_hashes.size.to_f
  df = Hash.new(0)
  list_of_hashes.values.each { |doc|
    # [doc].flatten accepts a bare term, a flat list or a nested list;
    # uniq makes a term repeated inside one document count only once,
    # so df is a document count rather than a total occurrence count
    [doc].flatten.uniq.each { |term| df[term] += 1 }
  }
  idf = {}
  df.each_pair { |term, count| idf[term] = Math.log(n/count) }
  return idf
end
|