summaryrefslogtreecommitdiff
path: root/lib/nlp_ruby/tfidf.rb
blob: 84d55a5cf597ec165962be6f360806967898c085 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# returns key='raw frequency' for an
# array-like object
def tf array, stopwords=[]
  v = {}; v.default = 0
  array.uniq.each { |i|
   next if stopwords.include? i
   v[i] = array.count(i).to_f
  }
  return v
end

# smoothes raw frequencies of tf() in-place
# a is a smoothing term
def ntf hash, a=0.4
  max = hash.values.max.to_f
  hash.each_pair { |k,v|
    hash[k] = a + (1-a)*(v/max)
  }
end

# returns idf value for each word in a vocabulary
def idf list_of_hashes
  vocab = list_of_hashes.values.flatten.uniq
  n = list_of_hashes.size.to_f
  idf = {}
  vocab.each { |i|
    df = list_of_hashes.values.flatten.count i
    idf[i] = Math.log(n/df)
  }
  return idf
end