diff options
Diffstat (limited to 'lib/nlp_ruby/tfidf.rb')
-rw-r--r-- | lib/nlp_ruby/tfidf.rb | 32 |
1 files changed, 32 insertions, 0 deletions
# lib/nlp_ruby/tfidf.rb

# Returns a hash mapping each element of +array+ to its raw frequency.
#
# array     - an enumerable of terms (anything responding to #each)
# stopwords - terms to leave out of the result (anything responding
#             to #include?); defaults to none
#
# Counts are stored as Floats so later divisions need no conversion.
# Looking up a term that was never counted yields the hash default, 0.
def tf(array, stopwords = [])
  counts = Hash.new(0)
  # Single pass over the input; keys end up in first-occurrence order.
  array.each do |term|
    next if stopwords.include? term
    counts[term] += 1.0
  end
  counts
end

# Smooths the raw frequencies produced by tf() in place:
#
#   value = a + (1 - a) * (value / max_value)
#
# hash - term => raw frequency; it is modified and also returned
# a    - smoothing term (default 0.4)
def ntf(hash, a = 0.4)
  max = hash.values.max.to_f
  hash.each_pair do |term, freq|
    # With an all-zero hash the ratio would be 0/0 (NaN); treat it as 0
    # so every term simply receives the floor value +a+.
    ratio = max.zero? ? 0.0 : freq / max
    hash[term] = a + (1 - a) * ratio
  end
end

# Returns the idf value log(N / df) for each word in the vocabulary,
# where N is the number of documents and df is the number of documents
# that contain the word.
#
# list_of_hashes - the document collection: either a Hash (doc id =>
#                  document) or an Array of documents. A document may
#                  be an Array of terms or a Hash keyed by term, such
#                  as the result of tf().
def idf(list_of_hashes)
  docs = list_of_hashes.respond_to?(:values) ? list_of_hashes.values : list_of_hashes.to_a
  n = docs.size.to_f

  doc_freq = Hash.new(0)
  docs.each do |doc|
    terms = doc.respond_to?(:keys) ? doc.keys : Array(doc).flatten
    # uniq: a term repeated inside one document still counts as one document.
    terms.uniq.each { |term| doc_freq[term] += 1 }
  end

  doc_freq.each_with_object({}) do |(term, df), result|
    result[term] = Math.log(n / df)
  end
end