diff options
author | Patrick Simianer <p@simianer.de> | 2014-06-16 17:44:07 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-06-16 17:44:07 +0200 |
commit | 4059a5d048cb0f72872c98073ef1ce120a30d78c (patch) | |
tree | 4fbff0dc62c5ef3deea0ffdec578e3f2c0ed74b6 /lib/nlp_ruby/tfidf.rb | |
parent | 912ff6aebcf4f89f9e64b5f59956dbf7d8f624e3 (diff) |
renaming to zipf
Diffstat (limited to 'lib/nlp_ruby/tfidf.rb')
-rw-r--r-- | lib/nlp_ruby/tfidf.rb | 38 |
1 files changed, 0 insertions, 38 deletions
# From lib/nlp_ruby/tfidf.rb (blob 13a40a3, mode 100644), the file removed by this commit.

# Term-frequency / inverse-document-frequency helpers.
module TFIDF
  # Counts how often each element occurs in an array-like object.
  #
  # array     - any object responding to #each that yields tokens
  # stopwords - tokens to leave out of the result (anything responding to #include?)
  #
  # Returns a Hash mapping token => raw frequency (Float), in order of first
  # occurrence. Looking up a token that is not present yields 0.
  def self.tf(array, stopwords = [])
    counts = Hash.new(0)
    # Single pass instead of calling array.count once per unique token.
    array.each do |token|
      next if stopwords.include?(token)

      counts[token] += 1.0
    end
    counts
  end

  # Smoothes the raw frequencies produced by tf() in place:
  #   value = a + (1 - a) * (value / max_value)
  #
  # hash - token => frequency Hash; it is modified and returned
  # a    - smoothing term
  #
  # Returns the same Hash object that was passed in.
  def self.ntf(hash, a = 0.4)
    max = hash.values.max.to_f
    hash.each_pair do |token, freq|
      # When every frequency is 0 the ratio would be 0/0 (NaN); treat it as 0.
      ratio = max.zero? ? 0.0 : freq / max
      hash[token] = a + (1 - a) * ratio
    end
    hash
  end

  # Computes the inverse document frequency, log(n / df), for every term of a
  # collection of n documents, where df is the number of documents that
  # contain the term.
  #
  # list_of_hashes - the documents, given either as
  #                  * an Array of Hashes (e.g. results of tf()), whose keys
  #                    are the terms of each document, or
  #                  * a Hash whose values are the documents (token arrays or
  #                    term Hashes); this is the form the previous
  #                    implementation required.
  #
  # Returns a Hash mapping term => idf (Float), in order of first occurrence.
  def self.idf(list_of_hashes)
    documents = list_of_hashes.respond_to?(:values) ? list_of_hashes.values : list_of_hashes.to_a
    n = documents.size.to_f

    # One pass over the collection; each document contributes at most 1 per term.
    df = Hash.new(0)
    documents.each do |doc|
      terms = doc.is_a?(Hash) ? doc.keys : [doc].flatten
      terms.uniq.each { |term| df[term] += 1 }
    end

    idf = {}
    df.each_pair { |term, count| idf[term] = Math.log(n / count) }
    idf
  end
end