summaryrefslogtreecommitdiff
path: root/lib/nlp_ruby/tfidf.rb
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-06-16 17:44:07 +0200
committerPatrick Simianer <p@simianer.de>2014-06-16 17:44:07 +0200
commit4059a5d048cb0f72872c98073ef1ce120a30d78c (patch)
tree4fbff0dc62c5ef3deea0ffdec578e3f2c0ed74b6 /lib/nlp_ruby/tfidf.rb
parent912ff6aebcf4f89f9e64b5f59956dbf7d8f624e3 (diff)
renaming to zipf
Diffstat (limited to 'lib/nlp_ruby/tfidf.rb')
-rw-r--r--lib/nlp_ruby/tfidf.rb38
1 files changed, 0 insertions, 38 deletions
diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb
deleted file mode 100644
index 13a40a3..0000000
--- a/lib/nlp_ruby/tfidf.rb
+++ /dev/null
@@ -1,38 +0,0 @@
-module TFIDF
-
-
-# returns key='raw frequency' for an
-# array-like object
-def TFIDF::tf array, stopwords=[]
- v = {}; v.default = 0
- array.uniq.each { |i|
- next if stopwords.include? i
- v[i] = array.count(i).to_f
- }
- return v
-end
-
-# smoothes raw frequencies of tf() in-place
-# a is a smoothing term
-def TFIDF::ntf hash, a=0.4
- max = hash.values.max.to_f
- hash.each_pair { |k,v|
- hash[k] = a + (1-a)*(v/max)
- }
-end
-
-# returns idf value for each word in a vocabulary
-def TFIDF::idf list_of_hashes
- vocab = list_of_hashes.values.flatten.uniq
- n = list_of_hashes.size.to_f
- idf = {}
- vocab.each { |i|
- df = list_of_hashes.values.flatten.count i
- idf[i] = Math.log(n/df)
- }
- return idf
-end
-
-
-end #module
-