summary refs log tree commit diff
path: root/lib/nlp_ruby/tfidf.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/nlp_ruby/tfidf.rb')
-rw-r--r-- lib/nlp_ruby/tfidf.rb 32
1 files changed, 32 insertions, 0 deletions
diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb
new file mode 100644
index 0000000..84d55a5
--- /dev/null
+++ b/lib/nlp_ruby/tfidf.rb
@@ -0,0 +1,32 @@
+# Computes raw term frequencies for a sequence of tokens.
+#
+# array     - enumerable of tokens (anything responding to #each)
+# stopwords - tokens to leave out of the result (default: none)
+#
+# Returns a Hash mapping each non-stopword token to its number of
+# occurrences as a Float. Keys appear in order of first occurrence and
+# the hash's default value is 0, so looking up an unseen token gives 0.
+def tf array, stopwords=[]
+  v = {}; v.default = 0
+  # single pass: the default of 0 lets unseen keys be incremented
+  # directly instead of re-counting the whole input per unique token
+  array.each { |i|
+    next if stopwords.include? i
+    v[i] += 1.0
+  }
+  return v
+end
+
+# Smooths the raw frequencies produced by tf() in place.
+#
+# hash - Hash of term => raw frequency; its values are overwritten
+# a    - smoothing term (default 0.4)
+#
+# Each value v becomes a + (1-a)*(v/max), where max is the largest value
+# in the hash. Returns the same (mutated) hash.
+def ntf hash, a=0.4
+  largest = hash.values.max.to_f
+  scale = 1-a
+  hash.each_key do |term|
+    hash[term] = a + scale*(hash[term]/largest)
+  end
+end
+
+# Computes the inverse document frequency of every term in a collection.
+#
+# list_of_hashes - the documents: either an Array of documents or a Hash
+#                  whose values are the documents. Each document is a
+#                  Hash keyed by term (e.g. the result of tf()) or an
+#                  array of terms.
+#
+# Returns a Hash mapping each term to Math.log(n/df), where n is the
+# number of documents and df the number of documents containing the
+# term. Keys appear in order of first occurrence.
+def idf list_of_hashes
+  # accept a plain list of documents as well as a hash of id => document
+  docs = list_of_hashes.respond_to?(:values) ? list_of_hashes.values : list_of_hashes
+  n = docs.size.to_f
+  df = {}; df.default = 0
+  docs.each { |doc|
+    # a document counts at most once per term, however often it occurs
+    terms = doc.respond_to?(:keys) ? doc.keys : doc.uniq
+    terms.each { |i| df[i] += 1 }
+  }
+  idf = {}
+  df.each_pair { |i,count| idf[i] = Math.log(n/count) }
+  return idf
+end
+