diff options
author | Patrick Simianer <p@simianer.de> | 2014-01-29 19:22:56 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-01-29 19:22:56 +0100 |
commit | d9d72e06db07087aa54401fae8b259f0c4ccd649 (patch) | |
tree | 97f0444314c40d2894ac0892d5559101eda01acf /lib/nlp_ruby/tfidf.rb | |
parent | 22644ed1365e566c8bf806bfff4ecd43c46ce089 (diff) |
first usable version, name change => nlp_ruby
Diffstat (limited to 'lib/nlp_ruby/tfidf.rb')
-rw-r--r-- | lib/nlp_ruby/tfidf.rb | 32 |
1 files changed, 32 insertions, 0 deletions
# lib/nlp_ruby/tfidf.rb

# Returns a hash mapping every distinct element of +array+ to its raw
# frequency, i.e. the number of times it occurs, as a Float.
#
# array     - any object that yields its elements via #each (e.g. an Array
#             of tokens).
# stopwords - collection of elements to leave out of the result; only
#             #include? is called on it.
#
# Keys appear in order of first occurrence. Looking up a key that is not
# present yields 0 (the hash's default value).
def tf(array, stopwords = [])
  counts = Hash.new(0)
  # One counting pass instead of a full array.count scan per distinct element.
  array.each { |token| counts[token] += 1.0 }
  # Filter afterwards so that stopwords is consulted once per distinct token.
  counts.delete_if { |token, _| stopwords.include?(token) }
end

# Smooths the raw frequencies produced by tf() in place: every value v is
# replaced by a + (1 - a) * (v / max), where max is the largest value in
# +hash+.
#
# hash - frequency hash as returned by tf(); it is modified in place.
# a    - smoothing term (default 0.4).
#
# Returns the same (mutated) hash. An empty hash is returned unchanged.
def ntf(hash, a = 0.4)
  max = hash.values.max.to_f
  hash.each_pair do |key, freq|
    # With max == 0 the division would produce NaN (0.0 / 0.0); treat the
    # relative frequency as 0 so the result is just the smoothing term.
    ratio = max.zero? ? 0.0 : freq / max
    hash[key] = a + (1 - a) * ratio
  end
end

# Returns the inverse document frequency log(n / df) for every term in the
# vocabulary, where n is the number of documents and df is the number of
# documents that contain the term.
#
# list_of_hashes - the document collection. Either a Hash (its values are the
#                  documents) or an Array-like list of documents. Each
#                  document is either a Hash keyed by term (such as the
#                  result of tf()) or a, possibly nested, Array of terms.
#
# Returns a Hash of term => Float, keyed in order of first occurrence.
# An empty collection yields an empty hash.
def idf(list_of_hashes)
  docs = list_of_hashes.respond_to?(:values) ? list_of_hashes.values : list_of_hashes.to_a
  n = docs.size.to_f

  # Document frequency: each document contributes at most 1 per term, so
  # repeated tokens inside one document cannot push df above n.
  df = Hash.new(0)
  docs.each do |doc|
    terms = doc.respond_to?(:keys) ? doc.keys : Array(doc).flatten.uniq
    terms.each { |term| df[term] += 1 }
  end

  df.each_with_object({}) do |(term, count), scores|
    scores[term] = Math.log(n / count)
  end
end