summary | refs | log | tree | commit | diff
path: root/tf-idf
diff options
context:
space:
mode:
Diffstat (limited to 'tf-idf')
-rwxr-xr-x tf-idf 80
1 files changed, 80 insertions, 0 deletions
diff --git a/tf-idf b/tf-idf
new file mode 100755
index 0000000..3edaaf8
--- /dev/null
+++ b/tf-idf
@@ -0,0 +1,80 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# Counts how often each item occurs in a single document.
+#
+# d         - Array of items (words, or whole lines in one-item-per-line mode).
+# stopwords - Array of items to leave out of the result (default: none).
+#
+# Returns a Hash mapping item => raw count as a Float. Keys appear in order
+# of first occurrence in d; looking up an item that is not present yields 0.
+def tf(d, stopwords=[])
+  v = Hash.new(0)
+  # one pass over the document instead of one full d.count per distinct item
+  d.each { |i|
+    next if stopwords.include? i
+    v[i] += 1.0
+  }
+  return v
+end
+
+# Rescales raw frequencies relative to the largest one in the document.
+#
+# w - Hash of item => frequency; it is modified in place.
+# a - constant added to every rescaled value (default: 0.4).
+#
+# Every value v becomes a + (1 - a) * (v / max), where max is the largest
+# value in w. Returns w itself.
+def ntf(w, a=0.4)
+  peak = w.values.max.to_f
+  scale = 1-a
+  # only existing keys are reassigned, so updating w while iterating is safe
+  w.each_key { |term| w[term] = a + scale*(w[term]/peak) }
+  w
+end
+
+# Computes the inverse document frequency of every item in the collection.
+#
+# collection - Hash of document name => Array of items in that document.
+#
+# Returns a Hash mapping item => log(N / df), where N is the number of
+# documents and df is the number of documents that contain the item at least
+# once. Keys appear in order of first occurrence across the collection.
+def idf(collection)
+  n = collection.size.to_f
+  df = Hash.new(0)
+  collection.each_value { |words|
+    # uniq: an item counts once per document, however often it is repeated
+    words.uniq.each { |w| df[w] += 1 }
+  }
+  weights = {}
+  df.each_pair { |w, count| weights[w] = Math.log(n/count) }
+  return weights
+end
+
+# Command-line entry point.
+#
+# Parses the options, reads every document given with --docs, computes the
+# term frequencies of each one (optionally rescaled with ntf and/or weighted
+# by idf) and prints one [filename, weights] pair per line.
+def main
+  opts = Trollop::options do
+    opt :docs, "input files (documents)", :type => :strings, :required => true
+    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
+    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
+    opt :ntf, "length-normalize tf values", :type => :bool
+    opt :idf, "weight tf by idf", :type => :bool
+  end
+
+  stopwords = []
+  if opts[:filter_stopwords]
+    # read the file named on the command line; on each line only the text
+    # before the first '|' is used, and lines that end up empty are skipped
+    stopwords = File.readlines(opts[:filter_stopwords]).map { |line|
+      line.split('|', 2).first.to_s.strip
+    }.reject { |word| word.empty? }
+  end
+
+  docs = {} # fn => [words...]
+  opts[:docs].each { |fn|
+    # File.readlines / File.read close the file again, unlike a bare File.new
+    if opts[:one_item_per_line]
+      raw = File.readlines(fn)
+    else
+      raw = File.read(fn).split(/\s+/)
+    end
+    # blank lines and leading whitespace must not show up as an empty item
+    docs[fn] = raw.map { |item| item.strip }.reject { |item| item.empty? }
+  }
+
+  # must be computed before the word lists in docs are replaced below
+  idf_values = opts[:idf] ? idf(docs) : nil
+
+  docs.each_pair { |name, words|
+    weights = tf(words, stopwords)
+    weights = ntf(weights) if opts[:ntf]
+    if opts[:idf]
+      weighted = Hash.new(0.0)
+      weights.each_pair { |word, f| weighted[word] = idf_values[word] * f }
+      weights = weighted
+    end
+    docs[name] = weights
+  }
+
+  docs.each { |entry| puts entry.to_s }
+end
+
+
+# invoke the command-line entry point
+main
+