diff options
Diffstat (limited to 'tf-idf')
-rwxr-xr-x | tf-idf | 80 |
1 files changed, 80 insertions, 0 deletions
#!/usr/bin/env ruby

require 'trollop'


# Counts raw term frequencies for a single document.
#
# d         - Array of tokens making up the document.
# stopwords - collection of tokens to leave out; only needs to respond to
#             include? (default: empty Array).
#
# Returns a Hash of token => occurrence count (Float), in order of first
# appearance. Tokens that are not present read as 0.
def tf(d, stopwords=[])
  v = Hash.new(0)
  # Single pass over the document instead of one Array#count per unique token.
  d.each { |token| v[token] += 1.0 }
  v.delete_if { |token, _| stopwords.include? token }
  v
end

# Smoothes raw frequencies in place: each value becomes
# a + (1 - a) * (value / max), where max is the largest value in w.
#
# w - Hash of token => raw frequency; it is MODIFIED in place.
# a - smoothing constant (default: 0.4).
#
# Returns w.
def ntf(w, a=0.4)
  max = w.values.max.to_f
  # Empty hash or all-zero counts: dividing by max would only produce NaN.
  return w if max.zero?
  w.each_pair do |k, v|
    w[k] = a + (1 - a) * (v / max)
  end
end

# Computes the inverse document frequency of every token in the collection.
#
# collection - Hash of document name => Array of tokens.
#
# Returns a Hash of token => log(N / df), where N is the number of documents
# and df is the number of documents that contain the token.
def idf(collection)
  n = collection.size.to_f
  df = Hash.new(0)
  collection.each_value do |words|
    # uniq: a token counts once per document no matter how often it repeats.
    words.flatten.uniq.each { |word| df[word] += 1 }
  end
  weights = {}
  df.each_pair { |word, count| weights[word] = Math.log(n / count) }
  weights
end

# Command-line entry point: parses the options, reads the documents, computes
# tf (optionally smoothed and/or weighted by idf) per document and prints one
# [name, weights] pair per line.
def main
  opts = Trollop::options do
    opt :docs, "input files (documents)", :type => :strings, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  stopwords = []
  if opts[:filter_stopwords]
    # Each line holds a stopword, optionally followed by '|' and a comment.
    # to_s guards against lines where nothing precedes the '|'.
    stopwords = File.readlines(opts[:filter_stopwords]).
      map { |line| line.split('|').first.to_s.strip }.
      reject { |word| word.empty? }
  end

  docs = {} # fn => [words...]
  opts[:docs].each do |fn|
    items = if opts[:one_item_per_line]
      File.readlines(fn).map { |line| line.strip }
    else
      File.read(fn).split(/\s+/)
    end
    # Blank lines / leading whitespace would otherwise yield '' as a term.
    docs[fn] = items.reject { |item| item.empty? }
  end

  # idf values are only needed when idf weighting was requested.
  idf_values = opts[:idf] ? idf(docs) : {}

  weights = {} # fn => { word => weight }
  docs.each_pair do |name, words|
    just_tf = tf(words, stopwords)
    just_tf = ntf(just_tf) if opts[:ntf]
    if opts[:idf]
      tf_idf = Hash.new(0.0)
      just_tf.each_pair { |word, f| tf_idf[word] = idf_values[word] * f }
    else
      tf_idf = just_tf
    end
    weights[name] = tf_idf
  end

  weights.each { |entry| puts entry.to_s }
end


main