#!/usr/bin/env ruby
# frozen_string_literal: true

# Computes term-frequency vectors (optionally length-normalized and/or
# idf-weighted) for a set of text documents given on the command line,
# and prints one [filename, {word => weight}] pair per document.

# Returns a hash of word => raw frequency (as Float) for a single document.
#
# d         - Array of words (one document).
# stopwords - Array of words to exclude from the result.
#
# Single pass over the document (the previous version re-scanned the whole
# document with Array#count for every unique word, i.e. O(n^2)).
def tf(d, stopwords = [])
  v = Hash.new(0)
  d.each do |word|
    v[word] += 1.0 unless stopwords.include?(word)
  end
  v
end

# Smoothes raw term frequencies with augmented ("max-tf") normalization:
# a + (1 - a) * tf / max_tf. Mutates and returns w.
#
# w - Hash of word => raw frequency.
# a - smoothing constant in [0, 1].
def ntf(w, a = 0.4)
  return w if w.empty? # guard: `w.values.max` would be nil for an empty hash

  max = w.values.max.to_f
  w.each_pair { |k, v| w[k] = a + (1 - a) * (v / max) }
end

# Returns the idf value, Math.log(N / df), for each word in the collection's
# vocabulary.
#
# collection - Hash of document-name => Array-of-words.
#
# Fix: df is now the number of DOCUMENTS containing the word (true document
# frequency). The previous version counted total occurrences across all
# documents, which can exceed N and yield negative idf values. The per-word
# flatten of the whole collection is also hoisted out of the loop.
def idf(collection)
  n = collection.size.to_f
  doc_vocabs = collection.values.map(&:uniq) # each doc's distinct words
  vocab = doc_vocabs.flatten.uniq
  idf = {}
  vocab.each do |word|
    df = doc_vocabs.count { |words| words.include?(word) }
    idf[word] = Math.log(n / df)
  end
  idf
end

# Parses CLI options, builds a tf (optionally ntf / tf-idf) vector per input
# document, and prints each [filename, vector] pair.
def main
  # NOTE(review): the trollop gem is deprecated and now lives on as
  # "optimist"; required lazily here so tf/ntf/idf remain usable (and
  # testable) without the gem installed.
  require 'trollop'

  opts = Trollop::options do
    opt :docs, "input files (documents)", :type => :strings, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  stopwords = []
  if opts[:filter_stopwords]
    # Fix: read the file the user actually gave (the path was hard-coded to
    # 'stop.txt.utf8'). One entry per line; anything after '|' is treated as
    # a comment. File.readlines closes the handle, unlike File.new.
    stopwords = File.readlines(opts[:filter_stopwords])
                    .map { |line| line.split('|').first.to_s.strip }
                    .reject(&:empty?)
  end

  docs = {} # filename => [words...]
  opts[:docs].each do |fn|
    words = if opts[:one_item_per_line]
              File.readlines(fn).map(&:strip)
            else
              # Split on runs of whitespace: /\s/ alone produces empty
              # tokens on consecutive blanks, which then get counted.
              File.read(fn).split(/\s+/)
            end
    docs[fn] = words.reject(&:empty?)
  end

  idf_values = idf(docs)

  docs.each_pair do |name, words|
    # Fix: stopwords were loaded but never passed to tf, so the
    # --filter-stopwords option silently had no effect.
    weights = tf(words, stopwords)
    weights = ntf(weights) if opts[:ntf]
    if opts[:idf]
      tf_idf = Hash.new(0.0)
      weights.each_pair { |word, f| tf_idf[word] = idf_values[word] * f }
      weights = tf_idf
    end
    docs[name] = weights
  end

  docs.each { |pair| puts pair.to_s }
end

main if __FILE__ == $PROGRAM_NAME