#!/usr/bin/env ruby

require 'zipf'
require 'optimist'

# Reads a stopword file and returns the list of stopwords.
#
# Each line contributes the text before its first '|' (stripped); whatever
# follows the '|' is ignored. Entries that are empty after stripping are
# dropped.
#
# @param path [String] stopword file name
# @return [Array<String>]
def read_stopwords(path)
  ReadFile.readlines(path)
          # limit 2 + to_s: a line that is just "|" (or empty) must yield ""
          # instead of nil, which would make the following strip raise.
          .map { |line| line.split('|', 2).first.to_s.strip }
          .reject(&:empty?)
end

# Turns the --documents argument into a list of file names.
#
# A value whose first non-blank character is '*' is handed to Dir.glob as a
# whole; any other value is treated as a whitespace-separated list of names.
#
# @param spec [String] raw value of the --documents option
# @return [Array<String>]
def document_paths(spec)
  return Dir.glob(spec) if spec.strip.start_with?('*')

  spec.split
end

# Reads the items (terms) of one document.
#
# With one_item_per_line every stripped line is one item (so items may
# contain spaces); otherwise the file content is split on whitespace.
# Empty items are discarded in both modes so that blank lines or runs of
# whitespace are never counted as a term.
#
# @param path [String] document file name
# @param one_item_per_line [Boolean] treat each line as a single item
# @return [Array<String>]
def read_items(path, one_item_per_line)
  items = if one_item_per_line
            ReadFile.readlines_strip(path)
          else
            ReadFile.read(path).split(/\s+/).map(&:strip)
          end
  items.reject(&:empty?)
end

# Command-line entry point: parses the options, reads the documents,
# computes per-document term weights with the TFIDF helpers and prints one
# [name, weights] pair per document to stdout.
def main
  conf = Optimist::options do
    opt :documents, "input files (documents)", :type => :string, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  stopwords = conf[:filter_stopwords] ? read_stopwords(conf[:filter_stopwords]) : []

  docs = {}
  document_paths(conf[:documents]).each do |path|
    docs[path] = read_items(path, conf[:one_item_per_line])
  end

  # idf values are only needed for --idf; skip the computation otherwise.
  # NOTE(review): assumes TFIDF::idf has no side effects on docs -- confirm.
  idf_values = conf[:idf] ? TFIDF::idf(docs) : nil

  # Results go into a separate hash (same key order as docs) rather than
  # overwriting docs while it is being iterated.
  weighted = {}
  docs.each_pair do |name, words|
    tf = TFIDF::tf(words, stopwords)
    tf = TFIDF::ntf(tf) if conf[:ntf]

    if conf[:idf]
      scores = Hash.new(0.0)
      tf.each_pair { |word, freq| scores[word] = idf_values[word] * freq }
      weighted[name] = scores
    else
      weighted[name] = tf
    end
  end

  # Hash#each yields [name, weights] pairs; each is printed in its
  # Array#to_s form.
  weighted.each { |entry| puts entry.to_s }
end

main