#!/usr/bin/env ruby

require 'nlp_ruby'
require 'trollop'

# Command-line entry point: reads the given documents, computes a term weight
# for every item in every document and prints one `[name, weights]` pair per
# document to stdout.
#
# Options (parsed from ARGV by Trollop):
#   --documents          input files, one document per file (required)
#   --filter-stopwords   file with stop words; only the text before the first
#                        '|' on each line is used, blank entries are skipped
#   --one-item-per-line  treat every line as one item (allows multi-word
#                        items) instead of splitting on whitespace
#   --ntf                pass the tf values through `ntf`
#   --idf                multiply every tf value by the item's idf value
#
# NOTE(review): `ReadFile`, `tf`, `ntf` and `idf` are not defined in this file;
# presumably they come from nlp_ruby -- confirm their exact contracts there.
def main
  cfg = Trollop::options do
    opt :documents, "input files (documents)", :type => :strings, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  stopwords = []
  if cfg[:filter_stopwords]
    stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines
                        # `first` is nil when a line consists solely of '|'
                        # without a trailing newline; `to_s` keeps that from
                        # raising and lets the entry be rejected as blank.
                        .map { |line| line.split('|').first.to_s.strip }
                        .reject(&:empty?)
  end

  # Maps each document name (its path) to its list of items.
  docs = {}
  cfg[:documents].each do |path|
    docs[path] =
      if cfg[:one_item_per_line]
        ReadFile.new(path).readlines_strip
      else
        # Argument-less split breaks on runs of whitespace and ignores leading
        # whitespace, so blank lines or double spaces never produce
        # empty-string tokens (split(/\s/) did, and they were counted as a
        # term).
        ReadFile.new(path).read.split
      end
  end

  idf_values = idf docs

  # Built separately so `docs` keeps holding the raw item lists while we
  # iterate over it; insertion order (and thus output order) follows `docs`.
  weights = {}
  docs.each_pair do |name, words|
    term_freqs = tf words, stopwords
    term_freqs = ntf(term_freqs) if cfg[:ntf]

    if cfg[:idf]
      weighted = {}
      weighted.default = 0.0
      term_freqs.each_pair { |word, freq| weighted[word] = idf_values[word] * freq }
      weights[name] = weighted
    else
      weights[name] = term_freqs
    end
  end

  # Each entry is a [name, weights] pair; print its string representation.
  weights.each { |entry| puts entry.to_s }
end

main