diff options
Diffstat (limited to 'tf-idf')
-rwxr-xr-x | tf-idf | 61 |
1 files changed, 17 insertions, 44 deletions
@@ -1,68 +1,41 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -# returns word='raw frequency' for a single document -def tf(d, stopwords=[]) - v = {}; v.default = 0 - d.uniq.each { |i| - next if stopwords.include? i - v[i] = d.count(i).to_f - } - return v -end - -# smoothes raw frequencies -def ntf(w, a=0.4) - max = w.values.max.to_f - w.each_pair { |k,v| - w[k] = a + (1-a)*(v/max) - } -end - -# returns idf value for each word in vocab -def idf(collection) - vocab = collection.values.flatten.uniq - n = collection.size.to_f - idf = {} - vocab.each { |i| - df = collection.values.flatten.count i - idf[i] = Math.log(n/df) - } - return idf -end - def main - opts = Trollop::options do - opt :docs, "input files (documents)", :type => :strings, :required => true - opt :filter_stopwords, "filter stopwords (give file)", :type => :string - opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool + cfg = Trollop::options do + opt :documents, "input files (documents)", :type => :strings, :required => true + opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil + opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false opt :ntf, "length-normalize tf values", :type => :bool opt :idf, "weight tf by idf", :type => :bool end stopwords = [] - if opts[:filter_stopwords] - stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''} + if cfg[:filter_stopwords] + stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i| + i.split('|').first.strip + }.reject{ |i| i=='' } end - docs = {} # fn => [words...] - opts[:docs].each { |i| - if opts[:one_item_per_line] - docs[i] = File.new(i, 'r').readlines.map{|i| i.strip} + docs = {} + cfg[:documents].each { |i| + if cfg[:one_item_per_line] + docs[i] = ReadFile.new(i).readlines_strip else - docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip} + docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip } end } idf_values = idf docs docs.each_pair { |name, words| - just_tf = tf(words) - just_tf = ntf(just_tf) if opts[:ntf] + just_tf = tf words, stopwords + just_tf = ntf(just_tf) if cfg[:ntf] tf_idf = {}; tf_idf.default = 0.0 - if opts[:idf] + if cfg[:idf] just_tf.each_pair { |word,f| tf_idf[word] = idf_values[word] * f } |