tf-idf


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

#!/usr/bin/env ruby

require 'trollop'


# Computes raw term frequencies for a single document.
#
# d         - Array of tokens making up the document.
# stopwords - collection of tokens to leave out (must respond to include?).
#
# Returns a Hash mapping every non-stopword token to its number of occurrences
# as a Float, keyed in order of first appearance. Unknown keys read as 0.
def tf(d, stopwords=[])
  v = {}; v.default = 0
  # Single counting pass instead of rescanning the document once per
  # distinct token.
  d.each { |i| v[i] += 1.0 }
  # Stopwords are checked once per distinct token, after counting.
  v.delete_if { |i, _| stopwords.include? i }
  v
end

# Smooths raw frequencies relative to the largest one: every value v becomes
# a + (1 - a) * (v / max).
#
# w - Hash of term => raw frequency; it is rewritten in place.
# a - smoothing constant (0.4 unless given).
#
# Returns w itself, now holding the smoothed values.
def ntf(w, a=0.4)
  peak = w.values.max.to_f
  # Iterate over a snapshot of the keys; only existing entries are reassigned.
  w.keys.each do |term|
    w[term] = a + (1-a)*(w[term]/peak)
  end
  w
end

# Computes the inverse document frequency of every term in the collection.
#
# collection - Hash of document name => Array of terms.
#
# Returns a Hash mapping each term to log(N / df), where N is the number of
# documents and df is the number of documents that contain the term. Terms are
# keyed in order of first appearance across the collection.
def idf(collection)
  n = collection.size.to_f
  doc_freq = Hash.new(0)
  collection.each_value { |terms|
    # uniq: a document counts at most once per term, however often the term
    # is repeated inside it. Wrapping + flatten accepts nested arrays too.
    [terms].flatten.uniq.each { |term| doc_freq[term] += 1 }
  }
  weights = {}
  doc_freq.each_pair { |term, df| weights[term] = Math.log(n/df) }
  weights
end

# Command-line entry point.
#
# Reads the documents named by --docs, computes per-document term weights
# (raw tf, optionally normalized with --ntf and/or weighted by idf with --idf)
# and prints one [name, weights] pair per document.
def main
  opts = Trollop::options do
    opt :docs, "input files (documents)", :type => :strings, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  # The option value is the path of the stopword file. Only the text before
  # the first '|' on each line is used as the stopword.
  stopwords = []
  if opts[:filter_stopwords]
    stopwords = File.readlines(opts[:filter_stopwords])
                    .map { |line| line.split('|').first.to_s.strip }
                    .reject { |word| word.empty? }
  end

  docs = {} # fn => [items...]
  opts[:docs].each { |fn|
    # File.readlines / File.read close the file once it has been read.
    items = if opts[:one_item_per_line]
              File.readlines(fn).map { |line| line.strip }
            else
              File.read(fn).split(/\s+/)
            end
    # Blank lines and leading whitespace would otherwise become "" items.
    docs[fn] = items.reject { |item| item.empty? }
  }

  # idf is only needed (and only computed) when idf weighting was requested.
  idf_values = opts[:idf] ? idf(docs) : {}

  weights = {} # fn => { item => weight }
  docs.each_pair { |name, words|
    term_weights = tf(words, stopwords)
    term_weights = ntf(term_weights) if opts[:ntf]
    if opts[:idf]
      tf_idf = {}; tf_idf.default = 0.0
      term_weights.each_pair { |word, f|
        tf_idf[word] = idf_values[word] * f
      }
      term_weights = tf_idf
    end
    weights[name] = term_weights
  }

  weights.each { |entry| puts entry.to_s }
end


# Invoke the command-line entry point.
main