1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
#!/usr/bin/env ruby
require 'zipf'
require 'trollop'
# Command-line entry point: reads documents, computes a weight for every
# item of every document, and prints one line per document.
#
# Options (parsed by Trollop):
#   :documents          input files. If the first non-blank character of the
#                       value is "*", the whole value is handed to Dir.glob;
#                       otherwise it is split on whitespace into paths.
#   :filter_stopwords   optional file of stopwords; on each line only the
#                       text before the first "|" is used.
#   :one_item_per_line  treat each line of a document as one item instead of
#                       splitting the text on whitespace.
#   :ntf                pass the tf values through TFIDF::ntf.
#   :idf                multiply every tf value by the item's idf value.
#
# Output: for each document, the [name, weights] pair rendered with #to_s.
def main
  conf = Trollop::options do
    opt :documents, "input files (documents)", :type => :string, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  stopwords = []
  if conf[:filter_stopwords]
    stopwords = ReadFile.readlines(conf[:filter_stopwords]).map { |line|
      # split can return [] (e.g. for a line that is just "|"), so guard
      # against nil before stripping.
      line.split('|').first.to_s.strip
    }.reject { |word| word.empty? }
  end

  paths = if conf[:documents].strip[0] == "*"
    Dir.glob(conf[:documents])
  else
    conf[:documents].split
  end

  docs = {}
  paths.each do |path|
    docs[path] = if conf[:one_item_per_line]
      ReadFile.readlines_strip path
    else
      # Leading or repeated whitespace makes split(/\s/) emit empty strings;
      # drop them so they are not counted as an item.
      ReadFile.read(path).split(/\s/).map { |token| token.strip }.reject { |token| token.empty? }
    end
  end

  # idf values are only needed when idf weighting was requested.
  idf_values = conf[:idf] ? TFIDF::idf(docs) : nil

  weights = {}
  docs.each_pair do |name, words|
    tf = TFIDF::tf words, stopwords
    tf = TFIDF::ntf(tf) if conf[:ntf]
    weights[name] = if conf[:idf]
      tf.each_with_object(Hash.new(0.0)) { |(word, freq), weighted|
        weighted[word] = idf_values[word] * freq
      }
    else
      tf
    end
  end

  # to_s keeps each [name, weights] pair on a single line; puts on the bare
  # array would print its elements on separate lines.
  weights.each { |entry| puts entry.to_s }
end
# Script entry: run the tool as soon as the file is executed.
main
|