summary | refs | log | tree | commit | diff
path: root/tf-idf
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-01-29 19:14:08 +0100
committer: Patrick Simianer <p@simianer.de> 2014-01-29 19:14:08 +0100
commit: 68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 (patch)
tree: 3b445131dcb203e94473ae1d8aa82a1798585276 /tf-idf
parent: 49158e721bfaf6423dca9fc633873218f691c83a (diff)
make use of nlp_ruby, LICENSE
Diffstat (limited to 'tf-idf')
-rwxr-xr-x  tf-idf  61
1 files changed, 17 insertions, 44 deletions
diff --git a/tf-idf b/tf-idf
index 3edaaf8..e1502b3 100755
--- a/tf-idf
+++ b/tf-idf
@@ -1,68 +1,41 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
-# returns word='raw frequency' for a single document
-def tf(d, stopwords=[])
- v = {}; v.default = 0
- d.uniq.each { |i|
- next if stopwords.include? i
- v[i] = d.count(i).to_f
- }
- return v
-end
-
-# smoothes raw frequencies
-def ntf(w, a=0.4)
- max = w.values.max.to_f
- w.each_pair { |k,v|
- w[k] = a + (1-a)*(v/max)
- }
-end
-
-# returns idf value for each word in vocab
-def idf(collection)
- vocab = collection.values.flatten.uniq
- n = collection.size.to_f
- idf = {}
- vocab.each { |i|
- df = collection.values.flatten.count i
- idf[i] = Math.log(n/df)
- }
- return idf
-end
-
def main
- opts = Trollop::options do
- opt :docs, "input files (documents)", :type => :strings, :required => true
- opt :filter_stopwords, "filter stopwords (give file)", :type => :string
- opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
+ cfg = Trollop::options do
+ opt :documents, "input files (documents)", :type => :strings, :required => true
+ opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
+ opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
opt :ntf, "length-normalize tf values", :type => :bool
opt :idf, "weight tf by idf", :type => :bool
end
stopwords = []
- if opts[:filter_stopwords]
- stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''}
+ if cfg[:filter_stopwords]
+ stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i|
+ i.split('|').first.strip
+ }.reject{ |i| i=='' }
end
- docs = {} # fn => [words...]
- opts[:docs].each { |i|
- if opts[:one_item_per_line]
- docs[i] = File.new(i, 'r').readlines.map{|i| i.strip}
+ docs = {}
+ cfg[:documents].each { |i|
+ if cfg[:one_item_per_line]
+ docs[i] = ReadFile.new(i).readlines_strip
else
- docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip}
+ docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip }
end
}
idf_values = idf docs
docs.each_pair { |name, words|
- just_tf = tf(words)
- just_tf = ntf(just_tf) if opts[:ntf]
+ just_tf = tf words, stopwords
+ just_tf = ntf(just_tf) if cfg[:ntf]
tf_idf = {}; tf_idf.default = 0.0
- if opts[:idf]
+ if cfg[:idf]
just_tf.each_pair { |word,f|
tf_idf[word] = idf_values[word] * f
}