summary | refs | log | tree | commit | diff
path: root/tf-idf
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de>, 2015-11-12 13:57:07 +0100
committer: Patrick Simianer <p@simianer.de>, 2015-11-12 13:57:07 +0100
commit: 5c2833c505dda0d1646b8f8c1e62abd391f0401e (patch)
tree: 0259c7c2bdb531c09587a744869848d87f4bbd9e /tf-idf
parent: ef282dbe4fef1b0ae0c8544f0bb84ba674c68de7 (diff)
parent: 8151031373c08ccd714a99f50783eafcb54d2010 (diff)
Merge branch 'master' of github.com:pks/scripts
Diffstat (limited to 'tf-idf')
-rwxr-xr-x tf-idf 18
1 file changed, 9 insertions, 9 deletions
diff --git a/tf-idf b/tf-idf
index dde2fd5..066548b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -4,7 +4,7 @@ require 'zipf'
require 'trollop'
def main
- cfg = Trollop::options do
+ conf = Trollop::options do
opt :documents, "input files (documents)", :type => :string, :required => true
opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
@@ -13,21 +13,21 @@ def main
end
stopwords = []
- if cfg[:filter_stopwords]
- stopwords = ReadFile.readlines(cfg[:filter_stopwords]).map{ |i|
+ if conf[:filter_stopwords]
+ stopwords = ReadFile.readlines(conf[:filter_stopwords]).map{ |i|
i.split('|').first.strip
}.reject{ |i| i=='' }
end
docs = {}
a = []
- if cfg[:documents].strip[0] == "*"
- ad = Dir.glob(cfg[:documents])
+ if conf[:documents].strip[0] == "*"
+ ad = Dir.glob(conf[:documents])
else
- ad = cfg[:documents].split
+ ad = conf[:documents].split
end
ad.each { |i|
- if cfg[:one_item_per_line]
+ if conf[:one_item_per_line]
docs[i] = ReadFile.readlines_strip i
else
docs[i] = ReadFile.read(i).split(/\s/).map{ |i| i.strip }
@@ -38,9 +38,9 @@ def main
docs.each_pair { |name, words|
just_tf = TFIDF::tf words, stopwords
- just_tf = TFIDF::ntf(just_tf) if cfg[:ntf]
+ just_tf = TFIDF::ntf(just_tf) if conf[:ntf]
tf_idf = {}; tf_idf.default = 0.0
- if cfg[:idf]
+ if conf[:idf]
just_tf.each_pair { |word,f|
tf_idf[word] = idf_values[word] * f
}