make use of nlp_ruby, LICENSE

author: Patrick Simianer <p@simianer.de> 2014-01-29 19:14:08 +0100
committer: Patrick Simianer <p@simianer.de> 2014-01-29 19:14:08 +0100
commit: 68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 (patch)
tree: 3b445131dcb203e94473ae1d8aa82a1798585276 /tf-idf
parent: 49158e721bfaf6423dca9fc633873218f691c83a (diff)
1 files changed, 17 insertions, 44 deletions
diff --git a/tf-idf b/tf-idf
index 3edaaf8..e1502b3 100755
--- a/tf-idf
+++ b/tf-idf
@@ -1,68 +1,41 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-# returns word='raw frequency' for a single document
-def tf(d, stopwords=[])
-  v = {}; v.default = 0
-  d.uniq.each { |i|
-   next if stopwords.include? i
-   v[i] = d.count(i).to_f
-  }
-  return v
-end
-
-# smoothes raw frequencies
-def ntf(w, a=0.4)
-  max = w.values.max.to_f
-  w.each_pair { |k,v|
-    w[k] = a + (1-a)*(v/max)
-  }
-end
-
-# returns idf value for each word in vocab
-def idf(collection)
-  vocab = collection.values.flatten.uniq
-  n = collection.size.to_f
-  idf = {}
-  vocab.each { |i|
-    df = collection.values.flatten.count i
-    idf[i] = Math.log(n/df)
-  }
-  return idf
-end
-
 def main
-  opts = Trollop::options do
-    opt :docs, "input files (documents)", :type => :strings, :required => true
-    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
-    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
+  cfg = Trollop::options do
+    opt :documents, "input files (documents)", :type => :strings, :required => true
+    opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
+    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
     opt :ntf, "length-normalize tf values", :type => :bool
     opt :idf, "weight tf by idf", :type => :bool
   end
 
   stopwords = []
-  if opts[:filter_stopwords]
-    stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''}
+  if cfg[:filter_stopwords]
+    stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i|
+      i.split('|').first.strip
+    }.reject{ |i| i=='' }
   end
 
-  docs = {} # fn => [words...]
-  opts[:docs].each { |i|
-    if opts[:one_item_per_line]
-      docs[i] = File.new(i, 'r').readlines.map{|i| i.strip}
+  docs = {}
+  cfg[:documents].each { |i|
+    if cfg[:one_item_per_line]
+      docs[i] = ReadFile.new(i).readlines_strip
     else
-     docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip}
+     docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip }
     end
   }
 
   idf_values = idf docs
 
   docs.each_pair { |name, words|
-    just_tf = tf(words)
-    just_tf = ntf(just_tf) if opts[:ntf]
+    just_tf = tf words, stopwords
+    just_tf = ntf(just_tf) if cfg[:ntf]
     tf_idf = {}; tf_idf.default = 0.0
-    if opts[:idf]
+    if cfg[:idf]
       just_tf.each_pair { |word,f|
         tf_idf[word] = idf_values[word] * f
       }
author	Patrick Simianer <p@simianer.de>	2014-01-29 19:14:08 +0100
committer	Patrick Simianer <p@simianer.de>	2014-01-29 19:14:08 +0100
commit	68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 (patch)
tree	3b445131dcb203e94473ae1d8aa82a1798585276 /tf-idf
parent	49158e721bfaf6423dca9fc633873218f691c83a (diff)