Diffstat (limited to 'kmeans')
-rwxr-xr-x | kmeans | 138 |
1 files changed, 47 insertions, 91 deletions
@@ -1,141 +1,97 @@
 #!/usr/bin/env ruby
+require 'nlp_ruby'
 require 'trollop'
 
-# {s:f} {s:f} => f
-def dot(x,y)
-  sum = 0.0
-  x.each_pair { |k,v| sum += v * y[k] }
-  return sum
-end
-
-# {s:f} => f
-def mag(x)
-  return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
-end
-
-# {s:f} {s:f} => f
-def cos_sim(x,y)
-  return dot(x,y)/(mag(x)*mag(y))
-end
-
-# {s:f} {s:f} => f
-def euclidian_dist(x,y)
-  dims = [x.keys, y.keys].flatten.uniq
-  sum = 0.0
-  dims.each { |i| sum += (x[i] - y[i])**2 }
-  return Math.sqrt(sum)
-end
-
-# str => {s:{s:f}}
-def read(fn)
-  h = {}
-  f = File.new fn, 'r'
-  while line = f.gets
-    g = eval line
-    h[g[0]] = g[1]
-    h[g[0]].default = 0.0
-  end
-  return h
+def read_data fn
+  data = {}
+  ReadFile.new(fn).readlines_strip.map{ |i|
+    a = i.split ' ', 2
+    data[a.first] = read_feature_string a.last
+  }
+  return data
 end
 
-# {s:{s:f}} i => [{s:f}]
-def rand_init(docs, k)
-  prng = Random.new
-  return docs.keys.sample k, random:prng
+def rand_init data, k
+  prng = Random.new
+  return data.keys.sample k, random:prng
 end
 
-def rand_init2(docs, k)
-  prng = Random.new
+def rand_means_init data, k
+  prng = Random.new
   a = []
   0.upto(k-1) do
-    a << mean(docs.values.sample k, random:prng)
+    a << mean_sparse_vector(data.values.sample k, random:prng)
   end
   return a
 end
 
-# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
-def assign(docs, centroids)
+def assign centroids, data
   assignment = {}
-  docs.each_pair { |name,feature_vector|
+  data.each_pair { |name,feature_vector|
     min = 1.0/0
     min_index = nil
-    centroids.each_with_index { |c,j|
-      dist = euclidian_dist(c, feature_vector)
-      if dist < min
-        min = dist
-        min_index = j
+    centroids.each_with_index { |c,i|
+      dist = c.euclidian_dist(feature_vector)
+      if dist < min
+        min = dist
+        min_index = i
       end
     }
     if assignment.has_key? min_index
-      assignment[min_index] << [name, feature_vector]
+      assignment[min_index] << name
    else
-      assignment[min_index] = [[name, feature_vector]]
+      assignment[min_index] = [name]
    end
  }
  return assignment
 end
 
-# [{s:f}] => {s:f}
-def mean(a)
-  res = {}
-  res.default = 0.0
-  a.each { |i|
-    i.each_pair { |k,v|
-      res[k] += v
-    }
-  }
-  n = a.size.to_f
-  res.each_pair { |k,v|
-    res[k] = v/n
-  }
-end
-
-# {i:[{s:f}]} => [{s:f}]
-def update(assignment)
+def update assignment, data
   new_centroids = []
-  assignment.each_pair { |centroid,docs|
-    new_centroids << mean(docs.map{|i |i[1]})
+  assignment.each_pair { |centroid_index,a|
+    new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
  }
  return new_centroids
 end
 
 def main
-  opts = Trollop::options do
+  cfg = Trollop::options do
     opt :k, "k", :type => :int, :required => true
     opt :input, "input: one feature vector per line", :type => :string, :required => true
     opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
-    opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
+    opt :max_no_change, "max. no of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
     opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
   end
-  docs = read opts[:input]
-  k = opts[:k]
+  # data is 'ID f1=v1 f2=v2'
+  data = read_data cfg[:input]
+  k = cfg[:k]
   centroids = nil
-  if opts[:init] == 1
-    centroids = rand_init(docs, k)
+  if cfg[:init] == 1
+    centroids = rand_init(data, k)
   else
-    centroids = rand_init2(docs, k)
+    centroids = rand_means_init(data, k)
   end
   STDERR.write "\n k #{k}\n"
-  STDERR.write " input #{opts[:input]}\n"
-  STDERR.write "iterations #{opts[:max_iterations]}\n"
-  STDERR.write "max no ch. #{opts[:max_no_change]}\n"
-  STDERR.write " init #{opts[:init]}\n\n"
+  STDERR.write " input #{cfg[:input]}\n"
+  STDERR.write "iterations #{cfg[:max_iterations]}\n"
+  STDERR.write "max no ch. #{cfg[:max_no_change]}\n"
+  STDERR.write " init #{cfg[:init]}\n\n"
   assignment = nil
   prev_stats = []
   stats = []
   no_change = 0
   max_no_change = 5
-  STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
-  0.upto(opts[:max_iterations]) do |i|
+  STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
+  0.upto(cfg[:max_iterations]) do |i|
     s = "iteration #{i}"
     STDERR.write "#{s}\n#{'-'*s.size}\n"
-    assignment = assign(docs, centroids)
+    assignment = assign centroids, data
     sizes = []
-    assignment.each_pair { |centroid_index,docs|
-      sizes << docs.size
-    }
+    assignment.each_pair { |centroid_index, a|
+      sizes << a.size
+    }
     median = sizes.sort[k/2]
     max = sizes.max
     min = sizes.min
@@ -148,12 +104,12 @@ def main
     STDERR.write " min cluster sz=#{min}\n\n"
     if no_change == max_no_change
       STDERR.write "\nmax no change hit!\n\n"
-      assignment.each_pair { |centroid_index,docs|
-        puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
+      assignment.each_pair { |centroid_index,a|
+        puts "#{centroid_index} #{a.to_s}"
      }
       break
     end
-    centroids = update(assignment)
+    centroids = update assignment, data
   end
 end
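Note on the rewrite: the hand-rolled sparse-vector helpers (dot, mag, cos_sim, euclidian_dist, mean) are dropped in favour of the nlp_ruby helpers used above (read_feature_string, mean_sparse_vector, and a euclidian_dist method on the vectors it produces), and cluster assignments now hold document IDs instead of whole feature vectors. For readers without nlp_ruby, here is a minimal, dependency-free sketch of the same assign/update cycle over plain Hash feature vectors; the helper names (sq_euclidian_dist, hash_mean, kmeans_step) and the toy data are illustrative only and not part of the script or the library.

# Illustrative sketch only -- not the script above, no nlp_ruby required.
# Sparse vectors are plain Hashes {feature => value}; missing features count as 0.0.

def sq_euclidian_dist x, y
  # squared distance is enough for nearest-centroid comparison
  (x.keys | y.keys).inject(0.0) { |sum, k|
    sum + (x.fetch(k, 0.0) - y.fetch(k, 0.0))**2
  }
end

def hash_mean vectors
  mean = Hash.new(0.0)
  vectors.each { |v| v.each_pair { |k, val| mean[k] += val } }
  mean.each_key { |k| mean[k] /= vectors.size.to_f }
  mean
end

# one k-means iteration: assign each point to its nearest centroid,
# then recompute each centroid as the mean of its assigned points
# (empty clusters simply drop out, as in update above)
def kmeans_step data, centroids
  assignment = Hash.new { |h, k| h[k] = [] }
  data.each_pair { |name, vec|
    nearest = (0...centroids.size).min_by { |i| sq_euclidian_dist(centroids[i], vec) }
    assignment[nearest] << name
  }
  new_centroids = assignment.keys.sort.map { |i| hash_mean(assignment[i].map { |n| data[n] }) }
  [assignment, new_centroids]
end

# toy usage: two obvious clusters, k = 2
data = {
  'a' => { 'x' => 1.0 }, 'b' => { 'x' => 1.2 },
  'c' => { 'y' => 5.0 }, 'd' => { 'y' => 5.5 }
}
centroids = [data['a'], data['c']]
assignment, centroids = kmeans_step(data, centroids)
p assignment   # => {0=>["a", "b"], 1=>["c", "d"]}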