#!/usr/bin/env ruby require 'zipf' require 'trollop' def read_data fn data = {} ReadFile.new(fn).readlines_strip.map{ |i| a = i.split ' ', 2 v = SparseVector.from_kv a.last data[a.first] = v } return data end def rand_init data, k prng = Random.new return data.keys.sample k, random:prng end def rand_means_init data, k prng = Random.new a = [] 0.upto(k-1) do a << SparseVector.mean(data.values.sample k, random:prng) end return a end def assign centroids, data assignment = {} data.each_pair { |name,feature_vector| min = 1.0/0 min_index = nil centroids.each_with_index { |c,i| dist = c.euclidian_dist(feature_vector) if dist < min min = dist min_index = i end } if assignment.has_key? min_index assignment[min_index] << name else assignment[min_index] = [name] end } return assignment end def update assignment, data new_centroids = [] assignment.each_pair { |centroid_index,a| new_centroids << SparseVector.mean(assignment[centroid_index].map{ |i| data[i] }) } return new_centroids end def main conf = Trollop::options do opt :k, "k", :type => :int, :required => true opt :input, "input: one feature vector per line", :type => :string, :required => true opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3 opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 end # data is 'ID f1=v1 f2=v2' data = read_data conf[:input] k = conf[:k] centroids = nil if conf[:init] == 1 centroids = rand_init(data, k) else centroids = rand_means_init(data, k) end STDERR.write "\n k #{k}\n" STDERR.write " input #{conf[:input]}\n" STDERR.write "iterations #{conf[:max_iterations]}\n" STDERR.write "max no ch. #{conf[:max_no_change]}\n" STDERR.write " init #{conf[:init]}\n\n" assignment = nil prev_stats = [] stats = [] no_change = 0 max_no_change = 5 STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n" 0.upto(conf[:max_iterations]) do |i| s = "iteration #{i}" STDERR.write "#{s}\n#{'-'*s.size}\n" assignment = assign centroids, data sizes = [] assignment.each_pair { |centroid_index, a| sizes << a.size } median = sizes.sort[k/2] max = sizes.max min = sizes.min stats = [median, max, min] no_change += 1 if stats==prev_stats prev_stats = stats STDERR.write sizes.to_s + "\n" STDERR.write " median cluster sz=#{median}\n" STDERR.write " max cluster sz=#{max}\n" STDERR.write " min cluster sz=#{min}\n\n" if no_change == max_no_change STDERR.write "\nmax no change hit!\n\n" assignment.each_pair { |centroid_index,a| puts "#{centroid_index} #{a.to_s}" } break end centroids = update assignment, data end end main