#!/usr/bin/env ruby require 'trollop' # {s:f} {s:f} => f def dot(x,y) sum = 0.0 x.each_pair { |k,v| sum += v * y[k] } return sum end # {s:f} => f def mag(x) return Math.sqrt x.values.inject { |sum,i| sum+i**2 } end # {s:f} {s:f} => f def cos_sim(x,y) return dot(x,y)/(mag(x)*mag(y)) end # {s:f} {s:f} => f def euclidian_dist(x,y) dims = [x.keys, y.keys].flatten.uniq sum = 0.0 dims.each { |i| sum += (x[i] - y[i])**2 } return Math.sqrt(sum) end # str => {s:{s:f}} def read(fn) h = {} f = File.new fn, 'r' while line = f.gets g = eval line h[g[0]] = g[1] h[g[0]].default = 0.0 end return h end # {s:{s:f}} i => [{s:f}] def rand_init(docs, k) prng = Random.new return docs.keys.sample k, random:prng end def rand_init2(docs, k) prng = Random.new a = [] 0.upto(k-1) do a << mean(docs.values.sample k, random:prng) end return a end # {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]} def assign(docs, centroids) assignment = {} docs.each_pair { |name,feature_vector| min = 1.0/0 min_index = nil centroids.each_with_index { |c,j| dist = euclidian_dist(c, feature_vector) if dist < min min = dist min_index = j end } if assignment.has_key? min_index assignment[min_index] << [name, feature_vector] else assignment[min_index] = [[name, feature_vector]] end } return assignment end # [{s:f}] => {s:f} def mean(a) res = {} res.default = 0.0 a.each { |i| i.each_pair { |k,v| res[k] += v } } n = a.size.to_f res.each_pair { |k,v| res[k] = v/n } end # {i:[{s:f}]} => [{s:f}] def update(assignment) new_centroids = [] assignment.each_pair { |centroid,docs| new_centroids << mean(docs.map{|i |i[1]}) } return new_centroids end def main opts = Trollop::options do opt :k, "k", :type => :int, :required => true opt :input, "input: one feature vector per line", :type => :string, :required => true opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3 opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 end docs = read opts[:input] k = opts[:k] centroids = nil if opts[:init] == 1 centroids = rand_init(docs, k) else centroids = rand_init2(docs, k) end STDERR.write "\n k #{k}\n" STDERR.write " input #{opts[:input]}\n" STDERR.write "iterations #{opts[:max_iterations]}\n" STDERR.write "max no ch. #{opts[:max_no_change]}\n" STDERR.write " init #{opts[:init]}\n\n" assignment = nil prev_stats = [] stats = [] no_change = 0 max_no_change = 5 STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n" 0.upto(opts[:max_iterations]) do |i| s = "iteration #{i}" STDERR.write "#{s}\n#{'-'*s.size}\n" assignment = assign(docs, centroids) sizes = [] assignment.each_pair { |centroid_index,docs| sizes << docs.size } median = sizes.sort[k/2] max = sizes.max min = sizes.min stats = [median, max, min] no_change += 1 if stats==prev_stats prev_stats = stats STDERR.write sizes.to_s + "\n" STDERR.write " median cluster sz=#{median}\n" STDERR.write " max cluster sz=#{max}\n" STDERR.write " min cluster sz=#{min}\n\n" if no_change == max_no_change STDERR.write "\nmax no change hit!\n\n" assignment.each_pair { |centroid_index,docs| puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}" } break end centroids = update(assignment) end end main