diff options
Diffstat (limited to 'kmeans')
-rwxr-xr-x | kmeans | 162 |
1 files changed, 162 insertions, 0 deletions
@@ -0,0 +1,162 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +# {s:f} {s:f} => f +def dot(x,y) + sum = 0.0 + x.each_pair { |k,v| sum += v * y[k] } + return sum +end + +# {s:f} => f +def mag(x) + return Math.sqrt x.values.inject { |sum,i| sum+i**2 } +end + +# {s:f} {s:f} => f +def cos_sim(x,y) + return dot(x,y)/(mag(x)*mag(y)) +end + +# {s:f} {s:f} => f +def euclidian_dist(x,y) + dims = [x.keys, y.keys].flatten.uniq + sum = 0.0 + dims.each { |i| sum += (x[i] - y[i])**2 } + return Math.sqrt(sum) +end + +# str => {s:{s:f}} +def read(fn) + h = {} + f = File.new fn, 'r' + while line = f.gets + g = eval line + h[g[0]] = g[1] + h[g[0]].default = 0.0 + end + return h +end + +# {s:{s:f}} i => [{s:f}] +def rand_init(docs, k) + prng = Random.new + return docs.keys.sample k, random:prng +end + +def rand_init2(docs, k) + prng = Random.new + a = [] + 0.upto(k-1) do + a << mean(docs.values.sample k, random:prng) + end + return a +end + +# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]} +def assign(docs, centroids) + assignment = {} + docs.each_pair { |name,feature_vector| + min = 1.0/0 + min_index = nil + centroids.each_with_index { |c,j| + dist = euclidian_dist(c, feature_vector) + if dist < min + min = dist + min_index = j + end + } + if assignment.has_key? min_index + assignment[min_index] << [name, feature_vector] + else + assignment[min_index] = [[name, feature_vector]] + end + } + return assignment +end + +# [{s:f}] => {s:f} +def mean(a) + res = {} + res.default = 0.0 + a.each { |i| + i.each_pair { |k,v| + res[k] += v + } + } + n = a.size.to_f + res.each_pair { |k,v| + res[k] = v/n + } +end + +# {i:[{s:f}]} => [{s:f}] +def update(assignment) + new_centroids = [] + assignment.each_pair { |centroid,docs| + new_centroids << mean(docs.map{|i |i[1]}) + } + return new_centroids +end + +def main + opts = Trollop::options do + opt :k, "k", :type => :int, :required => true + opt :input, "input: one feature vector per line", :type => :string, :required => true + opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 + opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3 + opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 + end + docs = read opts[:input] + k = opts[:k] + centroids = nil + if opts[:init] == 1 + centroids = rand_init(docs, k) + else + centroids = rand_init2(docs, k) + end + STDERR.write "\n k #{k}\n" + STDERR.write " input #{opts[:input]}\n" + STDERR.write "iterations #{opts[:max_iterations]}\n" + STDERR.write "max no ch. #{opts[:max_no_change]}\n" + STDERR.write " init #{opts[:init]}\n\n" + assignment = nil + prev_stats = [] + stats = [] + no_change = 0 + max_no_change = 5 + STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n" + 0.upto(opts[:max_iterations]) do |i| + s = "iteration #{i}" + STDERR.write "#{s}\n#{'-'*s.size}\n" + assignment = assign(docs, centroids) + sizes = [] + assignment.each_pair { |centroid_index,docs| + sizes << docs.size + } + median = sizes.sort[k/2] + max = sizes.max + min = sizes.min + stats = [median, max, min] + no_change += 1 if stats==prev_stats + prev_stats = stats + STDERR.write sizes.to_s + "\n" + STDERR.write " median cluster sz=#{median}\n" + STDERR.write " max cluster sz=#{max}\n" + STDERR.write " min cluster sz=#{min}\n\n" + if no_change == max_no_change + STDERR.write "\nmax no change hit!\n\n" + assignment.each_pair { |centroid_index,docs| + puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}" + } + break + end + centroids = update(assignment) + end +end + + +main + |