summaryrefslogtreecommitdiff
path: root/kmeans
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2013-12-05 07:56:38 +0100
committerPatrick Simianer <p@simianer.de>2013-12-05 07:56:38 +0100
commitdb6a6ecfa350cae29739c59df1210d8f76a479c9 (patch)
treef137a001f57f170455c28ce97b5abb2726006cf6 /kmeans
init
Diffstat (limited to 'kmeans')
-rwxr-xr-xkmeans162
1 files changed, 162 insertions, 0 deletions
diff --git a/kmeans b/kmeans
new file mode 100755
index 0000000..89cc329
--- /dev/null
+++ b/kmeans
@@ -0,0 +1,162 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# {s:f} {s:f} => f
+def dot(x,y)
+ sum = 0.0
+ x.each_pair { |k,v| sum += v * y[k] }
+ return sum
+end
+
+# {s:f} => f
+def mag(x)
+ return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
+end
+
+# {s:f} {s:f} => f
+def cos_sim(x,y)
+ return dot(x,y)/(mag(x)*mag(y))
+end
+
+# {s:f} {s:f} => f
+def euclidian_dist(x,y)
+ dims = [x.keys, y.keys].flatten.uniq
+ sum = 0.0
+ dims.each { |i| sum += (x[i] - y[i])**2 }
+ return Math.sqrt(sum)
+end
+
+# str => {s:{s:f}}
+def read(fn)
+ h = {}
+ f = File.new fn, 'r'
+ while line = f.gets
+ g = eval line
+ h[g[0]] = g[1]
+ h[g[0]].default = 0.0
+ end
+ return h
+end
+
+# {s:{s:f}} i => [{s:f}]
+def rand_init(docs, k)
+ prng = Random.new
+ return docs.keys.sample k, random:prng
+end
+
+def rand_init2(docs, k)
+ prng = Random.new
+ a = []
+ 0.upto(k-1) do
+ a << mean(docs.values.sample k, random:prng)
+ end
+ return a
+end
+
+# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
+def assign(docs, centroids)
+ assignment = {}
+ docs.each_pair { |name,feature_vector|
+ min = 1.0/0
+ min_index = nil
+ centroids.each_with_index { |c,j|
+ dist = euclidian_dist(c, feature_vector)
+ if dist < min
+ min = dist
+ min_index = j
+ end
+ }
+ if assignment.has_key? min_index
+ assignment[min_index] << [name, feature_vector]
+ else
+ assignment[min_index] = [[name, feature_vector]]
+ end
+ }
+ return assignment
+end
+
+# [{s:f}] => {s:f}
+def mean(a)
+ res = {}
+ res.default = 0.0
+ a.each { |i|
+ i.each_pair { |k,v|
+ res[k] += v
+ }
+ }
+ n = a.size.to_f
+ res.each_pair { |k,v|
+ res[k] = v/n
+ }
+end
+
+# {i:[{s:f}]} => [{s:f}]
+def update(assignment)
+ new_centroids = []
+ assignment.each_pair { |centroid,docs|
+ new_centroids << mean(docs.map{|i |i[1]})
+ }
+ return new_centroids
+end
+
+def main
+ opts = Trollop::options do
+ opt :k, "k", :type => :int, :required => true
+ opt :input, "input: one feature vector per line", :type => :string, :required => true
+ opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
+ opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
+ opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
+ end
+ docs = read opts[:input]
+ k = opts[:k]
+ centroids = nil
+ if opts[:init] == 1
+ centroids = rand_init(docs, k)
+ else
+ centroids = rand_init2(docs, k)
+ end
+ STDERR.write "\n k #{k}\n"
+ STDERR.write " input #{opts[:input]}\n"
+ STDERR.write "iterations #{opts[:max_iterations]}\n"
+ STDERR.write "max no ch. #{opts[:max_no_change]}\n"
+ STDERR.write " init #{opts[:init]}\n\n"
+ assignment = nil
+ prev_stats = []
+ stats = []
+ no_change = 0
+ max_no_change = 5
+ STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
+ 0.upto(opts[:max_iterations]) do |i|
+ s = "iteration #{i}"
+ STDERR.write "#{s}\n#{'-'*s.size}\n"
+ assignment = assign(docs, centroids)
+ sizes = []
+ assignment.each_pair { |centroid_index,docs|
+ sizes << docs.size
+ }
+ median = sizes.sort[k/2]
+ max = sizes.max
+ min = sizes.min
+ stats = [median, max, min]
+ no_change += 1 if stats==prev_stats
+ prev_stats = stats
+ STDERR.write sizes.to_s + "\n"
+ STDERR.write " median cluster sz=#{median}\n"
+ STDERR.write " max cluster sz=#{max}\n"
+ STDERR.write " min cluster sz=#{min}\n\n"
+ if no_change == max_no_change
+ STDERR.write "\nmax no change hit!\n\n"
+ assignment.each_pair { |centroid_index,docs|
+ puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
+ }
+ break
+ end
+ centroids = update(assignment)
+ end
+end
+
+
+main
+