summaryrefslogtreecommitdiff
path: root/kmeans
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-01-29 19:14:08 +0100
committerPatrick Simianer <p@simianer.de>2014-01-29 19:14:08 +0100
commit68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 (patch)
tree3b445131dcb203e94473ae1d8aa82a1798585276 /kmeans
parent49158e721bfaf6423dca9fc633873218f691c83a (diff)
make use of nlp_ruby, LICENSE
Diffstat (limited to 'kmeans')
-rwxr-xr-xkmeans138
1 files changed, 47 insertions, 91 deletions
diff --git a/kmeans b/kmeans
index 89cc329..5c49d9a 100755
--- a/kmeans
+++ b/kmeans
@@ -1,141 +1,97 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
-# {s:f} {s:f} => f
-def dot(x,y)
- sum = 0.0
- x.each_pair { |k,v| sum += v * y[k] }
- return sum
-end
-
-# {s:f} => f
-def mag(x)
- return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
-end
-
-# {s:f} {s:f} => f
-def cos_sim(x,y)
- return dot(x,y)/(mag(x)*mag(y))
-end
-
-# {s:f} {s:f} => f
-def euclidian_dist(x,y)
- dims = [x.keys, y.keys].flatten.uniq
- sum = 0.0
- dims.each { |i| sum += (x[i] - y[i])**2 }
- return Math.sqrt(sum)
-end
-
-# str => {s:{s:f}}
-def read(fn)
- h = {}
- f = File.new fn, 'r'
- while line = f.gets
- g = eval line
- h[g[0]] = g[1]
- h[g[0]].default = 0.0
- end
- return h
+def read_data fn
+ data = {}
+ ReadFile.new(fn).readlines_strip.map{ |i|
+ a = i.split ' ', 2
+ data[a.first] = read_feature_string a.last
+ }
+ return data
end
-# {s:{s:f}} i => [{s:f}]
-def rand_init(docs, k)
- prng = Random.new
- return docs.keys.sample k, random:prng
+def rand_init data, k
+ prng = Random.new
+ return data.keys.sample k, random:prng
end
-def rand_init2(docs, k)
- prng = Random.new
+def rand_means_init data, k
+ prng = Random.new
a = []
0.upto(k-1) do
- a << mean(docs.values.sample k, random:prng)
+ a << mean_sparse_vector(data.values.sample k, random:prng)
end
return a
end
-# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
-def assign(docs, centroids)
+def assign centroids, data
assignment = {}
- docs.each_pair { |name,feature_vector|
+ data.each_pair { |name,feature_vector|
min = 1.0/0
min_index = nil
- centroids.each_with_index { |c,j|
- dist = euclidian_dist(c, feature_vector)
- if dist < min
- min = dist
- min_index = j
+ centroids.each_with_index { |c,i|
+ dist = c.euclidian_dist(feature_vector)
+ if dist < min
+ min = dist
+ min_index = i
end
}
if assignment.has_key? min_index
- assignment[min_index] << [name, feature_vector]
+ assignment[min_index] << name
else
- assignment[min_index] = [[name, feature_vector]]
+ assignment[min_index] = [name]
end
}
return assignment
end
-# [{s:f}] => {s:f}
-def mean(a)
- res = {}
- res.default = 0.0
- a.each { |i|
- i.each_pair { |k,v|
- res[k] += v
- }
- }
- n = a.size.to_f
- res.each_pair { |k,v|
- res[k] = v/n
- }
-end
-
-# {i:[{s:f}]} => [{s:f}]
-def update(assignment)
+def update assignment, data
new_centroids = []
- assignment.each_pair { |centroid,docs|
- new_centroids << mean(docs.map{|i |i[1]})
+ assignment.each_pair { |centroid_index,a|
+ new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
}
return new_centroids
end
def main
- opts = Trollop::options do
+ cfg = Trollop::options do
opt :k, "k", :type => :int, :required => true
opt :input, "input: one feature vector per line", :type => :string, :required => true
opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
- opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
+ opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
end
- docs = read opts[:input]
- k = opts[:k]
+ # data is 'ID f1=v1 f2=v2'
+ data = read_data cfg[:input]
+ k = cfg[:k]
centroids = nil
- if opts[:init] == 1
- centroids = rand_init(docs, k)
+ if cfg[:init] == 1
+ centroids = rand_init(data, k)
else
- centroids = rand_init2(docs, k)
+ centroids = rand_means_init(data, k)
end
STDERR.write "\n k #{k}\n"
- STDERR.write " input #{opts[:input]}\n"
- STDERR.write "iterations #{opts[:max_iterations]}\n"
- STDERR.write "max no ch. #{opts[:max_no_change]}\n"
- STDERR.write " init #{opts[:init]}\n\n"
+ STDERR.write " input #{cfg[:input]}\n"
+ STDERR.write "iterations #{cfg[:max_iterations]}\n"
+ STDERR.write "max no ch. #{cfg[:max_no_change]}\n"
+ STDERR.write " init #{cfg[:init]}\n\n"
assignment = nil
prev_stats = []
stats = []
no_change = 0
max_no_change = 5
- STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
- 0.upto(opts[:max_iterations]) do |i|
+ STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
+ 0.upto(cfg[:max_iterations]) do |i|
s = "iteration #{i}"
STDERR.write "#{s}\n#{'-'*s.size}\n"
- assignment = assign(docs, centroids)
+ assignment = assign centroids, data
sizes = []
- assignment.each_pair { |centroid_index,docs|
- sizes << docs.size
- }
+ assignment.each_pair { |centroid_index, a|
+ sizes << a.size
+ }
median = sizes.sort[k/2]
max = sizes.max
min = sizes.min
@@ -148,12 +104,12 @@ def main
STDERR.write " min cluster sz=#{min}\n\n"
if no_change == max_no_change
STDERR.write "\nmax no change hit!\n\n"
- assignment.each_pair { |centroid_index,docs|
- puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
+ assignment.each_pair { |centroid_index,a|
+ puts "#{centroid_index} #{a.to_s}"
}
break
end
- centroids = update(assignment)
+ centroids = update assignment, data
end
end