1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
#!/usr/bin/env ruby
require 'zipf'
require 'optimist'
def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
a = i.split ' ', 2
v = SparseVector.from_kv a.last
data[a.first] = v
}
return data
end
def rand_init data, k
prng = Random.new
return data.keys.sample k, random:prng
end
def rand_means_init data, k
prng = Random.new
a = []
0.upto(k-1) do
a << SparseVector.mean(data.values.sample k, random:prng)
end
return a
end
def assign centroids, data
assignment = {}
data.each_pair { |name,feature_vector|
min = 1.0/0
min_index = nil
centroids.each_with_index { |c,i|
dist = c.euclidian_dist(feature_vector)
if dist < min
min = dist
min_index = i
end
}
if assignment.has_key? min_index
assignment[min_index] << name
else
assignment[min_index] = [name]
end
}
return assignment
end
def update assignment, data
new_centroids = []
assignment.each_pair { |centroid_index,a|
new_centroids << SparseVector.mean(assignment[centroid_index].map{ |i| data[i] })
}
return new_centroids
end
def main
conf = Optimist::options do
opt :k, "k", :type => :int, :required => true
opt :input, "input: one feature vector per line", :type => :string, :required => true
opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
end
# data is 'ID f1=v1 f2=v2'
data = read_data conf[:input]
k = conf[:k]
centroids = nil
if conf[:init] == 1
centroids = rand_init(data, k)
else
centroids = rand_means_init(data, k)
end
STDERR.write "\n k #{k}\n"
STDERR.write " input #{conf[:input]}\n"
STDERR.write "iterations #{conf[:max_iterations]}\n"
STDERR.write "max no ch. #{conf[:max_no_change]}\n"
STDERR.write " init #{conf[:init]}\n\n"
assignment = nil
prev_stats = []
stats = []
no_change = 0
max_no_change = 5
STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
0.upto(conf[:max_iterations]) do |i|
s = "iteration #{i}"
STDERR.write "#{s}\n#{'-'*s.size}\n"
assignment = assign centroids, data
sizes = []
assignment.each_pair { |centroid_index, a|
sizes << a.size
}
median = sizes.sort[k/2]
max = sizes.max
min = sizes.min
stats = [median, max, min]
no_change += 1 if stats==prev_stats
prev_stats = stats
STDERR.write sizes.to_s + "\n"
STDERR.write " median cluster sz=#{median}\n"
STDERR.write " max cluster sz=#{max}\n"
STDERR.write " min cluster sz=#{min}\n\n"
if no_change == max_no_change
STDERR.write "\nmax no change hit!\n\n"
assignment.each_pair { |centroid_index,a|
puts "#{centroid_index} #{a.to_s}"
}
break
end
centroids = update assignment, data
end
end
main
|