1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
#!/usr/bin/env ruby
require 'trollop'
# {s:f} {s:f} => f
def dot(x,y)
sum = 0.0
x.each_pair { |k,v| sum += v * y[k] }
return sum
end
# {s:f} => f
def mag(x)
return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
end
# {s:f} {s:f} => f
def cos_sim(x,y)
return dot(x,y)/(mag(x)*mag(y))
end
# {s:f} {s:f} => f
def euclidian_dist(x,y)
dims = [x.keys, y.keys].flatten.uniq
sum = 0.0
dims.each { |i| sum += (x[i] - y[i])**2 }
return Math.sqrt(sum)
end
# str => {s:{s:f}}
def read(fn)
h = {}
f = File.new fn, 'r'
while line = f.gets
g = eval line
h[g[0]] = g[1]
h[g[0]].default = 0.0
end
return h
end
# {s:{s:f}} i => [{s:f}]
def rand_init(docs, k)
prng = Random.new
return docs.keys.sample k, random:prng
end
def rand_init2(docs, k)
prng = Random.new
a = []
0.upto(k-1) do
a << mean(docs.values.sample k, random:prng)
end
return a
end
# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
def assign(docs, centroids)
assignment = {}
docs.each_pair { |name,feature_vector|
min = 1.0/0
min_index = nil
centroids.each_with_index { |c,j|
dist = euclidian_dist(c, feature_vector)
if dist < min
min = dist
min_index = j
end
}
if assignment.has_key? min_index
assignment[min_index] << [name, feature_vector]
else
assignment[min_index] = [[name, feature_vector]]
end
}
return assignment
end
# [{s:f}] => {s:f}
def mean(a)
res = {}
res.default = 0.0
a.each { |i|
i.each_pair { |k,v|
res[k] += v
}
}
n = a.size.to_f
res.each_pair { |k,v|
res[k] = v/n
}
end
# {i:[{s:f}]} => [{s:f}]
def update(assignment)
new_centroids = []
assignment.each_pair { |centroid,docs|
new_centroids << mean(docs.map{|i |i[1]})
}
return new_centroids
end
def main
opts = Trollop::options do
opt :k, "k", :type => :int, :required => true
opt :input, "input: one feature vector per line", :type => :string, :required => true
opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
end
docs = read opts[:input]
k = opts[:k]
centroids = nil
if opts[:init] == 1
centroids = rand_init(docs, k)
else
centroids = rand_init2(docs, k)
end
STDERR.write "\n k #{k}\n"
STDERR.write " input #{opts[:input]}\n"
STDERR.write "iterations #{opts[:max_iterations]}\n"
STDERR.write "max no ch. #{opts[:max_no_change]}\n"
STDERR.write " init #{opts[:init]}\n\n"
assignment = nil
prev_stats = []
stats = []
no_change = 0
max_no_change = 5
STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
0.upto(opts[:max_iterations]) do |i|
s = "iteration #{i}"
STDERR.write "#{s}\n#{'-'*s.size}\n"
assignment = assign(docs, centroids)
sizes = []
assignment.each_pair { |centroid_index,docs|
sizes << docs.size
}
median = sizes.sort[k/2]
max = sizes.max
min = sizes.min
stats = [median, max, min]
no_change += 1 if stats==prev_stats
prev_stats = stats
STDERR.write sizes.to_s + "\n"
STDERR.write " median cluster sz=#{median}\n"
STDERR.write " max cluster sz=#{max}\n"
STDERR.write " min cluster sz=#{min}\n\n"
if no_change == max_no_change
STDERR.write "\nmax no change hit!\n\n"
assignment.each_pair { |centroid_index,docs|
puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
}
break
end
centroids = update(assignment)
end
end
main
|