summaryrefslogtreecommitdiff
path: root/kmeans
blob: 89cc329189a502db7f2d90d61e138247cb844372 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env ruby

require 'trollop'


# {s:f} {s:f} => f
def dot(x,y)
  sum = 0.0
  x.each_pair { |k,v| sum += v * y[k] }
  return sum
end

# {s:f} => f
def mag(x)
  return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
end

# {s:f} {s:f} => f
def cos_sim(x,y)
  return dot(x,y)/(mag(x)*mag(y))
end

# {s:f} {s:f} => f
def euclidian_dist(x,y)
  dims = [x.keys, y.keys].flatten.uniq  
  sum = 0.0
  dims.each { |i| sum += (x[i] - y[i])**2 }
  return Math.sqrt(sum)
end

# str => {s:{s:f}}
def read(fn)
  h = {}
  f = File.new fn, 'r'
  while line = f.gets
    g = eval line 
    h[g[0]] = g[1]
    h[g[0]].default = 0.0
  end
  return h
end

# {s:{s:f}} i => [{s:f}]
def rand_init(docs, k)
  prng = Random.new 
  return docs.keys.sample k, random:prng
end

def rand_init2(docs, k)
  prng = Random.new 
  a = []
  0.upto(k-1) do
    a << mean(docs.values.sample k, random:prng)
  end
  return a
end

# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
def assign(docs, centroids)
  assignment = {}
  docs.each_pair { |name,feature_vector|
      min = 1.0/0
      min_index = nil
      centroids.each_with_index { |c,j|
        dist = euclidian_dist(c, feature_vector)
        if dist < min 
          min = dist 
          min_index = j
        end
      }
      if assignment.has_key? min_index
        assignment[min_index] << [name, feature_vector]
      else
        assignment[min_index] = [[name, feature_vector]]
      end
  }
  return assignment
end

# [{s:f}] => {s:f}
def mean(a)
  res = {}
  res.default = 0.0
  a.each { |i|
    i.each_pair { |k,v|
      res[k] += v
    }
  }
  n = a.size.to_f
  res.each_pair { |k,v|
    res[k] = v/n 
  }
end

# {i:[{s:f}]} => [{s:f}]
def update(assignment)
  new_centroids = []
  assignment.each_pair { |centroid,docs|
    new_centroids << mean(docs.map{|i |i[1]}) 
  }
  return new_centroids
end

def main
  opts = Trollop::options do
    opt :k, "k", :type => :int, :required => true
    opt :input, "input: one feature vector per line", :type => :string, :required => true
    opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
    opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
    opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
  end
  docs = read opts[:input]
  k = opts[:k]
  centroids = nil
  if opts[:init] == 1
    centroids = rand_init(docs, k)
  else
    centroids = rand_init2(docs, k)
  end
  STDERR.write "\n         k #{k}\n"
  STDERR.write "     input #{opts[:input]}\n"
  STDERR.write "iterations #{opts[:max_iterations]}\n"
  STDERR.write "max no ch. #{opts[:max_no_change]}\n"
  STDERR.write "      init #{opts[:init]}\n\n"
  assignment = nil
  prev_stats = []
  stats = []
  no_change = 0
  max_no_change = 5
  STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
  0.upto(opts[:max_iterations]) do |i|
    s = "iteration #{i}"
    STDERR.write "#{s}\n#{'-'*s.size}\n"
    assignment = assign(docs, centroids)
    sizes = []
    assignment.each_pair { |centroid_index,docs|
      sizes << docs.size
    } 
    median = sizes.sort[k/2]
    max = sizes.max
    min = sizes.min
    stats = [median, max, min]
    no_change += 1 if stats==prev_stats
    prev_stats = stats
    STDERR.write sizes.to_s + "\n"
    STDERR.write " median cluster sz=#{median}\n"
    STDERR.write "    max cluster sz=#{max}\n"
    STDERR.write "    min cluster sz=#{min}\n\n"
    if no_change == max_no_change
      STDERR.write "\nmax no change hit!\n\n"
      assignment.each_pair { |centroid_index,docs| 
        puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
      }
      break
    end
    centroids = update(assignment)
  end
end


main