summaryrefslogtreecommitdiff
path: root/kmeans
blob: 5c49d9a092c97d4469a8fe6a6636c4365cd03b64 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env ruby

require 'nlp_ruby'
require 'trollop'


def read_data fn
  data = {}
  ReadFile.new(fn).readlines_strip.map{ |i|
    a = i.split ' ', 2
    data[a.first] = read_feature_string a.last
  }
  return data
end

def rand_init data, k
  prng = Random.new
  return data.keys.sample k, random:prng
end

def rand_means_init data, k
  prng = Random.new
  a = []
  0.upto(k-1) do
    a << mean_sparse_vector(data.values.sample k, random:prng)
  end
  return a
end

def assign centroids, data
  assignment = {}
  data.each_pair { |name,feature_vector|
      min = 1.0/0
      min_index = nil
      centroids.each_with_index { |c,i|
        dist = c.euclidian_dist(feature_vector)
        if dist < min
          min = dist
          min_index = i
        end
      }
      if assignment.has_key? min_index
        assignment[min_index] << name
      else
        assignment[min_index] = [name]
      end
  }
  return assignment
end

def update assignment, data
  new_centroids = []
  assignment.each_pair { |centroid_index,a|
    new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
  }
  return new_centroids
end

def main
  cfg = Trollop::options do
    opt :k, "k", :type => :int, :required => true
    opt :input, "input: one feature vector per line", :type => :string, :required => true
    opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
    opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
    opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
  end
  # data is 'ID f1=v1 f2=v2'
  data = read_data cfg[:input]
  k = cfg[:k]
  centroids = nil
  if cfg[:init] == 1
    centroids = rand_init(data, k)
  else
    centroids = rand_means_init(data, k)
  end
  STDERR.write "\n         k #{k}\n"
  STDERR.write "     input #{cfg[:input]}\n"
  STDERR.write "iterations #{cfg[:max_iterations]}\n"
  STDERR.write "max no ch. #{cfg[:max_no_change]}\n"
  STDERR.write "      init #{cfg[:init]}\n\n"
  assignment = nil
  prev_stats = []
  stats = []
  no_change = 0
  max_no_change = 5
  STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
  0.upto(cfg[:max_iterations]) do |i|
    s = "iteration #{i}"
    STDERR.write "#{s}\n#{'-'*s.size}\n"
    assignment = assign centroids, data
    sizes = []
    assignment.each_pair { |centroid_index, a|
      sizes << a.size
    }
    median = sizes.sort[k/2]
    max = sizes.max
    min = sizes.min
    stats = [median, max, min]
    no_change += 1 if stats==prev_stats
    prev_stats = stats
    STDERR.write sizes.to_s + "\n"
    STDERR.write " median cluster sz=#{median}\n"
    STDERR.write "    max cluster sz=#{max}\n"
    STDERR.write "    min cluster sz=#{min}\n\n"
    if no_change == max_no_change
      STDERR.write "\nmax no change hit!\n\n"
      assignment.each_pair { |centroid_index,a|
        puts "#{centroid_index} #{a.to_s}"
      }
      break
    end
    centroids = update assignment, data
  end
end


main