#!/usr/bin/env ruby

require 'trollop'


# {s:f} {s:f} => f
def dot(x,y)
  sum = 0.0
  x.each_pair { |k,v| sum += v * y[k] }
  return sum
end

# {s:f} => f
def mag(x)
  return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
end

# {s:f} {s:f} => f
def cos_sim(x,y)
  return dot(x,y)/(mag(x)*mag(y))
end

# {s:f} {s:f} => f
def euclidian_dist(x,y)
  dims = [x.keys, y.keys].flatten.uniq  
  sum = 0.0
  dims.each { |i| sum += (x[i] - y[i])**2 }
  return Math.sqrt(sum)
end

# str => {s:{s:f}}
def read(fn)
  h = {}
  f = File.new fn, 'r'
  while line = f.gets
    g = eval line 
    h[g[0]] = g[1]
    h[g[0]].default = 0.0
  end
  return h
end

# {s:{s:f}} i => [{s:f}]
def rand_init(docs, k)
  prng = Random.new 
  return docs.keys.sample k, random:prng
end

def rand_init2(docs, k)
  prng = Random.new 
  a = []
  0.upto(k-1) do
    a << mean(docs.values.sample k, random:prng)
  end
  return a
end

# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
def assign(docs, centroids)
  assignment = {}
  docs.each_pair { |name,feature_vector|
      min = 1.0/0
      min_index = nil
      centroids.each_with_index { |c,j|
        dist = euclidian_dist(c, feature_vector)
        if dist < min 
          min = dist 
          min_index = j
        end
      }
      if assignment.has_key? min_index
        assignment[min_index] << [name, feature_vector]
      else
        assignment[min_index] = [[name, feature_vector]]
      end
  }
  return assignment
end

# [{s:f}] => {s:f}
def mean(a)
  res = {}
  res.default = 0.0
  a.each { |i|
    i.each_pair { |k,v|
      res[k] += v
    }
  }
  n = a.size.to_f
  res.each_pair { |k,v|
    res[k] = v/n 
  }
end

# {i:[{s:f}]} => [{s:f}]
def update(assignment)
  new_centroids = []
  assignment.each_pair { |centroid,docs|
    new_centroids << mean(docs.map{|i |i[1]}) 
  }
  return new_centroids
end

def main
  opts = Trollop::options do
    opt :k, "k", :type => :int, :required => true
    opt :input, "input: one feature vector per line", :type => :string, :required => true
    opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
    opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
    opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
  end
  docs = read opts[:input]
  k = opts[:k]
  centroids = nil
  if opts[:init] == 1
    centroids = rand_init(docs, k)
  else
    centroids = rand_init2(docs, k)
  end
  STDERR.write "\n         k #{k}\n"
  STDERR.write "     input #{opts[:input]}\n"
  STDERR.write "iterations #{opts[:max_iterations]}\n"
  STDERR.write "max no ch. #{opts[:max_no_change]}\n"
  STDERR.write "      init #{opts[:init]}\n\n"
  assignment = nil
  prev_stats = []
  stats = []
  no_change = 0
  max_no_change = 5
  STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
  0.upto(opts[:max_iterations]) do |i|
    s = "iteration #{i}"
    STDERR.write "#{s}\n#{'-'*s.size}\n"
    assignment = assign(docs, centroids)
    sizes = []
    assignment.each_pair { |centroid_index,docs|
      sizes << docs.size
    } 
    median = sizes.sort[k/2]
    max = sizes.max
    min = sizes.min
    stats = [median, max, min]
    no_change += 1 if stats==prev_stats
    prev_stats = stats
    STDERR.write sizes.to_s + "\n"
    STDERR.write " median cluster sz=#{median}\n"
    STDERR.write "    max cluster sz=#{max}\n"
    STDERR.write "    min cluster sz=#{min}\n\n"
    if no_change == max_no_change
      STDERR.write "\nmax no change hit!\n\n"
      assignment.each_pair { |centroid_index,docs| 
        puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
      }
      break
    end
    centroids = update(assignment)
  end
end


main