summary | refs | log | tree | commit | diff
path: root/kmeans
diff options
context:
space:
mode:
author Patrick Simianer <patrick@lilt.com> 2026-02-26 10:05:59 +0000
committer Patrick Simianer <patrick@lilt.com> 2026-02-26 10:05:59 +0000
commit b31ace79ea5f6b3f279c544cd3a443d6fbf2a24d (patch)
tree 31f2b599fa5f6996aeb134390d58deb63eefe04a /kmeans
parent 8805e95ae94d798c6441f7e1b72c90e049563f17 (diff)
overhaul (HEAD, master)
Diffstat (limited to 'kmeans')
-rwxr-xr-x kmeans 17
1 files changed, 8 insertions, 9 deletions
diff --git a/kmeans b/kmeans
index dcf7774..f49fc53 100755
--- a/kmeans
+++ b/kmeans
@@ -1,12 +1,12 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
- a = i.split ' ', 2
+ a = i.split " ", 2
v = SparseVector.from_kv a.last
data[a.first] = v
}
@@ -30,7 +30,7 @@ end
def assign centroids, data
assignment = {}
data.each_pair { |name,feature_vector|
- min = 1.0/0
+ min = Float::INFINITY
min_index = nil
centroids.each_with_index { |c,i|
dist = c.euclidian_dist(feature_vector)
@@ -61,10 +61,10 @@ def main
opt :k, "k", :type => :int, :required => true
opt :input, "input: one feature vector per line", :type => :string, :required => true
opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
- opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
- opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
+ opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => "-n", :default => 3
+ opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => "-j", :default => 2
end
- # data is 'ID f1=v1 f2=v2'
+ # data is "ID f1=v1 f2=v2"
data = read_data conf[:input]
k = conf[:k]
centroids = nil
@@ -86,7 +86,7 @@ def main
STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
0.upto(conf[:max_iterations]) do |i|
s = "iteration #{i}"
- STDERR.write "#{s}\n#{'-'*s.size}\n"
+ STDERR.write "#{s}\n#{"-" * s.size}\n"
assignment = assign centroids, data
sizes = []
assignment.each_pair { |centroid_index, a|
@@ -114,4 +114,3 @@ def main
end
main
-