#!/usr/bin/env ruby
#
# Perceptron-style training over sparse feature vectors.
#
# Usage: <script> <training data> [save frequency]
#
#   ARGV[0]  training file, one example per line; each line is parsed with
#            SparseVector.from_kv(line, '=', ' ').
#   ARGV[1]  optional non-zero integer: a model snapshot is written whenever
#            the iteration number is a multiple of it (default: 1).
#
# Every time an example scores <= 0 under the current weights, a copy of the
# weights and the number of consecutive examples they scored > 0 on are
# recorded, then the example is added to the weights. Snapshots go to
# model.<iteration>.gz and the final model to model.final.gz, one
# "<count>\t<key=value ...>" line per recorded weight vector.

require 'zipf'

T = 1_000_000    # max number of iterations
EPSILON = 1e-4   # loss differences up to this size count as "no change"
PATIENCE = 3     # stop after this many consecutive "no change" iterations

# Writes every recorded weight vector, prefixed by its count, to +path+.
#
# path - name of the output file handed to WriteFile.
# ws   - recorded weight vectors (objects responding to to_kv).
# cs   - counts, parallel to +ws+.
def write_model(path, ws, cs)
  STDERR.write "\nwriting model to #{path} ...\n"
  f = WriteFile.new path
  begin
    ws.each_with_index { |v, i| f.write "#{cs[i]}\t#{v.to_kv('=', ' ')}\n" }
  ensure
    f.close # release the handle even if a write fails
  end
  STDERR.write "done!\n"
end

abort "usage: #{$PROGRAM_NAME} <training data> [save frequency]" if ARGV.empty?

save_freq = 1
if ARGV[1]
  save_freq = ARGV[1].to_i
  # String#to_i returns 0 for non-numeric input, and `t % 0` would raise
  # ZeroDivisionError only after the first full iteration; fail early.
  abort "save frequency must be a non-zero integer, got '#{ARGV[1]}'" if save_freq.zero?
end

STDERR.write "reading training data...\n"
train = []
train_f = ReadFile.new ARGV[0]
while (line = train_f.gets)
  train << SparseVector.from_kv(line.strip, '=', ' ')
  STDERR.write "#{train.size}\n" if (train.size % 1000).zero?
end
STDERR.write " training set size = #{train.size}\n"

prev_loss = Float::MAX # loss of the previous iteration, for the convergence check
w = SparseVector.new   # current weights, starts as the 0 vector
c = 0                  # examples scored > 0 since w last changed
ws = []                # weight vectors recorded just before each update
cs = []                # value of c at the time each vector was recorded
no_change = 0          # consecutive iterations with (almost) unchanged loss

# NOTE(review): the current w and its count c are never appended to ws/cs, so
# the written models do not contain the weight vector in use at write time.
# Confirm whether the model consumer expects it.
t = 0
loop do
  if t == T
    STDERR.write "\nreached max. number of iterations!\n"
    break
  end
  STDERR.write "\niteration #{t}\n"

  train.shuffle!
  loss = 0.0
  errors = 0
  train.each.with_index(1) do |x, j|
    m = w.dot(x)
    if m <= 0.0
      # Mistake: remember the outgoing weights and their count, then update.
      loss += m.abs
      errors += 1
      ws << SparseVector.new(w)
      cs << c
      w += x
      c = 0
    else
      c += 1
    end
    # Progress display: one dot per 10 examples, line break per 1000.
    STDERR.write '.' if (j % 10).zero?
    STDERR.write "\n" if (j % 1000).zero?
  end
  STDERR.write "errors = #{errors} (avg = #{(errors / train.size.to_f).round(2)}), loss = #{loss.round(2)} (avg = #{(loss / train.size).round(2)}); #w:#{ws.size}, max c:#{cs.max} \n"

  # loss is a sum of absolute values, hence never negative.
  if (loss - prev_loss).abs <= EPSILON
    no_change += 1
  else
    no_change = 0
  end
  if no_change == PATIENCE
    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
    break
  end
  prev_loss = loss

  write_model("model.#{t}.gz", ws, cs) if (t % save_freq).zero?
  t += 1
end

write_model('model.final.gz', ws, cs)