summaryrefslogtreecommitdiff
path: root/perceptron.rb
blob: 1c8a76c80807bb7ea9b6599e851c320b0eacda20 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env ruby

require 'zipf'

# Load the training set from the file named by the first command-line argument.
# Every input line is stripped and parsed with SparseVector.from_kv using '='
# and ' ' as separators (presumably "feature=value" pairs separated by blanks
# -- confirm against zipf's SparseVector.from_kv).
STDERR.write "reading training data...\n"
train = []
train_f = ReadFile.new ARGV[0]
n = 0
begin
  while (line = train_f.gets)
    train << SparseVector.from_kv(line.strip, '=', ' ')
    n += 1
    # progress report every 1000 examples read
    STDERR.write "#{n}\n" if n % 1000 == 0
  end
ensure
  # release the input handle, also when parsing a line raises
  train_f.close
end
STDERR.write " training set size = #{train.size}\n"

# Training state and settings shared by the training loop below.
prev_loss = Float::MAX # loss of the previous iteration, used to detect convergence
T = 1_000_000          # max number of iterations
t = 0                  # index of the current iteration
w = SparseVector.new   # weight vector, starts as the 0 vector
no_change = 0          # consecutive iterations whose loss barely changed
# An intermediate model is written whenever t is a multiple of save_freq; the
# optional second command-line argument overrides the default of 1.
save_freq = 1
if ARGV[1]
  save_freq = ARGV[1].to_i
  # String#to_i returns 0 for "0" and for non-numeric input; a zero frequency
  # would only fail later with ZeroDivisionError in `t%save_freq`, after a
  # whole training pass, so reject it up front with a readable message.
  abort "save frequency must be a non-zero integer, got '#{ARGV[1]}'" if save_freq.zero?
end

# Main training loop: one pass over the shuffled training set per iteration.
# Ends either after T iterations or once the summed loss has changed by at
# most 10**-4 for three iterations in a row.
loop do
  if t == T
    STDERR.write "\nreached max. number of iterations!\n"
    break
  end

  STDERR.write "\niteration #{t}\n"

  train.shuffle!
  loss = 0.0
  errors = 0

  train.each.with_index(1) do |example, seen|
    margin = w.dot(example)
    # a non-positive score counts as a mistake: accumulate its magnitude
    # and add the example to the weight vector
    if margin <= 0.0
      loss += margin.abs
      errors += 1
      w += example
    end
    # progress output: a dot per 10 examples, a line break per 1000
    STDERR.write '.'  if seen % 10 == 0
    STDERR.write "\n" if seen % 1000 == 0
  end

  avg_errors = (errors / train.size.to_f).round(2)
  avg_loss = (loss / train.size).round(2)
  STDERR.write "errors = #{errors} (avg = #{avg_errors}), loss = #{loss.round(2)} (avg = #{avg_loss})\n"

  # count consecutive iterations whose loss is (almost) the same as before
  delta = (loss.abs - prev_loss.abs).abs
  no_change = delta <= 10**-4 ? no_change + 1 : 0
  if no_change == 3
    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
    break
  end
  prev_loss = loss

  # periodically dump the current weights
  if (t % save_freq).zero?
    model_fn = "model.#{t}.gz"
    STDERR.write "\nwriting model to #{model_fn} ...\n"
    model_f = WriteFile.new model_fn
    model_f.write(w.to_kv("\t", "\n") + "\n")
    model_f.close
    STDERR.write "done!\n"
  end

  t += 1
end

# Persist the final weight vector, whichever way the training loop ended.
final_fn = "model.final.gz"
STDERR.write "\nwriting model to #{final_fn} ...\n"
out = WriteFile.new final_fn
out.write(w.to_kv("\t", "\n") + "\n")
out.close
STDERR.write "done!\n"