blob: 13211b1bcb3192d647cdb43d3f3a5ddf15083092 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#!/usr/bin/env ruby
require 'zipf'
# Load the whole training set into memory: every input line becomes one
# SparseVector, parsed by SparseVector.from_kv with '=' and ' ' as separators
# (presumably "feature=value" pairs separated by blanks — confirm against zipf).
#
# ARGV[0] is the training file; without it ReadFile.new would be handed nil,
# so stop with a usage hint instead.
abort "usage: #{$PROGRAM_NAME} <training data> [save frequency]" unless ARGV[0]

STDERR.write "reading training data...\n"
train = []
train_f = ReadFile.new ARGV[0]
while (line = train_f.gets)
  train << SparseVector.from_kv(line.strip, '=', ' ')
  # Progress report on STDERR after every 1000 examples read.
  STDERR.write "#{train.size}\n" if (train.size % 1000).zero?
end
# Everything is in memory now, so the input handle can be released.
# NOTE(review): guarded because ReadFile's interface is not visible here.
train_f.close if train_f.respond_to?(:close)
STDERR.write " training set size = #{train.size}\n"
# State and settings used by the training loop below.
prev_loss = Float::MAX # loss of the previous iteration, for the convergence check
T = 1_000_000 # max number of iterations
t = 0 # index of the current iteration
w = SparseVector.new # 0 vector
c = 0 # examples in a row the current w handled without an update
ws = [] # weight vectors stored each time an update happens
cs = [] # cs[k] is the count recorded together with ws[k]
no_change = 0 # consecutive iterations with (almost) unchanged loss

# Optional second argument: an intermediate model is written whenever the
# iteration index is a multiple of save_freq (default: every iteration).
save_freq = ARGV[1] ? ARGV[1].to_i : 1
# String#to_i returns 0 for non-numeric input, and a zero frequency would make
# the later `t % save_freq` raise ZeroDivisionError only after a complete pass
# over the training data. Reject it up front.
abort "save frequency must be a non-zero integer, got #{ARGV[1].inspect}" if save_freq.zero?
# Write a model file: one line per weight vector, formatted as
# "<count>\t<vector>", with the vector rendered by to_kv('=', ' ').
#
# @param path [String] output file name, passed to WriteFile
# @param vectors [Array] weight vectors; each must respond to to_kv
# @param counts [Array<Integer>] counts[i] is written next to vectors[i]
def write_model(path, vectors, counts)
  STDERR.write "\nwriting model to #{path} ...\n"
  out = WriteFile.new path
  begin
    vectors.each_with_index do |vector, idx|
      out.write "#{counts[idx]}\t#{vector.to_kv('=', ' ')}\n"
    end
  ensure
    out.close # release the handle even if serialization fails midway
  end
  STDERR.write "done!\n"
end

# Training: repeated shuffled passes over `train`. An example x counts as a
# mistake when w.dot(x) <= 0; the weights are then updated by adding x.
# Each replaced weight vector is kept in `ws` with the number of examples it
# handled without an update in `cs` (voted-perceptron style bookkeeping).
# Stops after T iterations, or once the loss changed by at most 1e-4 in three
# consecutive iterations.
loop do
  if t == T
    STDERR.write "\nreached max. number of iterations!\n"
    break
  end
  STDERR.write "\niteration #{t}\n"
  train.shuffle!
  loss = 0.0
  errors = 0
  train.each.with_index(1) do |x, seen|
    m = w.dot(x)
    if m <= 0.0
      loss += m.abs
      errors += 1
      # Store the outgoing weights and their count before updating.
      # NOTE(review): relies on SparseVector.new(w) producing a copy of w —
      # confirm against the zipf implementation.
      ws << SparseVector.new(w)
      cs << c
      w += x
      c = 0
    else
      c += 1
    end
    # Progress display: a dot per 10 examples, a line break per 1000.
    STDERR.write '.' if (seen % 10).zero?
    STDERR.write "\n" if (seen % 1000).zero?
  end
  STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2}); #w:#{ws.size}, max c:#{cs.max} \n"
  # loss is a sum of absolute values, hence never negative.
  if (loss.abs - prev_loss.abs).abs <= 1e-4
    no_change += 1
  else
    no_change = 0
  end
  if no_change == 3
    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
    break
  end
  prev_loss = loss
  # Intermediate snapshot. The vector currently in force (w with count c) is
  # not in ws/cs yet, so it is appended for the dump only.
  write_model "model.#{t}.gz", ws + [w], cs + [c] if (t % save_freq).zero?
  t += 1
end

# Final model. ws/cs only hold vectors that were already replaced; without
# appending (w, c) the file would miss the last vector, which after
# convergence is the one that went longest without an update.
write_model 'model.final.gz', ws + [w], cs + [c]
|