summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-08-04 15:57:31 +0200
committer Patrick Simianer <p@simianer.de> 2015-08-04 15:57:31 +0200
commit 81f6b20d1c41d8906900c57ab71ec08a007ea02c (patch)
tree b05250a71f8ec2417f0fb26fa2b6e79ea41deca6
parent 5f983dd1edeff0dd04c701381c7f0d3f2a83a525 (diff)
voted perceptron
-rwxr-xr-x rerank.rb 27
-rwxr-xr-x voted_perceptron.rb 93
-rwxr-xr-x voted_test.rb 50
3 files changed, 159 insertions, 11 deletions
diff --git a/rerank.rb b/rerank.rb
index 9e2a708..095e20b 100755
--- a/rerank.rb
+++ b/rerank.rb
@@ -3,13 +3,13 @@
require 'zipf'
class KbestItem
- attr_accessor :rank, :model, :gold, :f, :model_orig
+ attr_accessor :rank, :model, :rr, :gold, :f
def initialize s
a = s.split "\t"
@rank = a[0].to_i
@gold = a[1].to_f
@model = a[2].to_f
- @model_orig = @model
+ @rr = -1.0
@f = SparseVector.from_kv a[3], "=", " "
end
@@ -18,27 +18,32 @@ class KbestItem
end
end
-`mkdir rrkb`
w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n"
+def o kl
+ scores = []
+ scores << kl.first.gold
+ kl.sort! { |i,j| j.model <=> i.model }
+ scores << kl.first.gold
+ kl.sort! { |i,j| j.rr <=> i.rr }
+ scores << kl.first.gold
+
+ puts scores.join "\t"
+end
+
STDERR.write "reranking..\n"
cur = []
k_sum = 0
j = 0
while line = STDIN.gets
item = KbestItem.new line.strip
- item.model = w.dot(item.f)
+ item.rr = w.dot(item.f)
if item.rank == 0 && cur.size > 0
- cur.sort! { |i,j| j.model <=> i.model }
- f = WriteFile.new "rrkb/#{j}.gz"
- f.write cur.map{|x| x.to_s}.join("\n")
- f.close
- puts "RERANKED\t#{cur.first.gold}"
+ o cur
cur = []
j += 1
end
cur << item
end
-cur.sort! { |i,j| j.model <=> i.model }
-puts "RERANKED\t#{cur.first.gold}"
+o cur
diff --git a/voted_perceptron.rb b/voted_perceptron.rb
new file mode 100755
index 0000000..13211b1
--- /dev/null
+++ b/voted_perceptron.rb
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+STDERR.write "reading training data...\n"
+train = []
+train_f = ReadFile.new ARGV[0]
+n = 0
+while i = train_f.gets
+ train << SparseVector.from_kv(i.strip, '=', ' ')
+ n += 1
+ STDERR.write "#{n}\n" if n%1000==0
+end
+STDERR.write " training set size = #{train.size}\n"
+
+prev_loss = Float::MAX # converged?
+T = 1000000 # max number of iterations
+t = 0
+w = SparseVector.new # 0 vector
+c = 0
+ws = []
+cs = []
+no_change = 0
+save_freq = 1
+if ARGV[1]
+ save_freq = ARGV[1].to_i
+end
+
+while true
+
+ if t == T
+ STDERR.write "\nreached max. number of iterations!\n"
+ break
+ end
+
+ STDERR.write "\niteration #{t}\n"
+
+ train.shuffle!
+ loss = 0.0
+ errors = 0
+ j = 1
+
+ train.each { |x|
+ m = w.dot(x)
+ if m <= 0.0
+ loss += m.abs
+ errors += 1
+ ws << SparseVector.new(w)
+ cs << c
+ w += x
+ c = 0
+ else
+ c += 1
+ end
+ STDERR.write '.' if j%10==0
+ STDERR.write "\n" if j%1000==0
+ j += 1
+ }
+
+ STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2}); #w:#{ws.size}, max c:#{cs.max} \n"
+
+ if (loss.abs-prev_loss.abs).abs <= 10**-4
+ no_change += 1
+ else
+ no_change = 0
+ end
+ if no_change == 3
+ STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
+ break
+ end
+ prev_loss = loss
+
+ if t%save_freq == 0
+ STDERR.write "\nwriting model to model.#{t}.gz ...\n"
+ f = WriteFile.new "model.#{t}.gz"
+ ws.each_with_index { |v,j|
+ f.write "#{cs[j]}\t#{v.to_kv("=", " ")+"\n"}"
+ }
+ f.close
+ STDERR.write "done!\n"
+ end
+
+ t += 1
+end
+
+STDERR.write "\nwriting model to model.final.gz ...\n"
+f = WriteFile.new "model.final.gz"
+ws.each_with_index { |v,j|
+ f.write "#{cs[j]}\t#{v.to_kv("=", " ")+"\n"}"
+}
+f.close
+STDERR.write "done!\n"
+
diff --git a/voted_test.rb b/voted_test.rb
new file mode 100755
index 0000000..c131ec2
--- /dev/null
+++ b/voted_test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+STDOUT.sync = true
+
+STDERR.write "reading test data...\n"
+test = []
+test_f = ReadFile.new ARGV[0]
+n = 0
+while i = test_f.gets
+ test << SparseVector.from_kv(i.strip, '=', ' ')
+ n += 1
+ STDERR.write "#{n}\n" if n%1000==0
+end
+STDERR.write " test set size = #{test.size}\n"
+
+errors = 0
+ws = []
+cs = []
+ReadFile.readlines_strip(ARGV[1]).each { |l|
+ c, s = l.split "\t"
+ cs << c.to_i
+ next if !s||s.strip==""
+ ws << SparseVector.from_kv(s, "=", " ")
+}
+
+def sign(x)
+ if x <= 0
+ return -1.0
+ else
+ return 1.0
+ end
+end
+
+test.each { |x|
+ m = 0
+ ws.each_with_index{ |w,j|
+ m += sign(w.dot(x))*cs[j]
+ }
+ if m <= 0.0
+ errors += 1
+ puts -1
+ else
+ puts 1
+ end
+}
+
+STDERR.write "accuracy = #{(test.size-errors)/test.size.to_f}\n"
+