From 81f6b20d1c41d8906900c57ab71ec08a007ea02c Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 4 Aug 2015 15:57:31 +0200
Subject: voted perceptron

---
Notes (free-text area of the patch; not part of the commit message):

  rerank.rb
    KbestItem drops :model_orig and gains :rr, initialised to -1.0. The main
    loop now stores w.dot(item.f) in item.rr and leaves item.model as read
    from the input. The new helper `o` prints, per k-best list, three
    tab-separated gold scores: the first item in input order, the first item
    after sorting by model score (descending), and the first item after
    sorting by rr (descending). The rrkb/ directory and the per-list .gz
    dumps are removed.

  voted_perceptron.rb (new)
    Reads one feature vector per line from ARGV[0] and trains with additive
    updates whenever w.dot(x) <= 0. On each update the previous weight vector
    and its survival count c are appended to ws/cs. Stops after T iterations
    or after three consecutive iterations whose loss differs by <= 10**-4.
    Writes model.<t>.gz every ARGV[1] iterations (default 1) and
    model.final.gz at the end, one "count<TAB>vector" line per stored vector.

  voted_test.rb (new)
    Reads test vectors from ARGV[0] and a model from ARGV[1], then prints
    -1 or 1 per test vector from the count-weighted sum of sign(w.dot(x)),
    and reports accuracy on STDERR.

 rerank.rb           | 27 +++++++++-------
 voted_perceptron.rb | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 voted_test.rb       | 50 ++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 11 deletions(-)
 create mode 100755 voted_perceptron.rb
 create mode 100755 voted_test.rb

diff --git a/rerank.rb b/rerank.rb
index 9e2a708..095e20b 100755
--- a/rerank.rb
+++ b/rerank.rb
@@ -3,13 +3,13 @@
 require 'zipf'
 
 class KbestItem
-  attr_accessor :rank, :model, :gold, :f, :model_orig
+  attr_accessor :rank, :model, :rr, :gold, :f
   def initialize s
     a = s.split "\t"
     @rank = a[0].to_i
     @gold = a[1].to_f
     @model = a[2].to_f
-    @model_orig = @model
+    @rr = -1.0
     @f = SparseVector.from_kv a[3], "=", " "
   end
 
@@ -18,27 +18,32 @@ class KbestItem
   end
 end
 
-`mkdir rrkb`
 w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n"
 
+def o kl
+  scores = []
+  scores << kl.first.gold
+  kl.sort! { |i,j| j.model <=> i.model }
+  scores << kl.first.gold
+  kl.sort! { |i,j| j.rr <=> i.rr }
+  scores << kl.first.gold
+
+  puts scores.join "\t"
+end
+
 STDERR.write "reranking..\n"
 cur = []
 k_sum = 0
 j = 0
 while line = STDIN.gets
   item = KbestItem.new line.strip
-  item.model = w.dot(item.f)
+  item.rr = w.dot(item.f)
   if item.rank == 0 && cur.size > 0
-    cur.sort! { |i,j| j.model <=> i.model }
-    f = WriteFile.new "rrkb/#{j}.gz"
-    f.write cur.map{|x| x.to_s}.join("\n")
-    f.close
-    puts "RERANKED\t#{cur.first.gold}"
+    o cur
     cur = []
     j += 1
   end
   cur << item
 end
 
-cur.sort! { |i,j| j.model <=> i.model }
-puts "RERANKED\t#{cur.first.gold}"
+o cur
diff --git a/voted_perceptron.rb b/voted_perceptron.rb
new file mode 100755
index 0000000..13211b1
--- /dev/null
+++ b/voted_perceptron.rb
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+STDERR.write "reading training data...\n"
+train = []
+train_f = ReadFile.new ARGV[0]
+n = 0
+while i = train_f.gets
+  train << SparseVector.from_kv(i.strip, '=', ' ')
+  n += 1
+  STDERR.write "#{n}\n" if n%1000==0
+end
+STDERR.write " training set size = #{train.size}\n"
+
+prev_loss = Float::MAX # converged?
+T = 1000000 # max number of iterations
+t = 0
+w = SparseVector.new # 0 vector
+c = 0
+ws = []
+cs = []
+no_change = 0
+save_freq = 1
+if ARGV[1]
+  save_freq = ARGV[1].to_i
+end
+
+while true
+
+  if t == T
+    STDERR.write "\nreached max. number of iterations!\n"
+    break
+  end
+
+  STDERR.write "\niteration #{t}\n"
+
+  train.shuffle!
+  loss = 0.0
+  errors = 0
+  j = 1
+
+  train.each { |x|
+    m = w.dot(x)
+    if m <= 0.0
+      loss += m.abs
+      errors += 1
+      ws << SparseVector.new(w)
+      cs << c
+      w += x
+      c = 0
+    else
+      c += 1
+    end
+    STDERR.write '.' if j%10==0
+    STDERR.write "\n" if j%1000==0
+    j += 1
+  }
+
+  STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2}); #w:#{ws.size}, max c:#{cs.max} \n"
+
+  if (loss.abs-prev_loss.abs).abs <= 10**-4
+    no_change += 1
+  else
+    no_change = 0
+  end
+  if no_change == 3
+    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
+    break
+  end
+  prev_loss = loss
+
+  if t%save_freq == 0
+    STDERR.write "\nwriting model to model.#{t}.gz ...\n"
+    f = WriteFile.new "model.#{t}.gz"
+    ws.each_with_index { |v,j|
+      f.write "#{cs[j]}\t#{v.to_kv("=", " ")+"\n"}"
+    }
+    f.close
+    STDERR.write "done!\n"
+  end
+
+  t += 1
+end
+
+STDERR.write "\nwriting model to model.final.gz ...\n"
+f = WriteFile.new "model.final.gz"
+ws.each_with_index { |v,j|
+  f.write "#{cs[j]}\t#{v.to_kv("=", " ")+"\n"}"
+}
+f.close
+STDERR.write "done!\n"
+
diff --git a/voted_test.rb b/voted_test.rb
new file mode 100755
index 0000000..c131ec2
--- /dev/null
+++ b/voted_test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+STDOUT.sync = true
+
+STDERR.write "reading test data...\n"
+test = []
+test_f = ReadFile.new ARGV[0]
+n = 0
+while i = test_f.gets
+  test << SparseVector.from_kv(i.strip, '=', ' ')
+  n += 1
+  STDERR.write "#{n}\n" if n%1000==0
+end
+STDERR.write " test set size = #{test.size}\n"
+
+errors = 0
+ws = []
+cs = []
+ReadFile.readlines_strip(ARGV[1]).each { |l|
+  c, s = l.split "\t"
+  cs << c.to_i
+  next if !s||s.strip==""
+  ws << SparseVector.from_kv(s, "=", " ")
+}
+
+def sign(x)
+  if x <= 0
+    return -1.0
+  else
+    return 1.0
+  end
+end
+
+test.each { |x|
+  m = 0
+  ws.each_with_index{ |w,j|
+    m += sign(w.dot(x))*cs[j]
+  }
+  if m <= 0.0
+    errors += 1
+    puts -1
+  else
+    puts 1
+  end
+}
+
+STDERR.write "accuracy = #{(test.size-errors)/test.size.to_f}\n"
+
-- 
cgit v1.2.3