diff options
| author | Patrick Simianer <p@simianer.de> | 2015-08-04 15:57:31 +0200 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2015-08-04 15:57:31 +0200 | 
| commit | 81f6b20d1c41d8906900c57ab71ec08a007ea02c (patch) | |
| tree | b05250a71f8ec2417f0fb26fa2b6e79ea41deca6 | |
| parent | 5f983dd1edeff0dd04c701381c7f0d3f2a83a525 (diff) | |
voted perceptron
| -rwxr-xr-x | rerank.rb | 27 | ||||
| -rwxr-xr-x | voted_perceptron.rb | 93 | ||||
| -rwxr-xr-x | voted_test.rb | 50 | 
3 files changed, 159 insertions, 11 deletions
| @@ -3,13 +3,13 @@  require 'zipf'  class KbestItem -  attr_accessor :rank, :model, :gold, :f, :model_orig +  attr_accessor :rank, :model, :rr, :gold, :f    def initialize s      a = s.split "\t"      @rank = a[0].to_i      @gold = a[1].to_f      @model = a[2].to_f -    @model_orig = @model +    @rr    = -1.0      @f = SparseVector.from_kv a[3], "=", " "    end @@ -18,27 +18,32 @@ class KbestItem    end  end -`mkdir rrkb`  w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n" +def o kl +  scores = [] +  scores << kl.first.gold +  kl.sort! { |i,j| j.model <=> i.model } +  scores << kl.first.gold +  kl.sort! { |i,j| j.rr <=> i.rr } +  scores << kl.first.gold + +  puts scores.join "\t" +end +  STDERR.write "reranking..\n"  cur = []  k_sum = 0  j = 0  while line = STDIN.gets    item = KbestItem.new line.strip -  item.model = w.dot(item.f) +  item.rr = w.dot(item.f)    if item.rank == 0 && cur.size > 0 -    cur.sort! { |i,j| j.model <=> i.model } -    f = WriteFile.new "rrkb/#{j}.gz" -    f.write cur.map{|x| x.to_s}.join("\n") -    f.close -    puts "RERANKED\t#{cur.first.gold}" +    o cur      cur = []      j += 1    end    cur << item  end -cur.sort! { |i,j| j.model <=> i.model } -puts "RERANKED\t#{cur.first.gold}" +o cur diff --git a/voted_perceptron.rb b/voted_perceptron.rb new file mode 100755 index 0000000..13211b1 --- /dev/null +++ b/voted_perceptron.rb @@ -0,0 +1,93 @@ +#!/usr/bin/env ruby + +require 'zipf' + +STDERR.write "reading training data...\n" +train = [] +train_f = ReadFile.new ARGV[0] +n = 0 +while i = train_f.gets +  train << SparseVector.from_kv(i.strip, '=', ' ') +  n += 1 +  STDERR.write "#{n}\n" if n%1000==0 +end +STDERR.write " training set size = #{train.size}\n" + +prev_loss = Float::MAX # converged? +T = 1000000            # max number of iterations +t = 0 +w = SparseVector.new   # 0 vector +c = 0 +ws = [] +cs = [] +no_change = 0 +save_freq = 1 +if ARGV[1] +  save_freq = ARGV[1].to_i +end + +while true + +  if t == T +    STDERR.write "\nreached max. number of iterations!\n" +    break +  end + +  STDERR.write "\niteration #{t}\n" + +  train.shuffle! +  loss = 0.0 +  errors = 0 +  j = 1 + +  train.each { |x| +    m = w.dot(x) +    if m <= 0.0 +      loss += m.abs +      errors += 1 +      ws << SparseVector.new(w) +      cs << c +      w += x +      c = 0 +    else +      c += 1 +    end +    STDERR.write '.'  if j%10==0 +    STDERR.write "\n" if j%1000==0 +    j += 1 +  } + +  STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2}); #w:#{ws.size}, max c:#{cs.max} \n" + +  if (loss.abs-prev_loss.abs).abs <= 10**-4 +    no_change += 1 +  else +    no_change = 0 +  end +  if no_change == 3 +    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" +    break +  end +  prev_loss = loss + +  if t%save_freq == 0 +    STDERR.write "\nwriting model to model.#{t}.gz ...\n" +    f = WriteFile.new "model.#{t}.gz" +    ws.each_with_index { |v,j| +      f.write "#{cs[j]}\t#{v.to_kv("=", " ")+"\n"}" +    } +    f.close +    STDERR.write "done!\n" +  end + +  t += 1 +end + +STDERR.write "\nwriting model to model.final.gz ...\n" +f = WriteFile.new "model.final.gz" +ws.each_with_index { |v,j| +  f.write "#{cs[j]}\t#{v.to_kv("=", " ")+"\n"}" +} +f.close +STDERR.write "done!\n" + diff --git a/voted_test.rb b/voted_test.rb new file mode 100755 index 0000000..c131ec2 --- /dev/null +++ b/voted_test.rb @@ -0,0 +1,50 @@ +#!/usr/bin/env ruby + +require 'zipf' + +STDOUT.sync = true + +STDERR.write "reading test data...\n" +test = [] +test_f = ReadFile.new ARGV[0] +n = 0 +while i = test_f.gets +  test << SparseVector.from_kv(i.strip, '=', ' ') +  n += 1 +  STDERR.write "#{n}\n" if n%1000==0 +end +STDERR.write " test set size = #{test.size}\n" + +errors = 0 +ws = [] +cs = [] +ReadFile.readlines_strip(ARGV[1]).each { |l| +  c, s = l.split "\t" +  cs << c.to_i +  next if !s||s.strip=="" +  ws << SparseVector.from_kv(s, "=", " ") +} + +def sign(x) +  if x <= 0 +    return -1.0 +  else +    return 1.0 +  end +end + +test.each { |x| +  m = 0 +  ws.each_with_index{ |w,j| +    m += sign(w.dot(x))*cs[j] +  } +  if m <= 0.0 +    errors += 1 +    puts -1 +  else +    puts 1 +  end +} + +STDERR.write "accuracy = #{(test.size-errors)/test.size.to_f}\n" + | 
