From 5f983dd1edeff0dd04c701381c7f0d3f2a83a525 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sat, 30 May 2015 17:49:20 +0200 Subject: rm old code --- perceptron-new-test.rb | 30 ------------- perceptron-new.rb | 81 ---------------------------------- perceptron-test.rb | 55 ----------------------- perceptron.rb | 117 ++++++++++++++++++++++++++----------------------- rerank.rb | 66 ++++++++-------------------- test.rb | 30 +++++++++++++ 6 files changed, 110 insertions(+), 269 deletions(-) delete mode 100755 perceptron-new-test.rb delete mode 100755 perceptron-new.rb delete mode 100755 perceptron-test.rb create mode 100755 test.rb diff --git a/perceptron-new-test.rb b/perceptron-new-test.rb deleted file mode 100755 index 6566f68..0000000 --- a/perceptron-new-test.rb +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -STDERR.write "reading test data...\n" -test = [] -test_f = ReadFile.new ARGV[0] -n = 0 -while i = test_f.gets - test << SparseVector.from_kv(i.strip, '=', ' ') - n += 1 - STDERR.write "#{n}\n" if n%1000==0 -end -STDERR.write " test set size = #{test.size}\n" - -errors = 0 -w = SparseVector.from_kv ReadFile.new(ARGV[1]).read, "\t", "\n" - -test.each { |x| - m = w.dot(x) - if m <= 0.0 - errors += 1 - puts -1 - else - puts 1 - end -} - -STDERR.write "accuracy = #{(test.size-errors)/test.size.to_f}\n" - diff --git a/perceptron-new.rb b/perceptron-new.rb deleted file mode 100755 index 1c8a76c..0000000 --- a/perceptron-new.rb +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -STDERR.write "reading training data...\n" -train = [] -train_f = ReadFile.new ARGV[0] -n = 0 -while i = train_f.gets - train << SparseVector.from_kv(i.strip, '=', ' ') - n += 1 - STDERR.write "#{n}\n" if n%1000==0 -end -STDERR.write " training set size = #{train.size}\n" - -prev_loss = Float::MAX # converged? -T = 1000000 # max number of iterations -t = 0 -w = SparseVector.new # 0 vector -no_change = 0 -save_freq = 1 -if ARGV[1] - save_freq = ARGV[1].to_i -end - -while true - - if t == T - STDERR.write "\nreached max. number of iterations!\n" - break - end - - STDERR.write "\niteration #{t}\n" - - train.shuffle! - loss = 0.0 - errors = 0 - j = 1 - - train.each { |x| - m = w.dot(x) - if m <= 0.0 - loss += m.abs - errors += 1 - w += x - end - STDERR.write '.' if j%10==0 - STDERR.write "\n" if j%1000==0 - j += 1 - } - - STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n" - - if (loss.abs-prev_loss.abs).abs <= 10**-4 - no_change += 1 - else - no_change = 0 - end - if no_change == 3 - STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" - break - end - prev_loss = loss - - if t%save_freq == 0 - STDERR.write "\nwriting model to model.#{t}.gz ...\n" - f = WriteFile.new "model.#{t}.gz" - f.write w.to_kv("\t", "\n")+"\n" - f.close - STDERR.write "done!\n" - end - - t += 1 -end - -STDERR.write "\nwriting model to model.final.gz ...\n" -f = WriteFile.new "model.final.gz" -f.write w.to_kv("\t", "\n")+"\n" -f.close -STDERR.write "done!\n" - diff --git a/perceptron-test.rb b/perceptron-test.rb deleted file mode 100755 index f3ffcd2..0000000 --- a/perceptron-test.rb +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -def dot v, w - sum = 0.0 - v.each_with_index { |k,i| - sum += k * w[i] - } - - return sum -end - -def elen v - len = 0.0 - v.each { |i| len += i**2 } - return Math.sqrt len -end - -def norm v - len = elen v - return v.map { |i| i/len } -end - -STDERR.write "loading feature dict\n" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -STDERR.write "#{d}\n" - -STDERR.write "loading model\n" -w = Marshal.load ReadFile.read ARGV[1] - -STDERR.write "predicting..\n" -err = 0 -loss = 0.0 -i = 0 -while line = STDIN.gets - x = [0.0] * d - line.split.each { |i| - k,v = i.split '=', 2 - x[fd[k]] = v.to_f - } - m = dot(w, norm(x)) - if m <= 0.0 - puts -1 - loss += m.abs - err += 1 - else - puts 1 - end - i += 1 -end - -STDERR.write "#{err}/#{test.size}% accuracy, loss=#{loss}\n" - diff --git a/perceptron.rb b/perceptron.rb index d20b0ea..1c8a76c 100755 --- a/perceptron.rb +++ b/perceptron.rb @@ -2,73 +2,80 @@ require 'zipf' -puts "loading feature dict" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -puts d - -puts "reading training data" +STDERR.write "reading training data...\n" train = [] -l_i = 1 -while line = STDIN.gets - puts l_i if l_i%1000==0 - v = [0.0] * d - line.split.each { |i| - k,w = i.split '=', 2 - v[fd[k]] = w.to_f - } - train << v - l_i+= 1 -end - -def dot v, w - sum = 0.0 - v.each_with_index { |k,i| - sum += k * w[i] - } - - return sum -end - -def elen v - len = 0.0 - v.each { |i| len += i**2 } - return Math.sqrt len +train_f = ReadFile.new ARGV[0] +n = 0 +while i = train_f.gets + train << SparseVector.from_kv(i.strip, '=', ' ') + n += 1 + STDERR.write "#{n}\n" if n%1000==0 end +STDERR.write " training set size = #{train.size}\n" -def norm v - len = elen v - return v.map { |i| i/len } +prev_loss = Float::MAX # converged? +T = 1000000 # max number of iterations +t = 0 +w = SparseVector.new # 0 vector +no_change = 0 +save_freq = 1 +if ARGV[1] + save_freq = ARGV[1].to_i end -def add v, w, l - v.each_with_index { |k,i| v[i] = k + (w[i]*l) } - return v -end - -T = 12 -l = 0.001 -train.map! { |v| norm(v) } -w = [] -d.times { w << rand(0.001..0.005) } -w = norm(w) +while true -margin = ARGV[1].to_f + if t == T + STDERR.write "\nreached max. number of iterations!\n" + break + end -T.times { |t| - STDERR.write "iteration #{t}\n" + STDERR.write "\niteration #{t}\n" + train.shuffle! loss = 0.0 + errors = 0 + j = 1 + train.each { |x| - m = dot(w, x) - if m < margin + m = w.dot(x) + if m <= 0.0 loss += m.abs - w = norm(add(w,x,l)) + errors += 1 + w += x end + STDERR.write '.' if j%10==0 + STDERR.write "\n" if j%1000==0 + j += 1 } - STDERR.write "loss = #{loss}\n" -} -f = File.new('model', 'w') -f.write Marshal.dump w + STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n" + + if (loss.abs-prev_loss.abs).abs <= 10**-4 + no_change += 1 + else + no_change = 0 + end + if no_change == 3 + STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" + break + end + prev_loss = loss + + if t%save_freq == 0 + STDERR.write "\nwriting model to model.#{t}.gz ...\n" + f = WriteFile.new "model.#{t}.gz" + f.write w.to_kv("\t", "\n")+"\n" + f.close + STDERR.write "done!\n" + end + + t += 1 +end + +STDERR.write "\nwriting model to model.final.gz ...\n" +f = WriteFile.new "model.final.gz" +f.write w.to_kv("\t", "\n")+"\n" +f.close +STDERR.write "done!\n" diff --git a/rerank.rb b/rerank.rb index 900e0f2..9e2a708 100755 --- a/rerank.rb +++ b/rerank.rb @@ -3,72 +3,42 @@ require 'zipf' class KbestItem - attr_accessor :rank, :model, :gold, :f, :id + attr_accessor :rank, :model, :gold, :f, :model_orig def initialize s a = s.split "\t" @rank = a[0].to_i @gold = a[1].to_f @model = a[2].to_f + @model_orig = @model @f = SparseVector.from_kv a[3], "=", " " - @id = -1 end -end - - - - - -def dot v, w - sum = 0.0 - v.each_with_index { |k,i| - sum += k * w[i] - } - - return sum -end - -def elen v - len = 0.0 - v.each { |i| len += i**2 } - return Math.sqrt len -end - -def norm v - len = elen v - return v.map { |i| i/len } + def to_s + return "#{@model}\t#{@gold}" + end end -STDERR.write "loading feature dict\n" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -STDERR.write "#{d}\n" - -STDERR.write "loading model\n" -w = Marshal.load ReadFile.read ARGV[1] +`mkdir rrkb` +w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n" STDERR.write "reranking..\n" -kbest_lists = [] cur = [] +k_sum = 0 +j = 0 while line = STDIN.gets item = KbestItem.new line.strip - x = [0.0] * d - line.split("\t")[3].split.each { |i| - k,v = i.split '=', 2 - x[fd[k]] = v.to_f - } - m = dot(w, norm(x)) - item.model = m + item.model = w.dot(item.f) if item.rank == 0 && cur.size > 0 - kbest_lists << cur + cur.sort! { |i,j| j.model <=> i.model } + f = WriteFile.new "rrkb/#{j}.gz" + f.write cur.map{|x| x.to_s}.join("\n") + f.close + puts "RERANKED\t#{cur.first.gold}" cur = [] + j += 1 end cur << item end -kbest_lists << cur - -kbest_lists.each { |l| - puts "RERANKED\t#{l.sort { |i,j| j.model <=> i.model }.first.gold}" -} - +cur.sort! { |i,j| j.model <=> i.model } +puts "RERANKED\t#{cur.first.gold}" diff --git a/test.rb b/test.rb new file mode 100755 index 0000000..6566f68 --- /dev/null +++ b/test.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby + +require 'zipf' + +STDERR.write "reading test data...\n" +test = [] +test_f = ReadFile.new ARGV[0] +n = 0 +while i = test_f.gets + test << SparseVector.from_kv(i.strip, '=', ' ') + n += 1 + STDERR.write "#{n}\n" if n%1000==0 +end +STDERR.write " test set size = #{test.size}\n" + +errors = 0 +w = SparseVector.from_kv ReadFile.new(ARGV[1]).read, "\t", "\n" + +test.each { |x| + m = w.dot(x) + if m <= 0.0 + errors += 1 + puts -1 + else + puts 1 + end +} + +STDERR.write "accuracy = #{(test.size-errors)/test.size.to_f}\n" + -- cgit v1.2.3