From 6ae893b1a83e1f38d2c72ff025fd2a1300919dbc Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 18 Mar 2015 12:09:28 +0100 Subject: init --- perceptron-new.rb | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ perceptron-test.rb | 55 ++++++++++++++++++++++++++++++++++++++++ perceptron.rb | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ rerank.rb | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 270 insertions(+) create mode 100755 perceptron-new.rb create mode 100755 perceptron-test.rb create mode 100755 perceptron.rb create mode 100755 rerank.rb diff --git a/perceptron-new.rb b/perceptron-new.rb new file mode 100755 index 0000000..521e6f5 --- /dev/null +++ b/perceptron-new.rb @@ -0,0 +1,67 @@ +#!/usr/bin/env ruby + +require 'zipf' + +STDERR.write "reading training data...\n" +train = [] +train_f = ReadFile.new ARGV[0] +n = 0 +while i = train_f.gets + train << SparseVector.from_kv(i.strip, '=', ' ') + n += 1 + STDERR.write "#{n}\n" if n%1000==0 +end +STDERR.write " training set size = #{train.size}\n" + +prev_loss = Float::MAX # converged? +T = 1000000 # max number of iterations +t = 0 +w = SparseVector.new # 0 vector +no_change = 0 + +while true + + if t == T + STDERR.write "\nreached max. number of iterations!\n" + break + end + + STDERR.write "\niteration #{t}\n" + + train.shuffle! + loss = 0.0 + j = 1 + + train.each { |x| + m = w.dot(x) + if m <= 0.0 + loss += m.abs + w += x + end + STDERR.write '.' 
if j%10==0 + STDERR.write "\n" if j%1000==0 + j += 1 + } + + STDERR.write "loss = #{loss}\n" + t += 1 + + if (loss.abs-prev_loss.abs).abs <= 10**-4 + no_change += 1 + else + no_change = 0 + end + if no_change == 3 + STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" + break + end + prev_loss = loss + +end + +STDERR.write "\nwriting model...\n" +f = WriteFile.new 'model.gz' +f.write w.to_kv('=', ' ')+"\n" +f.close +STDERR.write "done!\n" + diff --git a/perceptron-test.rb b/perceptron-test.rb new file mode 100755 index 0000000..f3ffcd2 --- /dev/null +++ b/perceptron-test.rb @@ -0,0 +1,55 @@ +#!/usr/bin/env ruby + +require 'zipf' + +def dot v, w + sum = 0.0 + v.each_with_index { |k,i| + sum += k * w[i] + } + + return sum +end + +def elen v + len = 0.0 + v.each { |i| len += i**2 } + return Math.sqrt len +end + +def norm v + len = elen v + return v.map { |i| i/len } +end + +STDERR.write "loading feature dict\n" +fd = Marshal.load ReadFile.read ARGV[0] +d = fd.size +STDERR.write "#{d}\n" + +STDERR.write "loading model\n" +w = Marshal.load ReadFile.read ARGV[1] + +STDERR.write "predicting..\n" +err = 0 +loss = 0.0 +i = 0 +while line = STDIN.gets + x = [0.0] * d + line.split.each { |i| + k,v = i.split '=', 2 + x[fd[k]] = v.to_f + } + m = dot(w, norm(x)) + if m <= 0.0 + puts -1 + loss += m.abs + err += 1 + else + puts 1 + end + i += 1 +end + +STDERR.write "#{err}/#{i} errors, loss=#{loss}\n" + diff --git a/perceptron.rb b/perceptron.rb new file mode 100755 index 0000000..d20b0ea --- /dev/null +++ b/perceptron.rb @@ -0,0 +1,74 @@ +#!/usr/bin/env ruby + +require 'zipf' + +puts "loading feature dict" +fd = Marshal.load ReadFile.read ARGV[0] +d = fd.size +puts d + +puts "reading training data" +train = [] +l_i = 1 +while line = STDIN.gets + puts l_i if l_i%1000==0 + v = [0.0] * d + line.split.each { |i| + k,w = i.split '=', 2 + v[fd[k]] = w.to_f + } + train << v + l_i+= 1 +end + +def dot v, w + sum = 0.0 + v.each_with_index { |k,i| + 
sum += k * w[i] + } + + return sum +end + +def elen v + len = 0.0 + v.each { |i| len += i**2 } + return Math.sqrt len +end + +def norm v + len = elen v + return v.map { |i| i/len } +end + +def add v, w, l + v.each_with_index { |k,i| v[i] = k + (w[i]*l) } + return v +end + +T = 12 +l = 0.001 +train.map! { |v| norm(v) } +w = [] +d.times { w << rand(0.001..0.005) } +w = norm(w) + +margin = ARGV[1].to_f + +T.times { |t| + STDERR.write "iteration #{t}\n" + + loss = 0.0 + train.each { |x| + m = dot(w, x) + if m < margin + loss += m.abs + w = norm(add(w,x,l)) + end + } + STDERR.write "loss = #{loss}\n" +} + +f = File.new('model', 'w') +f.write Marshal.dump(w); f.close + diff --git a/rerank.rb b/rerank.rb new file mode 100755 index 0000000..900e0f2 --- /dev/null +++ b/rerank.rb @@ -0,0 +1,74 @@ +#!/usr/bin/env ruby + +require 'zipf' + +class KbestItem + attr_accessor :rank, :model, :gold, :f, :id + def initialize s + a = s.split "\t" + @rank = a[0].to_i + @gold = a[1].to_f + @model = a[2].to_f + @f = SparseVector.from_kv a[3], "=", " " + @id = -1 + end +end + + + + + + +def dot v, w + sum = 0.0 + v.each_with_index { |k,i| + sum += k * w[i] + } + + return sum +end + +def elen v + len = 0.0 + v.each { |i| len += i**2 } + return Math.sqrt len +end + +def norm v + len = elen v + return v.map { |i| i/len } +end + +STDERR.write "loading feature dict\n" +fd = Marshal.load ReadFile.read ARGV[0] +d = fd.size +STDERR.write "#{d}\n" + +STDERR.write "loading model\n" +w = Marshal.load ReadFile.read ARGV[1] + +STDERR.write "reranking..\n" +kbest_lists = [] +cur = [] +while line = STDIN.gets + item = KbestItem.new line.strip + x = [0.0] * d + line.split("\t")[3].split.each { |i| + k,v = i.split '=', 2 + x[fd[k]] = v.to_f + } + m = dot(w, norm(x)) + item.model = m + if item.rank == 0 && cur.size > 0 + kbest_lists << cur + cur = [] + end + cur << item +end +kbest_lists << cur + +kbest_lists.each { |l| + puts "RERANKED\t#{l.sort { |i,j| j.model <=> i.model }.first.gold}" +} + + -- cgit v1.2.3