diff options
-rwxr-xr-x | perceptron-new.rb | 81 | ||||
-rwxr-xr-x | perceptron-test.rb | 55 | ||||
-rwxr-xr-x | perceptron.rb | 117 | ||||
-rwxr-xr-x | rerank.rb | 66 | ||||
-rwxr-xr-x | test.rb (renamed from perceptron-new-test.rb) | 0 |
5 files changed, 80 insertions, 239 deletions
diff --git a/perceptron-new.rb b/perceptron-new.rb deleted file mode 100755 index 1c8a76c..0000000 --- a/perceptron-new.rb +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -STDERR.write "reading training data...\n" -train = [] -train_f = ReadFile.new ARGV[0] -n = 0 -while i = train_f.gets - train << SparseVector.from_kv(i.strip, '=', ' ') - n += 1 - STDERR.write "#{n}\n" if n%1000==0 -end -STDERR.write " training set size = #{train.size}\n" - -prev_loss = Float::MAX # converged? -T = 1000000 # max number of iterations -t = 0 -w = SparseVector.new # 0 vector -no_change = 0 -save_freq = 1 -if ARGV[1] - save_freq = ARGV[1].to_i -end - -while true - - if t == T - STDERR.write "\nreached max. number of iterations!\n" - break - end - - STDERR.write "\niteration #{t}\n" - - train.shuffle! - loss = 0.0 - errors = 0 - j = 1 - - train.each { |x| - m = w.dot(x) - if m <= 0.0 - loss += m.abs - errors += 1 - w += x - end - STDERR.write '.' if j%10==0 - STDERR.write "\n" if j%1000==0 - j += 1 - } - - STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n" - - if (loss.abs-prev_loss.abs).abs <= 10**-4 - no_change += 1 - else - no_change = 0 - end - if no_change == 3 - STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" - break - end - prev_loss = loss - - if t%save_freq == 0 - STDERR.write "\nwriting model to model.#{t}.gz ...\n" - f = WriteFile.new "model.#{t}.gz" - f.write w.to_kv("\t", "\n")+"\n" - f.close - STDERR.write "done!\n" - end - - t += 1 -end - -STDERR.write "\nwriting model to model.final.gz ...\n" -f = WriteFile.new "model.final.gz" -f.write w.to_kv("\t", "\n")+"\n" -f.close -STDERR.write "done!\n" - diff --git a/perceptron-test.rb b/perceptron-test.rb deleted file mode 100755 index f3ffcd2..0000000 --- a/perceptron-test.rb +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -def dot v, w - sum = 0.0 - v.each_with_index { |k,i| - sum += k * w[i] - } - - return sum -end - -def elen v - len = 0.0 - v.each { |i| len += i**2 } - return Math.sqrt len -end - -def norm v - len = elen v - return v.map { |i| i/len } -end - -STDERR.write "loading feature dict\n" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -STDERR.write "#{d}\n" - -STDERR.write "loading model\n" -w = Marshal.load ReadFile.read ARGV[1] - -STDERR.write "predicting..\n" -err = 0 -loss = 0.0 -i = 0 -while line = STDIN.gets - x = [0.0] * d - line.split.each { |i| - k,v = i.split '=', 2 - x[fd[k]] = v.to_f - } - m = dot(w, norm(x)) - if m <= 0.0 - puts -1 - loss += m.abs - err += 1 - else - puts 1 - end - i += 1 -end - -STDERR.write "#{err}/#{test.size}% accuracy, loss=#{loss}\n" - diff --git a/perceptron.rb b/perceptron.rb index d20b0ea..1c8a76c 100755 --- a/perceptron.rb +++ b/perceptron.rb @@ -2,73 +2,80 @@ require 'zipf' -puts "loading feature dict" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -puts d - -puts "reading training data" +STDERR.write "reading training data...\n" train = [] -l_i = 1 -while line = STDIN.gets - puts l_i if l_i%1000==0 - v = [0.0] * d - line.split.each { |i| - k,w = i.split '=', 2 - v[fd[k]] = w.to_f - } - train << v - l_i+= 1 -end - -def dot v, w - sum = 0.0 - v.each_with_index { |k,i| - sum += k * w[i] - } - - return sum -end - -def elen v - len = 0.0 - v.each { |i| len += i**2 } - return Math.sqrt len +train_f = ReadFile.new ARGV[0] +n = 0 +while i = train_f.gets + train << SparseVector.from_kv(i.strip, '=', ' ') + n += 1 + STDERR.write "#{n}\n" if n%1000==0 end +STDERR.write " training set size = #{train.size}\n" -def norm v - len = elen v - return v.map { |i| i/len } +prev_loss = Float::MAX # converged? +T = 1000000 # max number of iterations +t = 0 +w = SparseVector.new # 0 vector +no_change = 0 +save_freq = 1 +if ARGV[1] + save_freq = ARGV[1].to_i end -def add v, w, l - v.each_with_index { |k,i| v[i] = k + (w[i]*l) } - return v -end - -T = 12 -l = 0.001 -train.map! { |v| norm(v) } -w = [] -d.times { w << rand(0.001..0.005) } -w = norm(w) +while true -margin = ARGV[1].to_f + if t == T + STDERR.write "\nreached max. number of iterations!\n" + break + end -T.times { |t| - STDERR.write "iteration #{t}\n" + STDERR.write "\niteration #{t}\n" + train.shuffle! loss = 0.0 + errors = 0 + j = 1 + train.each { |x| - m = dot(w, x) - if m < margin + m = w.dot(x) + if m <= 0.0 loss += m.abs - w = norm(add(w,x,l)) + errors += 1 + w += x end + STDERR.write '.' if j%10==0 + STDERR.write "\n" if j%1000==0 + j += 1 } - STDERR.write "loss = #{loss}\n" -} -f = File.new('model', 'w') -f.write Marshal.dump w + STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n" + + if (loss.abs-prev_loss.abs).abs <= 10**-4 + no_change += 1 + else + no_change = 0 + end + if no_change == 3 + STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" + break + end + prev_loss = loss + + if t%save_freq == 0 + STDERR.write "\nwriting model to model.#{t}.gz ...\n" + f = WriteFile.new "model.#{t}.gz" + f.write w.to_kv("\t", "\n")+"\n" + f.close + STDERR.write "done!\n" + end + + t += 1 +end + +STDERR.write "\nwriting model to model.final.gz ...\n" +f = WriteFile.new "model.final.gz" +f.write w.to_kv("\t", "\n")+"\n" +f.close +STDERR.write "done!\n" @@ -3,72 +3,42 @@ require 'zipf' class KbestItem - attr_accessor :rank, :model, :gold, :f, :id + attr_accessor :rank, :model, :gold, :f, :model_orig def initialize s a = s.split "\t" @rank = a[0].to_i @gold = a[1].to_f @model = a[2].to_f + @model_orig = @model @f = SparseVector.from_kv a[3], "=", " " - @id = -1 end -end - - - - - -def dot v, w - sum = 0.0 - v.each_with_index { |k,i| - sum += k * w[i] - } - - return sum -end - -def elen v - len = 0.0 - v.each { |i| len += i**2 } - return Math.sqrt len -end - -def norm v - len = elen v - return v.map { |i| i/len } + def to_s + return "#{@model}\t#{@gold}" + end end -STDERR.write "loading feature dict\n" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -STDERR.write "#{d}\n" - -STDERR.write "loading model\n" -w = Marshal.load ReadFile.read ARGV[1] +`mkdir rrkb` +w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n" STDERR.write "reranking..\n" -kbest_lists = [] cur = [] +k_sum = 0 +j = 0 while line = STDIN.gets item = KbestItem.new line.strip - x = [0.0] * d - line.split("\t")[3].split.each { |i| - k,v = i.split '=', 2 - x[fd[k]] = v.to_f - } - m = dot(w, norm(x)) - item.model = m + item.model = w.dot(item.f) if item.rank == 0 && cur.size > 0 - kbest_lists << cur + cur.sort! { |i,j| j.model <=> i.model } + f = WriteFile.new "rrkb/#{j}.gz" + f.write cur.map{|x| x.to_s}.join("\n") + f.close + puts "RERANKED\t#{cur.first.gold}" cur = [] + j += 1 end cur << item end -kbest_lists << cur - -kbest_lists.each { |l| - puts "RERANKED\t#{l.sort { |i,j| j.model <=> i.model }.first.gold}" -} - +cur.sort! { |i,j| j.model <=> i.model } +puts "RERANKED\t#{cur.first.gold}" diff --git a/perceptron-new-test.rb b/test.rb index 6566f68..6566f68 100755 --- a/perceptron-new-test.rb +++ b/test.rb |