summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-05-30 17:49:20 +0200
committer Patrick Simianer <p@simianer.de> 2015-05-30 17:49:20 +0200
commit 5f983dd1edeff0dd04c701381c7f0d3f2a83a525 (patch)
tree e722d379f9fa24e43ca813c395cafdaed18b0318
parent 2e983112813c41b40800aee1ce9d0a083763f224 (diff)
rm old code
-rwxr-xr-x perceptron-new.rb 81
-rwxr-xr-x perceptron-test.rb 55
-rwxr-xr-x perceptron.rb 117
-rwxr-xr-x rerank.rb 66
-rwxr-xr-x test.rb (renamed from perceptron-new-test.rb) 0
5 files changed, 80 insertions, 239 deletions
diff --git a/perceptron-new.rb b/perceptron-new.rb
deleted file mode 100755
index 1c8a76c..0000000
--- a/perceptron-new.rb
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-STDERR.write "reading training data...\n"
-train = []
-train_f = ReadFile.new ARGV[0]
-n = 0
-while i = train_f.gets
- train << SparseVector.from_kv(i.strip, '=', ' ')
- n += 1
- STDERR.write "#{n}\n" if n%1000==0
-end
-STDERR.write " training set size = #{train.size}\n"
-
-prev_loss = Float::MAX # converged?
-T = 1000000 # max number of iterations
-t = 0
-w = SparseVector.new # 0 vector
-no_change = 0
-save_freq = 1
-if ARGV[1]
- save_freq = ARGV[1].to_i
-end
-
-while true
-
- if t == T
- STDERR.write "\nreached max. number of iterations!\n"
- break
- end
-
- STDERR.write "\niteration #{t}\n"
-
- train.shuffle!
- loss = 0.0
- errors = 0
- j = 1
-
- train.each { |x|
- m = w.dot(x)
- if m <= 0.0
- loss += m.abs
- errors += 1
- w += x
- end
- STDERR.write '.' if j%10==0
- STDERR.write "\n" if j%1000==0
- j += 1
- }
-
- STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n"
-
- if (loss.abs-prev_loss.abs).abs <= 10**-4
- no_change += 1
- else
- no_change = 0
- end
- if no_change == 3
- STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
- break
- end
- prev_loss = loss
-
- if t%save_freq == 0
- STDERR.write "\nwriting model to model.#{t}.gz ...\n"
- f = WriteFile.new "model.#{t}.gz"
- f.write w.to_kv("\t", "\n")+"\n"
- f.close
- STDERR.write "done!\n"
- end
-
- t += 1
-end
-
-STDERR.write "\nwriting model to model.final.gz ...\n"
-f = WriteFile.new "model.final.gz"
-f.write w.to_kv("\t", "\n")+"\n"
-f.close
-STDERR.write "done!\n"
-
diff --git a/perceptron-test.rb b/perceptron-test.rb
deleted file mode 100755
index f3ffcd2..0000000
--- a/perceptron-test.rb
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-def dot v, w
- sum = 0.0
- v.each_with_index { |k,i|
- sum += k * w[i]
- }
-
- return sum
-end
-
-def elen v
- len = 0.0
- v.each { |i| len += i**2 }
- return Math.sqrt len
-end
-
-def norm v
- len = elen v
- return v.map { |i| i/len }
-end
-
-STDERR.write "loading feature dict\n"
-fd = Marshal.load ReadFile.read ARGV[0]
-d = fd.size
-STDERR.write "#{d}\n"
-
-STDERR.write "loading model\n"
-w = Marshal.load ReadFile.read ARGV[1]
-
-STDERR.write "predicting..\n"
-err = 0
-loss = 0.0
-i = 0
-while line = STDIN.gets
- x = [0.0] * d
- line.split.each { |i|
- k,v = i.split '=', 2
- x[fd[k]] = v.to_f
- }
- m = dot(w, norm(x))
- if m <= 0.0
- puts -1
- loss += m.abs
- err += 1
- else
- puts 1
- end
- i += 1
-end
-
-STDERR.write "#{err}/#{test.size}% accuracy, loss=#{loss}\n"
-
diff --git a/perceptron.rb b/perceptron.rb
index d20b0ea..1c8a76c 100755
--- a/perceptron.rb
+++ b/perceptron.rb
@@ -2,73 +2,80 @@
require 'zipf'
-puts "loading feature dict"
-fd = Marshal.load ReadFile.read ARGV[0]
-d = fd.size
-puts d
-
-puts "reading training data"
+STDERR.write "reading training data...\n"
train = []
-l_i = 1
-while line = STDIN.gets
- puts l_i if l_i%1000==0
- v = [0.0] * d
- line.split.each { |i|
- k,w = i.split '=', 2
- v[fd[k]] = w.to_f
- }
- train << v
- l_i+= 1
-end
-
-def dot v, w
- sum = 0.0
- v.each_with_index { |k,i|
- sum += k * w[i]
- }
-
- return sum
-end
-
-def elen v
- len = 0.0
- v.each { |i| len += i**2 }
- return Math.sqrt len
+train_f = ReadFile.new ARGV[0]
+n = 0
+while i = train_f.gets
+ train << SparseVector.from_kv(i.strip, '=', ' ')
+ n += 1
+ STDERR.write "#{n}\n" if n%1000==0
end
+STDERR.write " training set size = #{train.size}\n"
-def norm v
- len = elen v
- return v.map { |i| i/len }
+prev_loss = Float::MAX # converged?
+T = 1000000 # max number of iterations
+t = 0
+w = SparseVector.new # 0 vector
+no_change = 0
+save_freq = 1
+if ARGV[1]
+ save_freq = ARGV[1].to_i
end
-def add v, w, l
- v.each_with_index { |k,i| v[i] = k + (w[i]*l) }
- return v
-end
-
-T = 12
-l = 0.001
-train.map! { |v| norm(v) }
-w = []
-d.times { w << rand(0.001..0.005) }
-w = norm(w)
+while true
-margin = ARGV[1].to_f
+ if t == T
+ STDERR.write "\nreached max. number of iterations!\n"
+ break
+ end
-T.times { |t|
- STDERR.write "iteration #{t}\n"
+ STDERR.write "\niteration #{t}\n"
+ train.shuffle!
loss = 0.0
+ errors = 0
+ j = 1
+
train.each { |x|
- m = dot(w, x)
- if m < margin
+ m = w.dot(x)
+ if m <= 0.0
loss += m.abs
- w = norm(add(w,x,l))
+ errors += 1
+ w += x
end
+ STDERR.write '.' if j%10==0
+ STDERR.write "\n" if j%1000==0
+ j += 1
}
- STDERR.write "loss = #{loss}\n"
-}
-f = File.new('model', 'w')
-f.write Marshal.dump w
+ STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n"
+
+ if (loss.abs-prev_loss.abs).abs <= 10**-4
+ no_change += 1
+ else
+ no_change = 0
+ end
+ if no_change == 3
+ STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n"
+ break
+ end
+ prev_loss = loss
+
+ if t%save_freq == 0
+ STDERR.write "\nwriting model to model.#{t}.gz ...\n"
+ f = WriteFile.new "model.#{t}.gz"
+ f.write w.to_kv("\t", "\n")+"\n"
+ f.close
+ STDERR.write "done!\n"
+ end
+
+ t += 1
+end
+
+STDERR.write "\nwriting model to model.final.gz ...\n"
+f = WriteFile.new "model.final.gz"
+f.write w.to_kv("\t", "\n")+"\n"
+f.close
+STDERR.write "done!\n"
diff --git a/rerank.rb b/rerank.rb
index 900e0f2..9e2a708 100755
--- a/rerank.rb
+++ b/rerank.rb
@@ -3,72 +3,42 @@
require 'zipf'
class KbestItem
- attr_accessor :rank, :model, :gold, :f, :id
+ attr_accessor :rank, :model, :gold, :f, :model_orig
def initialize s
a = s.split "\t"
@rank = a[0].to_i
@gold = a[1].to_f
@model = a[2].to_f
+ @model_orig = @model
@f = SparseVector.from_kv a[3], "=", " "
- @id = -1
end
-end
-
-
-
-
-
-def dot v, w
- sum = 0.0
- v.each_with_index { |k,i|
- sum += k * w[i]
- }
-
- return sum
-end
-
-def elen v
- len = 0.0
- v.each { |i| len += i**2 }
- return Math.sqrt len
-end
-
-def norm v
- len = elen v
- return v.map { |i| i/len }
+ def to_s
+ return "#{@model}\t#{@gold}"
+ end
end
-STDERR.write "loading feature dict\n"
-fd = Marshal.load ReadFile.read ARGV[0]
-d = fd.size
-STDERR.write "#{d}\n"
-
-STDERR.write "loading model\n"
-w = Marshal.load ReadFile.read ARGV[1]
+`mkdir rrkb`
+w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n"
STDERR.write "reranking..\n"
-kbest_lists = []
cur = []
+k_sum = 0
+j = 0
while line = STDIN.gets
item = KbestItem.new line.strip
- x = [0.0] * d
- line.split("\t")[3].split.each { |i|
- k,v = i.split '=', 2
- x[fd[k]] = v.to_f
- }
- m = dot(w, norm(x))
- item.model = m
+ item.model = w.dot(item.f)
if item.rank == 0 && cur.size > 0
- kbest_lists << cur
+ cur.sort! { |i,j| j.model <=> i.model }
+ f = WriteFile.new "rrkb/#{j}.gz"
+ f.write cur.map{|x| x.to_s}.join("\n")
+ f.close
+ puts "RERANKED\t#{cur.first.gold}"
cur = []
+ j += 1
end
cur << item
end
-kbest_lists << cur
-
-kbest_lists.each { |l|
- puts "RERANKED\t#{l.sort { |i,j| j.model <=> i.model }.first.gold}"
-}
-
+cur.sort! { |i,j| j.model <=> i.model }
+puts "RERANKED\t#{cur.first.gold}"
diff --git a/perceptron-new-test.rb b/test.rb
index 6566f68..6566f68 100755
--- a/perceptron-new-test.rb
+++ b/test.rb