diff options
| -rwxr-xr-x | perceptron-new.rb | 81 | ||||
| -rwxr-xr-x | perceptron-test.rb | 55 | ||||
| -rwxr-xr-x | perceptron.rb | 117 | ||||
| -rwxr-xr-x | rerank.rb | 66 | ||||
| -rwxr-xr-x | test.rb (renamed from perceptron-new-test.rb) | 0 | 
5 files changed, 80 insertions, 239 deletions
| diff --git a/perceptron-new.rb b/perceptron-new.rb deleted file mode 100755 index 1c8a76c..0000000 --- a/perceptron-new.rb +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -STDERR.write "reading training data...\n" -train = [] -train_f = ReadFile.new ARGV[0] -n = 0 -while i = train_f.gets -  train << SparseVector.from_kv(i.strip, '=', ' ') -  n += 1 -  STDERR.write "#{n}\n" if n%1000==0 -end -STDERR.write " training set size = #{train.size}\n" - -prev_loss = Float::MAX # converged? -T = 1000000            # max number of iterations -t = 0 -w = SparseVector.new   # 0 vector -no_change = 0 -save_freq = 1 -if ARGV[1] -  save_freq = ARGV[1].to_i -end - -while true - -  if t == T -    STDERR.write "\nreached max. number of iterations!\n" -    break -  end - -  STDERR.write "\niteration #{t}\n" - -  train.shuffle! -  loss = 0.0 -  errors = 0 -  j = 1 - -  train.each { |x| -    m = w.dot(x) -    if m <= 0.0 -      loss += m.abs -      errors += 1 -      w += x -    end -    STDERR.write '.'  if j%10==0 -    STDERR.write "\n" if j%1000==0 -    j += 1 -  } - -  STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n" - -  if (loss.abs-prev_loss.abs).abs <= 10**-4 -    no_change += 1 -  else -    no_change = 0 -  end -  if no_change == 3 -    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" -    break -  end -  prev_loss = loss - -  if t%save_freq == 0 -    STDERR.write "\nwriting model to model.#{t}.gz ...\n" -    f = WriteFile.new "model.#{t}.gz" -    f.write w.to_kv("\t", "\n")+"\n" -    f.close -    STDERR.write "done!\n" -  end - -  t += 1 -end - -STDERR.write "\nwriting model to model.final.gz ...\n" -f = WriteFile.new "model.final.gz" -f.write w.to_kv("\t", "\n")+"\n" -f.close -STDERR.write "done!\n" - diff --git a/perceptron-test.rb b/perceptron-test.rb deleted file mode 100755 index f3ffcd2..0000000 --- a/perceptron-test.rb +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -def dot v, w -  sum = 0.0 -  v.each_with_index { |k,i| -   sum += k * w[i] -  } - -  return sum -end - -def elen v -  len = 0.0 -  v.each { |i| len += i**2 } -  return Math.sqrt len -end - -def norm v -  len = elen v -  return v.map { |i| i/len } -end - -STDERR.write "loading feature dict\n" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -STDERR.write "#{d}\n" - -STDERR.write "loading model\n" -w = Marshal.load ReadFile.read ARGV[1] - -STDERR.write "predicting..\n" -err = 0 -loss = 0.0 -i = 0 -while line = STDIN.gets -  x = [0.0] * d -  line.split.each { |i| -    k,v = i.split '=', 2 -    x[fd[k]] = v.to_f -  } -  m = dot(w, norm(x)) -  if m <= 0.0 -    puts -1 -    loss += m.abs -    err += 1 -  else -    puts 1 -  end -  i += 1 -end - -STDERR.write "#{err}/#{test.size}% accuracy, loss=#{loss}\n" - diff --git a/perceptron.rb b/perceptron.rb index d20b0ea..1c8a76c 100755 --- a/perceptron.rb +++ b/perceptron.rb @@ -2,73 +2,80 @@  require 'zipf' -puts "loading feature dict" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -puts d - -puts "reading training data" +STDERR.write "reading training data...\n"  train = [] -l_i = 1 -while line = STDIN.gets -  puts l_i if l_i%1000==0 -  v = [0.0] * d  -  line.split.each { |i| -    k,w = i.split '=', 2 -    v[fd[k]] = w.to_f -  } -  train << v -  l_i+= 1 -end - -def dot v, w -  sum = 0.0 -  v.each_with_index { |k,i| -   sum += k * w[i] -  } - -  return sum -end - -def elen v -  len = 0.0 -  v.each { |i| len += i**2 } -  return Math.sqrt len +train_f = ReadFile.new ARGV[0] +n = 0 +while i = train_f.gets +  train << SparseVector.from_kv(i.strip, '=', ' ') +  n += 1 +  STDERR.write "#{n}\n" if n%1000==0  end +STDERR.write " training set size = #{train.size}\n" -def norm v -  len = elen v -  return v.map { |i| i/len } +prev_loss = Float::MAX # converged? +T = 1000000            # max number of iterations +t = 0 +w = SparseVector.new   # 0 vector +no_change = 0 +save_freq = 1 +if ARGV[1] +  save_freq = ARGV[1].to_i  end -def add v, w, l -  v.each_with_index { |k,i| v[i] = k + (w[i]*l) } -  return v -end - -T = 12 -l = 0.001 -train.map! { |v| norm(v) } -w = [] -d.times { w << rand(0.001..0.005) } -w = norm(w) +while true -margin = ARGV[1].to_f +  if t == T +    STDERR.write "\nreached max. number of iterations!\n" +    break +  end -T.times { |t| -  STDERR.write "iteration #{t}\n" +  STDERR.write "\niteration #{t}\n" +  train.shuffle!    loss = 0.0 +  errors = 0 +  j = 1 +    train.each { |x| -    m = dot(w, x) -    if m < margin +    m = w.dot(x) +    if m <= 0.0        loss += m.abs -      w = norm(add(w,x,l)) +      errors += 1 +      w += x      end +    STDERR.write '.'  if j%10==0 +    STDERR.write "\n" if j%1000==0 +    j += 1    } -  STDERR.write "loss = #{loss}\n" -} -f = File.new('model', 'w')  -f.write Marshal.dump w +  STDERR.write "errors = #{errors} (avg = #{(errors/train.size.to_f).round 2}), loss = #{loss.round 2} (avg = #{(loss/train.size).round 2})\n" + +  if (loss.abs-prev_loss.abs).abs <= 10**-4 +    no_change += 1 +  else +    no_change = 0 +  end +  if no_change == 3 +    STDERR.write "\nno change in loss since three iterations (difference < 10**-4)!\n" +    break +  end +  prev_loss = loss + +  if t%save_freq == 0 +    STDERR.write "\nwriting model to model.#{t}.gz ...\n" +    f = WriteFile.new "model.#{t}.gz" +    f.write w.to_kv("\t", "\n")+"\n" +    f.close +    STDERR.write "done!\n" +  end + +  t += 1 +end + +STDERR.write "\nwriting model to model.final.gz ...\n" +f = WriteFile.new "model.final.gz" +f.write w.to_kv("\t", "\n")+"\n" +f.close +STDERR.write "done!\n" @@ -3,72 +3,42 @@  require 'zipf'  class KbestItem -  attr_accessor :rank, :model, :gold, :f, :id +  attr_accessor :rank, :model, :gold, :f, :model_orig    def initialize s      a = s.split "\t"      @rank = a[0].to_i      @gold = a[1].to_f      @model = a[2].to_f +    @model_orig = @model      @f = SparseVector.from_kv a[3], "=", " " -    @id = -1    end -end - - - - - -def dot v, w -  sum = 0.0 -  v.each_with_index { |k,i| -   sum += k * w[i] -  } - -  return sum -end - -def elen v -  len = 0.0 -  v.each { |i| len += i**2 } -  return Math.sqrt len -end - -def norm v -  len = elen v -  return v.map { |i| i/len } +  def to_s +    return "#{@model}\t#{@gold}" +  end  end -STDERR.write "loading feature dict\n" -fd = Marshal.load ReadFile.read ARGV[0] -d = fd.size -STDERR.write "#{d}\n" - -STDERR.write "loading model\n" -w = Marshal.load ReadFile.read ARGV[1] +`mkdir rrkb` +w = SparseVector.from_kv ReadFile.new(ARGV[0]).read, "\t", "\n"  STDERR.write "reranking..\n" -kbest_lists = []  cur = [] +k_sum = 0 +j = 0  while line = STDIN.gets    item = KbestItem.new line.strip -  x = [0.0] * d -  line.split("\t")[3].split.each { |i| -    k,v = i.split '=', 2 -    x[fd[k]] = v.to_f -  } -  m = dot(w, norm(x)) -  item.model = m +  item.model = w.dot(item.f)    if item.rank == 0 && cur.size > 0 -    kbest_lists << cur +    cur.sort! { |i,j| j.model <=> i.model } +    f = WriteFile.new "rrkb/#{j}.gz" +    f.write cur.map{|x| x.to_s}.join("\n") +    f.close +    puts "RERANKED\t#{cur.first.gold}"      cur = [] +    j += 1    end    cur << item  end -kbest_lists << cur - -kbest_lists.each { |l| -  puts "RERANKED\t#{l.sort { |i,j| j.model <=> i.model }.first.gold}" -} - +cur.sort! { |i,j| j.model <=> i.model } +puts "RERANKED\t#{cur.first.gold}" diff --git a/perceptron-new-test.rb b/test.rb index 6566f68..6566f68 100755 --- a/perceptron-new-test.rb +++ b/test.rb | 
