#!/usr/bin/env ruby
#
# Perceptron trainer/tester over sparse feature vectors with named dimensions.
#
# Provenance: perceptron/perceptron.rb as added by commit
# 26c490f404731d053a6205719b6246502c07b449 ("init"),
# Patrick Simianer, Sat, 14 Jun 2014 16:46:27 +0200.
#
# The script takes exactly one argument, a config file of `key = value`
# lines. Keys read below: train, test, model_file, max_iter, silent.

require 'zlib'

STDOUT.set_encoding 'utf-8'
STDOUT.sync = true


# Yields the n-grams of the whitespace-tokenized string +s+ as token arrays.
#
# s   - String to tokenize (split on whitespace).
# n   - Maximum n-gram length.
# fix - When true, only n-grams of exactly length +n+ are yielded;
#       when false, every n-gram of length 1..n is yielded.
#
# A block is required.
def ngrams_it(s, n, fix=false)
  tokens = s.strip.split
  tokens.each_index do |i|
    0.upto([n - 1, tokens.size - i - 1].min) do |m|
      gram = tokens[i..i + m]
      # The former test `!(fix ^ (gram.size == n))` suppressed the length-n
      # grams whenever fix was false; length is only a filter when fix is set.
      yield gram if !fix || gram.size == n
    end
  end
end

# Sparse vector keyed by arbitrary names; absent keys read as 0.0.
class NamedSparseVector
  # The backing Hash (name => value).
  attr_accessor :h

  # init - optional Hash to use as backing store. It is used directly (not
  #        copied) and its default value is set to 0.0.
  def initialize(init=nil)
    @h = init || {}
    @h.default = 0.0
  end

  # Returns a new vector holding self + other; neither operand is modified.
  # +other+ must respond to each_pair.
  def +(other)
    result = NamedSparseVector.new(@h.dup)
    other.each_pair { |k, v| result[k] += v }
    result
  end

  # Returns a new vector holding self - other; neither operand is modified.
  # +other+ must respond to each_pair.
  def -(other)
    result = NamedSparseVector.new(@h.dup)
    other.each_pair { |k, v| result[k] -= v }
    result
  end

  # Returns a new vector with every component multiplied by +scalar+.
  #
  # Raises ArgumentError unless +scalar+ is Numeric.
  def *(scalar)
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    result = NamedSparseVector.new
    @h.each_pair { |k, v| result[k] = v * scalar }
    result
  end

  # Dot product with +other+ (anything indexable by key); iterates only over
  # the keys stored in self. Plain left-to-right accumulation starting at 0.0.
  def dot(other)
    @h.inject(0.0) { |acc, (k, v)| acc + v * other[k] }
  end

  def [](k)
    @h[k]
  end

  def []=(k, v)
    @h[k] = v
  end

  # Yields each stored (key, value) pair.
  def each_pair
    @h.each_pair { |k, v| yield k, v }
  end

  # String form of the backing Hash; this is what write_model stores.
  def to_s
    @h.to_s
  end

  # Number of stored keys.
  def size
    @h.size
  end
end

# Prints a small smoke test of vector arithmetic to STDOUT.
def sparse_vector_test
  a = NamedSparseVector.new
  b = NamedSparseVector.new
  a["a"] = 1
  b["b"] = 1
  c = NamedSparseVector.new
  c += (a - b) * 0.1
  puts "a=#{a.to_s}, b=#{b.to_s}, (a-b)*0.1 = #{c.to_s}"
end

# Writes the string form of weight vector +w+, plus a newline, to the
# gzip-compressed file +fn+.
def write_model(fn, w)
  Zlib::GzipWriter.open(fn) do |gz|
    gz.write "#{w.to_s}\n"
  end
end

# Reads a model written by write_model and returns it as a NamedSparseVector.
#
# SECURITY: the file content is passed to eval, so whatever Ruby code the
# file contains is executed. Only load model files from trusted sources.
def read_model(fn)
  Zlib::GzipReader.open(fn) do |gz|
    return NamedSparseVector.new(eval(gz.read))
  end
end

# Prints the usage line to STDERR and exits with status 1.
def usage
  STDERR.write "#{__FILE__} \n"
  exit 1
end
usage if ARGV.size != 1

# Parses the config file +fn+ into a Hash of String keys to String values.
#
# Blank lines and lines whose key starts with '#' are ignored; inline
# comments are not supported. A line without '=' maps its text to nil.
# Prints a message and exits with status 1 if the file cannot be read.
def read_cfg(fn)
  begin
    # readlines opens, reads and closes the file in a single call.
    lines = File.readlines(fn)
  rescue StandardError
    STDERR.write "#{__FILE__}: Can't find file '#{fn}', exiting.\n"
    exit 1
  end
  cfg = {}
  lines.each do |line|
    next if line =~ /^\s*$/
    k, v = line.strip.split(/\s*=\s*/, 2)
    cfg[k] = v unless k[0] == '#' # no inline comments
  end
  cfg
end

# Parses one example line of the form "label name:val name:val ...".
#
# Returns [label (Float), features (NamedSparseVector)].
#
# NOTE(review): the slice below stops one token before the end, so the LAST
# whitespace-separated token is never read as a feature. This may be an
# off-by-one or may be deliberate for the data format in use; it is kept
# unchanged here -- confirm against the training data.
def parse_example(s)
  a = s.split
  label = a[0].to_f
  fv = NamedSparseVector.new
  a[1..a.size - 2].each do |feat|
    name, val = feat.split ':'
    fv[name] = val.to_f
  end
  [label, fv]
end

# Reads every non-blank line of +fn+ through parse_example and returns the
# resulting array of [label, features] pairs. The file is closed on return.
def load_examples(fn)
  examples = []
  File.foreach(fn) do |line|
    line = line.strip
    next if line.empty? # a blank line cannot be parsed as an example
    examples << parse_example(line)
  end
  examples
end

# True when the example (label, fv) triggers an update under weights +w+ and
# +bias+, i.e. when label * (w.fv + bias) <= label. For a label of +1 that is
# score <= 1, for a label of -1 it is score >= 1, so the decision threshold
# sits at a score of 1.
def update_needed?(label, fv, w, bias)
  label * (w.dot(fv) + bias) <= label
end

# main
cfg = read_cfg ARGV[0]
%w[train model_file].each do |key|
  next if cfg[key]
  # Fail before training rather than with a TypeError from File/Zlib later.
  STDERR.write "#{__FILE__}: Missing '#{key}' in config, exiting.\n"
  exit 1
end
silent = cfg['silent'] # any value counts as "on", including "false"
max_iter = cfg['max_iter'] ? cfg['max_iter'].to_i : 1000
start = Time.now
w = NamedSparseVector.new
bias = 0

train = load_examples cfg['train']
test_set = cfg['test'] ? load_examples(cfg['test']) : []

iter = 0
loop do
  err = 0
  train.each do |label, fv|
    next unless update_needed?(label, fv, w, bias)
    w += fv * label
    bias += label
    err += 1
  end
  puts "iter:#{iter} err=#{err}"
  iter += 1
  # >= rather than == so that a max_iter of 0 or less still terminates.
  break if err == 0 || iter >= max_iter
end

elapsed = Time.now - start
# iter now equals the number of completed epochs (always at least 1).
puts "#{elapsed.round 2} s, #{(elapsed / iter).round 2} s per iter; model size: #{w.size}" if !silent
puts cfg['model_file']
# NOTE(review): only the weights are stored; the learned bias is not part of
# the model file, so it cannot be recovered through read_model.
write_model cfg['model_file'], w

if cfg['test']
  test_err = test_set.count { |label, fv| update_needed?(label, fv, w, bias) }
  puts "test error=#{test_err}"
end