summary | refs | log | tree | commit | diff
path: root/perceptron/perceptron.rb
diff options
context:
space:
mode:
Diffstat (limited to 'perceptron/perceptron.rb')
-rwxr-xr-x perceptron/perceptron.rb 190
1 files changed, 190 insertions, 0 deletions
diff --git a/perceptron/perceptron.rb b/perceptron/perceptron.rb
new file mode 100755
index 0000000..4b9f2fa
--- /dev/null
+++ b/perceptron/perceptron.rb
@@ -0,0 +1,190 @@
+#!/usr/bin/env ruby
+
+require 'zlib'
+
+STDOUT.set_encoding 'utf-8'
+STDOUT.sync = true
+
+
# Yields the n-grams of the whitespace-tokenized string +s+, one Array of
# token Strings per n-gram, grouped by start position.
#
# s   - String to tokenize; leading/trailing whitespace is ignored.
# n   - maximum n-gram length.
# fix - when true, only n-grams of exactly length +n+ are yielded; when
#       false (the default), every n-gram of length 1..n is yielded.
#
# Returns the Array of tokens, or an Enumerator when no block is given.
def ngrams_it(s, n, fix=false)
  return to_enum(:ngrams_it, s, n, fix) unless block_given?
  a = s.strip.split
  a.each_index { |i|
    # m is the number of extra tokens after position i; it is capped so the
    # slice never runs past the end of the token list.
    0.upto([n-1, a.size-i-1].min) { |m|
      gram = a[i..i+m]
      # The previous test, !(fix ^ (gram.size == n)), was true for
      # fix == false only when the n-gram was NOT of length n, so the
      # full-length n-grams were silently dropped in the default mode.
      yield gram if !fix || gram.size == n
    }
  }
end
+
# Sparse vector whose components are addressed by name. Values live in a
# Hash (exposed as +h+) whose default is 0.0, so components that were never
# set read as zero.
class NamedSparseVector
  attr_accessor :h

  # init - optional Hash of name => value used as the initial contents.
  #
  # The hash is copied. Previously the caller's object was adopted as-is and
  # its default overwritten, so later writes to the vector (and the 0.0
  # default) leaked back into the caller's hash.
  def initialize(init=nil)
    @h = init ? init.dup : {}
    @h.default = 0.0
  end

  # Returns a new vector holding self + other; neither operand is modified.
  # other - anything whose each_pair yields (name, value) pairs.
  def +(other)
    ret = NamedSparseVector.new(@h)
    other.each_pair { |k, v| ret[k] += v }
    ret
  end

  # Returns a new vector holding self - other; neither operand is modified.
  # other - anything whose each_pair yields (name, value) pairs.
  def -(other)
    ret = NamedSparseVector.new(@h)
    other.each_pair { |k, v| ret[k] -= v }
    ret
  end

  # Returns a new vector with every stored component multiplied by +scalar+.
  # Raises ArgumentError unless +scalar+ is Numeric.
  def *(scalar)
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    ret = NamedSparseVector.new
    @h.each_pair { |k, v| ret[k] = v * scalar }
    ret
  end

  # Returns the inner product with +other+ as a Float.
  # other - anything indexable by name; it is looked up once for every
  #         component stored in this vector, so it must answer with a number
  #         for each of them (another NamedSparseVector does).
  def dot(other)
    sum = 0.0
    @h.each_pair { |k, v| sum += v * other[k] }
    sum
  end

  # Component named +k+; 0.0 when it was never set.
  def [](k)
    @h[k]
  end

  # Sets the component named +k+ to +v+.
  def []=(k, v)
    @h[k] = v
  end

  # Yields each stored (name, value) pair; returns an Enumerator when no
  # block is given.
  def each_pair
    return to_enum(:each_pair) unless block_given?
    @h.each_pair { |k, v| yield k, v }
  end

  # String form of the underlying Hash.
  def to_s
    @h.to_s
  end

  # Number of stored components.
  def size
    @h.size
  end
end
+
# Smoke test for NamedSparseVector arithmetic: builds two one-component
# vectors, adds (a-b)*0.1 to an empty vector and prints all three.
def sparse_vector_test
  a = NamedSparseVector.new
  a["a"] = 1
  b = NamedSparseVector.new
  b["b"] = 1
  scaled_diff = (a - b) * 0.1
  c = NamedSparseVector.new + scaled_diff
  puts "a=#{a}, b=#{b}, (a-b)*0.1 = #{c}"
end
+
# Writes the string form of +w+, followed by a newline, to the gzip file
# +fn+ (created or overwritten).
def write_model(fn, w)
  Zlib::GzipWriter.open(fn) { |gz| gz.write("#{w}\n") }
end
+
# Reads the gzip file +fn+ and returns a NamedSparseVector built from the
# value its contents evaluate to.
#
# SECURITY: the file's text is passed to eval, so whatever Ruby code it
# contains is executed. Only load model files from a trusted source.
def read_model(fn)
  Zlib::GzipReader.open(fn) { |gz| NamedSparseVector.new(eval(gz.read)) }
end
+
# Prints a one-line usage message to STDERR and terminates with status 1.
def usage
  STDERR.write("#{__FILE__} <config file>\n")
  exit(1)
end

# The script takes exactly one argument: the config file.
usage unless ARGV.size == 1
+
# Reads a "key = value" config file into a Hash of String => String.
#
# fn - path of the config file.
#
# Blank lines are skipped, and so are lines whose first non-blank character
# is '#'. Each remaining line is split at the first '=' (surrounding
# whitespace removed); a line without '=' maps its text to nil. Comments
# after a value are NOT stripped.
#
# Returns the Hash. If the file cannot be opened, a message is written to
# STDERR and the process exits with status 1.
def read_cfg(fn)
  begin
    f = File.new fn, 'r'
  rescue
    STDERR.write "#{__FILE__}: Can't find file '#{fn}', exiting.\n"
    exit 1
  end
  cfg = {}
  begin
    f.each_line do |line|
      next if line =~ /^\s*$/
      k, v = line.strip.split(/\s*=\s*/, 2)
      cfg[k] = v unless k[0] == '#' # no inline comments
    end
  ensure
    # The handle used to be left open for the garbage collector to reclaim.
    f.close
  end
  cfg
end
+
# Parses one example line of the form "<label> <name>:<value> ...".
#
# s - String holding a single example.
#
# Returns [label, fv]: label is the first token converted with to_f, fv is
# a NamedSparseVector built from the name:value tokens. An empty or blank
# line yields [0.0, <empty vector>].
#
# NOTE(review): the slice ends at the second-to-last token, so the final
# token of every line is ignored. Presumably the data carries a trailing
# non-feature token — confirm against the input files; if it does not, the
# last feature is being lost and the range should be 1..-1.
def parse_example(s)
  a = s.split
  label = a[0].to_f
  fv = NamedSparseVector.new
  # For an empty token list the slice is nil rather than [], which used to
  # raise NoMethodError on blank input.
  (a[1..a.size-2] || []).each { |tok|
    name, val = tok.split ':'
    fv[name] = val.to_f
  }
  [label, fv]
end
+
# main
#
# Trains a perceptron on the examples in cfg['train'], writes the learned
# weight vector to cfg['model_file'] and, when cfg['test'] is set, reports
# how many test examples fail the same check that drives training updates.
cfg = read_cfg ARGV[0]
silent = cfg['silent'] ? true : false
max_iter = cfg['max_iter'] ? cfg['max_iter'].to_i : 1000
start = Time.now
w = NamedSparseVector.new
bias = 0

# Reads one example per line. The block form closes the file even when
# parse_example raises.
load_examples = lambda { |path|
  File.open(path, 'r') { |f| f.each_line.map { |line| parse_example(line.strip) } }
}

train = load_examples.call(cfg['train'])
test = cfg['test'] ? load_examples.call(cfg['test']) : []

# An example counts as an error when label * (w . fv + bias) <= label.
# NOTE(review): the right-hand side is the label itself rather than 0 or a
# fixed margin — confirm this is the intended update rule.
violates = lambda { |label, fv| label * (w.dot(fv) + bias) <= label }

iter = 0
loop do
  err = 0
  train.each { |label, fv|
    if violates.call(label, fv)
      w += fv * label
      bias += label
      err += 1
    end
  }
  puts "iter:#{iter} err=#{err}"
  iter += 1
  # >= instead of ==: with max_iter <= 0 the equality could never hold and
  # the loop only stopped on convergence.
  break if err == 0 || iter >= max_iter
end

elapsed = Time.now-start
# iter already equals the number of completed passes (it is incremented
# before the break), so dividing by iter+1 under-reported the time per pass.
puts "#{elapsed.round 2} s, #{(elapsed/Float(iter)).round 2} s per iter; model size: #{w.size}" if !silent
puts cfg['model_file']
# NOTE(review): only w is written; the learned bias is not part of the
# model file.
write_model cfg['model_file'], w

if cfg['test']
  test_err = test.count { |label, fv| violates.call(label, fv) }
  puts "test error=#{test_err}"
end
+