summaryrefslogtreecommitdiff
path: root/perceptron
diff options
context:
space:
mode:
Diffstat (limited to 'perceptron')
-rw-r--r--perceptron/Makefile8
-rw-r--r--perceptron/perceptron.cc156
-rwxr-xr-xperceptron/perceptron.rb190
-rw-r--r--perceptron/perceptron1.cc143
4 files changed, 497 insertions, 0 deletions
diff --git a/perceptron/Makefile b/perceptron/Makefile
new file mode 100644
index 0000000..09a44cc
--- /dev/null
+++ b/perceptron/Makefile
@@ -0,0 +1,8 @@
+all: perceptron perceptron1
+
+perceptron: perceptron.cc
+ g++ perceptron.cc -o perceptron
+
+perceptron1: perceptron1.cc
+ g++ perceptron1.cc -o perceptron1
+
diff --git a/perceptron/perceptron.cc b/perceptron/perceptron.cc
new file mode 100644
index 0000000..3aa6a72
--- /dev/null
+++ b/perceptron/perceptron.cc
@@ -0,0 +1,156 @@
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <string>
+#include <boost/assign/std/vector.hpp>
+#include <cassert>
+#include <cmath>
+
+using namespace std;
+using namespace boost::assign;
+
+typedef vector<double> vec;
+
+
+/*
+ * features
+ *
+ */
+double
+isupper_f(string s)
+{
+ if ( isupper(s[0]) ) return 0;
+ return 1;
+}
+
+double
+len_f(string s)
+{
+ return s.size();
+}
+
+
+/*
+ * vector ops
+ *
+ */
+double
+dotprod( vec a, vec b )
+{
+ assert( a.size() == b.size() );
+ double sum;
+ for ( size_t i = 0; i < a.size(); i++ ) {
+ sum += a[i] * b[i];
+ }
+ return sum;
+}
+
+vec
+vadd( double x, vec v )
+{
+ for ( size_t i = 0; i < v.size(); i++ ) {
+ v[i] += x;
+ }
+ return v;
+}
+
+vec
+vadd( vec v, vec w )
+{
+ assert( v.size() == w.size() );
+ for ( size_t i = 0; i < v.size(); i++ ) {
+ v[i] += w[i];
+ }
+ return v;
+}
+
+vec
+vsub( vec v, vec w )
+{
+ assert(v.size() == w.size());
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] -= w[i];
+ }
+ return v;
+}
+
+vec
+vmult( double x, vec v )
+{
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] *= x;
+ }
+ return v;
+}
+
+double
+vlen( vec v )
+{
+ return sqrt( dotprod(v, v) );
+}
+
+vec
+vnorm( vec v )
+{
+ double len = vlen(v);
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] /= len;
+ }
+ return v;
+}
+
+
+int
+main( void )
+{
+ vec classes;
+ vector<string> words;
+ classes += -1., 1., -1., 1., -1., 1., -1., 1., -1., 1.;
+ words += "Haus","house","Mauer","plant","Zeug","hello","Maus","worker","Quartz","point";
+
+ vector<pair<double, string> > examples;
+ for (size_t i = 0; i < classes.size(); i++) {
+ pair<double, string> p;
+ p.first = classes[i];
+ p.second = words[i];
+ examples.push_back(p);
+ }
+
+ vec weights;
+ weights += 0.,0.,0.;
+
+ double eta = 0.5;
+
+ for (size_t t = 0; t < 2; t++)
+ {
+ for (size_t x = 0; x < examples.size(); x++) {
+ vec feats;
+ feats += 1.;
+ feats += isupper_f(examples[x].second), len_f(examples[x].second);
+ double err = dotprod(weights, feats);
+ //if ( (examples[x].first == 1 && err < 0) || (examples[x].first == -1 && err >= 0) ) {
+ if ( examples[x].first * err <= 0 ) {
+ weights = vadd(weights, vmult(examples[x].first*eta, feats));
+ }
+ cout << "w " << weights[0] << " " << weights[1] << " " << weights[2] << endl;
+ }
+ cout << "--" << endl;
+ }
+
+ cout << endl;
+ for (size_t x = 0; x < examples.size(); x++) {
+ vec feats;
+ feats += 1.;
+ feats += isupper_f(examples[x].second), len_f(examples[x].second);
+ cout << dotprod(weights, feats) << endl;
+ }
+
+ vec f;
+ f += 1.;
+ f += isupper_f("Bauer"), len_f("Bauer");
+ cout << "test: " << dotprod(weights, f) << endl;
+
+
+ return 0;
+}
+
diff --git a/perceptron/perceptron.rb b/perceptron/perceptron.rb
new file mode 100755
index 0000000..4b9f2fa
--- /dev/null
+++ b/perceptron/perceptron.rb
@@ -0,0 +1,190 @@
+#!/usr/bin/env ruby
+
+require 'zlib'
+
+STDOUT.set_encoding 'utf-8'
+STDOUT.sync = true
+
+
+def ngrams_it(s, n, fix=false)
+ a = s.strip.split
+ a.each_with_index { |tok, i|
+ tok.strip!
+ 0.upto([n-1, a.size-i-1].min) { |m|
+ yield a[i..i+m] if !(fix^(a[i..i+m].size==n))
+ }
+ }
+end
+
+class NamedSparseVector
+ attr_accessor :h
+
+ def initialize init=nil
+ @h = {}
+ @h = init if init
+ @h.default = 0.0
+ end
+
+ def + other
+ new_h = Hash.new
+ new_h.update @h
+ ret = NamedSparseVector.new new_h
+ other.each_pair { |k,v| ret[k]+=v }
+ return ret
+ end
+
+ def - other
+ new_h = Hash.new
+ new_h.update @h
+ ret = NamedSparseVector.new new_h
+ other.each_pair { |k,v| ret[k]-=v }
+ return ret
+ end
+
+ def * scalar
+ raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
+ ret = NamedSparseVector.new
+ @h.keys.each { |k| ret[k] = @h[k]*scalar }
+ return ret
+ end
+
+ def dot other
+ sum = 0.0
+ @h.each_pair { |k,v|
+ sum += v * other[k]
+ }
+ return sum
+ end
+
+ def [] k
+ @h[k]
+ end
+
+ def []= k, v
+ @h[k] = v
+ end
+
+ def each_pair
+ @h.each_pair { |k,v| yield k,v }
+ end
+
+ def to_s
+ @h.to_s
+ end
+
+ def size
+ @h.keys.size
+ end
+end
+
+def sparse_vector_test
+ a = NamedSparseVector.new
+ b = NamedSparseVector.new
+ a["a"] = 1
+ b["b"] = 1
+ c = NamedSparseVector.new
+ c += (a-b)*0.1
+ puts "a=#{a.to_s}, b=#{b.to_s}, (a-b)*0.1 = #{c.to_s}"
+end
+
+def write_model fn, w
+ Zlib::GzipWriter.open(fn) do |gz|
+ gz.write w.to_s+"\n"
+ end
+end
+
+def read_model fn
+ Zlib::GzipReader.open(fn) do |gz|
+ return NamedSparseVector.new eval(gz.read)
+ end
+end
+
+def usage
+ STDERR.write "#{__FILE__} <config file>\n"
+ exit 1
+end
+usage if ARGV.size != 1
+
+def read_cfg fn
+ begin
+ f = File.new fn, 'r'
+ rescue
+ STDERR.write "#{__FILE__}: Can't find file '#{fn}', exiting.\n"
+ exit 1
+ end
+ cfg = {}
+ while line = f.gets
+ next if /^\s*$/.match line
+ k, v = line.strip.split /\s*=\s*/, 2
+ cfg[k] = v unless k[0]=='#' # no inline comments
+ end
+ return cfg
+end
+
+def parse_example s
+ a = s.split
+ label = a[0].to_f
+ fv = NamedSparseVector.new
+ a[1..a.size-2].each { |i|
+ name,val = i.split ':'
+ fv[name] = val.to_f
+ }
+ return [label, fv]
+end
+
+# main
+cfg = read_cfg ARGV[0]
+silent = true if cfg['silent']
+max_iter = 1000
+max_iter = cfg['max_iter'].to_i if cfg['max_iter']
+errors = 0
+start = Time.now
+w = NamedSparseVector.new
+bias = 0
+
+train = []
+train_f = File.new cfg['train'], 'r'
+while line = train_f.gets
+ train << parse_example(line.strip)
+end
+train_f.close
+
+test = []
+if cfg['test']
+ test_f = File.new cfg['test'], 'r'
+ while line = test_f.gets
+ test << parse_example(line.strip)
+ end
+ test_f.close
+end
+
+iter = 0
+while true
+ err = 0
+ train.each_with_index { |i, idx|
+ if (i[0] * (w.dot(i[1]) + bias)) <= i[0]
+ w += i[1] * i[0]
+ bias += i[0]
+ err += 1
+ end
+ }
+ puts "iter:#{iter} err=#{err}"
+ iter += 1
+ break if err==0 || iter==max_iter
+end
+
+elapsed = Time.now-start
+puts "#{elapsed.round 2} s, #{(elapsed/Float(iter+1)).round 2} s per iter; model size: #{w.size}" if !silent
+puts cfg['model_file']
+write_model cfg['model_file'], w
+
+if cfg['test']
+ test_err = 0
+ test.each { |i|
+ if (i[0] * (w.dot(i[1]) + bias)) <= i[0]
+ test_err += 1
+ end
+ }
+ puts "test error=#{test_err}"
+end
+
diff --git a/perceptron/perceptron1.cc b/perceptron/perceptron1.cc
new file mode 100644
index 0000000..0d9957b
--- /dev/null
+++ b/perceptron/perceptron1.cc
@@ -0,0 +1,143 @@
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <string>
+#include <boost/assign/std/vector.hpp>
+#include <cassert>
+#include <cmath>
+
+using namespace std;
+using namespace boost::assign;
+
+
+double
+isupper_f(string s)
+{
+ if ( isupper(s[0]) ) return 0;
+ return 1;
+}
+
+double
+len_f(string s)
+{
+ return s.size();
+}
+
+double
+dotprod(vector<double> a, vector<double> b)
+{
+ assert(a.size() == b.size());
+ double sum;
+ for (size_t i = 0; i < a.size(); i++) {
+ sum += a[i] * b[i];
+ }
+ return sum;
+}
+
+vector<double>
+vadd(double x, vector<double> v)
+{
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] += x;
+ }
+ return v;
+}
+
+vector<double>
+vadd(vector<double> v, vector<double> w)
+{
+ assert(v.size() == w.size());
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] += w[i];
+ }
+ return v;
+}
+
+vector<double>
+vsub(vector<double> v, vector<double> w)
+{
+ assert(v.size() == w.size());
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] -= w[i];
+ }
+ return v;
+}
+
+vector<double>
+vmult(double x, vector<double> v)
+{
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] *= x;
+ }
+ return v;
+}
+
+double
+vlen(vector<double> v)
+{
+ return sqrt(dotprod(v, v));
+}
+
+vector<double>
+vnorm(vector<double> v)
+{
+ double len = vlen(v);
+ for (size_t i = 0; i < v.size(); i++) {
+ v[i] /= len;
+ }
+ return v;
+}
+
+
+int main(void)
+{
+ vector<double> classes;
+ vector<string> words;
+ classes += -1., 1., -1., 1., -1., 1., -1., 1., -1., 1.;
+ words += "Haus","house","Mauer","plant","Zeug","hello","Maus","worker","Quartz","point";
+
+ vector<pair<double, string> > examples;
+ for (size_t i = 0; i < classes.size(); i++) {
+ pair<double, string> p;
+ p.first = classes[i];
+ p.second = words[i];
+ examples.push_back(p);
+ }
+
+ vector<double> weights;
+ weights += 0.,0.;
+
+ double eta = 0.5;
+ double bias = 0.;
+
+ for (size_t t = 0; t < 2; t++)
+ {
+ for (size_t x = 0; x < examples.size(); x++) {
+ vector<double> feats;
+ feats += isupper_f(examples[x].second), len_f(examples[x].second);
+ double err = dotprod(weights, feats);
+ //if ( (examples[x].first == 1 && err < 0) || (examples[x].first == -1 && err >= 0) ) {
+ if ( examples[x].first * (err + bias) <= 0 ) {
+ weights = vadd(weights, vmult(examples[x].first*eta, feats));
+ bias += examples[x].first*eta;
+ }
+ cout << "w " << weights[0] << " " << weights[1] << " bias " << bias << endl;
+ }
+ cout << "--" << endl;
+ }
+
+ cout << endl;
+ for (size_t x = 0; x < examples.size(); x++) {
+ vector<double> feats;
+ feats += isupper_f(examples[x].second), len_f(examples[x].second);
+ cout << dotprod(weights, feats) + bias << endl;
+ }
+
+ vector<double> f;
+ f += isupper_f("Bauer"), len_f("Bauer");
+ cout << "test: " << dotprod(weights, f) + bias << endl;
+
+
+ return 0;
+}
+