Diffstat (limited to 'perceptron')
-rw-r--r--  perceptron/Makefile       |   8
-rw-r--r--  perceptron/perceptron.cc  | 156
-rwxr-xr-x  perceptron/perceptron.rb  | 190
-rw-r--r--  perceptron/perceptron1.cc | 143
4 files changed, 497 insertions, 0 deletions
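
All four files implement the same idea: a binary perceptron over a small hard-coded word list, using two features per word (isupper_f, whether the first letter is upper case, and len_f, the word length). Building the two C++ programs with the Makefile requires g++ and the Boost headers (boost/assign). As a minimal sketch (not part of the commit, using plain std::vector instead of the Boost assign helpers below), the update step the two C++ programs perform looks roughly like this:

  #include <cstddef>
  #include <vector>

  // One perceptron step: if example (x, y) with y in {-1, +1} is scored on the
  // wrong side of the hyperplane (w, bias), move the hyperplane toward it and
  // return true. Assumes w.size() == x.size(). perceptron.cc folds the bias
  // into a constant feature instead of keeping a separate bias term.
  static bool perceptron_step(std::vector<double>& w, double& bias,
                              const std::vector<double>& x, double y, double eta)
  {
    double score = bias;
    for (std::size_t i = 0; i < x.size(); i++) score += w[i] * x[i];
    if (y * score > 0) return false;          // correctly classified: no change
    for (std::size_t i = 0; i < x.size(); i++) w[i] += eta * y * x[i];
    bias += eta * y;
    return true;
  }

The Ruby script below applies a near-identical update (with eta = 1 and a slightly different update condition) to sparse, string-keyed feature vectors, iterating over the training set until an epoch produces no errors or max_iter is reached.
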
diff --git a/perceptron/Makefile b/perceptron/Makefile
new file mode 100644
index 0000000..09a44cc
--- /dev/null
+++ b/perceptron/Makefile
@@ -0,0 +1,8 @@
+all: perceptron perceptron1
+
+perceptron: perceptron.cc
+	g++ perceptron.cc -o perceptron
+
+perceptron1: perceptron1.cc
+	g++ perceptron1.cc -o perceptron1
+
diff --git a/perceptron/perceptron.cc b/perceptron/perceptron.cc
new file mode 100644
index 0000000..3aa6a72
--- /dev/null
+++ b/perceptron/perceptron.cc
@@ -0,0 +1,156 @@
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <string>
+#include <boost/assign/std/vector.hpp>
+#include <cassert>
+#include <cmath>
+
+using namespace std;
+using namespace boost::assign;
+
+typedef vector<double> vec;
+
+
+/*
+ * features
+ *
+ */
+double
+isupper_f(string s)
+{
+  if ( isupper(s[0]) ) return 0;
+  return 1;
+}
+
+double
+len_f(string s)
+{
+  return s.size();
+}
+
+
+/*
+ * vector ops
+ *
+ */
+double
+dotprod( vec a, vec b )
+{
+  assert( a.size() == b.size() );
+  double sum = 0.;
+  for ( size_t i = 0; i < a.size(); i++ ) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+vec
+vadd( double x, vec v )
+{
+  for ( size_t i = 0; i < v.size(); i++ ) {
+    v[i] += x;
+  }
+  return v;
+}
+
+vec
+vadd( vec v, vec w )
+{
+  assert( v.size() == w.size() );
+  for ( size_t i = 0; i < v.size(); i++ ) {
+    v[i] += w[i];
+  }
+  return v;
+}
+
+vec
+vsub( vec v, vec w )
+{
+  assert(v.size() == w.size());
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] -= w[i];
+  }
+  return v;
+}
+
+vec
+vmult( double x, vec v )
+{
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] *= x;
+  }
+  return v;
+}
+
+double
+vlen( vec v )
+{
+  return sqrt( dotprod(v, v) );
+}
+
+vec
+vnorm( vec v )
+{
+  double len = vlen(v);
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] /= len;
+  }
+  return v;
+}
+
+
+int
+main( void )
+{
+  vec classes;
+  vector<string> words;
+  classes += -1., 1., -1., 1., -1., 1., -1., 1., -1., 1.;
+  words += "Haus","house","Mauer","plant","Zeug","hello","Maus","worker","Quartz","point";
+
+  vector<pair<double, string> > examples;
+  for (size_t i = 0; i < classes.size(); i++) {
+    pair<double, string> p;
+    p.first = classes[i];
+    p.second = words[i];
+    examples.push_back(p);
+  }
+
+  vec weights;
+  weights += 0.,0.,0.;
+
+  double eta = 0.5;
+
+  for (size_t t = 0; t < 2; t++)
+  {
+    for (size_t x = 0; x < examples.size(); x++) {
+      vec feats;
+      feats += 1.;
+      feats += isupper_f(examples[x].second), len_f(examples[x].second);
+      double err = dotprod(weights, feats);
+      //if ( (examples[x].first == 1 && err < 0) || (examples[x].first == -1 && err >= 0) ) {
+      if ( examples[x].first * err <= 0 ) {
+        weights = vadd(weights, vmult(examples[x].first*eta, feats));
+      }
+      cout << "w " << weights[0] << " " << weights[1] << " " << weights[2] << endl;
+    }
+    cout << "--" << endl;
+  }
+
+  cout << endl;
+  for (size_t x = 0; x < examples.size(); x++) {
+    vec feats;
+    feats += 1.;
+    feats += isupper_f(examples[x].second), len_f(examples[x].second);
+    cout << dotprod(weights, feats) << endl;
+  }
+
+  vec f;
+  f += 1.;
+  f += isupper_f("Bauer"), len_f("Bauer");
+  cout << "test: " << dotprod(weights, f) << endl;
+
+
+  return 0;
+}
+
diff --git a/perceptron/perceptron.rb b/perceptron/perceptron.rb
new file mode 100755
index 0000000..4b9f2fa
--- /dev/null
+++ b/perceptron/perceptron.rb
@@ -0,0 +1,190 @@
+#!/usr/bin/env ruby
+
+require 'zlib'
+
+STDOUT.set_encoding 'utf-8'
+STDOUT.sync = true
+
+
+def ngrams_it(s, n, fix=false)
+  a = s.strip.split
+  a.each_with_index { |tok, i|
+    tok.strip!
+    0.upto([n-1, a.size-i-1].min) { |m|
+      yield a[i..i+m] if !(fix^(a[i..i+m].size==n))
+    }
+  }
+end
+
+class NamedSparseVector
+  attr_accessor :h
+
+  def initialize init=nil
+    @h = {}
+    @h = init if init
+    @h.default = 0.0
+  end
+
+  def + other
+    new_h = Hash.new
+    new_h.update @h
+    ret = NamedSparseVector.new new_h
+    other.each_pair { |k,v| ret[k]+=v }
+    return ret
+  end
+
+  def - other
+    new_h = Hash.new
+    new_h.update @h
+    ret = NamedSparseVector.new new_h
+    other.each_pair { |k,v| ret[k]-=v }
+    return ret
+  end
+
+  def * scalar
+    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
+    ret = NamedSparseVector.new
+    @h.keys.each { |k| ret[k] = @h[k]*scalar }
+    return ret
+  end
+
+  def dot other
+    sum = 0.0
+    @h.each_pair { |k,v|
+      sum += v * other[k]
+    }
+    return sum
+  end
+
+  def [] k
+    @h[k]
+  end
+
+  def []= k, v
+    @h[k] = v
+  end
+
+  def each_pair
+    @h.each_pair { |k,v| yield k,v }
+  end
+
+  def to_s
+    @h.to_s
+  end
+
+  def size
+    @h.keys.size
+  end
+end
+
+def sparse_vector_test
+  a = NamedSparseVector.new
+  b = NamedSparseVector.new
+  a["a"] = 1
+  b["b"] = 1
+  c = NamedSparseVector.new
+  c += (a-b)*0.1
+  puts "a=#{a.to_s}, b=#{b.to_s}, (a-b)*0.1 = #{c.to_s}"
+end
+
+def write_model fn, w
+  Zlib::GzipWriter.open(fn) do |gz|
+    gz.write w.to_s+"\n"
+  end
+end
+
+def read_model fn
+  Zlib::GzipReader.open(fn) do |gz|
+    return NamedSparseVector.new eval(gz.read)
+  end
+end
+
+def usage
+  STDERR.write "#{__FILE__} <config file>\n"
+  exit 1
+end
+usage if ARGV.size != 1
+
+def read_cfg fn
+  begin
+    f = File.new fn, 'r'
+  rescue
+    STDERR.write "#{__FILE__}: Can't find file '#{fn}', exiting.\n"
+    exit 1
+  end
+  cfg = {}
+  while line = f.gets
+    next if /^\s*$/.match line
+    k, v = line.strip.split /\s*=\s*/, 2
+    cfg[k] = v unless k[0]=='#' # no inline comments
+  end
+  return cfg
+end
+
+def parse_example s
+  a = s.split
+  label = a[0].to_f
+  fv = NamedSparseVector.new
+  a[1..a.size-2].each { |i|
+    name,val = i.split ':'
+    fv[name] = val.to_f
+  }
+  return [label, fv]
+end
+
+# main
+cfg = read_cfg ARGV[0]
+silent = true if cfg['silent']
+max_iter = 1000
+max_iter = cfg['max_iter'].to_i if cfg['max_iter']
+errors = 0
+start = Time.now
+w = NamedSparseVector.new
+bias = 0
+
+train = []
+train_f = File.new cfg['train'], 'r'
+while line = train_f.gets
+  train << parse_example(line.strip)
+end
+train_f.close
+
+test = []
+if cfg['test']
+  test_f = File.new cfg['test'], 'r'
+  while line = test_f.gets
+    test << parse_example(line.strip)
+  end
+  test_f.close
+end
+
+iter = 0
+while true
+  err = 0
+  train.each_with_index { |i, idx|
+    if (i[0] * (w.dot(i[1]) + bias)) <= i[0]
+      w += i[1] * i[0]
+      bias += i[0]
+      err += 1
+    end
+  }
+  puts "iter:#{iter} err=#{err}"
+  iter += 1
+  break if err==0 || iter==max_iter
+end
+
+elapsed = Time.now-start
+puts "#{elapsed.round 2} s, #{(elapsed/Float(iter+1)).round 2} s per iter; model size: #{w.size}" if !silent
+puts cfg['model_file']
+write_model cfg['model_file'], w
+
+if cfg['test']
+  test_err = 0
+  test.each { |i|
+    if (i[0] * (w.dot(i[1]) + bias)) <= i[0]
+      test_err += 1
+    end
+  }
+  puts "test error=#{test_err}"
+end
+
diff --git a/perceptron/perceptron1.cc b/perceptron/perceptron1.cc
new file mode 100644
index 0000000..0d9957b
--- /dev/null
+++ b/perceptron/perceptron1.cc
@@ -0,0 +1,143 @@
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <string>
+#include <boost/assign/std/vector.hpp>
+#include <cassert>
+#include <cmath>
+
+using namespace std;
+using namespace boost::assign;
+
+
+double
+isupper_f(string s)
+{
+  if ( isupper(s[0]) ) return 0;
+  return 1;
+}
+
+double
+len_f(string s)
+{
+  return s.size();
+}
+
+double
+dotprod(vector<double> a, vector<double> b)
+{
+  assert(a.size() == b.size());
+  double sum = 0.;
+  for (size_t i = 0; i < a.size(); i++) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+vector<double>
+vadd(double x, vector<double> v)
+{
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] += x;
+  }
+  return v;
+}
+
+vector<double>
+vadd(vector<double> v, vector<double> w)
+{
+  assert(v.size() == w.size());
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] += w[i];
+  }
+  return v;
+}
+
+vector<double>
+vsub(vector<double> v, vector<double> w)
+{
+  assert(v.size() == w.size());
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] -= w[i];
+  }
+  return v;
+}
+
+vector<double>
+vmult(double x, vector<double> v)
+{
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] *= x;
+  }
+  return v;
+}
+
+double
+vlen(vector<double> v)
+{
+  return sqrt(dotprod(v, v));
+}
+
+vector<double>
+vnorm(vector<double> v)
+{
+  double len = vlen(v);
+  for (size_t i = 0; i < v.size(); i++) {
+    v[i] /= len;
+  }
+  return v;
+}
+
+
+int main(void)
+{
+  vector<double> classes;
+  vector<string> words;
+  classes += -1., 1., -1., 1., -1., 1., -1., 1., -1., 1.;
+  words += "Haus","house","Mauer","plant","Zeug","hello","Maus","worker","Quartz","point";
+
+  vector<pair<double, string> > examples;
+  for (size_t i = 0; i < classes.size(); i++) {
+    pair<double, string> p;
+    p.first = classes[i];
+    p.second = words[i];
+    examples.push_back(p);
+  }
+
+  vector<double> weights;
+  weights += 0.,0.;
+
+  double eta = 0.5;
+  double bias = 0.;
+
+  for (size_t t = 0; t < 2; t++)
+  {
+    for (size_t x = 0; x < examples.size(); x++) {
+      vector<double> feats;
+      feats += isupper_f(examples[x].second), len_f(examples[x].second);
+      double err = dotprod(weights, feats);
+      //if ( (examples[x].first == 1 && err < 0) || (examples[x].first == -1 && err >= 0) ) {
+      if ( examples[x].first * (err + bias) <= 0 ) {
+        weights = vadd(weights, vmult(examples[x].first*eta, feats));
+        bias += examples[x].first*eta;
+      }
+      cout << "w " << weights[0] << " " << weights[1] << " bias " << bias << endl;
+    }
+    cout << "--" << endl;
+  }
+
+  cout << endl;
+  for (size_t x = 0; x < examples.size(); x++) {
+    vector<double> feats;
+    feats += isupper_f(examples[x].second), len_f(examples[x].second);
+    cout << dotprod(weights, feats) + bias << endl;
+  }
+
+  vector<double> f;
+  f += isupper_f("Bauer"), len_f("Bauer");
+  cout << "test: " << dotprod(weights, f) + bias << endl;
+
+
+  return 0;
+}
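
Usage note (the file names and feature names below are hypothetical, not part of the commit): perceptron.rb expects exactly one argument, a config file of key = value lines read by read_cfg. Based on the keys the script looks up, a minimal config might be:

  train      = train.dat
  test       = test.dat
  model_file = model.gz
  max_iter   = 100
  silent     = 1

Each line of the train/test files is parsed by parse_example as a numeric label followed by name:value features, for example

  1 length:5 uppercase:1 eol

where the final token (eol here) is a throwaway placeholder, since parse_example's a[1..a.size-2] slice drops the last whitespace-separated field of every line. The learned weights are written by write_model as a gzipped Ruby hash literal, which read_model evals back into a NamedSparseVector.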