dtrain/hstreaming/lplp.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

# lplp.rb

# norms
def l0(feature_column, n)
  if feature_column.size >= n then return 1 else return 0 end
end

def l1(feature_column, n=-1)
  return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
end

def l2(feature_column, n=-1)
  return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
end

def linfty(feature_column, n=-1)
  return feature_column.map { |i| i.abs }.max
end

# stats
def median(feature_column, n)
  return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2]
end

def mean(feature_column, n)
  return feature_column.reduce { |sum, i| sum+i } / n
end

# selection
def select_k(weights, norm_fun, n, k=10000)
  weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
    puts "#{p[0]}\t#{mean(p[1], n)}" 
    k -= 1
    if k == 0 then break end
  }
end

def cut(weights, norm_fun, n, epsilon=0.0001)
  weights.each { |k,v|
    if norm_fun.call(v, n).abs >= epsilon
      puts "#{k}\t#{mean(v, n)}"
    end
  }
end

# test
def _test()
  puts
  w = {}
  w["a"] = [1, 2, 3]
  w["b"] = [1, 2]
  w["c"] = [66]
  w["d"] = [10, 20, 30]
  n = 3
  puts w.to_s
  puts
  puts "select_k"
  puts "l0 expect ad"
  select_k(w, method(:l0), n, 2)
  puts "l1 expect cd"
  select_k(w, method(:l1), n, 2)
  puts "l2 expect c"
  select_k(w, method(:l2), n, 1)
  puts
  puts "cut"
  puts "l1 expect cd"
  cut(w, method(:l1), n, 7)
  puts
  puts "median"
  a = [1,2,3,4,5]
  puts a.to_s
  puts median(a, 5)
  puts
  puts "#{median(a, 7)} <- that's because we add missing 0s:"
  puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
  puts
  puts "mean expect bc"
  w.clear
  w["a"] = [2]
  w["b"] = [2.1]
  w["c"] = [2.2]
  cut(w, method(:mean), 1, 2.05)
 exit
end
_test()

# actually do something
def usage()
  puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> [n] < <input>"
  puts "   l0...: norms for selection"
  puts "select_k: only output top k (according to the norm of their column vector) features"
  puts "     cut: output features with weight >= threshold"
  puts "       n: if we do not have a shard count use this number for averaging"
  exit
end

if ARGV.size < 3 then usage end
norm_fun = method(ARGV[0].to_sym)
type = ARGV[1]
x = ARGV[2].to_f

shard_count_key = "__SHARD_COUNT__"

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

w = {}
shard_count = 0
while line = STDIN.gets
  key, val = line.split /\t/
  if key == shard_count_key
    shard_count += 1
    next
  end
  if w.has_key? key
    w[key].push val.to_f
  else
    w[key] = [val.to_f]
  end
end

if ARGV.size == 4 then shard_count = ARGV[3].to_f end

if type == 'cut'
  cut(w, norm_fun, shard_count, x)
elsif type == 'select_k'
  select_k(w, norm_fun, shard_count, x)
else
  puts "oh oh"
end