training/dtrain/lplp.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

#!/usr/bin/env ruby

require 'zipf'

# norms
def l0(feature_column, n)
  if feature_column.size >= n then return 1 else return 0 end
end

def l1(feature_column, n=-1)
  return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
end

def l2(feature_column, n=-1)
  return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
end

def linfty(feature_column, n=-1)
  return feature_column.map { |i| i.abs }.max
end

# stats
def median(feature_column, n)
  return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})
    .sort[feature_column.size/2]
end

def mean(feature_column, n)
  return feature_column.reduce { |sum, i| sum+i } / n
end

# selection
def select_k(weights, norm_fun, n, k=10000)
  weights.sort{|a,b| norm_fun.call(b[1], n)<=>norm_fun.call(a[1], n)}.each { |p|
    puts "#{p[0]}\t#{mean(p[1], n)}"
    k -= 1
    if k == 0 then break end
  }
end

def cut(weights, norm_fun, n, epsilon=0.0001)
  weights.each { |k,v|
    if norm_fun.call(v, n).abs >= epsilon
      puts "#{k}\t#{mean(v, n)}"
    end
  }
end

# test
def _test()
  puts
  w = {}
  w["a"] = [1, 2, 3]
  w["b"] = [1, 2]
  w["c"] = [66]
  w["d"] = [10, 20, 30]
  n = 3
  puts w.to_s
  puts
  puts "select_k"
  puts "l0 expect ad"
  select_k(w, method(:l0), n, 2)
  puts "l1 expect cd"
  select_k(w, method(:l1), n, 2)
  puts "l2 expect c"
  select_k(w, method(:l2), n, 1)
  puts
  puts "cut"
  puts "l1 expect cd"
  cut(w, method(:l1), n, 7)
  puts
  puts "median"
  a = [1,2,3,4,5]
  puts a.to_s
  puts median(a, 5)
  puts
  puts "#{median(a, 7)} <- that's because we add missing 0s:"
  puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
  puts
  puts "mean expect bc"
  w.clear
  w["a"] = [2]
  w["b"] = [2.1]
  w["c"] = [2.2]
  cut(w, method(:mean), 1, 2.05)
 exit
end
#_test()

def usage()
  puts "lplp.rb <l0,l1,l2,linfty,mean,median,/path/to/file> <cut|select_k|feature_names> <k|threshold|--> <#shards> < <input>"
  puts "   l0...: norms for selection"
  puts "select_k: only output top k (according to the norm of their column vector) features"
  puts "     cut: output features with weight >= threshold"
  puts "       n: number of shards for averaging"
  exit 1
end

usage if ARGV.size<4
norm_fun = nil
feature_names = nil
type = ARGV[1]
if type == 'feature_names'
  feature_names = ARGV[0]
else
  norm_fun = method(ARGV[0].to_sym)
end
x = ARGV[2].to_f
shard_count = ARGV[3].to_f

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

w = {}
while line = STDIN.gets
  key, val = line.split /\s+/
  if w.has_key? key
    w[key].push val.to_f
  else
    w[key] = [val.to_f]
  end
end

if type == 'cut'
  cut(w, norm_fun, shard_count, x)
elsif type == 'select_k'
  select_k(w, norm_fun, shard_count, x)
elsif type == 'feature_names'
  a = ReadFile.readlines_strip "#{fnames}"
  h = {}
  a.each { |i|
    h[i] = true
  }
  w.each_pair { |k,v|
    if h[k]
      puts "#{k}\t#{mean(v, shard_count)}"
    end
  }
else
  puts "oh oh"
end