diff options
Diffstat (limited to 'training/dtrain/lplp.rb')
-rwxr-xr-x | training/dtrain/lplp.rb | 35 |
1 files changed, 27 insertions, 8 deletions
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb index 86e835e8..ac3fb758 100755 --- a/training/dtrain/lplp.rb +++ b/training/dtrain/lplp.rb @@ -1,4 +1,6 @@ -# lplp.rb +#!/usr/bin/env ruby + +require 'zipf' # norms def l0(feature_column, n) @@ -19,7 +21,8 @@ end # stats def median(feature_column, n) - return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}) + .sort[feature_column.size/2] end def mean(feature_column, n) @@ -28,7 +31,7 @@ end # selection def select_k(weights, norm_fun, n, k=10000) - weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| + weights.sort{|a,b| norm_fun.call(b[1], n)<=>norm_fun.call(a[1], n)}.each { |p| puts "#{p[0]}\t#{mean(p[1], n)}" k -= 1 if k == 0 then break end @@ -84,19 +87,24 @@ def _test() end #_test() - def usage() - puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>" + puts "lplp.rb <l0,l1,l2,linfty,mean,median,/path/to/file> <cut|select_k|feature_names> <k|threshold|--> <#shards> < <input>" puts " l0...: norms for selection" puts "select_k: only output top k (according to the norm of their column vector) features" puts " cut: output features with weight >= threshold" - puts " n: if we do not have a shard count use this number for averaging" + puts " n: number of shards for averaging" exit 1 end -if ARGV.size < 4 then usage end -norm_fun = method(ARGV[0].to_sym) +usage if ARGV.size<4 +norm_fun = nil +feature_names = nil type = ARGV[1] +if type == 'feature_names' + feature_names = ARGV[0] +else + norm_fun = method(ARGV[0].to_sym) +end x = ARGV[2].to_f shard_count = ARGV[3].to_f @@ -117,6 +125,17 @@ if type == 'cut' cut(w, norm_fun, shard_count, x) elsif type == 'select_k' select_k(w, norm_fun, shard_count, x) +elsif type == 'feature_names' + a = ReadFile.readlines_strip "#{fnames}" + h = {} + a.each { |i| + h[i] = true + } + w.each_pair { |k,v| + if h[k] + puts "#{k}\t#{mean(v, shard_count)}" + end + } else puts "oh oh" end |