summaryrefslogtreecommitdiff
path: root/sample
blob: aa46ddb70fe93ae7bd3208287efb6879461b6e63 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env ruby

require 'optimist'

# Treat both standard streams as UTF-8 text.
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

# Command-line interface.
# --size is mandatory: the sampling step always reads it, and without a value
# the script would die with a NoMethodError/TypeError backtrace instead of a
# usage message.
opts = Optimist::options do
  banner "sample --size <n> [--shuffle] --file <line separated data>"
  opt :size, "Sample P % or # lines from file or N.", :type => :float, :required => true
  opt :shuffle, "Sample is shuffled.", :type => :bool
  opt :file, "Input file.", :type => :string, :default => '-'
  opt :output_index, "Output index number.", :type => :bool
  opt :N, "Sample --size from N items.", :type => :int, :default => -1
  opt :absolute, "Sample absolute number of items.", :type => :bool
end

# A negative sample count cannot be drawn; report it as a usage error.
Optimist::die :size, "must be non-negative" if opts[:size] < 0

# Build the population to sample from.
#   input - the raw lines (terminators kept) when reading from a file or
#           STDIN; stays empty when --N is given.
#   index - the positions 0..n-1 that the sampler draws from.
input = []
if opts[:N] == -1
  input =
    if opts[:file] == '-'
      STDIN.readlines
    else
      # Block form closes the handle as soon as the lines are read; decode as
      # UTF-8 so file input is treated the same way as STDIN.
      File.open(opts[:file], 'r:utf-8') { |f| f.readlines }
    end
  index = (0...input.size).to_a
else
  # No input text: the population is simply the integers 0..N-1.
  index = (0...opts[:N]).to_a
end

# Decide how many items to draw, then draw them once, without replacement.
# With --absolute, --size is the item count; otherwise it is a percentage of
# the population. Fractional counts are truncated toward zero.
count =
  if opts[:absolute]
    opts[:size]
  else
    # Multiply before dividing: 100 * (29 / 100.0) is 28.999999999999996 and
    # would truncate to 28, whereas 100 * 29 / 100.0 is exactly 29.0.
    index.size * opts[:size] / 100.0
  end
sample = index.sample(count.to_i)

# Unless --shuffle was requested, put the drawn indices in ascending order.
sample.sort! unless opts[:shuffle]

# Print the drawn items, consuming the sample list front to back.
# When --N is given there is no input text, so the bare index is printed;
# otherwise the corresponding input line is printed, prefixed by its index
# and a tab when --output-index is set.
from_range = opts[:N] != -1
until sample.empty?
  idx = sample.shift
  if from_range
    puts idx
  elsif opts[:output_index]
    puts "#{idx}\t#{input[idx]}"
  else
    puts "#{input[idx]}"
  end
end